Message ID | 20180724031837.13038-2-daniel.axtens@canonical.com |
---|---|
State | New |
Headers | show |
Series | Fix LP: #1783246 - cephfs + fscache crash | expand |
On 24.07.2018 05:18, Daniel Axtens wrote: > From: "Yan, Zheng" <zyan@redhat.com> > > BugLink: https://bugs.launchpad.net/bugs/1783246 > > Previously ceph_read_iter() used current->journal_info to pass context info > to ceph_readpages(), so that ceph_readpages() can distinguish read(2) > from readahead(2)/fadvise(2)/madvise(2). The problem is that a page fault > can happen when copying data to userspace memory. The page fault may call > another filesystem's page_mkwrite() if the userspace memory is mapped to a > file. The latter filesystem may also want to use current->journal_info. > > The fix is to define an on-stack data structure in ceph_read_iter() and add it > to the context list in ceph_file_info. ceph_readpages() searches the list > to find whether there is a context belonging to the current thread. > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com> > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > (cherry picked from commit 5d988308283ecf062fa88f20ae05c52cce0bcdca) > Signed-off-by: Daniel Axtens <daniel.axtens@canonical.com> Acked-by: Stefan Bader <stefan.bader@canonical.com> > --- > fs/ceph/addr.c | 19 ++++++++++++------- > fs/ceph/file.c | 10 ++++++++-- > fs/ceph/super.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 66 insertions(+), 9 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index dbf07051aacd..78a1208b878e 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -299,7 +299,8 @@ static void finish_read(struct ceph_osd_request *req) > * start an async read(ahead) operation. return nr_pages we submitted > * a read for on success, or negative error code. 
> */ > -static int start_read(struct inode *inode, struct list_head *page_list, int max) > +static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > + struct list_head *page_list, int max) > { > struct ceph_osd_client *osdc = > &ceph_inode_to_client(inode)->client->osdc; > @@ -316,7 +317,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) > int got = 0; > int ret = 0; > > - if (!current->journal_info) { > + if (!rw_ctx) { > /* caller of readpages does not hold buffer and read caps > * (fadvise, madvise and readahead cases) */ > int want = CEPH_CAP_FILE_CACHE; > @@ -437,6 +438,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, > { > struct inode *inode = file_inode(file); > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > + struct ceph_file_info *ci = file->private_data; > + struct ceph_rw_context *rw_ctx; > int rc = 0; > int max = 0; > > @@ -449,11 +452,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, > if (rc == 0) > goto out; > > + rw_ctx = ceph_find_rw_context(ci); > max = fsc->mount_options->rsize >> PAGE_SHIFT; > - dout("readpages %p file %p nr_pages %d max %d\n", > - inode, file, nr_pages, max); > + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", > + inode, file, rw_ctx, nr_pages, max); > while (!list_empty(page_list)) { > - rc = start_read(inode, page_list, max); > + rc = start_read(inode, rw_ctx, page_list, max); > if (rc < 0) > goto out; > } > @@ -1450,9 +1454,10 @@ static int ceph_filemap_fault(struct vm_fault *vmf) > > if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || > ci->i_inline_version == CEPH_INLINE_NONE) { > - current->journal_info = vma->vm_file; > + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); > + ceph_add_rw_context(fi, &rw_ctx); > ret = filemap_fault(vmf); > - current->journal_info = NULL; > + ceph_del_rw_context(fi, &rw_ctx); > } else > ret = -EAGAIN; > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > 
index 0024d3e61bcd..7f75601d24d9 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -181,6 +181,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) > return -ENOMEM; > } > cf->fmode = fmode; > + > + spin_lock_init(&cf->rw_contexts_lock); > + INIT_LIST_HEAD(&cf->rw_contexts); > + > cf->next_offset = 2; > cf->readdir_cache_idx = -1; > file->private_data = cf; > @@ -464,6 +468,7 @@ int ceph_release(struct inode *inode, struct file *file) > ceph_mdsc_put_request(cf->last_readdir); > kfree(cf->last_name); > kfree(cf->dir_info); > + WARN_ON(!list_empty(&cf->rw_contexts)); > kmem_cache_free(ceph_file_cachep, cf); > > /* wake up anyone waiting for caps on this inode */ > @@ -1202,12 +1207,13 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) > retry_op = READ_INLINE; > } > } else { > + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); > dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", > inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, > ceph_cap_string(got)); > - current->journal_info = filp; > + ceph_add_rw_context(fi, &rw_ctx); > ret = generic_file_read_iter(iocb, to); > - current->journal_info = NULL; > + ceph_del_rw_context(fi, &rw_ctx); > } > dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", > inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 2beeec07fa76..dd59bc7d2c3d 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -668,6 +668,9 @@ struct ceph_file_info { > short fmode; /* initialized on open */ > short flags; /* CEPH_F_* */ > > + spinlock_t rw_contexts_lock; > + struct list_head rw_contexts; > + > /* readdir: position within the dir */ > u32 frag; > struct ceph_mds_request *last_readdir; > @@ -684,6 +687,49 @@ struct ceph_file_info { > int dir_info_len; > }; > > +struct ceph_rw_context { > + struct list_head list; > + struct task_struct *thread; > + int caps; > +}; > + > +#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) 
\ > + struct ceph_rw_context _name = { \ > + .thread = current, \ > + .caps = _caps, \ > + } > + > +static inline void ceph_add_rw_context(struct ceph_file_info *cf, > + struct ceph_rw_context *ctx) > +{ > + spin_lock(&cf->rw_contexts_lock); > + list_add(&ctx->list, &cf->rw_contexts); > + spin_unlock(&cf->rw_contexts_lock); > +} > + > +static inline void ceph_del_rw_context(struct ceph_file_info *cf, > + struct ceph_rw_context *ctx) > +{ > + spin_lock(&cf->rw_contexts_lock); > + list_del(&ctx->list); > + spin_unlock(&cf->rw_contexts_lock); > +} > + > +static inline struct ceph_rw_context* > +ceph_find_rw_context(struct ceph_file_info *cf) > +{ > + struct ceph_rw_context *ctx, *found = NULL; > + spin_lock(&cf->rw_contexts_lock); > + list_for_each_entry(ctx, &cf->rw_contexts, list) { > + if (ctx->thread == current) { > + found = ctx; > + break; > + } > + } > + spin_unlock(&cf->rw_contexts_lock); > + return found; > +} > + > struct ceph_readdir_cache_control { > struct page *page; > struct dentry **dentries; >
On 07/24/18 05:18, Daniel Axtens wrote: > From: "Yan, Zheng" <zyan@redhat.com> > > BugLink: https://bugs.launchpad.net/bugs/1783246 > > Previously ceph_read_iter() used current->journal_info to pass context info > to ceph_readpages(), so that ceph_readpages() can distinguish read(2) > from readahead(2)/fadvise(2)/madvise(2). The problem is that a page fault > can happen when copying data to userspace memory. The page fault may call > another filesystem's page_mkwrite() if the userspace memory is mapped to a > file. The latter filesystem may also want to use current->journal_info. > > The fix is to define an on-stack data structure in ceph_read_iter() and add it > to the context list in ceph_file_info. ceph_readpages() searches the list > to find whether there is a context belonging to the current thread. > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com> > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > (cherry picked from commit 5d988308283ecf062fa88f20ae05c52cce0bcdca) > Signed-off-by: Daniel Axtens <daniel.axtens@canonical.com> Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com> > --- > fs/ceph/addr.c | 19 ++++++++++++------- > fs/ceph/file.c | 10 ++++++++-- > fs/ceph/super.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 66 insertions(+), 9 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index dbf07051aacd..78a1208b878e 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -299,7 +299,8 @@ static void finish_read(struct ceph_osd_request *req) > * start an async read(ahead) operation. return nr_pages we submitted > * a read for on success, or negative error code. 
> */ > -static int start_read(struct inode *inode, struct list_head *page_list, int max) > +static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, > + struct list_head *page_list, int max) > { > struct ceph_osd_client *osdc = > &ceph_inode_to_client(inode)->client->osdc; > @@ -316,7 +317,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) > int got = 0; > int ret = 0; > > - if (!current->journal_info) { > + if (!rw_ctx) { > /* caller of readpages does not hold buffer and read caps > * (fadvise, madvise and readahead cases) */ > int want = CEPH_CAP_FILE_CACHE; > @@ -437,6 +438,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, > { > struct inode *inode = file_inode(file); > struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > + struct ceph_file_info *ci = file->private_data; > + struct ceph_rw_context *rw_ctx; > int rc = 0; > int max = 0; > > @@ -449,11 +452,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, > if (rc == 0) > goto out; > > + rw_ctx = ceph_find_rw_context(ci); > max = fsc->mount_options->rsize >> PAGE_SHIFT; > - dout("readpages %p file %p nr_pages %d max %d\n", > - inode, file, nr_pages, max); > + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", > + inode, file, rw_ctx, nr_pages, max); > while (!list_empty(page_list)) { > - rc = start_read(inode, page_list, max); > + rc = start_read(inode, rw_ctx, page_list, max); > if (rc < 0) > goto out; > } > @@ -1450,9 +1454,10 @@ static int ceph_filemap_fault(struct vm_fault *vmf) > > if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || > ci->i_inline_version == CEPH_INLINE_NONE) { > - current->journal_info = vma->vm_file; > + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); > + ceph_add_rw_context(fi, &rw_ctx); > ret = filemap_fault(vmf); > - current->journal_info = NULL; > + ceph_del_rw_context(fi, &rw_ctx); > } else > ret = -EAGAIN; > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > 
index 0024d3e61bcd..7f75601d24d9 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -181,6 +181,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) > return -ENOMEM; > } > cf->fmode = fmode; > + > + spin_lock_init(&cf->rw_contexts_lock); > + INIT_LIST_HEAD(&cf->rw_contexts); > + > cf->next_offset = 2; > cf->readdir_cache_idx = -1; > file->private_data = cf; > @@ -464,6 +468,7 @@ int ceph_release(struct inode *inode, struct file *file) > ceph_mdsc_put_request(cf->last_readdir); > kfree(cf->last_name); > kfree(cf->dir_info); > + WARN_ON(!list_empty(&cf->rw_contexts)); > kmem_cache_free(ceph_file_cachep, cf); > > /* wake up anyone waiting for caps on this inode */ > @@ -1202,12 +1207,13 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) > retry_op = READ_INLINE; > } > } else { > + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); > dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", > inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, > ceph_cap_string(got)); > - current->journal_info = filp; > + ceph_add_rw_context(fi, &rw_ctx); > ret = generic_file_read_iter(iocb, to); > - current->journal_info = NULL; > + ceph_del_rw_context(fi, &rw_ctx); > } > dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", > inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 2beeec07fa76..dd59bc7d2c3d 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -668,6 +668,9 @@ struct ceph_file_info { > short fmode; /* initialized on open */ > short flags; /* CEPH_F_* */ > > + spinlock_t rw_contexts_lock; > + struct list_head rw_contexts; > + > /* readdir: position within the dir */ > u32 frag; > struct ceph_mds_request *last_readdir; > @@ -684,6 +687,49 @@ struct ceph_file_info { > int dir_info_len; > }; > > +struct ceph_rw_context { > + struct list_head list; > + struct task_struct *thread; > + int caps; > +}; > + > +#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) 
\ > + struct ceph_rw_context _name = { \ > + .thread = current, \ > + .caps = _caps, \ > + } > + > +static inline void ceph_add_rw_context(struct ceph_file_info *cf, > + struct ceph_rw_context *ctx) > +{ > + spin_lock(&cf->rw_contexts_lock); > + list_add(&ctx->list, &cf->rw_contexts); > + spin_unlock(&cf->rw_contexts_lock); > +} > + > +static inline void ceph_del_rw_context(struct ceph_file_info *cf, > + struct ceph_rw_context *ctx) > +{ > + spin_lock(&cf->rw_contexts_lock); > + list_del(&ctx->list); > + spin_unlock(&cf->rw_contexts_lock); > +} > + > +static inline struct ceph_rw_context* > +ceph_find_rw_context(struct ceph_file_info *cf) > +{ > + struct ceph_rw_context *ctx, *found = NULL; > + spin_lock(&cf->rw_contexts_lock); > + list_for_each_entry(ctx, &cf->rw_contexts, list) { > + if (ctx->thread == current) { > + found = ctx; > + break; > + } > + } > + spin_unlock(&cf->rw_contexts_lock); > + return found; > +} > + > struct ceph_readdir_cache_control { > struct page *page; > struct dentry **dentries; >
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dbf07051aacd..78a1208b878e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -299,7 +299,8 @@ static void finish_read(struct ceph_osd_request *req) * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. */ -static int start_read(struct inode *inode, struct list_head *page_list, int max) +static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, + struct list_head *page_list, int max) { struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; @@ -316,7 +317,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) int got = 0; int ret = 0; - if (!current->journal_info) { + if (!rw_ctx) { /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; @@ -437,6 +438,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_file_info *ci = file->private_data; + struct ceph_rw_context *rw_ctx; int rc = 0; int max = 0; @@ -449,11 +452,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; + rw_ctx = ceph_find_rw_context(ci); max = fsc->mount_options->rsize >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", - inode, file, nr_pages, max); + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", + inode, file, rw_ctx, nr_pages, max); while (!list_empty(page_list)) { - rc = start_read(inode, page_list, max); + rc = start_read(inode, rw_ctx, page_list, max); if (rc < 0) goto out; } @@ -1450,9 +1454,10 @@ static int ceph_filemap_fault(struct vm_fault *vmf) if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { - current->journal_info = vma->vm_file; + CEPH_DEFINE_RW_CONTEXT(rw_ctx, 
got); + ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); - current->journal_info = NULL; + ceph_del_rw_context(fi, &rw_ctx); } else ret = -EAGAIN; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0024d3e61bcd..7f75601d24d9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -181,6 +181,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) return -ENOMEM; } cf->fmode = fmode; + + spin_lock_init(&cf->rw_contexts_lock); + INIT_LIST_HEAD(&cf->rw_contexts); + cf->next_offset = 2; cf->readdir_cache_idx = -1; file->private_data = cf; @@ -464,6 +468,7 @@ int ceph_release(struct inode *inode, struct file *file) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); + WARN_ON(!list_empty(&cf->rw_contexts)); kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ @@ -1202,12 +1207,13 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) retry_op = READ_INLINE; } } else { + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - current->journal_info = filp; + ceph_add_rw_context(fi, &rw_ctx); ret = generic_file_read_iter(iocb, to); - current->journal_info = NULL; + ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2beeec07fa76..dd59bc7d2c3d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -668,6 +668,9 @@ struct ceph_file_info { short fmode; /* initialized on open */ short flags; /* CEPH_F_* */ + spinlock_t rw_contexts_lock; + struct list_head rw_contexts; + /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; @@ -684,6 +687,49 @@ struct ceph_file_info { int dir_info_len; }; +struct ceph_rw_context { + struct list_head list; + struct 
task_struct *thread; + int caps; +}; + +#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ + struct ceph_rw_context _name = { \ + .thread = current, \ + .caps = _caps, \ + } + +static inline void ceph_add_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_add(&ctx->list, &cf->rw_contexts); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline void ceph_del_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_del(&ctx->list); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline struct ceph_rw_context* +ceph_find_rw_context(struct ceph_file_info *cf) +{ + struct ceph_rw_context *ctx, *found = NULL; + spin_lock(&cf->rw_contexts_lock); + list_for_each_entry(ctx, &cf->rw_contexts, list) { + if (ctx->thread == current) { + found = ctx; + break; + } + } + spin_unlock(&cf->rw_contexts_lock); + return found; +} + struct ceph_readdir_cache_control { struct page *page; struct dentry **dentries;