From 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 21 Oct 2020 16:12:49 -0400 Subject: [PATCH 01/14] fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: # v2.6.26 Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 41 +++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 1 + 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8cccecb55fb8..eff4abaa87da 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1099,6 +1099,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; unsigned int offset, i; + bool short_write; int err; for (i = 0; i < ap->num_pages; i++) @@ -1113,32 +1114,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, if (!err && ia->write.out.size > count) err = -EIO; + short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_pages; i++) { struct page *page = ap->pages[i]; - if (!err && !offset && count >= PAGE_SIZE) - SetPageUptodate(page); - - if (count > PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; - else - count = 0; - offset = 0; - - unlock_page(page); + if (err) { + ClearPageUptodate(page); + } else { + if (count >= PAGE_SIZE - offset) + count -= PAGE_SIZE - offset; + else { + if (short_write) + ClearPageUptodate(page); + count = 0; + } + offset = 0; + } + if (ia->write.page_locked && (i == ap->num_pages - 1)) + unlock_page(page); put_page(page); } return err; } -static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, +static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, unsigned int max_pages) { + struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; @@ -1191,6 +1198,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, if (offset == PAGE_SIZE) offset = 0; + /* If we copied full page, mark it uptodate */ + if (tmp == PAGE_SIZE) + SetPageUptodate(page); + + if (PageUptodate(page)) { + unlock_page(page); + } else { + ia->write.page_locked = true; + break; + } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && @@ -1234,7 +1251,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, break; } - count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); + count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 63d97a15ffde..74d888c78fa4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -912,6 +912,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; + bool page_locked; } write; }; struct fuse_args_pages ap; From 4b91459ad283a7b174c7a092e31c470f217d1a31 Mon Sep 17 00:00:00 2001 From: Connor Kuehl Date: Thu, 18 Mar 2021 08:52:23 -0500 Subject: [PATCH 02/14] fuse: fix typo for fuse_conn.max_pages comment 'Maxmum' -> 'Maximum' Signed-off-by: Connor Kuehl Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74d888c78fa4..79c98df34deb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -552,7 +552,7 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Maxmum number of pages that can be used in a single request */ + /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; /** Input queue */ From a73d47f57792dc3140348cc07271500c610b5624 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 14 Apr 2021 10:40:56 +0200 Subject: [PATCH 03/14] fuse: don't zero pages twice All callers of fuse_short_read already set the .page_zeroing flag, so no need to do the tail zeroing again. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index eff4abaa87da..51187777c39b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -798,21 +798,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, { struct fuse_conn *fc = get_fuse_conn(inode); - if (fc->writeback_cache) { - /* - * A hole in a file. Some data after the hole are in page cache, - * but have not reached the client fs yet. So, the hole is not - * present there. - */ - int i; - int start_idx = num_read >> PAGE_SHIFT; - size_t off = num_read & (PAGE_SIZE - 1); - - for (i = start_idx; i < ap->num_pages; i++) { - zero_user_segment(ap->pages[i], off, PAGE_SIZE); - off = 0; - } - } else { + /* + * If writeback_cache is enabled, a short read means there's a hole in + * the file. Some data after the hole is in page cache, but has not + * reached the client fs yet. So the hole is not present there. + */ + if (!fc->writeback_cache) { loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } From aa6ff555f0e62bc1c85a2d181c1fae95d47c00ce Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 26 Mar 2021 06:46:42 +0530 Subject: [PATCH 04/14] fuse: fix a typo s/reponsible/responsible/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 79c98df34deb..e7775d9c3314 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -713,7 +713,7 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; - /** Filesystem is fully reponsible for page cache invalidation. */ + /** Filesystem is fully responsible for page cache invalidation. */ unsigned explicit_inval_data:1; /** Does the filesystem support readdirplus? */ From 6076f5f341e612152879bfda99f0b76c1953bf0b Mon Sep 17 00:00:00 2001 From: Alessio Balsini Date: Fri, 19 Mar 2021 15:05:14 +0000 Subject: [PATCH 05/14] fuse: fix matching of FUSE_DEV_IOC_CLONE command With commit f8425c939663 ("fuse: 32-bit user space ioctl compat for fuse device") the matching constraints for the FUSE_DEV_IOC_CLONE ioctl command are relaxed, limited to the testing of command type and number. As Arnd noticed, this is wrong as it wouldn't ensure the correctness of the data size or direction for the received FUSE device ioctl. Fix by bringing back the comparison of the ioctl received by the FUSE device to the originally generated FUSE_DEV_IOC_CLONE. Fixes: f8425c939663 ("fuse: 32-bit user space ioctl compat for fuse device") Reported-by: Arnd Bergmann Signed-off-by: Alessio Balsini Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c0fee830a34e..a5ceccc5ef00 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2233,11 +2233,8 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, int oldfd; struct fuse_dev *fud = NULL; - if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC) - return -ENOTTY; - - switch (_IOC_NR(cmd)) { - case _IOC_NR(FUSE_DEV_IOC_CLONE): + switch (cmd) { + case FUSE_DEV_IOC_CLONE: res = -EFAULT; if (!get_user(oldfd, (__u32 __user *)arg)) { struct file *old = fget(oldfd); From 52a4c95f4d24b8bcb50745732f7b9f8513c49c5f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 25 Mar 2021 11:18:22 -0400 Subject: [PATCH 06/14] fuse: extend FUSE_SETXATTR request Fuse client needs to send additional information to file server when it calls SETXATTR(system.posix_acl_access), so add extra flags field to the structure. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 2 +- fs/fuse/fuse_i.h | 5 ++++- fs/fuse/inode.c | 4 +++- fs/fuse/xattr.c | 9 ++++++--- include/uapi/linux/fuse.h | 7 +++++++ 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index e9c0f916349d..d31260a139d4 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -94,7 +94,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - ret = fuse_setxattr(inode, name, value, size, 0); + ret = fuse_setxattr(inode, name, value, size, 0, 0); kfree(value); } else { ret = fuse_removexattr(inode, name); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e7775d9c3314..979b680bb47e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -668,6 +668,9 @@ struct fuse_conn { /** Is setxattr not implemented by fs? */ unsigned no_setxattr:1; + /** Does file server support extended setxattr */ + unsigned setxattr_ext:1; + /** Is getxattr not implemented by fs? */ unsigned no_getxattr:1; @@ -1171,7 +1174,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked); bool fuse_lock_inode(struct inode *inode); int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags); + size_t size, int flags, unsigned int extra_flags); ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b0e18b470e91..06a68cfa76d8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1052,6 +1052,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } + if (arg->flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1095,7 +1097,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) ia->in.flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 1a7d7ace54e1..61dfaf7b7d20 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -12,7 +12,7 @@ #include int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags) + size_t size, int flags, unsigned int extra_flags) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); @@ -25,10 +25,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; + inarg.setxattr_flags = extra_flags; + args.opcode = FUSE_SETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 3; - args.in_args[0].size = sizeof(inarg); + args.in_args[0].size = fm->fc->setxattr_ext ? + sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; @@ -199,7 +202,7 @@ static int fuse_xattr_set(const struct xattr_handler *handler, if (!value) return fuse_removexattr(inode, name); - return fuse_setxattr(inode, name, value, size, flags); + return fuse_setxattr(inode, name, value, size, flags, 0); } static bool no_xattr_list(struct dentry *dentry) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 54442612c48b..390f7b4c889d 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -179,6 +179,7 @@ * 7.33 * - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID * - add FUSE_OPEN_KILL_SUIDGID + * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT */ #ifndef _LINUX_FUSE_H @@ -330,6 +331,7 @@ struct fuse_file_lock { * does not have CAP_FSETID. Additionally upon * write/truncate sgid is killed only if file has group * execute permission. (Same as Linux VFS behavior). + * FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -360,6 +362,7 @@ struct fuse_file_lock { #define FUSE_MAP_ALIGNMENT (1 << 26) #define FUSE_SUBMOUNTS (1 << 27) #define FUSE_HANDLE_KILLPRIV_V2 (1 << 28) +#define FUSE_SETXATTR_EXT (1 << 29) /** * CUSE INIT request/reply flags @@ -681,9 +684,13 @@ struct fuse_fsync_in { uint32_t padding; }; +#define FUSE_COMPAT_SETXATTR_IN_SIZE 8 + struct fuse_setxattr_in { uint32_t size; uint32_t flags; + uint32_t setxattr_flags; + uint32_t padding; }; struct fuse_getxattr_in { From 550a7d3bc0c4049ef8d36ff4d9ed7082ee8cb5ec Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 25 Mar 2021 11:18:23 -0400 Subject: [PATCH 07/14] fuse: add a flag FUSE_SETXATTR_ACL_KILL_SGID to kill SGID When posix access ACL is set, it can have an effect on file mode and it can also need to clear SGID if. - None of caller's group/supplementary groups match file owner group. AND - Caller is not priviliged (No CAP_FSETID). As of now fuser server is responsible for changing the file mode as well. But it does not know whether to clear SGID or not. So add a flag FUSE_SETXATTR_ACL_KILL_SGID and send this info with SETXATTR to let file server know that sgid needs to be cleared as well. Reported-by: Luis Henriques Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 7 ++++++- include/uapi/linux/fuse.h | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index d31260a139d4..52b165319be1 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -71,6 +71,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return -EINVAL; if (acl) { + unsigned int extra_flags = 0; /* * Fuse userspace is responsible for updating access * permissions in the inode, if needed. fuse_setxattr @@ -94,7 +95,11 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - ret = fuse_setxattr(inode, name, value, size, 0, 0); + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; + + ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); kfree(value); } else { ret = fuse_removexattr(inode, name); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 390f7b4c889d..271ae90a9bb7 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -180,6 +180,7 @@ * - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID * - add FUSE_OPEN_KILL_SUIDGID * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT + * - add FUSE_SETXATTR_ACL_KILL_SGID */ #ifndef _LINUX_FUSE_H @@ -454,6 +455,12 @@ struct fuse_file_lock { */ #define FUSE_OPEN_KILL_SUIDGID (1 << 0) +/** + * setxattr flags + * FUSE_SETXATTR_ACL_KILL_SGID: Clear SGID when system.posix_acl_access is set + */ +#define FUSE_SETXATTR_ACL_KILL_SGID (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ From 3466958beb31a8e9d3a1441a34228ed088b84f3e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 6 Apr 2021 10:07:06 -0400 Subject: [PATCH 08/14] fuse: invalidate attrs when page writeback completes In fuse when a direct/write-through write happens we invalidate attrs because that might have updated mtime/ctime on server and cached mtime/ctime will be stale. What about page writeback path. Looks like we don't invalidate attrs there. To be consistent, invalidate attrs in writeback path as well. Only exception is when writeback_cache is enabled. In that case we strust local mtime/ctime and there is no need to invalidate attrs. Recently users started experiencing failure of xfstests generic/080, geneirc/215 and generic/614 on virtiofs. This happened only newer "stat" utility and not older one. This patch fixes the issue. So what's the root cause of the issue. Here is detailed explanation. generic/080 test does mmap write to a file, closes the file and then checks if mtime has been updated or not. When file is closed, it leads to flushing of dirty pages (and that should update mtime/ctime on server). But we did not explicitly invalidate attrs after writeback finished. Still generic/080 passed so far and reason being that we invalidated atime in fuse_readpages_end(). This is called in fuse_readahead() path and always seems to trigger before mmaped write. So after mmaped write when lstat() is called, it sees that atleast one of the fields being asked for is invalid (atime) and that results in generating GETATTR to server and mtime/ctime also get updated and test passes. But newer /usr/bin/stat seems to have moved to using statx() syscall now (instead of using lstat()). And statx() allows it to query only ctime or mtime (and not rest of the basic stat fields). That means when querying for mtime, fuse_update_get_attr() sees that mtime is not invalid (only atime is invalid). So it does not generate a new GETATTR and fill stat with cached mtime/ctime. And that means updated mtime is not seen by xfstest and tests start failing. Invalidating attrs after writeback completion should solve this problem in a generic manner. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 51187777c39b..df769f55f205 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1767,8 +1767,17 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, container_of(args, typeof(*wpa), ia.ap.args); struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); mapping_set_error(inode->i_mapping, error); + /* + * A writeback finished and this might have updated mtime/ctime on + * server making local mtime/ctime stale. Hence invalidate attrs. + * Do this only if writeback_cache is not enabled. If writeback_cache + * is enabled, we trust local ctime/mtime. + */ + if (!fc->writeback_cache) + fuse_invalidate_attr(inode); spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { From c79c5e0178922a9e092ec8fed026750f39dcaef4 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 17 Mar 2021 08:44:43 +0000 Subject: [PATCH 09/14] virtiofs: fix memory leak in virtio_fs_probe() When accidentally passing twice the same tag to qemu, kmemleak ended up reporting a memory leak in virtiofs. Also, looking at the log I saw the following error (that's when I realised the duplicated tag): virtiofs: probe of virtio5 failed with error -17 Here's the kmemleak log for reference: unreferenced object 0xffff888103d47800 (size 1024): comm "systemd-udevd", pid 118, jiffies 4294893780 (age 18.340s) hex dump (first 32 bytes): 00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00 .....N.......... ff ff ff ff ff ff ff ff 80 90 02 a0 ff ff ff ff ................ backtrace: [<000000000ebb87c1>] virtio_fs_probe+0x171/0x7ae [virtiofs] [<00000000f8aca419>] virtio_dev_probe+0x15f/0x210 [<000000004d6baf3c>] really_probe+0xea/0x430 [<00000000a6ceeac8>] device_driver_attach+0xa8/0xb0 [<00000000196f47a7>] __driver_attach+0x98/0x140 [<000000000b20601d>] bus_for_each_dev+0x7b/0xc0 [<00000000399c7b7f>] bus_add_driver+0x11b/0x1f0 [<0000000032b09ba7>] driver_register+0x8f/0xe0 [<00000000cdd55998>] 0xffffffffa002c013 [<000000000ea196a2>] do_one_initcall+0x64/0x2e0 [<0000000008f727ce>] do_init_module+0x5c/0x260 [<000000003cdedab6>] __do_sys_finit_module+0xb5/0x120 [<00000000ad2f48c6>] do_syscall_64+0x33/0x40 [<00000000809526b5>] entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: stable@vger.kernel.org Signed-off-by: Luis Henriques Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") Reviewed-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4ee6f734ba83..1e5affed158e 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -896,6 +896,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: vdev->config->reset(vdev); virtio_fs_cleanup_vqs(vdev, fs); + kfree(fs->vqs); out: vdev->priv = NULL; From a7f0d7aab0b4f3f0780b1f77356e2fe7202ac0cb Mon Sep 17 00:00:00 2001 From: Connor Kuehl Date: Thu, 18 Mar 2021 08:52:22 -0500 Subject: [PATCH 10/14] virtiofs: split requests that exceed virtqueue size If an incoming FUSE request can't fit on the virtqueue, the request is placed onto a workqueue so a worker can try to resubmit it later where there will (hopefully) be space for it next time. This is fine for requests that aren't larger than a virtqueue's maximum capacity. However, if a request's size exceeds the maximum capacity of the virtqueue (even if the virtqueue is empty), it will be doomed to a life of being placed on the workqueue, removed, discovered it won't fit, and placed on the workqueue yet again. Furthermore, from section 2.6.5.3.1 (Driver Requirements: Indirect Descriptors) of the virtio spec: "A driver MUST NOT create a descriptor chain longer than the Queue Size of the device." To fix this, limit the number of pages FUSE will use for an overall request. This way, each request can realistically fit on the virtqueue when it is decomposed into a scattergather list and avoid violating section 2.6.5.3.1 of the virtio spec. Signed-off-by: Connor Kuehl Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 3 ++- fs/fuse/virtio_fs.c | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 979b680bb47e..38369a5094af 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -555,6 +555,9 @@ struct fuse_conn { /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; + /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + /** Input queue */ struct fuse_iqueue iq; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 06a68cfa76d8..2f3161bb4b1c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -712,6 +712,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -1040,7 +1041,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->abort_err = 1; if (arg->flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX) && diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 1e5affed158e..b55e977f50ac 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -18,6 +18,12 @@ #include #include "fuse_i.h" +/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -1414,9 +1420,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; - struct fuse_conn *fc; + struct fuse_conn *fc = NULL; struct fuse_mount *fm; - int err; + unsigned int virtqueue_size; + int err = -EIO; /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() @@ -1428,6 +1435,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; } + virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); if (!fc) @@ -1443,6 +1454,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; + /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); if (fsc->s_fs_info) { From 07595bfa24ce7e8ecef70c84fee9a2374d8c0a61 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 13 Apr 2021 17:22:23 +0800 Subject: [PATCH 11/14] virtiofs: remove useless function Fix the following clang warning: fs/fuse/virtio_fs.c:130:35: warning: unused function 'vq_to_fpq' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index b55e977f50ac..6623f27e88d4 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -133,11 +133,6 @@ static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) return &fs->vqs[vq->index]; } -static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) -{ - return &vq_to_fsvq(vq)->fud->pq; -} - /* Should be called with fsvq->lock held. */ static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) { From 0a7419c68a45d2d066b996be5087aa2d07ce80eb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 14 Apr 2021 10:40:58 +0200 Subject: [PATCH 12/14] virtiofs: fix userns get_user_ns() is done twice (once in virtio_fs_get_tree() and once in fuse_conn_init()), resulting in a reference leak. Also looks better to use fsc->user_ns (which *should* be the current_user_ns() at this point). Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6623f27e88d4..bcb8a02e2d8b 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1443,8 +1443,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) if (!fm) goto out_err; - fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), - &virtio_fs_fiq_ops, fs); + fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; From 8217673d07256b22881127bf50dce874d0e51653 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 14 Apr 2021 10:40:58 +0200 Subject: [PATCH 13/14] cuse: prevent clone For cloned connections cuse_channel_release() will be called more than once, resulting in use after free. Prevent device cloning for CUSE, which does not make sense at this point, and highly unlikely to be used in real life. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 45082269e698..a37528b51798 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -627,6 +627,8 @@ static int __init cuse_init(void) cuse_channel_fops.owner = THIS_MODULE; cuse_channel_fops.open = cuse_channel_open; cuse_channel_fops.release = cuse_channel_release; + /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ + cuse_channel_fops.unlocked_ioctl = NULL; cuse_class = class_create(THIS_MODULE, "cuse"); if (IS_ERR(cuse_class)) From 3c9c14338c12fb6f8d3aea7e7a1b7f93ce9e84b0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 14 Apr 2021 10:40:58 +0200 Subject: [PATCH 14/14] cuse: simplify refcount Put extra reference early in cuse_channel_open(). Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index a37528b51798..c7d882a9fe33 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -511,20 +511,18 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + cc->fc.release = cuse_fc_release; fud = fuse_dev_alloc_install(&cc->fc); - if (!fud) { - kfree(cc); + fuse_conn_put(&cc->fc); + if (!fud) return -ENOMEM; - } INIT_LIST_HEAD(&cc->list); - cc->fc.release = cuse_fc_release; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); - fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; @@ -561,8 +559,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) unregister_chrdev_region(cc->cdev->dev, 1); cdev_del(cc->cdev); } - /* Base reference is now owned by "fud" */ - fuse_conn_put(&cc->fc); rc = fuse_dev_release(inode, file); /* puts the base reference */