From 1dd73601a1cba37a0ed5f89a8662c90191df5873 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 9 Sep 2022 10:39:48 +0800 Subject: [PATCH 01/14] erofs: fix order >= MAX_ORDER warning due to crafted negative i_size As syzbot reported [1], the root cause is that i_size field is a signed type, and negative i_size is also less than EROFS_BLKSIZ. As a consequence, it's handled as fast symlink unexpectedly. Let's fall back to the generic path to deal with such unusual i_size. [1] https://lore.kernel.org/r/000000000000ac8efa05e7feaa1f@google.com Reported-by: syzbot+f966c13b1b4fc0403b19@syzkaller.appspotmail.com Fixes: 431339ba9042 ("staging: erofs: add inode operations") Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20220909023948.28925-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 95a403720e8c..16cf9a283557 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -214,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ) { + inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } From 1015c1016c231b26d4e2c9b3da65b6c043eb97a3 Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Sun, 18 Sep 2022 12:34:51 +0800 Subject: [PATCH 02/14] erofs: use kill_anon_super() to kill super in fscache mode Use kill_anon_super() instead of generic_shutdown_super() since the mount() in erofs fscache mode uses get_tree_nodev() and associated anon bdev needs to be freed. Fixes: 9c0cc9c729657 ("erofs: add 'fsid' mount option") Suggested-by: Jingbo Xu Signed-off-by: Jia Zhu Reviewed-by: Jingbo Xu Link: https://lore.kernel.org/r/20220918043456.147-2-zhujia.zj@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3173debeaa5a..9716d355a63e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -879,7 +879,7 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); if (erofs_is_fscache_mode(sb)) - generic_shutdown_super(sb); + kill_anon_super(sb); else kill_block_super(sb); From e1de2da0b7ac2dc0120c2ba8c7044788611933ea Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Sun, 18 Sep 2022 12:34:52 +0800 Subject: [PATCH 03/14] erofs: code clean up for fscache Some cleanups. No logic changes. Suggested-by: Jingbo Xu Signed-off-by: Jia Zhu Reviewed-by: Jingbo Xu Link: https://lore.kernel.org/r/20220918043456.147-3-zhujia.zj@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 39 +++++++++++++++++++-------------------- fs/erofs/internal.h | 19 +++++++++---------- fs/erofs/super.c | 21 ++++++++------------- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index b5fd9d71e67f..1eb63987e815 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -421,9 +421,8 @@ const struct address_space_operations erofs_fscache_access_aops = { .readahead = erofs_fscache_readahead, }; -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -432,7 +431,7 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -ENOMEM; + return ERR_PTR(-ENOMEM); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -462,42 +461,33 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx->inode = inode; } - *fscache = ctx; - return 0; + return ctx; err_cookie: fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; err: kfree(ctx); - return ret; + return ERR_PTR(ret); } -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) { - struct erofs_fscache *ctx = *fscache; - if (!ctx) return; fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; - iput(ctx->inode); - ctx->inode = NULL; - kfree(ctx); - *fscache = NULL; } int erofs_fscache_register_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct fscache_volume *volume; + struct erofs_fscache *fscache; char *name; - int ret = 0; name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid); if (!name) @@ -506,19 +496,28 @@ int erofs_fscache_register_fs(struct super_block *sb) volume = fscache_acquire_volume(name, NULL, NULL, 0); if (IS_ERR_OR_NULL(volume)) { erofs_err(sb, "failed to register volume for %s", name); - ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; - volume = NULL; + kfree(name); + return volume ? PTR_ERR(volume) : -EOPNOTSUPP; } sbi->volume = volume; kfree(name); - return ret; + + fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true); + /* acquired volume will be relinquished in kill_sb() */ + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + + sbi->s_fscache = fscache; + return 0; } void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); + erofs_fscache_unregister_cookie(sbi->s_fscache); fscache_relinquish_volume(sbi->volume, NULL, false); + sbi->s_fscache = NULL; sbi->volume = NULL; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a01cc82795a2..39fb124ce670 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -581,27 +581,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode); -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache); +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode); +void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int erofs_fscache_register_fs(struct super_block *sb) { - return 0; + return -EOPNOTSUPP; } static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} -static inline int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static inline +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) { - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); } -static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } #endif diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9716d355a63e..884e7ed3d760 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -224,10 +224,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct block_device *bdev; void *ptr; - int ret; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); if (IS_ERR(ptr)) @@ -245,10 +245,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - ret = erofs_fscache_register_cookie(sb, &dif->fscache, - dif->path, false); - if (ret) - return ret; + fscache = erofs_fscache_register_cookie(sb, dif->path, false); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + dif->fscache = fscache; } else { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, sb->s_type); @@ -706,11 +706,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - err = erofs_fscache_register_cookie(sb, &sbi->s_fscache, - sbi->opt.fsid, true); - if (err) - return err; - err = super_setup_bdi(sb); if (err) return err; @@ -817,7 +812,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); - erofs_fscache_unregister_cookie(&dif->fscache); + erofs_fscache_unregister_cookie(dif->fscache); + dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; @@ -889,7 +885,6 @@ static void erofs_kill_sb(struct super_block *sb) erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); - erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); kfree(sbi); @@ -909,7 +904,7 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->managed_cache); sbi->managed_cache = NULL; #endif - erofs_fscache_unregister_cookie(&sbi->s_fscache); + erofs_fscache_unregister_fs(sb); } static struct file_system_type erofs_fs_type = { From 8b7adf1dff3d5baf687acda936f193f80b7e0179 Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Sun, 18 Sep 2022 12:34:53 +0800 Subject: [PATCH 04/14] erofs: introduce fscache-based domain A new fscache-based shared domain mode is going to be introduced for erofs. In which case, same data blobs in same domain will be shared and reused to reduce on-disk space usage. The implementation of sharing blobs will be introduced in subsequent patches. Signed-off-by: Jia Zhu Reviewed-by: Jingbo Xu Link: https://lore.kernel.org/r/20220918043456.147-4-zhujia.zj@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 129 ++++++++++++++++++++++++++++++++++++++------ fs/erofs/internal.h | 9 ++++ 2 files changed, 121 insertions(+), 17 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 1eb63987e815..8a1be6077ca4 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -1,10 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2022, Alibaba Cloud + * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ #include #include "internal.h" +static DEFINE_MUTEX(erofs_domain_list_lock); +static LIST_HEAD(erofs_domain_list); + static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, loff_t start, size_t len) { @@ -421,6 +425,99 @@ const struct address_space_operations erofs_fscache_access_aops = { .readahead = erofs_fscache_readahead, }; +static void erofs_fscache_domain_put(struct erofs_domain *domain) +{ + if (!domain) + return; + mutex_lock(&erofs_domain_list_lock); + if (refcount_dec_and_test(&domain->ref)) { + list_del(&domain->list); + mutex_unlock(&erofs_domain_list_lock); + fscache_relinquish_volume(domain->volume, NULL, false); + kfree(domain->domain_id); + kfree(domain); + return; + } + mutex_unlock(&erofs_domain_list_lock); +} + +static int erofs_fscache_register_volume(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *domain_id = sbi->opt.domain_id; + struct fscache_volume *volume; + char *name; + int ret = 0; + + name = kasprintf(GFP_KERNEL, "erofs,%s", + domain_id ? domain_id : sbi->opt.fsid); + if (!name) + return -ENOMEM; + + volume = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } + + sbi->volume = volume; + kfree(name); + return ret; +} + +static int erofs_fscache_init_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + domain->domain_id = kstrdup(sbi->opt.domain_id, GFP_KERNEL); + if (!domain->domain_id) { + kfree(domain); + return -ENOMEM; + } + + err = erofs_fscache_register_volume(sb); + if (err) + goto out; + + domain->volume = sbi->volume; + refcount_set(&domain->ref, 1); + list_add(&domain->list, &erofs_domain_list); + sbi->domain = domain; + return 0; +out: + kfree(domain->domain_id); + kfree(domain); + return err; +} + +static int erofs_fscache_register_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_lock(&erofs_domain_list_lock); + list_for_each_entry(domain, &erofs_domain_list, list) { + if (!strcmp(domain->domain_id, sbi->opt.domain_id)) { + sbi->domain = domain; + sbi->volume = domain->volume; + refcount_inc(&domain->ref); + mutex_unlock(&erofs_domain_list_lock); + return 0; + } + } + err = erofs_fscache_init_domain(sb); + mutex_unlock(&erofs_domain_list_lock); + return err; +} + struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, char *name, bool need_inode) { @@ -484,27 +581,19 @@ void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) int erofs_fscache_register_fs(struct super_block *sb) { + int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct fscache_volume *volume; struct erofs_fscache *fscache; - char *name; - name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid); - if (!name) - return -ENOMEM; - - volume = fscache_acquire_volume(name, NULL, NULL, 0); - if (IS_ERR_OR_NULL(volume)) { - erofs_err(sb, "failed to register volume for %s", name); - kfree(name); - return volume ? PTR_ERR(volume) : -EOPNOTSUPP; - } - - sbi->volume = volume; - kfree(name); + if (sbi->opt.domain_id) + ret = erofs_fscache_register_domain(sb); + else + ret = erofs_fscache_register_volume(sb); + if (ret) + return ret; + /* acquired domain/volume will be relinquished in kill_sb() on error */ fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true); - /* acquired volume will be relinquished in kill_sb() */ if (IS_ERR(fscache)) return PTR_ERR(fscache); @@ -517,7 +606,13 @@ void erofs_fscache_unregister_fs(struct super_block *sb) struct erofs_sb_info *sbi = EROFS_SB(sb); erofs_fscache_unregister_cookie(sbi->s_fscache); - fscache_relinquish_volume(sbi->volume, NULL, false); + + if (sbi->domain) + erofs_fscache_domain_put(sbi->domain); + else + fscache_relinquish_volume(sbi->volume, NULL, false); + sbi->s_fscache = NULL; sbi->volume = NULL; + sbi->domain = NULL; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 39fb124ce670..93fc4ad61cfb 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -76,6 +76,7 @@ struct erofs_mount_opts { #endif unsigned int mount_opt; char *fsid; + char *domain_id; }; struct erofs_dev_context { @@ -98,6 +99,13 @@ struct erofs_sb_lz4_info { u16 max_pclusterblks; }; +struct erofs_domain { + refcount_t ref; + struct list_head list; + struct fscache_volume *volume; + char *domain_id; +}; + struct erofs_fscache { struct fscache_cookie *cookie; struct inode *inode; @@ -157,6 +165,7 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; struct erofs_fscache *s_fscache; + struct erofs_domain *domain; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) From a9849560c55e9e4ab9c53d073363dd6e19ec06ef Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Sun, 18 Sep 2022 12:34:54 +0800 Subject: [PATCH 05/14] erofs: introduce a pseudo mnt to manage shared cookies Use a pseudo mnt to manage shared cookies. Signed-off-by: Jia Zhu Reviewed-by: Jingbo Xu Link: https://lore.kernel.org/r/20220918043456.147-5-zhujia.zj@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 13 +++++++++++++ fs/erofs/internal.h | 1 + fs/erofs/super.c | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 8a1be6077ca4..0480aaf44cb9 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -8,6 +8,7 @@ static DEFINE_MUTEX(erofs_domain_list_lock); static LIST_HEAD(erofs_domain_list); +static struct vfsmount *erofs_pseudo_mnt; static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, loff_t start, size_t len) @@ -432,6 +433,10 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) mutex_lock(&erofs_domain_list_lock); if (refcount_dec_and_test(&domain->ref)) { list_del(&domain->list); + if (list_empty(&erofs_domain_list)) { + kern_unmount(erofs_pseudo_mnt); + erofs_pseudo_mnt = NULL; + } mutex_unlock(&erofs_domain_list_lock); fscache_relinquish_volume(domain->volume, NULL, false); kfree(domain->domain_id); @@ -486,6 +491,14 @@ static int erofs_fscache_init_domain(struct super_block *sb) if (err) goto out; + if (!erofs_pseudo_mnt) { + erofs_pseudo_mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(erofs_pseudo_mnt)) { + err = PTR_ERR(erofs_pseudo_mnt); + goto out; + } + } + domain->volume = sbi->volume; refcount_set(&domain->ref, 1); list_add(&domain->list, &erofs_domain_list); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 93fc4ad61cfb..0db56259160c 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -373,6 +373,7 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, } extern const struct super_operations erofs_sops; +extern struct file_system_type erofs_fs_type; extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 884e7ed3d760..ab746181ae08 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -676,6 +676,13 @@ static const struct export_operations erofs_export_ops = { .get_parent = erofs_get_parent, }; +static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) +{ + static const struct tree_descr empty_descr = {""}; + + return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); +} + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -776,6 +783,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +static int erofs_fc_anon_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); +} + static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_fs_context *ctx = fc->fs_private; @@ -844,10 +856,21 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; +static const struct fs_context_operations erofs_anon_context_ops = { + .get_tree = erofs_fc_anon_get_tree, +}; + static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + struct erofs_fs_context *ctx; + /* pseudo mount for anon inodes */ + if (fc->sb_flags & SB_KERNMOUNT) { + fc->ops = &erofs_anon_context_ops; + return 0; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); @@ -874,6 +897,12 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + /* pseudo mount for anon inodes */ + if (sb->s_flags & SB_KERNMOUNT) { + kill_anon_super(sb); + return; + } + if (erofs_is_fscache_mode(sb)) kill_anon_super(sb); else @@ -907,7 +936,7 @@ static void erofs_put_super(struct super_block *sb) erofs_fscache_unregister_fs(sb); } -static struct file_system_type erofs_fs_type = { +struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, From 7d41963759feb3cfa4c1164b8b9db5d1f055932d Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Sun, 18 Sep 2022 19:01:50 +0800 Subject: [PATCH 06/14] erofs: Support sharing cookies in the same domain Several erofs filesystems can belong to one domain, and data blobs can be shared among these erofs filesystems of same domain. Users could specify domain_id mount option to create or join into a domain. Signed-off-by: Jia Zhu Reviewed-by: Jingbo Xu Link: https://lore.kernel.org/r/20220918110150.6338-1-zhujia.zj@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 99 ++++++++++++++++++++++++++++++++++++++++++--- fs/erofs/internal.h | 3 ++ 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 0480aaf44cb9..bc3556e77b93 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -7,6 +7,7 @@ #include "internal.h" static DEFINE_MUTEX(erofs_domain_list_lock); +static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; @@ -531,8 +532,9 @@ static int erofs_fscache_register_domain(struct super_block *sb) return err; } -struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) +static +struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, bool need_inode) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -581,17 +583,102 @@ err: return ERR_PTR(ret); } -void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) { - if (!ctx) - return; - fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); iput(ctx->inode); + kfree(ctx->name); kfree(ctx); } +static +struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + int err; + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + if (IS_ERR(ctx)) + return ctx; + + ctx->name = kstrdup(name, GFP_KERNEL); + if (!ctx->name) { + err = -ENOMEM; + goto out; + } + + inode = new_inode(erofs_pseudo_mnt->mnt_sb); + if (!inode) { + err = -ENOMEM; + goto out; + } + + ctx->domain = domain; + ctx->anon_inode = inode; + inode->i_private = ctx; + refcount_inc(&domain->ref); + return ctx; +out: + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(err); +} + +static +struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + + mutex_lock(&erofs_domain_cookies_lock); + list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { + ctx = inode->i_private; + if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + continue; + igrab(inode); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; + } + ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; +} + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + if (EROFS_SB(sb)->opt.domain_id) + return erofs_domain_register_cookie(sb, name, need_inode); + return erofs_fscache_acquire_cookie(sb, name, need_inode); +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +{ + bool drop; + struct erofs_domain *domain; + + if (!ctx) + return; + domain = ctx->domain; + if (domain) { + mutex_lock(&erofs_domain_cookies_lock); + drop = atomic_read(&ctx->anon_inode->i_count) == 1; + iput(ctx->anon_inode); + mutex_unlock(&erofs_domain_cookies_lock); + if (!drop) + return; + } + + erofs_fscache_relinquish_cookie(ctx); + erofs_fscache_domain_put(domain); +} + int erofs_fscache_register_fs(struct super_block *sb) { int ret; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 0db56259160c..ef3f7982b92d 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -109,6 +109,9 @@ struct erofs_domain { struct erofs_fscache { struct fscache_cookie *cookie; struct inode *inode; + struct inode *anon_inode; + struct erofs_domain *domain; + char *name; }; struct erofs_sb_info { From 2ef164414123fcf574aff7a0be5f71f7e60a3fec Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Sun, 18 Sep 2022 12:34:56 +0800 Subject: [PATCH 07/14] erofs: introduce 'domain_id' mount option Introduce 'domain_id' mount option to enable shared domain sementics. In which case, the related cookie is shared if two mountpoints in the same domain have the same data blob. Users could specify the name of domain by this mount option. Signed-off-by: Jia Zhu Reviewed-by: Jingbo Xu Link: https://lore.kernel.org/r/20220918043456.147-7-zhujia.zj@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 17 +++++++++++++++++ fs/erofs/sysfs.c | 19 +++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ab746181ae08..9f7fe6c04e65 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -440,6 +440,7 @@ enum { Opt_dax_enum, Opt_device, Opt_fsid, + Opt_domain_id, Opt_err }; @@ -465,6 +466,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), {} }; @@ -568,6 +570,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, return -ENOMEM; #else errorfc(fc, "fsid option not supported"); +#endif + break; + case Opt_domain_id: +#ifdef CONFIG_EROFS_FS_ONDEMAND + kfree(ctx->opt.domain_id); + ctx->opt.domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->opt.domain_id) + return -ENOMEM; +#else + errorfc(fc, "domain_id option not supported"); #endif break; default: @@ -702,6 +714,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; ctx->opt.fsid = NULL; + ctx->opt.domain_id = NULL; sbi->devs = ctx->devs; ctx->devs = NULL; @@ -846,6 +859,7 @@ static void erofs_fc_free(struct fs_context *fc) erofs_free_dev_context(ctx->devs); kfree(ctx->opt.fsid); + kfree(ctx->opt.domain_id); kfree(ctx); } @@ -916,6 +930,7 @@ static void erofs_kill_sb(struct super_block *sb) fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); + kfree(sbi->opt.domain_id); kfree(sbi); sb->s_fs_info = NULL; } @@ -1068,6 +1083,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_EROFS_FS_ONDEMAND if (opt->fsid) seq_printf(seq, ",fsid=%s", opt->fsid); + if (opt->domain_id) + seq_printf(seq, ",domain_id=%s", opt->domain_id); #endif return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index c1383e508bbe..341fb43ad587 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -201,12 +201,27 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); + char *name; + char *str = NULL; int err; + if (erofs_is_fscache_mode(sb)) { + if (sbi->opt.domain_id) { + str = kasprintf(GFP_KERNEL, "%s,%s", sbi->opt.domain_id, + sbi->opt.fsid); + if (!str) + return -ENOMEM; + name = str; + } else { + name = sbi->opt.fsid; + } + } else { + name = sb->s_id; + } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", - erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); + kfree(str); if (err) goto put_sb_kobj; return 0; From 1ae9470c3e14624b0f4d8741c22b5a94062c0e33 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Thu, 22 Sep 2022 14:24:14 +0800 Subject: [PATCH 08/14] erofs: clean up .read_folio() and .readahead() in fscache mode The implementation of these two functions in fscache mode is almost the same. Extract the same part as a generic helper to remove the code duplication. Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Link: https://lore.kernel.org/r/20220922062414.20437-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 215 ++++++++++++++++++--------------------------- 1 file changed, 84 insertions(+), 131 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index bc3556e77b93..998cd26a1b3b 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -240,113 +240,111 @@ out: return ret; } -static int erofs_fscache_read_folio_inline(struct folio *folio, - struct erofs_map_blocks *map) +/* + * Read into page cache in the range described by (@pos, @len). + * + * On return, the caller is responsible for page unlocking if the output @unlock + * is true, or the callee will take this responsibility through netfs_io_request + * interface. + * + * The return value is the number of bytes successfully handled, or negative + * error code on failure. The only exception is that, the length of the range + * instead of the error code is returned on failure after netfs_io_request is + * allocated, so that .readahead() could advance rac accordingly. + */ +static int erofs_fscache_data_read(struct address_space *mapping, + loff_t pos, size_t len, bool *unlock) { - struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - erofs_blk_t blknr; - size_t offset, len; - void *src, *dst; - - /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map->m_pa); - blknr = erofs_blknr(map->m_pa); - len = map->m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); - if (IS_ERR(src)) - return PTR_ERR(src); - - dst = kmap_local_folio(folio, 0); - memcpy(dst, src + offset, len); - memset(dst + len, 0, PAGE_SIZE - len); - kunmap_local(dst); - - erofs_put_metabuf(&buf); - return 0; -} - -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) -{ - struct inode *inode = folio_mapping(folio)->host; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; + struct netfs_io_request *rreq; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - erofs_off_t pos; - loff_t pstart; + struct iov_iter iter; + size_t count; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + *unlock = true; - pos = folio_pos(folio); map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) - goto out_unlock; - - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - folio_zero_range(folio, 0, folio_size(folio)); - goto out_uptodate; - } + return ret; if (map.m_flags & EROFS_MAP_META) { - ret = erofs_fscache_read_folio_inline(folio, &map); - goto out_uptodate; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, size; + void *src; + + /* For tail packing layout, the offset may be non-zero. */ + offset = erofs_blkoff(map.m_pa); + blknr = erofs_blknr(map.m_pa); + size = map.m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + if (IS_ERR(src)) + return PTR_ERR(src); + + iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); + if (copy_to_iter(src + offset, size, &iter) != size) + return -EFAULT; + iov_iter_zero(PAGE_SIZE - size, &iter); + erofs_put_metabuf(&buf); + return PAGE_SIZE; + } + + count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + DBG_BUGON(!count || count % PAGE_SIZE); + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); + iov_iter_zero(count, &iter); + return count; } mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; - ret = erofs_map_dev(sb, &mdev); if (ret) - goto out_unlock; + return ret; + rreq = erofs_fscache_alloc_request(mapping, pos, count); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); - rreq = erofs_fscache_alloc_request(folio_mapping(folio), - folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out_unlock; - } - - pstart = mdev.m_pa + (pos - map.m_la); - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, pstart); - -out_uptodate: - if (!ret) - folio_mark_uptodate(folio); -out_unlock: - folio_unlock(folio); - return ret; + *unlock = false; + erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, mdev.m_pa + (pos - map.m_la)); + return count; } -static void erofs_fscache_advance_folios(struct readahead_control *rac, - size_t len, bool unlock) +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) { - while (len) { - struct folio *folio = readahead_folio(rac); - len -= folio_size(folio); - if (unlock) { + bool unlock; + int ret; + + DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + + ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), + folio_size(folio), &unlock); + if (unlock) { + if (ret > 0) folio_mark_uptodate(folio); - folio_unlock(folio); - } + folio_unlock(folio); } + return ret < 0 ? ret : 0; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct inode *inode = rac->mapping->host; - struct super_block *sb = inode->i_sb; - size_t len, count, done = 0; - erofs_off_t pos; - loff_t start, offset; - int ret; + struct folio *folio; + size_t len, done = 0; + loff_t start, pos; + bool unlock; + int ret, size; if (!readahead_count(rac)) return; @@ -355,67 +353,22 @@ static void erofs_fscache_readahead(struct readahead_control *rac) len = readahead_length(rac); do { - struct erofs_map_blocks map; - struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - pos = start + done; - map.m_la = pos; - - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); - if (ret) + ret = erofs_fscache_data_read(rac->mapping, pos, + len - done, &unlock); + if (ret <= 0) return; - offset = start + done; - count = min_t(size_t, map.m_llen - (pos - map.m_la), - len - done); - - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - struct iov_iter iter; - - iov_iter_xarray(&iter, READ, &rac->mapping->i_pages, - offset, count); - iov_iter_zero(count, &iter); - - erofs_fscache_advance_folios(rac, count, true); - ret = count; - continue; - } - - if (map.m_flags & EROFS_MAP_META) { - struct folio *folio = readahead_folio(rac); - - ret = erofs_fscache_read_folio_inline(folio, &map); - if (!ret) { + size = ret; + while (size) { + folio = readahead_folio(rac); + size -= folio_size(folio); + if (unlock) { folio_mark_uptodate(folio); - ret = folio_size(folio); + folio_unlock(folio); } - - folio_unlock(folio); - continue; } - - mdev = (struct erofs_map_dev) { - .m_deviceid = map.m_deviceid, - .m_pa = map.m_pa, - }; - ret = erofs_map_dev(sb, &mdev); - if (ret) - return; - - rreq = erofs_fscache_alloc_request(rac->mapping, offset, count); - if (IS_ERR(rreq)) - return; - /* - * Drop the ref of folios here. Unlock them in - * rreq_unlock_folios() when rreq complete. - */ - erofs_fscache_advance_folios(rac, count, false); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); - if (!ret) - ret = count; - } while (ret > 0 && ((done += ret) < len)); + } while ((done += ret) < len); } static const struct address_space_operations erofs_fscache_meta_aops = { From fdffc091e6f94602558bba712b51bc16f79fd6d5 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Fri, 23 Sep 2022 10:11:21 +0800 Subject: [PATCH 09/14] erofs: support interlaced uncompressed data for compressed files Currently, uncompressed data is all handled in the shifted way, which means we have to shift the whole on-disk plain pcluster to get the logical data. However, since we are also using in-place I/O for uncompressed data, data copy will be reduced a lot if pcluster is recorded in the interlaced way as illustrated below: _______________________________________________________________ | | | |_ tail part |_ head part _| |<- blk0 ->| .. |<- blkn-2 ->|<- blkn-1 ->| The logical data then becomes: ________________________________________________________ |_ head part _|_ blk0 _| .. |_ blkn-2 _|_ tail part _| In addition, non-4k plain pclusters are also survived by the interlaced way, which can be used for non-4k lclusters as well. However, it's almost impossible to de-duplicate uncompressed data in the interlaced way, therefore shifted uncompressed data is still useful. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/8369112678604fdf4ef796626d59b1fdd0745a53.1663898962.git.huyue2@coolpad.com Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 47 ++++++++++++++++++++++++----------------- fs/erofs/erofs_fs.h | 2 ++ fs/erofs/internal.h | 1 + fs/erofs/zmap.c | 14 ++++++++---- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2d55569f96ac..51b7ac7166d9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -317,52 +317,61 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, + struct page **pagepool) { - const unsigned int nrpages_out = + const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + const unsigned int outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int righthalf = min_t(unsigned int, rq->outputsize, PAGE_SIZE - rq->pageofs_out); const unsigned int lefthalf = rq->outputsize - righthalf; + const unsigned int interlaced_offset = + rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; unsigned char *src, *dst; - if (nrpages_out > 2) { + if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); - return -EIO; + return -EFSCORRUPTED; } if (rq->out[0] == *rq->in) { - DBG_BUGON(nrpages_out != 1); + DBG_BUGON(rq->pageofs_out); return 0; } - src = kmap_atomic(*rq->in) + rq->pageofs_in; + src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; if (rq->out[0]) { - dst = kmap_atomic(rq->out[0]); - memcpy(dst + rq->pageofs_out, src, righthalf); - kunmap_atomic(dst); + dst = kmap_local_page(rq->out[0]); + memcpy(dst + rq->pageofs_out, src + interlaced_offset, + righthalf); + kunmap_local(dst); } - if (nrpages_out == 2) { - DBG_BUGON(!rq->out[1]); - if (rq->out[1] == *rq->in) { + if (outpages > inpages) { + DBG_BUGON(!rq->out[outpages - 1]); + if (rq->out[outpages - 1] != rq->in[inpages - 1]) { + dst = kmap_local_page(rq->out[outpages - 1]); + memcpy(dst, interlaced_offset ? src : + (src + righthalf), lefthalf); + kunmap_local(dst); + } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); - } else { - dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + righthalf, lefthalf); - kunmap_atomic(dst); } } - kunmap_atomic(src); + kunmap_local(src); return 0; } static struct z_erofs_decompressor decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { - .decompress = z_erofs_shifted_transform, + .decompress = z_erofs_transform_plain, .name = "shifted" }, + [Z_EROFS_COMPRESSION_INTERLACED] = { + .decompress = z_erofs_transform_plain, + .name = "interlaced" + }, [Z_EROFS_COMPRESSION_LZ4] = { .decompress = z_erofs_lz4_decompress, .name = "lz4" diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2b48373f690b..5c1de6d7ad71 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -295,11 +295,13 @@ struct z_erofs_lzma_cfgs { * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : interlaced plain pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 struct z_erofs_map_header { __le16 h_reserved1; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index ef3f7982b92d..8dbfeb5f4f84 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -420,6 +420,7 @@ struct erofs_map_blocks { enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_INTERLACED, Z_EROFS_COMPRESSION_RUNTIME_MAX }; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d58549ca1df9..7196235a441c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -679,12 +679,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) - map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; - else + } else { map->m_algorithmformat = vi->z_algorithmtype[0]; + } if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && From b15b2e307c3a1970d92da77a3ef57ee53d119d8e Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Fri, 23 Sep 2022 10:11:22 +0800 Subject: [PATCH 10/14] erofs: support on-disk compressed fragments data Introduce on-disk compressed fragments data feature. This approach adds a new field called `h_fragmentoff' in the per-file compression header to indicate the fragment offset of each tail pcluster or the whole file in the special packed inode. Similar to ztailpacking, it will also find and record the 'headlcn' of the tail pcluster when initializing per-inode zmap for making follow-on requests more easy. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/YzHKxcFTlHGgXeH9@B-P7TQMD6M-0146.local Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 33 +++++++++++++++++++++++----- fs/erofs/internal.h | 16 +++++++++++--- fs/erofs/super.c | 15 +++++++++++++ fs/erofs/sysfs.c | 2 ++ fs/erofs/zdata.c | 50 +++++++++++++++++++++++++++++++++++++++++- fs/erofs/zmap.c | 53 +++++++++++++++++++++++++++++++++++++++------ 6 files changed, 152 insertions(+), 17 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 5c1de6d7ad71..b5d763aa8ff0 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -25,6 +25,7 @@ #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -32,7 +33,8 @@ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING) + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -71,7 +73,9 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved2[38]; + __u8 reserved[6]; + __le64 packed_nid; /* nid of the special packed inode */ + __u8 reserved2[24]; }; /* @@ -296,17 +300,26 @@ struct z_erofs_lzma_cfgs { * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) * bit 4 : interlaced plain pcluster (0 - off; 1 - on) + * bit 5 : fragment pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +#define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { - __le16 h_reserved1; - /* indicates the encoded size of tailpacking data */ - __le16 h_idata_size; + union { + /* fragment data offset in the packed inode */ + __le32 h_fragmentoff; + struct { + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; + }; + }; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); @@ -315,7 +328,8 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-7 : reserved. + * bit 3-6 : reserved; + * bit 7 : move the whole file into packed inode or not. */ __u8 h_clusterbits; }; @@ -404,6 +418,10 @@ struct erofs_dirent { /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) { + const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) { + .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT + }; + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); @@ -421,6 +439,9 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); + /* exclude old compiler versions like gcc 7.5.0 */ + BUILD_BUG_ON(__builtin_constant_p(fmh) ? + fmh != cpu_to_le64(1ULL << 63) : 0); } #endif diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 8dbfeb5f4f84..9f89c1da6229 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -131,6 +131,7 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; + struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -289,6 +290,7 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -324,8 +326,13 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - erofs_off_t z_idataoff; - unsigned short z_idata_size; + union { + struct { + erofs_off_t z_idataoff; + unsigned short z_idata_size; + }; + erofs_off_t z_fragmentoff; + }; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -384,6 +391,7 @@ extern const struct address_space_operations z_erofs_aops; enum { BH_Encoded = BH_PrivateStart, BH_FullMapped, + BH_Fragment, }; /* Has a disk mapping */ @@ -394,6 +402,8 @@ enum { #define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +/* Located in the special packed inode */ +#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) struct erofs_map_blocks { struct erofs_buf buf; @@ -415,7 +425,7 @@ struct erofs_map_blocks { #define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* Used to map the whole extent if non-negligible data is requested for LZMA */ #define EROFS_GET_BLOCKS_READMORE 0x0004 -/* Used to map tail extent for tailpacking inline pcluster */ +/* Used to map tail extent for tailpacking inline or fragment pcluster */ #define EROFS_GET_BLOCKS_FINDTAIL 0x0008 enum { diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9f7fe6c04e65..ce20562ca91f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -381,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb) #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); +#ifdef CONFIG_EROFS_FS_ZIP + sbi->packed_inode = NULL; + if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { + sbi->packed_inode = + erofs_iget(sb, le64_to_cpu(dsb->packed_nid), false); + if (IS_ERR(sbi->packed_inode)) { + ret = PTR_ERR(sbi->packed_inode); + goto out; + } + } +#endif sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); @@ -411,6 +422,8 @@ static int erofs_read_superblock(struct super_block *sb) erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); + if (erofs_sb_has_fragments(sbi)) + erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -947,6 +960,8 @@ static void erofs_put_super(struct super_block *sb) #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; + iput(sbi->packed_inode); + sbi->packed_inode = NULL; #endif erofs_fscache_unregister_fs(sb); } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 341fb43ad587..dd6eb7eccf9a 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -76,6 +76,7 @@ EROFS_ATTR_FEATURE(device_table); EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); +EROFS_ATTR_FEATURE(fragments); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -86,6 +87,7 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), + ATTR_LIST(fragments), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5792ca9e0d5e..c92a72f5bca6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -650,6 +650,35 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, la < fe->headoffset; } +static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, + struct page *page, unsigned int pageofs, + unsigned int len) +{ + struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u8 *src, *dst; + unsigned int i, cnt; + + pos += EROFS_I(inode)->z_fragmentoff; + for (i = 0; i < len; i += cnt) { + cnt = min_t(unsigned int, len - i, + EROFS_BLKSIZ - erofs_blkoff(pos)); + src = erofs_bread(&buf, packed_inode, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(src)) { + erofs_put_metabuf(&buf); + return PTR_ERR(src); + } + + dst = kmap_local_page(page); + memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + kunmap_local(dst); + pos += cnt; + } + erofs_put_metabuf(&buf); + return 0; +} + static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { @@ -688,7 +717,8 @@ repeat: /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED)) + if (!(map->m_flags & EROFS_MAP_MAPPED) || + map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; err = z_erofs_collector_begin(fe); @@ -735,6 +765,24 @@ hitted: zero_user_segment(page, cur, end); goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { + unsigned int pageofs, skip, len; + + if (offset > map->m_la) { + pageofs = 0; + skip = offset - map->m_la; + } else { + pageofs = map->m_la & ~PAGE_MASK; + skip = 0; + } + len = min_t(unsigned int, map->m_llen - skip, end - cur); + err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + if (err) + goto out; + ++spiltted; + tight = false; + goto next_part; + } exclusive = (!cur && (!spiltted || tight)); if (cur) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7196235a441c..d1723910251c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode) struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && + !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; @@ -55,10 +55,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - !erofs_sb_has_ztailpacking(EROFS_SB(sb)) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), @@ -69,6 +65,16 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. + */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto unmap_done; + } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; @@ -123,6 +129,20 @@ unmap_done: if (err < 0) goto out_unlock; } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_unlock; + } /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); @@ -598,6 +618,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -666,12 +687,19 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_llen = end - map->m_la; - if (flags & EROFS_GET_BLOCKS_FINDTAIL) + if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; + /* for non-compact indexes, fragmentoff is 64 bits */ + if (fragment && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + vi->z_fragmentoff |= (u64)m.pblk << 32; + } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; + } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_FRAGMENT; } else { map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); @@ -715,6 +743,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { + struct erofs_inode *const vi = EROFS_I(inode); int err = 0; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -731,6 +760,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto out; + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | + EROFS_MAP_FRAGMENT; + goto out; + } + err = z_erofs_do_map_blocks(inode, map, flags); out: trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); @@ -757,7 +795,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; From 5c2a64252c5dc4cfe78e5b2a531c118894e3d155 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 23 Sep 2022 09:49:15 +0800 Subject: [PATCH 11/14] erofs: introduce partial-referenced pclusters Due to deduplication for compressed data, pclusters can be partially referenced with their prefixes. Together with the user-space implementation, it enables EROFS variable-length global compressed data deduplication with rolling hash. Link: https://lore.kernel.org/r/20220923014915.4362-1-hsiangkao@linux.alibaba.com Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/decompressor_lzma.c | 3 +++ fs/erofs/erofs_fs.h | 7 ++++++- fs/erofs/internal.h | 4 ++++ fs/erofs/super.c | 2 ++ fs/erofs/sysfs.c | 2 ++ fs/erofs/zdata.c | 1 + fs/erofs/zmap.c | 6 +++++- 7 files changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 5e59b3f523eb..091fd5adf818 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,6 +217,9 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; + if (!rq->out[no] && rq->fillgaps) /* deduped */ + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b5d763aa8ff0..dbcd24371002 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -26,6 +26,7 @@ #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 +#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -34,7 +35,8 @@ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS) + EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ + EROFS_FEATURE_INCOMPAT_DEDUPE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -371,6 +373,9 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ +#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) + /* * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the * compressed block count of a compressed extent (in logical clusters, aka. diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9f89c1da6229..a6333c283e3d 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -291,6 +291,7 @@ EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) +EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -392,6 +393,7 @@ enum { BH_Encoded = BH_PrivateStart, BH_FullMapped, BH_Fragment, + BH_Partialref, }; /* Has a disk mapping */ @@ -404,6 +406,8 @@ enum { #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) /* Located in the special packed inode */ #define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +/* The extent refers to partial decompressed data */ +#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) struct erofs_map_blocks { struct erofs_buf buf; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ce20562ca91f..8040534ae5c0 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -424,6 +424,8 @@ static int erofs_read_superblock(struct super_block *sb) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); if (erofs_sb_has_fragments(sbi)) erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); + if (erofs_sb_has_dedupe(sbi)) + erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index dd6eb7eccf9a..783bb7b21b51 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -77,6 +77,7 @@ EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); +EROFS_ATTR_FEATURE(dedupe); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -88,6 +89,7 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), ATTR_LIST(fragments), + ATTR_LIST(dedupe), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c92a72f5bca6..cce56dde135c 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -814,6 +814,7 @@ retry: fe->pcl->multibases = true; if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && fe->pcl->length == map->m_llen) fe->pcl->partial = false; if (fe->pcl->length < offset + end - map->m_la) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d1723910251c..ccdddb755be8 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -163,6 +163,7 @@ struct z_erofs_maprecorder { u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; + bool partialref; }; static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, @@ -221,6 +222,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) + m->partialref = true; m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -684,7 +687,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EOPNOTSUPP; goto unmap_out; } - + if (m.partialref) + map->m_flags |= EROFS_MAP_PARTIAL_REF; map->m_llen = end - map->m_la; if (flags & EROFS_GET_BLOCKS_FINDTAIL) { From 31da107fdb0a01b889af41cd94a9904e1f6ffaa6 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 27 Sep 2022 11:25:18 +0800 Subject: [PATCH 12/14] erofs: fold in z_erofs_reload_indexes() The name of this function looks not very accurate compared to it's implementation and it's only a wrapper to erofs_read_metabuf(). So, let's fold it directly instead. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20220927032518.25266-1-zbestahu@gmail.com Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index ccdddb755be8..4cecd32b87c6 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -166,18 +166,6 @@ struct z_erofs_maprecorder { bool partialref; }; -static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, - erofs_blk_t eblk) -{ - struct super_block *const sb = m->inode->i_sb; - - m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, - EROFS_KMAP_ATOMIC); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - return 0; -} - static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned long lcn) { @@ -190,11 +178,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; - int err; - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); m->lcn = lcn; @@ -393,7 +381,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; - int err; if (lclusterbits != 12) return -EOPNOTSUPP; @@ -430,9 +417,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } From 53a7f9961cddf505a9f3a7baa017e5af31838840 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 27 Sep 2022 14:36:06 +0800 Subject: [PATCH 13/14] erofs: clean up unnecessary code and comments Some conditional macros and comments are useless. Link: https://lore.kernel.org/r/20220927063607.54832-1-hsiangkao@linux.alibaba.com Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/internal.h | 2 -- fs/erofs/namei.c | 11 +---------- fs/erofs/xattr.h | 2 -- fs/erofs/zmap.c | 3 +-- 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a6333c283e3d..0318530bc78a 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -196,7 +196,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#ifdef CONFIG_EROFS_FS_ZIP #define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) /* basic unit of the workstation of a super_block */ @@ -236,7 +235,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return atomic_cond_read_relaxed(&grp->refcount, VAL != EROFS_LOCKED_MAGIC); } -#endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ #define LOG_BLOCK_SIZE PAGE_SHIFT diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index fd75506799c4..afbb80d4e2f1 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -185,7 +185,6 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, if (IS_ERR(de)) return PTR_ERR(de); - /* the target page has been mapped */ if (ndirents) de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); @@ -197,9 +196,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, return PTR_ERR_OR_ZERO(de); } -/* NOTE: i_mutex is already held by vfs */ -static struct dentry *erofs_lookup(struct inode *dir, - struct dentry *dentry, +static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int err; @@ -207,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir, unsigned int d_type; struct inode *inode; - DBG_BUGON(!d_really_is_negative(dentry)); - /* dentry must be unhashed in lookup, no need to worry about */ - DBG_BUGON(!d_unhashed(dentry)); - trace_erofs_lookup(dir, dentry, flags); - /* file name exceeds fs limit */ if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - /* false uninitialized warnings on gcc 4.8.x */ err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); if (err == -ENOENT) { diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 332462c59f11..0a43c9ee9f8f 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; -#ifdef CONFIG_EROFS_FS_SECURITY extern const struct xattr_handler erofs_xattr_security_handler; -#endif static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 4cecd32b87c6..44c27ef39c43 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -731,8 +731,7 @@ out: return err; } -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); From 312fe643ad1153fe0337c46f4573030d0c2bac73 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 27 Sep 2022 14:36:07 +0800 Subject: [PATCH 14/14] erofs: clean up erofs_iget() isdir indicated REQ_META|REQ_PRIO which no longer works now. Get rid of isdir entirely. Link: https://lore.kernel.org/r/20220927063607.54832-2-hsiangkao@linux.alibaba.com Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 24 ++++++++---------------- fs/erofs/internal.h | 2 +- fs/erofs/namei.c | 2 +- fs/erofs/super.c | 8 ++++---- include/trace/events/erofs.h | 11 ++++------- 5 files changed, 18 insertions(+), 29 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 16cf9a283557..ad2a82f2eb4c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -241,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, return 0; } -static int erofs_fill_inode(struct inode *inode, int isdir) +static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -249,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) unsigned int ofs; int err = 0; - trace_erofs_fill_inode(inode, isdir); + trace_erofs_fill_inode(inode); /* read inode base data from disk */ kaddr = erofs_read_inode(&buf, inode, &ofs); @@ -324,21 +324,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque) return 0; } -static inline struct inode *erofs_iget_locked(struct super_block *sb, - erofs_nid_t nid) +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { const unsigned long hashval = erofs_inode_hash(nid); + struct inode *inode; - return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, erofs_iget_set_actor, &nid); -} - -struct inode *erofs_iget(struct super_block *sb, - erofs_nid_t nid, - bool isdir) -{ - struct inode *inode = erofs_iget_locked(sb, nid); - if (!inode) return ERR_PTR(-ENOMEM); @@ -348,10 +340,10 @@ struct inode *erofs_iget(struct super_block *sb, vi->nid = nid; - err = erofs_fill_inode(inode, isdir); - if (!err) + err = erofs_fill_inode(inode); + if (!err) { unlock_new_inode(inode); - else { + } else { iget_failed(inode); inode = ERR_PTR(err); } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 0318530bc78a..1701df48c446 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -492,7 +492,7 @@ extern const struct inode_operations erofs_generic_iops; extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; -struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index afbb80d4e2f1..0dc34721080c 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -219,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, } else { erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, dentry, nid, d_type); - inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + inode = erofs_iget(dir->i_sb, nid); } return d_splice_alias(inode, dentry); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 8040534ae5c0..2cf96ce1c32e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -385,7 +385,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->packed_inode = NULL; if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { sbi->packed_inode = - erofs_iget(sb, le64_to_cpu(dsb->packed_nid), false); + erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); if (IS_ERR(sbi->packed_inode)) { ret = PTR_ERR(sbi->packed_inode); goto out; @@ -668,7 +668,7 @@ static int erofs_init_managed_cache(struct super_block *sb) { return 0; } static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { - return erofs_iget(sb, ino, false); + return erofs_iget(sb, ino); } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, @@ -694,7 +694,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); - return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR)); + return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { @@ -782,7 +782,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) #endif /* get the root inode */ - inode = erofs_iget(sb, ROOT_NID(sbi), true); + inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 57de057bd503..4f4c44ea3a65 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -53,15 +53,14 @@ TRACE_EVENT(erofs_lookup, ); TRACE_EVENT(erofs_fill_inode, - TP_PROTO(struct inode *inode, int isdir), - TP_ARGS(inode, isdir), + TP_PROTO(struct inode *inode), + TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(erofs_blk_t, blkaddr ) __field(unsigned int, ofs ) - __field(int, isdir ) ), TP_fast_assign( @@ -69,13 +68,11 @@ TRACE_EVENT(erofs_fill_inode, __entry->nid = EROFS_I(inode)->nid; __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); - __entry->isdir = isdir; ), - TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u, isdir %d", + TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u", show_dev_nid(__entry), - __entry->blkaddr, __entry->ofs, - __entry->isdir) + __entry->blkaddr, __entry->ofs) ); TRACE_EVENT(erofs_readpage,