From c39de951282df9a60ef70664e4378d88006b2670 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:04:53 +0300 Subject: [PATCH 001/270] fs/ntfs3: Improve alternative boot processing Signed-off-by: Konstantin Komarov --- fs/ntfs3/super.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 9153dffde950..09d61c6c90aa 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -866,6 +866,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u16 fn, ao; u8 cluster_bits; u32 boot_off = 0; + sector_t boot_block = 0; const char *hint = "Primary boot"; /* Save original dev_size. Used with alternative boot. */ @@ -873,11 +874,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->volume.blocks = dev_size >> PAGE_SHIFT; - bh = ntfs_bread(sb, 0); +read_boot: + bh = ntfs_bread(sb, boot_block); if (!bh) - return -EIO; + return boot_block ? -EINVAL : -EIO; -check_boot: err = -EINVAL; /* Corrupted image; do not read OOB */ @@ -1108,26 +1109,24 @@ check_boot: } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { + brelse(bh); + + if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); u64 lbo = dev_size0 - sizeof(*boot); - /* - * Try alternative boot (last sector) - */ - brelse(bh); - - sb_set_blocksize(sb, block_size); - bh = ntfs_bread(sb, lbo >> blksize_bits(block_size)); - if (!bh) - return -EINVAL; - + boot_block = lbo >> blksize_bits(block_size); boot_off = lbo & (block_size - 1); - hint = "Alternative boot"; - dev_size = dev_size0; /* restore original size. */ - goto check_boot; + if (boot_block && block_size >= boot_off + sizeof(*boot)) { + /* + * Try alternative boot (last sector) + */ + sb_set_blocksize(sb, block_size); + hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. */ + goto read_boot; + } } - brelse(bh); return err; } From 22457c047ed971f2f2e33be593ddfabd9639a409 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:17:07 +0300 Subject: [PATCH 002/270] fs/ntfs3: Modified fix directory element type detection Unfortunately reparse attribute is used for many purposes (several dozens). It is not possible here to know is this name symlink or not. To get exactly the type of name we should to open inode (read mft). getattr for opened file (fstat) correctly returns symlink. Signed-off-by: Konstantin Komarov --- fs/ntfs3/dir.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index ec0566b322d5..22ede4da0450 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ - if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) - dt_type = DT_LNK; - else - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* + * NTFS: symlinks are "dir + reparse" or "file + reparse" + * Unfortunately reparse attribute is used for many purposes (several dozens). + * It is not possible here to know is this name symlink or not. + * To get exactly the type of name we should to open inode (read mft). + * getattr for opened file (fstat) correctly returns symlink. + */ + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + + /* + * It is not reliable to detect the type of name using duplicated information + * stored in parent directory. + * The only correct way to get the type of name - read MFT record and find ATTR_STD. + * The code below is not good idea. + * It does additional locks/reads just to get the type of name. + * Should we use additional mount option to enable branch below? + */ + if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) && + ino != ni->mi.rno) { + struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL); + if (!IS_ERR_OR_NULL(inode)) { + dt_type = fs_umode_to_dtype(inode->i_mode); + iput(inode); + } + } return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } From 6a799c928b78b14999b7705c4cca0f88e297fe96 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:24:33 +0300 Subject: [PATCH 003/270] fs/ntfs3: Improve ntfs_dir_count Signed-off-by: Konstantin Komarov --- fs/ntfs3/dir.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 22ede4da0450..726122ecd39b 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -515,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, struct INDEX_HDR *hdr; const struct ATTR_FILE_NAME *fname; u32 e_size, off, end; - u64 vbo = 0; size_t drs = 0, fles = 0, bit = 0; - loff_t i_size = ni->vfs_inode.i_size; struct indx_node *node = NULL; - u8 index_bits = ni->dir.index_bits; + size_t max_indx = ni->vfs_inode.i_size >> ni->dir.index_bits; if (is_empty) *is_empty = true; @@ -563,7 +561,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, fles += 1; } - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_used_bit(&ni->dir, ni, &bit); @@ -573,8 +571,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, if (bit == MINUS_ONE_T) goto out; - vbo = (u64)bit << index_bits; - if (vbo >= i_size) + if (bit >= max_indx) goto out; err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, @@ -584,7 +581,6 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, hdr = &node->index->ihdr; bit += 1; - vbo = (u64)bit << ni->dir.idx2vbn_bits; } out: From 1918c10e137eae266b8eb0ab1cc14421dcb0e3e2 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:26:31 +0300 Subject: [PATCH 004/270] fs/ntfs3: Correct hard links updating when dealing with DOS names Signed-off-by: Konstantin Komarov --- fs/ntfs3/record.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 53629b1f65e9..7b6423584eae 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, return false; if (ni && is_attr_indexed(attr)) { - le16_add_cpu(&ni->mi.mrec->hard_links, -1); - ni->mi.dirty = true; + u16 links = le16_to_cpu(ni->mi.mrec->hard_links); + struct ATTR_FILE_NAME *fname = + attr->type != ATTR_NAME ? + NULL : + resident_data_ex(attr, + SIZEOF_ATTRIBUTE_FILENAME); + if (fname && fname->type == FILE_NAME_DOS) { + /* Do not decrease links count deleting DOS name. */ + } else if (!links) { + /* minor error. Not critical. */ + } else { + ni->mi.mrec->hard_links = cpu_to_le16(links - 1); + ni->mi.dirty = true; + } } used -= asize; From 85ba2a75faee759809a7e43b4c103ac59bac1026 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:34:24 +0300 Subject: [PATCH 005/270] fs/ntfs3: Print warning while fixing hard links count Signed-off-by: Konstantin Komarov --- fs/ntfs3/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 5e3d71374918..fa6c7965473c 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -412,7 +412,6 @@ end_enum: goto out; if (!is_match && name) { - /* Reuse rec as buffer for ascii name. */ err = -ENOENT; goto out; } @@ -427,6 +426,7 @@ end_enum: if (names != le16_to_cpu(rec->hard_links)) { /* Correct minor error on the fly. Do not mark inode as dirty. */ + ntfs_inode_warn(inode, "Correct links count -> %u.", names); rec->hard_links = cpu_to_le16(names); ni->mi.dirty = true; } From 865e7a7700d930d34895a70f8af2eb4e778a5b0e Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:42:04 +0300 Subject: [PATCH 006/270] fs/ntfs3: Reduce stack usage Signed-off-by: Konstantin Komarov --- fs/ntfs3/fslog.c | 214 +++++++++++++++++++++-------------------------- 1 file changed, 96 insertions(+), 118 deletions(-) diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 98ccb6650858..7dbb000fc691 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -974,6 +974,16 @@ skip_looking: return e; } +struct restart_info { + u64 last_lsn; + struct RESTART_HDR *r_page; + u32 vbo; + bool chkdsk_was_run; + bool valid_page; + bool initialized; + bool restart; +}; + #define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) #define NTFSLOG_WRAPPED 0x00000001 @@ -987,6 +997,7 @@ struct ntfs_log { struct ntfs_inode *ni; u32 l_size; + u32 orig_file_size; u32 sys_page_size; u32 sys_page_mask; u32 page_size; @@ -1040,6 +1051,8 @@ struct ntfs_log { struct CLIENT_ID client_id; u32 client_undo_commit; + + struct restart_info rst_info, rst_info2; }; static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) @@ -1105,16 +1118,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log, lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; } -struct restart_info { - u64 last_lsn; - struct RESTART_HDR *r_page; - u32 vbo; - bool chkdsk_was_run; - bool valid_page; - bool initialized; - bool restart; -}; - static int read_log_page(struct ntfs_log *log, u32 vbo, struct RECORD_PAGE_HDR **buffer, bool *usa_error) { @@ -1176,7 +1179,7 @@ out: * restart page header. It will stop the first time we find a * valid page header. */ -static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, +static int log_read_rst(struct ntfs_log *log, bool first, struct restart_info *info) { u32 skip, vbo; @@ -1192,7 +1195,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, } /* Loop continuously until we succeed. */ - for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) { + for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) { bool usa_error; bool brst, bchk; struct RESTART_AREA *ra; @@ -1285,22 +1288,17 @@ check_result: /* * Ilog_init_pg_hdr - Init @log from restart page header. */ -static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, - u32 page_size, u16 major_ver, u16 minor_ver) +static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver) { - log->sys_page_size = sys_page_size; - log->sys_page_mask = sys_page_size - 1; - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->sys_page_size = log->page_size; + log->sys_page_mask = log->page_mask; log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; if (!log->clst_per_page) log->clst_per_page = 1; - log->first_page = major_ver >= 2 ? - 0x22 * page_size : - ((sys_page_size << 1) + (page_size << 1)); + log->first_page = major_ver >= 2 ? 0x22 * log->page_size : + 4 * log->page_size; log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1308,12 +1306,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, /* * log_create - Init @log in cases when we don't have a restart area to use. */ -static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn, +static void log_create(struct ntfs_log *log, const u64 last_lsn, u32 open_log_count, bool wrapped, bool use_multi_page) { - log->l_size = l_size; /* All file offsets must be quadword aligned. */ - log->file_data_bits = blksize_bits(l_size) - 3; + log->file_data_bits = blksize_bits(log->l_size) - 3; log->seq_num_mask = (8 << log->file_data_bits) - 1; log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; log->seq_num = (last_lsn >> log->file_data_bits) + 2; @@ -3720,10 +3717,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct ntfs_sb_info *sbi = ni->mi.sbi; struct ntfs_log *log; - struct restart_info rst_info, rst_info2; - u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0; + u64 rec_lsn, checkpt_lsn = 0, rlsn = 0; struct ATTR_NAME_ENTRY *attr_names = NULL; - struct ATTR_NAME_ENTRY *ane; struct RESTART_TABLE *dptbl = NULL; struct RESTART_TABLE *trtbl = NULL; const struct RESTART_TABLE *rt; @@ -3741,9 +3736,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) struct TRANSACTION_ENTRY *tr; struct DIR_PAGE_ENTRY *dp; u32 i, bytes_per_attr_entry; - u32 l_size = ni->vfs_inode.i_size; - u32 orig_file_size = l_size; - u32 page_size, vbo, tail, off, dlen; + u32 vbo, tail, off, dlen; u32 saved_len, rec_len, transact_id; bool use_second_page; struct RESTART_AREA *ra2, *ra = NULL; @@ -3758,52 +3751,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) u16 t16; u32 t32; - /* Get the size of page. NOTE: To replay we can use default page. */ -#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 - page_size = norm_file_page(PAGE_SIZE, &l_size, true); -#else - page_size = norm_file_page(PAGE_SIZE, &l_size, false); -#endif - if (!page_size) - return -EINVAL; - log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); if (!log) return -ENOMEM; log->ni = ni; - log->l_size = l_size; - log->one_page_buf = kmalloc(page_size, GFP_NOFS); + log->l_size = log->orig_file_size = ni->vfs_inode.i_size; + /* Get the size of page. NOTE: To replay we can use default page. */ +#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true); +#else + log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false); +#endif + if (!log->page_size) { + err = -EINVAL; + goto out; + } + + log->one_page_buf = kmalloc(log->page_size, GFP_NOFS); if (!log->one_page_buf) { err = -ENOMEM; goto out; } - log->page_size = page_size; - log->page_mask = page_size - 1; - log->page_bits = blksize_bits(page_size); + log->page_mask = log->page_size - 1; + log->page_bits = blksize_bits(log->page_size); /* Look for a restart area on the disk. */ - memset(&rst_info, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, true, &rst_info); + err = log_read_rst(log, true, &log->rst_info); if (err) goto out; /* remember 'initialized' */ - *initialized = rst_info.initialized; + *initialized = log->rst_info.initialized; - if (!rst_info.restart) { - if (rst_info.initialized) { + if (!log->rst_info.restart) { + if (log->rst_info.initialized) { /* No restart area but the file is not initialized. */ err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_u32(), false, false); - - log->ra = ra; + log_init_pg_hdr(log, 1, 1); + log_create(log, 0, get_random_u32(), false, false); ra = log_create_ra(log); if (!ra) { @@ -3820,25 +3811,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) * If the restart offset above wasn't zero then we won't * look for a second restart. */ - if (rst_info.vbo) + if (log->rst_info.vbo) goto check_restart_area; - memset(&rst_info2, 0, sizeof(struct restart_info)); - err = log_read_rst(log, l_size, false, &rst_info2); + err = log_read_rst(log, false, &log->rst_info2); if (err) goto out; /* Determine which restart area to use. */ - if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) + if (!log->rst_info2.restart || + log->rst_info2.last_lsn <= log->rst_info.last_lsn) goto use_first_page; use_second_page = true; - if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) { + if (log->rst_info.chkdsk_was_run && + log->page_size != log->rst_info.vbo) { struct RECORD_PAGE_HDR *sp = NULL; bool usa_error; - if (!read_log_page(log, page_size, &sp, &usa_error) && + if (!read_log_page(log, log->page_size, &sp, &usa_error) && sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { use_second_page = false; } @@ -3846,52 +3838,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } if (use_second_page) { - kfree(rst_info.r_page); - memcpy(&rst_info, &rst_info2, sizeof(struct restart_info)); - rst_info2.r_page = NULL; + kfree(log->rst_info.r_page); + memcpy(&log->rst_info, &log->rst_info2, + sizeof(struct restart_info)); + log->rst_info2.r_page = NULL; } use_first_page: - kfree(rst_info2.r_page); + kfree(log->rst_info2.r_page); check_restart_area: /* * If the restart area is at offset 0, we want * to write the second restart area first. */ - log->init_ra = !!rst_info.vbo; + log->init_ra = !!log->rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ - ra2 = rst_info.valid_page ? - Add2Ptr(rst_info.r_page, - le16_to_cpu(rst_info.r_page->ra_off)) : + ra2 = log->rst_info.valid_page ? + Add2Ptr(log->rst_info.r_page, + le16_to_cpu(log->rst_info.r_page->ra_off)) : NULL; - if (rst_info.chkdsk_was_run || + if (log->rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { bool wrapped = false; bool use_multi_page = false; u32 open_log_count; /* Do some checks based on whether we have a valid log page. */ - if (!rst_info.valid_page) { - open_log_count = get_random_u32(); - goto init_log_instance; - } - open_log_count = le32_to_cpu(ra2->open_log_count); + open_log_count = log->rst_info.valid_page ? + le32_to_cpu(ra2->open_log_count) : + get_random_u32(); - /* - * If the restart page size isn't changing then we want to - * check how much work we need to do. - */ - if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size)) - goto init_log_instance; + log_init_pg_hdr(log, 1, 1); -init_log_instance: - log_init_pg_hdr(log, page_size, page_size, 1, 1); - - log_create(log, l_size, rst_info.last_lsn, open_log_count, - wrapped, use_multi_page); + log_create(log, log->rst_info.last_lsn, open_log_count, wrapped, + use_multi_page); ra = log_create_ra(log); if (!ra) { @@ -3916,28 +3899,27 @@ init_log_instance: * use the log file. We must use the system page size instead of the * default size if there is not a clean shutdown. */ - t32 = le32_to_cpu(rst_info.r_page->sys_page_size); - if (page_size != t32) { - l_size = orig_file_size; - page_size = - norm_file_page(t32, &l_size, t32 == DefaultLogPageSize); + t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size); + if (log->page_size != t32) { + log->l_size = log->orig_file_size; + log->page_size = norm_file_page(t32, &log->l_size, + t32 == DefaultLogPageSize); } - if (page_size != t32 || - page_size != le32_to_cpu(rst_info.r_page->page_size)) { + if (log->page_size != t32 || + log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) { err = -EINVAL; goto out; } /* If the file size has shrunk then we won't mount it. */ - if (l_size < le64_to_cpu(ra2->l_size)) { + if (log->l_size < le64_to_cpu(ra2->l_size)) { err = -EINVAL; goto out; } - log_init_pg_hdr(log, page_size, page_size, - le16_to_cpu(rst_info.r_page->major_ver), - le16_to_cpu(rst_info.r_page->minor_ver)); + log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver), + le16_to_cpu(log->rst_info.r_page->minor_ver)); log->l_size = le64_to_cpu(ra2->l_size); log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); @@ -3945,7 +3927,7 @@ init_log_instance: log->seq_num_mask = (8 << log->file_data_bits) - 1; log->last_lsn = le64_to_cpu(ra2->current_lsn); log->seq_num = log->last_lsn >> log->file_data_bits; - log->ra_off = le16_to_cpu(rst_info.r_page->ra_off); + log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off); log->restart_size = log->sys_page_size - log->ra_off; log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); log->ra_size = le16_to_cpu(ra2->ra_len); @@ -4045,7 +4027,7 @@ find_oldest: log->current_avail = current_log_avail(log); /* Remember which restart area to write first. */ - log->init_ra = rst_info.vbo; + log->init_ra = log->rst_info.vbo; process_log: /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ @@ -4105,7 +4087,7 @@ process_log: log->client_id.seq_num = cr->seq_num; log->client_id.client_idx = client; - err = read_rst_area(log, &rst, &ra_lsn); + err = read_rst_area(log, &rst, &checkpt_lsn); if (err) goto out; @@ -4114,9 +4096,8 @@ process_log: bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28; - checkpt_lsn = le64_to_cpu(rst->check_point_start); - if (!checkpt_lsn) - checkpt_lsn = ra_lsn; + if (rst->check_point_start) + checkpt_lsn = le64_to_cpu(rst->check_point_start); /* Allocate and Read the Transaction Table. */ if (!rst->transact_table_len) @@ -4330,23 +4311,20 @@ check_attr_table: lcb = NULL; check_attribute_names2: - if (!rst->attr_names_len) - goto trace_attribute_table; - - ane = attr_names; - if (!oatbl) - goto trace_attribute_table; - while (ane->off) { - /* TODO: Clear table on exit! */ - oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); - t16 = le16_to_cpu(ane->name_bytes); - oe->name_len = t16 / sizeof(short); - oe->ptr = ane->name; - oe->is_attr_name = 2; - ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16); + if (rst->attr_names_len && oatbl) { + struct ATTR_NAME_ENTRY *ane = attr_names; + while (ane->off) { + /* TODO: Clear table on exit! */ + oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); + t16 = le16_to_cpu(ane->name_bytes); + oe->name_len = t16 / sizeof(short); + oe->ptr = ane->name; + oe->is_attr_name = 2; + ane = Add2Ptr(ane, + sizeof(struct ATTR_NAME_ENTRY) + t16); + } } -trace_attribute_table: /* * If the checkpt_lsn is zero, then this is a freshly * formatted disk and we have no work to do. @@ -5189,7 +5167,7 @@ out: kfree(oatbl); kfree(dptbl); kfree(attr_names); - kfree(rst_info.r_page); + kfree(log->rst_info.r_page); kfree(ra); kfree(log->one_page_buf); From a8b0c9fc3a2dba07f697ef7825e04363ff12f071 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:46:16 +0300 Subject: [PATCH 007/270] fs/ntfs3: Fix multithreaded stress test Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrib.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 63f70259edc0..4b78b669a3bd 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi; u8 cluster_bits; - struct ATTRIB *attr = NULL, *attr_b; + struct ATTRIB *attr, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; @@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, *len = 0; up_read(&ni->file.run_lock); - if (*len) { - if (*lcn != SPARSE_LCN || !new) - return 0; /* Fast normal way without allocation. */ - else if (clen > *len) - clen = *len; - } + if (*len && (*lcn != SPARSE_LCN || !new)) + return 0; /* Fast normal way without allocation. */ /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; @@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, ni_lock(ni); down_write(&ni->file.run_lock); + /* Repeat the code above (under write lock). */ + if (!run_lookup_entry(run, vcn, lcn, len, NULL)) + *len = 0; + + if (*len) { + if (*lcn != SPARSE_LCN || !new) + goto out; /* normal way without allocation. */ + if (clen > *len) + clen = *len; + } + le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { From d155617006ebc172a80d3eb013c4b867f9a8ada4 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 11:47:30 +0300 Subject: [PATCH 008/270] fs/ntfs3: Fix detected field-spanning write (size 8) of single field "le->name" Signed-off-by: Konstantin Komarov --- fs/ntfs3/ntfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 86aecbb01a92..13e96fc63dae 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -523,7 +523,7 @@ struct ATTR_LIST_ENTRY { __le64 vcn; // 0x08: Starting VCN of this attribute. struct MFT_REF ref; // 0x10: MFT record number with attribute. __le16 id; // 0x18: struct ATTRIB ID. - __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset. + __le16 name[]; // 0x1A: Just to align. To get real name can use name_off. }; // sizeof(0x20) From a40b73f608e7de2120fdb9ddc8970421b3c50bc9 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 12:08:36 +0300 Subject: [PATCH 009/270] fs/ntfs3: Correct use bh_read Signed-off-by: Konstantin Komarov --- fs/ntfs3/file.c | 19 +++++++++---------- fs/ntfs3/inode.c | 7 +++---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a5a30a24ce5d..5691f04e6751 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) u32 bh_next, bh_off, to; sector_t iblock; struct folio *folio; + bool dirty = false; for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; @@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) /* Ok, it's mapped. Make sure it's up-to-date. */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = bh_read(bh, 0); - if (err < 0) { - folio_unlock(folio); - folio_put(folio); - goto out; - } + else if (bh_read(bh, 0) < 0) { + err = -EIO; + folio_unlock(folio); + folio_put(folio); + goto out; } mark_buffer_dirty(bh); - } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); folio_zero_segment(folio, from, to); + dirty = true; folio_unlock(folio); folio_put(folio); cond_resched(); } out: - mark_inode_dirty(inode); + if (dirty) + mark_inode_dirty(inode); return err; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index fa6c7965473c..bba0208c4afd 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -345,9 +345,7 @@ next_attr: inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer .PrintNameLength) / sizeof(u16); - ni->i_valid = inode->i_size; - /* Clear directory bit. */ if (ni->ni_flags & NI_FLAG_DIR) { indx_clear(&ni->dir); @@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, off = vbo & (PAGE_SIZE - 1); folio_set_bh(bh, folio, off); - err = bh_read(bh, 0); - if (err < 0) + if (bh_read(bh, 0) < 0) { + err = -EIO; goto out; + } folio_zero_segment(folio, off + voff, off + block_size); } } From 4dea9cd522424d3002894c20b729c6fbfb6fc22b Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 12:16:49 +0300 Subject: [PATCH 010/270] fs/ntfs3: Add file_modified Signed-off-by: Konstantin Komarov --- fs/ntfs3/file.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 5691f04e6751..bb80ce2eec2f 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -632,11 +632,17 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) &ni->file.run, i_size, &ni->i_valid, true, NULL); ni_unlock(ni); + if (err) + goto out; } else if (new_size > i_size) { inode->i_size = new_size; } } + err = file_modified(file); + if (err) + goto out; + out: if (map_locked) filemap_invalidate_unlock(mapping); @@ -1040,6 +1046,7 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; + int err; struct ntfs_inode *ni = ntfs_i(inode); if (is_encrypted(ni)) { @@ -1067,6 +1074,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; + err = file_modified(iocb->ki_filp); + if (err) { + ret = err; + goto out; + } + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { /* Should never be here, see ntfs_file_open(). */ ret = -EOPNOTSUPP; From e50f9560b8168a625703a3e7fe1fde9fa53f0837 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 12:17:46 +0300 Subject: [PATCH 011/270] fs/ntfs3: Drop suid and sgid bits as a part of fpunch Signed-off-by: Konstantin Komarov --- fs/ntfs3/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index bb80ce2eec2f..0ff5d3af2889 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -498,10 +498,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo, len, &frame_size); ni_unlock(ni); + if (!err) + goto ok; + if (err != E_NTFS_NOTALIGNED) goto out; /* Process not aligned punch. */ + err = 0; mask = frame_size - 1; vbo_a = (vbo + mask) & ~mask; end_a = end & ~mask; @@ -524,6 +528,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL); ni_unlock(ni); + if (err) + goto out; } } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { /* @@ -563,6 +569,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_insert_range(ni, vbo, len); ni_unlock(ni); + if (err) + goto out; } else { /* Check new size. */ u8 cluster_bits = sbi->cluster_bits; @@ -639,6 +647,7 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) } } +ok: err = file_modified(file); if (err) goto out; From 6c3684e703837d2116b5cf4beb37aa7145a66b60 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 12:19:37 +0300 Subject: [PATCH 012/270] fs/ntfs3: Implement super_operations::shutdown Signed-off-by: Konstantin Komarov --- fs/ntfs3/file.c | 18 ++++++++++++++++++ fs/ntfs3/frecord.c | 3 +++ fs/ntfs3/inode.c | 21 +++++++++++++++++++-- fs/ntfs3/namei.c | 12 ++++++++++++ fs/ntfs3/ntfs_fs.h | 9 ++++++++- fs/ntfs3/super.c | 12 ++++++++++++ fs/ntfs3/xattr.c | 3 +++ 7 files changed, 75 insertions(+), 3 deletions(-) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 0ff5d3af2889..07ed3d946e7c 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -260,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) bool rw = vma->vm_flags & VM_WRITE; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "mmap encrypted not supported"); return -EOPNOTSUPP; @@ -677,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode = inode->i_mode; int err; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -732,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = file->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -766,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -1058,6 +1070,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (is_encrypted(ni)) { ntfs_inode_warn(inode, "encrypted i/o not supported"); return -EOPNOTSUPP; @@ -1118,6 +1133,9 @@ int ntfs_file_open(struct inode *inode, struct file *file) { struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + if (unlikely((is_compressed(ni) || is_encrypted(ni)) && (file->f_flags & O_DIRECT))) { return -EOPNOTSUPP; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 3df2d9e34b91..8744ba36d422 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_bad_inode(inode) || sb_rdonly(sb)) return 0; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (!ni_trylock(ni)) { /* 'ni' is under modification, skip for now. */ mark_inode_dirty_sync(inode); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index bba0208c4afd..85452a6b1d40 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -852,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - struct ntfs_inode *ni = ntfs_i(mapping->host); + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); int ret; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + ni_lock(ni); ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); @@ -868,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio, static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - if (is_resident(ntfs_i(mapping->host))) + struct inode *inode = mapping->host; + + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + + if (is_resident(ntfs_i(inode))) return write_cache_pages(mapping, wbc, ntfs_resident_writepage, mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); @@ -888,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + *pagep = NULL; if (is_resident(ni)) { struct page *page = @@ -1305,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, goto out1; } + if (unlikely(ntfs3_forced_shutdown(sb))) { + err = -EIO; + goto out2; + } + /* Mark rw ntfs as dirty. it will be cleared at umount. */ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be5170..cae41db0aaa7 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, u32 size = strlen(symname); struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); @@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) + return -EIO; + ni_lock_dir(ni); err = ntfs_unlink_inode(dir, dentry); @@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, 1024); static_assert(PATH_MAX >= 4 * 1024); + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + if (flags & ~RENAME_NOREPLACE) return -EINVAL; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index f6706143d14b..d40bc7669ae5 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -61,6 +61,8 @@ enum utf16_endian; /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 +/* ntfs in shutdown state. */ +#define NTFS_FLAGS_SHUTDOWN 0x00000002 /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ @@ -226,7 +228,7 @@ struct ntfs_sb_info { u64 maxbytes; // Maximum size for normal files. u64 maxbytes_sparse; // Maximum size for sparse file. - u32 flags; // See NTFS_FLAGS_XXX. + unsigned long flags; // See NTFS_FLAGS_ CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. @@ -999,6 +1001,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) return sb->s_fs_info; } +static inline bool ntfs3_forced_shutdown(struct super_block *sb) +{ + return test_bit(NTFS_FLAGS_SHUTDOWN, &ntfs_sb(sb)->flags); +} + /* * ntfs_up_cluster - Align up on cluster boundary. */ diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 09d61c6c90aa..af8521a6ed95 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -714,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) return 0; } +/* + * ntfs_shutdown - super_operations::shutdown + */ +static void ntfs_shutdown(struct super_block *sb) +{ + set_bit(NTFS_FLAGS_SHUTDOWN, &ntfs_sb(sb)->flags); +} + /* * ntfs_sync_fs - super_operations::sync_fs */ @@ -724,6 +732,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait) struct ntfs_inode *ni; struct inode *inode; + if (unlikely(ntfs3_forced_shutdown(sb))) + return -EIO; + ni = sbi->security.ni; if (ni) { inode = &ni->vfs_inode; @@ -763,6 +774,7 @@ static const struct super_operations ntfs_sops = { .put_super = ntfs_put_super, .statfs = ntfs_statfs, .show_options = ntfs_show_options, + .shutdown = ntfs_shutdown, .sync_fs = ntfs_sync_fs, .write_inode = ntfs3_write_inode, }; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 4274b6f31cfa..071356d096d8 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -744,6 +744,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, int err; struct ntfs_inode *ni = ntfs_i(inode); + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) + return -EIO; + /* Dispatch request. */ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { /* system.dos_attrib */ From 97ec56d390a3a0077b36cb38627f671c72dddce6 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 24 Nov 2023 12:21:12 +0300 Subject: [PATCH 013/270] fs/ntfs3: ntfs3_forced_shutdown use int instead of bool Signed-off-by: Konstantin Komarov --- fs/ntfs3/fsntfs.c | 3 ++- fs/ntfs3/ntfs_fs.h | 6 +++--- fs/ntfs3/super.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index fbfe21dbb425..350461d8cece 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) /* * sb can be NULL here. In this case sbi->flags should be 0 too. */ - if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) + if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) || + unlikely(ntfs3_forced_shutdown(sb))) return; blocksize = sb->s_blocksize; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index d40bc7669ae5..7510875efef6 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -62,7 +62,7 @@ enum utf16_endian; /* sbi->flags */ #define NTFS_FLAGS_NODISCARD 0x00000001 /* ntfs in shutdown state. */ -#define NTFS_FLAGS_SHUTDOWN 0x00000002 +#define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/ /* Set when LogFile is replaying. */ #define NTFS_FLAGS_LOG_REPLAYING 0x00000008 /* Set when we changed first MFT's which copy must be updated in $MftMirr. */ @@ -1001,9 +1001,9 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) return sb->s_fs_info; } -static inline bool ntfs3_forced_shutdown(struct super_block *sb) +static inline int ntfs3_forced_shutdown(struct super_block *sb) { - return test_bit(NTFS_FLAGS_SHUTDOWN, &ntfs_sb(sb)->flags); + return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); } /* diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index af8521a6ed95..65ef4b57411f 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -719,7 +719,7 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) */ static void ntfs_shutdown(struct super_block *sb) { - set_bit(NTFS_FLAGS_SHUTDOWN, &ntfs_sb(sb)->flags); + set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); } /* From d6ca2d253900b9b0a3a1ad77541d606010f5e5eb Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Tue, 28 Nov 2023 11:08:00 +0300 Subject: [PATCH 014/270] fs/ntfs3: Add and fix comments Signed-off-by: Konstantin Komarov --- fs/ntfs3/dir.c | 4 +++- fs/ntfs3/fsntfs.c | 2 +- fs/ntfs3/ntfs.h | 2 +- fs/ntfs3/ntfs_fs.h | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 726122ecd39b..9f6dd445eb04 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -536,8 +536,10 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || - off + e_size > end) + off + e_size > end) { + /* Looks like corruption. */ break; + } if (de_is_last(e)) break; diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 350461d8cece..321978019407 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -2129,8 +2129,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, if (le32_to_cpu(d_security->size) == new_sec_size && d_security->key.hash == hash_key.hash && !memcmp(d_security + 1, sd, size_sd)) { - *security_id = d_security->key.sec_id; /* Such security already exists. */ + *security_id = d_security->key.sec_id; err = 0; goto out; } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 13e96fc63dae..c8981429c721 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -523,7 +523,7 @@ struct ATTR_LIST_ENTRY { __le64 vcn; // 0x08: Starting VCN of this attribute. struct MFT_REF ref; // 0x10: MFT record number with attribute. __le16 id; // 0x18: struct ATTRIB ID. - __le16 name[]; // 0x1A: Just to align. To get real name can use name_off. + __le16 name[]; // 0x1A: To get real name use name_off. }; // sizeof(0x20) diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 7510875efef6..abbc7182554a 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -874,7 +874,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern const struct xattr_handler * const ntfs_xattr_handlers[]; +extern const struct xattr_handler *const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); From aaab47f204aaf47838241d57bf8662c8840de60a Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Tue, 28 Nov 2023 11:09:34 +0300 Subject: [PATCH 015/270] fs/ntfs3: Add NULL ptr dereference checking at the end of attr_allocate_frame() It is preferable to exit through the out: label because internal debugging functions are located there. Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrib.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 4b78b669a3bd..646e2dad1b75 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -1743,8 +1743,10 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) - return -ENOENT; + if (!attr_b) { + err = -ENOENT; + goto out; + } attr = attr_b; le = le_b; @@ -1825,13 +1827,15 @@ ins_ext: ok: run_truncate_around(run, vcn); out: - if (new_valid > data_size) - new_valid = data_size; + if (attr_b) { + if (new_valid > data_size) + new_valid = data_size; - valid_size = le64_to_cpu(attr_b->nres.valid_size); - if (new_valid != valid_size) { - attr_b->nres.valid_size = cpu_to_le64(valid_size); - mi_b->dirty = true; + valid_size = le64_to_cpu(attr_b->nres.valid_size); + if (new_valid != valid_size) { + attr_b->nres.valid_size = cpu_to_le64(valid_size); + mi_b->dirty = true; + } } return err; From 652483bfbc45137e8dce556c9ddbd4458dad4452 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Tue, 28 Nov 2023 11:17:20 +0300 Subject: [PATCH 016/270] fs/ntfs3: Fix c/mtime typo Signed-off-by: Konstantin Komarov --- fs/ntfs3/frecord.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 8744ba36d422..6ff4f70ba077 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3291,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } - ts = inode_get_mtime(inode); + ts = inode_get_ctime(inode); dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; From 4cdfb6e7bc9c80142d33bf1d4653a73fa678ba56 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Thu, 21 Dec 2023 13:59:43 +0300 Subject: [PATCH 017/270] fs/ntfs3: Disable ATTR_LIST_ENTRY size check The use of sizeof(struct ATTR_LIST_ENTRY) has been replaced with le_size(0) due to alignment peculiarities on different platforms. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312071005.g6YrbaIe-lkp@intel.com/ Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrlist.c | 8 ++++---- fs/ntfs3/ntfs.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 7c01735d1219..48e7da47c6b7 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, { size_t off; u16 sz; + const unsigned le_min_size = le_size(0); if (!le) { le = ni->attr_list.le; } else { sz = le16_to_cpu(le->size); - if (sz < sizeof(struct ATTR_LIST_ENTRY)) { + if (sz < le_min_size) { /* Impossible 'cause we should not return such le. */ return NULL; } @@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, /* Check boundary. */ off = PtrOffset(ni->attr_list.le, le); - if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) { + if (off + le_min_size > ni->attr_list.size) { /* The regular end of list. */ return NULL; } @@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, sz = le16_to_cpu(le->size); /* Check le for errors. */ - if (sz < sizeof(struct ATTR_LIST_ENTRY) || - off + sz > ni->attr_list.size || + if (sz < le_min_size || off + sz > ni->attr_list.size || sz < le->name_off + le->name_len * sizeof(short)) { return NULL; } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index c8981429c721..9c7478150a03 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -527,8 +527,6 @@ struct ATTR_LIST_ENTRY { }; // sizeof(0x20) -static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20); - static inline u32 le_size(u8 name_len) { return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + From ddb17dc880eeaac37b5a6e984de07b882de7d78d Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Tue, 16 Jan 2024 10:32:20 +0300 Subject: [PATCH 018/270] fs/ntfs3: Use kvfree to free memory allocated by kvmalloc Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrlist.c | 4 ++-- fs/ntfs3/bitmap.c | 4 ++-- fs/ntfs3/frecord.c | 4 ++-- fs/ntfs3/super.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 48e7da47c6b7..9f4bd8d26090 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni, void al_destroy(struct ntfs_inode *ni) { run_close(&ni->attr_list.run); - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; ni->attr_list.dirty = false; @@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, memcpy(ptr, al->le, off); memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); le = Add2Ptr(ptr, off); - kfree(al->le); + kvfree(al->le); al->le = ptr; } else { memmove(Add2Ptr(le, sz), le, old_size - off); diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 63f14a0232f6..845f9b22deef 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd) { struct rb_node *node, *next; - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = NULL; run_close(&wnd->run); @@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); - kfree(wnd->free_bits); + kvfree(wnd->free_bits); wnd->free_bits = new_free; } diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 6ff4f70ba077..2636ab7640ac 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) run_deallocate(sbi, &ni->attr_list.run, true); run_close(&ni->attr_list.run); ni->attr_list.size = 0; - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.dirty = false; @@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni) return 0; out: - kfree(ni->attr_list.le); + kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; return err; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 65ef4b57411f..c55a29793a8d 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -625,7 +625,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); - kfree(sbi->def_table); + kvfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); From 741ba0134fa7822fcf4e4a0a537a5c4cfd706b20 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 27 Dec 2023 16:21:24 +0100 Subject: [PATCH 019/270] pmdomain: core: Move the unused cleanup to a _sync initcall The unused clock cleanup uses the _sync initcall to give all users at earlier initcalls time to probe. Do the same to avoid leaving some PDs dangling at "on" (which actually happened on qcom!). Fixes: 2fe71dcdfd10 ("PM / domains: Add late_initcall to disable unused PM domains") Signed-off-by: Konrad Dybcio Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20231227-topic-pmdomain_sync_cleanup-v1-1-5f36769d538b@linaro.org Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index a1f6cba3ae6c..18e232b5ed53 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -1109,7 +1109,7 @@ static int __init genpd_power_off_unused(void) return 0; } -late_initcall(genpd_power_off_unused); +late_initcall_sync(genpd_power_off_unused); #ifdef CONFIG_PM_SLEEP From f0e4a1356466ec1858ae8e5c70bea2ce5e55008b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 12 Jan 2024 17:33:55 +0100 Subject: [PATCH 020/270] pmdomain: renesas: r8a77980-sysc: CR7 must be always on MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The power domain containing the Cortex-R7 CPU core on the R-Car V3H SoC must always be in power-on state, unlike on other SoCs in the R-Car Gen3 family. See Table 9.4 "Power domains" in the R-Car Series, 3rd Generation Hardware User’s Manual Rev.1.00 and later. Fix this by marking the domain as a CPU domain without control registers, so the driver will not touch it. Fixes: 41d6d8bd8ae9 ("soc: renesas: rcar-sysc: add R8A77980 support") Signed-off-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/fdad9a86132d53ecddf72b734dac406915c4edc0.1705076735.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/r8a77980-sysc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c index 39ca84a67daa..621e411fc999 100644 --- a/drivers/pmdomain/renesas/r8a77980-sysc.c +++ b/drivers/pmdomain/renesas/r8a77980-sysc.c @@ -25,7 +25,8 @@ static const struct rcar_sysc_area r8a77980_areas[] __initconst = { PD_CPU_NOCR }, { "ca53-cpu3", 0x200, 3, R8A77980_PD_CA53_CPU3, R8A77980_PD_CA53_SCU, PD_CPU_NOCR }, - { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON }, + { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON, + PD_CPU_NOCR }, { "a3ir", 0x180, 0, R8A77980_PD_A3IR, R8A77980_PD_ALWAYS_ON }, { "a2ir0", 0x400, 0, R8A77980_PD_A2IR0, R8A77980_PD_A3IR }, { "a2ir1", 0x400, 1, R8A77980_PD_A2IR1, R8A77980_PD_A3IR }, From c41336f4d69057cbf88fed47951379b384540df5 Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Mon, 25 Dec 2023 15:36:15 +0200 Subject: [PATCH 021/270] pmdomain: mediatek: fix race conditions with genpd If the power domains are registered first with genpd and *after that* the driver attempts to power them on in the probe sequence, then it is possible that a race condition occurs if genpd tries to power them on in the same time. The same is valid for powering them off before unregistering them from genpd. Attempt to fix race conditions by first removing the domains from genpd and *after that* powering down domains. Also first power up the domains and *after that* register them to genpd. Fixes: 59b644b01cf4 ("soc: mediatek: Add MediaTek SCPSYS power domains") Signed-off-by: Eugen Hristev Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20231225133615.78993-1-eugen.hristev@collabora.com Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mtk-pm-domains.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index e26dc17d07ad..e274e3315fe7 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -561,6 +561,11 @@ static int scpsys_add_subdomain(struct scpsys *scpsys, struct device_node *paren goto err_put_node; } + /* recursive call to add all subdomains */ + ret = scpsys_add_subdomain(scpsys, child); + if (ret) + goto err_put_node; + ret = pm_genpd_add_subdomain(parent_pd, child_pd); if (ret) { dev_err(scpsys->dev, "failed to add %s subdomain to parent %s\n", @@ -570,11 +575,6 @@ static int scpsys_add_subdomain(struct scpsys *scpsys, struct device_node *paren dev_dbg(scpsys->dev, "%s add subdomain: %s\n", parent_pd->name, child_pd->name); } - - /* recursive call to add all subdomains */ - ret = scpsys_add_subdomain(scpsys, child); - if (ret) - goto err_put_node; } return 0; @@ -588,9 +588,6 @@ static void scpsys_remove_one_domain(struct scpsys_domain *pd) { int ret; - if (scpsys_domain_is_on(pd)) - scpsys_power_off(&pd->genpd); - /* * We're in the error cleanup already, so we only complain, * but won't emit another error on top of the original one. @@ -600,6 +597,8 @@ static void scpsys_remove_one_domain(struct scpsys_domain *pd) dev_err(pd->scpsys->dev, "failed to remove domain '%s' : %d - state may be inconsistent\n", pd->genpd.name, ret); + if (scpsys_domain_is_on(pd)) + scpsys_power_off(&pd->genpd); clk_bulk_put(pd->num_clks, pd->clks); clk_bulk_put(pd->num_subsys_clks, pd->subsys_clks); From afb2a4fb84555ef9e61061f6ea63ed7087b295d5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 12 Jan 2024 19:37:29 +0100 Subject: [PATCH 022/270] riscv/efistub: Ensure GP-relative addressing is not used The cflags for the RISC-V efistub were missing -mno-relax, thus were under the risk that the compiler could use GP-relative addressing. That happened for _edata with binutils-2.41 and kernel 6.1, causing the relocation to fail due to an invalid kernel_size in handle_kernel_image. It was not yet observed with newer versions, but that may just be luck. Cc: Signed-off-by: Jan Kiszka Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 06964a3c130f..d561d7de46a9 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \ -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \ -DEFI_HAVE_STRCMP -fno-builtin -fpic \ $(call cc-option,-mno-single-pic-base) -cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE +cflags-$(CONFIG_RISCV) += -fpic -DNO_ALTERNATIVE -mno-relax cflags-$(CONFIG_LOONGARCH) += -fpie cflags-$(CONFIG_EFI_PARAMS_FROM_FDT) += -I$(srctree)/scripts/dtc/libfdt From d2baf8cc82c17459fca019a12348efcf86bfec29 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Jan 2024 14:52:27 +0100 Subject: [PATCH 023/270] riscv/efistub: Tighten ELF relocation check The EFI stub makefile contains logic to ensure that the objects that make up the stub do not contain relocations that require runtime fixups (typically to account for the runtime load address of the executable) On RISC-V, we also avoid GP based relocations, as they require that GP is assigned the correct base in the startup code, which is not implemented in the EFI stub. So add these relocation types to the grep expression that is used to carry out this check. Link: https://lkml.kernel.org/r/42c63cb9-87d0-49db-9af8-95771b186684%40siemens.com Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index d561d7de46a9..73f4810f6db3 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -143,7 +143,7 @@ STUBCOPY_RELOC-$(CONFIG_ARM64) := R_AARCH64_ABS # exist. STUBCOPY_FLAGS-$(CONFIG_RISCV) += --prefix-alloc-sections=.init \ --prefix-symbols=__efistub_ -STUBCOPY_RELOC-$(CONFIG_RISCV) := R_RISCV_HI20 +STUBCOPY_RELOC-$(CONFIG_RISCV) := -E R_RISCV_HI20\|R_RISCV_$(BITS)\|R_RISCV_RELAX # For LoongArch, keep all the symbols in .init section and make sure that no # absolute symbols references exist. From 1793ce9f205efe07ca2992ef129b86dab2c329f5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Dec 2023 22:08:23 -0800 Subject: [PATCH 024/270] drm/msm/dpu: fix kernel-doc warnings Correct all kernel-doc warnings in dpu_encoder.c and dpu_rm.c: dpu_encoder.c:212: warning: Excess struct member 'crtc_kickoff_cb' description in 'dpu_encoder_virt' dpu_encoder.c:212: warning: Excess struct member 'crtc_kickoff_cb_data' description in 'dpu_encoder_virt' dpu_encoder.c:212: warning: Excess struct member 'debugfs_root' description in 'dpu_encoder_virt' dpu_rm.c:35: warning: Excess struct member 'hw_res' description in 'dpu_rm_requirements' dpu_rm.c:208: warning: No description found for return value of '_dpu_rm_get_lm_peer' Signed-off-by: Randy Dunlap Cc: Rob Clark Cc: Abhinav Kumar Cc: Dmitry Baryshkov Cc: Sean Paul Cc: Marijn Suijten Cc: linux-arm-msm@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: freedreno@lists.freedesktop.org Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: Jonathan Corbet Cc: Vegard Nossum Reviewed-by: Paloma Arellano Reviewed-by: Dmitry Baryshkov Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312170641.5exlvQQx-lkp@intel.com/ Fixes: 62d35629da80 ("drm/msm/dpu: move encoder status to standard encoder debugfs dir") Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Patchwork: https://patchwork.freedesktop.org/patch/572962/ Link: https://lore.kernel.org/r/20231231060823.1934-1-rdunlap@infradead.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 4 ---- drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index 83380bc92a00..f2b82ca5efb3 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -144,10 +144,6 @@ enum dpu_enc_rc_states { * to track crtc in the disable() hook which is called * _after_ encoder_mask is cleared. * @connector: If a mode is set, cached pointer to the active connector - * @crtc_kickoff_cb: Callback into CRTC that will flush & start - * all CTL paths - * @crtc_kickoff_cb_data: Opaque user data given to crtc_kickoff_cb - * @debugfs_root: Debug file system root file node * @enc_lock: Lock around physical encoder * create/destroy/enable/disable * @frame_busy_mask: Bitmask tracking which phys_enc we are still diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c index b58a9c2ae326..724537ab776d 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c @@ -29,7 +29,6 @@ static inline bool reserved_by_other(uint32_t *res_map, int idx, /** * struct dpu_rm_requirements - Reservation requirements parameter bundle * @topology: selected topology for the display - * @hw_res: Hardware resources required as reported by the encoders */ struct dpu_rm_requirements { struct msm_display_topology topology; @@ -204,6 +203,8 @@ static bool _dpu_rm_needs_split_display(const struct msm_display_topology *top) * _dpu_rm_get_lm_peer - get the id of a mixer which is a peer of the primary * @rm: dpu resource manager handle * @primary_idx: index of primary mixer in rm->mixer_blks[] + * + * Returns: lm peer mixed id on success or %-EINVAL on error */ static int _dpu_rm_get_lm_peer(struct dpu_rm *rm, int primary_idx) { From 77e8aad5519e04f6c1e132aaec1c5f8faf41844f Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Wed, 10 Jan 2024 12:18:51 -0800 Subject: [PATCH 025/270] drm/msms/dp: fixed link clock divider bits be over written in BPC unknown case Since the value of DP_TEST_BIT_DEPTH_8 is already left shifted, in the BPC unknown case, the additional shift causes spill over to the other bits of the [DP_CONFIGURATION_CTRL] register. Fix this by changing the return value of dp_link_get_test_bits_depth() in the BPC unknown case to (DP_TEST_BIT_DEPTH_8 >> DP_TEST_BIT_DEPTH_SHIFT). Fixes: c943b4948b58 ("drm/msm/dp: add displayPort driver support") Signed-off-by: Kuogee Hsieh Reviewed-by: Abhinav Kumar Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/573989/ Link: https://lore.kernel.org/r/1704917931-30133-1-git-send-email-quic_khsieh@quicinc.com [quic_abhinavk@quicinc.com: fix minor checkpatch warning to align with opening braces] Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_ctrl.c | 5 ----- drivers/gpu/drm/msm/dp/dp_link.c | 10 +++++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c index 77a8d9366ed7..fb588fde298a 100644 --- a/drivers/gpu/drm/msm/dp/dp_ctrl.c +++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c @@ -135,11 +135,6 @@ static void dp_ctrl_config_ctrl(struct dp_ctrl_private *ctrl) tbd = dp_link_get_test_bits_depth(ctrl->link, ctrl->panel->dp_mode.bpp); - if (tbd == DP_TEST_BIT_DEPTH_UNKNOWN) { - pr_debug("BIT_DEPTH not set. Configure default\n"); - tbd = DP_TEST_BIT_DEPTH_8; - } - config |= tbd << DP_CONFIGURATION_CTRL_BPC_SHIFT; /* Num of Lanes */ diff --git a/drivers/gpu/drm/msm/dp/dp_link.c b/drivers/gpu/drm/msm/dp/dp_link.c index 98427d45e9a7..a0015b9e79eb 100644 --- a/drivers/gpu/drm/msm/dp/dp_link.c +++ b/drivers/gpu/drm/msm/dp/dp_link.c @@ -1179,6 +1179,9 @@ void dp_link_reset_phy_params_vx_px(struct dp_link *dp_link) u32 dp_link_get_test_bits_depth(struct dp_link *dp_link, u32 bpp) { u32 tbd; + struct dp_link_private *link; + + link = container_of(dp_link, struct dp_link_private, dp_link); /* * Few simplistic rules and assumptions made here: @@ -1196,12 +1199,13 @@ u32 dp_link_get_test_bits_depth(struct dp_link *dp_link, u32 bpp) tbd = DP_TEST_BIT_DEPTH_10; break; default: - tbd = DP_TEST_BIT_DEPTH_UNKNOWN; + drm_dbg_dp(link->drm_dev, "bpp=%d not supported, use bpc=8\n", + bpp); + tbd = DP_TEST_BIT_DEPTH_8; break; } - if (tbd != DP_TEST_BIT_DEPTH_UNKNOWN) - tbd = (tbd >> DP_TEST_BIT_DEPTH_SHIFT); + tbd = (tbd >> DP_TEST_BIT_DEPTH_SHIFT); return tbd; } From fcccdafd91f8bdde568b86ff70848cf83f029add Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Wed, 17 Jan 2024 13:13:30 -0800 Subject: [PATCH 026/270] drm/msm/dp: return correct Colorimetry for DP_TEST_DYNAMIC_RANGE_CEA case MSA MISC0 bit 1 to 7 contains Colorimetry Indicator Field. dp_link_get_colorimetry_config() returns wrong colorimetry value in the DP_TEST_DYNAMIC_RANGE_CEA case in the current implementation. Hence fix this problem by having dp_link_get_colorimetry_config() return defined CEA RGB colorimetry value in the case of DP_TEST_DYNAMIC_RANGE_CEA. Changes in V2: -- drop retrieving colorimetry from colorspace -- drop dr = link->dp_link.test_video.test_dyn_range assignment Changes in V3: -- move defined MISCr0a Colorimetry vale to dp_reg.h -- rewording commit title -- rewording commit text to more precise describe this patch Fixes: c943b4948b58 ("drm/msm/dp: add displayPort driver support") Signed-off-by: Kuogee Hsieh Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/574888/ Link: https://lore.kernel.org/r/1705526010-597-1-git-send-email-quic_khsieh@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_link.c | 12 +++++++----- drivers/gpu/drm/msm/dp/dp_reg.h | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_link.c b/drivers/gpu/drm/msm/dp/dp_link.c index a0015b9e79eb..49dfac1fd1ef 100644 --- a/drivers/gpu/drm/msm/dp/dp_link.c +++ b/drivers/gpu/drm/msm/dp/dp_link.c @@ -7,6 +7,7 @@ #include +#include "dp_reg.h" #include "dp_link.h" #include "dp_panel.h" @@ -1082,7 +1083,7 @@ int dp_link_process_request(struct dp_link *dp_link) int dp_link_get_colorimetry_config(struct dp_link *dp_link) { - u32 cc; + u32 cc = DP_MISC0_COLORIMERY_CFG_LEGACY_RGB; struct dp_link_private *link; if (!dp_link) { @@ -1096,10 +1097,11 @@ int dp_link_get_colorimetry_config(struct dp_link *dp_link) * Unless a video pattern CTS test is ongoing, use RGB_VESA * Only RGB_VESA and RGB_CEA supported for now */ - if (dp_link_is_video_pattern_requested(link)) - cc = link->dp_link.test_video.test_dyn_range; - else - cc = DP_TEST_DYNAMIC_RANGE_VESA; + if (dp_link_is_video_pattern_requested(link)) { + if (link->dp_link.test_video.test_dyn_range & + DP_TEST_DYNAMIC_RANGE_CEA) + cc = DP_MISC0_COLORIMERY_CFG_CEA_RGB; + } return cc; } diff --git a/drivers/gpu/drm/msm/dp/dp_reg.h b/drivers/gpu/drm/msm/dp/dp_reg.h index ea85a691e72b..78785ed4b40c 100644 --- a/drivers/gpu/drm/msm/dp/dp_reg.h +++ b/drivers/gpu/drm/msm/dp/dp_reg.h @@ -143,6 +143,9 @@ #define DP_MISC0_COLORIMETRY_CFG_SHIFT (0x00000001) #define DP_MISC0_TEST_BITS_DEPTH_SHIFT (0x00000005) +#define DP_MISC0_COLORIMERY_CFG_LEGACY_RGB (0) +#define DP_MISC0_COLORIMERY_CFG_CEA_RGB (0x04) + #define REG_DP_VALID_BOUNDARY (0x00000030) #define REG_DP_VALID_BOUNDARY_2 (0x00000034) From 7f3d03c48b1eb6bc45ab20ca98b8b11be25f9f52 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Wed, 17 Jan 2024 11:41:09 -0800 Subject: [PATCH 027/270] drm/msm/dpu: check for valid hw_pp in dpu_encoder_helper_phys_cleanup The commit 8b45a26f2ba9 ("drm/msm/dpu: reserve cdm blocks for writeback in case of YUV output") introduced a smatch warning about another conditional block in dpu_encoder_helper_phys_cleanup() which had assumed hw_pp will always be valid which may not necessarily be true. Lets fix the other conditional block by making sure hw_pp is valid before dereferencing it. Reported-by: Dan Carpenter Fixes: ae4d721ce100 ("drm/msm/dpu: add an API to reset the encoder related hw blocks") Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/574878/ Link: https://lore.kernel.org/r/20240117194109.21609-1-quic_abhinavk@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index f2b82ca5efb3..6a4b489d44e5 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -2068,7 +2068,7 @@ void dpu_encoder_helper_phys_cleanup(struct dpu_encoder_phys *phys_enc) } /* reset the merge 3D HW block */ - if (phys_enc->hw_pp->merge_3d) { + if (phys_enc->hw_pp && phys_enc->hw_pp->merge_3d) { phys_enc->hw_pp->merge_3d->ops.setup_3d_mode(phys_enc->hw_pp->merge_3d, BLEND_3D_NONE); if (phys_enc->hw_ctl->ops.update_pending_flush_merge_3d) @@ -2099,7 +2099,7 @@ void dpu_encoder_helper_phys_cleanup(struct dpu_encoder_phys *phys_enc) if (phys_enc->hw_wb) intf_cfg.wb = phys_enc->hw_wb->idx; - if (phys_enc->hw_pp->merge_3d) + if (phys_enc->hw_pp && phys_enc->hw_pp->merge_3d) intf_cfg.merge_3d = phys_enc->hw_pp->merge_3d->idx; if (ctl->ops.reset_intf_cfg) From 8d35217149daa33358c284aca6a56d5ab92cfc6c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 15 Dec 2023 03:32:22 +0200 Subject: [PATCH 028/270] drm/msm/mdss: specify cfg bandwidth for SDM670 Lower the requested CFG bus bandwidth for the SDM670 platform. The default value is 153600 kBps, which is twice as big as required by the platform according to the vendor kernel. Fixes: a55c8ff252d3 ("drm/msm/mdss: Handle the reg bus ICC path") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Tested-by: Richard Acayan Patchwork: https://patchwork.freedesktop.org/patch/572182/ Link: https://lore.kernel.org/r/20231215013222.827975-1-dmitry.baryshkov@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/msm_mdss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/msm_mdss.c b/drivers/gpu/drm/msm/msm_mdss.c index 455b2e3a0cdd..35423d10aafa 100644 --- a/drivers/gpu/drm/msm/msm_mdss.c +++ b/drivers/gpu/drm/msm/msm_mdss.c @@ -562,6 +562,7 @@ static const struct msm_mdss_data sdm670_data = { .ubwc_enc_version = UBWC_2_0, .ubwc_dec_version = UBWC_2_0, .highest_bank_bit = 1, + .reg_bus_bw = 76800, }; static const struct msm_mdss_data sdm845_data = { From 9b3058d1f456464a02e202cc7fc660508709097c Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Tue, 2 Jan 2024 14:20:19 +0200 Subject: [PATCH 029/270] MAINTAINERS: remove myself as iwlwifi driver maintainer As I'm resigning from Intel, it's time to remove myself as a maintainer of iwlwifi. Good luck to Miri! Signed-off-by: Gregory Greenman Link: https://msgid.link/20240102122019.1689602-1-gregory.greenman@intel.com Signed-off-by: Johannes Berg --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 92152ac346c8..d8ca0e10c8d1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11128,7 +11128,6 @@ S: Supported F: drivers/net/wireless/intel/iwlegacy/ INTEL WIRELESS WIFI LINK (iwlwifi) -M: Gregory Greenman M: Miri Korenblit L: linux-wireless@vger.kernel.org S: Supported From 353d321f63f7dbfc9ef58498cc732c9fe886a596 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Jan 2024 20:08:11 +0200 Subject: [PATCH 030/270] wifi: iwlwifi: fix double-free bug The storage for the TLV PC register data wasn't done like all the other storage in the drv->fw area, which is cleared at the end of deallocation. Therefore, the freeing must also be done differently, explicitly NULL'ing it out after the free, since otherwise there's a nasty double-free bug here if a file fails to load after this has been parsed, and we get another free later (e.g. because no other file exists.) Fix that by adding the missing NULL assignment. Cc: stable@vger.kernel.org Fixes: 5e31b3df86ec ("wifi: iwlwifi: dbg: print pc register data once fw dump occurred") Reported-by: Guy Kaplan Signed-off-by: Johannes Berg Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20240123200528.675f3c24ec0d.I6ab4015cd78d82dd95471f840629972ef0331de3@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index ffe2670720c9..abf8001bdac1 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -128,6 +128,7 @@ static void iwl_dealloc_ucode(struct iwl_drv *drv) kfree(drv->fw.ucode_capa.cmd_versions); kfree(drv->fw.phy_integration_ver); kfree(drv->trans->dbg.pc_data); + drv->trans->dbg.pc_data = NULL; for (i = 0; i < IWL_UCODE_TYPE_MAX; i++) iwl_free_fw_img(drv, drv->fw.img + i); From b743287d7a0007493f5cada34ed2085d475050b4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 25 Jan 2024 09:51:09 +0100 Subject: [PATCH 031/270] wifi: cfg80211: fix wiphy delayed work queueing When a wiphy work is queued with timer, and then again without a delay, it's started immediately but *also* started again after the timer expires. This can lead, for example, to warnings in mac80211's offchannel code as reported by Jouni. Running the same work twice isn't expected, of course. Fix this by deleting the timer at this point, when queuing immediately due to delay=0. Cc: stable@vger.kernel.org Reported-by: Jouni Malinen Fixes: a3ee4dc84c4e ("wifi: cfg80211: add a work abstraction with special semantics") Link: https://msgid.link/20240125095108.2feb0eaaa446.I4617f3210ed0e7f252290d5970dac6a876aa595b@changeid Signed-off-by: Johannes Berg --- net/wireless/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index 409d74c57ca0..3fb1b637352a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -1661,6 +1661,7 @@ void wiphy_delayed_work_queue(struct wiphy *wiphy, unsigned long delay) { if (!delay) { + del_timer(&dwork->timer); wiphy_work_queue(wiphy, &dwork->work); return; } From 3a3ef3940798e85121066a859127e72a528dc32a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 23 Jan 2024 20:08:19 +0200 Subject: [PATCH 032/270] wifi: iwlwifi: mvm: skip adding debugfs symlink for reconfig The function to add an interface may be called without a previous removal if the HW is being reconfigured. As such, only add the symlink if the hardware is not being reconfigured due to a HW_RESTART. Fixes: c36235acb34f ("wifi: iwlwifi: mvm: rework debugfs handling") Signed-off-by: Benjamin Berg Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20240123200528.314395eacda4.I5823e962c3c3674b942383733debd10b3fe903e2@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 6 ++++-- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 7f13dff04b26..3447d67a8b31 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1600,7 +1600,8 @@ static int iwl_mvm_mac_add_interface(struct ieee80211_hw *hw, */ if (vif->type == NL80211_IFTYPE_AP || vif->type == NL80211_IFTYPE_ADHOC) { - iwl_mvm_vif_dbgfs_add_link(mvm, vif); + if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) + iwl_mvm_vif_dbgfs_add_link(mvm, vif); ret = 0; goto out; } @@ -1640,7 +1641,8 @@ static int iwl_mvm_mac_add_interface(struct ieee80211_hw *hw, iwl_mvm_chandef_get_primary_80(&vif->bss_conf.chandef); } - iwl_mvm_vif_dbgfs_add_link(mvm, vif); + if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) + iwl_mvm_vif_dbgfs_add_link(mvm, vif); if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status) && vif->type == NL80211_IFTYPE_STATION && !vif->p2p && diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c index 61170173f917..1f36e934ef69 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c @@ -81,7 +81,8 @@ static int iwl_mvm_mld_mac_add_interface(struct ieee80211_hw *hw, ieee80211_hw_set(mvm->hw, RX_INCLUDES_FCS); } - iwl_mvm_vif_dbgfs_add_link(mvm, vif); + if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) + iwl_mvm_vif_dbgfs_add_link(mvm, vif); if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status) && vif->type == NL80211_IFTYPE_STATION && !vif->p2p && From 1b023d475ae928f3036cefee9ea0a499af1d8900 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 16 Jan 2024 21:05:25 -0600 Subject: [PATCH 033/270] wifi: mac80211: Drop WBRF debugging statements Due to the way that debugging is used in the mac80211 subsystem this message ends up being noisier than it needs to be. As the statement is only useful at a first stage of triage for BIOS bugs, just drop it. Cc: Jun Ma Suggested-by: Kalle Valo Signed-off-by: Mario Limonciello Tested-by: Kalle Valo Link: https://msgid.link/20240117030525.539-1-mario.limonciello@amd.com Signed-off-by: Johannes Berg --- net/mac80211/wbrf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/mac80211/wbrf.c b/net/mac80211/wbrf.c index a05c5b971789..3a8612309137 100644 --- a/net/mac80211/wbrf.c +++ b/net/mac80211/wbrf.c @@ -23,8 +23,6 @@ void ieee80211_check_wbrf_support(struct ieee80211_local *local) return; local->wbrf_supported = acpi_amd_wbrf_supported_producer(dev); - dev_dbg(dev, "WBRF is %s supported\n", - local->wbrf_supported ? "" : "not"); } static void get_chan_freq_boundary(u32 center_freq, u32 bandwidth, u64 *start, u64 *end) From a7a6a01f88e87dec4bf2365571dd2dc7403d52d0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jan 2024 12:14:30 +0100 Subject: [PATCH 034/270] x86/efistub: Give up if memory attribute protocol returns an error The recently introduced EFI memory attributes protocol should be used if it exists to ensure that the memory allocation created for the kernel permits execution. This is needed for compatibility with tightened requirements related to Windows logo certification for x86 PCs. Currently, we simply strip the execute protect (XP) attribute from the entire range, but this might be rejected under some firmware security policies, and so in a subsequent patch, this will be changed to only strip XP from the executable region that runs early, and make it read-only (RO) as well. In order to catch any issues early, ensure that the memory attribute protocol works as intended, and give up if it produces spurious errors. Note that the DXE services based fallback was always based on best effort, so don't propagate any errors returned by that API. Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/x86-stub.c | 24 ++++++++++++++---------- drivers/firmware/efi/libstub/x86-stub.h | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 0d510c9a06a4..cb0be88c8131 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -223,8 +223,8 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) } } -void efi_adjust_memory_range_protection(unsigned long start, - unsigned long size) +efi_status_t efi_adjust_memory_range_protection(unsigned long start, + unsigned long size) { efi_status_t status; efi_gcd_memory_space_desc_t desc; @@ -236,13 +236,17 @@ void efi_adjust_memory_range_protection(unsigned long start, rounded_end = roundup(start + size, EFI_PAGE_SIZE); if (memattr != NULL) { - efi_call_proto(memattr, clear_memory_attributes, rounded_start, - rounded_end - rounded_start, EFI_MEMORY_XP); - return; + status = efi_call_proto(memattr, clear_memory_attributes, + rounded_start, + rounded_end - rounded_start, + EFI_MEMORY_XP); + if (status != EFI_SUCCESS) + efi_warn("Failed to clear EFI_MEMORY_XP attribute\n"); + return status; } if (efi_dxe_table == NULL) - return; + return EFI_SUCCESS; /* * Don't modify memory region attributes, they are @@ -255,7 +259,7 @@ void efi_adjust_memory_range_protection(unsigned long start, status = efi_dxe_call(get_memory_space_descriptor, start, &desc); if (status != EFI_SUCCESS) - return; + break; next = desc.base_address + desc.length; @@ -280,8 +284,10 @@ void efi_adjust_memory_range_protection(unsigned long start, unprotect_start, unprotect_start + unprotect_size, status); + break; } } + return EFI_SUCCESS; } static void setup_unaccepted_memory(void) @@ -805,9 +811,7 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) *kernel_entry = addr + entry; - efi_adjust_memory_range_protection(addr, kernel_total_size); - - return EFI_SUCCESS; + return efi_adjust_memory_range_protection(addr, kernel_total_size); } static void __noreturn enter_kernel(unsigned long kernel_addr, diff --git a/drivers/firmware/efi/libstub/x86-stub.h b/drivers/firmware/efi/libstub/x86-stub.h index 37c5a36b9d8c..1c20e99a6494 100644 --- a/drivers/firmware/efi/libstub/x86-stub.h +++ b/drivers/firmware/efi/libstub/x86-stub.h @@ -5,8 +5,8 @@ extern void trampoline_32bit_src(void *, bool); extern const u16 trampoline_ljmp_imm_offset; -void efi_adjust_memory_range_protection(unsigned long start, - unsigned long size); +efi_status_t efi_adjust_memory_range_protection(unsigned long start, + unsigned long size); #ifdef CONFIG_X86_64 efi_status_t efi_setup_5level_paging(void); From d6d33f03baa43d763fe094ca926eeae7d3421d07 Mon Sep 17 00:00:00 2001 From: Ism Hong Date: Tue, 26 Dec 2023 16:51:41 +0800 Subject: [PATCH 035/270] fs/ntfs3: use non-movable memory for ntfs3 MFT buffer cache Since the buffer cache for ntfs3 metadata is not released until the file system is unmounted, allocating from the movable zone may result in cma allocation failures. This is due to the page still being used by ntfs3, leading to migration failures. To address this, this commit use sb_bread_umovable() instead of sb_bread(). This change prevents allocation from the movable zone, ensuring compatibility with scenarios where the buffer head is not released until unmount. This patch is inspired by commit a8ac900b8163("ext4: use non-movable memory for the ext4 superblock"). The issue is found when playing video files stored in NTFS on the Android TV platform. During this process, the media parser reads the video file, causing ntfs3 to allocate buffer cache from the CMA area. Subsequently, the hardware decoder attempts to allocate memory from the same CMA area. However, the page is still in use by ntfs3, resulting in a migrate failure in alloc_contig_range(). The pinned page and allocating stacktrace reported by page owner shows below: page:ffffffff00b68880 refcount:3 mapcount:0 mapping:ffffff80046aa828 index:0xc0040 pfn:0x20fa4 aops:def_blk_aops ino:0 flags: 0x2020(active|private) page dumped because: migration failure page last allocated via order 0, migratetype Movable, gfp_mask 0x108c48 (GFP_NOFS|__GFP_NOFAIL|__GFP_HARDWALL|__GFP_MOVABLE), page_owner tracks the page as allocated prep_new_page get_page_from_freelist __alloc_pages_nodemask pagecache_get_page __getblk_gfp __bread_gfp ntfs_read_run_nb ntfs_read_bh mi_read ntfs_iget5 dir_search_u ntfs_lookup __lookup_slow lookup_slow walk_component path_lookupat Signed-off-by: Ism Hong Signed-off-by: Konstantin Komarov --- fs/ntfs3/ntfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index abbc7182554a..2b54ae94440f 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -1035,7 +1035,7 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size) static inline struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) { - struct buffer_head *bh = sb_bread(sb, block); + struct buffer_head *bh = sb_bread_unmovable(sb, block); if (bh) return bh; From 5ca87d01eba7bdfe9536a157ca33c1455bb8d16c Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 26 Jan 2024 11:03:21 +0300 Subject: [PATCH 036/270] fs/ntfs3: Prevent generic message "attempt to access beyond end of device" It used in test environment. Signed-off-by: Konstantin Komarov --- fs/ntfs3/fsntfs.c | 24 ++++++++++++++++++++++++ fs/ntfs3/ntfs_fs.h | 14 +------------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 321978019407..ae2ef5c11868 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1007,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes) return cpu_to_le32(hash); } +/* + * simple wrapper for sb_bread_unmovable. + */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct buffer_head *bh; + + if (unlikely(block >= sbi->volume.blocks)) { + /* prevent generic message "attempt to access beyond end of device" */ + ntfs_err(sb, "try to read out of volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; + } + + bh = sb_bread_unmovable(sb, block); + if (bh) + return bh; + + ntfs_err(sb, "failed to read volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; +} + int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) { struct block_device *bdev = sb->s_bdev; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 2b54ae94440f..81f7563428ee 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -586,6 +586,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ +struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block); bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple); @@ -1032,19 +1033,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size) return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } -static inline struct buffer_head *ntfs_bread(struct super_block *sb, - sector_t block) -{ - struct buffer_head *bh = sb_bread_unmovable(sb, block); - - if (bh) - return bh; - - ntfs_err(sb, "failed to read volume at offset 0x%llx", - (u64)block << sb->s_blocksize_bits); - return NULL; -} - static inline struct ntfs_inode *ntfs_i(struct inode *inode) { return container_of(inode, struct ntfs_inode, vfs_inode); From 4fd6c08a16d7f1ba10212c9ef7bc73218144b463 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 26 Jan 2024 11:12:38 +0300 Subject: [PATCH 037/270] fs/ntfs3: Use i_size_read and i_size_write Signed-off-by: Konstantin Komarov --- fs/ntfs3/attrib.c | 4 ++-- fs/ntfs3/dir.c | 2 +- fs/ntfs3/file.c | 11 ++++++----- fs/ntfs3/frecord.c | 10 +++++----- fs/ntfs3/index.c | 8 ++++---- fs/ntfs3/inode.c | 2 +- 6 files changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 646e2dad1b75..7aadf5010999 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -2084,7 +2084,7 @@ next_attr: /* Update inode size. */ ni->i_valid = valid_size; - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); @@ -2499,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) mi_b->dirty = true; done: - ni->vfs_inode.i_size += bytes; + i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 9f6dd445eb04..effa6accf8a8 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -517,7 +517,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, u32 e_size, off, end; size_t drs = 0, fles = 0, bit = 0; struct indx_node *node = NULL; - size_t max_indx = ni->vfs_inode.i_size >> ni->dir.index_bits; + size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits; if (is_empty) *is_empty = true; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 07ed3d946e7c..b702543a8795 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -646,7 +646,7 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) if (err) goto out; } else if (new_size > i_size) { - inode->i_size = new_size; + i_size_write(inode, new_size); } } @@ -696,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; } inode_dio_wait(inode); - oldsize = inode->i_size; + oldsize = i_size_read(inode); newsize = attr->ia_size; if (newsize <= oldsize) @@ -708,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out; ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - inode->i_size = newsize; + i_size_write(inode, newsize); } setattr_copy(idmap, inode, attr); @@ -847,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(file); - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -1177,7 +1177,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file) down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, - inode->i_size, &ni->i_valid, false, NULL); + i_size_read(inode), &ni->i_valid, false, + NULL); up_write(&ni->file.run_lock); ni_unlock(ni); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 2636ab7640ac..3b42938a9d3b 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page) gfp_t gfp_mask; struct page *pg; - if (vbo >= ni->vfs_inode.i_size) { + if (vbo >= i_size_read(&ni->vfs_inode)) { SetPageUptodate(page); err = 0; goto out; @@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct inode *inode = &ni->vfs_inode; - loff_t i_size = inode->i_size; + loff_t i_size = i_size_read(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); struct page **pages = NULL; @@ -2457,6 +2457,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, struct ATTR_LIST_ENTRY *le = NULL; struct runs_tree *run = &ni->file.run; u64 valid_size = ni->i_valid; + loff_t i_size = i_size_read(&ni->vfs_inode); u64 vbo_disk; size_t unc_size; u32 frame_size, i, npages_disk, ondisk_size; @@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, } } - frames = (ni->vfs_inode.i_size - 1) >> frame_bits; + frames = (i_size - 1) >> frame_bits; err = attr_wof_frame_info(ni, attr, run, frame64, frames, frame_bits, &ondisk_size, &vbo_data); @@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, goto out2; if (frame64 == frames) { - unc_size = 1 + ((ni->vfs_inode.i_size - 1) & - (frame_size - 1)); + unc_size = 1 + ((i_size - 1) & (frame_size - 1)); ondisk_size = attr_size(attr) - vbo_data; } else { unc_size = frame_size; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index cf92b2433f7a..daabaad63aaf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, goto out2; if (in->name == I30_NAME) { - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, alloc_size); } @@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, } if (in->name == I30_NAME) - ni->vfs_inode.i_size = data_size; + i_size_write(&ni->vfs_inode, data_size); *vbn = bit << indx->idx2vbn_bits; @@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, return err; if (in->name == I30_NAME) - ni->vfs_inode.i_size = new_data; + i_size_write(&ni->vfs_inode, new_data); bpb = bitmap_size(bit); if (bpb * 8 == nbits) @@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); if (in->name == I30_NAME) - ni->vfs_inode.i_size = 0; + i_size_write(&ni->vfs_inode, 0); err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 85452a6b1d40..eb7a8c9fba01 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -985,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, } if (pos + err > inode->i_size) { - inode->i_size = pos + err; + i_size_write(inode, pos + err); dirty = true; } From 1b7dd28e14c4728ae1a815605ca33ffb4ce1b309 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 26 Jan 2024 11:13:59 +0300 Subject: [PATCH 038/270] fs/ntfs3: Correct function is_rst_area_valid Reported-by: Robert Morris Signed-off-by: Konstantin Komarov --- fs/ntfs3/fslog.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 7dbb000fc691..855519713bf7 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) { const struct RESTART_AREA *ra; u16 cl, fl, ul; - u32 off, l_size, file_dat_bits, file_size_round; + u32 off, l_size, seq_bits; u16 ro = le16_to_cpu(rhdr->ra_off); u32 sys_page = le32_to_cpu(rhdr->sys_page_size); @@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) /* Make sure the sequence number bits match the log file size. */ l_size = le64_to_cpu(ra->l_size); - file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits); - file_size_round = 1u << (file_dat_bits + 3); - if (file_size_round != l_size && - (file_size_round < l_size || (file_size_round / 2) > l_size)) { - return false; + seq_bits = sizeof(u64) * 8 + 3; + while (l_size) { + l_size >>= 1; + seq_bits -= 1; } + if (seq_bits != ra->seq_num_bits) + return false; + /* The log page data offset and record header length must be quad-aligned. */ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) From 652cfeb43d6b9aba5c7c4902bed7a7340df131fb Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Fri, 26 Jan 2024 11:14:31 +0300 Subject: [PATCH 039/270] fs/ntfs3: Fixed overflow check in mi_enum_attr() Reported-by: Robert Morris Signed-off-by: Konstantin Komarov --- fs/ntfs3/record.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 7b6423584eae..6aa3a9d44df1 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t16 > asize) return NULL; - if (t16 + le32_to_cpu(attr->res.data_size) > asize) + if (le32_to_cpu(attr->res.data_size) > asize - t16) return NULL; t32 = sizeof(short) * attr->name_len; From d68968440b1a75dee05cfac7f368f1aa139e1911 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Mon, 29 Jan 2024 10:30:09 +0300 Subject: [PATCH 040/270] fs/ntfs3: Update inode->i_size after success write into compressed file Reported-by: Giovanni Santini Signed-off-by: Konstantin Komarov --- fs/ntfs3/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index b702543a8795..691b0c9b95ae 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -1054,6 +1054,8 @@ out: iocb->ki_pos += written; if (iocb->ki_pos > ni->i_valid) ni->i_valid = iocb->ki_pos; + if (iocb->ki_pos > i_size) + i_size_write(inode, iocb->ki_pos); return written; } From b2dd7b953c25ffd5912dda17e980e7168bebcf6c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 17 Oct 2023 17:04:39 +0300 Subject: [PATCH 041/270] fs/ntfs3: Fix an NULL dereference bug The issue here is when this is called from ntfs_load_attr_list(). The "size" comes from le32_to_cpu(attr->res.data_size) so it can't overflow on a 64bit systems but on 32bit systems the "+ 1023" can overflow and the result is zero. This means that the kmalloc will succeed by returning the ZERO_SIZE_PTR and then the memcpy() will crash with an Oops on the next line. Fixes: be71b5cba2e6 ("fs/ntfs3: Add attrib operations") Signed-off-by: Dan Carpenter Signed-off-by: Konstantin Komarov --- fs/ntfs3/ntfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 81f7563428ee..627419bd6f77 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -475,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { - return (size + 1023) & ~(size_t)1023; + return size_add(size, 1023) & ~(size_t)1023; } /* Globals from bitfunc.c */ From 731ab1f9828800df871c5a7ab9ffe965317d3f15 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Sat, 30 Dec 2023 17:00:03 +0800 Subject: [PATCH 042/270] fs/ntfs3: Fix oob in ntfs_listxattr The length of name cannot exceed the space occupied by ea. Reported-and-tested-by: syzbot+65e940cfb8f99a97aca7@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Konstantin Komarov --- fs/ntfs3/xattr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 071356d096d8..53e7d1fa036a 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, if (!ea->name_len) break; + if (ea->name_len > ea_size) + break; + if (buffer) { /* Check if we can use field ea->name */ if (off + ea_size > size) From 1f5fa4b3b85ceb43f1053290f0ade037b50e6297 Mon Sep 17 00:00:00 2001 From: Nekun Date: Mon, 30 Oct 2023 08:33:54 +0000 Subject: [PATCH 043/270] fs/ntfs3: Add ioctl operation for directories (FITRIM) While ntfs3 supports discards, FITRIM ioctl() command has defined only for regular files. This may confuse users trying to invoke `fstrim` utility with the directory argument (for example, call `fstrim ` which is the common practice). In this case, ioctl() returns -ENOTTY without any error messages in kernel ring buffer, this may be easily interpreted as no support for discards in ntfs3 driver. Currently only FITRIM command implemented in ntfs_ioctl() and passed inode used only for dereferencing NTFS superblock, so no need for separate ioctl() handler for directories, just add existing ntfs_ioctl() handler to ntfs_dir_operations. Signed-off-by: Nekun Signed-off-by: Konstantin Komarov --- fs/ntfs3/dir.c | 4 ++++ fs/ntfs3/file.c | 4 ++-- fs/ntfs3/ntfs_fs.h | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index effa6accf8a8..5cf3d9decf64 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -611,5 +611,9 @@ const struct file_operations ntfs_dir_operations = { .iterate_shared = ntfs_readdir, .fsync = generic_file_fsync, .open = ntfs_file_open, + .unlocked_ioctl = ntfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfs_compat_ioctl, +#endif }; // clang-format on diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 691b0c9b95ae..5418662c80d8 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) return 0; } -static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; @@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) } #ifdef CONFIG_COMPAT -static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) { return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 627419bd6f77..79356fd29a14 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -502,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg); +long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg); extern const struct inode_operations ntfs_special_inode_operations; extern const struct inode_operations ntfs_file_inode_operations; extern const struct file_operations ntfs_file_operations; From 622cd3daa8eae37359a6fd3c07c36d19f66606b5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 10 Nov 2023 20:59:22 +0100 Subject: [PATCH 044/270] fs/ntfs3: Slightly simplify ntfs_inode_printk() The size passed to snprintf() includes the space for the trailing space. So there is no reason here not to use all the available space. So remove the -1 when computing 'name_len'. While at it, use the size of the array directly instead of the intermediate 'name_len' variable. snprintf() also guaranties that the buffer if NULL terminated, so there is no need to write an additional trailing NULL "To be sure". Signed-off-by: Christophe JAILLET Signed-off-by: Konstantin Komarov --- fs/ntfs3/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index c55a29793a8d..cef5467fd928 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) if (name) { struct dentry *de = d_find_alias(inode); - const u32 name_len = ARRAY_SIZE(s_name_buf) - 1; if (de) { spin_lock(&de->d_lock); - snprintf(name, name_len, " \"%s\"", de->d_name.name); + snprintf(name, sizeof(s_name_buf), " \"%s\"", + de->d_name.name); spin_unlock(&de->d_lock); - name[name_len] = 0; /* To be sure. */ } else { name[0] = 0; } From 8eed4e00a370b37b4e5985ed983dccedd555ea9d Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 29 Jan 2024 14:38:42 +0800 Subject: [PATCH 045/270] x86/lib: Revert to _ASM_EXTABLE_UA() for {get,put}_user() fixups During memory error injection test on kernels >= v6.4, the kernel panics like below. However, this issue couldn't be reproduced on kernels <= v6.3. mce: [Hardware Error]: CPU 296: Machine Check Exception: f Bank 1: bd80000000100134 mce: [Hardware Error]: RIP 10: {__get_user_nocheck_4+0x6/0x20} mce: [Hardware Error]: TSC 411a93533ed ADDR 346a8730040 MISC 86 mce: [Hardware Error]: PROCESSOR 0:a06d0 TIME 1706000767 SOCKET 1 APIC 211 microcode 80001490 mce: [Hardware Error]: Run the above through 'mcelog --ascii' mce: [Hardware Error]: Machine check: Data load in unrecoverable area of kernel Kernel panic - not syncing: Fatal local machine check The MCA code can recover from an in-kernel #MC if the fixup type is EX_TYPE_UACCESS, explicitly indicating that the kernel is attempting to access userspace memory. However, if the fixup type is EX_TYPE_DEFAULT the only thing that is raised for an in-kernel #MC is a panic. ex_handler_uaccess() would warn if users gave a non-canonical addresses (with bit 63 clear) to {get, put}_user(), which was unexpected. Therefore, commit b19b74bc99b1 ("x86/mm: Rework address range check in get_user() and put_user()") replaced _ASM_EXTABLE_UA() with _ASM_EXTABLE() for {get, put}_user() fixups. However, the new fixup type EX_TYPE_DEFAULT results in a panic. Commit 6014bc27561f ("x86-64: make access_ok() independent of LAM") added the check gp_fault_address_ok() right before the WARN_ONCE() in ex_handler_uaccess() to not warn about non-canonical user addresses due to LAM. With that in place, revert back to _ASM_EXTABLE_UA() for {get,put}_user() exception fixups in order to be able to handle in-kernel MCEs correctly again. [ bp: Massage commit message. ] Fixes: b19b74bc99b1 ("x86/mm: Rework address range check in get_user() and put_user()") Signed-off-by: Qiuxu Zhuo Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Kirill A. Shutemov Cc: Link: https://lore.kernel.org/r/20240129063842.61584-1-qiuxu.zhuo@intel.com --- arch/x86/lib/getuser.S | 24 ++++++++++++------------ arch/x86/lib/putuser.S | 20 ++++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 20ef350a60fb..10d5ed8b5990 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -163,23 +163,23 @@ SYM_CODE_END(__get_user_8_handle_exception) #endif /* get_user */ - _ASM_EXTABLE(1b, __get_user_handle_exception) - _ASM_EXTABLE(2b, __get_user_handle_exception) - _ASM_EXTABLE(3b, __get_user_handle_exception) + _ASM_EXTABLE_UA(1b, __get_user_handle_exception) + _ASM_EXTABLE_UA(2b, __get_user_handle_exception) + _ASM_EXTABLE_UA(3b, __get_user_handle_exception) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(4b, __get_user_handle_exception) + _ASM_EXTABLE_UA(4b, __get_user_handle_exception) #else - _ASM_EXTABLE(4b, __get_user_8_handle_exception) - _ASM_EXTABLE(5b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(4b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(5b, __get_user_8_handle_exception) #endif /* __get_user */ - _ASM_EXTABLE(6b, __get_user_handle_exception) - _ASM_EXTABLE(7b, __get_user_handle_exception) - _ASM_EXTABLE(8b, __get_user_handle_exception) + _ASM_EXTABLE_UA(6b, __get_user_handle_exception) + _ASM_EXTABLE_UA(7b, __get_user_handle_exception) + _ASM_EXTABLE_UA(8b, __get_user_handle_exception) #ifdef CONFIG_X86_64 - _ASM_EXTABLE(9b, __get_user_handle_exception) + _ASM_EXTABLE_UA(9b, __get_user_handle_exception) #else - _ASM_EXTABLE(9b, __get_user_8_handle_exception) - _ASM_EXTABLE(10b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(9b, __get_user_8_handle_exception) + _ASM_EXTABLE_UA(10b, __get_user_8_handle_exception) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 2877f5934177..975c9c18263d 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -133,15 +133,15 @@ SYM_CODE_START_LOCAL(__put_user_handle_exception) RET SYM_CODE_END(__put_user_handle_exception) - _ASM_EXTABLE(1b, __put_user_handle_exception) - _ASM_EXTABLE(2b, __put_user_handle_exception) - _ASM_EXTABLE(3b, __put_user_handle_exception) - _ASM_EXTABLE(4b, __put_user_handle_exception) - _ASM_EXTABLE(5b, __put_user_handle_exception) - _ASM_EXTABLE(6b, __put_user_handle_exception) - _ASM_EXTABLE(7b, __put_user_handle_exception) - _ASM_EXTABLE(9b, __put_user_handle_exception) + _ASM_EXTABLE_UA(1b, __put_user_handle_exception) + _ASM_EXTABLE_UA(2b, __put_user_handle_exception) + _ASM_EXTABLE_UA(3b, __put_user_handle_exception) + _ASM_EXTABLE_UA(4b, __put_user_handle_exception) + _ASM_EXTABLE_UA(5b, __put_user_handle_exception) + _ASM_EXTABLE_UA(6b, __put_user_handle_exception) + _ASM_EXTABLE_UA(7b, __put_user_handle_exception) + _ASM_EXTABLE_UA(9b, __put_user_handle_exception) #ifdef CONFIG_X86_32 - _ASM_EXTABLE(8b, __put_user_handle_exception) - _ASM_EXTABLE(10b, __put_user_handle_exception) + _ASM_EXTABLE_UA(8b, __put_user_handle_exception) + _ASM_EXTABLE_UA(10b, __put_user_handle_exception) #endif From e4ad71e2367f3f7b7409c30df9cdbb34da15e012 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Fri, 26 Jan 2024 11:57:24 +0100 Subject: [PATCH 046/270] MAINTAINERS: wifi: brcm80211: cleanup entry There has been some discussion about what is expected from a maintainer and so a cleanup seems to be in order. A dedicated mailing list has been created to discuss brcm80211 specific development issues. Keeping the status as Supported although help in maintaining this driver is welcomed. Cc: brcm80211@lists.linux.dev Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://msgid.link/20240126105724.384063-1-arend.vanspriel@broadcom.com --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index d8ca0e10c8d1..25d530ab90d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4169,14 +4169,14 @@ F: drivers/firmware/broadcom/tee_bnxt_fw.c F: drivers/net/ethernet/broadcom/bnxt/ F: include/linux/firmware/broadcom/tee_bnxt_fw.h -BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER -M: Arend van Spriel -M: Franky Lin -M: Hante Meuleman +BROADCOM BRCM80211 IEEE802.11 WIRELESS DRIVERS +M: Arend van Spriel L: linux-wireless@vger.kernel.org +L: brcm80211@lists.linux.dev L: brcm80211-dev-list.pdl@broadcom.com S: Supported F: drivers/net/wireless/broadcom/brcm80211/ +F: include/linux/platform_data/brcmfmac.h BROADCOM BRCMSTB GPIO DRIVER M: Doug Berger From 02add85a9eefb5c969b9bb6916f14607ca2faae0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 29 Nov 2023 14:40:41 -0800 Subject: [PATCH 047/270] KVM: selftests: Reword the NX hugepage test's skip message to be more helpful Rework the NX hugepage test's skip message regarding the magic token to provide all of the necessary magic, and to very explicitly recommended using the wrapper shell script. Opportunistically remove an overzealous newline; splitting the recommendation message across two lines of ~45 characters makes it much harder to read than running out a single line to 98 characters. Link: https://lore.kernel.org/r/20231129224042.530798-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 83e25bccc139..17bbb96fc4df 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -257,9 +257,9 @@ int main(int argc, char **argv) TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); __TEST_REQUIRE(token == MAGIC_TOKEN, - "This test must be run with the magic token %d.\n" - "This is done by nx_huge_pages_test.sh, which\n" - "also handles environment setup for the test.", MAGIC_TOKEN); + "This test must be run with the magic token via '-t %d'.\n" + "Running via nx_huge_pages_test.sh, which also handles " + "environment setup, is strongly recommended.", MAGIC_TOKEN); run_test(reclaim_period_ms, false, reboot_permissions); run_test(reclaim_period_ms, true, reboot_permissions); From 250e138d876838774bca3140d544438898df0489 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 6 Dec 2023 18:02:43 +0100 Subject: [PATCH 048/270] KVM: selftests: Remove redundant newlines TEST_* functions append their own newline. Remove newlines from TEST_* callsites to avoid extra newlines in output. Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20231206170241.82801-8-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/demand_paging_test.c | 4 ++-- .../testing/selftests/kvm/dirty_log_perf_test.c | 4 ++-- tools/testing/selftests/kvm/dirty_log_test.c | 4 ++-- tools/testing/selftests/kvm/get-reg-list.c | 2 +- tools/testing/selftests/kvm/guest_print_test.c | 8 ++++---- .../selftests/kvm/hardware_disable_test.c | 6 +++--- .../testing/selftests/kvm/kvm_create_max_vcpus.c | 2 +- .../testing/selftests/kvm/kvm_page_table_test.c | 4 ++-- tools/testing/selftests/kvm/lib/elf.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 16 ++++++++-------- tools/testing/selftests/kvm/lib/memstress.c | 2 +- .../testing/selftests/kvm/lib/userfaultfd_util.c | 2 +- .../kvm/memslot_modification_stress_test.c | 2 +- tools/testing/selftests/kvm/memslot_perf_test.c | 6 +++--- tools/testing/selftests/kvm/rseq_test.c | 4 ++-- .../selftests/kvm/set_memory_region_test.c | 6 +++--- .../selftests/kvm/system_counter_offset_test.c | 2 +- 17 files changed, 38 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 09c116a82a84..bf3609f71854 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -45,10 +45,10 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) /* Let the guest access its memory */ ret = _vcpu_run(vcpu); - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); if (get_ucall(vcpu, NULL) != UCALL_SYNC) { TEST_ASSERT(false, - "Invalid guest sync status: exit_reason=%s\n", + "Invalid guest sync status: exit_reason=%s", exit_reason_str(run->exit_reason)); } diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index d374dbcf9a53..504f6fe980e8 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -88,9 +88,9 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) ret = _vcpu_run(vcpu); ts_diff = timespec_elapsed(start); - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, - "Invalid guest sync status: exit_reason=%s\n", + "Invalid guest sync status: exit_reason=%s", exit_reason_str(run->exit_reason)); pr_debug("Got sync event from vCPU %d\n", vcpu_idx); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 6cbecf499767..babea97b31a4 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -262,7 +262,7 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) "vcpu run failed: errno=%d", err); TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, - "Invalid guest sync status: exit_reason=%s\n", + "Invalid guest sync status: exit_reason=%s", exit_reason_str(run->exit_reason)); vcpu_handle_sync_stop(); @@ -410,7 +410,7 @@ static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) pr_info("vcpu continues now.\n"); } else { TEST_ASSERT(false, "Invalid guest sync status: " - "exit_reason=%s\n", + "exit_reason=%s", exit_reason_str(run->exit_reason)); } } diff --git a/tools/testing/selftests/kvm/get-reg-list.c b/tools/testing/selftests/kvm/get-reg-list.c index 8274ef04301f..91f05f78e824 100644 --- a/tools/testing/selftests/kvm/get-reg-list.c +++ b/tools/testing/selftests/kvm/get-reg-list.c @@ -152,7 +152,7 @@ static void check_supported(struct vcpu_reg_list *c) continue; __TEST_REQUIRE(kvm_has_cap(s->capability), - "%s: %s not available, skipping tests\n", + "%s: %s not available, skipping tests", config_name(c), s->name); } } diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c index 41230b746190..3502caa3590c 100644 --- a/tools/testing/selftests/kvm/guest_print_test.c +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -98,7 +98,7 @@ static void ucall_abort(const char *assert_msg, const char *expected_assert_msg) int offset = len_str - len_substr; TEST_ASSERT(len_substr <= len_str, - "Expected '%s' to be a substring of '%s'\n", + "Expected '%s' to be a substring of '%s'", assert_msg, expected_assert_msg); TEST_ASSERT(strcmp(&assert_msg[offset], expected_assert_msg) == 0, @@ -116,7 +116,7 @@ static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, - "Unexpected exit reason: %u (%s),\n", + "Unexpected exit reason: %u (%s),", run->exit_reason, exit_reason_str(run->exit_reason)); switch (get_ucall(vcpu, &uc)) { @@ -161,11 +161,11 @@ static void test_limits(void) vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, - "Unexpected exit reason: %u (%s),\n", + "Unexpected exit reason: %u (%s),", run->exit_reason, exit_reason_str(run->exit_reason)); TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_ABORT, - "Unexpected ucall command: %lu, Expected: %u (UCALL_ABORT)\n", + "Unexpected ucall command: %lu, Expected: %u (UCALL_ABORT)", uc.cmd, UCALL_ABORT); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index f5d59b9934f1..decc521fc760 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -41,7 +41,7 @@ static void *run_vcpu(void *arg) vcpu_run(vcpu); - TEST_ASSERT(false, "%s: exited with reason %d: %s\n", + TEST_ASSERT(false, "%s: exited with reason %d: %s", __func__, run->exit_reason, exit_reason_str(run->exit_reason)); pthread_exit(NULL); @@ -55,7 +55,7 @@ static void *sleeping_thread(void *arg) fd = open("/dev/null", O_RDWR); close(fd); } - TEST_ASSERT(false, "%s: exited\n", __func__); + TEST_ASSERT(false, "%s: exited", __func__); pthread_exit(NULL); } @@ -118,7 +118,7 @@ static void run_test(uint32_t run) for (i = 0; i < VCPU_NUM; ++i) check_join(threads[i], &b); /* Should not be reached */ - TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); + TEST_ASSERT(false, "%s: [%d] child escaped the ninja", __func__, run); } void wait_for_child_setup(pid_t pid) diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 31b3cb24b9a7..b9e23265e4b3 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -65,7 +65,7 @@ int main(int argc, char *argv[]) int r = setrlimit(RLIMIT_NOFILE, &rl); __TEST_REQUIRE(r >= 0, - "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n", + "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)", old_rlim_max, nr_fds_wanted); } else { TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index e37dc9c21888..e0ba97ac1c56 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -204,9 +204,9 @@ static void *vcpu_worker(void *data) ret = _vcpu_run(vcpu); ts_diff = timespec_elapsed(start); - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, - "Invalid guest sync status: exit_reason=%s\n", + "Invalid guest sync status: exit_reason=%s", exit_reason_str(vcpu->run->exit_reason)); pr_debug("Got sync event from vCPU %d\n", vcpu->id); diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index 266f3876e10a..f34d926d9735 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -184,7 +184,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) "Seek to program segment offset failed,\n" " program header idx: %u errno: %i\n" " offset_rv: 0x%jx\n" - " expected: 0x%jx\n", + " expected: 0x%jx", n1, errno, (intmax_t) offset_rv, (intmax_t) phdr.p_offset); test_read(fd, addr_gva2hva(vm, phdr.p_vaddr), diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e066d584c656..8754abaff5e9 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -320,7 +320,7 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, uint64_t nr_pages; TEST_ASSERT(nr_runnable_vcpus, - "Use vm_create_barebones() for VMs that _never_ have vCPUs\n"); + "Use vm_create_barebones() for VMs that _never_ have vCPUs"); TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), "nr_vcpus = %d too large for host, max-vcpus = %d", @@ -491,7 +491,7 @@ void kvm_pin_this_task_to_pcpu(uint32_t pcpu) CPU_ZERO(&mask); CPU_SET(pcpu, &mask); r = sched_setaffinity(0, sizeof(mask), &mask); - TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.\n", pcpu); + TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.", pcpu); } static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) @@ -499,7 +499,7 @@ static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) uint32_t pcpu = atoi_non_negative("CPU number", cpu_str); TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask), - "Not allowed to run on pCPU '%d', check cgroups?\n", pcpu); + "Not allowed to run on pCPU '%d', check cgroups?", pcpu); return pcpu; } @@ -529,7 +529,7 @@ void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], int i, r; cpu_list = strdup(pcpus_string); - TEST_ASSERT(cpu_list, "strdup() allocation failed.\n"); + TEST_ASSERT(cpu_list, "strdup() allocation failed."); r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask); TEST_ASSERT(!r, "sched_getaffinity() failed"); @@ -538,7 +538,7 @@ void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], /* 1. Get all pcpus for vcpus. */ for (i = 0; i < nr_vcpus; i++) { - TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'\n", i); + TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i); vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask); cpu = strtok(NULL, delim); } @@ -1057,7 +1057,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d\n", + " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", ret, errno, slot, flags, guest_paddr, (uint64_t) region->region.memory_size, region->region.guest_memfd); @@ -1222,7 +1222,7 @@ void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, len = min_t(uint64_t, end - gpa, region->region.memory_size - offset); ret = fallocate(region->region.guest_memfd, mode, fd_offset, len); - TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx\n", + TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx", punch_hole ? "punch hole" : "allocate", gpa, len, region->region.guest_memfd, mode, fd_offset); } @@ -1265,7 +1265,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) struct kvm_vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. */ - TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id); + TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id); /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index d05487e5a371..cf2c73971308 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -192,7 +192,7 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, TEST_ASSERT(guest_num_pages < region_end_gfn, "Requested more guest memory than address space allows.\n" " guest pages: %" PRIx64 " max gfn: %" PRIx64 - " nr_vcpus: %d wss: %" PRIx64 "]\n", + " nr_vcpus: %d wss: %" PRIx64 "]", guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes); args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size; diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index 271f63891581..f4eef6eb2dc2 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -69,7 +69,7 @@ static void *uffd_handler_thread_fn(void *arg) if (pollfd[1].revents & POLLIN) { r = read(pollfd[1].fd, &tmp_chr, 1); TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread\n"); + "Error reading pipefd in UFFD thread"); break; } diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 9855c41ca811..156361966612 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -45,7 +45,7 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) /* Let the guest access its memory until a stop signal is received */ while (!READ_ONCE(memstress_args.stop_vcpus)) { ret = _vcpu_run(vcpu); - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); if (get_ucall(vcpu, NULL) == UCALL_SYNC) continue; diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 8698d1ab60d0..579a64f97333 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -175,11 +175,11 @@ static void wait_for_vcpu(void) struct timespec ts; TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), - "clock_gettime() failed: %d\n", errno); + "clock_gettime() failed: %d", errno); ts.tv_sec += 2; TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), - "sem_timedwait() failed: %d\n", errno); + "sem_timedwait() failed: %d", errno); } static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) @@ -336,7 +336,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, slot); TEST_ASSERT(gpa == guest_addr, - "vm_phy_pages_alloc() failed\n"); + "vm_phy_pages_alloc() failed"); data->hva_slots[slot - 1] = addr_gpa2hva(data->vm, guest_addr); memset(data->hva_slots[slot - 1], 0, npages * guest_page_size); diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index f74e76d03b7e..28f97fb52044 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) } while (snapshot != atomic_read(&seq_cnt)); TEST_ASSERT(rseq_cpu == cpu, - "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu); + "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu); } /* @@ -256,7 +256,7 @@ int main(int argc, char *argv[]) * migrations given the 1us+ delay in the migration task. */ TEST_ASSERT(i > (NR_TASK_MIGRATIONS / 2), - "Only performed %d KVM_RUNs, task stalled too much?\n", i); + "Only performed %d KVM_RUNs, task stalled too much?", i); pthread_join(migration_thread, NULL); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 075b80dbe237..40337f566eeb 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -98,11 +98,11 @@ static void wait_for_vcpu(void) struct timespec ts; TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), - "clock_gettime() failed: %d\n", errno); + "clock_gettime() failed: %d", errno); ts.tv_sec += 2; TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), - "sem_timedwait() failed: %d\n", errno); + "sem_timedwait() failed: %d", errno); /* Wait for the vCPU thread to reenter the guest. */ usleep(100000); @@ -302,7 +302,7 @@ static void test_delete_memory_region(void) if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) TEST_ASSERT(regs.rip >= final_rip_start && regs.rip < final_rip_end, - "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n", + "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx", final_rip_start, final_rip_end, regs.rip); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c index 7f5b330b6a1b..513d421a9bff 100644 --- a/tools/testing/selftests/kvm/system_counter_offset_test.c +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -108,7 +108,7 @@ static void enter_guest(struct kvm_vcpu *vcpu) handle_abort(&uc); return; default: - TEST_ASSERT(0, "unhandled ucall %ld\n", + TEST_ASSERT(0, "unhandled ucall %ld", get_ucall(vcpu, &uc)); } } From 95be17e4008b592342ec6cfb9264f4b54c21b790 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 6 Dec 2023 18:02:44 +0100 Subject: [PATCH 049/270] KVM: selftests: aarch64: Remove redundant newlines TEST_* functions append their own newline. Remove newlines from TEST_* callsites to avoid extra newlines in output. Signed-off-by: Andrew Jones Acked-by: Zenghui Yu Link: https://lore.kernel.org/r/20231206170241.82801-9-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 12 ++++++------ tools/testing/selftests/kvm/aarch64/hypercalls.c | 16 ++++++++-------- .../selftests/kvm/aarch64/page_fault_test.c | 6 +++--- .../testing/selftests/kvm/aarch64/smccc_filter.c | 2 +- .../selftests/kvm/aarch64/vpmu_counter_access.c | 12 ++++++------ .../selftests/kvm/lib/aarch64/processor.c | 2 +- tools/testing/selftests/kvm/lib/aarch64/vgic.c | 4 ++-- 7 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 274b8465b42a..2cb8dd1f8275 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -248,7 +248,7 @@ static void *test_vcpu_run(void *arg) REPORT_GUEST_ASSERT(uc); break; default: - TEST_FAIL("Unexpected guest exit\n"); + TEST_FAIL("Unexpected guest exit"); } return NULL; @@ -287,7 +287,7 @@ static int test_migrate_vcpu(unsigned int vcpu_idx) /* Allow the error where the vCPU thread is already finished */ TEST_ASSERT(ret == 0 || ret == ESRCH, - "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d\n", + "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d", vcpu_idx, new_pcpu, ret); return ret; @@ -326,12 +326,12 @@ static void test_run(struct kvm_vm *vm) pthread_mutex_init(&vcpu_done_map_lock, NULL); vcpu_done_map = bitmap_zalloc(test_args.nr_vcpus); - TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap\n"); + TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap"); for (i = 0; i < (unsigned long)test_args.nr_vcpus; i++) { ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run, (void *)(unsigned long)i); - TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread\n", i); + TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread", i); } /* Spawn a thread to control the vCPU migrations */ @@ -340,7 +340,7 @@ static void test_run(struct kvm_vm *vm) ret = pthread_create(&pt_vcpu_migration, NULL, test_vcpu_migration, NULL); - TEST_ASSERT(!ret, "Failed to create the migration pthread\n"); + TEST_ASSERT(!ret, "Failed to create the migration pthread"); } @@ -384,7 +384,7 @@ static struct kvm_vm *test_vm_create(void) if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &test_args.offset); else - TEST_FAIL("no support for global offset\n"); + TEST_FAIL("no support for global offset"); } for (i = 0; i < nr_vcpus; i++) diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 31f66ba97228..27c10e7a7e01 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -175,18 +175,18 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) /* First 'read' should be an upper limit of the features supported */ vcpu_get_reg(vcpu, reg_info->reg, &val); TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), - "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx\n", + "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); /* Test a 'write' by disabling all the features of the register map */ ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); TEST_ASSERT(ret == 0, - "Failed to clear all the features of reg: 0x%lx; ret: %d\n", + "Failed to clear all the features of reg: 0x%lx; ret: %d", reg_info->reg, errno); vcpu_get_reg(vcpu, reg_info->reg, &val); TEST_ASSERT(val == 0, - "Expected all the features to be cleared for reg: 0x%lx\n", reg_info->reg); + "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); /* * Test enabling a feature that's not supported. @@ -195,7 +195,7 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) if (reg_info->max_feat_bit < 63) { ret = __vcpu_set_reg(vcpu, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); TEST_ASSERT(ret != 0 && errno == EINVAL, - "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx\n", + "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx", errno, reg_info->reg); } } @@ -216,7 +216,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) */ vcpu_get_reg(vcpu, reg_info->reg, &val); TEST_ASSERT(val == 0, - "Expected all the features to be cleared for reg: 0x%lx\n", + "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); /* @@ -226,7 +226,7 @@ static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) */ ret = __vcpu_set_reg(vcpu, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); TEST_ASSERT(ret != 0 && errno == EBUSY, - "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx\n", + "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx", errno, reg_info->reg); } } @@ -265,7 +265,7 @@ static void test_guest_stage(struct kvm_vm **vm, struct kvm_vcpu **vcpu) case TEST_STAGE_HVC_IFACE_FALSE_INFO: break; default: - TEST_FAIL("Unknown test stage: %d\n", prev_stage); + TEST_FAIL("Unknown test stage: %d", prev_stage); } } @@ -294,7 +294,7 @@ static void test_run(void) REPORT_GUEST_ASSERT(uc); break; default: - TEST_FAIL("Unexpected guest exit\n"); + TEST_FAIL("Unexpected guest exit"); } } diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 08a5ca5bed56..53fddad57cbb 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -414,10 +414,10 @@ static bool punch_hole_in_backing_store(struct kvm_vm *vm, if (fd != -1) { ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, paging_size); - TEST_ASSERT(ret == 0, "fallocate failed\n"); + TEST_ASSERT(ret == 0, "fallocate failed"); } else { ret = madvise(hva, paging_size, MADV_DONTNEED); - TEST_ASSERT(ret == 0, "madvise failed\n"); + TEST_ASSERT(ret == 0, "madvise failed"); } return true; @@ -501,7 +501,7 @@ static bool handle_cmd(struct kvm_vm *vm, int cmd) void fail_vcpu_run_no_handler(int ret) { - TEST_FAIL("Unexpected vcpu run failure\n"); + TEST_FAIL("Unexpected vcpu run failure"); } void fail_vcpu_run_mmio_no_syndrome_handler(int ret) diff --git a/tools/testing/selftests/kvm/aarch64/smccc_filter.c b/tools/testing/selftests/kvm/aarch64/smccc_filter.c index f4ceae9c8925..2d189f3da228 100644 --- a/tools/testing/selftests/kvm/aarch64/smccc_filter.c +++ b/tools/testing/selftests/kvm/aarch64/smccc_filter.c @@ -178,7 +178,7 @@ static void expect_call_denied(struct kvm_vcpu *vcpu) struct ucall uc; if (get_ucall(vcpu, &uc) != UCALL_SYNC) - TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd); + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); TEST_ASSERT(uc.args[1] == SMCCC_RET_NOT_SUPPORTED, "Unexpected SMCCC return code: %lu", uc.args[1]); diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index 9d51b5691349..5f9713364693 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -517,11 +517,11 @@ static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) if (expect_fail) TEST_ASSERT(pmcr_orig == pmcr, - "PMCR.N modified by KVM to a larger value (PMCR: 0x%lx) for pmcr_n: 0x%lx\n", + "PMCR.N modified by KVM to a larger value (PMCR: 0x%lx) for pmcr_n: 0x%lx", pmcr, pmcr_n); else TEST_ASSERT(pmcr_n == get_pmcr_n(pmcr), - "Failed to update PMCR.N to %lu (received: %lu)\n", + "Failed to update PMCR.N to %lu (received: %lu)", pmcr_n, get_pmcr_n(pmcr)); } @@ -594,12 +594,12 @@ static void run_pmregs_validity_test(uint64_t pmcr_n) */ vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), ®_val); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Initial read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx\n", + "Initial read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(set_reg_id), reg_val); vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id), ®_val); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Initial read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx\n", + "Initial read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(clr_reg_id), reg_val); /* @@ -611,12 +611,12 @@ static void run_pmregs_validity_test(uint64_t pmcr_n) vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), ®_val); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx\n", + "Read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(set_reg_id), reg_val); vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id), ®_val); TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, - "Read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx\n", + "Read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", KVM_ARM64_SYS_REG(clr_reg_id), reg_val); } diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 41c776b642c0..43b9a7283360 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -398,7 +398,7 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) int i; TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" - " num: %u\n", num); + " num: %u", num); va_start(ap, num); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index b5f28d21a947..184378d593e9 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -38,7 +38,7 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, struct list_head *iter; unsigned int nr_gic_pages, nr_vcpus_created = 0; - TEST_ASSERT(nr_vcpus, "Number of vCPUs cannot be empty\n"); + TEST_ASSERT(nr_vcpus, "Number of vCPUs cannot be empty"); /* * Make sure that the caller is infact calling this @@ -47,7 +47,7 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, list_for_each(iter, &vm->vcpus) nr_vcpus_created++; TEST_ASSERT(nr_vcpus == nr_vcpus_created, - "Number of vCPUs requested (%u) doesn't match with the ones created for the VM (%u)\n", + "Number of vCPUs requested (%u) doesn't match with the ones created for the VM (%u)", nr_vcpus, nr_vcpus_created); /* Distributor setup */ From 93e43e50b80b4f342251fb48f8ebced4d3c34983 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 6 Dec 2023 18:02:45 +0100 Subject: [PATCH 050/270] KVM: selftests: riscv: Remove redundant newlines TEST_* functions append their own newline. Remove newlines from TEST_* callsites to avoid extra newlines in output. Signed-off-by: Andrew Jones Acked-by: Anup Patel Link: https://lore.kernel.org/r/20231206170241.82801-10-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +- tools/testing/selftests/kvm/riscv/get-reg-list.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 7ca736fb4194..2bb33a8ac03c 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -327,7 +327,7 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) int i; TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" - " num: %u\n", num); + " num: %u", num); va_start(ap, num); diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index 6652108816db..50bb9bb73a7b 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -150,7 +150,7 @@ void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) /* Double check whether the desired extension was enabled */ __TEST_REQUIRE(vcpu_has_ext(vcpu, feature), - "%s not available, skipping tests\n", s->name); + "%s not available, skipping tests", s->name); } } From a38125f188c141cd1297d14af84c28b5276ab287 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 6 Dec 2023 18:02:46 +0100 Subject: [PATCH 051/270] KVM: selftests: s390x: Remove redundant newlines TEST_* functions append their own newline. Remove newlines from TEST_* callsites to avoid extra newlines in output. Signed-off-by: Andrew Jones Acked-by: Janosch Frank Acked-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20231206170241.82801-11-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/lib/s390x/processor.c | 2 +- tools/testing/selftests/kvm/s390x/resets.c | 4 ++-- .../selftests/kvm/s390x/sync_regs_test.c | 20 +++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 15945121daf1..f6d227892cbc 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -198,7 +198,7 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) int i; TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" - " num: %u\n", + " num: %u", num); va_start(ap, num); diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index e41e2cb8ffa9..357943f2bea8 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -78,7 +78,7 @@ static void assert_noirq(struct kvm_vcpu *vcpu) * (notably, the emergency call interrupt we have injected) should * be cleared by the resets, so this should be 0. */ - TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d\n", errno); + TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d", errno); TEST_ASSERT(!irqs, "IRQ pending"); } @@ -199,7 +199,7 @@ static void inject_irq(struct kvm_vcpu *vcpu) irq->type = KVM_S390_INT_EMERGENCY; irq->u.emerg.code = vcpu->id; irqs = __vcpu_ioctl(vcpu, KVM_S390_SET_IRQ_STATE, &irq_state); - TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); + TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d", errno); } static struct kvm_vm *create_vm(struct kvm_vcpu **vcpu) diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 636a70ddac1e..43fb25ddc3ec 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -39,13 +39,13 @@ static void guest_code(void) #define REG_COMPARE(reg) \ TEST_ASSERT(left->reg == right->reg, \ "Register " #reg \ - " values did not match: 0x%llx, 0x%llx\n", \ + " values did not match: 0x%llx, 0x%llx", \ left->reg, right->reg) #define REG_COMPARE32(reg) \ TEST_ASSERT(left->reg == right->reg, \ "Register " #reg \ - " values did not match: 0x%x, 0x%x\n", \ + " values did not match: 0x%x, 0x%x", \ left->reg, right->reg) @@ -82,14 +82,14 @@ void test_read_invalid(struct kvm_vcpu *vcpu) run->kvm_valid_regs = INVALID_SYNC_FIELD; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_valid_regs = 0; } @@ -103,14 +103,14 @@ void test_set_invalid(struct kvm_vcpu *vcpu) run->kvm_dirty_regs = INVALID_SYNC_FIELD; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_dirty_regs = 0; } @@ -125,12 +125,12 @@ void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) /* Request and verify all valid register sets. */ run->kvm_valid_regs = TEST_SYNC_FIELDS; rv = _vcpu_run(vcpu); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); TEST_ASSERT(run->s390_sieic.icptcode == 4 && (run->s390_sieic.ipa >> 8) == 0x83 && (run->s390_sieic.ipb >> 16) == 0x501, - "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x\n", + "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x", run->s390_sieic.icptcode, run->s390_sieic.ipa, run->s390_sieic.ipb); @@ -161,7 +161,7 @@ void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) } rv = _vcpu_run(vcpu); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1, "r11 sync regs value incorrect 0x%llx.", @@ -193,7 +193,7 @@ void test_clear_kvm_dirty_regs_bits(struct kvm_vcpu *vcpu) run->s.regs.gprs[11] = 0xDEADBEEF; run->s.regs.diag318 = 0x4B1D; rv = _vcpu_run(vcpu); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF, "r11 sync regs value incorrect 0x%llx.", From d877550eaf2dc9090d782864c96939397a3c6835 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 29 Jan 2024 22:36:03 -0800 Subject: [PATCH 052/270] x86/fpu: Stop relying on userspace for info to fault in xsave buffer Before this change, the expected size of the user space buffer was taken from fx_sw->xstate_size. fx_sw->xstate_size can be changed from user-space, so it is possible construct a sigreturn frame where: * fx_sw->xstate_size is smaller than the size required by valid bits in fx_sw->xfeatures. * user-space unmaps parts of the sigrame fpu buffer so that not all of the buffer required by xrstor is accessible. In this case, xrstor tries to restore and accesses the unmapped area which results in a fault. But fault_in_readable succeeds because buf + fx_sw->xstate_size is within the still mapped area, so it goes back and tries xrstor again. It will spin in this loop forever. Instead, fault in the maximum size which can be touched by XRSTOR (taken from fpstate->user_size). [ dhansen: tweak subject / changelog ] Fixes: fcb3635f5018 ("x86/fpu/signal: Handle #PF in the direct restore path") Reported-by: Konstantin Bogomolov Suggested-by: Thomas Gleixner Signed-off-by: Andrei Vagin Signed-off-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20240130063603.3392627-1-avagin%40google.com --- arch/x86/kernel/fpu/signal.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 558076dbde5b..247f2225aa9f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -274,12 +274,13 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { struct fpu *fpu = ¤t->thread.fpu; int ret; + /* Restore enabled features only. */ + xrestore &= fpu->fpstate->user_xfeatures; retry: fpregs_lock(); /* Ensure that XFD is up to date */ @@ -309,7 +310,7 @@ retry: if (ret != X86_TRAP_PF) return false; - if (!fault_in_readable(buf, size)) + if (!fault_in_readable(buf, fpu->fpstate->user_size)) goto retry; return false; } @@ -339,7 +340,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; - unsigned int state_size; u64 user_xfeatures = 0; if (use_xsave()) { @@ -349,17 +349,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, return false; fx_only = !fx_sw_user.magic1; - state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; - state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { /* Restore the FPU registers directly from user memory. */ - return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); } /* From 65612e993493014cb95be81ca4f98e08732c46d0 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 6 Dec 2023 18:02:47 +0100 Subject: [PATCH 053/270] KVM: selftests: x86_64: Remove redundant newlines TEST_* functions append their own newline. Remove newlines from TEST_* callsites to avoid extra newlines in output. Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20231206170241.82801-12-ajones@ventanamicro.com [sean: keep the newline in the "tsc\n" strncmp()] Signed-off-by: Sean Christopherson --- .../selftests/kvm/lib/x86_64/processor.c | 10 +++---- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 6 ++-- tools/testing/selftests/kvm/x86_64/amx_test.c | 2 +- .../testing/selftests/kvm/x86_64/cpuid_test.c | 4 +-- .../selftests/kvm/x86_64/flds_emulation.h | 2 +- .../selftests/kvm/x86_64/hyperv_clock.c | 4 +-- .../testing/selftests/kvm/x86_64/hyperv_ipi.c | 2 +- .../selftests/kvm/x86_64/hyperv_tlb_flush.c | 2 +- .../selftests/kvm/x86_64/kvm_clock_test.c | 6 ++-- .../selftests/kvm/x86_64/platform_info_test.c | 2 +- .../kvm/x86_64/pmu_event_filter_test.c | 2 +- .../selftests/kvm/x86_64/sev_migrate_tests.c | 28 +++++++++---------- .../smaller_maxphyaddr_emulation_test.c | 4 +-- .../selftests/kvm/x86_64/sync_regs_test.c | 10 +++---- .../kvm/x86_64/ucna_injection_test.c | 8 +++--- .../selftests/kvm/x86_64/userspace_io_test.c | 2 +- .../kvm/x86_64/vmx_apic_access_test.c | 2 +- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 16 +++++------ .../vmx_exception_with_invalid_guest_state.c | 2 +- .../selftests/kvm/x86_64/xapic_ipi_test.c | 8 +++--- .../selftests/kvm/x86_64/xcr0_cpuid_test.c | 2 +- .../selftests/kvm/x86_64/xss_msr_test.c | 2 +- 22 files changed, 63 insertions(+), 63 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index d8288374078e..4bc52948447d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -170,10 +170,10 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, * this level. */ TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", + "Cannot create hugepage at level: %u, vaddr: 0x%lx", current_level, vaddr); TEST_ASSERT(!(*pte & PTE_LARGE_MASK), - "Cannot create page table at level: %u, vaddr: 0x%lx\n", + "Cannot create page table at level: %u, vaddr: 0x%lx", current_level, vaddr); } return pte; @@ -220,7 +220,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) /* Fill in page table entry. */ pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), - "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); + "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); } @@ -253,7 +253,7 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) if (*pte & PTE_LARGE_MASK) { TEST_ASSERT(*level == PG_LEVEL_NONE || *level == current_level, - "Unexpected hugepage at level %d\n", current_level); + "Unexpected hugepage at level %d", current_level); *level = current_level; } @@ -825,7 +825,7 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) struct kvm_regs regs; TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" - " num: %u\n", + " num: %u", num); va_start(ap, num); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 59d97531c9b1..089b8925b6b2 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -54,7 +54,7 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) /* KVM should return supported EVMCS version range */ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && (evmcs_ver & 0xff) > 0, - "Incorrect EVMCS version range: %x:%x\n", + "Incorrect EVMCS version range: %x:%x", evmcs_ver & 0xff, evmcs_ver >> 8); return evmcs_ver; @@ -387,10 +387,10 @@ static void nested_create_pte(struct kvm_vm *vm, * this level. */ TEST_ASSERT(current_level != target_level, - "Cannot create hugepage at level: %u, nested_paddr: 0x%lx\n", + "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", current_level, nested_paddr); TEST_ASSERT(!pte->page_size, - "Cannot create page table at level: %u, nested_paddr: 0x%lx\n", + "Cannot create page table at level: %u, nested_paddr: 0x%lx", current_level, nested_paddr); } } diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index 11329e5ff945..f7da543570ad 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -296,7 +296,7 @@ int main(int argc, char *argv[]) void *tiles_data = (void *)addr_gva2hva(vm, tiledata); /* Only check TMM0 register, 1 tile */ ret = memcmp(amx_start, tiles_data, TILE_SIZE); - TEST_ASSERT(ret == 0, "memcmp failed, ret=%d\n", ret); + TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); kvm_x86_state_cleanup(state); break; case 9: diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 3b34d8156d1c..8c579ce714e9 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -84,7 +84,7 @@ static void compare_cpuids(const struct kvm_cpuid2 *cpuid1, TEST_ASSERT(e1->function == e2->function && e1->index == e2->index && e1->flags == e2->flags, - "CPUID entries[%d] mismtach: 0x%x.%d.%x vs. 0x%x.%d.%x\n", + "CPUID entries[%d] mismtach: 0x%x.%d.%x vs. 0x%x.%d.%x", i, e1->function, e1->index, e1->flags, e2->function, e2->index, e2->flags); @@ -170,7 +170,7 @@ static void test_get_cpuid2(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent, - "KVM didn't update nent on success, wanted %u, got %u\n", + "KVM didn't update nent on success, wanted %u, got %u", vcpu->cpuid->nent, cpuid->nent); for (i = 0; i < vcpu->cpuid->nent; i++) { diff --git a/tools/testing/selftests/kvm/x86_64/flds_emulation.h b/tools/testing/selftests/kvm/x86_64/flds_emulation.h index 0a1573d52882..37b1a9f52864 100644 --- a/tools/testing/selftests/kvm/x86_64/flds_emulation.h +++ b/tools/testing/selftests/kvm/x86_64/flds_emulation.h @@ -41,7 +41,7 @@ static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu) insn_bytes = run->emulation_failure.insn_bytes; TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, - "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x\n", + "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x", insn_bytes[0], insn_bytes[1]); vcpu_regs_get(vcpu, ®s); diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index f5e1e98f04f9..65690d916db7 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -220,7 +220,7 @@ int main(void) tsc_page_gva = vm_vaddr_alloc_page(vm); memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, - "TSC page has to be page aligned\n"); + "TSC page has to be page aligned"); vcpu_args_set(vcpu, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); host_check_tsc_msr_rdtsc(vcpu); @@ -237,7 +237,7 @@ int main(void) break; case UCALL_DONE: /* Keep in sync with guest_main() */ - TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d\n", + TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d", stage); goto out; default: diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c index 65e5f4c05068..f1617762c22f 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -289,7 +289,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu[0], &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == stage, - "Unexpected stage: %ld (%d expected)\n", + "Unexpected stage: %ld (%d expected)", uc.args[1], stage); break; case UCALL_DONE: diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c index c4443f71f8dd..05b56095cf76 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c @@ -658,7 +658,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vcpu[0], &uc)) { case UCALL_SYNC: TEST_ASSERT(uc.args[1] == stage, - "Unexpected stage: %ld (%d expected)\n", + "Unexpected stage: %ld (%d expected)", uc.args[1], stage); break; case UCALL_ABORT: diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 1778704360a6..3e0b7d51abda 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -92,7 +92,7 @@ static void setup_clock(struct kvm_vm *vm, struct test_case *test_case) break; } while (errno == EINTR); - TEST_ASSERT(!r, "clock_gettime() failed: %d\n", r); + TEST_ASSERT(!r, "clock_gettime() failed: %d", r); data.realtime = ts.tv_sec * NSEC_PER_SEC; data.realtime += ts.tv_nsec; @@ -127,7 +127,7 @@ static void enter_guest(struct kvm_vcpu *vcpu) handle_abort(&uc); return; default: - TEST_ASSERT(0, "unhandled ucall: %ld\n", uc.cmd); + TEST_ASSERT(0, "unhandled ucall: %ld", uc.cmd); } } } @@ -154,7 +154,7 @@ static void check_clocksource(void) } clk_name = malloc(st.st_size); - TEST_ASSERT(clk_name, "failed to allocate buffer to read file\n"); + TEST_ASSERT(clk_name, "failed to allocate buffer to read file"); if (!fgets(clk_name, st.st_size, fp)) { pr_info("failed to read clocksource file: %d; assuming TSC.\n", diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index c9a07963d68a..87011965dc41 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -44,7 +44,7 @@ static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu) get_ucall(vcpu, &uc); TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd); + "Received ucall other than UCALL_SYNC: %lu", uc.cmd); TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == MSR_PLATFORM_INFO_MAX_TURBO_RATIO, "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 283cc55597a4..a3bd54b925ab 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -866,7 +866,7 @@ static void __test_fixed_counter_bitmap(struct kvm_vcpu *vcpu, uint8_t idx, * userspace doesn't set any pmu filter. */ count = run_vcpu_to_sync(vcpu); - TEST_ASSERT(count, "Unexpected count value: %ld\n", count); + TEST_ASSERT(count, "Unexpected count value: %ld", count); for (i = 0; i < BIT(nr_fixed_counters); i++) { bitmap = BIT(i); diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index c7ef97561038..a49828adf294 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -91,7 +91,7 @@ static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) int ret; ret = __sev_migrate_from(dst, src); - TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d\n", ret, errno); + TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d", ret, errno); } static void test_sev_migrate_from(bool es) @@ -113,7 +113,7 @@ static void test_sev_migrate_from(bool es) /* Migrate the guest back to the original VM. */ ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]); TEST_ASSERT(ret == -1 && errno == EIO, - "VM that was migrated from should be dead. ret %d, errno: %d\n", ret, + "VM that was migrated from should be dead. ret %d, errno: %d", ret, errno); kvm_vm_free(src_vm); @@ -172,7 +172,7 @@ static void test_sev_migrate_parameters(void) vm_no_sev = aux_vm_create(true); ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev); TEST_ASSERT(ret == -1 && errno == EINVAL, - "Migrations require SEV enabled. ret %d, errno: %d\n", ret, + "Migrations require SEV enabled. ret %d, errno: %d", ret, errno); if (!have_sev_es) @@ -187,25 +187,25 @@ static void test_sev_migrate_parameters(void) ret = __sev_migrate_from(sev_vm, sev_es_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "Should not be able migrate to SEV enabled VM. ret: %d, errno: %d\n", + "Should not be able migrate to SEV enabled VM. ret: %d, errno: %d", ret, errno); ret = __sev_migrate_from(sev_es_vm, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "Should not be able migrate to SEV-ES enabled VM. ret: %d, errno: %d\n", + "Should not be able migrate to SEV-ES enabled VM. ret: %d, errno: %d", ret, errno); ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d\n", + "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d", ret, errno); ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm_no_vmsa); TEST_ASSERT( ret == -1 && errno == EINVAL, - "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d\n", + "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d", ret, errno); kvm_vm_free(sev_vm); @@ -227,7 +227,7 @@ static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) int ret; ret = __sev_mirror_create(dst, src); - TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno); + TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno); } static void verify_mirror_allowed_cmds(int vm_fd) @@ -259,7 +259,7 @@ static void verify_mirror_allowed_cmds(int vm_fd) ret = __sev_ioctl(vm_fd, cmd_id, NULL, &fw_error); TEST_ASSERT( ret == -1 && errno == EINVAL, - "Should not be able call command: %d. ret: %d, errno: %d\n", + "Should not be able call command: %d. ret: %d, errno: %d", cmd_id, ret, errno); } @@ -301,18 +301,18 @@ static void test_sev_mirror_parameters(void) ret = __sev_mirror_create(sev_vm, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "Should not be able copy context to self. ret: %d, errno: %d\n", + "Should not be able copy context to self. ret: %d, errno: %d", ret, errno); ret = __sev_mirror_create(vm_no_vcpu, vm_with_vcpu); TEST_ASSERT(ret == -1 && errno == EINVAL, - "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, + "Copy context requires SEV enabled. ret %d, errno: %d", ret, errno); ret = __sev_mirror_create(vm_with_vcpu, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n", + "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d", ret, errno); if (!have_sev_es) @@ -322,13 +322,13 @@ static void test_sev_mirror_parameters(void) ret = __sev_mirror_create(sev_vm, sev_es_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "Should not be able copy context to SEV enabled VM. ret: %d, errno: %d\n", + "Should not be able copy context to SEV enabled VM. ret: %d, errno: %d", ret, errno); ret = __sev_mirror_create(sev_es_vm, sev_vm); TEST_ASSERT( ret == -1 && errno == EINVAL, - "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n", + "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d", ret, errno); kvm_vm_free(sev_es_vm); diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c index 06edf00a97d6..1a46dd7bb391 100644 --- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -74,7 +74,7 @@ int main(int argc, char *argv[]) MEM_REGION_SIZE / PAGE_SIZE, 0); gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE, MEM_REGION_GPA, MEM_REGION_SLOT); - TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc"); virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, PAGE_SIZE); @@ -102,7 +102,7 @@ int main(int argc, char *argv[]) case UCALL_DONE: break; default: - TEST_FAIL("Unrecognized ucall: %lu\n", uc.cmd); + TEST_FAIL("Unrecognized ucall: %lu", uc.cmd); } kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index 00965ba33f73..a91b5b145fa3 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -46,7 +46,7 @@ static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) #define REG_COMPARE(reg) \ TEST_ASSERT(left->reg == right->reg, \ "Register " #reg \ - " values did not match: 0x%llx, 0x%llx\n", \ + " values did not match: 0x%llx, 0x%llx", \ left->reg, right->reg) REG_COMPARE(rax); REG_COMPARE(rbx); @@ -230,14 +230,14 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = INVALID_SYNC_FIELD; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_valid_regs = 0; @@ -245,14 +245,14 @@ int main(int argc, char *argv[]) run->kvm_dirty_regs = INVALID_SYNC_FIELD; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); run->kvm_dirty_regs = 0; diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index 0ed32ec903d0..dcbb3c29fb8e 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -143,7 +143,7 @@ static void run_vcpu_expect_gp(struct kvm_vcpu *vcpu) TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_SYNC, - "Expect UCALL_SYNC\n"); + "Expect UCALL_SYNC"); TEST_ASSERT(uc.args[1] == SYNC_GP, "#GP is expected."); printf("vCPU received GP in guest.\n"); } @@ -188,7 +188,7 @@ static void *run_ucna_injection(void *arg) TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, - "Expect UCALL_SYNC\n"); + "Expect UCALL_SYNC"); TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA."); printf("Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR); @@ -198,7 +198,7 @@ static void *run_ucna_injection(void *arg) TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, - "Expect UCALL_SYNC\n"); + "Expect UCALL_SYNC"); TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA."); printf("Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR); @@ -208,7 +208,7 @@ static void *run_ucna_injection(void *arg) TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); if (get_ucall(params->vcpu, &uc) == UCALL_ABORT) { - TEST_ASSERT(false, "vCPU assertion failure: %s.\n", + TEST_ASSERT(false, "vCPU assertion failure: %s.", (const char *)uc.args[0]); } diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c index 255c50b0dc32..9481cbcf284f 100644 --- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) break; TEST_ASSERT(run->io.port == 0x80, - "Expected I/O at port 0x80, got port 0x%x\n", run->io.port); + "Expected I/O at port 0x80, got port 0x%x", run->io.port); /* * Modify the rep string count in RCX: 2 => 1 and 3 => 8192. diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c index 2bed5fb3a0d6..a81a24761aac 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -99,7 +99,7 @@ int main(int argc, char *argv[]) TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); TEST_ASSERT(run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION, - "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n", + "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u", run->internal.suberror); break; } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index e4ad5fef52ff..7f6f5f23fb9b 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -128,17 +128,17 @@ int main(int argc, char *argv[]) */ kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n"); + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n"); + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); } - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n"); + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); break; case UCALL_DONE: done = true; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c index a9b827c69f32..fad3634fd9eb 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -28,7 +28,7 @@ static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, - "Expected emulation failure, got %d\n", + "Expected emulation failure, got %d", run->emulation_failure.suberror); } diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c index 67ac2a3292ef..725c206ba0b9 100644 --- a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -216,7 +216,7 @@ static void *vcpu_thread(void *arg) "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" "Halter TPR=%#x PPR=%#x LVR=%#x\n" "Migrations attempted: %lu\n" - "Migrations completed: %lu\n", + "Migrations completed: %lu", vcpu->id, (const char *)uc.args[0], params->data->ipis_sent, params->data->hlt_count, params->data->wake_count, @@ -288,7 +288,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, } TEST_ASSERT(nodes > 1, - "Did not find at least 2 numa nodes. Can't do migration\n"); + "Did not find at least 2 numa nodes. Can't do migration"); fprintf(stderr, "Migrating amongst %d nodes found\n", nodes); @@ -347,7 +347,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, wake_count != data->wake_count, "IPI, HLT and wake count have not increased " "in the last %lu seconds. " - "HLTer is likely hung.\n", interval_secs); + "HLTer is likely hung.", interval_secs); ipis_sent = data->ipis_sent; hlt_count = data->hlt_count; @@ -381,7 +381,7 @@ void get_cmdline_args(int argc, char *argv[], int *run_secs, "-m adds calls to migrate_pages while vCPUs are running." " Default is no migrations.\n" "-d - delay between migrate_pages() calls." - " Default is %d microseconds.\n", + " Default is %d microseconds.", DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS); } } diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c index dc6217440db3..25a0b0db5c3c 100644 --- a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", + "Unexpected exit reason: %u (%s),", run->exit_reason, exit_reason_str(run->exit_reason)); diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index e0ddf47362e7..167c97abff1b 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); TEST_ASSERT(xss_val == 0, - "MSR_IA32_XSS should be initialized to zero\n"); + "MSR_IA32_XSS should be initialized to zero"); vcpu_set_msr(vcpu, MSR_IA32_XSS, xss_val); From 46fee9e38995af9ae16a8cc7d05031486d44cf35 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Jan 2024 14:03:02 -0800 Subject: [PATCH 054/270] KVM: selftests: Delete superfluous, unused "stage" variable in AMX test Delete the AMX's tests "stage" counter, as the counter is no longer used, which makes clang unhappy: x86_64/amx_test.c:224:6: error: variable 'stage' set but not used int stage, ret; ^ 1 error generated. Note, "stage" was never really used, it just happened to be dumped out by a (failed) assertion on run->exit_reason, i.e. the AMX test has no concept of stages, the code was likely copy+pasted from a different test. Fixes: c96f57b08012 ("KVM: selftests: Make vCPU exit reason test assertion common") Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20240109220302.399296-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/amx_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c index f7da543570ad..eae521f050e0 100644 --- a/tools/testing/selftests/kvm/x86_64/amx_test.c +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -221,7 +221,7 @@ int main(int argc, char *argv[]) vm_vaddr_t amx_cfg, tiledata, xstate; struct ucall uc; u32 amx_offset; - int stage, ret; + int ret; /* * Note, all off-by-default features must be enabled before anything @@ -263,7 +263,7 @@ int main(int argc, char *argv[]) memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate); - for (stage = 1; ; stage++) { + for (;;) { vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); From 8ad4855273488c9bd5320b3fee80f66f0023f326 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 29 Jan 2024 09:58:46 +0100 Subject: [PATCH 055/270] KVM: selftests: Avoid infinite loop in hyperv_features when invtsc is missing When X86_FEATURE_INVTSC is missing, guest_test_msrs_access() was supposed to skip testing dependent Hyper-V invariant TSC feature. Unfortunately, 'continue' does not lead to that as stage is not incremented. Moreover, 'vm' allocated with vm_create_with_one_vcpu() is not freed and the test runs out of available file descriptors very quickly. Fixes: bd827bd77537 ("KVM: selftests: Test Hyper-V invariant TSC control") Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240129085847.2674082-1-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/hyperv_features.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 4f4193fc74ff..b923a285e96f 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -454,7 +454,7 @@ static void guest_test_msrs_access(void) case 44: /* MSR is not available when CPUID feature bit is unset */ if (!has_invtsc) - continue; + goto next_stage; msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; msr->write = false; msr->fault_expected = true; @@ -462,7 +462,7 @@ static void guest_test_msrs_access(void) case 45: /* MSR is vailable when CPUID feature bit is set */ if (!has_invtsc) - continue; + goto next_stage; vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT); msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; msr->write = false; @@ -471,7 +471,7 @@ static void guest_test_msrs_access(void) case 46: /* Writing bits other than 0 is forbidden */ if (!has_invtsc) - continue; + goto next_stage; msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; msr->write = true; msr->write_val = 0xdeadbeef; @@ -480,7 +480,7 @@ static void guest_test_msrs_access(void) case 47: /* Setting bit 0 enables the feature */ if (!has_invtsc) - continue; + goto next_stage; msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; msr->write = true; msr->write_val = 1; @@ -513,6 +513,7 @@ static void guest_test_msrs_access(void) return; } +next_stage: stage++; kvm_vm_free(vm); } From c2a449a30fa2b5ddc1dd058e92e047157c9eeb9e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 29 Jan 2024 09:58:47 +0100 Subject: [PATCH 056/270] KVM: selftests: Fail tests when open() fails with !ENOENT open_path_or_exit() is used for '/dev/kvm', '/dev/sev', and '/sys/module/%s/parameters/%s' and skipping test when the entry is missing is completely reasonable. Other errors, however, may indicate a real issue which is easy to miss. E.g. when 'hyperv_features' test was entering an infinite loop the output was: ./hyperv_features Testing access to Hyper-V specific MSRs 1..0 # SKIP - /dev/kvm not available (errno: 24) and this can easily get overlooked. Keep ENOENT case 'special' for skipping tests and fail when open() results in any other errno. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240129085847.2674082-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/kvm_util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8754abaff5e9..1b197426f29f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -27,7 +27,8 @@ int open_path_or_exit(const char *path, int flags) int fd; fd = open(path, flags); - __TEST_REQUIRE(fd >= 0, "%s not available (errno: %d)", path, errno); + __TEST_REQUIRE(fd >= 0 || errno != ENOENT, "Cannot open %s: %s", path, strerror(errno)); + TEST_ASSERT(fd >= 0, "Failed to open '%s'", path); return fd; } From 2f77465b05b1270c832b5e2ee27037672ad2a10a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 30 Jan 2024 19:01:35 +0100 Subject: [PATCH 057/270] x86/efistub: Avoid placing the kernel below LOAD_PHYSICAL_ADDR The EFI stub's kernel placement logic randomizes the physical placement of the kernel by taking all available memory into account, and picking a region at random, based on a random seed. When KASLR is disabled, this seed is set to 0x0, and this results in the lowest available region of memory to be selected for loading the kernel, even if this is below LOAD_PHYSICAL_ADDR. Some of this memory is typically reserved for the GFP_DMA region, to accommodate masters that can only access the first 16 MiB of system memory. Even if such devices are rare these days, we may still end up with a warning in the kernel log, as reported by Tom: swapper/0: page allocation failure: order:10, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 Fix this by tweaking the random allocation logic to accept a low bound on the placement, and set it to LOAD_PHYSICAL_ADDR. Fixes: a1b87d54f4e4 ("x86/efistub: Avoid legacy decompressor when doing EFI boot") Reported-by: Tom Englund Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218404 Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/efistub.h | 3 ++- drivers/firmware/efi/libstub/kaslr.c | 2 +- drivers/firmware/efi/libstub/randomalloc.c | 12 +++++++----- drivers/firmware/efi/libstub/x86-stub.c | 1 + drivers/firmware/efi/libstub/zboot.c | 2 +- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 212687c30d79..c04b82ea40f2 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -956,7 +956,8 @@ efi_status_t efi_get_random_bytes(unsigned long size, u8 *out); efi_status_t efi_random_alloc(unsigned long size, unsigned long align, unsigned long *addr, unsigned long random_seed, - int memory_type, unsigned long alloc_limit); + int memory_type, unsigned long alloc_min, + unsigned long alloc_max); efi_status_t efi_random_get_seed(void); diff --git a/drivers/firmware/efi/libstub/kaslr.c b/drivers/firmware/efi/libstub/kaslr.c index 62d63f7a2645..1a9808012abd 100644 --- a/drivers/firmware/efi/libstub/kaslr.c +++ b/drivers/firmware/efi/libstub/kaslr.c @@ -119,7 +119,7 @@ efi_status_t efi_kaslr_relocate_kernel(unsigned long *image_addr, */ status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed, - EFI_LOADER_CODE, EFI_ALLOC_LIMIT); + EFI_LOADER_CODE, 0, EFI_ALLOC_LIMIT); if (status != EFI_SUCCESS) efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 674a064b8f7a..4e96a855fdf4 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -17,7 +17,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, unsigned long size, unsigned long align_shift, - u64 alloc_limit) + u64 alloc_min, u64 alloc_max) { unsigned long align = 1UL << align_shift; u64 first_slot, last_slot, region_end; @@ -30,11 +30,11 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, return 0; region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1, - alloc_limit); + alloc_max); if (region_end < size) return 0; - first_slot = round_up(md->phys_addr, align); + first_slot = round_up(max(md->phys_addr, alloc_min), align); last_slot = round_down(region_end - size + 1, align); if (first_slot > last_slot) @@ -56,7 +56,8 @@ efi_status_t efi_random_alloc(unsigned long size, unsigned long *addr, unsigned long random_seed, int memory_type, - unsigned long alloc_limit) + unsigned long alloc_min, + unsigned long alloc_max) { unsigned long total_slots = 0, target_slot; unsigned long total_mirrored_slots = 0; @@ -78,7 +79,8 @@ efi_status_t efi_random_alloc(unsigned long size, efi_memory_desc_t *md = (void *)map->map + map_offset; unsigned long slots; - slots = get_entry_num_slots(md, size, ilog2(align), alloc_limit); + slots = get_entry_num_slots(md, size, ilog2(align), alloc_min, + alloc_max); MD_NUM_SLOTS(md) = slots; total_slots += slots; if (md->attribute & EFI_MEMORY_MORE_RELIABLE) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index cb0be88c8131..99429bc4b0c7 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -799,6 +799,7 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr, seed[0], EFI_LOADER_CODE, + LOAD_PHYSICAL_ADDR, EFI_X86_KERNEL_ALLOC_LIMIT); if (status != EFI_SUCCESS) return status; diff --git a/drivers/firmware/efi/libstub/zboot.c b/drivers/firmware/efi/libstub/zboot.c index bdb17eac0cb4..1ceace956758 100644 --- a/drivers/firmware/efi/libstub/zboot.c +++ b/drivers/firmware/efi/libstub/zboot.c @@ -119,7 +119,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab) } status = efi_random_alloc(alloc_size, min_kimg_align, &image_base, - seed, EFI_LOADER_CODE, EFI_ALLOC_LIMIT); + seed, EFI_LOADER_CODE, 0, EFI_ALLOC_LIMIT); if (status != EFI_SUCCESS) { efi_err("Failed to allocate memory\n"); goto free_cmdline; From aa0e784dea7c1a026aabff9db1cb5d2bd92b3e92 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 19 Jan 2024 18:06:35 +0800 Subject: [PATCH 058/270] efi/libstub: Add one kernel-doc comment Add the description of @memory_type to silence the warning: drivers/firmware/efi/libstub/alignedmem.c:27: warning: Function parameter or struct member 'memory_type' not described in 'efi_allocate_pages_aligned' Signed-off-by: Yang Li [ardb: tweak comment] Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/alignedmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/efi/libstub/alignedmem.c b/drivers/firmware/efi/libstub/alignedmem.c index 6b83c492c3b8..31928bd87e0f 100644 --- a/drivers/firmware/efi/libstub/alignedmem.c +++ b/drivers/firmware/efi/libstub/alignedmem.c @@ -14,6 +14,7 @@ * @max: the address that the last allocated memory page shall not * exceed * @align: minimum alignment of the base of the allocation + * @memory_type: the type of memory to allocate * * Allocate pages as EFI_LOADER_DATA. The allocated pages are aligned according * to @align, which should be >= EFI_ALLOC_ALIGN. The last allocated page will From 10c02aad111df02088d1a81792a709f6a7eca6cc Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Wed, 24 Jan 2024 09:10:28 +0000 Subject: [PATCH 059/270] KVM: arm64: Fix circular locking dependency The rule inside kvm enforces that the vcpu->mutex is taken *inside* kvm->lock. The rule is violated by the pkvm_create_hyp_vm() which acquires the kvm->lock while already holding the vcpu->mutex lock from kvm_vcpu_ioctl(). Avoid the circular locking dependency altogether by protecting the hyp vm handle with the config_lock, much like we already do for other forms of VM-scoped data. Signed-off-by: Sebastian Ene Cc: stable@vger.kernel.org Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240124091027.1477174-2-sebastianene@google.com --- arch/arm64/kvm/pkvm.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8350fb8fee0b..b7be96a53597 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -101,6 +101,17 @@ void __init kvm_hyp_reserve(void) hyp_mem_base); } +static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) +{ + if (host_kvm->arch.pkvm.handle) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + host_kvm->arch.pkvm.handle)); + } + + host_kvm->arch.pkvm.handle = 0; + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); +} + /* * Allocates and donates memory for hypervisor VM structs at EL2. * @@ -181,7 +192,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) return 0; destroy_vm: - pkvm_destroy_hyp_vm(host_kvm); + __pkvm_destroy_hyp_vm(host_kvm); return ret; free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); @@ -194,23 +205,19 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) { int ret = 0; - mutex_lock(&host_kvm->lock); + mutex_lock(&host_kvm->arch.config_lock); if (!host_kvm->arch.pkvm.handle) ret = __pkvm_create_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->lock); + mutex_unlock(&host_kvm->arch.config_lock); return ret; } void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { - if (host_kvm->arch.pkvm.handle) { - WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, - host_kvm->arch.pkvm.handle)); - } - - host_kvm->arch.pkvm.handle = 0; - free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); + mutex_lock(&host_kvm->arch.config_lock); + __pkvm_destroy_hyp_vm(host_kvm); + mutex_unlock(&host_kvm->arch.config_lock); } int pkvm_init_host_vm(struct kvm *host_kvm) From 4451e8e8415e4ef48cdc763d66855f8c25fda94c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 23 Jan 2024 12:08:18 -0600 Subject: [PATCH 060/270] pinctrl: amd: Add IRQF_ONESHOT to the interrupt request On some systems the interrupt is shared between GPIO controller and ACPI SCI. When the interrupt is shared with the ACPI SCI the flags need to be identical. This should fix the GPIO controller failing to work after commit 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI"). ``` [ 0.417335] genirq: Flags mismatch irq 9. 00000088 (pinctrl_amd) vs. 00002080 (acpi) [ 0.420073] amd_gpio: probe of AMDI0030:00 failed with error -16 ``` Cc: Rafael J. Wysocki Reported-by: Christian Heusel Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218407 Fixes: 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI") Link: https://lore.kernel.org/linux-acpi/CAJZ5v0iRqUXeuKmC_+dAJtDBLWQ3x15n4gRH48y7MEaLoXF+UA@mail.gmail.com/T/#mc5506014141b61e472b24e095889535a04458083 Signed-off-by: Mario Limonciello Acked-by: Rafael J. Wysocki Tested-by: Christian Heusel Link: https://lore.kernel.org/r/20240123180818.3994-1-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 03ecb3d1aaf6..49f89b70dcec 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -1159,7 +1159,7 @@ static int amd_gpio_probe(struct platform_device *pdev) } ret = devm_request_irq(&pdev->dev, gpio_dev->irq, amd_gpio_irq_handler, - IRQF_SHARED, KBUILD_MODNAME, gpio_dev); + IRQF_SHARED | IRQF_ONESHOT, KBUILD_MODNAME, gpio_dev); if (ret) goto out2; From 6231c9e1a9f35b535c66709aa8a6eda40dbc4132 Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Wed, 3 Jan 2024 13:23:43 +0530 Subject: [PATCH 061/270] KVM: x86: make KVM_REQ_NMI request iff NMI pending for vcpu kvm_vcpu_ioctl_x86_set_vcpu_events() routine makes 'KVM_REQ_NMI' request for a vcpu even when its 'events->nmi.pending' is zero. Ex: qemu_thread_start kvm_vcpu_thread_fn qemu_wait_io_event qemu_wait_io_event_common process_queued_cpu_work do_kvm_cpu_synchronize_post_init/_reset kvm_arch_put_registers kvm_put_vcpu_events (cpu, level=[2|3]) This leads vCPU threads in QEMU to constantly acquire & release the global mutex lock, delaying the guest boot due to lock contention. Add check to make KVM_REQ_NMI request only if vcpu has NMI pending. Fixes: bdedff263132 ("KVM: x86: Route pending NMIs from userspace through process_nmi()") Cc: stable@vger.kernel.org Signed-off-by: Prasad Pandit Link: https://lore.kernel.org/r/20240103075343.549293-1-ppandit@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 363b1c080205..25bc52cdf8f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5454,7 +5454,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { vcpu->arch.nmi_pending = 0; atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending); - kvm_make_request(KVM_REQ_NMI, vcpu); + if (events->nmi.pending) + kvm_make_request(KVM_REQ_NMI, vcpu); } static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); From 5b778e1c2e9760ffe99482de26e70cd74683ba5c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:35 -0800 Subject: [PATCH 062/270] wifi: fill in MODULE_DESCRIPTION()s for wlcore W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the TI WLAN wlcore drivers. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-2-leitao@debian.org --- drivers/net/wireless/ti/wlcore/main.c | 1 + drivers/net/wireless/ti/wlcore/sdio.c | 1 + drivers/net/wireless/ti/wlcore/spi.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index fb9ed97774c7..5736acb4d206 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -6793,6 +6793,7 @@ MODULE_PARM_DESC(bug_on_recovery, "BUG() on fw recovery"); module_param(no_recovery, int, 0600); MODULE_PARM_DESC(no_recovery, "Prevent HW recovery. FW will remain stuck."); +MODULE_DESCRIPTION("TI WLAN core driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Luciano Coelho "); MODULE_AUTHOR("Juuso Oikarinen "); diff --git a/drivers/net/wireless/ti/wlcore/sdio.c b/drivers/net/wireless/ti/wlcore/sdio.c index f0686635db46..eb5482ed76ae 100644 --- a/drivers/net/wireless/ti/wlcore/sdio.c +++ b/drivers/net/wireless/ti/wlcore/sdio.c @@ -447,6 +447,7 @@ module_sdio_driver(wl1271_sdio_driver); module_param(dump, bool, 0600); MODULE_PARM_DESC(dump, "Enable sdio read/write dumps."); +MODULE_DESCRIPTION("TI WLAN SDIO helpers"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Luciano Coelho "); MODULE_AUTHOR("Juuso Oikarinen "); diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c index 7d9a139db59e..0aa2b2f3c5c9 100644 --- a/drivers/net/wireless/ti/wlcore/spi.c +++ b/drivers/net/wireless/ti/wlcore/spi.c @@ -562,6 +562,7 @@ static struct spi_driver wl1271_spi_driver = { }; module_spi_driver(wl1271_spi_driver); +MODULE_DESCRIPTION("TI WLAN SPI helpers"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Luciano Coelho "); MODULE_AUTHOR("Juuso Oikarinen "); From 2f2b503ea770cf817044851a583e8880d1de4ae2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:36 -0800 Subject: [PATCH 063/270] wifi: fill in MODULE_DESCRIPTION()s for wl1251 and wl12xx W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the TI wireless drivers wl12xx and wl1251. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-3-leitao@debian.org --- drivers/net/wireless/ti/wl1251/sdio.c | 1 + drivers/net/wireless/ti/wl1251/spi.c | 1 + drivers/net/wireless/ti/wl12xx/main.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/net/wireless/ti/wl1251/sdio.c b/drivers/net/wireless/ti/wl1251/sdio.c index 301bd0043a43..4e5b351f80f0 100644 --- a/drivers/net/wireless/ti/wl1251/sdio.c +++ b/drivers/net/wireless/ti/wl1251/sdio.c @@ -343,5 +343,6 @@ static void __exit wl1251_sdio_exit(void) module_init(wl1251_sdio_init); module_exit(wl1251_sdio_exit); +MODULE_DESCRIPTION("TI WL1251 SDIO helpers"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kalle Valo "); diff --git a/drivers/net/wireless/ti/wl1251/spi.c b/drivers/net/wireless/ti/wl1251/spi.c index 29292f06bd3d..1936bb3af54a 100644 --- a/drivers/net/wireless/ti/wl1251/spi.c +++ b/drivers/net/wireless/ti/wl1251/spi.c @@ -342,6 +342,7 @@ static struct spi_driver wl1251_spi_driver = { module_spi_driver(wl1251_spi_driver); +MODULE_DESCRIPTION("TI WL1251 SPI helpers"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kalle Valo "); MODULE_ALIAS("spi:wl1251"); diff --git a/drivers/net/wireless/ti/wl12xx/main.c b/drivers/net/wireless/ti/wl12xx/main.c index de045fe4ca1e..b26d42b4e3cc 100644 --- a/drivers/net/wireless/ti/wl12xx/main.c +++ b/drivers/net/wireless/ti/wl12xx/main.c @@ -1955,6 +1955,7 @@ module_param_named(tcxo, tcxo_param, charp, 0); MODULE_PARM_DESC(tcxo, "TCXO clock: 19.2, 26, 38.4, 52, 16.368, 32.736, 16.8, 33.6"); +MODULE_DESCRIPTION("TI WL12xx wireless driver"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Luciano Coelho "); MODULE_FIRMWARE(WL127X_FW_NAME_SINGLE); From 257ca10c7317d4a424e48bb95d14ca53a1f1dd6f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:37 -0800 Subject: [PATCH 064/270] wifi: fill in MODULE_DESCRIPTION()s for Broadcom WLAN W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Broadcom FullMac WLAN drivers. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-4-leitao@debian.org --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c index d55f3271d619..4f0c1e1a8e60 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bca/module.c @@ -20,6 +20,7 @@ static void __exit brcmf_bca_exit(void) brcmf_fwvid_unregister_vendor(BRCMF_FWVENDOR_BCA, THIS_MODULE); } +MODULE_DESCRIPTION("Broadcom FullMAC WLAN driver plugin for Broadcom AP chipsets"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_IMPORT_NS(BRCMFMAC); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c index f82fbbe3ecef..90d06cda03a2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cyw/module.c @@ -20,6 +20,7 @@ static void __exit brcmf_cyw_exit(void) brcmf_fwvid_unregister_vendor(BRCMF_FWVENDOR_CYW, THIS_MODULE); } +MODULE_DESCRIPTION("Broadcom FullMAC WLAN driver plugin for Cypress/Infineon chipsets"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_IMPORT_NS(BRCMFMAC); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c index 02918d434556..b66135e3cff4 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/wcc/module.c @@ -20,6 +20,7 @@ static void __exit brcmf_wcc_exit(void) brcmf_fwvid_unregister_vendor(BRCMF_FWVENDOR_WCC, THIS_MODULE); } +MODULE_DESCRIPTION("Broadcom FullMAC WLAN driver plugin for Broadcom mobility chipsets"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_IMPORT_NS(BRCMFMAC); From f8782ea450ad83df5f3dfdb1fca093f46238febe Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:38 -0800 Subject: [PATCH 065/270] wifi: fill in MODULE_DESCRIPTION()s for ar5523 W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Atheros AR5523 wireless driver. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-5-leitao@debian.org --- drivers/net/wireless/ath/ar5523/ar5523.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c index 43e0db78d42b..a742cec44e3d 100644 --- a/drivers/net/wireless/ath/ar5523/ar5523.c +++ b/drivers/net/wireless/ath/ar5523/ar5523.c @@ -1803,5 +1803,6 @@ static struct usb_driver ar5523_driver = { module_usb_driver(ar5523_driver); +MODULE_DESCRIPTION("Atheros AR5523 wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_FIRMWARE(AR5523_FIRMWARE_FILE); From e063d2a05d713d396044124867704b08e5c30860 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:39 -0800 Subject: [PATCH 066/270] wifi: fill in MODULE_DESCRIPTION()s for wcn36xx W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Qualcomm Atheros WCN3660/3680 wireless driver. Signed-off-by: Breno Leitao Reviewed-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-6-leitao@debian.org --- drivers/net/wireless/ath/wcn36xx/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index 41119fb177e3..4e6b4df8562f 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -1685,6 +1685,7 @@ static struct platform_driver wcn36xx_driver = { module_platform_driver(wcn36xx_driver); +MODULE_DESCRIPTION("Qualcomm Atheros WCN3660/3680 wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Eugene Krasnikov k.eugene.e@gmail.com"); MODULE_FIRMWARE(WLAN_NV_FILE); From 714ea2f109d9d561789078fd8a1beeffa9af36d6 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:40 -0800 Subject: [PATCH 067/270] wifi: fill in MODULE_DESCRIPTION()s for p54spi W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Prism54 SPI wireless driver. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-7-leitao@debian.org --- drivers/net/wireless/intersil/p54/p54spi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intersil/p54/p54spi.c b/drivers/net/wireless/intersil/p54/p54spi.c index ce0179b8ab36..0073b5e0f9c9 100644 --- a/drivers/net/wireless/intersil/p54/p54spi.c +++ b/drivers/net/wireless/intersil/p54/p54spi.c @@ -700,6 +700,7 @@ static struct spi_driver p54spi_driver = { module_spi_driver(p54spi_driver); +MODULE_DESCRIPTION("Prism54 SPI wireless driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Christian Lamparter "); MODULE_ALIAS("spi:cx3110x"); From 35337ac472605da9db051827fa2d39e6c02c6d81 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:41 -0800 Subject: [PATCH 068/270] wifi: fill in MODULE_DESCRIPTION()s for wl18xx W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the TI WiLink 8 wireless driver. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-8-leitao@debian.org --- drivers/net/wireless/ti/wl18xx/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ti/wl18xx/main.c b/drivers/net/wireless/ti/wl18xx/main.c index 20d9181b3410..2ccac1cdec01 100644 --- a/drivers/net/wireless/ti/wl18xx/main.c +++ b/drivers/net/wireless/ti/wl18xx/main.c @@ -2086,6 +2086,7 @@ module_param_named(num_rx_desc, num_rx_desc_param, int, 0400); MODULE_PARM_DESC(num_rx_desc_param, "Number of Rx descriptors: u8 (default is 32)"); +MODULE_DESCRIPTION("TI WiLink 8 wireless driver"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Luciano Coelho "); MODULE_FIRMWARE(WL18XX_FW_NAME); From c9013880284d78bac6498d9c0b0b7043cf0f5639 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:42 -0800 Subject: [PATCH 069/270] wifi: fill in MODULE_DESCRIPTION()s for wilc1000 W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Atmel WILC1000 SPI driver. Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-9-leitao@debian.org --- drivers/net/wireless/microchip/wilc1000/netdev.c | 1 + drivers/net/wireless/microchip/wilc1000/sdio.c | 1 + drivers/net/wireless/microchip/wilc1000/spi.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.c b/drivers/net/wireless/microchip/wilc1000/netdev.c index 91d71e0f7ef2..81e8f25863f5 100644 --- a/drivers/net/wireless/microchip/wilc1000/netdev.c +++ b/drivers/net/wireless/microchip/wilc1000/netdev.c @@ -1018,5 +1018,6 @@ unregister_netdev: return ERR_PTR(ret); } +MODULE_DESCRIPTION("Atmel WILC1000 core wireless driver"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(WILC1000_FW(WILC1000_API_VER)); diff --git a/drivers/net/wireless/microchip/wilc1000/sdio.c b/drivers/net/wireless/microchip/wilc1000/sdio.c index 0d13e3e46e98..d6d394693090 100644 --- a/drivers/net/wireless/microchip/wilc1000/sdio.c +++ b/drivers/net/wireless/microchip/wilc1000/sdio.c @@ -984,4 +984,5 @@ static struct sdio_driver wilc_sdio_driver = { module_driver(wilc_sdio_driver, sdio_register_driver, sdio_unregister_driver); +MODULE_DESCRIPTION("Atmel WILC1000 SDIO wireless driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/wireless/microchip/wilc1000/spi.c b/drivers/net/wireless/microchip/wilc1000/spi.c index 77b4cdff73c3..1d8b241ce43c 100644 --- a/drivers/net/wireless/microchip/wilc1000/spi.c +++ b/drivers/net/wireless/microchip/wilc1000/spi.c @@ -273,6 +273,7 @@ static struct spi_driver wilc_spi_driver = { .remove = wilc_bus_remove, }; module_spi_driver(wilc_spi_driver); +MODULE_DESCRIPTION("Atmel WILC1000 SPI wireless driver"); MODULE_LICENSE("GPL"); static int wilc_spi_tx(struct wilc *wilc, u8 *b, u32 len) From f3f8f050316893fe2da523458ff7f5f6d61fb1a6 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 30 Jan 2024 02:42:43 -0800 Subject: [PATCH 070/270] wifi: fill in MODULE_DESCRIPTION()s for mt76 drivers W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the MediaTek mt76 drivers. Here is a sorted list of descriptions. It might make the reviewing process easier. MODULE_DESCRIPTION("MediaTek MT7603E and MT76x8 wireless driver"); MODULE_DESCRIPTION("MediaTek MT7615E and MT7663E wireless driver"); MODULE_DESCRIPTION("MediaTek MT7615E MMIO helpers"); MODULE_DESCRIPTION("MediaTek MT7663 SDIO/USB helpers"); MODULE_DESCRIPTION("MediaTek MT7663S (SDIO) wireless driver"); MODULE_DESCRIPTION("MediaTek MT7663U (USB) wireless driver"); MODULE_DESCRIPTION("MediaTek MT76x02 helpers"); MODULE_DESCRIPTION("MediaTek MT76x02 MCU helpers"); MODULE_DESCRIPTION("MediaTek MT76x0E (PCIe) wireless driver"); MODULE_DESCRIPTION("MediaTek MT76x0U (USB) wireless driver"); MODULE_DESCRIPTION("MediaTek MT76x2 EEPROM helpers"); MODULE_DESCRIPTION("MediaTek MT76x2E (PCIe) wireless driver"); MODULE_DESCRIPTION("MediaTek MT76x2U (USB) wireless driver"); MODULE_DESCRIPTION("MediaTek MT76x connac layer helpers"); MODULE_DESCRIPTION("MediaTek MT76x EEPROM helpers"); MODULE_DESCRIPTION("MediaTek MT76x helpers"); MODULE_DESCRIPTION("MediaTek MT76x SDIO helpers"); MODULE_DESCRIPTION("MediaTek MT76x USB helpers"); MODULE_DESCRIPTION("MediaTek MT7915E MMIO helpers"); MODULE_DESCRIPTION("MediaTek MT7921 core driver"); MODULE_DESCRIPTION("MediaTek MT7921E (PCIe) wireless driver"); MODULE_DESCRIPTION("MediaTek MT7921S (SDIO) wireless driver"); MODULE_DESCRIPTION("MediaTek MT7921U (USB) wireless driver"); MODULE_DESCRIPTION("MediaTek MT7925 core driver"); MODULE_DESCRIPTION("MediaTek MT7925E (PCIe) wireless driver"); MODULE_DESCRIPTION("MediaTek MT7925U (USB) wireless driver"); MODULE_DESCRIPTION("MediaTek MT792x core driver"); MODULE_DESCRIPTION("MediaTek MT792x USB helpers"); MODULE_DESCRIPTION("MediaTek MT7996 MMIO helpers"); Signed-off-by: Breno Leitao Signed-off-by: Kalle Valo Link: https://msgid.link/20240130104243.3025393-10-leitao@debian.org --- drivers/net/wireless/mediatek/mt76/mt7603/main.c | 1 + drivers/net/wireless/mediatek/mt76/mt7615/main.c | 1 + drivers/net/wireless/mediatek/mt76/mt7615/mmio.c | 1 + drivers/net/wireless/mediatek/mt76/mt7615/sdio.c | 1 + drivers/net/wireless/mediatek/mt76/mt7615/usb.c | 1 + drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c | 1 + drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x0/pci.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x0/usb.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x02_util.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 1 + drivers/net/wireless/mediatek/mt76/mt76x2/usb.c | 1 + drivers/net/wireless/mediatek/mt76/mt7915/mmio.c | 1 + drivers/net/wireless/mediatek/mt76/mt7921/main.c | 1 + drivers/net/wireless/mediatek/mt76/mt7921/pci.c | 1 + drivers/net/wireless/mediatek/mt76/mt7921/sdio.c | 1 + drivers/net/wireless/mediatek/mt76/mt7921/usb.c | 1 + drivers/net/wireless/mediatek/mt76/mt7925/main.c | 1 + drivers/net/wireless/mediatek/mt76/mt7925/pci.c | 1 + drivers/net/wireless/mediatek/mt76/mt7925/usb.c | 1 + drivers/net/wireless/mediatek/mt76/mt792x_core.c | 1 + drivers/net/wireless/mediatek/mt76/mt792x_usb.c | 1 + drivers/net/wireless/mediatek/mt76/mt7996/mmio.c | 1 + drivers/net/wireless/mediatek/mt76/sdio.c | 1 + drivers/net/wireless/mediatek/mt76/usb.c | 1 + drivers/net/wireless/mediatek/mt76/util.c | 1 + 29 files changed, 29 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c index 89d738deea62..e2146d30e553 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c @@ -728,6 +728,7 @@ const struct ieee80211_ops mt7603_ops = { .set_sar_specs = mt7603_set_sar_specs, }; +MODULE_DESCRIPTION("MediaTek MT7603E and MT76x8 wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); static int __init mt7603_init(void) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index dab16b5fc386..0971c164b57e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -1375,4 +1375,5 @@ const struct ieee80211_ops mt7615_ops = { }; EXPORT_SYMBOL_GPL(mt7615_ops); +MODULE_DESCRIPTION("MediaTek MT7615E and MT7663E wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7615/mmio.c index ac036a072439..87a956ea3ad7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mmio.c @@ -270,4 +270,5 @@ static void __exit mt7615_exit(void) module_init(mt7615_init); module_exit(mt7615_exit); +MODULE_DESCRIPTION("MediaTek MT7615E MMIO helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/sdio.c b/drivers/net/wireless/mediatek/mt76/mt7615/sdio.c index 67cedd2555f9..9692890ba51b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/sdio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/sdio.c @@ -253,4 +253,5 @@ module_sdio_driver(mt7663s_driver); MODULE_AUTHOR("Sean Wang "); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT7663S (SDIO) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/usb.c b/drivers/net/wireless/mediatek/mt76/mt7615/usb.c index 04963b9f7498..df737e1ff27b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/usb.c @@ -281,4 +281,5 @@ module_usb_driver(mt7663u_driver); MODULE_AUTHOR("Sean Wang "); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT7663U (USB) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c b/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c index 0052d103e276..820b39590027 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c @@ -349,4 +349,5 @@ EXPORT_SYMBOL_GPL(mt7663_usb_sdio_register_device); MODULE_AUTHOR("Lorenzo Bianconi "); MODULE_AUTHOR("Sean Wang "); +MODULE_DESCRIPTION("MediaTek MT7663 SDIO/USB helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c index 96494ba2fdf7..3a20ba0d2492 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c @@ -3160,4 +3160,5 @@ exit: EXPORT_SYMBOL_GPL(mt76_connac2_mcu_fill_message); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT76x connac layer helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c b/drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c index c3a392a1a659..bcd24c9072ec 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/eeprom.c @@ -342,4 +342,5 @@ int mt76x0_eeprom_init(struct mt76x02_dev *dev) return 0; } +MODULE_DESCRIPTION("MediaTek MT76x EEPROM helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c index 9277ff38b7a2..293e66fa83d5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c @@ -302,6 +302,7 @@ static const struct pci_device_id mt76x0e_device_table[] = { MODULE_DEVICE_TABLE(pci, mt76x0e_device_table); MODULE_FIRMWARE(MT7610E_FIRMWARE); MODULE_FIRMWARE(MT7650E_FIRMWARE); +MODULE_DESCRIPTION("MediaTek MT76x0E (PCIe) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); static struct pci_driver mt76x0e_driver = { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c index 0422c332354a..dd042949cf82 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c @@ -336,6 +336,7 @@ err: MODULE_DEVICE_TABLE(usb, mt76x0_device_table); MODULE_FIRMWARE(MT7610E_FIRMWARE); MODULE_FIRMWARE(MT7610U_FIRMWARE); +MODULE_DESCRIPTION("MediaTek MT76x0U (USB) wireless driver"); MODULE_LICENSE("GPL"); static struct usb_driver mt76x0_driver = { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c index 02da543dfc5c..b2cc44914294 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_mcu.c @@ -293,4 +293,5 @@ void mt76x02u_init_mcu(struct mt76_dev *dev) EXPORT_SYMBOL_GPL(mt76x02u_init_mcu); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT76x02 MCU helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c index 8a0e8124b894..8020446be37b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c @@ -696,4 +696,5 @@ void mt76x02_config_mac_addr_list(struct mt76x02_dev *dev) } EXPORT_SYMBOL_GPL(mt76x02_config_mac_addr_list); +MODULE_DESCRIPTION("MediaTek MT76x02 helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c b/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c index 8c01855885ce..1fe5f5a02f93 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c @@ -506,4 +506,5 @@ int mt76x2_eeprom_init(struct mt76x02_dev *dev) } EXPORT_SYMBOL_GPL(mt76x2_eeprom_init); +MODULE_DESCRIPTION("MediaTek MT76x2 EEPROM helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index df85ebc6e1df..30959746e924 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -165,6 +165,7 @@ mt76x2e_resume(struct pci_dev *pdev) MODULE_DEVICE_TABLE(pci, mt76x2e_device_table); MODULE_FIRMWARE(MT7662_FIRMWARE); MODULE_FIRMWARE(MT7662_ROM_PATCH); +MODULE_DESCRIPTION("MediaTek MT76x2E (PCIe) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); static struct pci_driver mt76pci_driver = { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c index 55068f3252ef..ca78e14251c2 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb.c @@ -147,4 +147,5 @@ static struct usb_driver mt76x2u_driver = { module_usb_driver(mt76x2u_driver); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT76x2U (USB) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c index aff4f21e843d..3039f53e2245 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c @@ -958,4 +958,5 @@ static void __exit mt7915_exit(void) module_init(mt7915_init); module_exit(mt7915_exit); +MODULE_DESCRIPTION("MediaTek MT7915E MMIO helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 0645417e0582..0d5adc5ddae3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -1418,5 +1418,6 @@ const struct ieee80211_ops mt7921_ops = { }; EXPORT_SYMBOL_GPL(mt7921_ops); +MODULE_DESCRIPTION("MediaTek MT7921 core driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Sean Wang "); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c index 57903c6e4f11..dde26f327478 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c @@ -544,4 +544,5 @@ MODULE_FIRMWARE(MT7922_FIRMWARE_WM); MODULE_FIRMWARE(MT7922_ROM_PATCH); MODULE_AUTHOR("Sean Wang "); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT7921E (PCIe) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c b/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c index 7591e54d2897..a9ce1e746b95 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/sdio.c @@ -323,5 +323,6 @@ static struct sdio_driver mt7921s_driver = { .drv.pm = pm_sleep_ptr(&mt7921s_pm_ops), }; module_sdio_driver(mt7921s_driver); +MODULE_DESCRIPTION("MediaTek MT7921S (SDIO) wireless driver"); MODULE_AUTHOR("Sean Wang "); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/usb.c b/drivers/net/wireless/mediatek/mt76/mt7921/usb.c index e5258c74fc07..8b7c03c47598 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/usb.c @@ -336,5 +336,6 @@ static struct usb_driver mt7921u_driver = { }; module_usb_driver(mt7921u_driver); +MODULE_DESCRIPTION("MediaTek MT7921U (USB) wireless driver"); MODULE_AUTHOR("Lorenzo Bianconi "); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c index 8f1075da4903..125a1be3cb64 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c @@ -1450,4 +1450,5 @@ const struct ieee80211_ops mt7925_ops = { EXPORT_SYMBOL_GPL(mt7925_ops); MODULE_AUTHOR("Deren Wu "); +MODULE_DESCRIPTION("MediaTek MT7925 core driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/pci.c b/drivers/net/wireless/mediatek/mt76/mt7925/pci.c index 734f31ee40d3..1fd99a856541 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/pci.c @@ -583,4 +583,5 @@ MODULE_FIRMWARE(MT7925_FIRMWARE_WM); MODULE_FIRMWARE(MT7925_ROM_PATCH); MODULE_AUTHOR("Deren Wu "); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT7925E (PCIe) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/usb.c b/drivers/net/wireless/mediatek/mt76/mt7925/usb.c index 9b885c5b3ed5..1e0f094fc905 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/usb.c @@ -329,4 +329,5 @@ static struct usb_driver mt7925u_driver = { module_usb_driver(mt7925u_driver); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT7925U (USB) wireless driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c index 502be22dbe36..c42101aa9e45 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c @@ -862,5 +862,6 @@ int mt792x_load_firmware(struct mt792x_dev *dev) } EXPORT_SYMBOL_GPL(mt792x_load_firmware); +MODULE_DESCRIPTION("MediaTek MT792x core driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Lorenzo Bianconi "); diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_usb.c b/drivers/net/wireless/mediatek/mt76/mt792x_usb.c index 2dd283caed36..589a3efb9f8c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x_usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt792x_usb.c @@ -314,5 +314,6 @@ void mt792xu_disconnect(struct usb_interface *usb_intf) } EXPORT_SYMBOL_GPL(mt792xu_disconnect); +MODULE_DESCRIPTION("MediaTek MT792x USB helpers"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Lorenzo Bianconi "); diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c index c50d89a445e9..9f2abfa273c9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c @@ -650,4 +650,5 @@ static void __exit mt7996_exit(void) module_init(mt7996_init); module_exit(mt7996_exit); +MODULE_DESCRIPTION("MediaTek MT7996 MMIO helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/sdio.c b/drivers/net/wireless/mediatek/mt76/sdio.c index c52d550f0c32..3e88798df017 100644 --- a/drivers/net/wireless/mediatek/mt76/sdio.c +++ b/drivers/net/wireless/mediatek/mt76/sdio.c @@ -672,4 +672,5 @@ EXPORT_SYMBOL_GPL(mt76s_init); MODULE_AUTHOR("Sean Wang "); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT76x SDIO helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c index 1584665fe3cb..5a0bcb5071bd 100644 --- a/drivers/net/wireless/mediatek/mt76/usb.c +++ b/drivers/net/wireless/mediatek/mt76/usb.c @@ -1128,4 +1128,5 @@ int mt76u_init(struct mt76_dev *dev, struct usb_interface *intf) EXPORT_SYMBOL_GPL(mt76u_init); MODULE_AUTHOR("Lorenzo Bianconi "); +MODULE_DESCRIPTION("MediaTek MT76x USB helpers"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/wireless/mediatek/mt76/util.c b/drivers/net/wireless/mediatek/mt76/util.c index fc76c66ff1a5..d6c01a2dd198 100644 --- a/drivers/net/wireless/mediatek/mt76/util.c +++ b/drivers/net/wireless/mediatek/mt76/util.c @@ -138,4 +138,5 @@ int __mt76_worker_fn(void *ptr) } EXPORT_SYMBOL_GPL(__mt76_worker_fn); +MODULE_DESCRIPTION("MediaTek MT76x helpers"); MODULE_LICENSE("Dual BSD/GPL"); From d9807d60c145836043ffa602328ea1d66dc458b1 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jan 2024 22:03:33 +0800 Subject: [PATCH 071/270] riscv: mm: execute local TLB flush after populating vmemmap The spare_init() calls memmap_populate() many times to create VA to PA mapping for the VMEMMAP area, where all "struct page" are located once CONFIG_SPARSEMEM_VMEMMAP is defined. These "struct page" are later initialized in the zone_sizes_init() function. However, during this process, no sfence.vma instruction is executed for this VMEMMAP area. This omission may cause the hart to fail to perform page table walk because some data related to the address translation is invisible to the hart. To solve this issue, the local_flush_tlb_kernel_range() is called right after the sparse_init() to execute a sfence.vma instruction for this VMEMMAP area, ensuring that all data related to the address translation is visible to the hart. Fixes: d95f1a542c3d ("RISC-V: Implement sparsemem") Signed-off-by: Vincent Chen Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240117140333.2479667-1-vincent.chen@sifive.com Fixes: 7a92fc8b4d20 ("mm: Introduce flush_cache_vmap_early()") Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/tlbflush.h | 1 + arch/riscv/mm/init.c | 4 ++++ arch/riscv/mm/tlbflush.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 928f096dca21..4112cc8d1d69 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -75,6 +75,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, #define flush_tlb_mm(mm) flush_tlb_all() #define flush_tlb_mm_range(mm, start, end, page_size) flush_tlb_all() +#define local_flush_tlb_kernel_range(start, end) flush_tlb_all() #endif /* !CONFIG_SMP || !CONFIG_MMU */ #endif /* _ASM_RISCV_TLBFLUSH_H */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 32cad6a65ccd..fa34cf55037b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1385,6 +1385,10 @@ void __init misc_mem_init(void) early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); sparse_init(); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* The entire VMEMMAP region has been populated. Flush TLB for this region */ + local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); +#endif zone_sizes_init(); arch_reserve_crashkernel(); memblock_dump_all(); diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 8d12b26f5ac3..dffb0e5bd942 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -66,9 +66,10 @@ static inline void local_flush_tlb_range_asid(unsigned long start, local_flush_tlb_range_threshold_asid(start, size, stride, asid); } +/* Flush a range of kernel pages without broadcasting */ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) { - local_flush_tlb_range_asid(start, end, PAGE_SIZE, FLUSH_TLB_NO_ASID); + local_flush_tlb_range_asid(start, end - start, PAGE_SIZE, FLUSH_TLB_NO_ASID); } static void __ipi_flush_tlb_all(void *info) From aa125f229076a8230a171d519dbdb2a862145d33 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 28 Jan 2024 10:23:09 +0200 Subject: [PATCH 072/270] wifi: iwlwifi: remove extra kernel-doc This no longer exists, remove the kernel-doc. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Signed-off-by: Kalle Valo Link: https://msgid.link/20240128102209.d2192d79bc09.Id9551728d618248dd471382a5283503a8976237a@changeid --- drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index e27774e7ed74..80fda056e46a 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2005-2014, 2018-2023 Intel Corporation + * Copyright (C) 2005-2014, 2018-2024 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH */ @@ -19,7 +19,6 @@ * @fwrt_ptr: pointer to the buffer coming from fwrt * @trans_ptr: pointer to struct %iwl_trans_dump_data which contains the * transport's data. - * @trans_len: length of the valid data in trans_ptr * @fwrt_len: length of the valid data in fwrt_ptr */ struct iwl_fw_dump_ptrs { From e440c5f2e3e6893aeb39bbba6dd181207840a795 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:17 +0100 Subject: [PATCH 073/270] KVM: selftests: Generalize check_clocksource() from kvm_clock_test Several existing x86 selftests need to check that the underlying system clocksource is TSC or based on TSC but every test implements its own check. As a first step towards unification, extract check_clocksource() from kvm_clock_test and split it into two functions: arch-neutral 'sys_get_cur_clocksource()' and x86-specific 'sys_clocksource_is_tsc()'. Fix a couple of pre-existing issues in kvm_clock_test: memory leakage in check_clocksource() and using TEST_ASSERT() instead of TEST_REQUIRE(). The change also makes the test fail when system clocksource can't be read from sysfs. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-2-vkuznets@redhat.com [sean: eliminate if-elif pattern just to set a bool true] Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/include/test_util.h | 2 + .../selftests/kvm/include/x86_64/processor.h | 2 + tools/testing/selftests/kvm/lib/test_util.c | 25 ++++++++++++ .../selftests/kvm/lib/x86_64/processor.c | 10 +++++ .../selftests/kvm/x86_64/kvm_clock_test.c | 38 +------------------ 5 files changed, 40 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 71a41fa924b7..50a5e31ba8da 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -195,4 +195,6 @@ __printf(3, 4) int guest_snprintf(char *buf, int n, const char *fmt, ...); char *strdup_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2), nonnull(1))); +char *sys_get_cur_clocksource(void); + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a84863503fcb..01eec72e0d3e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1271,4 +1271,6 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +bool sys_clocksource_is_tsc(void); + #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 5d7f28b02d73..5a8f8becb129 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -392,3 +392,28 @@ char *strdup_printf(const char *fmt, ...) return str; } + +#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource" + +char *sys_get_cur_clocksource(void) +{ + char *clk_name; + struct stat st; + FILE *fp; + + fp = fopen(CLOCKSOURCE_PATH, "r"); + TEST_ASSERT(fp, "failed to open clocksource file, errno: %d", errno); + + TEST_ASSERT(!fstat(fileno(fp), &st), "failed to stat clocksource file, errno: %d", + errno); + + clk_name = malloc(st.st_size); + TEST_ASSERT(clk_name, "failed to allocate buffer to read file"); + + TEST_ASSERT(fgets(clk_name, st.st_size, fp), "failed to read clocksource file: %d", + ferror(fp)); + + fclose(fp); + + return clk_name; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 4bc52948447d..e6964ff2a37d 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1299,3 +1299,13 @@ void kvm_selftest_arch_init(void) host_cpu_is_intel = this_cpu_is_intel(); host_cpu_is_amd = this_cpu_is_amd(); } + +bool sys_clocksource_is_tsc(void) +{ + char *clk_name = sys_get_cur_clocksource(); + bool ret = !strcmp(clk_name, "tsc\n"); + + free(clk_name); + + return ret; +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 3e0b7d51abda..6fcc1a433587 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -132,42 +132,6 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } -#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource" - -static void check_clocksource(void) -{ - char *clk_name; - struct stat st; - FILE *fp; - - fp = fopen(CLOCKSOURCE_PATH, "r"); - if (!fp) { - pr_info("failed to open clocksource file: %d; assuming TSC.\n", - errno); - return; - } - - if (fstat(fileno(fp), &st)) { - pr_info("failed to stat clocksource file: %d; assuming TSC.\n", - errno); - goto out; - } - - clk_name = malloc(st.st_size); - TEST_ASSERT(clk_name, "failed to allocate buffer to read file"); - - if (!fgets(clk_name, st.st_size, fp)) { - pr_info("failed to read clocksource file: %d; assuming TSC.\n", - ferror(fp)); - goto out; - } - - TEST_ASSERT(!strncmp(clk_name, "tsc\n", st.st_size), - "clocksource not supported: %s", clk_name); -out: - fclose(fp); -} - int main(void) { struct kvm_vcpu *vcpu; @@ -179,7 +143,7 @@ int main(void) flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); - check_clocksource(); + TEST_REQUIRE(sys_clocksource_is_tsc()); vm = vm_create_with_one_vcpu(&vcpu, guest_main); From 410cb01ead5bcec500c0654f361d620553f930aa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:18 +0100 Subject: [PATCH 074/270] KVM: selftests: Use generic sys_clocksource_is_tsc() in vmx_nested_tsc_scaling_test Despite its name, system_has_stable_tsc() just checks that system clocksource is 'tsc'; this can now be done with generic sys_clocksource_is_tsc(). No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index e710b6e7fb38..93b0a850a240 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -116,23 +116,6 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } -static bool system_has_stable_tsc(void) -{ - bool tsc_is_stable; - FILE *fp; - char buf[4]; - - fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); - if (fp == NULL) - return false; - - tsc_is_stable = fgets(buf, sizeof(buf), fp) && - !strncmp(buf, "tsc", sizeof(buf)); - - fclose(fp); - return tsc_is_stable; -} - int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; @@ -148,7 +131,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); - TEST_REQUIRE(system_has_stable_tsc()); + TEST_REQUIRE(sys_clocksource_is_tsc()); /* * We set L1's scale factor to be a random number from 2 to 10. From 09951bf2cbb3a7893f76d1364b0ae6e3007ff1de Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:19 +0100 Subject: [PATCH 075/270] KVM: selftests: Run clocksource dependent tests with hyperv_clocksource_tsc_page too KVM's 'gtod_is_based_on_tsc()' recognizes two clocksources: 'tsc' and 'hyperv_clocksource_tsc_page' and enables kvmclock in 'masterclock' mode when either is in use. Transform 'sys_clocksource_is_tsc()' into 'sys_clocksource_is_based_on_tsc()' to support the later. This affects two tests: kvm_clock_test and vmx_nested_tsc_scaling_test, both seem to work well when system clocksource is 'hyperv_clocksource_tsc_page'. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-4-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/include/x86_64/processor.h | 2 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 5 +++-- tools/testing/selftests/kvm/x86_64/kvm_clock_test.c | 2 +- .../selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 01eec72e0d3e..5bca8c947c82 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1271,6 +1271,6 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) -bool sys_clocksource_is_tsc(void); +bool sys_clocksource_is_based_on_tsc(void); #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index e6964ff2a37d..f639b3e062e3 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1300,10 +1300,11 @@ void kvm_selftest_arch_init(void) host_cpu_is_amd = this_cpu_is_amd(); } -bool sys_clocksource_is_tsc(void) +bool sys_clocksource_is_based_on_tsc(void) { char *clk_name = sys_get_cur_clocksource(); - bool ret = !strcmp(clk_name, "tsc\n"); + bool ret = !strcmp(clk_name, "tsc\n") || + !strcmp(clk_name, "hyperv_clocksource_tsc_page\n"); free(clk_name); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c index 6fcc1a433587..5bc12222d87a 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -143,7 +143,7 @@ int main(void) flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); - TEST_REQUIRE(sys_clocksource_is_tsc()); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); vm = vm_create_with_one_vcpu(&vcpu, guest_main); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index 93b0a850a240..1759fa5cb3f2 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -131,7 +131,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); - TEST_REQUIRE(sys_clocksource_is_tsc()); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); /* * We set L1's scale factor to be a random number from 2 to 10. From b6831a108be1206cd9f0e7905b48677b4147d5f9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:20 +0100 Subject: [PATCH 076/270] KVM: selftests: Make hyperv_clock require TSC based system clocksource KVM sets up Hyper-V TSC page clocksource for its guests when system clocksource is 'based on TSC' (see gtod_is_based_on_tsc()), running hyperv_clock with any other clocksource leads to imminent failure. Add the missing requirement to make the test skip gracefully. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-5-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index 65690d916db7..e058bc676cd6 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -212,6 +212,7 @@ int main(void) int stage; TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME)); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); vm = vm_create_with_one_vcpu(&vcpu, guest_main); From 9e62797fd7e8eef9c8a3f7b54b57fbc9caf6a20a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 9 Jan 2024 15:11:21 +0100 Subject: [PATCH 077/270] KVM: x86: Make gtod_is_based_on_tsc() return 'bool' gtod_is_based_on_tsc() is boolean in nature, i.e. it returns '1' for good clocksources and '0' otherwise. Moreover, its result is used raw by kvm_get_time_and_clockread()/kvm_get_walltime_and_clockread() which are 'bool'. No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240109141121.1619463-6-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 363b1c080205..aa27aec10860 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2507,7 +2507,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) } #ifdef CONFIG_X86_64 -static inline int gtod_is_based_on_tsc(int mode) +static inline bool gtod_is_based_on_tsc(int mode) { return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } From f0377ff97509f5a4921993d5d61da000361bd884 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 18 Jan 2024 12:48:54 +0100 Subject: [PATCH 078/270] nvme-host: fix the updating of the firmware version The original code didn't update the firmware version if the "next slot" of the AFI register isn't zero or if the "current slot" field is zero; in those cases it assumed that a reset was needed. However, the NVMe specification doesn't exclude the possibility that the "next slot" value is equal to the "current slot" value, meaning that the same firmware slot will be activated after performing a controller level reset; in this case a reset is clearly not necessary and we can safely update the firmware version. Modify the code so the kernel will report that a Controller Level Reset is needed only in the following cases: 1) If the "current slot" field is zero. This is invalid and means that something is wrong, a reset is needed. or 2) if the "next slot" field isn't zero AND it's not equal to the "current slot" value. This means that at the next reset a different firmware slot will be activated. Fixes: 983a338b96c8 ("nvme: update firmware version after commit") Signed-off-by: Maurizio Lombardi Reviewed-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0d124a8ca9c3..975245527c1f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4191,6 +4191,7 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) { struct nvme_fw_slot_info_log *log; + u8 next_fw_slot, cur_fw_slot; log = kmalloc(sizeof(*log), GFP_KERNEL); if (!log) @@ -4202,13 +4203,15 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) goto out_free_log; } - if (log->afi & 0x70 || !(log->afi & 0x7)) { + cur_fw_slot = log->afi & 0x7; + next_fw_slot = (log->afi & 0x70) >> 4; + if (!cur_fw_slot || (next_fw_slot && (cur_fw_slot != next_fw_slot))) { dev_info(ctrl->device, "Firmware is activated after next Controller Level Reset\n"); goto out_free_log; } - memcpy(ctrl->subsys->firmware_rev, &log->frs[(log->afi & 0x7) - 1], + memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - 1], sizeof(ctrl->subsys->firmware_rev)); out_free_log: From 1458eb2c9d88ad4b35eb6d6a4aa1d43d8fbf7f62 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 17 Jan 2024 20:57:40 +0100 Subject: [PATCH 079/270] riscv: Fix set_huge_pte_at() for NAPOT mapping As stated by the privileged specification, we must clear a NAPOT mapping and emit a sfence.vma before setting a new translation. Fixes: 82a1a1f3bfb6 ("riscv: mm: support Svnapot in hugetlb page") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240117195741.1926459-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/hugetlbpage.c | 42 +++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 431596c0e20e..865ac5c7d159 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -177,13 +177,36 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) return entry; } +static void clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + ptep_get_and_clear(mm, addr, ptep); + + flush_tlb_range(&vma, saddr, addr); +} + +/* + * When dealing with NAPOT mappings, the privileged specification indicates that + * "if an update needs to be made, the OS generally should first mark all of the + * PTEs invalid, then issue SFENCE.VMA instruction(s) covering all 4 KiB regions + * within the range, [...] then update the PTE(s), as described in Section + * 4.2.1.". That's the equivalent of the Break-Before-Make approach used by + * arm64. + */ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { - unsigned long hugepage_shift; + unsigned long hugepage_shift, pgsize; int i, pte_num; if (sz >= PGDIR_SIZE) @@ -198,7 +221,22 @@ void set_huge_pte_at(struct mm_struct *mm, hugepage_shift = PAGE_SHIFT; pte_num = sz >> hugepage_shift; - for (i = 0; i < pte_num; i++, ptep++, addr += (1 << hugepage_shift)) + pgsize = 1 << hugepage_shift; + + if (!pte_present(pte)) { + for (i = 0; i < pte_num; i++, ptep++, addr += pgsize) + set_ptes(mm, addr, ptep, pte, 1); + return; + } + + if (!pte_napot(pte)) { + set_ptes(mm, addr, ptep, pte, 1); + return; + } + + clear_flush(mm, addr, ptep, pgsize, pte_num); + + for (i = 0; i < pte_num; i++, ptep++, addr += pgsize) set_pte_at(mm, addr, ptep, pte); } From a179a4bfb694f80f2709a1d0398469e787acb974 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 17 Jan 2024 20:57:41 +0100 Subject: [PATCH 080/270] riscv: Fix hugetlb_mask_last_page() when NAPOT is enabled When NAPOT is enabled, a new hugepage size is available and then we need to make hugetlb_mask_last_page() aware of that. Fixes: 82a1a1f3bfb6 ("riscv: mm: support Svnapot in hugetlb page") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240117195741.1926459-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/hugetlbpage.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 865ac5c7d159..87406b26c3da 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -125,6 +125,26 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return pte; } +unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + unsigned long hp_size = huge_page_size(h); + + switch (hp_size) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + return P4D_SIZE - PUD_SIZE; +#endif + case PMD_SIZE: + return PUD_SIZE - PMD_SIZE; + case napot_cont_size(NAPOT_CONT64KB_ORDER): + return PMD_SIZE - napot_cont_size(NAPOT_CONT64KB_ORDER); + default: + break; + } + + return 0UL; +} + static pte_t get_clear_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, From 46eba193d04f8bd717e525eb4110f3c46c12aec3 Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Wed, 31 Jan 2024 10:08:28 +0800 Subject: [PATCH 081/270] net: stmmac: xgmac: fix handling of DPP safety error for DMA channels Commit 56e58d6c8a56 ("net: stmmac: Implement Safety Features in XGMAC core") checks and reports safety errors, but leaves the Data Path Parity Errors for each channel in DMA unhandled at all, lead to a storm of interrupt. Fix it by checking and clearing the DMA_DPP_Interrupt_Status register. Fixes: 56e58d6c8a56 ("net: stmmac: Implement Safety Features in XGMAC core") Signed-off-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + .../net/ethernet/stmicro/stmmac/dwxgmac2.h | 3 + .../ethernet/stmicro/stmmac/dwxgmac2_core.c | 57 ++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 721c1f8e892f..b4f60ab078d6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -216,6 +216,7 @@ struct stmmac_safety_stats { unsigned long mac_errors[32]; unsigned long mtl_errors[32]; unsigned long dma_errors[32]; + unsigned long dma_dpp_errors[32]; }; /* Number of fields in Safety Stats */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index 207ff1799f2c..5c67a3f89f08 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -303,6 +303,8 @@ #define XGMAC_RXCEIE BIT(4) #define XGMAC_TXCEIE BIT(0) #define XGMAC_MTL_ECC_INT_STATUS 0x000010cc +#define XGMAC_MTL_DPP_CONTROL 0x000010e0 +#define XGMAC_DDPP_DISABLE BIT(0) #define XGMAC_MTL_TXQ_OPMODE(x) (0x00001100 + (0x80 * (x))) #define XGMAC_TQS GENMASK(25, 16) #define XGMAC_TQS_SHIFT 16 @@ -385,6 +387,7 @@ #define XGMAC_DCEIE BIT(1) #define XGMAC_TCEIE BIT(0) #define XGMAC_DMA_ECC_INT_STATUS 0x0000306c +#define XGMAC_DMA_DPP_INT_STATUS 0x00003074 #define XGMAC_DMA_CH_CONTROL(x) (0x00003100 + (0x80 * (x))) #define XGMAC_SPH BIT(24) #define XGMAC_PBLx8 BIT(16) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index eb48211d9b0e..04d7c4dc2e35 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -830,6 +830,43 @@ static const struct dwxgmac3_error_desc dwxgmac3_dma_errors[32]= { { false, "UNKNOWN", "Unknown Error" }, /* 31 */ }; +static const char * const dpp_rx_err = "Read Rx Descriptor Parity checker Error"; +static const char * const dpp_tx_err = "Read Tx Descriptor Parity checker Error"; +static const struct dwxgmac3_error_desc dwxgmac3_dma_dpp_errors[32] = { + { true, "TDPES0", dpp_tx_err }, + { true, "TDPES1", dpp_tx_err }, + { true, "TDPES2", dpp_tx_err }, + { true, "TDPES3", dpp_tx_err }, + { true, "TDPES4", dpp_tx_err }, + { true, "TDPES5", dpp_tx_err }, + { true, "TDPES6", dpp_tx_err }, + { true, "TDPES7", dpp_tx_err }, + { true, "TDPES8", dpp_tx_err }, + { true, "TDPES9", dpp_tx_err }, + { true, "TDPES10", dpp_tx_err }, + { true, "TDPES11", dpp_tx_err }, + { true, "TDPES12", dpp_tx_err }, + { true, "TDPES13", dpp_tx_err }, + { true, "TDPES14", dpp_tx_err }, + { true, "TDPES15", dpp_tx_err }, + { true, "RDPES0", dpp_rx_err }, + { true, "RDPES1", dpp_rx_err }, + { true, "RDPES2", dpp_rx_err }, + { true, "RDPES3", dpp_rx_err }, + { true, "RDPES4", dpp_rx_err }, + { true, "RDPES5", dpp_rx_err }, + { true, "RDPES6", dpp_rx_err }, + { true, "RDPES7", dpp_rx_err }, + { true, "RDPES8", dpp_rx_err }, + { true, "RDPES9", dpp_rx_err }, + { true, "RDPES10", dpp_rx_err }, + { true, "RDPES11", dpp_rx_err }, + { true, "RDPES12", dpp_rx_err }, + { true, "RDPES13", dpp_rx_err }, + { true, "RDPES14", dpp_rx_err }, + { true, "RDPES15", dpp_rx_err }, +}; + static void dwxgmac3_handle_dma_err(struct net_device *ndev, void __iomem *ioaddr, bool correctable, struct stmmac_safety_stats *stats) @@ -841,6 +878,13 @@ static void dwxgmac3_handle_dma_err(struct net_device *ndev, dwxgmac3_log_error(ndev, value, correctable, "DMA", dwxgmac3_dma_errors, STAT_OFF(dma_errors), stats); + + value = readl(ioaddr + XGMAC_DMA_DPP_INT_STATUS); + writel(value, ioaddr + XGMAC_DMA_DPP_INT_STATUS); + + dwxgmac3_log_error(ndev, value, false, "DMA_DPP", + dwxgmac3_dma_dpp_errors, + STAT_OFF(dma_dpp_errors), stats); } static int @@ -881,6 +925,12 @@ dwxgmac3_safety_feat_config(void __iomem *ioaddr, unsigned int asp, value |= XGMAC_TMOUTEN; /* FSM Timeout Feature */ writel(value, ioaddr + XGMAC_MAC_FSM_CONTROL); + /* 5. Enable Data Path Parity Protection */ + value = readl(ioaddr + XGMAC_MTL_DPP_CONTROL); + /* already enabled by default, explicit enable it again */ + value &= ~XGMAC_DDPP_DISABLE; + writel(value, ioaddr + XGMAC_MTL_DPP_CONTROL); + return 0; } @@ -914,7 +964,11 @@ static int dwxgmac3_safety_feat_irq_status(struct net_device *ndev, ret |= !corr; } - err = dma & (XGMAC_DEUIS | XGMAC_DECIS); + /* DMA_DPP_Interrupt_Status is indicated by MCSIS bit in + * DMA_Safety_Interrupt_Status, so we handle DMA Data Path + * Parity Errors here + */ + err = dma & (XGMAC_DEUIS | XGMAC_DECIS | XGMAC_MCSIS); corr = dma & XGMAC_DECIS; if (err) { dwxgmac3_handle_dma_err(ndev, ioaddr, corr, stats); @@ -930,6 +984,7 @@ static const struct dwxgmac3_error { { dwxgmac3_mac_errors }, { dwxgmac3_mtl_errors }, { dwxgmac3_dma_errors }, + { dwxgmac3_dma_dpp_errors }, }; static int dwxgmac3_safety_feat_dump(struct stmmac_safety_stats *stats, From 177fbbcb4ed6b306c1626a277fac3fb1c495a4c7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 13:14:13 +0100 Subject: [PATCH 082/270] wifi: cfg80211: detect stuck ECSA element in probe resp We recently added some validation that we don't try to connect to an AP that is currently in a channel switch process, since that might want the channel to be quiet or we might not be able to connect in time to hear the switching in a beacon. This was in commit c09c4f31998b ("wifi: mac80211: don't connect to an AP while it's in a CSA process"). However, we promptly got a report that this caused new connection failures, and it turns out that the AP that we now cannot connect to is permanently advertising an extended channel switch announcement, even with quiet. The AP in question was an Asus RT-AC53, with firmware 3.0.0.4.380_10760-g21a5898. As a first step, attempt to detect that we're dealing with such a situation, so mac80211 can use this later. Reported-by: coldolt Closes: https://lore.kernel.org/linux-wireless/CAJvGw+DQhBk_mHXeu6RTOds5iramMW2FbMB01VbKRA4YbHHDTA@mail.gmail.com/ Fixes: c09c4f31998b ("wifi: mac80211: don't connect to an AP while it's in a CSA process") Reviewed-by: Miriam Rachel Korenblit Link: https://msgid.link/20240129131413.246972c8775e.Ibf834d7f52f9951a353b6872383da710a7358338@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++ net/wireless/scan.c | 59 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cf79656ce09c..2b54fdd8ca15 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2910,6 +2910,8 @@ struct cfg80211_bss_ies { * own the beacon_ies, but they're just pointers to the ones from the * @hidden_beacon_bss struct) * @proberesp_ies: the information elements from the last Probe Response frame + * @proberesp_ecsa_stuck: ECSA element is stuck in the Probe Response frame, + * cannot rely on it having valid data * @hidden_beacon_bss: in case this BSS struct represents a probe response from * a BSS that hides the SSID in its beacon, this points to the BSS struct * that holds the beacon data. @beacon_ies is still valid, of course, and @@ -2950,6 +2952,8 @@ struct cfg80211_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 proberesp_ecsa_stuck:1; + u8 bssid_index; u8 max_bssid_indicator; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 2249b1a89d1c..389a52c29bfc 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1731,6 +1731,61 @@ static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, } } +static void cfg80211_check_stuck_ecsa(struct cfg80211_registered_device *rdev, + struct cfg80211_internal_bss *known, + const struct cfg80211_bss_ies *old) +{ + const struct ieee80211_ext_chansw_ie *ecsa; + const struct element *elem_new, *elem_old; + const struct cfg80211_bss_ies *new, *bcn; + + if (known->pub.proberesp_ecsa_stuck) + return; + + new = rcu_dereference_protected(known->pub.proberesp_ies, + lockdep_is_held(&rdev->bss_lock)); + if (WARN_ON(!new)) + return; + + if (new->tsf - old->tsf < USEC_PER_SEC) + return; + + elem_old = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, + old->data, old->len); + if (!elem_old) + return; + + elem_new = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, + new->data, new->len); + if (!elem_new) + return; + + bcn = rcu_dereference_protected(known->pub.beacon_ies, + lockdep_is_held(&rdev->bss_lock)); + if (bcn && + cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, + bcn->data, bcn->len)) + return; + + if (elem_new->datalen != elem_old->datalen) + return; + if (elem_new->datalen < sizeof(struct ieee80211_ext_chansw_ie)) + return; + if (memcmp(elem_new->data, elem_old->data, elem_new->datalen)) + return; + + ecsa = (void *)elem_new->data; + + if (!ecsa->mode) + return; + + if (ecsa->new_ch_num != + ieee80211_frequency_to_channel(known->pub.channel->center_freq)) + return; + + known->pub.proberesp_ecsa_stuck = 1; +} + static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, @@ -1750,8 +1805,10 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, /* Override possible earlier Beacon frame IEs */ rcu_assign_pointer(known->pub.ies, new->pub.proberesp_ies); - if (old) + if (old) { + cfg80211_check_stuck_ecsa(rdev, known, old); kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); + } } if (rcu_access_pointer(new->pub.beacon_ies)) { From 35e2385dbe787936c793d70755a5177d267a40aa Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 13:14:14 +0100 Subject: [PATCH 083/270] wifi: mac80211: improve CSA/ECSA connection refusal As mentioned in the previous commit, we pretty quickly found that some APs have ECSA elements stuck in their probe response, so using that to not attempt to connect while CSA is happening we never connect to such an AP. Improve this situation by checking more carefully and ignoring the ECSA if cfg80211 has previously detected the ECSA element being stuck in the probe response. Additionally, allow connecting to an AP that's switching to a channel it's already using, unless it's using quiet mode. In this case, we may just have to adjust bandwidth later. If it's actually switching channels, it's better not to try to connect in the middle of that. Reported-by: coldolt Closes: https://lore.kernel.org/linux-wireless/CAJvGw+DQhBk_mHXeu6RTOds5iramMW2FbMB01VbKRA4YbHHDTA@mail.gmail.com/ Fixes: c09c4f31998b ("wifi: mac80211: don't connect to an AP while it's in a CSA process") Reviewed-by: Miriam Rachel Korenblit Link: https://msgid.link/20240129131413.cc2d0a26226e.I682c016af76e35b6c47007db50e8554c5a426910@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 103 ++++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 27 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 073105deb424..c62c7c6ce91f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7309,6 +7309,75 @@ out_err: return err; } +static bool ieee80211_mgd_csa_present(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_bss_ies *ies, + u8 cur_channel, bool ignore_ecsa) +{ + const struct element *csa_elem, *ecsa_elem; + struct ieee80211_channel_sw_ie *csa = NULL; + struct ieee80211_ext_chansw_ie *ecsa = NULL; + + if (!ies) + return false; + + csa_elem = cfg80211_find_elem(WLAN_EID_CHANNEL_SWITCH, + ies->data, ies->len); + if (csa_elem && csa_elem->datalen == sizeof(*csa)) + csa = (void *)csa_elem->data; + + ecsa_elem = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, + ies->data, ies->len); + if (ecsa_elem && ecsa_elem->datalen == sizeof(*ecsa)) + ecsa = (void *)ecsa_elem->data; + + if (csa && csa->count == 0) + csa = NULL; + if (csa && !csa->mode && csa->new_ch_num == cur_channel) + csa = NULL; + + if (ecsa && ecsa->count == 0) + ecsa = NULL; + if (ecsa && !ecsa->mode && ecsa->new_ch_num == cur_channel) + ecsa = NULL; + + if (ignore_ecsa && ecsa) { + sdata_info(sdata, + "Ignoring ECSA in probe response - was considered stuck!\n"); + return csa; + } + + return csa || ecsa; +} + +static bool ieee80211_mgd_csa_in_process(struct ieee80211_sub_if_data *sdata, + struct cfg80211_bss *bss) +{ + u8 cur_channel; + bool ret; + + cur_channel = ieee80211_frequency_to_channel(bss->channel->center_freq); + + rcu_read_lock(); + if (ieee80211_mgd_csa_present(sdata, + rcu_dereference(bss->beacon_ies), + cur_channel, false)) { + ret = true; + goto out; + } + + if (ieee80211_mgd_csa_present(sdata, + rcu_dereference(bss->proberesp_ies), + cur_channel, bss->proberesp_ecsa_stuck)) { + ret = true; + goto out; + } + + ret = false; +out: + rcu_read_unlock(); + return ret; +} + /* config hooks */ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req) @@ -7317,7 +7386,6 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_link_data *link; - const struct element *csa_elem, *ecsa_elem; u16 auth_alg; int err; bool cont_auth; @@ -7360,21 +7428,10 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, if (ifmgd->assoc_data) return -EBUSY; - rcu_read_lock(); - csa_elem = ieee80211_bss_get_elem(req->bss, WLAN_EID_CHANNEL_SWITCH); - ecsa_elem = ieee80211_bss_get_elem(req->bss, - WLAN_EID_EXT_CHANSWITCH_ANN); - if ((csa_elem && - csa_elem->datalen == sizeof(struct ieee80211_channel_sw_ie) && - ((struct ieee80211_channel_sw_ie *)csa_elem->data)->count != 0) || - (ecsa_elem && - ecsa_elem->datalen == sizeof(struct ieee80211_ext_chansw_ie) && - ((struct ieee80211_ext_chansw_ie *)ecsa_elem->data)->count != 0)) { - rcu_read_unlock(); + if (ieee80211_mgd_csa_in_process(sdata, req->bss)) { sdata_info(sdata, "AP is in CSA process, reject auth\n"); return -EINVAL; } - rcu_read_unlock(); auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len + req->ie_len, GFP_KERNEL); @@ -7684,7 +7741,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data; - const struct element *ssid_elem, *csa_elem, *ecsa_elem; + const struct element *ssid_elem; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; ieee80211_conn_flags_t conn_flags = 0; struct ieee80211_link_data *link; @@ -7707,23 +7764,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, cbss = req->link_id < 0 ? req->bss : req->links[req->link_id].bss; - rcu_read_lock(); - ssid_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); - if (!ssid_elem || ssid_elem->datalen > sizeof(assoc_data->ssid)) { - rcu_read_unlock(); + if (ieee80211_mgd_csa_in_process(sdata, cbss)) { + sdata_info(sdata, "AP is in CSA process, reject assoc\n"); kfree(assoc_data); return -EINVAL; } - csa_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_CHANNEL_SWITCH); - ecsa_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_EXT_CHANSWITCH_ANN); - if ((csa_elem && - csa_elem->datalen == sizeof(struct ieee80211_channel_sw_ie) && - ((struct ieee80211_channel_sw_ie *)csa_elem->data)->count != 0) || - (ecsa_elem && - ecsa_elem->datalen == sizeof(struct ieee80211_ext_chansw_ie) && - ((struct ieee80211_ext_chansw_ie *)ecsa_elem->data)->count != 0)) { - sdata_info(sdata, "AP is in CSA process, reject assoc\n"); + rcu_read_lock(); + ssid_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); + if (!ssid_elem || ssid_elem->datalen > sizeof(assoc_data->ssid)) { rcu_read_unlock(); kfree(assoc_data); return -EINVAL; From 9480adfe4e0f0319b9da04b44e4eebd5ad07e0cd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 15:53:48 +0100 Subject: [PATCH 084/270] wifi: mac80211: fix RCU use in TDLS fast-xmit This looks up the link under RCU protection, but isn't guaranteed to actually have protection. Fix that. Fixes: 8cc07265b691 ("wifi: mac80211: handle TDLS data frames with MLO") Link: https://msgid.link/20240129155348.8a9c0b1e1d89.I553f96ce953bb41b0b877d592056164dec20d01c@changeid Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 68a48abc7287..e448ab338448 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3100,10 +3100,11 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) /* DA SA BSSID */ build.da_offs = offsetof(struct ieee80211_hdr, addr1); build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + rcu_read_lock(); link = rcu_dereference(sdata->link[tdls_link_id]); - if (WARN_ON_ONCE(!link)) - break; - memcpy(hdr->addr3, link->u.mgd.bssid, ETH_ALEN); + if (!WARN_ON_ONCE(!link)) + memcpy(hdr->addr3, link->u.mgd.bssid, ETH_ALEN); + rcu_read_unlock(); build.hdr_len = 24; break; } From dd6c064cfc3fc18d871107c6f5db8837e88572e4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 15:53:55 +0100 Subject: [PATCH 085/270] wifi: mac80211: set station RX-NSS on reconfig When a station is added/reconfigured by userspace, e.g. a TDLS peer or a SoftAP client STA, rx_nss is currently not always set, so that it might be left zero. Set it up properly. Link: https://msgid.link/20240129155354.98f148a3d654.I193a02155f557ea54dc9d0232da66cf96734119a@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 489dd97f5172..321698012e12 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1869,6 +1869,8 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, sband->band); } + ieee80211_sta_set_rx_nss(link_sta); + return ret; } From 733c498a80853acbafe284a40468b91f4d41f0b4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 15:54:02 +0100 Subject: [PATCH 086/270] wifi: mac80211: fix driver debugfs for vif type change If a driver implements the change_interface() method, we switch interface type without taking the interface down, but still will recreate the debugfs for it since it's a new type. As such, we should use the ieee80211_debugfs_recreate_netdev() function here to also recreate the driver's files, if it is indeed from a type change while up. Link: https://msgid.link/20240129155402.7311a36ffeeb.I18df02bbeb685d4250911de5ffbaf090f60c3803@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 4 ++-- net/mac80211/debugfs_netdev.h | 5 ----- net/mac80211/iface.c | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index dce5606ed66d..68596ef78b15 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -997,8 +997,8 @@ static void add_link_files(struct ieee80211_link_data *link, } } -void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata, - bool mld_vif) +static void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata, + bool mld_vif) { char buf[10+IFNAMSIZ]; diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h index b226b1aae88a..a02ec0a413f6 100644 --- a/net/mac80211/debugfs_netdev.h +++ b/net/mac80211/debugfs_netdev.h @@ -11,8 +11,6 @@ #include "ieee80211_i.h" #ifdef CONFIG_MAC80211_DEBUGFS -void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata, - bool mld_vif); void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata, @@ -24,9 +22,6 @@ void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link); void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link); void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link); #else -static inline void ieee80211_debugfs_add_netdev( - struct ieee80211_sub_if_data *sdata, bool mld_vif) -{} static inline void ieee80211_debugfs_remove_netdev( struct ieee80211_sub_if_data *sdata) {} diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e4e7c0b38cb6..11c4caa4748e 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1783,7 +1783,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, /* need to do this after the switch so vif.type is correct */ ieee80211_link_setup(&sdata->deflink); - ieee80211_debugfs_add_netdev(sdata, false); + ieee80211_debugfs_recreate_netdev(sdata, false); } static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, From 86b2dac224f963be92634a878888222e1e938f48 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 19:54:05 +0100 Subject: [PATCH 087/270] wifi: mac80211: initialize SMPS mode correctly The SMPS mode is currently re-initialized too late, since ieee80211_prep_channel() can be called again after we've already done ieee80211_setup_assoc_link(), in case there's some override of the channel configuration. Fix this. Link: https://msgid.link/20240129195405.d6d74508be18.I0a7303b1ce4d8e5436011951ab624372a445c069@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c62c7c6ce91f..f1cdd2df323e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2024 Intel Corporation */ #include @@ -2918,6 +2918,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, /* other links will be destroyed */ sdata->deflink.u.mgd.bss = NULL; + sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); @@ -5045,9 +5046,6 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, if (!link) return 0; - /* will change later if needed */ - link->smps_mode = IEEE80211_SMPS_OFF; - /* * If this fails (possibly due to channel context sharing * on incompatible channels, e.g. 80+80 and 160 sharing the @@ -7096,6 +7094,7 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) link->u.mgd.p2p_noa_index = -1; link->u.mgd.conn_flags = 0; link->conf->bssid = link->u.mgd.bssid; + link->smps_mode = IEEE80211_SMPS_OFF; wiphy_work_init(&link->u.mgd.request_smps_work, ieee80211_request_smps_mgd_work); From 178e9d6adc4356c2f1659f575ecea626e7fbd05a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 19:57:30 +0100 Subject: [PATCH 088/270] wifi: mac80211: fix unsolicited broadcast probe config There's a bug in ieee80211_set_unsol_bcast_probe_resp(), it tries to return BSS_CHANGED_UNSOL_BCAST_PROBE_RESP (which has the value 1<<31) in an int, which makes it negative and considered an error. Fix this by passing the changed flags to set separately. Fixes: 3b1c256eb4ae ("wifi: mac80211: fixes in FILS discovery updates") Reviewed-by: Jeff Johnson Link: https://msgid.link/20240129195729.965b0740bf80.I6bc6f5236863f686c17d689be541b1dd2633c417@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 321698012e12..327682995c92 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include @@ -987,7 +987,8 @@ static int ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, struct cfg80211_unsol_bcast_probe_resp *params, struct ieee80211_link_data *link, - struct ieee80211_bss_conf *link_conf) + struct ieee80211_bss_conf *link_conf, + u64 *changed) { struct unsol_bcast_probe_resp_data *new, *old = NULL; @@ -1011,7 +1012,8 @@ ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); } - return BSS_CHANGED_UNSOL_BCAST_PROBE_RESP; + *changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP; + return 0; } static int ieee80211_set_ftm_responder_params( @@ -1450,10 +1452,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, err = ieee80211_set_unsol_bcast_probe_resp(sdata, ¶ms->unsol_bcast_probe_resp, - link, link_conf); + link, link_conf, &changed); if (err < 0) goto error; - changed |= err; err = drv_start_ap(sdata->local, sdata, link_conf); if (err) { @@ -1525,10 +1526,9 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, err = ieee80211_set_unsol_bcast_probe_resp(sdata, ¶ms->unsol_bcast_probe_resp, - link, link_conf); + link, link_conf, &changed); if (err < 0) return err; - changed |= err; if (beacon->he_bss_color_valid && beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) { From a0b4f2291319c5d47ecb196b90400814fdcfd126 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 31 Jan 2024 16:48:56 +0100 Subject: [PATCH 089/270] wifi: mac80211: fix waiting for beacons logic This should be waiting if we don't have a beacon yet, but somehow I managed to invert the logic. Fix that. Fixes: 74e1309acedc ("wifi: mac80211: mlme: look up beacon elems only if needed") Link: https://msgid.link/20240131164856.922701229546.I239b379e7cee04608e73c016b737a5245e5b23dd@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f1cdd2df323e..12f67871af69 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8046,8 +8046,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); - - if (beacon_ies) { + if (!beacon_ies) { /* * Wait up to one beacon interval ... * should this be more if we miss one? From c042600c17d8c490279f0ae2baee29475fe8047d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 31 Jan 2024 16:48:23 +0100 Subject: [PATCH 090/270] wifi: mac80211: adding missing drv_mgd_complete_tx() call There's a call to drv_mgd_prepare_tx() and so there should be one to drv_mgd_complete_tx(), but on this path it's not. Add it. Link: https://msgid.link/20240131164824.2f0922a514e1.I5aac89b93bcead88c374187d70cad0599d29d2c8@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 12f67871af69..2022a26eb881 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8127,6 +8127,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); + drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } From 62a6183c13319e4d2227473a04abd104c4f56dcf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 20:09:07 +0100 Subject: [PATCH 091/270] wifi: mac80211: accept broadcast probe responses on 6 GHz On the 6 GHz band, probe responses are sent as broadcast to optimise medium usage. However, without OCE configuration we weren't accepting them, which is wrong, even if wpa_s is by default enabling OCE. Accept them without the OCE config as well. Link: https://msgid.link/20240129200907.5a89c2821897.I92e9dfa0f9b350bc7f37dd4bb38031d156d78d8a@changeid Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 645355e5f1bc..f9d5842601fa 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include @@ -237,14 +237,18 @@ ieee80211_bss_info_update(struct ieee80211_local *local, } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, u32 scan_flags, const u8 *da) { if (!sdata) return false; - /* accept broadcast for OCE */ - if (scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP && - is_broadcast_ether_addr(da)) + + /* accept broadcast on 6 GHz and for OCE */ + if (is_broadcast_ether_addr(da) && + (channel->band == NL80211_BAND_6GHZ || + scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP)) return true; + if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; return ether_addr_equal(da, sdata->vif.addr); @@ -293,6 +297,12 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return; + if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; @@ -310,19 +320,15 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ - if (!ieee80211_scan_accept_presp(sdata1, scan_req_flags, + if (!ieee80211_scan_accept_presp(sdata1, channel, + scan_req_flags, mgmt->da) && - !ieee80211_scan_accept_presp(sdata2, sched_scan_req_flags, + !ieee80211_scan_accept_presp(sdata2, channel, + sched_scan_req_flags, mgmt->da)) return; } - channel = ieee80211_get_channel_khz(local->hw.wiphy, - ieee80211_rx_status_to_khz(rx_status)); - - if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) - return; - bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); From fe92f874f09145a6951deacaa4961390238bbe0d Mon Sep 17 00:00:00 2001 From: Michael Lass Date: Wed, 31 Jan 2024 16:52:20 +0100 Subject: [PATCH 092/270] net: Fix from address in memcpy_to_iter_csum() While inlining csum_and_memcpy() into memcpy_to_iter_csum(), the from address passed to csum_partial_copy_nocheck() was accidentally changed. This causes a regression in applications using UDP, as for example OpenAFS, causing loss of datagrams. Fixes: dc32bff195b4 ("iov_iter, net: Fold in csum_and_memcpy()") Cc: David Howells Cc: stable@vger.kernel.org Cc: regressions@lists.linux.dev Signed-off-by: Michael Lass Reviewed-by: Jeffrey Altman Acked-by: David Howells Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index 103d46fa0eeb..a8b625abe242 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -751,7 +751,7 @@ size_t memcpy_to_iter_csum(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum *csum = priv2; - __wsum next = csum_partial_copy_nocheck(from, iter_to, len); + __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return 0; From 1fa942f31665ea5dc5d4d95893dd13723eaa97cc Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 28 Jan 2024 08:53:57 +0200 Subject: [PATCH 093/270] wifi: iwlwifi: mvm: fix a battery life regression Fix the DBG_CONFIG_TOKEN to not enable debug components that would prevent the device to save power. Fixes: fc2fe0a5e856 ("wifi: iwlwifi: fw: disable firmware debug asserts") Cc: stable@vger.kernel.org Signed-off-by: Emmanuel Grumbach Reviewed-by: Eilon Rinat Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20240128084842.90d2600edc27.Id657ea2f0ddb131f5f9d0ac39aeb8c88754fe54b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/fw/api/debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/debug.h b/drivers/net/wireless/intel/iwlwifi/fw/api/debug.h index 798731ecbefd..b740c65a7dca 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/api/debug.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/api/debug.h @@ -537,7 +537,7 @@ enum iwl_fw_dbg_config_cmd_type { }; /* LDBG_CFG_CMD_TYPE_API_E_VER_1 */ /* this token disables debug asserts in the firmware */ -#define IWL_FW_DBG_CONFIG_TOKEN 0x00011301 +#define IWL_FW_DBG_CONFIG_TOKEN 0x00010001 /** * struct iwl_fw_dbg_config_cmd - configure FW debug From 16867c38bcd3be2eb9016a3198a096f93959086e Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Thu, 1 Feb 2024 16:17:39 +0200 Subject: [PATCH 094/270] wifi: iwlwifi: exit eSR only after the FW does Currently the driver exits eSR by calling iwl_mvm_esr_mode_inactive() before updating the FW (by deactivating one of the links), and therefore before sending the EML frame notifying that we are no longer in eSR. This is wrong for several reasons: 1. The driver sends SMPS activation frames when we are still in eSR and SMPS should be disabled when in eSR 2. The driver restores RLC configuration as it was before eSR entering, and RLC command shouldn't be sent in eSR Fix this by calling iwl_mvm_esr_mode_inactive() after FW update Fixes: 12bacfc2c065 ("wifi: iwlwifi: handle eSR transitions") Signed-off-by: Miri Korenblit Reviewed-by: Ilan Peer Reviewed-by: Gregory Greenman Link: https://msgid.link/20240201155157.d8d9dc277d4e.Ib5aee0fd05e35b1da7f18753eb3c8fa0a3f872f3@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c index 1f36e934ef69..893b69fc841b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c @@ -438,6 +438,9 @@ __iwl_mvm_mld_unassign_vif_chanctx(struct iwl_mvm *mvm, mvmvif->ap_ibss_active = false; } + iwl_mvm_link_changed(mvm, vif, link_conf, + LINK_CONTEXT_MODIFY_ACTIVE, false); + if (iwl_mvm_is_esr_supported(mvm->fwrt.trans) && n_active > 1) { int ret = iwl_mvm_esr_mode_inactive(mvm, vif); @@ -449,9 +452,6 @@ __iwl_mvm_mld_unassign_vif_chanctx(struct iwl_mvm *mvm, if (vif->type == NL80211_IFTYPE_MONITOR) iwl_mvm_mld_rm_snif_sta(mvm, vif); - iwl_mvm_link_changed(mvm, vif, link_conf, - LINK_CONTEXT_MODIFY_ACTIVE, false); - if (switching_chanctx) return; mvmvif->link[link_id]->phy_ctxt = NULL; From a23c0af103e184bb1252dddddda040f6641bea7b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 1 Feb 2024 16:17:30 +0200 Subject: [PATCH 095/270] wifi: iwlwifi: do not announce EPCS support mac80211 does not have proper support for EPCS currently as that would require changing the ECDA parameters if EPCS (Emergency Preparedness Communications Service) is in use. As such, do not announce support for it in the capabilities. Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240201155157.59d71656addc.Idde91b3018239c49fc6ed231b411d05354fb9fb1@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 402896988686..2f6774ec37b2 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -668,7 +668,6 @@ static const struct ieee80211_sband_iftype_data iwl_he_eht_capa[] = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = - IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 | @@ -793,7 +792,6 @@ static const struct ieee80211_sband_iftype_data iwl_he_eht_capa[] = { .has_eht = true, .eht_cap_elem = { .mac_cap_info[0] = - IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | IEEE80211_EHT_MAC_CAP0_OM_CONTROL | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2, @@ -1020,8 +1018,7 @@ iwl_nvm_fixup_sband_iftd(struct iwl_trans *trans, if (CSR_HW_REV_TYPE(trans->hw_rev) == IWL_CFG_MAC_TYPE_GL && iftype_data->eht_cap.has_eht) { iftype_data->eht_cap.eht_cap_elem.mac_cap_info[0] &= - ~(IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | - IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 | + ~(IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 | IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2); iftype_data->eht_cap.eht_cap_elem.phy_cap_info[3] &= ~(IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | From 5bdda0048c8d1bbe2019513b2d6200cc0d09c7bd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 26 Jan 2024 14:31:53 -0800 Subject: [PATCH 096/270] wifi: brcmfmac: Adjust n_channels usage for __counted_by After commit e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by"), the compiler may enforce dynamic array indexing of req->channels to stay below n_channels. As a result, n_channels needs to be increased _before_ accessing the newly added array index. Increment it first, then use "i" for the prior index. Solves this warning in the coming GCC that has __counted_by support: ../drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c: In function 'brcmf_internal_escan_add_info': ../drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c:3783:46: warning: operation on 'req-> n_channels' may be undefined [-Wsequence-point] 3783 | req->channels[req->n_channels++] = chan; | ~~~~~~~~~~~~~~~^~ Fixes: e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by") Cc: Arend van Spriel Cc: Franky Lin Cc: Hante Meuleman Cc: Kalle Valo Cc: Chi-hsien Lin Cc: Ian Lin Cc: Johannes Berg Cc: Wright Feng Cc: Hector Martin Cc: linux-wireless@vger.kernel.org Cc: brcm80211-dev-list.pdl@broadcom.com Signed-off-by: Kees Cook Reviewed-by: Hans de Goede Reviewed-by: Linus Walleij Reviewed-by: Gustavo A. R. Silva Signed-off-by: Kalle Valo Link: https://msgid.link/20240126223150.work.548-kees@kernel.org --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 133c5ea6429c..28d6a30cc010 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3779,8 +3779,10 @@ static int brcmf_internal_escan_add_info(struct cfg80211_scan_request *req, if (req->channels[i] == chan) break; } - if (i == req->n_channels) - req->channels[req->n_channels++] = chan; + if (i == req->n_channels) { + req->n_channels++; + req->channels[i] = chan; + } for (i = 0; i < req->n_ssids; i++) { if (req->ssids[i].ssid_len == ssid_len && From de1034b38a346ef6be25fe8792f5d1e0684d5ff4 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 2 Feb 2024 10:07:03 -0800 Subject: [PATCH 097/270] efi: runtime: Fix potential overflow of soft-reserved region size md_size will have been narrowed if we have >= 4GB worth of pages in a soft-reserved region. Signed-off-by: Andrew Bresticker Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/arm-runtime.c | 2 +- drivers/firmware/efi/riscv-runtime.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 83f5bb57fa4c..83092d93f36a 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -107,7 +107,7 @@ static int __init arm_enable_runtime_services(void) efi_memory_desc_t *md; for_each_efi_memory_desc(md) { - int md_size = md->num_pages << EFI_PAGE_SHIFT; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; struct resource *res; if (!(md->attribute & EFI_MEMORY_SP)) diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c index 09525fb5c240..01f0f90ea418 100644 --- a/drivers/firmware/efi/riscv-runtime.c +++ b/drivers/firmware/efi/riscv-runtime.c @@ -85,7 +85,7 @@ static int __init riscv_enable_runtime_services(void) efi_memory_desc_t *md; for_each_efi_memory_desc(md) { - int md_size = md->num_pages << EFI_PAGE_SHIFT; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; struct resource *res; if (!(md->attribute & EFI_MEMORY_SP)) From 0bcff59ef7a652fcdc6d535554b63278c2406c8f Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 2 Feb 2024 10:07:04 -0800 Subject: [PATCH 098/270] efi: Don't add memblocks for soft-reserved memory Adding memblocks for soft-reserved regions prevents them from later being hotplugged in by dax_kmem. Signed-off-by: Andrew Bresticker Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi-init.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index d4987d013080..a00e07b853f2 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -143,15 +143,6 @@ static __init int is_usable_memory(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: case EFI_PERSISTENT_MEMORY: - /* - * Special purpose memory is 'soft reserved', which means it - * is set aside initially, but can be hotplugged back in or - * be assigned to the dax driver after boot. - */ - if (efi_soft_reserve_enabled() && - (md->attribute & EFI_MEMORY_SP)) - return false; - /* * According to the spec, these regions are no longer reserved * after calling ExitBootServices(). However, we can only use @@ -196,6 +187,16 @@ static __init void reserve_regions(void) size = npages << PAGE_SHIFT; if (is_memory(md)) { + /* + * Special purpose memory is 'soft reserved', which + * means it is set aside initially. Don't add a memblock + * for it now so that it can be hotplugged back in or + * be assigned to the dax driver after boot. + */ + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + continue; + early_init_dt_add_memory_arch(paddr, size); if (!is_usable_memory(md)) From ba5e1272142d051dcc57ca1d3225ad8a089f9858 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 1 Feb 2024 17:53:24 +0000 Subject: [PATCH 099/270] netdevsim: avoid potential loop in nsim_dev_trap_report_work() Many syzbot reports include the following trace [1] If nsim_dev_trap_report_work() can not grab the mutex, it should rearm itself at least one jiffie later. [1] Sending NMI from CPU 1 to CPUs 0: NMI backtrace for cpu 0 CPU: 0 PID: 32383 Comm: kworker/0:2 Not tainted 6.8.0-rc2-syzkaller-00031-g861c0981648f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Workqueue: events nsim_dev_trap_report_work RIP: 0010:bytes_is_nonzero mm/kasan/generic.c:89 [inline] RIP: 0010:memory_is_nonzero mm/kasan/generic.c:104 [inline] RIP: 0010:memory_is_poisoned_n mm/kasan/generic.c:129 [inline] RIP: 0010:memory_is_poisoned mm/kasan/generic.c:161 [inline] RIP: 0010:check_region_inline mm/kasan/generic.c:180 [inline] RIP: 0010:kasan_check_range+0x101/0x190 mm/kasan/generic.c:189 Code: 07 49 39 d1 75 0a 45 3a 11 b8 01 00 00 00 7c 0b 44 89 c2 e8 21 ed ff ff 83 f0 01 5b 5d 41 5c c3 48 85 d2 74 4f 48 01 ea eb 09 <48> 83 c0 01 48 39 d0 74 41 80 38 00 74 f2 eb b6 41 bc 08 00 00 00 RSP: 0018:ffffc90012dcf998 EFLAGS: 00000046 RAX: fffffbfff258af1e RBX: fffffbfff258af1f RCX: ffffffff8168eda3 RDX: fffffbfff258af1f RSI: 0000000000000004 RDI: ffffffff92c578f0 RBP: fffffbfff258af1e R08: 0000000000000000 R09: fffffbfff258af1e R10: ffffffff92c578f3 R11: ffffffff8acbcbc0 R12: 0000000000000002 R13: ffff88806db38400 R14: 1ffff920025b9f42 R15: ffffffff92c578e8 FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c00994e078 CR3: 000000002c250000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: instrument_atomic_read include/linux/instrumented.h:68 [inline] atomic_read include/linux/atomic/atomic-instrumented.h:32 [inline] queued_spin_is_locked include/asm-generic/qspinlock.h:57 [inline] debug_spin_unlock kernel/locking/spinlock_debug.c:101 [inline] do_raw_spin_unlock+0x53/0x230 kernel/locking/spinlock_debug.c:141 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:150 [inline] _raw_spin_unlock_irqrestore+0x22/0x70 kernel/locking/spinlock.c:194 debug_object_activate+0x349/0x540 lib/debugobjects.c:726 debug_work_activate kernel/workqueue.c:578 [inline] insert_work+0x30/0x230 kernel/workqueue.c:1650 __queue_work+0x62e/0x11d0 kernel/workqueue.c:1802 __queue_delayed_work+0x1bf/0x270 kernel/workqueue.c:1953 queue_delayed_work_on+0x106/0x130 kernel/workqueue.c:1989 queue_delayed_work include/linux/workqueue.h:563 [inline] schedule_delayed_work include/linux/workqueue.h:677 [inline] nsim_dev_trap_report_work+0x9c0/0xc80 drivers/net/netdevsim/dev.c:842 process_one_work+0x886/0x15d0 kernel/workqueue.c:2633 process_scheduled_works kernel/workqueue.c:2706 [inline] worker_thread+0x8b9/0x1290 kernel/workqueue.c:2787 kthread+0x2c6/0x3a0 kernel/kthread.c:388 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Fixes: 012ec02ae441 ("netdevsim: convert driver to use unlocked devlink API during init/fini") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240201175324.3752746-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index b4d3b9cde8bd..92a7a36b93ac 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -835,14 +835,14 @@ static void nsim_dev_trap_report_work(struct work_struct *work) trap_report_dw.work); nsim_dev = nsim_trap_data->nsim_dev; - /* For each running port and enabled packet trap, generate a UDP - * packet with a random 5-tuple and report it. - */ if (!devl_trylock(priv_to_devlink(nsim_dev))) { - schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, 0); + schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, 1); return; } + /* For each running port and enabled packet trap, generate a UDP + * packet with a random 5-tuple and report it. + */ list_for_each_entry(nsim_dev_port, &nsim_dev->port_list, list) { if (!netif_running(nsim_dev_port->ns->netdev)) continue; From 05519c86d6997cfb9bb6c82ce1595d1015b718dc Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Tue, 23 Jan 2024 22:12:20 +0000 Subject: [PATCH 100/270] KVM: x86/pmu: Fix type length error when reading pmu->fixed_ctr_ctrl Use a u64 instead of a u8 when taking a snapshot of pmu->fixed_ctr_ctrl when reprogramming fixed counters, as truncating the value results in KVM thinking fixed counter 2 is already disabled (the bug also affects fixed counters 3+, but KVM doesn't yet support those). As a result, if the guest disables fixed counter 2, KVM will get a false negative and fail to reprogram/disable emulation of the counter, which can leads to incorrect counts and spurious PMIs in the guest. Fixes: 76d287b2342e ("KVM: x86/pmu: Drop "u8 ctrl, int idx" for reprogram_fixed_counter()") Cc: stable@vger.kernel.org Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240123221220.3911317-1-mizhang@google.com [sean: rewrite changelog to call out the effects of the bug] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a6216c874729..315c7c2ba89b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -71,7 +71,7 @@ static int fixed_pmc_events[] = { static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; - u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; + u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; pmu->fixed_ctr_ctrl = data; From 2e7d3b67630dfd8f178c41fa2217aa00e79a5887 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 1 Feb 2024 10:47:51 +0100 Subject: [PATCH 101/270] net: atlantic: Fix DMA mapping for PTP hwts ring Function aq_ring_hwts_rx_alloc() maps extra AQ_CFG_RXDS_DEF bytes for PTP HWTS ring but then generic aq_ring_free() does not take this into account. Create and use a specific function to free HWTS ring to fix this issue. Trace: [ 215.351607] ------------[ cut here ]------------ [ 215.351612] DMA-API: atlantic 0000:4b:00.0: device driver frees DMA memory with different size [device address=0x00000000fbdd0000] [map size=34816 bytes] [unmap size=32768 bytes] [ 215.351635] WARNING: CPU: 33 PID: 10759 at kernel/dma/debug.c:988 check_unmap+0xa6f/0x2360 ... [ 215.581176] Call Trace: [ 215.583632] [ 215.585745] ? show_trace_log_lvl+0x1c4/0x2df [ 215.590114] ? show_trace_log_lvl+0x1c4/0x2df [ 215.594497] ? debug_dma_free_coherent+0x196/0x210 [ 215.599305] ? check_unmap+0xa6f/0x2360 [ 215.603147] ? __warn+0xca/0x1d0 [ 215.606391] ? check_unmap+0xa6f/0x2360 [ 215.610237] ? report_bug+0x1ef/0x370 [ 215.613921] ? handle_bug+0x3c/0x70 [ 215.617423] ? exc_invalid_op+0x14/0x50 [ 215.621269] ? asm_exc_invalid_op+0x16/0x20 [ 215.625480] ? check_unmap+0xa6f/0x2360 [ 215.629331] ? mark_lock.part.0+0xca/0xa40 [ 215.633445] debug_dma_free_coherent+0x196/0x210 [ 215.638079] ? __pfx_debug_dma_free_coherent+0x10/0x10 [ 215.643242] ? slab_free_freelist_hook+0x11d/0x1d0 [ 215.648060] dma_free_attrs+0x6d/0x130 [ 215.651834] aq_ring_free+0x193/0x290 [atlantic] [ 215.656487] aq_ptp_ring_free+0x67/0x110 [atlantic] ... [ 216.127540] ---[ end trace 6467e5964dd2640b ]--- [ 216.132160] DMA-API: Mapped at: [ 216.132162] debug_dma_alloc_coherent+0x66/0x2f0 [ 216.132165] dma_alloc_attrs+0xf5/0x1b0 [ 216.132168] aq_ring_hwts_rx_alloc+0x150/0x1f0 [atlantic] [ 216.132193] aq_ptp_ring_alloc+0x1bb/0x540 [atlantic] [ 216.132213] aq_nic_init+0x4a1/0x760 [atlantic] Fixes: 94ad94558b0f ("net: aquantia: add PTP rings infrastructure") Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240201094752.883026-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aquantia/atlantic/aq_ptp.c | 4 ++-- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 13 +++++++++++++ drivers/net/ethernet/aquantia/atlantic/aq_ring.h | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c b/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c index abd4832e4ed2..5acb3e16b567 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ptp.c @@ -993,7 +993,7 @@ int aq_ptp_ring_alloc(struct aq_nic_s *aq_nic) return 0; err_exit_hwts_rx: - aq_ring_free(&aq_ptp->hwts_rx); + aq_ring_hwts_rx_free(&aq_ptp->hwts_rx); err_exit_ptp_rx: aq_ring_free(&aq_ptp->ptp_rx); err_exit_ptp_tx: @@ -1011,7 +1011,7 @@ void aq_ptp_ring_free(struct aq_nic_s *aq_nic) aq_ring_free(&aq_ptp->ptp_tx); aq_ring_free(&aq_ptp->ptp_rx); - aq_ring_free(&aq_ptp->hwts_rx); + aq_ring_hwts_rx_free(&aq_ptp->hwts_rx); aq_ptp_skb_ring_release(&aq_ptp->skb_ring); } diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index cda8597b4e14..f7433abd6591 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -919,6 +919,19 @@ void aq_ring_free(struct aq_ring_s *self) } } +void aq_ring_hwts_rx_free(struct aq_ring_s *self) +{ + if (!self) + return; + + if (self->dx_ring) { + dma_free_coherent(aq_nic_get_dev(self->aq_nic), + self->size * self->dx_size + AQ_CFG_RXDS_DEF, + self->dx_ring, self->dx_ring_pa); + self->dx_ring = NULL; + } +} + unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data) { unsigned int count; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h index 52847310740a..d627ace850ff 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h @@ -210,6 +210,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self); int aq_ring_hwts_rx_alloc(struct aq_ring_s *self, struct aq_nic_s *aq_nic, unsigned int idx, unsigned int size, unsigned int dx_size); +void aq_ring_hwts_rx_free(struct aq_ring_s *self); void aq_ring_hwts_rx_clean(struct aq_ring_s *self, struct aq_nic_s *aq_nic); unsigned int aq_ring_fill_stats_data(struct aq_ring_s *self, u64 *data); From cb9f4a30fb85e1f4f149ada595a67899adb3db19 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 1 Feb 2024 19:42:38 +0100 Subject: [PATCH 102/270] selftests: net: cut more slack for gro fwd tests. The udpgro_fwd.sh self-tests are somewhat unstable. There are a few timing constraints the we struggle to meet on very slow environments. Instead of skipping the whole tests in such envs, increase the test resilience WRT very slow hosts: increase the inter-packets timeouts, avoid resetting the counters every second and finally disable reduce the background traffic noise. Tested with: for I in $(seq 1 100); do ./tools/testing/selftests/kselftest_install/run_kselftest.sh \ -t net:udpgro_fwd.sh || exit -1 done in a slow environment. Fixes: a062260a9d5f ("selftests: net: add UDP GRO forwarding self-tests") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Link: https://lore.kernel.org/r/f4b6b11064a0d39182a9ae6a853abae3e9b4426a.1706812005.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgro_fwd.sh | 14 ++++++++++++-- tools/testing/selftests/net/udpgso_bench_rx.c | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index d6b9c759043c..9cd5e885e91f 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -39,6 +39,10 @@ create_ns() { for ns in $NS_SRC $NS_DST; do ip netns add $ns ip -n $ns link set dev lo up + + # disable route solicitations to decrease 'noise' traffic + ip netns exec $ns sysctl -qw net.ipv6.conf.default.router_solicitations=0 + ip netns exec $ns sysctl -qw net.ipv6.conf.all.router_solicitations=0 done ip link add name veth$SRC type veth peer name veth$DST @@ -80,6 +84,12 @@ create_vxlan_pair() { create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$((3 - $ns)) vxlan6$ns 6 ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad done + + # preload neighbur cache, do avoid some noisy traffic + local addr_dst=$(ip -j -n $BASE$DST link show dev vxlan6$DST |jq -r '.[]["address"]') + local addr_src=$(ip -j -n $BASE$SRC link show dev vxlan6$SRC |jq -r '.[]["address"]') + ip -n $BASE$DST neigh add dev vxlan6$DST lladdr $addr_src $OL_NET_V6$SRC + ip -n $BASE$SRC neigh add dev vxlan6$SRC lladdr $addr_dst $OL_NET_V6$DST } is_ipv6() { @@ -119,7 +129,7 @@ run_test() { # not enable GRO ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789 ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000 - ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args & + ip netns exec $NS_DST ./udpgso_bench_rx -C 2000 -R 100 -n 10 -l 1300 $rx_args & local spid=$! wait_local_port_listen "$NS_DST" 8000 udp ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst @@ -168,7 +178,7 @@ run_bench() { # bind the sender and the receiver to different CPUs to try # get reproducible results ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus" - ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 & + ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 2000 -R 100 & local spid=$! wait_local_port_listen "$NS_DST" 8000 udp ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index f35a924d4a30..1cbadd267c96 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -375,7 +375,7 @@ static void do_recv(void) do_flush_udp(fd); tnow = gettimeofday_ms(); - if (tnow > treport) { + if (!cfg_expected_pkt_nr && tnow > treport) { if (packets) fprintf(stderr, "%s rx: %6lu MB/s %8lu calls/s\n", From d75df752647728c6d65e594e5e4493448bea6cda Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 1 Feb 2024 19:42:39 +0100 Subject: [PATCH 103/270] selftests: net: fix setup_ns usage in rtnetlink.sh The setup_ns helper marks the testns global variable as readonly. Later attempts to set such variable are unsuccessful, causing a couple test failures. Avoid completely the variable re-initialization and let the function access the global value. Fixes: e9ce7ededf14 ("selftests: rtnetlink: use setup_ns in bonding test") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Link: https://lore.kernel.org/r/6e7c937c8ff73ca52a21a4a536a13a76ec0173a8.1706812005.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rtnetlink.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 4667d74579d1..874a2952aa8e 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -440,7 +440,6 @@ kci_test_encap_vxlan() local ret=0 vxlan="test-vxlan0" vlan="test-vlan0" - testns="$1" run_cmd ip -netns "$testns" link add "$vxlan" type vxlan id 42 group 239.1.1.1 \ dev "$devdummy" dstport 4789 if [ $? -ne 0 ]; then @@ -485,7 +484,6 @@ kci_test_encap_fou() { local ret=0 name="test-fou" - testns="$1" run_cmd_grep 'Usage: ip fou' ip fou help if [ $? -ne 0 ];then end_test "SKIP: fou: iproute2 too old" @@ -526,8 +524,8 @@ kci_test_encap() run_cmd ip -netns "$testns" link set lo up run_cmd ip -netns "$testns" link add name "$devdummy" type dummy run_cmd ip -netns "$testns" link set "$devdummy" up - run_cmd kci_test_encap_vxlan "$testns" - run_cmd kci_test_encap_fou "$testns" + run_cmd kci_test_encap_vxlan + run_cmd kci_test_encap_fou ip netns del "$testns" return $ret From e71e016ad0f6e641a7898b8cda5f62f8e2beb2f1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 1 Feb 2024 19:42:40 +0100 Subject: [PATCH 104/270] selftests: net: fix tcp listener handling in pmtu.sh The pmtu.sh test uses a few TCP listener in a problematic way: It hard-codes a constant timeout to wait for the listener starting-up in background. That introduces unneeded latency and on very slow and busy host it can fail. Additionally the test starts again the same listener in the same namespace on the same port, just after the previous connection completed. Fast host can attempt starting the new server before the old one really closed the socket. Address the issues using the wait_local_port_listen helper and explicitly waiting for the background listener process exit. Fixes: 136a1b434bbb ("selftests: net: test vxlan pmtu exceptions with tcp") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Link: https://lore.kernel.org/r/f8e8f6d44427d8c45e9f6a71ee1a321047452087.1706812005.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 3f118e3f1c66..f0febc19baae 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -199,6 +199,7 @@ # Same as above but with IPv6 source lib.sh +source net_helper.sh PAUSE_ON_FAIL=no VERBOSE=0 @@ -1336,13 +1337,15 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() { TCPDST="TCP:[${dst}]:50000" fi ${ns_b} socat -T 3 -u -6 TCP-LISTEN:50000 STDOUT > $tmpoutfile & + local socat_pid=$! - sleep 1 + wait_local_port_listen ${NS_B} 50000 tcp dd if=/dev/zero status=none bs=1M count=1 | ${target} socat -T 3 -u STDIN $TCPDST,connect-timeout=3 size=$(du -sb $tmpoutfile) size=${size%%/tmp/*} + wait ${socat_pid} [ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1 done From 691bb4e49c98a47bc643dd808453136ce78b15b4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 1 Feb 2024 19:42:41 +0100 Subject: [PATCH 105/270] selftests: net: avoid just another constant wait Using hard-coded constant timeout to wait for some expected event is deemed to fail sooner or later, especially in slow env. Our CI has spotted another of such race: # TEST: ipv6: cleanup of cached exceptions - nexthop objects [FAIL] # can't delete veth device in a timely manner, PMTU dst likely leaked Replace the crude sleep with a loop looking for the expected condition at low interval for a much longer range. Fixes: b3cc4f8a8a41 ("selftests: pmtu: add explicit tests for PMTU exceptions cleanup") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Link: https://lore.kernel.org/r/fd5c745e9bb665b724473af6a9373a8c2a62b247.1706812005.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index f0febc19baae..d65fdd407d73 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -1957,6 +1957,13 @@ check_command() { return 0 } +check_running() { + pid=${1} + cmd=${2} + + [ "$(cat /proc/${pid}/cmdline 2>/dev/null | tr -d '\0')" = "{cmd}" ] +} + test_cleanup_vxlanX_exception() { outer="${1}" encap="vxlan" @@ -1987,11 +1994,12 @@ test_cleanup_vxlanX_exception() { ${ns_a} ip link del dev veth_A-R1 & iplink_pid=$! - sleep 1 - if [ "$(cat /proc/${iplink_pid}/cmdline 2>/dev/null | tr -d '\0')" = "iplinkdeldevveth_A-R1" ]; then - err " can't delete veth device in a timely manner, PMTU dst likely leaked" - return 1 - fi + for i in $(seq 1 20); do + check_running ${iplink_pid} "iplinkdeldevveth_A-R1" || return 0 + sleep 0.1 + done + err " can't delete veth device in a timely manner, PMTU dst likely leaked" + return 1 } test_cleanup_ipv6_exception() { From d7f5fb33cf77247b7bf9a871aaeea72ca4f51ad7 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Wed, 31 Jan 2024 21:14:13 +0100 Subject: [PATCH 106/270] tsnep: Fix mapping for zero copy XDP_TX action For XDP_TX action xdp_buff is converted to xdp_frame. The conversion is done by xdp_convert_buff_to_frame(). The memory type of the resulting xdp_frame depends on the memory type of the xdp_buff. For page pool based xdp_buff it produces xdp_frame with memory type MEM_TYPE_PAGE_POOL. For zero copy XSK pool based xdp_buff it produces xdp_frame with memory type MEM_TYPE_PAGE_ORDER0. tsnep_xdp_xmit_back() is not prepared for that and uses always the page pool buffer type TSNEP_TX_TYPE_XDP_TX. This leads to invalid mappings and the transmission of undefined data. Improve tsnep_xdp_xmit_back() to use the generic buffer type TSNEP_TX_TYPE_XDP_NDO for zero copy XDP_TX. Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") Signed-off-by: Gerhard Engleder Signed-off-by: David S. Miller --- drivers/net/ethernet/engleder/tsnep_main.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index 9aeff2b37a61..64eadd320798 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -719,17 +719,25 @@ static void tsnep_xdp_xmit_flush(struct tsnep_tx *tx) static bool tsnep_xdp_xmit_back(struct tsnep_adapter *adapter, struct xdp_buff *xdp, - struct netdev_queue *tx_nq, struct tsnep_tx *tx) + struct netdev_queue *tx_nq, struct tsnep_tx *tx, + bool zc) { struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp); bool xmit; + u32 type; if (unlikely(!xdpf)) return false; + /* no page pool for zero copy */ + if (zc) + type = TSNEP_TX_TYPE_XDP_NDO; + else + type = TSNEP_TX_TYPE_XDP_TX; + __netif_tx_lock(tx_nq, smp_processor_id()); - xmit = tsnep_xdp_xmit_frame_ring(xdpf, tx, TSNEP_TX_TYPE_XDP_TX); + xmit = tsnep_xdp_xmit_frame_ring(xdpf, tx, type); /* Avoid transmit queue timeout since we share it with the slow path */ if (xmit) @@ -1273,7 +1281,7 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog, case XDP_PASS: return false; case XDP_TX: - if (!tsnep_xdp_xmit_back(rx->adapter, xdp, tx_nq, tx)) + if (!tsnep_xdp_xmit_back(rx->adapter, xdp, tx_nq, tx, false)) goto out_failure; *status |= TSNEP_XDP_TX; return true; @@ -1323,7 +1331,7 @@ static bool tsnep_xdp_run_prog_zc(struct tsnep_rx *rx, struct bpf_prog *prog, case XDP_PASS: return false; case XDP_TX: - if (!tsnep_xdp_xmit_back(rx->adapter, xdp, tx_nq, tx)) + if (!tsnep_xdp_xmit_back(rx->adapter, xdp, tx_nq, tx, true)) goto out_failure; *status |= TSNEP_XDP_TX; return true; From d75abeec401f8c86b470e7028a13fcdc87e5dd06 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 1 Feb 2024 09:38:15 +0100 Subject: [PATCH 107/270] tunnels: fix out of bounds access when building IPv6 PMTU error If the ICMPv6 error is built from a non-linear skb we get the following splat, BUG: KASAN: slab-out-of-bounds in do_csum+0x220/0x240 Read of size 4 at addr ffff88811d402c80 by task netperf/820 CPU: 0 PID: 820 Comm: netperf Not tainted 6.8.0-rc1+ #543 ... kasan_report+0xd8/0x110 do_csum+0x220/0x240 csum_partial+0xc/0x20 skb_tunnel_check_pmtu+0xeb9/0x3280 vxlan_xmit_one+0x14c2/0x4080 vxlan_xmit+0xf61/0x5c00 dev_hard_start_xmit+0xfb/0x510 __dev_queue_xmit+0x7cd/0x32a0 br_dev_queue_push_xmit+0x39d/0x6a0 Use skb_checksum instead of csum_partial who cannot deal with non-linear SKBs. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Antoine Tenart Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 586b1b3e35b8..80ccd6661aa3 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -332,7 +332,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) }; skb_reset_network_header(skb); - csum = csum_partial(icmp6h, len, 0); + csum = skb_checksum(skb, skb_transport_offset(skb), len, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len, IPPROTO_ICMPV6, csum); From f3616173bf9be9bf39d131b120d6eea4e6324cb5 Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Thu, 1 Feb 2024 20:41:05 +0800 Subject: [PATCH 108/270] atm: idt77252: fix a memleak in open_card_ubr0 When alloc_scq fails, card->vcs[0] (i.e. vc) should be freed. Otherwise, in the following call chain: idt77252_init_one |-> idt77252_dev_open |-> open_card_ubr0 |-> alloc_scq [failed] |-> deinit_card |-> vfree(card->vcs); card->vcs is freed and card->vcs[0] is leaked. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zhipeng Lu Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/atm/idt77252.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index e327a0229dc1..e7f713cd70d3 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -2930,6 +2930,8 @@ open_card_ubr0(struct idt77252_dev *card) vc->scq = alloc_scq(card, vc->class); if (!vc->scq) { printk("%s: can't get SCQ.\n", card->name); + kfree(card->vcs[0]); + card->vcs[0] = NULL; return -ENOMEM; } From b09b58e31b0f43d76f79b9943da3fb7c2843dcbb Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Thu, 1 Feb 2024 20:47:13 +0800 Subject: [PATCH 109/270] octeontx2-pf: Fix a memleak otx2_sq_init When qmem_alloc and pfvf->hw_ops->sq_aq_init fails, sq->sg should be freed to prevent memleak. Fixes: c9c12d339d93 ("octeontx2-pf: Add support for PTP clock") Signed-off-by: Zhipeng Lu Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/otx2_common.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 7ca6941ea0b9..02d0b707aea5 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -951,8 +951,11 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) if (pfvf->ptp && qidx < pfvf->hw.tx_queues) { err = qmem_alloc(pfvf->dev, &sq->timestamps, qset->sqe_cnt, sizeof(*sq->timestamps)); - if (err) + if (err) { + kfree(sq->sg); + sq->sg = NULL; return err; + } } sq->head = 0; @@ -968,7 +971,14 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) sq->stats.bytes = 0; sq->stats.pkts = 0; - return pfvf->hw_ops->sq_aq_init(pfvf, qidx, sqb_aura); + err = pfvf->hw_ops->sq_aq_init(pfvf, qidx, sqb_aura); + if (err) { + kfree(sq->sg); + sq->sg = NULL; + return err; + } + + return 0; } From 54ce1927eb787f7bbb7ee664841c8f5932703f39 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 31 Jan 2024 15:55:38 -0800 Subject: [PATCH 110/270] cxl/cper: Fix errant CPER prints for CXL events Jonathan reports that CXL CPER events dump an extra generic error message. {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 {1}[Hardware Error]: event severity: recoverable {1}[Hardware Error]: Error 0, type: recoverable {1}[Hardware Error]: section type: unknown, fbcd0a77-c260-417f-85a9-088b1621eba6 {1}[Hardware Error]: section length: 0x90 {1}[Hardware Error]: 00000000: 00000090 00000007 00000000 0d938086 ................ {1}[Hardware Error]: 00000010: 00100000 00000000 00040000 00000000 ................ ... CXL events were rerouted though the CXL subsystem for additional processing. However, when that work was done it was missed that cper_estatus_print_section() continued with a generic error message which is confusing. Teach CPER print code to ignore printing details of some section types. Assign the CXL event GUIDs to this set to prevent confusing unknown prints. Reported-by: Jonathan Cameron Suggested-by: Jonathan Cameron Signed-off-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Ard Biesheuvel --- drivers/acpi/apei/ghes.c | 26 -------------------------- drivers/firmware/efi/cper.c | 19 +++++++++++++++++++ include/linux/cper.h | 23 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 7b7c605166e0..fe825a432c5b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -680,32 +680,6 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, static DECLARE_RWSEM(cxl_cper_rw_sem); static cxl_cper_callback cper_callback; -/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ - -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -#define CPER_SEC_CXL_GEN_MEDIA_GUID \ - GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ - 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) - -/* - * DRAM Event Record - * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 - */ -#define CPER_SEC_CXL_DRAM_GUID \ - GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ - 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) - -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -#define CPER_SEC_CXL_MEM_MODULE_GUID \ - GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ - 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) - static void cxl_cper_post_event(enum cxl_event_type event_type, struct cxl_cper_event_rec *rec) { diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 35c37f667781..9b3884ff81e6 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -523,6 +523,17 @@ static void cper_print_tstamp(const char *pfx, } } +struct ignore_section { + guid_t guid; + const char *name; +}; + +static const struct ignore_section ignore_sections[] = { + { .guid = CPER_SEC_CXL_GEN_MEDIA_GUID, .name = "CXL General Media Event" }, + { .guid = CPER_SEC_CXL_DRAM_GUID, .name = "CXL DRAM Event" }, + { .guid = CPER_SEC_CXL_MEM_MODULE_GUID, .name = "CXL Memory Module Event" }, +}; + static void cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata, int sec_no) @@ -543,6 +554,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text); snprintf(newpfx, sizeof(newpfx), "%s ", pfx); + + for (int i = 0; i < ARRAY_SIZE(ignore_sections); i++) { + if (guid_equal(sec_type, &ignore_sections[i].guid)) { + printk("%ssection_type: %s\n", newpfx, ignore_sections[i].name); + return; + } + } + if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) { struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata); diff --git a/include/linux/cper.h b/include/linux/cper.h index c1a7dc325121..265b0f8fc0b3 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -90,6 +90,29 @@ enum { GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ +/* + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CPER_SEC_CXL_GEN_MEDIA_GUID \ + GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ + 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) +/* + * DRAM Event Record + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CPER_SEC_CXL_DRAM_GUID \ + GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ + 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +#define CPER_SEC_CXL_MEM_MODULE_GUID \ + GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ + 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) + /* * Flags bits definitions for flags in struct cper_record_header * If set, the error has been recovered From dbea519d6878c298dd0f48e6ec2dbacebe4bbb2a Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 31 Jan 2024 15:55:39 -0800 Subject: [PATCH 111/270] cxl/trace: Remove unnecessary memcpy's CPER events don't have UUIDs. Therefore UUIDs were removed from the records passed to trace events and replaced with hard coded values. As pointed out by Jonathan, the new defines for the UUIDs present a more efficient way to assign UUID in trace records.[1] Replace memcpy's with the use of static data. [1] https://lore.kernel.org/all/20240108132325.00000e9c@Huawei.com/ Suggested-by: Jonathan Cameron Signed-off-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Signed-off-by: Ard Biesheuvel --- drivers/cxl/core/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index 89445435303a..bdf117a33744 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -338,7 +338,7 @@ TRACE_EVENT(cxl_general_media, TP_fast_assign( CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); - memcpy(&__entry->hdr_uuid, &CXL_EVENT_GEN_MEDIA_UUID, sizeof(uuid_t)); + __entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID; /* General Media */ __entry->dpa = le64_to_cpu(rec->phys_addr); @@ -425,7 +425,7 @@ TRACE_EVENT(cxl_dram, TP_fast_assign( CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); - memcpy(&__entry->hdr_uuid, &CXL_EVENT_DRAM_UUID, sizeof(uuid_t)); + __entry->hdr_uuid = CXL_EVENT_DRAM_UUID; /* DRAM */ __entry->dpa = le64_to_cpu(rec->phys_addr); @@ -573,7 +573,7 @@ TRACE_EVENT(cxl_memory_module, TP_fast_assign( CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); - memcpy(&__entry->hdr_uuid, &CXL_EVENT_MEM_MODULE_UUID, sizeof(uuid_t)); + __entry->hdr_uuid = CXL_EVENT_MEM_MODULE_UUID; /* Memory Module Event */ __entry->event_type = rec->event_type; From 42dfa94d802a48c871e2017cbf86153270c86632 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 4 Feb 2024 16:43:05 +0900 Subject: [PATCH 112/270] KVM: arm64: Do not source virt/lib/Kconfig twice For ARCH=arm64, virt/lib/Kconfig is sourced twice, from arch/arm64/kvm/Kconfig and from drivers/vfio/Kconfig. There is no good reason to parse virt/lib/Kconfig twice. Commit 2412405b3141 ("KVM: arm/arm64: register irq bypass consumer on ARM/ARM64") should not have added this 'source' directive. Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240204074305.31492-1-masahiroy@kernel.org --- arch/arm64/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 6c3c8ca73e7f..27ca89b628a0 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -3,7 +3,6 @@ # KVM configuration # -source "virt/lib/Kconfig" source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION From 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Thu, 1 Feb 2024 20:40:38 -0800 Subject: [PATCH 113/270] hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER missed If hv_netvsc driver is unloaded and reloaded, the NET_DEVICE_REGISTER handler cannot perform VF register successfully as the register call is received before netvsc_probe is finished. This is because we register register_netdevice_notifier() very early( even before vmbus_driver_register()). To fix this, we try to register each such matching VF( if it is visible as a netdevice) at the end of netvsc_probe. Cc: stable@vger.kernel.org Fixes: 85520856466e ("hv_netvsc: Fix race of register_netdevice_notifier and VF register") Suggested-by: Dexuan Cui Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Dexuan Cui Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 82 +++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 273bd8a20122..11831a1c9762 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -42,6 +42,10 @@ #define LINKCHANGE_INT (2 * HZ) #define VF_TAKEOVER_INT (HZ / 10) +/* Macros to define the context of vf registration */ +#define VF_REG_IN_PROBE 1 +#define VF_REG_IN_NOTIFIER 2 + static unsigned int ring_size __ro_after_init = 128; module_param(ring_size, uint, 0444); MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)"); @@ -2185,7 +2189,7 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb) } static int netvsc_vf_join(struct net_device *vf_netdev, - struct net_device *ndev) + struct net_device *ndev, int context) { struct net_device_context *ndev_ctx = netdev_priv(ndev); int ret; @@ -2208,7 +2212,11 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto upper_link_failed; } - schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); + /* If this registration is called from probe context vf_takeover + * is taken care of later in probe itself. + */ + if (context == VF_REG_IN_NOTIFIER) + schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); @@ -2346,7 +2354,7 @@ static int netvsc_prepare_bonding(struct net_device *vf_netdev) return NOTIFY_DONE; } -static int netvsc_register_vf(struct net_device *vf_netdev) +static int netvsc_register_vf(struct net_device *vf_netdev, int context) { struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; @@ -2386,7 +2394,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF registering: %s\n", vf_netdev->name); - if (netvsc_vf_join(vf_netdev, ndev) != 0) + if (netvsc_vf_join(vf_netdev, ndev, context) != 0) return NOTIFY_DONE; dev_hold(vf_netdev); @@ -2484,10 +2492,31 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) return NOTIFY_OK; } +static int check_dev_is_matching_vf(struct net_device *event_ndev) +{ + /* Skip NetVSC interfaces */ + if (event_ndev->netdev_ops == &device_ops) + return -ENODEV; + + /* Avoid non-Ethernet type devices */ + if (event_ndev->type != ARPHRD_ETHER) + return -ENODEV; + + /* Avoid Vlan dev with same MAC registering as VF */ + if (is_vlan_dev(event_ndev)) + return -ENODEV; + + /* Avoid Bonding master dev with same MAC registering as VF */ + if (netif_is_bond_master(event_ndev)) + return -ENODEV; + + return 0; +} + static int netvsc_probe(struct hv_device *dev, const struct hv_vmbus_device_id *dev_id) { - struct net_device *net = NULL; + struct net_device *net = NULL, *vf_netdev; struct net_device_context *net_device_ctx; struct netvsc_device_info *device_info = NULL; struct netvsc_device *nvdev; @@ -2599,6 +2628,30 @@ static int netvsc_probe(struct hv_device *dev, } list_add(&net_device_ctx->list, &netvsc_dev_list); + + /* When the hv_netvsc driver is unloaded and reloaded, the + * NET_DEVICE_REGISTER for the vf device is replayed before probe + * is complete. This is because register_netdevice_notifier() gets + * registered before vmbus_driver_register() so that callback func + * is set before probe and we don't miss events like NETDEV_POST_INIT + * So, in this section we try to register the matching vf device that + * is present as a netdevice, knowing that its register call is not + * processed in the netvsc_netdev_notifier(as probing is progress and + * get_netvsc_byslot fails). + */ + for_each_netdev(dev_net(net), vf_netdev) { + ret = check_dev_is_matching_vf(vf_netdev); + if (ret != 0) + continue; + + if (net != get_netvsc_byslot(vf_netdev)) + continue; + + netvsc_prepare_bonding(vf_netdev); + netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE); + __netvsc_vf_setup(net, vf_netdev); + break; + } rtnl_unlock(); netvsc_devinfo_put(device_info); @@ -2754,28 +2807,17 @@ static int netvsc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + int ret = 0; - /* Skip our own events */ - if (event_dev->netdev_ops == &device_ops) - return NOTIFY_DONE; - - /* Avoid non-Ethernet type devices */ - if (event_dev->type != ARPHRD_ETHER) - return NOTIFY_DONE; - - /* Avoid Vlan dev with same MAC registering as VF */ - if (is_vlan_dev(event_dev)) - return NOTIFY_DONE; - - /* Avoid Bonding master dev with same MAC registering as VF */ - if (netif_is_bond_master(event_dev)) + ret = check_dev_is_matching_vf(event_dev); + if (ret != 0) return NOTIFY_DONE; switch (event) { case NETDEV_POST_INIT: return netvsc_prepare_bonding(event_dev); case NETDEV_REGISTER: - return netvsc_register_vf(event_dev); + return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER); case NETDEV_UNREGISTER: return netvsc_unregister_vf(event_dev); case NETDEV_UP: From 1168491e7f53581ba7b6014a39a49cfbbb722feb Mon Sep 17 00:00:00 2001 From: Loic Prylli Date: Fri, 3 Nov 2023 11:30:55 +0100 Subject: [PATCH 114/270] hwmon: (aspeed-pwm-tacho) mutex for tach reading the ASPEED_PTCR_RESULT Register can only hold the result for a single fan input. Adding a mutex to protect the register until the reading is done. Signed-off-by: Loic Prylli Signed-off-by: Alexander Hansen Fixes: 2d7a548a3eff ("drivers: hwmon: Support for ASPEED PWM/Fan tach") Link: https://lore.kernel.org/r/121d888762a1232ef403cf35230ccf7b3887083a.1699007401.git.alexander.hansen@9elements.com Signed-off-by: Guenter Roeck --- drivers/hwmon/aspeed-pwm-tacho.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hwmon/aspeed-pwm-tacho.c b/drivers/hwmon/aspeed-pwm-tacho.c index f6e1e55e8292..4acc1858d8ac 100644 --- a/drivers/hwmon/aspeed-pwm-tacho.c +++ b/drivers/hwmon/aspeed-pwm-tacho.c @@ -195,6 +195,8 @@ struct aspeed_pwm_tacho_data { u8 fan_tach_ch_source[MAX_ASPEED_FAN_TACH_CHANNELS]; struct aspeed_cooling_device *cdev[8]; const struct attribute_group *groups[3]; + /* protects access to shared ASPEED_PTCR_RESULT */ + struct mutex tach_lock; }; enum type { TYPEM, TYPEN, TYPEO }; @@ -529,6 +531,8 @@ static int aspeed_get_fan_tach_ch_rpm(struct aspeed_pwm_tacho_data *priv, u8 fan_tach_ch_source, type, mode, both; int ret; + mutex_lock(&priv->tach_lock); + regmap_write(priv->regmap, ASPEED_PTCR_TRIGGER, 0); regmap_write(priv->regmap, ASPEED_PTCR_TRIGGER, 0x1 << fan_tach_ch); @@ -546,6 +550,8 @@ static int aspeed_get_fan_tach_ch_rpm(struct aspeed_pwm_tacho_data *priv, ASPEED_RPM_STATUS_SLEEP_USEC, usec); + mutex_unlock(&priv->tach_lock); + /* return -ETIMEDOUT if we didn't get an answer. */ if (ret) return ret; @@ -915,6 +921,7 @@ static int aspeed_pwm_tacho_probe(struct platform_device *pdev) priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; + mutex_init(&priv->tach_lock); priv->regmap = devm_regmap_init(dev, NULL, (__force void *)regs, &aspeed_pwm_tacho_regmap_config); if (IS_ERR(priv->regmap)) From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:34 +0800 Subject: [PATCH 115/270] hwmon: (coretemp) Fix out-of-bounds memory access Fix a bug that pdata->cpu_map[] is set before out-of-bounds check. The problem might be triggered on systems with more than 128 cores per package. Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value") Signed-off-by: Zhang Rui Cc: Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index ba82d1e79c13..e78c76919111 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, if (pkg_flag) { attr_no = PKG_SYSFS_ATTR_NO; } else { - index = ida_alloc(&pdata->ida, GFP_KERNEL); + index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); if (index < 0) return index; + pdata->cpu_map[index] = topology_core_id(cpu); attr_no = index + BASE_SYSFS_ATTR_NO; } - if (attr_no > MAX_CORE_DATA - 1) { - err = -ERANGE; - goto ida_free; - } - tdata = init_temp_data(cpu, pkg_flag); if (!tdata) { err = -ENOMEM; From fdaf0c8629d4524a168cb9e4ad4231875749b28c Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:35 +0800 Subject: [PATCH 116/270] hwmon: (coretemp) Fix bogus core_id to attr name mapping Before commit 7108b80a542b ("hwmon/coretemp: Handle large core ID value"), there is a fixed mapping between 1. cpu_core_id 2. the index in pdata->core_data[] array 3. the sysfs attr name, aka "tempX_" The later two always equal cpu_core_id + 2. After the commit, pdata->core_data[] index is got from ida so that it can handle sparse core ids and support more cores within a package. However, the commit erroneously maps the sysfs attr name to pdata->core_data[] index instead of cpu_core_id + 2. As a result, the code is not aligned with the comments, and brings user visible changes in hwmon sysfs on systems with sparse core id. For example, before commit 7108b80a542b ("hwmon/coretemp: Handle large core ID value"), /sys/class/hwmon/hwmon2/temp2_label:Core 0 /sys/class/hwmon/hwmon2/temp3_label:Core 1 /sys/class/hwmon/hwmon2/temp4_label:Core 2 /sys/class/hwmon/hwmon2/temp5_label:Core 3 /sys/class/hwmon/hwmon2/temp6_label:Core 4 /sys/class/hwmon/hwmon3/temp10_label:Core 8 /sys/class/hwmon/hwmon3/temp11_label:Core 9 after commit, /sys/class/hwmon/hwmon2/temp2_label:Core 0 /sys/class/hwmon/hwmon2/temp3_label:Core 1 /sys/class/hwmon/hwmon2/temp4_label:Core 2 /sys/class/hwmon/hwmon2/temp5_label:Core 3 /sys/class/hwmon/hwmon2/temp6_label:Core 4 /sys/class/hwmon/hwmon2/temp7_label:Core 8 /sys/class/hwmon/hwmon2/temp8_label:Core 9 Restore the previous behavior and rework the code, comments and variable names to avoid future confusions. Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value") Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-3-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index e78c76919111..95f4c0b00b2d 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -419,7 +419,7 @@ static ssize_t show_temp(struct device *dev, } static int create_core_attrs(struct temp_data *tdata, struct device *dev, - int attr_no) + int index) { int i; static ssize_t (*const rd_ptr[TOTAL_ATTRS]) (struct device *dev, @@ -431,13 +431,20 @@ static int create_core_attrs(struct temp_data *tdata, struct device *dev, }; for (i = 0; i < tdata->attr_size; i++) { + /* + * We map the attr number to core id of the CPU + * The attr number is always core id + 2 + * The Pkgtemp will always show up as temp1_*, if available + */ + int attr_no = tdata->is_pkg_data ? 1 : tdata->cpu_core_id + 2; + snprintf(tdata->attr_name[i], CORETEMP_NAME_LENGTH, "temp%d_%s", attr_no, suffixes[i]); sysfs_attr_init(&tdata->sd_attrs[i].dev_attr.attr); tdata->sd_attrs[i].dev_attr.attr.name = tdata->attr_name[i]; tdata->sd_attrs[i].dev_attr.attr.mode = 0444; tdata->sd_attrs[i].dev_attr.show = rd_ptr[i]; - tdata->sd_attrs[i].index = attr_no; + tdata->sd_attrs[i].index = index; tdata->attrs[i] = &tdata->sd_attrs[i].dev_attr.attr; } tdata->attr_group.attrs = tdata->attrs; @@ -495,26 +502,25 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, struct platform_data *pdata = platform_get_drvdata(pdev); struct cpuinfo_x86 *c = &cpu_data(cpu); u32 eax, edx; - int err, index, attr_no; + int err, index; if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) return 0; /* - * Find attr number for sysfs: - * We map the attr number to core id of the CPU - * The attr number is always core id + 2 - * The Pkgtemp will always show up as temp1_*, if available + * Get the index of tdata in pdata->core_data[] + * tdata for package: pdata->core_data[1] + * tdata for core: pdata->core_data[2] .. pdata->core_data[NUM_REAL_CORES + 1] */ if (pkg_flag) { - attr_no = PKG_SYSFS_ATTR_NO; + index = PKG_SYSFS_ATTR_NO; } else { index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL); if (index < 0) return index; pdata->cpu_map[index] = topology_core_id(cpu); - attr_no = index + BASE_SYSFS_ATTR_NO; + index += BASE_SYSFS_ATTR_NO; } tdata = init_temp_data(cpu, pkg_flag); @@ -540,20 +546,20 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, if (get_ttarget(tdata, &pdev->dev) >= 0) tdata->attr_size++; - pdata->core_data[attr_no] = tdata; + pdata->core_data[index] = tdata; /* Create sysfs interfaces */ - err = create_core_attrs(tdata, pdata->hwmon_dev, attr_no); + err = create_core_attrs(tdata, pdata->hwmon_dev, index); if (err) goto exit_free; return 0; exit_free: - pdata->core_data[attr_no] = NULL; + pdata->core_data[index] = NULL; kfree(tdata); ida_free: if (!pkg_flag) - ida_free(&pdata->ida, index); + ida_free(&pdata->ida, index - BASE_SYSFS_ATTR_NO); return err; } From 34cf8c657cf0365791cdc658ddbca9cc907726ce Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 2 Feb 2024 17:21:36 +0800 Subject: [PATCH 117/270] hwmon: (coretemp) Enlarge per package core count limit Currently, coretemp driver supports only 128 cores per package. This loses some core temperature information on systems that have more than 128 cores per package. [ 58.685033] coretemp coretemp.0: Adding Core 128 failed [ 58.692009] coretemp coretemp.0: Adding Core 129 failed ... Enlarge the limitation to 512 because there are platforms with more than 256 cores per package. Signed-off-by: Zhang Rui Link: https://lore.kernel.org/r/20240202092144.71180-4-rui.zhang@intel.com Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 95f4c0b00b2d..b8fc8d1ef20d 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -41,7 +41,7 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define PKG_SYSFS_ATTR_NO 1 /* Sysfs attribute for package temp */ #define BASE_SYSFS_ATTR_NO 2 /* Sysfs Base attr no for coretemp */ -#define NUM_REAL_CORES 128 /* Number of Real cores per cpu */ +#define NUM_REAL_CORES 512 /* Number of Real cores per cpu */ #define CORETEMP_NAME_LENGTH 28 /* String Length of attrs */ #define MAX_CORE_ATTRS 4 /* Maximum no of basic attrs */ #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) From eef00a82c568944f113f2de738156ac591bbd5cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Feb 2024 09:54:04 +0000 Subject: [PATCH 118/270] inet: read sk->sk_family once in inet_recv_error() inet_recv_error() is called without holding the socket lock. IPv6 socket could mutate to IPv4 with IPV6_ADDRFORM socket option and trigger a KCSAN warning. Fixes: f4713a3dfad0 ("net-timestamp: make tcp_recvmsg call ipv6_recv_error for AF_INET6 socks") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4e635dd3d3c8..a5a820ee2026 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1628,10 +1628,12 @@ EXPORT_SYMBOL(inet_current_timestamp); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { - if (sk->sk_family == AF_INET) + unsigned int family = READ_ONCE(sk->sk_family); + + if (family == AF_INET) return ip_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) + if (family == AF_INET6) return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); #endif return -EINVAL; From 1a00897e5e96c29b21580dfcfec168dc16c67469 Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Fri, 4 Aug 2023 12:05:44 +0800 Subject: [PATCH 119/270] drm/i915: Replace dead 01.org link 01.org is dead so replace old gvt link with current wiki page. Acked-by: Jani Nikula Reviewed-by: Zhi Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20230804040544.1972958-1-zhenyuw@linux.intel.com --- MAINTAINERS | 2 +- drivers/gpu/drm/i915/Kconfig | 2 +- drivers/gpu/drm/i915/intel_gvt.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a69..03fbe24bf40d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10804,7 +10804,7 @@ M: Zhi Wang L: intel-gvt-dev@lists.freedesktop.org L: intel-gfx@lists.freedesktop.org S: Supported -W: https://01.org/igvt-g +W: https://github.com/intel/gvt-linux/wiki T: git https://github.com/intel/gvt-linux.git F: drivers/gpu/drm/i915/gvt/ diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index b5d6e3352071..3089029abba4 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -140,7 +140,7 @@ config DRM_I915_GVT_KVMGT Note that this driver only supports newer device from Broadwell on. For further information and setup guide, you can visit: - http://01.org/igvt-g. + https://github.com/intel/gvt-linux/wiki. If in doubt, say "N". diff --git a/drivers/gpu/drm/i915/intel_gvt.c b/drivers/gpu/drm/i915/intel_gvt.c index e98b6d69a91a..9b6d87c8b583 100644 --- a/drivers/gpu/drm/i915/intel_gvt.c +++ b/drivers/gpu/drm/i915/intel_gvt.c @@ -41,7 +41,7 @@ * To virtualize GPU resources GVT-g driver depends on hypervisor technology * e.g KVM/VFIO/mdev, Xen, etc. to provide resource access trapping capability * and be virtualized within GVT-g device module. More architectural design - * doc is available on https://01.org/group/2230/documentation-list. + * doc is available on https://github.com/intel/gvt-linux/wiki. */ static LIST_HEAD(intel_gvt_devices); From 47caa96478b99d6d1199b89467cc3e5a6cc754ee Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 Jan 2024 11:41:47 +0300 Subject: [PATCH 120/270] drm/i915/gvt: Fix uninitialized variable in handle_mmio() This code prints the wrong variable in the warning message. It should print "i" instead of "info->offset". On the first iteration "info" is uninitialized leading to a crash and on subsequent iterations it prints the previous offset instead of the current one. Fixes: e0f74ed4634d ("i915/gvt: Separate the MMIO tracking table from GVT-g") Signed-off-by: Dan Carpenter Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/11957c20-b178-4027-9b0a-e32e9591dd7c@moroto.mountain Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/handlers.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 90f6c1ece57d..efcb00472be2 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -2849,8 +2849,7 @@ static int handle_mmio(struct intel_gvt_mmio_table_iter *iter, u32 offset, for (i = start; i < end; i += 4) { p = intel_gvt_find_mmio_info(gvt, i); if (p) { - WARN(1, "dup mmio definition offset %x\n", - info->offset); + WARN(1, "dup mmio definition offset %x\n", i); /* We return -EEXIST here to make GVT-g load fail. * So duplicated MMIO can be found as soon as From 44e4192f88978e32e4ac08b27141f3767366f79b Mon Sep 17 00:00:00 2001 From: Zhi Wang Date: Tue, 30 Jan 2024 21:27:43 +0000 Subject: [PATCH 121/270] MAINTAINERS: Update Zhi Wang's email address Update my email address to zhi.wang.linux@gmail.com. CC: Zhenyu Wang Acked-by: Zhenyu Wang Signed-off-by: Zhi Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20240130212743.7727-1-zhi.wang.linux@gmail.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 03fbe24bf40d..052149d304ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10800,7 +10800,7 @@ F: drivers/gpio/gpio-tangier.h INTEL GVT-g DRIVERS (Intel GPU Virtualization) M: Zhenyu Wang -M: Zhi Wang +M: Zhi Wang L: intel-gvt-dev@lists.freedesktop.org L: intel-gfx@lists.freedesktop.org S: Supported From 1ad55cecf22f05f1c884adf63cc09d3c3e609ebf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 5 Feb 2024 09:11:07 +0100 Subject: [PATCH 122/270] x86/efistub: Use 1:1 file:memory mapping for PE/COFF .compat section The .compat section is a dummy PE section that contains the address of the 32-bit entrypoint of the 64-bit kernel image if it is bootable from 32-bit firmware (i.e., CONFIG_EFI_MIXED=y) This section is only 8 bytes in size and is only referenced from the loader, and so it is placed at the end of the memory view of the image, to avoid the need for padding it to 4k, which is required for sections appearing in the middle of the image. Unfortunately, this violates the PE/COFF spec, and even if most EFI loaders will work correctly (including the Tianocore reference implementation), PE loaders do exist that reject such images, on the basis that both the file and memory views of the file contents should be described by the section headers in a monotonically increasing manner without leaving any gaps. So reorganize the sections to avoid this issue. This results in a slight padding overhead (< 4k) which can be avoided if desired by disabling CONFIG_EFI_MIXED (which is only needed in rare cases these days) Fixes: 3e3eabe26dc8 ("x86/boot: Increase section and file alignment to 4k/512") Reported-by: Mike Beaton Link: https://lkml.kernel.org/r/CAHzAAWQ6srV6LVNdmfbJhOwhBw5ZzxxZZ07aHt9oKkfYAdvuQQ%40mail.gmail.com Signed-off-by: Ard Biesheuvel --- arch/x86/boot/header.S | 14 ++++++-------- arch/x86/boot/setup.ld | 6 +++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b2771710ed98..a1bbedd989e4 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -106,8 +106,7 @@ extra_header_fields: .word 0 # MinorSubsystemVersion .long 0 # Win32VersionValue - .long setup_size + ZO__end + pecompat_vsize - # SizeOfImage + .long setup_size + ZO__end # SizeOfImage .long salign # SizeOfHeaders .long 0 # CheckSum @@ -143,7 +142,7 @@ section_table: .ascii ".setup" .byte 0 .byte 0 - .long setup_size - salign # VirtualSize + .long pecompat_fstart - salign # VirtualSize .long salign # VirtualAddress .long pecompat_fstart - salign # SizeOfRawData .long salign # PointerToRawData @@ -156,8 +155,8 @@ section_table: #ifdef CONFIG_EFI_MIXED .asciz ".compat" - .long 8 # VirtualSize - .long setup_size + ZO__end # VirtualAddress + .long pecompat_fsize # VirtualSize + .long pecompat_fstart # VirtualAddress .long pecompat_fsize # SizeOfRawData .long pecompat_fstart # PointerToRawData @@ -172,17 +171,16 @@ section_table: * modes this image supports. */ .pushsection ".pecompat", "a", @progbits - .balign falign - .set pecompat_vsize, salign + .balign salign .globl pecompat_fstart pecompat_fstart: .byte 0x1 # Version .byte 8 # Size .word IMAGE_FILE_MACHINE_I386 # PE machine type .long setup_size + ZO_efi32_pe_entry # Entrypoint + .byte 0x0 # Sentinel .popsection #else - .set pecompat_vsize, 0 .set pecompat_fstart, setup_size #endif .ascii ".text" diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 83bb7efad8ae..3a2d1360abb0 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -24,6 +24,9 @@ SECTIONS .text : { *(.text .text.*) } .text32 : { *(.text32) } + .pecompat : { *(.pecompat) } + PROVIDE(pecompat_fsize = setup_size - pecompat_fstart); + . = ALIGN(16); .rodata : { *(.rodata*) } @@ -36,9 +39,6 @@ SECTIONS . = ALIGN(16); .data : { *(.data*) } - .pecompat : { *(.pecompat) } - PROVIDE(pecompat_fsize = setup_size - pecompat_fstart); - .signature : { setup_sig = .; LONG(0x5a5aaa55) From b3d4f7f2288901ed2392695919b3c0e24c1b4084 Mon Sep 17 00:00:00 2001 From: Daniel Basilio Date: Fri, 2 Feb 2024 13:37:17 +0200 Subject: [PATCH 123/270] nfp: use correct macro for LengthSelect in BAR config The 1st and 2nd expansion BAR configuration registers are configured, when the driver starts up, in variables 'barcfg_msix_general' and 'barcfg_msix_xpb', respectively. The 'LengthSelect' field is ORed in from bit 0, which is incorrect. The 'LengthSelect' field should start from bit 27. This has largely gone un-noticed because NFP_PCIE_BAR_PCIE2CPP_LengthSelect_32BIT happens to be 0. Fixes: 4cb584e0ee7d ("nfp: add CPP access core") Cc: stable@vger.kernel.org # 4.11+ Signed-off-by: Daniel Basilio Signed-off-by: Louis Peens Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c index 33b4c2856316..3f10c5365c80 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c @@ -537,11 +537,13 @@ static int enable_bars(struct nfp6000_pcie *nfp, u16 interface) const u32 barcfg_msix_general = NFP_PCIE_BAR_PCIE2CPP_MapType( NFP_PCIE_BAR_PCIE2CPP_MapType_GENERAL) | - NFP_PCIE_BAR_PCIE2CPP_LengthSelect_32BIT; + NFP_PCIE_BAR_PCIE2CPP_LengthSelect( + NFP_PCIE_BAR_PCIE2CPP_LengthSelect_32BIT); const u32 barcfg_msix_xpb = NFP_PCIE_BAR_PCIE2CPP_MapType( NFP_PCIE_BAR_PCIE2CPP_MapType_BULK) | - NFP_PCIE_BAR_PCIE2CPP_LengthSelect_32BIT | + NFP_PCIE_BAR_PCIE2CPP_LengthSelect( + NFP_PCIE_BAR_PCIE2CPP_LengthSelect_32BIT) | NFP_PCIE_BAR_PCIE2CPP_Target_BaseAddress( NFP_CPP_TARGET_ISLAND_XPB); const u32 barcfg_explicit[4] = { From 1a1c13303ff6d64e6f718dc8aa614e580ca8d9b4 Mon Sep 17 00:00:00 2001 From: Daniel de Villiers Date: Fri, 2 Feb 2024 13:37:18 +0200 Subject: [PATCH 124/270] nfp: flower: prevent re-adding mac index for bonded port When physical ports are reset (either through link failure or manually toggled down and up again) that are slaved to a Linux bond with a tunnel endpoint IP address on the bond device, not all tunnel packets arriving on the bond port are decapped as expected. The bond dev assigns the same MAC address to itself and each of its slaves. When toggling a slave device, the same MAC address is therefore offloaded to the NFP multiple times with different indexes. The issue only occurs when re-adding the shared mac. The nfp_tunnel_add_shared_mac() function has a conditional check early on that checks if a mac entry already exists and if that mac entry is global: (entry && nfp_tunnel_is_mac_idx_global(entry->index)). In the case of a bonded device (For example br-ex), the mac index is obtained, and no new index is assigned. We therefore modify the conditional in nfp_tunnel_add_shared_mac() to check if the port belongs to the LAG along with the existing checks to prevent a new global mac index from being re-assigned to the slave port. Fixes: 20cce8865098 ("nfp: flower: enable MAC address sharing for offloadable devs") CC: stable@vger.kernel.org # 5.1+ Signed-off-by: Daniel de Villiers Signed-off-by: Louis Peens Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index e522845c7c21..0d7d138d6e0d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -1084,7 +1084,7 @@ nfp_tunnel_add_shared_mac(struct nfp_app *app, struct net_device *netdev, u16 nfp_mac_idx = 0; entry = nfp_tunnel_lookup_offloaded_macs(app, netdev->dev_addr); - if (entry && nfp_tunnel_is_mac_idx_global(entry->index)) { + if (entry && (nfp_tunnel_is_mac_idx_global(entry->index) || netif_is_lag_port(netdev))) { if (entry->bridge_count || !nfp_flower_is_supported_bridge(netdev)) { nfp_tunnel_offloaded_macs_inc_ref_and_link(entry, From 0f4d6f011bca0df2051532b41b596366aa272019 Mon Sep 17 00:00:00 2001 From: James Hershaw Date: Fri, 2 Feb 2024 13:37:19 +0200 Subject: [PATCH 125/270] nfp: enable NETDEV_XDP_ACT_REDIRECT feature flag Enable previously excluded xdp feature flag for NFD3 devices. This feature flag is required in order to bind nfp interfaces to an xdp socket and the nfp driver does in fact support the feature. Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Cc: stable@vger.kernel.org # 6.3+ Signed-off-by: James Hershaw Signed-off-by: Louis Peens Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 3b3210d823e8..f28e769e6fda 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2776,6 +2776,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn) case NFP_NFD_VER_NFD3: netdev->netdev_ops = &nfp_nfd3_netdev_ops; netdev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; + netdev->xdp_features |= NETDEV_XDP_ACT_REDIRECT; break; case NFP_NFD_VER_NFDK: netdev->netdev_ops = &nfp_nfdk_netdev_ops; From f31041417bf7f4a4df8b3bfb52cb31bbe805b934 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Feb 2024 15:19:13 +0000 Subject: [PATCH 126/270] rxrpc: Fix generation of serial numbers to skip zero In the Rx protocol, every packet generated is marked with a per-connection monotonically increasing serial number. This number can be referenced in an ACK packet generated in response to an incoming packet - thereby allowing the sender to use this for RTT determination, amongst other things. However, if the reference field in the ACK is zero, it doesn't refer to any incoming packet (it could be a ping to find out if a packet got lost, for example) - so we shouldn't generate zero serial numbers. Fix the generation of serial numbers to retry if it comes up with a zero. Furthermore, since the serial numbers are only ever allocated within the I/O thread this connection is bound to, there's no need for atomics so remove that too. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 16 +++++++++++++++- net/rxrpc/conn_event.c | 2 +- net/rxrpc/output.c | 8 ++++---- net/rxrpc/proc.c | 2 +- net/rxrpc/rxkad.c | 4 ++-- 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dbeb75c29857..31b0dd8c9b2d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -510,7 +510,7 @@ struct rxrpc_connection { enum rxrpc_call_completion completion; /* Completion condition */ s32 abort_code; /* Abort code of connection abort */ int debug_id; /* debug ID for printks */ - atomic_t serial; /* packet serial number counter */ + rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ @@ -822,6 +822,20 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) #include +/* + * Allocate the next serial number on a connection. 0 must be skipped. + */ +static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn) +{ + rxrpc_serial_t serial; + + serial = conn->tx_serial; + if (serial == 0) + serial = 1; + conn->tx_serial = serial + 1; + return serial; +} + /* * af_rxrpc.c */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 95f4bc206b3d..ec5eae60ab0c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -117,7 +117,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[2].iov_base = &ack_info; iov[2].iov_len = sizeof(ack_info); - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(conn->proto.cid | channel); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a0906145e829..4a292f860ae3 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -216,7 +216,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n; len = iov[0].iov_len; - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); txb->wire.serial = htonl(serial); trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(txb->ack.firstPacket), @@ -302,7 +302,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); pkt.whdr.serial = htonl(serial); iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt)); @@ -334,7 +334,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) _enter("%x,{%d}", txb->seq, txb->len); /* Each transmission of a Tx packet needs a new serial number */ - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); txb->wire.serial = htonl(serial); if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && @@ -558,7 +558,7 @@ void rxrpc_send_conn_abort(struct rxrpc_connection *conn) len = iov[0].iov_len + iov[1].iov_len; - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 6c86cbb98d1d..26dc2f26d92d 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -181,7 +181,7 @@ print: atomic_read(&conn->active), state, key_serial(conn->key), - atomic_read(&conn->serial), + conn->tx_serial, conn->hi_serial, conn->channels[0].call_id, conn->channels[1].call_id, diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index b52dedcebce0..6b32d61d4cdc 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -664,7 +664,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) len = iov[0].iov_len + iov[1].iov_len; - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); @@ -721,7 +721,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn, len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - serial = atomic_inc_return(&conn->serial); + serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); rxrpc_local_dont_fragment(conn->local, false); From e7870cf13d20f56bfc19f9c3e89707c69cf104ef Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Feb 2024 15:19:14 +0000 Subject: [PATCH 127/270] rxrpc: Fix delayed ACKs to not set the reference serial number Fix the construction of delayed ACKs to not set the reference serial number as they can't be used as an RTT reference. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_event.c | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 31b0dd8c9b2d..b4ab26c3718a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -696,7 +696,6 @@ struct rxrpc_call { /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ u16 ackr_sack_base; /* Starting slot in SACK table ring */ - rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_seq_t ackr_window; /* Base of SACK window */ rxrpc_seq_t ackr_wtop; /* Base of SACK window */ unsigned int ackr_nr_unacked; /* Number of unacked packets */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index e363f21a2014..c61efe08695d 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -43,8 +43,6 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, unsigned long expiry = rxrpc_soft_ack_delay; unsigned long now = jiffies, ack_at; - call->ackr_serial = serial; - if (rxrpc_soft_ack_delay < expiry) expiry = rxrpc_soft_ack_delay; if (call->peer->srtt_us != 0) @@ -373,7 +371,6 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) { unsigned long now, next, t; - rxrpc_serial_t ackr_serial; bool resend = false, expired = false; s32 abort_code; @@ -423,8 +420,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); cmpxchg(&call->delay_ack_at, t, now + MAX_JIFFY_OFFSET); - ackr_serial = xchg(&call->ackr_serial, 0); - rxrpc_send_ACK(call, RXRPC_ACK_DELAY, ackr_serial, + rxrpc_send_ACK(call, RXRPC_ACK_DELAY, 0, rxrpc_propose_ack_ping_for_lost_ack); } From 6f769f22822aa4124b556339781b04d810f0e038 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Feb 2024 15:19:15 +0000 Subject: [PATCH 128/270] rxrpc: Fix response to PING RESPONSE ACKs to a dead call Stop rxrpc from sending a DUP ACK in response to a PING RESPONSE ACK on a dead call. We may have initiated the ping but the call may have beaten the response to completion. Fixes: 18bfeba50dfd ("rxrpc: Perform terminal call ACK/ABORT retransmission from conn processor") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/conn_event.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index ec5eae60ab0c..1f251d758cb9 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -95,6 +95,14 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, _enter("%d", conn->debug_id); + if (sp && sp->hdr.type == RXRPC_PACKET_TYPE_ACK) { + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &pkt.ack, sizeof(pkt.ack)) < 0) + return; + if (pkt.ack.reason == RXRPC_ACK_PING_RESPONSE) + return; + } + chan = &conn->channels[channel]; /* If the last call got moved on whilst we were waiting to run, just From 41b7fa157ea1c8c3a575ca7f5f32034de9bee3ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Feb 2024 15:19:16 +0000 Subject: [PATCH 129/270] rxrpc: Fix counting of new acks and nacks Fix the counting of new acks and nacks when parsing a packet - something that is used in congestion control. As the code stands, it merely notes if there are any nacks whereas what we really should do is compare the previous SACK table to the new one, assuming we get two successive ACK packets with nacks in them. However, we really don't want to do that if we can avoid it as the tables might not correspond directly as one may be shifted from the other - something that will only get harder to deal with once extended ACK tables come into full use (with a capacity of up to 8192). Instead, count the number of nacks shifted out of the old SACK, the number of nacks retained in the portion still active and the number of new acks and nacks in the new table then calculate what we need. Note this ends up a bit of an estimate as the Rx protocol allows acks to be withdrawn by the receiver and packets requested to be retransmitted. Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 8 ++- net/rxrpc/ar-internal.h | 20 ++++-- net/rxrpc/call_event.c | 6 +- net/rxrpc/call_object.c | 1 + net/rxrpc/input.c | 115 +++++++++++++++++++++++++++++------ 5 files changed, 122 insertions(+), 28 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4c1ef7b3705c..87b8de9b6c1c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -128,6 +128,7 @@ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ + EM(rxrpc_skb_get_last_nack, "GET last-nack") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ @@ -141,6 +142,7 @@ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_last_nack, "PUT last-nack") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ @@ -1552,7 +1554,7 @@ TRACE_EVENT(rxrpc_congest, memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1560,9 +1562,9 @@ TRACE_EVENT(rxrpc_congest, __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), __entry->sum.cwnd, __entry->sum.ssthresh, - __entry->sum.nr_acks, __entry->sum.saw_nacks, + __entry->sum.nr_acks, __entry->sum.nr_retained_nacks, __entry->sum.nr_new_acks, - __entry->sum.nr_rot_new_acks, + __entry->sum.nr_new_nacks, __entry->top - __entry->hard_ack, __entry->sum.cumulative_acks, __entry->sum.dup_acks, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index b4ab26c3718a..7818aae1be8e 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -199,11 +199,19 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ - u16 offset; /* Offset of data */ - u16 len; /* Length of data */ - u8 flags; + union { + struct { + u16 offset; /* Offset of data */ + u16 len; /* Length of data */ + u8 flags; #define RXRPC_RX_VERIFIED 0x01 - + }; + struct { + rxrpc_seq_t first_ack; /* First packet in acks table */ + u8 nr_acks; /* Number of acks+nacks */ + u8 nr_nacks; /* Number of nacks */ + }; + }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -692,6 +700,7 @@ struct rxrpc_call { u8 cong_dup_acks; /* Count of ACKs showing missing packets */ u8 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ + struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ @@ -729,7 +738,8 @@ struct rxrpc_call { struct rxrpc_ack_summary { u16 nr_acks; /* Number of ACKs in packet */ u16 nr_new_acks; /* Number of new ACKs in packet */ - u16 nr_rot_new_acks; /* Number of rotated new ACKs */ + u16 nr_new_nacks; /* Number of new nacks in packet */ + u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ u8 ack_reason; bool saw_nacks; /* Saw NACKs in packet */ bool new_low_nack; /* T if new low NACK found */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c61efe08695d..0f78544d043b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -112,6 +112,7 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) { struct rxrpc_ackpacket *ack = NULL; + struct rxrpc_skb_priv *sp; struct rxrpc_txbuf *txb; unsigned long resend_at; rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted); @@ -139,14 +140,15 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) * explicitly NAK'd packets. */ if (ack_skb) { + sp = rxrpc_skb(ack_skb); ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); - for (i = 0; i < ack->nAcks; i++) { + for (i = 0; i < sp->nr_acks; i++) { rxrpc_seq_t seq; if (ack->acks[i] & 1) continue; - seq = ntohl(ack->firstPacket) + i; + seq = sp->first_ack + i; if (after(txb->seq, transmitted)) break; if (after(txb->seq, seq)) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0943e54370ba..9fc9a6c3f685 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -686,6 +686,7 @@ static void rxrpc_destroy_call(struct work_struct *work) del_timer_sync(&call->timer); + rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); rxrpc_cleanup_ring(call); while ((txb = list_first_entry_or_null(&call->tx_sendmsg, struct rxrpc_txbuf, call_link))) { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 92495e73b869..9691de00ade7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -45,11 +45,9 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } cumulative_acks += summary->nr_new_acks; - cumulative_acks += summary->nr_rot_new_acks; if (cumulative_acks > 255) cumulative_acks = 255; - summary->mode = call->cong_mode; summary->cwnd = call->cong_cwnd; summary->ssthresh = call->cong_ssthresh; summary->cumulative_acks = cumulative_acks; @@ -151,6 +149,7 @@ out_no_clear_ca: cwnd = RXRPC_TX_MAX_WINDOW; call->cong_cwnd = cwnd; call->cong_cumul_acks = cumulative_acks; + summary->mode = call->cong_mode; trace_rxrpc_congest(call, summary, acked_serial, change); if (resend) rxrpc_resend(call, skb); @@ -213,7 +212,6 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { if (before_eq(txb->seq, call->acks_hard_ack)) continue; - summary->nr_rot_new_acks++; if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; @@ -254,6 +252,11 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); + if (unlikely(call->cong_last_nack)) { + rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); + call->cong_last_nack = NULL; + } + switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: @@ -702,6 +705,43 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, wake_up(&call->waitq); } +/* + * Determine how many nacks from the previous ACK have now been satisfied. + */ +static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + rxrpc_seq_t seq) +{ + struct sk_buff *skb = call->cong_last_nack; + struct rxrpc_ackpacket ack; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int i, new_acks = 0, retained_nacks = 0; + rxrpc_seq_t old_seq = sp->first_ack; + u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(ack); + + if (after_eq(seq, old_seq + sp->nr_acks)) { + summary->nr_new_acks += sp->nr_nacks; + summary->nr_new_acks += seq - (old_seq + sp->nr_acks); + summary->nr_retained_nacks = 0; + } else if (seq == old_seq) { + summary->nr_retained_nacks = sp->nr_nacks; + } else { + for (i = 0; i < sp->nr_acks; i++) { + if (acks[i] == RXRPC_ACK_TYPE_NACK) { + if (before(old_seq + i, seq)) + new_acks++; + else + retained_nacks++; + } + } + + summary->nr_new_acks += new_acks; + summary->nr_retained_nacks = retained_nacks; + } + + return old_seq + sp->nr_acks; +} + /* * Process individual soft ACKs. * @@ -711,25 +751,51 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, * the timer on the basis that the peer might just not have processed them at * the time the ACK was sent. */ -static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, - rxrpc_seq_t seq, int nr_acks, - struct rxrpc_ack_summary *summary) +static void rxrpc_input_soft_acks(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct sk_buff *skb, + rxrpc_seq_t seq, + rxrpc_seq_t since) { - unsigned int i; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int i, old_nacks = 0; + rxrpc_seq_t lowest_nak = seq + sp->nr_acks; + u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < nr_acks; i++) { + for (i = 0; i < sp->nr_acks; i++) { if (acks[i] == RXRPC_ACK_TYPE_ACK) { summary->nr_acks++; - summary->nr_new_acks++; + if (after_eq(seq, since)) + summary->nr_new_acks++; } else { - if (!summary->saw_nacks && - call->acks_lowest_nak != seq + i) { - call->acks_lowest_nak = seq + i; - summary->new_low_nack = true; - } summary->saw_nacks = true; + if (before(seq, since)) { + /* Overlap with previous ACK */ + old_nacks++; + } else { + summary->nr_new_nacks++; + sp->nr_nacks++; + } + + if (before(seq, lowest_nak)) + lowest_nak = seq; } + seq++; } + + if (lowest_nak != call->acks_lowest_nak) { + call->acks_lowest_nak = lowest_nak; + summary->new_low_nack = true; + } + + /* We *can* have more nacks than we did - the peer is permitted to drop + * packets it has soft-acked and re-request them. Further, it is + * possible for the nack distribution to change whilst the number of + * nacks stays the same or goes down. + */ + if (old_nacks < summary->nr_retained_nacks) + summary->nr_new_acks += summary->nr_retained_nacks - old_nacks; + summary->nr_retained_nacks = old_nacks; } /* @@ -773,7 +839,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_ackinfo info; rxrpc_serial_t ack_serial, acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; int nr_acks, offset, ioffset; _enter(""); @@ -789,6 +855,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) prev_pkt = ntohl(ack.previousPacket); hard_ack = first_soft_ack - 1; nr_acks = ack.nAcks; + sp->first_ack = first_soft_ack; + sp->nr_acks = nr_acks; summary.ack_reason = (ack.reason < RXRPC_ACK__INVALID ? ack.reason : RXRPC_ACK__INVALID); @@ -858,6 +926,16 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) skb_condense(skb); + if (call->cong_last_nack) { + since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack); + rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); + call->cong_last_nack = NULL; + } else { + summary.nr_new_acks = first_soft_ack - call->acks_first_seq; + call->acks_lowest_nak = first_soft_ack + nr_acks; + since = first_soft_ack; + } + call->acks_latest_ts = skb->tstamp; call->acks_first_seq = first_soft_ack; call->acks_prev_seq = prev_pkt; @@ -866,7 +944,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_ACK_PING: break; default: - if (after(acked_serial, call->acks_highest_serial)) + if (acked_serial && after(acked_serial, call->acks_highest_serial)) call->acks_highest_serial = acked_serial; break; } @@ -905,8 +983,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack, - nr_acks, &summary); + rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since); + rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); + call->cong_last_nack = skb; } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && From a19747c3b9bf6476cc36d0a3a5ef0ff92999169e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 2 Feb 2024 17:06:59 +0100 Subject: [PATCH 130/270] selftests: net: let big_tcp test cope with slow env In very slow environments, most big TCP cases including segmentation and reassembly of big TCP packets have a good chance to fail: by default the TCP client uses write size well below 64K. If the host is low enough autocorking is unable to build real big TCP packets. Address the issue using much larger write operations. Note that is hard to observe the issue without an extremely slow and/or overloaded environment; reduce the TCP transfer time to allow for much easier/faster reproducibility. Fixes: 6bb382bcf742 ("selftests: add a selftest for big tcp") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Xin Long Signed-off-by: David S. Miller --- tools/testing/selftests/net/big_tcp.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/big_tcp.sh b/tools/testing/selftests/net/big_tcp.sh index cde9a91c4797..2db9d15cd45f 100755 --- a/tools/testing/selftests/net/big_tcp.sh +++ b/tools/testing/selftests/net/big_tcp.sh @@ -122,7 +122,9 @@ do_netperf() { local netns=$1 [ "$NF" = "6" ] && serip=$SERVER_IP6 - ip net exec $netns netperf -$NF -t TCP_STREAM -H $serip 2>&1 >/dev/null + + # use large write to be sure to generate big tcp packets + ip net exec $netns netperf -$NF -t TCP_STREAM -l 1 -H $serip -- -m 262144 2>&1 >/dev/null } do_test() { From 61712c94782ce105253ee1939cda0c5c025b2c0c Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 30 Jan 2024 13:26:43 +1000 Subject: [PATCH 131/270] nouveau/gsp: use correct size for registry rpc. Timur pointed this out before, and it just slipped my mind, but this might help some things work better, around pcie power management. Cc: # v6.7 Fixes: 8d55b0a940bb ("nouveau/gsp: add some basic registry entries.") Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240130032643.2498315-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index 9ee58e2a0eb2..5e1fa176aac4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1078,7 +1078,6 @@ r535_gsp_rpc_set_registry(struct nvkm_gsp *gsp) if (IS_ERR(rpc)) return PTR_ERR(rpc); - rpc->size = sizeof(*rpc); rpc->numEntries = NV_GSP_REG_NUM_ENTRIES; str_offset = offsetof(typeof(*rpc), entries[NV_GSP_REG_NUM_ENTRIES]); @@ -1094,6 +1093,7 @@ r535_gsp_rpc_set_registry(struct nvkm_gsp *gsp) strings += name_len; str_offset += name_len; } + rpc->size = str_offset; return nvkm_gsp_rpc_wr(gsp, rpc, false); } From 042b5f83841fbf7ce39474412db3b5e4765a7ea7 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 2 Feb 2024 17:06:07 -0600 Subject: [PATCH 132/270] drm/nouveau: fix several DMA buffer leaks Nouveau manages GSP-RM DMA buffers with nvkm_gsp_mem objects. Several of these buffers are never dealloced. Some of them can be deallocated right after GSP-RM is initialized, but the rest need to stay until the driver unloads. Also futher bullet-proof these objects by poisoning the buffer and clearing the nvkm_gsp_mem object when it is deallocated. Poisoning the buffer should trigger an error (or crash) from GSP-RM if it tries to access the buffer after we've deallocated it, because we were wrong about when it is safe to deallocate. Finally, change the mem->size field to a size_t because that's the same type that dma_alloc_coherent expects. Cc: # v6.7 Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Timur Tabi Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240202230608.1981026-1-ttabi@nvidia.com --- .../gpu/drm/nouveau/include/nvkm/subdev/gsp.h | 2 +- .../gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 59 ++++++++++++------- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h index d1437c08645f..6f5d376d8fcc 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h @@ -9,7 +9,7 @@ #define GSP_PAGE_SIZE BIT(GSP_PAGE_SHIFT) struct nvkm_gsp_mem { - u32 size; + size_t size; void *data; dma_addr_t addr; }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index 5e1fa176aac4..6208ddd92964 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -997,6 +997,32 @@ r535_gsp_rpc_get_gsp_static_info(struct nvkm_gsp *gsp) return 0; } +static void +nvkm_gsp_mem_dtor(struct nvkm_gsp *gsp, struct nvkm_gsp_mem *mem) +{ + if (mem->data) { + /* + * Poison the buffer to catch any unexpected access from + * GSP-RM if the buffer was prematurely freed. + */ + memset(mem->data, 0xFF, mem->size); + + dma_free_coherent(gsp->subdev.device->dev, mem->size, mem->data, mem->addr); + memset(mem, 0, sizeof(*mem)); + } +} + +static int +nvkm_gsp_mem_ctor(struct nvkm_gsp *gsp, size_t size, struct nvkm_gsp_mem *mem) +{ + mem->size = size; + mem->data = dma_alloc_coherent(gsp->subdev.device->dev, size, &mem->addr, GFP_KERNEL); + if (WARN_ON(!mem->data)) + return -ENOMEM; + + return 0; +} + static int r535_gsp_postinit(struct nvkm_gsp *gsp) { @@ -1024,6 +1050,13 @@ r535_gsp_postinit(struct nvkm_gsp *gsp) nvkm_inth_allow(&gsp->subdev.inth); nvkm_wr32(device, 0x110004, 0x00000040); + + /* Release the DMA buffers that were needed only for boot and init */ + nvkm_gsp_mem_dtor(gsp, &gsp->boot.fw); + nvkm_gsp_mem_dtor(gsp, &gsp->libos); + nvkm_gsp_mem_dtor(gsp, &gsp->rmargs); + nvkm_gsp_mem_dtor(gsp, &gsp->wpr_meta); + return ret; } @@ -1532,27 +1565,6 @@ r535_gsp_msg_run_cpu_sequencer(void *priv, u32 fn, void *repv, u32 repc) return 0; } -static void -nvkm_gsp_mem_dtor(struct nvkm_gsp *gsp, struct nvkm_gsp_mem *mem) -{ - if (mem->data) { - dma_free_coherent(gsp->subdev.device->dev, mem->size, mem->data, mem->addr); - mem->data = NULL; - } -} - -static int -nvkm_gsp_mem_ctor(struct nvkm_gsp *gsp, u32 size, struct nvkm_gsp_mem *mem) -{ - mem->size = size; - mem->data = dma_alloc_coherent(gsp->subdev.device->dev, size, &mem->addr, GFP_KERNEL); - if (WARN_ON(!mem->data)) - return -ENOMEM; - - return 0; -} - - static int r535_gsp_booter_unload(struct nvkm_gsp *gsp, u32 mbox0, u32 mbox1) { @@ -2150,6 +2162,11 @@ r535_gsp_dtor(struct nvkm_gsp *gsp) mutex_destroy(&gsp->cmdq.mutex); r535_gsp_dtor_fws(gsp); + + nvkm_gsp_mem_dtor(gsp, &gsp->shm.mem); + nvkm_gsp_mem_dtor(gsp, &gsp->loginit); + nvkm_gsp_mem_dtor(gsp, &gsp->logintr); + nvkm_gsp_mem_dtor(gsp, &gsp->logrm); } int From 34e659f34a7559ecfd9c1f5b24d4c291f3f54711 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 2 Feb 2024 17:06:08 -0600 Subject: [PATCH 133/270] drm/nouveau: nvkm_gsp_radix3_sg() should use nvkm_gsp_mem_ctor() Function nvkm_gsp_radix3_sg() uses nvkm_gsp_mem objects to allocate the radix3 tables, but it unnecessarily creates those objects manually instead of using the standard nvkm_gsp_mem_ctor() function like the rest of the code does. Signed-off-by: Timur Tabi Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240202230608.1981026-2-ttabi@nvidia.com --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index 6208ddd92964..a41735ab6068 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1950,20 +1950,20 @@ nvkm_gsp_radix3_dtor(struct nvkm_gsp *gsp, struct nvkm_gsp_radix3 *rx3) * See kgspCreateRadix3_IMPL */ static int -nvkm_gsp_radix3_sg(struct nvkm_device *device, struct sg_table *sgt, u64 size, +nvkm_gsp_radix3_sg(struct nvkm_gsp *gsp, struct sg_table *sgt, u64 size, struct nvkm_gsp_radix3 *rx3) { u64 addr; for (int i = ARRAY_SIZE(rx3->mem) - 1; i >= 0; i--) { u64 *ptes; - int idx; + size_t bufsize; + int ret, idx; - rx3->mem[i].size = ALIGN((size / GSP_PAGE_SIZE) * sizeof(u64), GSP_PAGE_SIZE); - rx3->mem[i].data = dma_alloc_coherent(device->dev, rx3->mem[i].size, - &rx3->mem[i].addr, GFP_KERNEL); - if (WARN_ON(!rx3->mem[i].data)) - return -ENOMEM; + bufsize = ALIGN((size / GSP_PAGE_SIZE) * sizeof(u64), GSP_PAGE_SIZE); + ret = nvkm_gsp_mem_ctor(gsp, bufsize, &rx3->mem[i]); + if (ret) + return ret; ptes = rx3->mem[i].data; if (i == 2) { @@ -2003,7 +2003,7 @@ r535_gsp_fini(struct nvkm_gsp *gsp, bool suspend) if (ret) return ret; - ret = nvkm_gsp_radix3_sg(gsp->subdev.device, &gsp->sr.sgt, len, &gsp->sr.radix3); + ret = nvkm_gsp_radix3_sg(gsp, &gsp->sr.sgt, len, &gsp->sr.radix3); if (ret) return ret; @@ -2211,7 +2211,7 @@ r535_gsp_oneinit(struct nvkm_gsp *gsp) memcpy(gsp->sig.data, data, size); /* Build radix3 page table for ELF image. */ - ret = nvkm_gsp_radix3_sg(device, &gsp->fw.mem.sgt, gsp->fw.len, &gsp->radix3); + ret = nvkm_gsp_radix3_sg(gsp, &gsp->fw.mem.sgt, gsp->fw.len, &gsp->radix3); if (ret) return ret; From 0647903efbc84b772325b4d24d9487e24d6d1e03 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 3 Feb 2024 14:24:46 +0100 Subject: [PATCH 134/270] wifi: mt76: mt7996: fix fortify warning Copy cck and ofdm separately in order to avoid __read_overflow2_field warning. Fixes: f75e4779d215 ("wifi: mt76: mt7996: add txpower setting support") Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://msgid.link/20240203132446.54790-1-nbd@nbd.name --- drivers/net/wireless/mediatek/mt76/mt7996/mcu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c index 3c729b563edc..699be57309c2 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c @@ -4477,7 +4477,8 @@ int mt7996_mcu_set_txpower_sku(struct mt7996_phy *phy) skb_put_data(skb, &req, sizeof(req)); /* cck and ofdm */ - skb_put_data(skb, &la.cck, sizeof(la.cck) + sizeof(la.ofdm)); + skb_put_data(skb, &la.cck, sizeof(la.cck)); + skb_put_data(skb, &la.ofdm, sizeof(la.ofdm)); /* ht20 */ skb_put_data(skb, &la.mcs[0], 8); /* ht40 */ From 3376ca3f1a2075eaa23c5576c47d04d7e8a4adda Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 3 Feb 2024 13:45:20 +0100 Subject: [PATCH 135/270] KVM: x86: Fix KVM_GET_MSRS stack info leak Commit 6abe9c1386e5 ("KVM: X86: Move ignore_msrs handling upper the stack") changed the 'ignore_msrs' handling, including sanitizing return values to the caller. This was fine until commit 12bc2132b15e ("KVM: X86: Do the same ignore_msrs check for feature msrs") which allowed non-existing feature MSRs to be ignored, i.e. to not generate an error on the ioctl() level. It even tried to preserve the sanitization of the return value. However, the logic is flawed, as '*data' will be overwritten again with the uninitialized stack value of msr.data. Fix this by simplifying the logic and always initializing msr.data, vanishing the need for an additional error exit path. Fixes: 12bc2132b15e ("KVM: X86: Do the same ignore_msrs check for feature msrs") Signed-off-by: Mathias Krause Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240203124522.592778-2-minipli@grsecurity.net Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25bc52cdf8f4..3f6e82fd37f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1704,22 +1704,17 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) struct kvm_msr_entry msr; int r; + /* Unconditionally clear the output for simplicity */ + msr.data = 0; msr.index = index; r = kvm_get_msr_feature(&msr); - if (r == KVM_MSR_RET_INVALID) { - /* Unconditionally clear the output for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - r = 0; - } - - if (r) - return r; + if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) + r = 0; *data = msr.data; - return 0; + return r; } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) From 4e6c9011990726f4d175e2cdfebe5b0b8cce4839 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 3 Feb 2024 10:45:21 +0800 Subject: [PATCH 136/270] scsi: core: Move scsi_host_busy() out of host lock if it is for per-command Commit 4373534a9850 ("scsi: core: Move scsi_host_busy() out of host lock for waking up EH handler") intended to fix a hard lockup issue triggered by EH. The core idea was to move scsi_host_busy() out of the host lock when processing individual commands for EH. However, a suggested style change inadvertently caused scsi_host_busy() to remain under the host lock. Fix this by calling scsi_host_busy() outside the lock. Fixes: 4373534a9850 ("scsi: core: Move scsi_host_busy() out of host lock for waking up EH handler") Cc: Sathya Prakash Veerichetty Cc: Bart Van Assche Cc: Ewan D. Milne Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240203024521.2006455-1-ming.lei@redhat.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 3 ++- drivers/scsi/scsi_lib.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 4f455884fdc4..612489afe8d2 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -282,11 +282,12 @@ static void scsi_eh_inc_host_failed(struct rcu_head *head) { struct scsi_cmnd *scmd = container_of(head, typeof(*scmd), rcu); struct Scsi_Host *shost = scmd->device->host; + unsigned int busy = scsi_host_busy(shost); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); shost->host_failed++; - scsi_eh_wakeup(shost, scsi_host_busy(shost)); + scsi_eh_wakeup(shost, busy); spin_unlock_irqrestore(shost->host_lock, flags); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 1fb80eae9a63..df5ac03d5d6c 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -278,9 +278,11 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd) rcu_read_lock(); __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state); if (unlikely(scsi_host_in_recovery(shost))) { + unsigned int busy = scsi_host_busy(shost); + spin_lock_irqsave(shost->host_lock, flags); if (shost->host_failed || shost->host_eh_scheduled) - scsi_eh_wakeup(shost, scsi_host_busy(shost)); + scsi_eh_wakeup(shost, busy); spin_unlock_irqrestore(shost->host_lock, flags); } rcu_read_unlock(); From d6c1b19153f92e95e5e1801d540e98771053afae Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 20 Dec 2023 17:26:58 +0100 Subject: [PATCH 137/270] scsi: lpfc: Use unsigned type for num_sge LUNs going into "failed ready running" state observed on >1T and on even numbers of size (2T, 4T, 6T, 8T and 10T). The issue occurs when DIF is enabled at the host. The kernel logs: Cannot setup S/G List for HBAIO segs 1/1 SGL 512 SCSI 256: 3 0 The host lpfc driver is failing to setup scatter/gather list (protection data) for the I/Os. The return type lpfc_bg_setup_sgl()/lpfc_bg_setup_sgl_prot() causes the compiler to remove the most significant bit. Use an unsigned type instead. Signed-off-by: Hannes Reinecke [dwagner: added commit message] Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20231220162658.12392-1-dwagner@suse.de Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_scsi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index d26941b131fd..bf879d81846b 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -1918,7 +1918,7 @@ out: * * Returns the number of SGEs added to the SGL. **/ -static int +static uint32_t lpfc_bg_setup_sgl(struct lpfc_hba *phba, struct scsi_cmnd *sc, struct sli4_sge *sgl, int datasegcnt, struct lpfc_io_buf *lpfc_cmd) @@ -1926,8 +1926,8 @@ lpfc_bg_setup_sgl(struct lpfc_hba *phba, struct scsi_cmnd *sc, struct scatterlist *sgde = NULL; /* s/g data entry */ struct sli4_sge_diseed *diseed = NULL; dma_addr_t physaddr; - int i = 0, num_sge = 0, status; - uint32_t reftag; + int i = 0, status; + uint32_t reftag, num_sge = 0; uint8_t txop, rxop; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS uint32_t rc; @@ -2099,7 +2099,7 @@ out: * * Returns the number of SGEs added to the SGL. **/ -static int +static uint32_t lpfc_bg_setup_sgl_prot(struct lpfc_hba *phba, struct scsi_cmnd *sc, struct sli4_sge *sgl, int datacnt, int protcnt, struct lpfc_io_buf *lpfc_cmd) @@ -2123,8 +2123,8 @@ lpfc_bg_setup_sgl_prot(struct lpfc_hba *phba, struct scsi_cmnd *sc, uint32_t rc; #endif uint32_t checking = 1; - uint32_t dma_offset = 0; - int num_sge = 0, j = 2; + uint32_t dma_offset = 0, num_sge = 0; + int j = 2; struct sli4_hybrid_sgl *sgl_xtra = NULL; sgpe = scsi_prot_sglist(sc); From b513d30d59bb383a6a5d6b533afcab2cee99a8f8 Mon Sep 17 00:00:00 2001 From: Alice Chao Date: Mon, 5 Feb 2024 18:49:04 +0800 Subject: [PATCH 138/270] scsi: ufs: core: Fix shift issue in ufshcd_clear_cmd() When task_tag >= 32 (in MCQ mode) and sizeof(unsigned int) == 4, 1U << task_tag will out of bounds for a u32 mask. Fix this up to prevent SHIFT_ISSUE (bitwise shifts that are out of bounds for their data type). [name:debug_monitors&]Unexpected kernel BRK exception at EL1 [name:traps&]Internal error: BRK handler: 00000000f2005514 [#1] PREEMPT SMP [name:mediatek_cpufreq_hw&]cpufreq stop DVFS log done [name:mrdump&]Kernel Offset: 0x1ba5800000 from 0xffffffc008000000 [name:mrdump&]PHYS_OFFSET: 0x80000000 [name:mrdump&]pstate: 22400005 (nzCv daif +PAN -UAO) [name:mrdump&]pc : [0xffffffdbaf52bb2c] ufshcd_clear_cmd+0x280/0x288 [name:mrdump&]lr : [0xffffffdbaf52a774] ufshcd_wait_for_dev_cmd+0x3e4/0x82c [name:mrdump&]sp : ffffffc0081471b0 Workqueue: ufs_eh_wq_0 ufshcd_err_handler Call trace: dump_backtrace+0xf8/0x144 show_stack+0x18/0x24 dump_stack_lvl+0x78/0x9c dump_stack+0x18/0x44 mrdump_common_die+0x254/0x480 [mrdump] ipanic_die+0x20/0x30 [mrdump] notify_die+0x15c/0x204 die+0x10c/0x5f8 arm64_notify_die+0x74/0x13c do_debug_exception+0x164/0x26c el1_dbg+0x64/0x80 el1h_64_sync_handler+0x3c/0x90 el1h_64_sync+0x68/0x6c ufshcd_clear_cmd+0x280/0x288 ufshcd_wait_for_dev_cmd+0x3e4/0x82c ufshcd_exec_dev_cmd+0x5bc/0x9ac ufshcd_verify_dev_init+0x84/0x1c8 ufshcd_probe_hba+0x724/0x1ce0 ufshcd_host_reset_and_restore+0x260/0x574 ufshcd_reset_and_restore+0x138/0xbd0 ufshcd_err_handler+0x1218/0x2f28 process_one_work+0x5fc/0x1140 worker_thread+0x7d8/0xe20 kthread+0x25c/0x468 ret_from_fork+0x10/0x20 Signed-off-by: Alice Chao Link: https://lore.kernel.org/r/20240205104905.24929-1-alice.chao@mediatek.com Reviewed-by: Stanley Jhu Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 029d017fc1b6..c6cff4aa440a 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -3057,7 +3057,7 @@ bool ufshcd_cmd_inflight(struct scsi_cmnd *cmd) */ static int ufshcd_clear_cmd(struct ufs_hba *hba, u32 task_tag) { - u32 mask = 1U << task_tag; + u32 mask; unsigned long flags; int err; @@ -3075,6 +3075,8 @@ static int ufshcd_clear_cmd(struct ufs_hba *hba, u32 task_tag) return 0; } + mask = 1U << task_tag; + /* clear outstanding transaction before retry */ spin_lock_irqsave(hba->host->host_lock, flags); ufshcd_utrl_clear(hba, mask); From 17e94b2585417e04dabc2f13bc03b4665ae687f3 Mon Sep 17 00:00:00 2001 From: SEO HOYOUNG Date: Mon, 22 Jan 2024 17:33:24 +0900 Subject: [PATCH 139/270] scsi: ufs: core: Remove the ufshcd_release() in ufshcd_err_handling_prepare() If ufshcd_err_handler() is called in a suspend/resume situation, ufs_release() can be called twice and active_reqs end up going negative. This is because ufshcd_err_handling_prepare() and ufshcd_err_handling_unprepare() both call ufshcd_release(). Remove superfluous call to ufshcd_release(). Signed-off-by: SEO HOYOUNG Link: https://lore.kernel.org/r/20240122083324.11797-1-hy50.seo@samsung.com Reviewed-by: Bart Van Assche Reviewed-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index c6cff4aa440a..d77b25b79ae3 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -6354,7 +6354,6 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba) ufshcd_hold(hba); if (!ufshcd_is_clkgating_allowed(hba)) ufshcd_setup_clocks(hba, true); - ufshcd_release(hba); pm_op = hba->is_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM; ufshcd_vops_resume(hba, pm_op); } else { From d0399da9fb5f8e3d897b9776bffee2d3bfe20210 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 29 Jan 2024 19:04:13 -0800 Subject: [PATCH 140/270] drm/sched: Re-queue run job worker when drm_sched_entity_pop_job() returns NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather then loop over entities until one with a ready job is found, re-queue the run job worker when drm_sched_entity_pop_job() returns NULL. Signed-off-by: Matthew Brost Reviewed-by: Christian König Fixes: 66dbd9004a55 ("drm/sched: Drain all entities in DRM sched run job worker") Reviewed-by: Luben Tuikov Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240130030413.2031009-1-matthew.brost@intel.com --- drivers/gpu/drm/scheduler/sched_main.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 85f082396d42..d442b893275b 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1178,21 +1178,24 @@ static void drm_sched_run_job_work(struct work_struct *w) struct drm_sched_entity *entity; struct dma_fence *fence; struct drm_sched_fence *s_fence; - struct drm_sched_job *sched_job = NULL; + struct drm_sched_job *sched_job; int r; if (READ_ONCE(sched->pause_submit)) return; /* Find entity with a ready job */ - while (!sched_job && (entity = drm_sched_select_entity(sched))) { - sched_job = drm_sched_entity_pop_job(entity); - if (!sched_job) - complete_all(&entity->entity_idle); - } + entity = drm_sched_select_entity(sched); if (!entity) return; /* No more work */ + sched_job = drm_sched_entity_pop_job(entity); + if (!sched_job) { + complete_all(&entity->entity_idle); + drm_sched_run_job_queue(sched); + return; + } + s_fence = sched_job->s_fence; atomic_add(sched_job->credits, &sched->credit_count); From 3871aa01e1a779d866fa9dfdd5a836f342f4eb87 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 1 Feb 2024 00:23:09 +0900 Subject: [PATCH 141/270] tipc: Check the bearer type before calling tipc_udp_nl_bearer_add() syzbot reported the following general protection fault [1]: general protection fault, probably for non-canonical address 0xdffffc0000000010: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000080-0x0000000000000087] ... RIP: 0010:tipc_udp_is_known_peer+0x9c/0x250 net/tipc/udp_media.c:291 ... Call Trace: tipc_udp_nl_bearer_add+0x212/0x2f0 net/tipc/udp_media.c:646 tipc_nl_bearer_add+0x21e/0x360 net/tipc/bearer.c:1089 genl_family_rcv_msg_doit+0x1fc/0x2e0 net/netlink/genetlink.c:972 genl_family_rcv_msg net/netlink/genetlink.c:1052 [inline] genl_rcv_msg+0x561/0x800 net/netlink/genetlink.c:1067 netlink_rcv_skb+0x16b/0x440 net/netlink/af_netlink.c:2544 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1076 netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline] netlink_unicast+0x53b/0x810 net/netlink/af_netlink.c:1367 netlink_sendmsg+0x8b7/0xd70 net/netlink/af_netlink.c:1909 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b The cause of this issue is that when tipc_nl_bearer_add() is called with the TIPC_NLA_BEARER_UDP_OPTS attribute, tipc_udp_nl_bearer_add() is called even if the bearer is not UDP. tipc_udp_is_known_peer() called by tipc_udp_nl_bearer_add() assumes that the media_ptr field of the tipc_bearer has an udp_bearer type object, so the function goes crazy for non-UDP bearers. This patch fixes the issue by checking the bearer type before calling tipc_udp_nl_bearer_add() in tipc_nl_bearer_add(). Fixes: ef20cd4dd163 ("tipc: introduce UDP replicast") Reported-and-tested-by: syzbot+5142b87a9abc510e14fa@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5142b87a9abc510e14fa [1] Signed-off-by: Shigeru Yoshida Reviewed-by: Tung Nguyen Link: https://lore.kernel.org/r/20240131152310.4089541-1-syoshida@redhat.com Signed-off-by: Paolo Abeni --- net/tipc/bearer.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2cde375477e3..878415c43527 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1086,6 +1086,12 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_TIPC_MEDIA_UDP if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { + if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { + rtnl_unlock(); + NL_SET_ERR_MSG(info->extack, "UDP option is unsupported"); + return -EINVAL; + } + err = tipc_udp_nl_bearer_add(b, attrs[TIPC_NLA_BEARER_UDP_OPTS]); if (err) { From b083d24fcf57580cc0b45dd7b0b4f84d8c4624f4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 2 Feb 2024 02:24:59 +0000 Subject: [PATCH 142/270] selftests/net: Amend per-netns counter checks Selftests here check not only that connect()/accept() for TCP-AO/TCP-MD5/non-signed-TCP combinations do/don't establish connections, but also counters: those are per-AO-key, per-socket and per-netns. The counters are checked on the server's side, as the server listener has TCP-AO/TCP-MD5/no keys for different peers. All tests run in the same namespaces with the same veth pair, created in test_init(). After close() in both client and server, the sides go through the regular FIN/ACK + FIN/ACK sequence, which goes in the background. If the selftest has already started a new testing scenario, read per-netns counters - it may fail in the end iff it doesn't expect the TCPAOGood per-netns counters go up during the test. Let's just kill both TCP-AO sides - that will avoid any asynchronous background TCP-AO segments going to either sides. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/all/20240201132153.4d68f45e@kernel.org/T/#u Fixes: 6f0c472a6815 ("selftests/net: Add TCP-AO + TCP-MD5 + no sign listen socket tests") Signed-off-by: Dmitry Safonov Link: https://lore.kernel.org/r/20240202-unsigned-md5-netns-counters-v1-1-8b90c37c0566@arista.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/unsigned-md5.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index c5b568cd7d90..6b59a652159f 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -110,9 +110,9 @@ static void try_accept(const char *tst_name, unsigned int port, test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); out: - synchronize_threads(); /* close() */ + synchronize_threads(); /* test_kill_sk() */ if (sk > 0) - close(sk); + test_kill_sk(sk); } static void server_add_routes(void) @@ -302,10 +302,10 @@ static void try_connect(const char *tst_name, unsigned int port, test_ok("%s: connected", tst_name); out: - synchronize_threads(); /* close() */ + synchronize_threads(); /* test_kill_sk() */ /* _test_connect_socket() cleans up on failure */ if (ret > 0) - close(sk); + test_kill_sk(sk); } #define PREINSTALL_MD5_FIRST BIT(0) @@ -486,10 +486,10 @@ static void try_to_add(const char *tst_name, unsigned int port, } out: - synchronize_threads(); /* close() */ + synchronize_threads(); /* test_kill_sk() */ /* _test_connect_socket() cleans up on failure */ if (ret > 0) - close(sk); + test_kill_sk(sk); } static void client_add_ip(union tcp_addr *client, const char *ip) From dad6a09f3148257ac1773cd90934d721d68ab595 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Jan 2024 15:56:36 -0800 Subject: [PATCH 143/270] hrtimer: Report offline hrtimer enqueue The hrtimers migration on CPU-down hotplug process has been moved earlier, before the CPU actually goes to die. This leaves a small window of opportunity to queue an hrtimer in a blind spot, leaving it ignored. For example a practical case has been reported with RCU waking up a SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that way a sched/rt timer to the local offline CPU. Make sure such situations never go unnoticed and warn when that happens. Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240129235646.3171983-4-boqun.feng@gmail.com --- include/linux/hrtimer.h | 4 +++- kernel/time/hrtimer.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 87e3bedf8eb0..641c4567cfa7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -157,6 +157,7 @@ enum hrtimer_base_type { * @max_hang_time: Maximum time spent in hrtimer_interrupt * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are * expired + * @online: CPU is online from an hrtimers point of view * @timer_waiters: A hrtimer_cancel() invocation waits for the timer * callback to finish. * @expires_next: absolute time of the next event, is required for remote @@ -179,7 +180,8 @@ struct hrtimer_cpu_base { unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, - softirq_activated : 1; + softirq_activated : 1, + online : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 760793998cdd..edb0f821dcea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, enum hrtimer_mode mode) { debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); + old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; From 58aeb5623c2ebdadefe6352b14f8076a7073fea0 Mon Sep 17 00:00:00 2001 From: Fred Ai Date: Sat, 3 Feb 2024 02:29:08 -0800 Subject: [PATCH 144/270] mmc: sdhci-pci-o2micro: Fix a warm reboot issue that disk can't be detected by BIOS Driver shall switch clock source from DLL clock to OPE clock when power off card to ensure that card can be identified with OPE clock by BIOS. Signed-off-by: Fred Ai Fixes:4be33cf18703 ("mmc: sdhci-pci-o2micro: Improve card input timing at SDR104/HS200 mode") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240203102908.4683-1-fredaibayhubtech@126.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-o2micro.c | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-o2micro.c b/drivers/mmc/host/sdhci-pci-o2micro.c index 7bfee28116af..d4a02184784a 100644 --- a/drivers/mmc/host/sdhci-pci-o2micro.c +++ b/drivers/mmc/host/sdhci-pci-o2micro.c @@ -693,6 +693,35 @@ static int sdhci_pci_o2_init_sd_express(struct mmc_host *mmc, struct mmc_ios *io return 0; } +static void sdhci_pci_o2_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd) +{ + struct sdhci_pci_chip *chip; + struct sdhci_pci_slot *slot = sdhci_priv(host); + u32 scratch_32 = 0; + u8 scratch_8 = 0; + + chip = slot->chip; + + if (mode == MMC_POWER_OFF) { + /* UnLock WP */ + pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch_8); + scratch_8 &= 0x7f; + pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch_8); + + /* Set PCR 0x354[16] to switch Clock Source back to OPE Clock */ + pci_read_config_dword(chip->pdev, O2_SD_OUTPUT_CLK_SOURCE_SWITCH, &scratch_32); + scratch_32 &= ~(O2_SD_SEL_DLL); + pci_write_config_dword(chip->pdev, O2_SD_OUTPUT_CLK_SOURCE_SWITCH, scratch_32); + + /* Lock WP */ + pci_read_config_byte(chip->pdev, O2_SD_LOCK_WP, &scratch_8); + scratch_8 |= 0x80; + pci_write_config_byte(chip->pdev, O2_SD_LOCK_WP, scratch_8); + } + + sdhci_set_power(host, mode, vdd); +} + static int sdhci_pci_o2_probe_slot(struct sdhci_pci_slot *slot) { struct sdhci_pci_chip *chip; @@ -1051,6 +1080,7 @@ static const struct sdhci_ops sdhci_pci_o2_ops = { .set_bus_width = sdhci_set_bus_width, .reset = sdhci_reset, .set_uhs_signaling = sdhci_set_uhs_signaling, + .set_power = sdhci_pci_o2_set_power, }; const struct sdhci_pci_fixes sdhci_o2 = { From cc9432c4fb159a3913e0ce3173b8218cd5bad2e0 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Tue, 6 Feb 2024 09:39:12 +0100 Subject: [PATCH 145/270] mmc: slot-gpio: Allow non-sleeping GPIO ro This change uses the appropriate _cansleep or non-sleeping API for reading GPIO read-only state. This allows users with GPIOs that never sleepbeing called in atomic context. Implement the same mechanism as in commit 52af318c93e97 ("mmc: Allow non-sleeping GPIO cd"). Signed-off-by: Alexander Stein Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240206083912.2543142-1-alexander.stein@ew.tq-group.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 2a2d949a9344..39f45c2b6de8 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -75,11 +75,15 @@ EXPORT_SYMBOL(mmc_gpio_set_cd_irq); int mmc_gpio_get_ro(struct mmc_host *host) { struct mmc_gpio *ctx = host->slot.handler_priv; + int cansleep; if (!ctx || !ctx->ro_gpio) return -ENOSYS; - return gpiod_get_value_cansleep(ctx->ro_gpio); + cansleep = gpiod_cansleep(ctx->ro_gpio); + return cansleep ? + gpiod_get_value_cansleep(ctx->ro_gpio) : + gpiod_get_value(ctx->ro_gpio); } EXPORT_SYMBOL(mmc_gpio_get_ro); From c9da9a1f17bf4fa96b115950fd389c917b583c1c Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Fri, 26 Jan 2024 13:27:58 +0100 Subject: [PATCH 146/270] accel/ivpu: Force snooping for MMU writes Set AW_SNOOP_OVERRIDE bit in VPU_37/40XX_HOST_IF_TCU_PTW_OVERRIDES to force snooping for MMU write accesses (setting event queue events). MMU event queue buffer is the only buffer written by MMU and mapped as write-back which break cache coherency. Force write transactions to be snooped solving the problem. Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240126122804.2169129-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_hw_37xx.c | 2 +- drivers/accel/ivpu/ivpu_hw_40xx.c | 2 +- drivers/accel/ivpu/ivpu_mmu.c | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index f15a93d83057..77accd029c4a 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -525,7 +525,7 @@ static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev) u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES); val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, NOSNOOP_OVERRIDE_EN, val); - val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AW_NOSNOOP_OVERRIDE, val); + val = REG_CLR_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AW_NOSNOOP_OVERRIDE, val); val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AR_NOSNOOP_OVERRIDE, val); REGV_WR32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, val); diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c index 704288084f37..86b89b94f9f3 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ b/drivers/accel/ivpu/ivpu_hw_40xx.c @@ -530,7 +530,7 @@ static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev) u32 val = REGV_RD32(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES); val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, SNOOP_OVERRIDE_EN, val); - val = REG_CLR_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AW_SNOOP_OVERRIDE, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AW_SNOOP_OVERRIDE, val); val = REG_CLR_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AR_SNOOP_OVERRIDE, val); REGV_WR32(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, val); diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 9a3122ffce03..8df78adeee33 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -560,7 +560,6 @@ static int ivpu_mmu_reset(struct ivpu_device *vdev) mmu->cmdq.cons = 0; memset(mmu->evtq.base, 0, IVPU_MMU_EVTQ_SIZE); - clflush_cache_range(mmu->evtq.base, IVPU_MMU_EVTQ_SIZE); mmu->evtq.prod = 0; mmu->evtq.cons = 0; @@ -877,8 +876,6 @@ static u32 *ivpu_mmu_get_event(struct ivpu_device *vdev) if (!CIRC_CNT(IVPU_MMU_Q_IDX(evtq->prod), IVPU_MMU_Q_IDX(evtq->cons), IVPU_MMU_Q_COUNT)) return NULL; - clflush_cache_range(evt, IVPU_MMU_EVTQ_CMD_SIZE); - evtq->cons = (evtq->cons + 1) & IVPU_MMU_Q_WRAP_MASK; REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, evtq->cons); From b039f1c4d3727c3e44a7e89ff79e7e8533e1beec Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Fri, 26 Jan 2024 13:27:59 +0100 Subject: [PATCH 147/270] accel/ivpu: Correct MMU queue size checking functions Do not use kernel CIRC_SPACE and CIRC_CNT that incorrectly return space of a queue when wrap bit was set. Use correct implementation that compares producer, consumer and wrap bit values. Without this fix it was possible to lose events in case when event queue was full. Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240126122804.2169129-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 8df78adeee33..91bd640655ab 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -72,10 +72,10 @@ #define IVPU_MMU_Q_COUNT_LOG2 4 /* 16 entries */ #define IVPU_MMU_Q_COUNT ((u32)1 << IVPU_MMU_Q_COUNT_LOG2) -#define IVPU_MMU_Q_WRAP_BIT (IVPU_MMU_Q_COUNT << 1) -#define IVPU_MMU_Q_WRAP_MASK (IVPU_MMU_Q_WRAP_BIT - 1) -#define IVPU_MMU_Q_IDX_MASK (IVPU_MMU_Q_COUNT - 1) +#define IVPU_MMU_Q_WRAP_MASK GENMASK(IVPU_MMU_Q_COUNT_LOG2, 0) +#define IVPU_MMU_Q_IDX_MASK (IVPU_MMU_Q_COUNT - 1) #define IVPU_MMU_Q_IDX(val) ((val) & IVPU_MMU_Q_IDX_MASK) +#define IVPU_MMU_Q_WRP(val) ((val) & IVPU_MMU_Q_COUNT) #define IVPU_MMU_CMDQ_CMD_SIZE 16 #define IVPU_MMU_CMDQ_SIZE (IVPU_MMU_Q_COUNT * IVPU_MMU_CMDQ_CMD_SIZE) @@ -475,20 +475,32 @@ static int ivpu_mmu_cmdq_wait_for_cons(struct ivpu_device *vdev) return 0; } +static bool ivpu_mmu_queue_is_full(struct ivpu_mmu_queue *q) +{ + return ((IVPU_MMU_Q_IDX(q->prod) == IVPU_MMU_Q_IDX(q->cons)) && + (IVPU_MMU_Q_WRP(q->prod) != IVPU_MMU_Q_WRP(q->cons))); +} + +static bool ivpu_mmu_queue_is_empty(struct ivpu_mmu_queue *q) +{ + return ((IVPU_MMU_Q_IDX(q->prod) == IVPU_MMU_Q_IDX(q->cons)) && + (IVPU_MMU_Q_WRP(q->prod) == IVPU_MMU_Q_WRP(q->cons))); +} + static int ivpu_mmu_cmdq_cmd_write(struct ivpu_device *vdev, const char *name, u64 data0, u64 data1) { - struct ivpu_mmu_queue *q = &vdev->mmu->cmdq; - u64 *queue_buffer = q->base; - int idx = IVPU_MMU_Q_IDX(q->prod) * (IVPU_MMU_CMDQ_CMD_SIZE / sizeof(*queue_buffer)); + struct ivpu_mmu_queue *cmdq = &vdev->mmu->cmdq; + u64 *queue_buffer = cmdq->base; + int idx = IVPU_MMU_Q_IDX(cmdq->prod) * (IVPU_MMU_CMDQ_CMD_SIZE / sizeof(*queue_buffer)); - if (!CIRC_SPACE(IVPU_MMU_Q_IDX(q->prod), IVPU_MMU_Q_IDX(q->cons), IVPU_MMU_Q_COUNT)) { + if (ivpu_mmu_queue_is_full(cmdq)) { ivpu_err(vdev, "Failed to write MMU CMD %s\n", name); return -EBUSY; } queue_buffer[idx] = data0; queue_buffer[idx + 1] = data1; - q->prod = (q->prod + 1) & IVPU_MMU_Q_WRAP_MASK; + cmdq->prod = (cmdq->prod + 1) & IVPU_MMU_Q_WRAP_MASK; ivpu_dbg(vdev, MMU, "CMD write: %s data: 0x%llx 0x%llx\n", name, data0, data1); @@ -873,12 +885,10 @@ static u32 *ivpu_mmu_get_event(struct ivpu_device *vdev) u32 *evt = evtq->base + (idx * IVPU_MMU_EVTQ_CMD_SIZE); evtq->prod = REGV_RD32(IVPU_MMU_REG_EVTQ_PROD_SEC); - if (!CIRC_CNT(IVPU_MMU_Q_IDX(evtq->prod), IVPU_MMU_Q_IDX(evtq->cons), IVPU_MMU_Q_COUNT)) + if (ivpu_mmu_queue_is_empty(evtq)) return NULL; evtq->cons = (evtq->cons + 1) & IVPU_MMU_Q_WRAP_MASK; - REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, evtq->cons); - return evt; } @@ -899,6 +909,7 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) } ivpu_mmu_user_context_mark_invalid(vdev, ssid); + REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, vdev->mmu->evtq.cons); } } From a7f31091ddf457352e3dd7ac183fdbd26b4dcd04 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Fri, 26 Jan 2024 13:28:00 +0100 Subject: [PATCH 148/270] accel/ivpu: Disable d3hot_delay on all NPU generations NPU does not require this delay regardless of the generation. All generations are integrated into the SOC. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240126122804.2169129-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 9418c73ee8ef..4b0640226986 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -480,9 +480,8 @@ static int ivpu_pci_init(struct ivpu_device *vdev) /* Clear any pending errors */ pcie_capability_clear_word(pdev, PCI_EXP_DEVSTA, 0x3f); - /* VPU 37XX does not require 10m D3hot delay */ - if (ivpu_hw_gen(vdev) == IVPU_HW_37XX) - pdev->d3hot_delay = 0; + /* NPU does not require 10m D3hot delay */ + pdev->d3hot_delay = 0; ret = pcim_enable_device(pdev); if (ret) { From a939c03d37788e4a9a645c88d5e5b6a475ba0fd5 Mon Sep 17 00:00:00 2001 From: Krystian Pradzynski Date: Fri, 26 Jan 2024 13:28:02 +0100 Subject: [PATCH 149/270] accel/ivpu/40xx: Enable D0i3 message All recent 40xx firmware already supports D0i3 entry message and this WA is no longer needed. Signed-off-by: Krystian Pradzynski Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20240126122804.2169129-6-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_fw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_fw.c b/drivers/accel/ivpu/ivpu_fw.c index 6576232f3e67..5fa8bd4603d5 100644 --- a/drivers/accel/ivpu/ivpu_fw.c +++ b/drivers/accel/ivpu/ivpu_fw.c @@ -222,7 +222,6 @@ ivpu_fw_init_wa(struct ivpu_device *vdev) const struct vpu_firmware_header *fw_hdr = (const void *)vdev->fw->file->data; if (IVPU_FW_CHECK_API_VER_LT(vdev, fw_hdr, BOOT, 3, 17) || - (ivpu_hw_gen(vdev) > IVPU_HW_37XX) || (ivpu_test_mode & IVPU_TEST_MODE_D0I3_MSG_DISABLE)) vdev->wa.disable_d0i3_msg = true; From 553099da45397914a995dce6307d6c26523c2567 Mon Sep 17 00:00:00 2001 From: Krystian Pradzynski Date: Fri, 26 Jan 2024 13:28:03 +0100 Subject: [PATCH 150/270] accel/ivpu/40xx: Stop passing SKU boot parameters to FW This parameter was never used by the 40xx FW. Signed-off-by: Krystian Pradzynski Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240126122804.2169129-7-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_hw_40xx.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c index 86b89b94f9f3..1c995307c113 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ b/drivers/accel/ivpu/ivpu_hw_40xx.c @@ -704,7 +704,6 @@ static int ivpu_hw_40xx_info_init(struct ivpu_device *vdev) { struct ivpu_hw_info *hw = vdev->hw; u32 tile_disable; - u32 tile_enable; u32 fuse; fuse = REGB_RD32(VPU_40XX_BUTTRESS_TILE_FUSE); @@ -725,10 +724,6 @@ static int ivpu_hw_40xx_info_init(struct ivpu_device *vdev) else ivpu_dbg(vdev, MISC, "Fuse: All %d tiles enabled\n", TILE_MAX_NUM); - tile_enable = (~tile_disable) & TILE_MAX_MASK; - - hw->sku = REG_SET_FLD_NUM(SKU, HW_ID, LNL_HW_ID, hw->sku); - hw->sku = REG_SET_FLD_NUM(SKU, TILE, tile_enable, hw->sku); hw->tile_fuse = tile_disable; hw->pll.profiling_freq = PLL_PROFILING_FREQ_DEFAULT; From 5f8408aca66772d3aa9b4831577b2ac5ec41bcd9 Mon Sep 17 00:00:00 2001 From: Grzegorz Trzebiatowski Date: Fri, 26 Jan 2024 13:28:04 +0100 Subject: [PATCH 151/270] accel/ivpu: Add job status for jobs aborted by the driver Add DRM_IVPU_JOB_STATUS_ABORTED to indicate that the job was aborted by the driver due to e.g. TDR or user context MMU faults. This will help UMD and tests distinguish if job was aborted by the FW or the driver. Signed-off-by: Grzegorz Trzebiatowski Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240126122804.2169129-8-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_job.c | 4 ++-- include/uapi/drm/ivpu_accel.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index 0440bee3ecaf..e70cfb859339 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -294,7 +294,7 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 return -ENOENT; if (job->file_priv->has_mmu_faults) - job_status = VPU_JSM_STATUS_ABORTED; + job_status = DRM_IVPU_JOB_STATUS_ABORTED; job->bos[CMD_BUF_IDX]->job_status = job_status; dma_fence_signal(job->done_fence); @@ -315,7 +315,7 @@ void ivpu_jobs_abort_all(struct ivpu_device *vdev) unsigned long id; xa_for_each(&vdev->submitted_jobs_xa, id, job) - ivpu_job_signal_and_destroy(vdev, id, VPU_JSM_STATUS_ABORTED); + ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED); } static int ivpu_job_submit(struct ivpu_job *job) diff --git a/include/uapi/drm/ivpu_accel.h b/include/uapi/drm/ivpu_accel.h index 63c49318a863..19a13468eca5 100644 --- a/include/uapi/drm/ivpu_accel.h +++ b/include/uapi/drm/ivpu_accel.h @@ -305,6 +305,7 @@ struct drm_ivpu_submit { /* drm_ivpu_bo_wait job status codes */ #define DRM_IVPU_JOB_STATUS_SUCCESS 0 +#define DRM_IVPU_JOB_STATUS_ABORTED 256 /** * struct drm_ivpu_bo_wait - Wait for BO to become inactive From 1ce2654d87e2fb91fea83b288bd9b2641045e42a Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Sat, 3 Feb 2024 13:31:33 +0800 Subject: [PATCH 152/270] net: stmmac: xgmac: fix a typo of register name in DPP safety handling DDPP is copied from Synopsys Data book: DDPP: Disable Data path Parity Protection. When it is 0x0, Data path Parity Protection is enabled. When it is 0x1, Data path Parity Protection is disabled. The macro name should be XGMAC_DPP_DISABLE. Fixes: 46eba193d04f ("net: stmmac: xgmac: fix handling of DPP safety error for DMA channels") Signed-off-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/20240203053133.1129236-1-0x1207@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h | 2 +- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index 5c67a3f89f08..6a2c7d22df1e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -304,7 +304,7 @@ #define XGMAC_TXCEIE BIT(0) #define XGMAC_MTL_ECC_INT_STATUS 0x000010cc #define XGMAC_MTL_DPP_CONTROL 0x000010e0 -#define XGMAC_DDPP_DISABLE BIT(0) +#define XGMAC_DPP_DISABLE BIT(0) #define XGMAC_MTL_TXQ_OPMODE(x) (0x00001100 + (0x80 * (x))) #define XGMAC_TQS GENMASK(25, 16) #define XGMAC_TQS_SHIFT 16 diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 04d7c4dc2e35..323c57f03c93 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -928,7 +928,7 @@ dwxgmac3_safety_feat_config(void __iomem *ioaddr, unsigned int asp, /* 5. Enable Data Path Parity Protection */ value = readl(ioaddr + XGMAC_MTL_DPP_CONTROL); /* already enabled by default, explicit enable it again */ - value &= ~XGMAC_DDPP_DISABLE; + value &= ~XGMAC_DPP_DISABLE; writel(value, ioaddr + XGMAC_MTL_DPP_CONTROL); return 0; From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 23 Jan 2024 18:58:26 +0100 Subject: [PATCH 153/270] blk-wbt: Fix detection of dirty-throttled tasks The detection of dirty-throttled tasks in blk-wbt has been subtly broken since its beginning in 2016. Namely if we are doing cgroup writeback and the throttled task is not in the root cgroup, balance_dirty_pages() will set dirty_sleep for the non-root bdi_writeback structure. However blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback structure. Thus detection of recently throttled tasks is not working in this case (we noticed this when we switched to cgroup v2 and suddently writeback was slow). Since blk-wbt has no easy way to get to proper bdi_writeback and furthermore its intention has always been to work on the whole device rather than on individual cgroups, just move the dirty_sleep timestamp from bdi_writeback to backing_dev_info. That fixes the checking for recently throttled task and saves memory for everybody as a bonus. CC: stable@vger.kernel.org Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz [axboe: fixup indentation errors] Signed-off-by: Jens Axboe --- block/blk-wbt.c | 4 ++-- include/linux/backing-dev-defs.h | 7 +++++-- mm/backing-dev.c | 2 +- mm/page-writeback.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 5ba3cd574eac..0c0e270a8265 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; - return time_before(jiffies, wb->dirty_sleep + HZ); + return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ae12696ec492..2ad261082bba 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -141,8 +141,6 @@ struct bdi_writeback { struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ - unsigned long dirty_sleep; /* last wait */ - struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK @@ -179,6 +177,11 @@ struct backing_dev_info { * any dirty wbs, which is depended upon by bdi_has_dirty(). */ atomic_long_t tot_write_bandwidth; + /* + * Jiffies when last process was dirty throttled on this bdi. Used by + * blk-wbt. + */ + unsigned long last_bdp_sleep; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct list_head wb_list; /* list of all wbs */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb1..e039d05304dd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); - wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); if (err) @@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); + bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cd4e4ae77c40..cc37fa7f3364 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1921,7 +1921,7 @@ pause: break; } __set_current_state(TASK_KILLABLE); - wb->dirty_sleep = now; + bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; From ba58f873cdeec30b6da48e28dd5782c5a3e1371b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Feb 2024 15:18:31 -0800 Subject: [PATCH 154/270] KVM: selftests: Fix a semaphore imbalance in the dirty ring logging test When finishing the final iteration of dirty_log_test testcase, set host_quit _before_ the final "continue" so that the vCPU worker doesn't run an extra iteration, and delete the hack-a-fix of an extra "continue" from the dirty ring testcase. This fixes a bug where the extra post to sem_vcpu_cont may not be consumed, which results in failures in subsequent runs of the testcases. The bug likely was missed during development as x86 supports only a single "guest mode", i.e. there aren't any subsequent testcases after the dirty ring test, because for_each_guest_mode() only runs a single iteration. For the regular dirty log testcases, letting the vCPU run one extra iteration is a non-issue as the vCPU worker waits on sem_vcpu_cont if and only if the worker is explicitly told to stop (vcpu_sync_stop_requested). But for the dirty ring test, which needs to periodically stop the vCPU to reap the dirty ring, letting the vCPU resume the guest _after_ the last iteration means the vCPU will get stuck without an extra "continue". However, blindly firing off an post to sem_vcpu_cont isn't guaranteed to be consumed, e.g. if the vCPU worker sees host_quit==true before resuming the guest. This results in a dangling sem_vcpu_cont, which leads to subsequent iterations getting out of sync, as the vCPU worker will continue on before the main task is ready for it to resume the guest, leading to a variety of asserts, e.g. ==== Test Assertion Failure ==== dirty_log_test.c:384: dirty_ring_vcpu_ring_full pid=14854 tid=14854 errno=22 - Invalid argument 1 0x00000000004033eb: dirty_ring_collect_dirty_pages at dirty_log_test.c:384 2 0x0000000000402d27: log_mode_collect_dirty_pages at dirty_log_test.c:505 3 (inlined by) run_test at dirty_log_test.c:802 4 0x0000000000403dc7: for_each_guest_mode at guest_modes.c:100 5 0x0000000000401dff: main at dirty_log_test.c:941 (discriminator 3) 6 0x0000ffff9be173c7: ?? ??:0 7 0x0000ffff9be1749f: ?? ??:0 8 0x000000000040206f: _start at ??:? Didn't continue vcpu even without ring full Alternatively, the test could simply reset the semaphores before each testcase, but papering over hacks with more hacks usually ends in tears. Reported-by: Shaoqin Huang Fixes: 84292e565951 ("KVM: selftests: Add dirty ring buffer test") Reviewed-by: Peter Xu Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240202231831.354848-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_test.c | 50 +++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index babea97b31a4..eaad5b20854c 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -376,7 +376,10 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, cleared = kvm_vm_reset_dirty_ring(vcpu->vm); - /* Cleared pages should be the same as collected */ + /* + * Cleared pages should be the same as collected, as KVM is supposed to + * clear only the entries that have been harvested. + */ TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch " "with collected (%u)", cleared, count); @@ -415,12 +418,6 @@ static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) } } -static void dirty_ring_before_vcpu_join(void) -{ - /* Kick another round of vcpu just to make sure it will quit */ - sem_post(&sem_vcpu_cont); -} - struct log_mode { const char *name; /* Return true if this mode is supported, otherwise false */ @@ -433,7 +430,6 @@ struct log_mode { uint32_t *ring_buf_idx); /* Hook to call when after each vcpu run */ void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err); - void (*before_vcpu_join) (void); } log_modes[LOG_MODE_NUM] = { { .name = "dirty-log", @@ -452,7 +448,6 @@ struct log_mode { .supported = dirty_ring_supported, .create_vm_done = dirty_ring_create_vm_done, .collect_dirty_pages = dirty_ring_collect_dirty_pages, - .before_vcpu_join = dirty_ring_before_vcpu_join, .after_vcpu_run = dirty_ring_after_vcpu_run, }, }; @@ -513,14 +508,6 @@ static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) mode->after_vcpu_run(vcpu, ret, err); } -static void log_mode_before_vcpu_join(void) -{ - struct log_mode *mode = &log_modes[host_log_mode]; - - if (mode->before_vcpu_join) - mode->before_vcpu_join(); -} - static void generate_random_array(uint64_t *guest_array, uint64_t size) { uint64_t i; @@ -719,6 +706,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; unsigned long *bmap; uint32_t ring_buf_idx = 0; + int sem_val; if (!log_mode_supported()) { print_skip("Log mode '%s' not supported", @@ -788,12 +776,22 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Start the iterations */ iteration = 1; sync_global_to_guest(vm, iteration); - host_quit = false; + WRITE_ONCE(host_quit, false); host_dirty_count = 0; host_clear_count = 0; host_track_next_count = 0; WRITE_ONCE(dirty_ring_vcpu_ring_full, false); + /* + * Ensure the previous iteration didn't leave a dangling semaphore, i.e. + * that the main task and vCPU worker were synchronized and completed + * verification of all iterations. + */ + sem_getvalue(&sem_vcpu_stop, &sem_val); + TEST_ASSERT_EQ(sem_val, 0); + sem_getvalue(&sem_vcpu_cont, &sem_val); + TEST_ASSERT_EQ(sem_val, 0); + pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); while (iteration < p->iterations) { @@ -819,15 +817,21 @@ static void run_test(enum vm_guest_mode mode, void *arg) assert(host_log_mode == LOG_MODE_DIRTY_RING || atomic_read(&vcpu_sync_stop_requested) == false); vm_dirty_log_verify(mode, bmap); - sem_post(&sem_vcpu_cont); - iteration++; + /* + * Set host_quit before sem_vcpu_cont in the final iteration to + * ensure that the vCPU worker doesn't resume the guest. As + * above, the dirty ring test may stop and wait even when not + * explicitly request to do so, i.e. would hang waiting for a + * "continue" if it's allowed to resume the guest. + */ + if (++iteration == p->iterations) + WRITE_ONCE(host_quit, true); + + sem_post(&sem_vcpu_cont); sync_global_to_guest(vm, iteration); } - /* Tell the vcpu thread to quit */ - host_quit = true; - log_mode_before_vcpu_join(); pthread_join(vcpu_thread, NULL); pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " From 6fd78beed0213655c16ef728af0caec6d5ae4c73 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 14:27:28 -0800 Subject: [PATCH 155/270] KVM: selftests: Don't assert on exact number of 4KiB in dirty log split test Drop dirty_log_page_splitting_test's assertion that the number of 4KiB pages remains the same across dirty logging being enabled and disabled, as the test doesn't guarantee that mappings outside of the memslots being dirty logged are stable, e.g. KVM's mappings for code and pages in memslot0 can be zapped by things like NUMA balancing. To preserve the spirit of the check, assert that (a) the number of 4KiB pages after splitting is _at least_ the number of 4KiB pages across all memslots under test, and (b) the number of hugepages before splitting adds up to the number of pages across all memslots under test. (b) is a little tenuous as it relies on memslot0 being incompatible with transparent hugepages, but that holds true for now as selftests explicitly madvise() MADV_NOHUGEPAGE for memslot0 (__vm_create() unconditionally specifies the backing type as VM_MEM_SRC_ANONYMOUS). Reported-by: Yi Lai Reported-by: Tao Su Reviewed-by: Tao Su Link: https://lore.kernel.org/r/20240131222728.4100079-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../x86_64/dirty_log_page_splitting_test.c | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c index 634c6bfcd572..ee3b384b991c 100644 --- a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c +++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c @@ -92,7 +92,6 @@ static void run_test(enum vm_guest_mode mode, void *unused) uint64_t host_num_pages; uint64_t pages_per_slot; int i; - uint64_t total_4k_pages; struct kvm_page_stats stats_populated; struct kvm_page_stats stats_dirty_logging_enabled; struct kvm_page_stats stats_dirty_pass[ITERATIONS]; @@ -107,6 +106,9 @@ static void run_test(enum vm_guest_mode mode, void *unused) guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_num_pages = vm_num_host_pages(mode, guest_num_pages); pages_per_slot = host_num_pages / SLOTS; + TEST_ASSERT_EQ(host_num_pages, pages_per_slot * SLOTS); + TEST_ASSERT(!(host_num_pages % 512), + "Number of pages, '%lu' not a multiple of 2MiB", host_num_pages); bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot); @@ -165,10 +167,8 @@ static void run_test(enum vm_guest_mode mode, void *unused) memstress_free_bitmaps(bitmaps, SLOTS); memstress_destroy_vm(vm); - /* Make assertions about the page counts. */ - total_4k_pages = stats_populated.pages_4k; - total_4k_pages += stats_populated.pages_2m * 512; - total_4k_pages += stats_populated.pages_1g * 512 * 512; + TEST_ASSERT_EQ((stats_populated.pages_2m * 512 + + stats_populated.pages_1g * 512 * 512), host_num_pages); /* * Check that all huge pages were split. Since large pages can only @@ -180,19 +180,22 @@ static void run_test(enum vm_guest_mode mode, void *unused) */ if (dirty_log_manual_caps) { TEST_ASSERT_EQ(stats_clear_pass[0].hugepages, 0); - TEST_ASSERT_EQ(stats_clear_pass[0].pages_4k, total_4k_pages); + TEST_ASSERT(stats_clear_pass[0].pages_4k >= host_num_pages, + "Expected at least '%lu' 4KiB pages, found only '%lu'", + host_num_pages, stats_clear_pass[0].pages_4k); TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages); } else { TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0); - TEST_ASSERT_EQ(stats_dirty_logging_enabled.pages_4k, total_4k_pages); + TEST_ASSERT(stats_dirty_logging_enabled.pages_4k >= host_num_pages, + "Expected at least '%lu' 4KiB pages, found only '%lu'", + host_num_pages, stats_dirty_logging_enabled.pages_4k); } /* * Once dirty logging is disabled and the vCPUs have touched all their - * memory again, the page counts should be the same as they were + * memory again, the hugepage counts should be the same as they were * right after initial population of memory. */ - TEST_ASSERT_EQ(stats_populated.pages_4k, stats_repopulated.pages_4k); TEST_ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m); TEST_ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g); } From 7ed4380009e96d9e9c605e12822e987b35b05648 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 7 Feb 2024 08:01:17 +0900 Subject: [PATCH 156/270] firewire: core: send bus reset promptly on gap count error If we are bus manager and the bus has inconsistent gap counts, send a bus reset immediately instead of trying to read the root node's config ROM first. Otherwise, we could spend a lot of time trying to read the config ROM but never succeeding. This eliminates a 50+ second delay before the FireWire bus is usable after a newly connected device is powered on in certain circumstances. The delay occurs if a gap count inconsistency occurs, we are not the root node, and we become bus manager. One scenario that causes this is with a TI XIO2213B OHCI, the first time a Sony DSR-25 is powered on after being connected to the FireWire cable. In this configuration, the Linux box will not receive the initial PHY configuration packet sent by the DSR-25 as IRM, resulting in the DSR-25 having a gap count of 44 while the Linux box has a gap count of 63. FireWire devices have a gap count parameter, which is set to 63 on power-up and can be changed with a PHY configuration packet. This determines the duration of the subaction and arbitration gaps. For reliable communication, all nodes on a FireWire bus must have the same gap count. A node may have zero or more of the following roles: root node, bus manager (BM), isochronous resource manager (IRM), and cycle master. Unless a root node was forced with a PHY configuration packet, any node might become root node after a bus reset. Only the root node can become cycle master. If the root node is not cycle master capable, the BM or IRM should force a change of root node. After a bus reset, each node sends a self-ID packet, which contains its current gap count. A single bus reset does not change the gap count, but two bus resets in a row will set the gap count to 63. Because a consistent gap count is required for reliable communication, IEEE 1394a-2000 requires that the bus manager generate a bus reset if it detects that the gap count is inconsistent. When the gap count is inconsistent, build_tree() will notice this after the self identification process. It will set card->gap_count to the invalid value 0. If we become bus master, this will force bm_work() to send a bus reset when it performs gap count optimization. After a bus reset, there is no bus manager. We will almost always try to become bus manager. Once we become bus manager, we will first determine whether the root node is cycle master capable. Then, we will determine if the gap count should be changed. If either the root node or the gap count should be changed, we will generate a bus reset. To determine if the root node is cycle master capable, we read its configuration ROM. bm_work() will wait until we have finished trying to read the configuration ROM. However, an inconsistent gap count can make this take a long time. read_config_rom() will read the first few quadlets from the config ROM. Due to the gap count inconsistency, eventually one of the reads will time out. When read_config_rom() fails, fw_device_init() calls it again until MAX_RETRIES is reached. This takes 50+ seconds. Once we give up trying to read the configuration ROM, bm_work() will wake up, assume that the root node is not cycle master capable, and do a bus reset. Hopefully, this will resolve the gap count inconsistency. This change makes bm_work() check for an inconsistent gap count before waiting for the root node's configuration ROM. If the gap count is inconsistent, bm_work() will immediately do a bus reset. This eliminates the 50+ second delay and rapidly brings the bus to a working state. I considered that if the gap count is inconsistent, a PHY configuration packet might not be successful, so it could be desirable to skip the PHY configuration packet before the bus reset in this case. However, IEEE 1394a-2000 and IEEE 1394-2008 say that the bus manager may transmit a PHY configuration packet before a bus reset when correcting a gap count error. Since the standard endorses this, I decided it's safe to retain the PHY configuration packet transmission. Normally, after a topology change, we will reset the bus a maximum of 5 times to change the root node and perform gap count optimization. However, if there is a gap count inconsistency, we must always generate a bus reset. Otherwise the gap count inconsistency will persist and communication will be unreliable. For that reason, if there is a gap count inconstency, we generate a bus reset even if we already reached the 5 reset limit. Signed-off-by: Adam Goldman Reference: https://sourceforge.net/p/linux1394/mailman/message/58727806/ Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 6ac5ff20a2fe..8aaa7fcb2630 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -429,7 +429,23 @@ static void bm_work(struct work_struct *work) */ card->bm_generation = generation; - if (root_device == NULL) { + if (card->gap_count == 0) { + /* + * If self IDs have inconsistent gap counts, do a + * bus reset ASAP. The config rom read might never + * complete, so don't wait for it. However, still + * send a PHY configuration packet prior to the + * bus reset. The PHY configuration packet might + * fail, but 1394-2008 8.4.5.2 explicitly permits + * it in this case, so it should be safe to try. + */ + new_root_id = local_id; + /* + * We must always send a bus reset if the gap count + * is inconsistent, so bypass the 5-reset limit. + */ + card->bm_retries = 0; + } else if (root_device == NULL) { /* * Either link_on is false, or we failed to read the * config rom. In either case, pick another root. From 97cf301fa42e8ea6e0a24de97bc0abcdc87d9504 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sun, 28 Jan 2024 13:04:05 +0100 Subject: [PATCH 157/270] riscv: Flush the tlb when a page directory is freed The riscv privileged specification mandates to flush the TLB whenever a page directory is modified, so add that to tlb_flush(). Fixes: c5e9b2c2ae82 ("riscv: Improve tlb_flush()") Signed-off-by: Alexandre Ghiti Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240128120405.25876-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/tlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 1eb5682b2af6..50b63b5c15bd 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -16,7 +16,7 @@ static void tlb_flush(struct mmu_gather *tlb); static inline void tlb_flush(struct mmu_gather *tlb) { #ifdef CONFIG_MMU - if (tlb->fullmm || tlb->need_flush_all) + if (tlb->fullmm || tlb->need_flush_all || tlb->freed_tables) flush_tlb_mm(tlb->mm); else flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, From 1279f9d9dec2d7462823a18c29ad61359e0a007d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 3 Feb 2024 10:31:49 -0800 Subject: [PATCH 158/270] af_unix: Call kfree_skb() for dead unix_(sk)->oob_skb in GC. syzbot reported a warning [0] in __unix_gc() with a repro, which creates a socketpair and sends one socket's fd to itself using the peer. socketpair(AF_UNIX, SOCK_STREAM, 0, [3, 4]) = 0 sendmsg(4, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\360", iov_len=1}], msg_iovlen=1, msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[3]}], msg_controllen=24, msg_flags=0}, MSG_OOB|MSG_PROBE|MSG_DONTWAIT|MSG_ZEROCOPY) = 1 This forms a self-cyclic reference that GC should finally untangle but does not due to lack of MSG_OOB handling, resulting in memory leak. Recently, commit 11498715f266 ("af_unix: Remove io_uring code for GC.") removed io_uring's dead code in GC and revealed the problem. The code was executed at the final stage of GC and unconditionally moved all GC candidates from gc_candidates to gc_inflight_list. That papered over the reported problem by always making the following WARN_ON_ONCE(!list_empty(&gc_candidates)) false. The problem has been there since commit 2aab4b969002 ("af_unix: fix struct pid leaks in OOB support") added full scm support for MSG_OOB while fixing another bug. To fix this problem, we must call kfree_skb() for unix_sk(sk)->oob_skb if the socket still exists in gc_candidates after purging collected skb. Then, we need to set NULL to oob_skb before calling kfree_skb() because it calls last fput() and triggers unix_release_sock(), where we call duplicate kfree_skb(u->oob_skb) if not NULL. Note that the leaked socket remained being linked to a global list, so kmemleak also could not detect it. We need to check /proc/net/protocol to notice the unfreed socket. [0]: WARNING: CPU: 0 PID: 2863 at net/unix/garbage.c:345 __unix_gc+0xc74/0xe80 net/unix/garbage.c:345 Modules linked in: CPU: 0 PID: 2863 Comm: kworker/u4:11 Not tainted 6.8.0-rc1-syzkaller-00583-g1701940b1a02 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Workqueue: events_unbound __unix_gc RIP: 0010:__unix_gc+0xc74/0xe80 net/unix/garbage.c:345 Code: 8b 5c 24 50 e9 86 f8 ff ff e8 f8 e4 22 f8 31 d2 48 c7 c6 30 6a 69 89 4c 89 ef e8 97 ef ff ff e9 80 f9 ff ff e8 dd e4 22 f8 90 <0f> 0b 90 e9 7b fd ff ff 48 89 df e8 5c e7 7c f8 e9 d3 f8 ff ff e8 RSP: 0018:ffffc9000b03fba0 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffffc9000b03fc10 RCX: ffffffff816c493e RDX: ffff88802c02d940 RSI: ffffffff896982f3 RDI: ffffc9000b03fb30 RBP: ffffc9000b03fce0 R08: 0000000000000001 R09: fffff52001607f66 R10: 0000000000000003 R11: 0000000000000002 R12: dffffc0000000000 R13: ffffc9000b03fc10 R14: ffffc9000b03fc10 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880b9400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005559c8677a60 CR3: 000000000d57a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: process_one_work+0x889/0x15e0 kernel/workqueue.c:2633 process_scheduled_works kernel/workqueue.c:2706 [inline] worker_thread+0x8b9/0x12a0 kernel/workqueue.c:2787 kthread+0x2c6/0x3b0 kernel/kthread.c:388 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242 Reported-by: syzbot+fa3ef895554bdbfd1183@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=fa3ef895554bdbfd1183 Fixes: 2aab4b969002 ("af_unix: fix struct pid leaks in OOB support") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240203183149.63573-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2405f0f9af31..8f63f0b4bf01 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -314,6 +314,17 @@ void unix_gc(void) /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + list_for_each_entry_safe(u, next, &gc_candidates, link) { + struct sk_buff *skb = u->oob_skb; + + if (skb) { + u->oob_skb = NULL; + kfree_skb(skb); + } + } +#endif + spin_lock(&unix_gc_lock); /* There could be io_uring registered files, just push them back to From 58086721b7781c3e35b19c9b78c8f5a791070ba3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 5 Feb 2024 18:11:14 +0100 Subject: [PATCH 159/270] devlink: avoid potential loop in devlink_rel_nested_in_notify_work() In case devlink_rel_nested_in_notify_work() can not take the devlink lock mutex. Convert the work to delayed work and in case of reschedule do it jiffie later and avoid potential looping. Suggested-by: Paolo Abeni Fixes: c137743bce02 ("devlink: introduce object and nested devlink relationship infra") Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20240205171114.338679-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/devlink/core.c b/net/devlink/core.c index 4275a2bc6d8e..6a58342752b4 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -46,7 +46,7 @@ struct devlink_rel { u32 obj_index; devlink_rel_notify_cb_t *notify_cb; devlink_rel_cleanup_cb_t *cleanup_cb; - struct work_struct notify_work; + struct delayed_work notify_work; } nested_in; }; @@ -70,7 +70,7 @@ static void __devlink_rel_put(struct devlink_rel *rel) static void devlink_rel_nested_in_notify_work(struct work_struct *work) { struct devlink_rel *rel = container_of(work, struct devlink_rel, - nested_in.notify_work); + nested_in.notify_work.work); struct devlink *devlink; devlink = devlinks_xa_get(rel->nested_in.devlink_index); @@ -96,13 +96,13 @@ rel_put: return; reschedule_work: - schedule_work(&rel->nested_in.notify_work); + schedule_delayed_work(&rel->nested_in.notify_work, 1); } static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel) { __devlink_rel_get(rel); - schedule_work(&rel->nested_in.notify_work); + schedule_delayed_work(&rel->nested_in.notify_work, 0); } static struct devlink_rel *devlink_rel_alloc(void) @@ -123,8 +123,8 @@ static struct devlink_rel *devlink_rel_alloc(void) } refcount_set(&rel->refcount, 1); - INIT_WORK(&rel->nested_in.notify_work, - &devlink_rel_nested_in_notify_work); + INIT_DELAYED_WORK(&rel->nested_in.notify_work, + &devlink_rel_nested_in_notify_work); return rel; } From cb88cb53badb8aeb3955ad6ce80b07b598e310b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Feb 2024 17:10:04 +0000 Subject: [PATCH 160/270] ppp_async: limit MRU to 64K syzbot triggered a warning [1] in __alloc_pages(): WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp) Willem fixed a similar issue in commit c0a2a1b0d631 ("ppp: limit MRU to 64K") Adopt the same sanity check for ppp_async_ioctl(PPPIOCSMRU) [1]: WARNING: CPU: 1 PID: 11 at mm/page_alloc.c:4543 __alloc_pages+0x308/0x698 mm/page_alloc.c:4543 Modules linked in: CPU: 1 PID: 11 Comm: kworker/u4:0 Not tainted 6.8.0-rc2-syzkaller-g41bccc98fb79 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Workqueue: events_unbound flush_to_ldisc pstate: 204000c5 (nzCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __alloc_pages+0x308/0x698 mm/page_alloc.c:4543 lr : __alloc_pages+0xc8/0x698 mm/page_alloc.c:4537 sp : ffff800093967580 x29: ffff800093967660 x28: ffff8000939675a0 x27: dfff800000000000 x26: ffff70001272ceb4 x25: 0000000000000000 x24: ffff8000939675c0 x23: 0000000000000000 x22: 0000000000060820 x21: 1ffff0001272ceb8 x20: ffff8000939675e0 x19: 0000000000000010 x18: ffff800093967120 x17: ffff800083bded5c x16: ffff80008ac97500 x15: 0000000000000005 x14: 1ffff0001272cebc x13: 0000000000000000 x12: 0000000000000000 x11: ffff70001272cec1 x10: 1ffff0001272cec0 x9 : 0000000000000001 x8 : ffff800091c91000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 00000000ffffffff x4 : 0000000000000000 x3 : 0000000000000020 x2 : 0000000000000008 x1 : 0000000000000000 x0 : ffff8000939675e0 Call trace: __alloc_pages+0x308/0x698 mm/page_alloc.c:4543 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] __kmalloc_large_node+0xbc/0x1fc mm/slub.c:3926 __do_kmalloc_node mm/slub.c:3969 [inline] __kmalloc_node_track_caller+0x418/0x620 mm/slub.c:4001 kmalloc_reserve+0x17c/0x23c net/core/skbuff.c:590 __alloc_skb+0x1c8/0x3d8 net/core/skbuff.c:651 __netdev_alloc_skb+0xb8/0x3e8 net/core/skbuff.c:715 netdev_alloc_skb include/linux/skbuff.h:3235 [inline] dev_alloc_skb include/linux/skbuff.h:3248 [inline] ppp_async_input drivers/net/ppp/ppp_async.c:863 [inline] ppp_asynctty_receive+0x588/0x186c drivers/net/ppp/ppp_async.c:341 tty_ldisc_receive_buf+0x12c/0x15c drivers/tty/tty_buffer.c:390 tty_port_default_receive_buf+0x74/0xac drivers/tty/tty_port.c:37 receive_buf drivers/tty/tty_buffer.c:444 [inline] flush_to_ldisc+0x284/0x6e4 drivers/tty/tty_buffer.c:494 process_one_work+0x694/0x1204 kernel/workqueue.c:2633 process_scheduled_works kernel/workqueue.c:2706 [inline] worker_thread+0x938/0xef4 kernel/workqueue.c:2787 kthread+0x288/0x310 kernel/kthread.c:388 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:860 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+c5da1f087c9e4ec6c933@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240205171004.1059724-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/ppp_async.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c index 840da924708b..125793d8aefa 100644 --- a/drivers/net/ppp/ppp_async.c +++ b/drivers/net/ppp/ppp_async.c @@ -460,6 +460,10 @@ ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) case PPPIOCSMRU: if (get_user(val, p)) break; + if (val > U16_MAX) { + err = -EINVAL; + break; + } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; From 38cc3c6dcc09dc3a1800b5ec22aef643ca11eab8 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Sat, 3 Feb 2024 20:09:27 +0100 Subject: [PATCH 161/270] net: stmmac: protect updates of 64-bit statistics counters As explained by a comment in , write side of struct u64_stats_sync must ensure mutual exclusion, or one seqcount update could be lost on 32-bit platforms, thus blocking readers forever. Such lockups have been observed in real world after stmmac_xmit() on one CPU raced with stmmac_napi_poll_tx() on another CPU. To fix the issue without introducing a new lock, split the statics into three parts: 1. fields updated only under the tx queue lock, 2. fields updated only during NAPI poll, 3. fields updated only from interrupt context, Updates to fields in the first two groups are already serialized through other locks. It is sufficient to split the existing struct u64_stats_sync so that each group has its own. Note that tx_set_ic_bit is updated from both contexts. Split this counter so that each context gets its own, and calculate their sum to get the total value in stmmac_get_ethtool_stats(). For the third group, multiple interrupts may be processed by different CPUs at the same time, but interrupts on the same CPU will not nest. Move fields from this group to a newly created per-cpu struct stmmac_pcpu_stats. Fixes: 133466c3bbe1 ("net: stmmac: use per-queue 64 bit statistics where necessary") Link: https://lore.kernel.org/netdev/Za173PhviYg-1qIn@torres.zugschlus.de/t/ Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik Reviewed-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 56 +++++--- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 15 +- .../net/ethernet/stmicro/stmmac/dwmac4_lib.c | 15 +- .../net/ethernet/stmicro/stmmac/dwmac_lib.c | 15 +- .../ethernet/stmicro/stmmac/dwxgmac2_dma.c | 15 +- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 125 ++++++++++------ .../net/ethernet/stmicro/stmmac/stmmac_main.c | 133 +++++++++--------- 7 files changed, 219 insertions(+), 155 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index b4f60ab078d6..5ba606a596e7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -59,28 +59,51 @@ #undef FRAME_FILTER_DEBUG /* #define FRAME_FILTER_DEBUG */ +struct stmmac_q_tx_stats { + u64_stats_t tx_bytes; + u64_stats_t tx_set_ic_bit; + u64_stats_t tx_tso_frames; + u64_stats_t tx_tso_nfrags; +}; + +struct stmmac_napi_tx_stats { + u64_stats_t tx_packets; + u64_stats_t tx_pkt_n; + u64_stats_t poll; + u64_stats_t tx_clean; + u64_stats_t tx_set_ic_bit; +}; + struct stmmac_txq_stats { - u64 tx_bytes; - u64 tx_packets; - u64 tx_pkt_n; - u64 tx_normal_irq_n; - u64 napi_poll; - u64 tx_clean; - u64 tx_set_ic_bit; - u64 tx_tso_frames; - u64 tx_tso_nfrags; - struct u64_stats_sync syncp; + /* Updates protected by tx queue lock. */ + struct u64_stats_sync q_syncp; + struct stmmac_q_tx_stats q; + + /* Updates protected by NAPI poll logic. */ + struct u64_stats_sync napi_syncp; + struct stmmac_napi_tx_stats napi; } ____cacheline_aligned_in_smp; +struct stmmac_napi_rx_stats { + u64_stats_t rx_bytes; + u64_stats_t rx_packets; + u64_stats_t rx_pkt_n; + u64_stats_t poll; +}; + struct stmmac_rxq_stats { - u64 rx_bytes; - u64 rx_packets; - u64 rx_pkt_n; - u64 rx_normal_irq_n; - u64 napi_poll; - struct u64_stats_sync syncp; + /* Updates protected by NAPI poll logic. */ + struct u64_stats_sync napi_syncp; + struct stmmac_napi_rx_stats napi; } ____cacheline_aligned_in_smp; +/* Updates on each CPU protected by not allowing nested irqs. */ +struct stmmac_pcpu_stats { + struct u64_stats_sync syncp; + u64_stats_t rx_normal_irq_n[MTL_MAX_TX_QUEUES]; + u64_stats_t tx_normal_irq_n[MTL_MAX_RX_QUEUES]; +}; + /* Extra statistic and debug information exposed by ethtool */ struct stmmac_extra_stats { /* Transmit errors */ @@ -205,6 +228,7 @@ struct stmmac_extra_stats { /* per queue statistics */ struct stmmac_txq_stats txq_stats[MTL_MAX_TX_QUEUES]; struct stmmac_rxq_stats rxq_stats[MTL_MAX_RX_QUEUES]; + struct stmmac_pcpu_stats __percpu *pcpu_stats; unsigned long rx_dropped; unsigned long rx_errors; unsigned long tx_dropped; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 137741b94122..b21d99faa2d0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -441,8 +441,7 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, struct stmmac_extra_stats *x, u32 chan, u32 dir) { - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; u32 v; @@ -455,9 +454,9 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, if (v & EMAC_TX_INT) { ret |= handle_tx; - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); } if (v & EMAC_TX_DMA_STOP_INT) @@ -479,9 +478,9 @@ static int sun8i_dwmac_dma_interrupt(struct stmmac_priv *priv, if (v & EMAC_RX_INT) { ret |= handle_rx; - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); } if (v & EMAC_RX_BUF_UA_INT) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c index 9470d3fd2ded..0d185e54eb7e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c @@ -171,8 +171,7 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs; u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan)); u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan)); - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; if (dir == DMA_DIR_RX) @@ -201,15 +200,15 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, } /* TX/RX NORMAL interrupts */ if (likely(intr_status & DMA_CHAN_STATUS_RI)) { - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_rx; } if (likely(intr_status & DMA_CHAN_STATUS_TI)) { - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_tx; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c index 7907d62d3437..85e18f9a22f9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_lib.c @@ -162,8 +162,7 @@ static void show_rx_process_state(unsigned int status) int dwmac_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, struct stmmac_extra_stats *x, u32 chan, u32 dir) { - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); int ret = 0; /* read the status register (CSR5) */ u32 intr_status = readl(ioaddr + DMA_STATUS); @@ -215,16 +214,16 @@ int dwmac_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr, u32 value = readl(ioaddr + DMA_INTR_ENA); /* to schedule NAPI on real RIE event. */ if (likely(value & DMA_INTR_ENA_RIE)) { - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_rx; } } if (likely(intr_status & DMA_STATUS_TI)) { - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_tx; } if (unlikely(intr_status & DMA_STATUS_ERI)) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 3cde695fec91..dd2ab6185c40 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -337,8 +337,7 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, struct stmmac_extra_stats *x, u32 chan, u32 dir) { - struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[chan]; - struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[chan]; + struct stmmac_pcpu_stats *stats = this_cpu_ptr(priv->xstats.pcpu_stats); u32 intr_status = readl(ioaddr + XGMAC_DMA_CH_STATUS(chan)); u32 intr_en = readl(ioaddr + XGMAC_DMA_CH_INT_EN(chan)); int ret = 0; @@ -367,15 +366,15 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, /* TX/RX NORMAL interrupts */ if (likely(intr_status & XGMAC_NIS)) { if (likely(intr_status & XGMAC_RI)) { - u64_stats_update_begin(&rxq_stats->syncp); - rxq_stats->rx_normal_irq_n++; - u64_stats_update_end(&rxq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_rx; } if (likely(intr_status & (XGMAC_TI | XGMAC_TBU))) { - u64_stats_update_begin(&txq_stats->syncp); - txq_stats->tx_normal_irq_n++; - u64_stats_update_end(&txq_stats->syncp); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->tx_normal_irq_n[chan]); + u64_stats_update_end(&stats->syncp); ret |= handle_tx; } } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 42d27b97dd1d..ec44becf0e2d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -549,44 +549,79 @@ stmmac_set_pauseparam(struct net_device *netdev, } } +static u64 stmmac_get_rx_normal_irq_n(struct stmmac_priv *priv, int q) +{ + u64 total; + int cpu; + + total = 0; + for_each_possible_cpu(cpu) { + struct stmmac_pcpu_stats *pcpu; + unsigned int start; + u64 irq_n; + + pcpu = per_cpu_ptr(priv->xstats.pcpu_stats, cpu); + do { + start = u64_stats_fetch_begin(&pcpu->syncp); + irq_n = u64_stats_read(&pcpu->rx_normal_irq_n[q]); + } while (u64_stats_fetch_retry(&pcpu->syncp, start)); + total += irq_n; + } + return total; +} + +static u64 stmmac_get_tx_normal_irq_n(struct stmmac_priv *priv, int q) +{ + u64 total; + int cpu; + + total = 0; + for_each_possible_cpu(cpu) { + struct stmmac_pcpu_stats *pcpu; + unsigned int start; + u64 irq_n; + + pcpu = per_cpu_ptr(priv->xstats.pcpu_stats, cpu); + do { + start = u64_stats_fetch_begin(&pcpu->syncp); + irq_n = u64_stats_read(&pcpu->tx_normal_irq_n[q]); + } while (u64_stats_fetch_retry(&pcpu->syncp, start)); + total += irq_n; + } + return total; +} + static void stmmac_get_per_qstats(struct stmmac_priv *priv, u64 *data) { u32 tx_cnt = priv->plat->tx_queues_to_use; u32 rx_cnt = priv->plat->rx_queues_to_use; unsigned int start; - int q, stat; - char *p; + int q; for (q = 0; q < tx_cnt; q++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[q]; - struct stmmac_txq_stats snapshot; + u64 pkt_n; do { - start = u64_stats_fetch_begin(&txq_stats->syncp); - snapshot = *txq_stats; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); + start = u64_stats_fetch_begin(&txq_stats->napi_syncp); + pkt_n = u64_stats_read(&txq_stats->napi.tx_pkt_n); + } while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start)); - p = (char *)&snapshot + offsetof(struct stmmac_txq_stats, tx_pkt_n); - for (stat = 0; stat < STMMAC_TXQ_STATS; stat++) { - *data++ = (*(u64 *)p); - p += sizeof(u64); - } + *data++ = pkt_n; + *data++ = stmmac_get_tx_normal_irq_n(priv, q); } for (q = 0; q < rx_cnt; q++) { struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[q]; - struct stmmac_rxq_stats snapshot; + u64 pkt_n; do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); - snapshot = *rxq_stats; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); + start = u64_stats_fetch_begin(&rxq_stats->napi_syncp); + pkt_n = u64_stats_read(&rxq_stats->napi.rx_pkt_n); + } while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start)); - p = (char *)&snapshot + offsetof(struct stmmac_rxq_stats, rx_pkt_n); - for (stat = 0; stat < STMMAC_RXQ_STATS; stat++) { - *data++ = (*(u64 *)p); - p += sizeof(u64); - } + *data++ = pkt_n; + *data++ = stmmac_get_rx_normal_irq_n(priv, q); } } @@ -645,39 +680,49 @@ static void stmmac_get_ethtool_stats(struct net_device *dev, pos = j; for (i = 0; i < rx_queues_count; i++) { struct stmmac_rxq_stats *rxq_stats = &priv->xstats.rxq_stats[i]; - struct stmmac_rxq_stats snapshot; + struct stmmac_napi_rx_stats snapshot; + u64 n_irq; j = pos; do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); - snapshot = *rxq_stats; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); + start = u64_stats_fetch_begin(&rxq_stats->napi_syncp); + snapshot = rxq_stats->napi; + } while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start)); - data[j++] += snapshot.rx_pkt_n; - data[j++] += snapshot.rx_normal_irq_n; - normal_irq_n += snapshot.rx_normal_irq_n; - napi_poll += snapshot.napi_poll; + data[j++] += u64_stats_read(&snapshot.rx_pkt_n); + n_irq = stmmac_get_rx_normal_irq_n(priv, i); + data[j++] += n_irq; + normal_irq_n += n_irq; + napi_poll += u64_stats_read(&snapshot.poll); } pos = j; for (i = 0; i < tx_queues_count; i++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[i]; - struct stmmac_txq_stats snapshot; + struct stmmac_napi_tx_stats napi_snapshot; + struct stmmac_q_tx_stats q_snapshot; + u64 n_irq; j = pos; do { - start = u64_stats_fetch_begin(&txq_stats->syncp); - snapshot = *txq_stats; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); + start = u64_stats_fetch_begin(&txq_stats->q_syncp); + q_snapshot = txq_stats->q; + } while (u64_stats_fetch_retry(&txq_stats->q_syncp, start)); + do { + start = u64_stats_fetch_begin(&txq_stats->napi_syncp); + napi_snapshot = txq_stats->napi; + } while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start)); - data[j++] += snapshot.tx_pkt_n; - data[j++] += snapshot.tx_normal_irq_n; - normal_irq_n += snapshot.tx_normal_irq_n; - data[j++] += snapshot.tx_clean; - data[j++] += snapshot.tx_set_ic_bit; - data[j++] += snapshot.tx_tso_frames; - data[j++] += snapshot.tx_tso_nfrags; - napi_poll += snapshot.napi_poll; + data[j++] += u64_stats_read(&napi_snapshot.tx_pkt_n); + n_irq = stmmac_get_tx_normal_irq_n(priv, i); + data[j++] += n_irq; + normal_irq_n += n_irq; + data[j++] += u64_stats_read(&napi_snapshot.tx_clean); + data[j++] += u64_stats_read(&q_snapshot.tx_set_ic_bit) + + u64_stats_read(&napi_snapshot.tx_set_ic_bit); + data[j++] += u64_stats_read(&q_snapshot.tx_tso_frames); + data[j++] += u64_stats_read(&q_snapshot.tx_tso_nfrags); + napi_poll += u64_stats_read(&napi_snapshot.poll); } normal_irq_n += priv->xstats.rx_early_irq; data[j++] = normal_irq_n; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 25519952f754..75d029704503 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2482,7 +2482,6 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) struct xdp_desc xdp_desc; bool work_done = true; u32 tx_set_ic_bit = 0; - unsigned long flags; /* Avoids TX time-out as we are sharing with slow path */ txq_trans_cond_update(nq); @@ -2566,9 +2565,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, priv->dma_conf.dma_tx_size); entry = tx_q->cur_tx; } - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_set_ic_bit += tx_set_ic_bit; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_add(&txq_stats->napi.tx_set_ic_bit, tx_set_ic_bit); + u64_stats_update_end(&txq_stats->napi_syncp); if (tx_desc) { stmmac_flush_tx_descriptors(priv, queue); @@ -2616,7 +2615,6 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, unsigned int bytes_compl = 0, pkts_compl = 0; unsigned int entry, xmits = 0, count = 0; u32 tx_packets = 0, tx_errors = 0; - unsigned long flags; __netif_tx_lock_bh(netdev_get_tx_queue(priv->dev, queue)); @@ -2782,11 +2780,11 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, if (tx_q->dirty_tx != tx_q->cur_tx) *pending_packets = true; - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_packets += tx_packets; - txq_stats->tx_pkt_n += tx_packets; - txq_stats->tx_clean++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_add(&txq_stats->napi.tx_packets, tx_packets); + u64_stats_add(&txq_stats->napi.tx_pkt_n, tx_packets); + u64_stats_inc(&txq_stats->napi.tx_clean); + u64_stats_update_end(&txq_stats->napi_syncp); priv->xstats.tx_errors += tx_errors; @@ -4213,7 +4211,6 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; u8 proto_hdr_len, hdr; - unsigned long flags; u32 pay_len, mss; dma_addr_t des; int i; @@ -4378,13 +4375,13 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); } - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_bytes += skb->len; - txq_stats->tx_tso_frames++; - txq_stats->tx_tso_nfrags += nfrags; + u64_stats_update_begin(&txq_stats->q_syncp); + u64_stats_add(&txq_stats->q.tx_bytes, skb->len); + u64_stats_inc(&txq_stats->q.tx_tso_frames); + u64_stats_add(&txq_stats->q.tx_tso_nfrags, nfrags); if (set_ic) - txq_stats->tx_set_ic_bit++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_inc(&txq_stats->q.tx_set_ic_bit); + u64_stats_update_end(&txq_stats->q_syncp); if (priv->sarc_type) stmmac_set_desc_sarc(priv, first, priv->sarc_type); @@ -4483,7 +4480,6 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) struct stmmac_tx_queue *tx_q; bool has_vlan, set_ic; int entry, first_tx; - unsigned long flags; dma_addr_t des; tx_q = &priv->dma_conf.tx_queue[queue]; @@ -4653,11 +4649,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue)); } - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_bytes += skb->len; + u64_stats_update_begin(&txq_stats->q_syncp); + u64_stats_add(&txq_stats->q.tx_bytes, skb->len); if (set_ic) - txq_stats->tx_set_ic_bit++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_inc(&txq_stats->q.tx_set_ic_bit); + u64_stats_update_end(&txq_stats->q_syncp); if (priv->sarc_type) stmmac_set_desc_sarc(priv, first, priv->sarc_type); @@ -4921,12 +4917,11 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue, set_ic = false; if (set_ic) { - unsigned long flags; tx_q->tx_count_frames = 0; stmmac_set_tx_ic(priv, tx_desc); - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->tx_set_ic_bit++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->q_syncp); + u64_stats_inc(&txq_stats->q.tx_set_ic_bit); + u64_stats_update_end(&txq_stats->q_syncp); } stmmac_enable_dma_transmission(priv, priv->ioaddr); @@ -5076,7 +5071,6 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, unsigned int len = xdp->data_end - xdp->data; enum pkt_hash_types hash_type; int coe = priv->hw->rx_csum; - unsigned long flags; struct sk_buff *skb; u32 hash; @@ -5106,10 +5100,10 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, skb_record_rx_queue(skb, queue); napi_gro_receive(&ch->rxtx_napi, skb); - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->rx_pkt_n++; - rxq_stats->rx_bytes += len; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_inc(&rxq_stats->napi.rx_pkt_n); + u64_stats_add(&rxq_stats->napi.rx_bytes, len); + u64_stats_update_end(&rxq_stats->napi_syncp); } static bool stmmac_rx_refill_zc(struct stmmac_priv *priv, u32 queue, u32 budget) @@ -5191,7 +5185,6 @@ static int stmmac_rx_zc(struct stmmac_priv *priv, int limit, u32 queue) unsigned int desc_size; struct bpf_prog *prog; bool failure = false; - unsigned long flags; int xdp_status = 0; int status = 0; @@ -5346,9 +5339,9 @@ read_again: stmmac_finalize_xdp_rx(priv, xdp_status); - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->rx_pkt_n += count; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_add(&rxq_stats->napi.rx_pkt_n, count); + u64_stats_update_end(&rxq_stats->napi_syncp); priv->xstats.rx_dropped += rx_dropped; priv->xstats.rx_errors += rx_errors; @@ -5386,7 +5379,6 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) unsigned int desc_size; struct sk_buff *skb = NULL; struct stmmac_xdp_buff ctx; - unsigned long flags; int xdp_status = 0; int buf_sz; @@ -5646,11 +5638,11 @@ drain_data: stmmac_rx_refill(priv, queue); - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->rx_packets += rx_packets; - rxq_stats->rx_bytes += rx_bytes; - rxq_stats->rx_pkt_n += count; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_add(&rxq_stats->napi.rx_packets, rx_packets); + u64_stats_add(&rxq_stats->napi.rx_bytes, rx_bytes); + u64_stats_add(&rxq_stats->napi.rx_pkt_n, count); + u64_stats_update_end(&rxq_stats->napi_syncp); priv->xstats.rx_dropped += rx_dropped; priv->xstats.rx_errors += rx_errors; @@ -5665,13 +5657,12 @@ static int stmmac_napi_poll_rx(struct napi_struct *napi, int budget) struct stmmac_priv *priv = ch->priv_data; struct stmmac_rxq_stats *rxq_stats; u32 chan = ch->index; - unsigned long flags; int work_done; rxq_stats = &priv->xstats.rxq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_inc(&rxq_stats->napi.poll); + u64_stats_update_end(&rxq_stats->napi_syncp); work_done = stmmac_rx(priv, budget, chan); if (work_done < budget && napi_complete_done(napi, work_done)) { @@ -5693,13 +5684,12 @@ static int stmmac_napi_poll_tx(struct napi_struct *napi, int budget) struct stmmac_txq_stats *txq_stats; bool pending_packets = false; u32 chan = ch->index; - unsigned long flags; int work_done; txq_stats = &priv->xstats.txq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_inc(&txq_stats->napi.poll); + u64_stats_update_end(&txq_stats->napi_syncp); work_done = stmmac_tx_clean(priv, budget, chan, &pending_packets); work_done = min(work_done, budget); @@ -5729,17 +5719,16 @@ static int stmmac_napi_poll_rxtx(struct napi_struct *napi, int budget) struct stmmac_rxq_stats *rxq_stats; struct stmmac_txq_stats *txq_stats; u32 chan = ch->index; - unsigned long flags; rxq_stats = &priv->xstats.rxq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&rxq_stats->syncp); - rxq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&rxq_stats->syncp, flags); + u64_stats_update_begin(&rxq_stats->napi_syncp); + u64_stats_inc(&rxq_stats->napi.poll); + u64_stats_update_end(&rxq_stats->napi_syncp); txq_stats = &priv->xstats.txq_stats[chan]; - flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); - txq_stats->napi_poll++; - u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); + u64_stats_update_begin(&txq_stats->napi_syncp); + u64_stats_inc(&txq_stats->napi.poll); + u64_stats_update_end(&txq_stats->napi_syncp); tx_done = stmmac_tx_clean(priv, budget, chan, &tx_pending_packets); tx_done = min(tx_done, budget); @@ -7065,10 +7054,13 @@ static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 u64 tx_bytes; do { - start = u64_stats_fetch_begin(&txq_stats->syncp); - tx_packets = txq_stats->tx_packets; - tx_bytes = txq_stats->tx_bytes; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); + start = u64_stats_fetch_begin(&txq_stats->q_syncp); + tx_bytes = u64_stats_read(&txq_stats->q.tx_bytes); + } while (u64_stats_fetch_retry(&txq_stats->q_syncp, start)); + do { + start = u64_stats_fetch_begin(&txq_stats->napi_syncp); + tx_packets = u64_stats_read(&txq_stats->napi.tx_packets); + } while (u64_stats_fetch_retry(&txq_stats->napi_syncp, start)); stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; @@ -7080,10 +7072,10 @@ static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 u64 rx_bytes; do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); - rx_packets = rxq_stats->rx_packets; - rx_bytes = rxq_stats->rx_bytes; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); + start = u64_stats_fetch_begin(&rxq_stats->napi_syncp); + rx_packets = u64_stats_read(&rxq_stats->napi.rx_packets); + rx_bytes = u64_stats_read(&rxq_stats->napi.rx_bytes); + } while (u64_stats_fetch_retry(&rxq_stats->napi_syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; @@ -7477,9 +7469,16 @@ int stmmac_dvr_probe(struct device *device, priv->dev = ndev; for (i = 0; i < MTL_MAX_RX_QUEUES; i++) - u64_stats_init(&priv->xstats.rxq_stats[i].syncp); - for (i = 0; i < MTL_MAX_TX_QUEUES; i++) - u64_stats_init(&priv->xstats.txq_stats[i].syncp); + u64_stats_init(&priv->xstats.rxq_stats[i].napi_syncp); + for (i = 0; i < MTL_MAX_TX_QUEUES; i++) { + u64_stats_init(&priv->xstats.txq_stats[i].q_syncp); + u64_stats_init(&priv->xstats.txq_stats[i].napi_syncp); + } + + priv->xstats.pcpu_stats = + devm_netdev_alloc_pcpu_stats(device, struct stmmac_pcpu_stats); + if (!priv->xstats.pcpu_stats) + return -ENOMEM; stmmac_set_ethtool_ops(ndev); priv->pause = pause; From 4b00d0c513da58b68df015968721b11396fe4ab3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 4 Feb 2024 08:56:18 -0800 Subject: [PATCH 162/270] selftests: cmsg_ipv6: repeat the exact packet cmsg_ipv6 test requests tcpdump to capture 4 packets, and sends until tcpdump quits. Only the first packet is "real", however, and the rest are basic UDP packets. So if tcpdump doesn't start in time it will miss the real packet and only capture the UDP ones. This makes the test fail on slow machine (no KVM or with debug enabled) 100% of the time, while it passes in fast environments. Repeat the "real" / expected packet. Fixes: 9657ad09e1fa ("selftests: net: test IPV6_TCLASS") Fixes: 05ae83d5a4a2 ("selftests: net: test IPV6_HOPLIMIT") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/cmsg_ipv6.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh index f30bd57d5e38..8bc23fb4c82b 100755 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ b/tools/testing/selftests/net/cmsg_ipv6.sh @@ -89,7 +89,7 @@ for ovr in setsock cmsg both diff; do check_result $? 0 "TCLASS $prot $ovr - pass" while [ -d /proc/$BG ]; do - $NSEXE ./cmsg_sender -6 -p u $TGT6 1234 + $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 done tcpdump -r $TMPF -v 2>&1 | grep "class $TOS2" >> /dev/null @@ -126,7 +126,7 @@ for ovr in setsock cmsg both diff; do check_result $? 0 "HOPLIMIT $prot $ovr - pass" while [ -d /proc/$BG ]; do - $NSEXE ./cmsg_sender -6 -p u $TGT6 1234 + $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 done tcpdump -r $TMPF -v 2>&1 | grep "hlim $LIM[^0-9]" >> /dev/null From cd7d469c25704d414d71bf3644f163fb74e7996b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 13 Oct 2023 13:55:44 +0800 Subject: [PATCH 163/270] libceph: fail sparse-read if the data length doesn't match Once this happens that means there have bugs. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fa018d5864e7..f66f6aac74f6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -45,6 +45,7 @@ enum ceph_sparse_read_state { CEPH_SPARSE_READ_HDR = 0, CEPH_SPARSE_READ_EXTENTS, CEPH_SPARSE_READ_DATA_LEN, + CEPH_SPARSE_READ_DATA_PRE, CEPH_SPARSE_READ_DATA, }; @@ -64,7 +65,7 @@ struct ceph_sparse_read { u64 sr_req_len; /* orig request length */ u64 sr_pos; /* current pos in buffer */ int sr_index; /* current extent index */ - __le32 sr_datalen; /* length of actual data */ + u32 sr_datalen; /* length of actual data */ u32 sr_count; /* extent count in reply */ int sr_ext_len; /* length of extent array */ struct ceph_sparse_extent *sr_extent; /* extent array */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 625622016f57..2cea35e4ff8e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5857,8 +5857,8 @@ static int osd_sparse_read(struct ceph_connection *con, struct ceph_osd *o = con->private; struct ceph_sparse_read *sr = &o->o_sparse_read; u32 count = sr->sr_count; - u64 eoff, elen; - int ret; + u64 eoff, elen, len = 0; + int i, ret; switch (sr->sr_state) { case CEPH_SPARSE_READ_HDR: @@ -5903,8 +5903,20 @@ next_op: convert_extent_map(sr); ret = sizeof(sr->sr_datalen); *pbuf = (char *)&sr->sr_datalen; - sr->sr_state = CEPH_SPARSE_READ_DATA; + sr->sr_state = CEPH_SPARSE_READ_DATA_PRE; break; + case CEPH_SPARSE_READ_DATA_PRE: + /* Convert sr_datalen to host-endian */ + sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen); + for (i = 0; i < count; i++) + len += sr->sr_extent[i].len; + if (sr->sr_datalen != len) { + pr_warn_ratelimited("data len %u != extent len %llu\n", + sr->sr_datalen, len); + return -EREMOTEIO; + } + sr->sr_state = CEPH_SPARSE_READ_DATA; + fallthrough; case CEPH_SPARSE_READ_DATA: if (sr->sr_index >= count) { sr->sr_state = CEPH_SPARSE_READ_HDR; From ee97302fbc0c98a25732d736fc73aaf4d62c4128 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 14 Dec 2023 09:21:15 +0800 Subject: [PATCH 164/270] libceph: rename read_sparse_msg_*() to read_partial_sparse_msg_*() These functions are supposed to behave like other read_partial_*() handlers: the contract with messenger v1 is that the handler bails if the area of the message it's responsible for is already processed. This comes up when handling short reads from the socket. [ idryomov: changelog ] Signed-off-by: Xiubo Li Acked-by: Jeff Layton Signed-off-by: Ilya Dryomov --- net/ceph/messenger_v1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index f9a50d7f0d20..4cb60bacf5f5 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -991,7 +991,7 @@ static inline int read_partial_message_section(struct ceph_connection *con, return read_partial_message_chunk(con, section, sec_len, crc); } -static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc) +static int read_partial_sparse_msg_extent(struct ceph_connection *con, u32 *crc) { struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE); @@ -1026,7 +1026,7 @@ static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc) return 1; } -static int read_sparse_msg_data(struct ceph_connection *con) +static int read_partial_sparse_msg_data(struct ceph_connection *con) { struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); @@ -1043,7 +1043,7 @@ static int read_sparse_msg_data(struct ceph_connection *con) con->v1.in_sr_len, &crc); else if (cursor->sr_resid > 0) - ret = read_sparse_msg_extent(con, &crc); + ret = read_partial_sparse_msg_extent(con, &crc); if (ret <= 0) { if (do_datacrc) @@ -1254,7 +1254,7 @@ static int read_partial_message(struct ceph_connection *con) return -EIO; if (m->sparse_read) - ret = read_sparse_msg_data(con); + ret = read_partial_sparse_msg_data(con); else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) ret = read_partial_msg_data_bounce(con); else From 8e46a2d068c92a905d01cbb018b00d66991585ab Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 14 Dec 2023 16:01:03 +0800 Subject: [PATCH 165/270] libceph: just wait for more data to be available on the socket A short read may occur while reading the message footer from the socket. Later, when the socket is ready for another read, the messenger invokes all read_partial_*() handlers, including read_partial_sparse_msg_data(). The expectation is that read_partial_sparse_msg_data() would bail, allowing the messenger to invoke read_partial() for the footer and pick up where it left off. However read_partial_sparse_msg_data() violates that and ends up calling into the state machine in the OSD client. The sparse-read state machine assumes that it's a new op and interprets some piece of the footer as the sparse-read header and returns bogus extents/data length, etc. To determine whether read_partial_sparse_msg_data() should bail, let's reuse cursor->total_resid. Because once it reaches to zero that means all the extents and data have been successfully received in last read, else it could break out when partially reading any of the extents and data. And then osd_sparse_read() could continue where it left off. [ idryomov: changelog ] Link: https://tracker.ceph.com/issues/63586 Fixes: d396f89db39a ("libceph: add sparse read support to msgr1") Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger_v1.c | 25 +++++++++++++------------ net/ceph/messenger_v2.c | 4 ++-- net/ceph/osd_client.c | 9 +++------ 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 2eaaabbe98cb..1717cc57cdac 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -283,7 +283,7 @@ struct ceph_msg { struct kref kref; bool more_to_follow; bool needs_out_seq; - bool sparse_read; + u64 sparse_read_total; int front_alloc_len; struct ceph_msgpool *pool; diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 4cb60bacf5f5..0cb61c76b9b8 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -160,8 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { /* Initialize data cursor if it's not a sparse read */ - if (!msg->sparse_read) - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); + u64 len = msg->sparse_read_total ? : data_len; + + ceph_msg_data_cursor_init(&msg->cursor, msg, len); } /* @@ -1036,7 +1037,7 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) if (do_datacrc) crc = con->in_data_crc; - do { + while (cursor->total_resid) { if (con->v1.in_sr_kvec.iov_base) ret = read_partial_message_chunk(con, &con->v1.in_sr_kvec, @@ -1044,23 +1045,23 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) &crc); else if (cursor->sr_resid > 0) ret = read_partial_sparse_msg_extent(con, &crc); - - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - return ret; - } + if (ret <= 0) + break; memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); ret = con->ops->sparse_read(con, cursor, (char **)&con->v1.in_sr_kvec.iov_base); + if (ret <= 0) { + ret = ret ? ret : 1; /* must return > 0 to indicate success */ + break; + } con->v1.in_sr_len = ret; - } while (ret > 0); + } if (do_datacrc) con->in_data_crc = crc; - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ + return ret; } static int read_partial_msg_data(struct ceph_connection *con) @@ -1253,7 +1254,7 @@ static int read_partial_message(struct ceph_connection *con) if (!m->num_data_items) return -EIO; - if (m->sparse_read) + if (m->sparse_read_total) ret = read_partial_sparse_msg_data(con); else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) ret = read_partial_msg_data_bounce(con); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index f8ec60e1aba3..a0ca5414b333 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1128,7 +1128,7 @@ static int decrypt_tail(struct ceph_connection *con) struct sg_table enc_sgt = {}; struct sg_table sgt = {}; struct page **pages = NULL; - bool sparse = con->in_msg->sparse_read; + bool sparse = !!con->in_msg->sparse_read_total; int dpos = 0; int tail_len; int ret; @@ -2060,7 +2060,7 @@ static int prepare_read_tail_plain(struct ceph_connection *con) } if (data_len(msg)) { - if (msg->sparse_read) + if (msg->sparse_read_total) con->v2.in_state = IN_S_PREPARE_SPARSE_DATA; else con->v2.in_state = IN_S_PREPARE_READ_DATA; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2cea35e4ff8e..9d078b37fe0b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5510,7 +5510,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m = ceph_msg_get(req->r_reply); - m->sparse_read = (bool)srlen; + m->sparse_read_total = srlen; dout("get_reply tid %lld %p\n", tid, m); @@ -5777,11 +5777,8 @@ static int prep_next_sparse_read(struct ceph_connection *con, } if (o->o_sparse_op_idx < 0) { - u64 srlen = sparse_data_requested(req); - - dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n", - __func__, o->o_osd, srlen); - ceph_msg_data_cursor_init(cursor, con->in_msg, srlen); + dout("%s: [%d] starting new sparse read req\n", + __func__, o->o_osd); } else { u64 end; From bbb20ea993f46743f7429092ddc52f1a5c5428ef Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 18 Jan 2024 14:24:41 +0800 Subject: [PATCH 166/270] ceph: always set initial i_blkbits to CEPH_FSCRYPT_BLOCK_SHIFT The fscrypt code will use i_blkbits to setup ci_data_unit_bits when allocating the new inode, but ceph will initiate i_blkbits ater when filling the inode, which is too late. Since ci_data_unit_bits will only be used by the fscrypt framework so initiating i_blkbits with CEPH_FSCRYPT_BLOCK_SHIFT is safe. Link: https://tracker.ceph.com/issues/64035 Fixes: 5b1188847180 ("fscrypt: support crypto data unit size less than filesystem block size") Signed-off-by: Xiubo Li Reviewed-by: Eric Biggers Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0c25d326afc4..7b2e77517f23 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + if (!S_ISLNK(*mode)) { err = ceph_pre_init_acls(dir, mode, as_ctx); if (err < 0) From cda4672da1c26835dcbd7aec2bfed954eda9b5ef Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Thu, 1 Feb 2024 17:07:16 +0530 Subject: [PATCH 167/270] ceph: prevent use-after-free in encode_cap_msg() In fs/ceph/caps.c, in encode_cap_msg(), "use after free" error was caught by KASAN at this line - 'ceph_buffer_get(arg->xattr_buf);'. This implies before the refcount could be increment here, it was freed. In same file, in "handle_cap_grant()" refcount is decremented by this line - 'ceph_buffer_put(ci->i_xattrs.blob);'. It appears that a race occurred and resource was freed by the latter line before the former line could increment it. encode_cap_msg() is called by __send_cap() and __send_cap() is called by ceph_check_caps() after calling __prep_cap(). __prep_cap() is where arg->xattr_buf is assigned to ci->i_xattrs.blob. This is the spot where the refcount must be increased to prevent "use after free" error. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/59259 Signed-off-by: Rishabh Dave Reviewed-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9c02f328c966..e8bf082105d8 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; - arg->xattr_buf = ci->i_xattrs.blob; + arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; @@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); + ceph_buffer_put(arg->xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } From 07045648c07c5632e0dfd5ce084d3cd0cec0258a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 4 Jan 2024 09:21:30 +0800 Subject: [PATCH 168/270] ceph: always check dir caps asynchronously The MDS will issue the 'Fr' caps for async dirop, while there is buggy in kclient and it could miss releasing the async dirop caps, which is 'Fsxr'. And then the MDS will complain with: "[WRN] client.xxx isn't responding to mclientcaps(revoke) ..." So when releasing the dirop async requests or when they fail we should always make sure that being revoked caps could be released. Link: https://tracker.ceph.com/issues/50223 Signed-off-by: Xiubo Li Reviewed-by: Milind Changire Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 ------ fs/ceph/mds_client.c | 9 ++++----- fs/ceph/mds_client.h | 2 +- fs/ceph/super.h | 2 -- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e8bf082105d8..ad1f46c66fbf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3216,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, - PUT_CAP_REFS_NO_CHECK, PUT_CAP_REFS_ASYNC, }; @@ -3332,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } -void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) -{ - __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); -} - /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 548d1de379f3..f71bb9c9569f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -4261,7 +4261,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) } } -void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; @@ -4269,8 +4269,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); - ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), - dcaps); + ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); } } @@ -4306,7 +4305,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_session->s_mds != session->s_mds) continue; - ceph_mdsc_release_dir_caps_no_check(req); + ceph_mdsc_release_dir_caps_async(req); __send_request(session, req, true); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2e6ddaa13d72..40560af38827 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -552,7 +552,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); -extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b06e2bc86221..b63b4cd9b5b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1255,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); -extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, - int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_remove_capsnap(struct inode *inode, From 5001bfe927b594470f3a2484cb410b8b42a47903 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Mon, 5 Feb 2024 12:03:43 -0700 Subject: [PATCH 169/270] MAINTAINERS: Maintainer change for rds At this point, Santosh has moved onto other things and I am happy to take over the role of rds maintainer. Update the MAINTAINERS accordingly. Signed-off-by: Allison Henderson Link: https://lore.kernel.org/r/20240205190343.112436-1-allison.henderson@oracle.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 722b894f305e..e3e670ee5610 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18433,7 +18433,7 @@ S: Supported F: drivers/infiniband/sw/rdmavt RDS - RELIABLE DATAGRAM SOCKETS -M: Santosh Shilimkar +M: Allison Henderson L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org L: rds-devel@oss.oracle.com (moderated for non-subscribers) From 75428f537d7cae33c7e4dd726144074f78622c09 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Mon, 5 Feb 2024 18:29:06 -0800 Subject: [PATCH 170/270] net: intel: fix old compiler regressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel build regressions/improvements email contained a couple of issues with old compilers (in fact all the reports were on different architectures, but all gcc 5.5) and the FIELD_PREP() and FIELD_GET() conversions. They're all because an integer #define that should have been declared as unsigned, was shifted to the point that it could set the sign bit. The fix just involves making sure the defines use the "U" identifier on the constants to make sure they're unsigned. Should make the checkers happier. Confirmed with objdump before/after that there is no change to the binaries. Issues were reported as follows: ./drivers/net/ethernet/intel/ice/ice_base.c:238:7: note: in expansion of macro 'FIELD_GET' (FIELD_GET(GLINT_CTL_ITR_GRAN_25_M, regval) == ICE_ITR_GRAN_US)) ^ ./include/linux/compiler_types.h:435:38: error: call to '__compiletime_assert_1093' declared with attribute error: FIELD_GET: mask is not constant drivers/net/ethernet/intel/ice/ice_nvm.c:709:16: note: in expansion of macro ‘FIELD_GET’ orom->major = FIELD_GET(ICE_OROM_VER_MASK, combo_ver); ^ ./include/linux/compiler_types.h:435:38: error: call to ‘__compiletime_assert_796’ declared with attribute error: FIELD_GET: mask is not constant drivers/net/ethernet/intel/ice/ice_common.c:945:18: note: in expansion of macro ‘FIELD_GET’ u8 max_agg_bw = FIELD_GET(GL_PWR_MODE_CTL_CAR_MAX_BW_M, ^ ./include/linux/compiler_types.h:435:38: error: call to ‘__compiletime_assert_420’ declared with attribute error: FIELD_GET: mask is not constant drivers/net/ethernet/intel/i40e/i40e_dcb.c:458:8: note: in expansion of macro ‘FIELD_GET’ oui = FIELD_GET(I40E_LLDP_TLV_OUI_MASK, ouisubtype); ^ Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/lkml/d03e90ca-8485-4d1b-5ec1-c3398e0e8da@linux-m68k.org/ #i40e #ice Fixes: 62589808d73b ("i40e: field get conversion") Fixes: 5a259f8e0baf ("ice: field get conversion") Signed-off-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20240206022906.2194214-1-jesse.brandeburg@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_dcb.h | 2 +- drivers/net/ethernet/intel/ice/ice_osdep.h | 2 +- drivers/net/ethernet/intel/ice/ice_type.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.h b/drivers/net/ethernet/intel/i40e/i40e_dcb.h index 6b60dc9b7736..d76497566e40 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.h +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.h @@ -43,7 +43,7 @@ #define I40E_LLDP_TLV_SUBTYPE_SHIFT 0 #define I40E_LLDP_TLV_SUBTYPE_MASK (0xFF << I40E_LLDP_TLV_SUBTYPE_SHIFT) #define I40E_LLDP_TLV_OUI_SHIFT 8 -#define I40E_LLDP_TLV_OUI_MASK (0xFFFFFF << I40E_LLDP_TLV_OUI_SHIFT) +#define I40E_LLDP_TLV_OUI_MASK (0xFFFFFFU << I40E_LLDP_TLV_OUI_SHIFT) /* Defines for IEEE ETS TLV */ #define I40E_IEEE_ETS_MAXTC_SHIFT 0 diff --git a/drivers/net/ethernet/intel/ice/ice_osdep.h b/drivers/net/ethernet/intel/ice/ice_osdep.h index 82bc54fec7f3..a2562f04267f 100644 --- a/drivers/net/ethernet/intel/ice/ice_osdep.h +++ b/drivers/net/ethernet/intel/ice/ice_osdep.h @@ -24,7 +24,7 @@ #define rd64(a, reg) readq((a)->hw_addr + (reg)) #define ice_flush(a) rd32((a), GLGEN_STAT) -#define ICE_M(m, s) ((m) << (s)) +#define ICE_M(m, s) ((m ## U) << (s)) struct ice_dma_mem { void *va; diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 41ab6d7bbd9e..a508e917ce5f 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -1072,7 +1072,7 @@ struct ice_aq_get_set_rss_lut_params { #define ICE_OROM_VER_BUILD_SHIFT 8 #define ICE_OROM_VER_BUILD_MASK (0xffff << ICE_OROM_VER_BUILD_SHIFT) #define ICE_OROM_VER_SHIFT 24 -#define ICE_OROM_VER_MASK (0xff << ICE_OROM_VER_SHIFT) +#define ICE_OROM_VER_MASK (0xffU << ICE_OROM_VER_SHIFT) #define ICE_SR_PFA_PTR 0x40 #define ICE_SR_1ST_NVM_BANK_PTR 0x42 #define ICE_SR_NVM_BANK_SIZE 0x43 From ce68c035457bdd025a9961e0ba2157323090c581 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 30 Jan 2024 13:01:14 +0100 Subject: [PATCH 171/270] riscv: Fix arch_hugetlb_migration_supported() for NAPOT arch_hugetlb_migration_supported() must be reimplemented to add support for NAPOT hugepages, which is done here. Fixes: 82a1a1f3bfb6 ("riscv: mm: support Svnapot in hugetlb page") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240130120114.106003-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hugetlb.h | 3 +++ arch/riscv/mm/hugetlbpage.c | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 4c5b0e929890..20f9c3ba2341 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -11,6 +11,9 @@ static inline void arch_clear_hugepage_flags(struct page *page) } #define arch_clear_hugepage_flags arch_clear_hugepage_flags +bool arch_hugetlb_migration_supported(struct hstate *h); +#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported + #ifdef CONFIG_RISCV_ISA_SVNAPOT #define __HAVE_ARCH_HUGE_PTE_CLEAR void huge_pte_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 87406b26c3da..29c7606414d2 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -364,7 +364,7 @@ void huge_pte_clear(struct mm_struct *mm, pte_clear(mm, addr, ptep); } -static __init bool is_napot_size(unsigned long size) +static bool is_napot_size(unsigned long size) { unsigned long order; @@ -392,7 +392,7 @@ arch_initcall(napot_hugetlbpages_init); #else -static __init bool is_napot_size(unsigned long size) +static bool is_napot_size(unsigned long size) { return false; } @@ -409,7 +409,7 @@ int pmd_huge(pmd_t pmd) return pmd_leaf(pmd); } -bool __init arch_hugetlb_valid_size(unsigned long size) +static bool __hugetlb_valid_size(unsigned long size) { if (size == HPAGE_SIZE) return true; @@ -421,6 +421,16 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return false; } +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + return __hugetlb_valid_size(size); +} + +bool arch_hugetlb_migration_supported(struct hstate *h) +{ + return __hugetlb_valid_size(huge_page_size(h)); +} + #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { From 2cf963787529f615f7c93bdcf13a5e82029e7f38 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 23 Nov 2023 13:42:14 +0000 Subject: [PATCH 172/270] riscv: declare overflow_stack as exported from traps.c The percpu area overflow_stacks is exported from arch/riscv/kernel/traps.c for use in the entry code, but is not declared anywhere. Add the relevant declaration to arch/riscv/include/asm/stacktrace.h to silence the following sparse warning: arch/riscv/kernel/traps.c:395:1: warning: symbol '__pcpu_scope_overflow_stack' was not declared. Should it be static? We don't add the stackinfo_get_overflow() call as for some of the other architectures as this doesn't seem to be used yet, so just silence the warning. Signed-off-by: Ben Dooks Reviewed-by: Conor Dooley Fixes: be97d0db5f44 ("riscv: VMAP_STACK overflow detection thread-safe") Link: https://lore.kernel.org/r/20231123134214.81481-1-ben.dooks@codethink.co.uk Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/stacktrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/include/asm/stacktrace.h b/arch/riscv/include/asm/stacktrace.h index f7e8ef2418b9..b1495a7e06ce 100644 --- a/arch/riscv/include/asm/stacktrace.h +++ b/arch/riscv/include/asm/stacktrace.h @@ -21,4 +21,9 @@ static inline bool on_thread_stack(void) return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); } + +#ifdef CONFIG_VMAP_STACK +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); +#endif /* CONFIG_VMAP_STACK */ + #endif /* _ASM_RISCV_STACKTRACE_H */ From 3951f6add519a8e954bf78691a412f65b24f4715 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 30 Jan 2024 12:55:08 +0100 Subject: [PATCH 173/270] riscv: Fix arch_tlbbatch_flush() by clearing the batch cpumask We must clear the cpumask once we have flushed the batch, otherwise cpus get accumulated and we end sending IPIs to more cpus than needed. Fixes: 54d7431af73e ("riscv: Add support for BATCHED_UNMAP_TLB_FLUSH") Signed-off-by: Alexandre Ghiti Reviewed-by: Charlie Jenkins Reviewed-by: Jisheng Zhang Link: https://lore.kernel.org/r/20240130115508.105386-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/tlbflush.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index dffb0e5bd942..893566e004b7 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -234,4 +234,5 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { __flush_tlb_range(&batch->cpumask, FLUSH_TLB_NO_ASID, 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE); + cpumask_clear(&batch->cpumask); } From 1f4137e882c621f8ed4bd4da9b10cf91d059e52a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 Feb 2024 09:47:21 -0800 Subject: [PATCH 174/270] nvme: move passthrough logging attribute to head The namespace does not have attributes, but the head does. Move the new logging attribute to that structure instead of dereferencing the wrong type. And while we're here, fix the reverse-tree coding style. Fixes: 9f079dda14339e ("nvme: allow passthru cmd error logging") Reported-by: Tasmiya Nalatwad Tested-by: Tasmiya Nalatwad Reviewed-by: Chaitanya Kulkarni Reviewed-by: Alan Adamson Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 +-- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/sysfs.c | 30 +++++++++++++++--------------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 975245527c1f..246e92c05aac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -713,7 +713,7 @@ void nvme_init_request(struct request *req, struct nvme_command *cmd) if (req->q->queuedata) { struct nvme_ns *ns = req->q->disk->private_data; - logging_enabled = ns->passthru_err_log_enabled; + logging_enabled = ns->head->passthru_err_log_enabled; req->timeout = NVME_IO_TIMEOUT; } else { /* no queuedata implies admin queue */ logging_enabled = nr->ctrl->passthru_err_log_enabled; @@ -3696,7 +3696,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) ns->disk = disk; ns->queue = disk->queue; - ns->passthru_err_log_enabled = false; if (ctrl->opts && ctrl->opts->data_digest) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 3897334e3950..7b87763e2f8a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -455,6 +455,7 @@ struct nvme_ns_head { struct list_head entry; struct kref ref; bool shared; + bool passthru_err_log_enabled; int instance; struct nvme_effects_log *effects; u64 nuse; @@ -523,7 +524,6 @@ struct nvme_ns { struct device cdev_device; struct nvme_fault_inject fault_inject; - bool passthru_err_log_enabled; }; /* NVMe ns supports metadata actions by the controller (generate/strip) */ diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index d099218e494a..f2832f70e7e0 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -48,8 +48,8 @@ static ssize_t nvme_adm_passthru_err_log_enabled_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - int err; bool passthru_err_log_enabled; + int err; err = kstrtobool(buf, &passthru_err_log_enabled); if (err) @@ -60,25 +60,34 @@ static ssize_t nvme_adm_passthru_err_log_enabled_store(struct device *dev, return count; } +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (nvme_disk_is_ns_head(disk)) + return disk->private_data; + return nvme_get_ns_from_dev(dev)->head; +} + static ssize_t nvme_io_passthru_err_log_enabled_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *n = dev_get_drvdata(dev); + struct nvme_ns_head *head = dev_to_ns_head(dev); - return sysfs_emit(buf, n->passthru_err_log_enabled ? "on\n" : "off\n"); + return sysfs_emit(buf, head->passthru_err_log_enabled ? "on\n" : "off\n"); } static ssize_t nvme_io_passthru_err_log_enabled_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct nvme_ns *ns = dev_get_drvdata(dev); - int err; + struct nvme_ns_head *head = dev_to_ns_head(dev); bool passthru_err_log_enabled; + int err; err = kstrtobool(buf, &passthru_err_log_enabled); if (err) return -EINVAL; - ns->passthru_err_log_enabled = passthru_err_log_enabled; + head->passthru_err_log_enabled = passthru_err_log_enabled; return count; } @@ -91,15 +100,6 @@ static struct device_attribute dev_attr_io_passthru_err_log_enabled = \ __ATTR(passthru_err_log_enabled, S_IRUGO | S_IWUSR, \ nvme_io_passthru_err_log_enabled_show, nvme_io_passthru_err_log_enabled_store); -static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (nvme_disk_is_ns_head(disk)) - return disk->private_data; - return nvme_get_ns_from_dev(dev)->head; -} - static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { From e8c263ed6de8183e670fbd127c2512292a2ad8eb Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 5 Feb 2024 16:30:21 -0800 Subject: [PATCH 175/270] nvme-core: fix comment to reflect right functions The functions and the attribute listed in the comment doesn't exists in the code, (ns->logging_enabled, nvme_passthru_err_log_enabled_store() and nvme_passthru_err_log_enabled_show()) Update the comment with right function names and a comment ns->head->passthru_err_log_enabled, nvme_io_passthru_err_log_enabled_store() and nvme_io_passthru_err_log_enabled_show(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Alan Adamson Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 246e92c05aac..60537c9224bf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3761,8 +3761,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) /* * Set ns->disk->device->driver_data to ns so we can access - * ns->logging_enabled in nvme_passthru_err_log_enabled_store() and - * nvme_passthru_err_log_enabled_show(). + * ns->head->passthru_err_log_enabled in + * nvme_io_passthru_err_log_enabled_[store | show](). */ dev_set_drvdata(disk_to_dev(ns->disk), ns); From b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 Jan 2024 11:40:37 +0300 Subject: [PATCH 176/270] PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "msg_addr" variable is u64. However, the "aligned_offset" is an unsigned int. This means that when the code does: msg_addr &= ~aligned_offset; it will unintentionally zero out the high 32 bits. Use ALIGN_DOWN() to do the alignment instead. Fixes: 2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support") Link: https://lore.kernel.org/r/af59c7ad-ab93-40f7-ad4a-7ac0b14d37f5@moroto.mountain Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Cassel Reviewed-by: Ilpo Järvinen Reviewed-by: Manivannan Sadhasivam Cc: --- drivers/pci/controller/dwc/pcie-designware-ep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 5befed2dc02b..d6b66597101e 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -6,6 +6,7 @@ * Author: Kishon Vijay Abraham I */ +#include #include #include #include @@ -551,7 +552,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no, } aligned_offset = msg_addr & (epc->mem->window.page_size - 1); - msg_addr &= ~aligned_offset; + msg_addr = ALIGN_DOWN(msg_addr, epc->mem->window.page_size); ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr, epc->mem->window.page_size); if (ret) From 67057f48df79a3d73683385f521215146861684b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 Jan 2024 11:41:01 +0300 Subject: [PATCH 177/270] PCI: dwc: Clean up dw_pcie_ep_raise_msi_irq() alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I recently changed the alignment code in dw_pcie_ep_raise_msix_irq(). The code in dw_pcie_ep_raise_msi_irq() is similar, so update it to match, just for consistency. (No effect on runtime, just a cleanup). Link: https://lore.kernel.org/r/184097e0-c728-42c7-9e8a-556bd33fb612@moroto.mountain Signed-off-by: Dan Carpenter Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Cassel Reviewed-by: Ilpo Järvinen Reviewed-by: Manivannan Sadhasivam --- drivers/pci/controller/dwc/pcie-designware-ep.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index d6b66597101e..9a437cfce073 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -483,9 +483,10 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, reg = ep_func->msi_cap + PCI_MSI_DATA_32; msg_data = dw_pcie_ep_readw_dbi(ep, func_no, reg); } - aligned_offset = msg_addr_lower & (epc->mem->window.page_size - 1); - msg_addr = ((u64)msg_addr_upper) << 32 | - (msg_addr_lower & ~aligned_offset); + msg_addr = ((u64)msg_addr_upper) << 32 | msg_addr_lower; + + aligned_offset = msg_addr & (epc->mem->window.page_size - 1); + msg_addr = ALIGN_DOWN(msg_addr, epc->mem->window.page_size); ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr, epc->mem->window.page_size); if (ret) From 36fa8d697132b4bed2312d700310e8a78b000c84 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 1 Feb 2024 22:58:36 +0100 Subject: [PATCH 178/270] netfilter: nft_compat: narrow down revision to unsigned 8-bits xt_find_revision() expects u8, restrict it to this datatype. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f0eeda97bfcd..001b6841a4b6 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -135,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_TARGET_REV] = { .type = NLA_U32 }, + [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; @@ -419,7 +419,7 @@ static void nft_match_eval(const struct nft_expr *expr, static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_MATCH_REV] = { .type = NLA_U32 }, + [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; @@ -724,7 +724,7 @@ out_put: static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, .len = NFT_COMPAT_NAME_MAX-1 }, - [NFTA_COMPAT_REV] = { .type = NLA_U32 }, + [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, }; From 292781c3c5485ce33bd22b2ef1b2bed709b4d672 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 1 Feb 2024 23:33:29 +0100 Subject: [PATCH 179/270] netfilter: nft_compat: reject unused compat flag Flag (1 << 0) is ignored is set, never used, reject it it with EINVAL instead. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_compat.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ca30232b7bc8..117c6a9b845b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -285,9 +285,11 @@ enum nft_rule_attributes { /** * enum nft_rule_compat_flags - nf_tables rule compat flags * + * @NFT_RULE_COMPAT_F_UNUSED: unused * @NFT_RULE_COMPAT_F_INV: invert the check result */ enum nft_rule_compat_flags { + NFT_RULE_COMPAT_F_UNUSED = (1 << 0), NFT_RULE_COMPAT_F_INV = (1 << 1), NFT_RULE_COMPAT_F_MASK = NFT_RULE_COMPAT_F_INV, }; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 001b6841a4b6..ed71d5ecbe0a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -212,7 +212,8 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); - if (flags & ~NFT_RULE_COMPAT_F_MASK) + if (flags & NFT_RULE_COMPAT_F_UNUSED || + flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; From d694b754894c93fb4d71a7f3699439dec111decc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Feb 2024 00:05:23 +0100 Subject: [PATCH 180/270] netfilter: nft_compat: restrict match/target protocol to u16 xt_check_{match,target} expects u16, but NFTA_RULE_COMPAT_PROTO is u32. NLA_POLICY_MAX(NLA_BE32, 65535) cannot be used because .max in nla_policy is s16, see 3e48be05f3c7 ("netlink: add attribute range validation to policy"). Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index ed71d5ecbe0a..1f9474fefe84 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -200,6 +200,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 l4proto; u32 flags; int err; @@ -218,7 +219,12 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; - *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + if (l4proto > U16_MAX) + return -EINVAL; + + *proto = l4proto; + return 0; } From e96fddb32931d007db12b1fce9b5e8e4c080401b Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Sat, 27 Jan 2024 18:34:01 +0530 Subject: [PATCH 181/270] drm/amd/display: Fix 'panel_cntl' could be null in 'dcn21_set_backlight_level()' 'panel_cntl' structure used to control the display panel could be null, dereferencing it could lead to a null pointer access. Fixes the below: drivers/gpu/drm/amd/amdgpu/../display/dc/hwss/dcn21/dcn21_hwseq.c:269 dcn21_set_backlight_level() error: we previously assumed 'panel_cntl' could be null (see line 250) Fixes: 474ac4a875ca ("drm/amd/display: Implement some asic specific abm call backs.") Cc: Yongqiang Sun Cc: Anthony Koo Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Signed-off-by: Srinivasan Shanmugam Reviewed-by: Anthony Koo Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn21/dcn21_hwseq.c | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c index 8e88dcaf88f5..a9cd39f77360 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c @@ -237,34 +237,35 @@ bool dcn21_set_backlight_level(struct pipe_ctx *pipe_ctx, { struct dc_context *dc = pipe_ctx->stream->ctx; struct abm *abm = pipe_ctx->stream_res.abm; + struct timing_generator *tg = pipe_ctx->stream_res.tg; struct panel_cntl *panel_cntl = pipe_ctx->stream->link->panel_cntl; + uint32_t otg_inst; + + if (!abm && !tg && !panel_cntl) + return false; + + otg_inst = tg->inst; if (dc->dc->res_pool->dmcu) { dce110_set_backlight_level(pipe_ctx, backlight_pwm_u16_16, frame_ramp); return true; } - if (abm != NULL) { - uint32_t otg_inst = pipe_ctx->stream_res.tg->inst; - - if (abm && panel_cntl) { - if (abm->funcs && abm->funcs->set_pipe_ex) { - abm->funcs->set_pipe_ex(abm, - otg_inst, - SET_ABM_PIPE_NORMAL, - panel_cntl->inst, - panel_cntl->pwrseq_inst); - } else { - dmub_abm_set_pipe(abm, - otg_inst, - SET_ABM_PIPE_NORMAL, - panel_cntl->inst, - panel_cntl->pwrseq_inst); - } - } + if (abm->funcs && abm->funcs->set_pipe_ex) { + abm->funcs->set_pipe_ex(abm, + otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); + } else { + dmub_abm_set_pipe(abm, + otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } - if (abm && abm->funcs && abm->funcs->set_backlight_level_pwm) + if (abm->funcs && abm->funcs->set_backlight_level_pwm) abm->funcs->set_backlight_level_pwm(abm, backlight_pwm_u16_16, frame_ramp, 0, panel_cntl->inst); else From 66951d98d9bf45ba25acf37fe0747253fafdf298 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Wed, 31 Jan 2024 08:49:41 +0530 Subject: [PATCH 182/270] drm/amd/display: Add NULL test for 'timing generator' in 'dcn21_set_pipe()' In "u32 otg_inst = pipe_ctx->stream_res.tg->inst;" pipe_ctx->stream_res.tg could be NULL, it is relying on the caller to ensure the tg is not NULL. Fixes: 474ac4a875ca ("drm/amd/display: Implement some asic specific abm call backs.") Cc: Yongqiang Sun Cc: Anthony Koo Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Signed-off-by: Srinivasan Shanmugam Reviewed-by: Anthony Koo Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn21/dcn21_hwseq.c | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c index a9cd39f77360..5c7f380a84f9 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn21/dcn21_hwseq.c @@ -206,28 +206,32 @@ void dcn21_set_abm_immediate_disable(struct pipe_ctx *pipe_ctx) void dcn21_set_pipe(struct pipe_ctx *pipe_ctx) { struct abm *abm = pipe_ctx->stream_res.abm; - uint32_t otg_inst = pipe_ctx->stream_res.tg->inst; + struct timing_generator *tg = pipe_ctx->stream_res.tg; struct panel_cntl *panel_cntl = pipe_ctx->stream->link->panel_cntl; struct dmcu *dmcu = pipe_ctx->stream->ctx->dc->res_pool->dmcu; + uint32_t otg_inst; + + if (!abm && !tg && !panel_cntl) + return; + + otg_inst = tg->inst; if (dmcu) { dce110_set_pipe(pipe_ctx); return; } - if (abm && panel_cntl) { - if (abm->funcs && abm->funcs->set_pipe_ex) { - abm->funcs->set_pipe_ex(abm, + if (abm->funcs && abm->funcs->set_pipe_ex) { + abm->funcs->set_pipe_ex(abm, otg_inst, SET_ABM_PIPE_NORMAL, panel_cntl->inst, panel_cntl->pwrseq_inst); - } else { - dmub_abm_set_pipe(abm, otg_inst, - SET_ABM_PIPE_NORMAL, - panel_cntl->inst, - panel_cntl->pwrseq_inst); - } + } else { + dmub_abm_set_pipe(abm, otg_inst, + SET_ABM_PIPE_NORMAL, + panel_cntl->inst, + panel_cntl->pwrseq_inst); } } From 2103370afba74dda39ff5d2d69163c86644ce528 Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Thu, 18 Jan 2024 18:12:15 -0500 Subject: [PATCH 183/270] drm/amd/display: set odm_combine_policy based on context in dcn32 resource [why] When populating dml pipes, odm combine policy should be assigned based on the pipe topology of the context passed in. DML pipes could be repopulated multiple times during single validate bandwidth attempt. We need to make sure that whenever we repopulate the dml pipes it is always aligned with the updated context. There is a case where DML pipes get repopulated during FPO optimization after ODM combine policy is changed. Since in the current code we reinitlaize ODM combine policy, even though the current context has ODM combine enabled, we overwrite it despite the pipes are already split. This causes DML to think that MPC combine is used so we mistakenly enable MPC combine because we apply pipe split with ODM combine policy reset. This issue doesn't impact non windowed MPO with ODM case because the legacy policy has restricted use cases. We don't encounter the case where both ODM and FPO optimizations are enabled together. So we decide to leave it as is because it is about to be replaced anyway. Cc: stable@vger.kernel.org # 6.6+ Reviewed-by: Chaitanya Dhere Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Wenjing Liu Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 15 ++++++++++---- drivers/gpu/drm/amd/display/dc/inc/resource.h | 20 ++++++++----------- .../dc/resource/dcn32/dcn32_resource.c | 16 ++++++++++++++- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index dd781a20692e..ba76dd4a2ce2 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -1288,7 +1288,7 @@ static bool update_pipes_with_split_flags(struct dc *dc, struct dc_state *contex return updated; } -static bool should_allow_odm_power_optimization(struct dc *dc, +static bool should_apply_odm_power_optimization(struct dc *dc, struct dc_state *context, struct vba_vars_st *v, int *split, bool *merge) { @@ -1392,9 +1392,12 @@ static void try_odm_power_optimization_and_revalidate( { int i; unsigned int new_vlevel; + unsigned int cur_policy[MAX_PIPES]; - for (i = 0; i < pipe_cnt; i++) + for (i = 0; i < pipe_cnt; i++) { + cur_policy[i] = pipes[i].pipe.dest.odm_combine_policy; pipes[i].pipe.dest.odm_combine_policy = dm_odm_combine_policy_2to1; + } new_vlevel = dml_get_voltage_level(&context->bw_ctx.dml, pipes, pipe_cnt); @@ -1403,6 +1406,9 @@ static void try_odm_power_optimization_and_revalidate( memset(merge, 0, MAX_PIPES * sizeof(bool)); *vlevel = dcn20_validate_apply_pipe_split_flags(dc, context, new_vlevel, split, merge); context->bw_ctx.dml.vba.VoltageLevel = *vlevel; + } else { + for (i = 0; i < pipe_cnt; i++) + pipes[i].pipe.dest.odm_combine_policy = cur_policy[i]; } } @@ -1580,7 +1586,7 @@ static void dcn32_full_validate_bw_helper(struct dc *dc, } } - if (should_allow_odm_power_optimization(dc, context, vba, split, merge)) + if (should_apply_odm_power_optimization(dc, context, vba, split, merge)) try_odm_power_optimization_and_revalidate( dc, context, pipes, split, merge, vlevel, *pipe_cnt); @@ -2209,7 +2215,8 @@ bool dcn32_internal_validate_bw(struct dc *dc, int i; pipe_cnt = dc->res_pool->funcs->populate_dml_pipes(dc, context, pipes, fast_validate); - dcn32_update_dml_pipes_odm_policy_based_on_context(dc, context, pipes); + if (!dc->config.enable_windowed_mpo_odm) + dcn32_update_dml_pipes_odm_policy_based_on_context(dc, context, pipes); /* repopulate_pipes = 1 means the pipes were either split or merged. In this case * we have to re-calculate the DET allocation and run through DML once more to diff --git a/drivers/gpu/drm/amd/display/dc/inc/resource.h b/drivers/gpu/drm/amd/display/dc/inc/resource.h index c958ef37b78a..77a60aa9f27b 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/resource.h +++ b/drivers/gpu/drm/amd/display/dc/inc/resource.h @@ -427,22 +427,18 @@ struct pipe_ctx *resource_get_primary_dpp_pipe(const struct pipe_ctx *dpp_pipe); int resource_get_mpc_slice_index(const struct pipe_ctx *dpp_pipe); /* - * Get number of MPC "cuts" of the plane associated with the pipe. MPC slice - * count is equal to MPC splits + 1. For example if a plane is cut 3 times, it - * will have 4 pieces of slice. - * return - 0 if pipe is not used for a plane with MPCC combine. otherwise - * the number of MPC "cuts" for the plane. + * Get the number of MPC slices associated with the pipe. + * The function returns 0 if the pipe is not associated with an MPC combine + * pipe topology. */ -int resource_get_mpc_slice_count(const struct pipe_ctx *opp_head); +int resource_get_mpc_slice_count(const struct pipe_ctx *pipe); /* - * Get number of ODM "cuts" of the timing associated with the pipe. ODM slice - * count is equal to ODM splits + 1. For example if a timing is cut 3 times, it - * will have 4 pieces of slice. - * return - 0 if pipe is not used for ODM combine. otherwise - * the number of ODM "cuts" for the timing. + * Get the number of ODM slices associated with the pipe. + * The function returns 0 if the pipe is not associated with an ODM combine + * pipe topology. */ -int resource_get_odm_slice_count(const struct pipe_ctx *otg_master); +int resource_get_odm_slice_count(const struct pipe_ctx *pipe); /* Get the ODM slice index counting from 0 from left most slice */ int resource_get_odm_slice_index(const struct pipe_ctx *opp_head); diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index c4d71e7f18af..6f10052caeef 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -1829,7 +1829,21 @@ int dcn32_populate_dml_pipes_from_context( dcn32_zero_pipe_dcc_fraction(pipes, pipe_cnt); DC_FP_END(); pipes[pipe_cnt].pipe.dest.vfront_porch = timing->v_front_porch; - pipes[pipe_cnt].pipe.dest.odm_combine_policy = dm_odm_combine_policy_dal; + if (dc->config.enable_windowed_mpo_odm && + dc->debug.enable_single_display_2to1_odm_policy) { + switch (resource_get_odm_slice_count(pipe)) { + case 2: + pipes[pipe_cnt].pipe.dest.odm_combine_policy = dm_odm_combine_policy_2to1; + break; + case 4: + pipes[pipe_cnt].pipe.dest.odm_combine_policy = dm_odm_combine_policy_4to1; + break; + default: + pipes[pipe_cnt].pipe.dest.odm_combine_policy = dm_odm_combine_policy_dal; + } + } else { + pipes[pipe_cnt].pipe.dest.odm_combine_policy = dm_odm_combine_policy_dal; + } pipes[pipe_cnt].pipe.src.gpuvm_min_page_size_kbytes = 256; // according to spreadsheet pipes[pipe_cnt].pipe.src.unbounded_req_mode = false; pipes[pipe_cnt].pipe.scale_ratio_depth.lb_depth = dm_lb_19; From 93bafa32a6918154aa0caf9f66679a32c2431357 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Tue, 16 Jan 2024 19:10:45 +0800 Subject: [PATCH 184/270] drm/amdgpu: skip to program GFXDEC registers for suspend abort In the suspend abort cases, the gfx power rail doesn't turn off so some GFXDEC registers/CSB can't reset to default value and at this moment reinitialize GFXDEC/CSB will result in an unexpected error. So let skip those program sequence for the suspend abort case. Signed-off-by: Prike Liang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 3d8a48f46b01..6dce81a061ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1078,6 +1078,8 @@ struct amdgpu_device { bool in_s3; bool in_s4; bool in_s0ix; + /* indicate amdgpu suspension status */ + bool suspend_complete; enum pp_mp1_state mp1_state; struct amdgpu_doorbell_index doorbell_index; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 971acf01bea6..211501ea9169 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2476,6 +2476,7 @@ static int amdgpu_pmops_suspend(struct device *dev) struct drm_device *drm_dev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(drm_dev); + adev->suspend_complete = false; if (amdgpu_acpi_is_s0ix_active(adev)) adev->in_s0ix = true; else if (amdgpu_acpi_is_s3_active(adev)) @@ -2490,6 +2491,7 @@ static int amdgpu_pmops_suspend_noirq(struct device *dev) struct drm_device *drm_dev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(drm_dev); + adev->suspend_complete = true; if (amdgpu_acpi_should_gpu_reset(adev)) return amdgpu_asic_reset(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 69c500910746..3bc6943365a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3034,6 +3034,14 @@ static int gfx_v9_0_cp_gfx_start(struct amdgpu_device *adev) gfx_v9_0_cp_gfx_enable(adev, true); + /* Now only limit the quirk on the APU gfx9 series and already + * confirmed that the APU gfx10/gfx11 needn't such update. + */ + if (adev->flags & AMD_IS_APU && + adev->in_s3 && !adev->suspend_complete) { + DRM_INFO(" Will skip the CSB packet resubmit\n"); + return 0; + } r = amdgpu_ring_alloc(ring, gfx_v9_0_get_csb_size(adev) + 4 + 3); if (r) { DRM_ERROR("amdgpu: cp failed to lock ring (%d).\n", r); From 6ef82ac664bb9568ca3956e0d9c9c478e25077ff Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Wed, 17 Jan 2024 13:39:37 +0800 Subject: [PATCH 185/270] drm/amdgpu: reset gpu for s3 suspend abort case In the s3 suspend abort case some type of gfx9 power rail not turn off from FCH side and this will put the GPU in an unknown power status, so let's reset the gpu to a known good power state before reinitialize gpu device. Signed-off-by: Prike Liang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/soc15.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 15033efec2ba..c64c01e2944a 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1298,10 +1298,32 @@ static int soc15_common_suspend(void *handle) return soc15_common_hw_fini(adev); } +static bool soc15_need_reset_on_resume(struct amdgpu_device *adev) +{ + u32 sol_reg; + + sol_reg = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); + + /* Will reset for the following suspend abort cases. + * 1) Only reset limit on APU side, dGPU hasn't checked yet. + * 2) S3 suspend abort and TOS already launched. + */ + if (adev->flags & AMD_IS_APU && adev->in_s3 && + !adev->suspend_complete && + sol_reg) + return true; + + return false; +} + static int soc15_common_resume(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (soc15_need_reset_on_resume(adev)) { + dev_info(adev->dev, "S3 suspend abort case, let's reset ASIC.\n"); + soc15_asic_reset(adev); + } return soc15_common_hw_init(adev); } From 897925dcc5dfff5b3b23ba991a89fe3ebaca6ef8 Mon Sep 17 00:00:00 2001 From: Li Ma Date: Thu, 1 Feb 2024 19:31:06 +0800 Subject: [PATCH 186/270] drm/amdgpu: remove asymmetrical irq disabling in jpeg 4.0.5 suspend A supplement to commit: 615dd56ac5379f4239940be69139a33e79e59c67 There is an irq warning of jpeg during resume in s2idle process. No irq enabled in jpeg 4.0.5 resume. Fixes: 615dd56ac537 ("drm/amdgpu: remove asymmetrical irq disabling in vcn 4.0.5 suspend") Signed-off-by: Li Ma Acked-By: Saleemkhan Jamadar Reviewed-by: Yifan Zhang Reviewed-by: Veerabadhran Gopalakrishnan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c | 9 --------- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c index bc38b90f8cf8..88ea58d5c4ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c @@ -674,14 +674,6 @@ static int jpeg_v4_0_set_powergating_state(void *handle, return ret; } -static int jpeg_v4_0_set_interrupt_state(struct amdgpu_device *adev, - struct amdgpu_irq_src *source, - unsigned type, - enum amdgpu_interrupt_state state) -{ - return 0; -} - static int jpeg_v4_0_set_ras_interrupt_state(struct amdgpu_device *adev, struct amdgpu_irq_src *source, unsigned int type, @@ -765,7 +757,6 @@ static void jpeg_v4_0_set_dec_ring_funcs(struct amdgpu_device *adev) } static const struct amdgpu_irq_src_funcs jpeg_v4_0_irq_funcs = { - .set = jpeg_v4_0_set_interrupt_state, .process = jpeg_v4_0_process_interrupt, }; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c index 6ede85b28cc8..78b74daf4eeb 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c @@ -181,7 +181,6 @@ static int jpeg_v4_0_5_hw_fini(void *handle) RREG32_SOC15(JPEG, 0, regUVD_JRBC_STATUS)) jpeg_v4_0_5_set_powergating_state(adev, AMD_PG_STATE_GATE); } - amdgpu_irq_put(adev, &adev->jpeg.inst->irq, 0); return 0; } @@ -516,14 +515,6 @@ static int jpeg_v4_0_5_set_powergating_state(void *handle, return ret; } -static int jpeg_v4_0_5_set_interrupt_state(struct amdgpu_device *adev, - struct amdgpu_irq_src *source, - unsigned type, - enum amdgpu_interrupt_state state) -{ - return 0; -} - static int jpeg_v4_0_5_process_interrupt(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -603,7 +594,6 @@ static void jpeg_v4_0_5_set_dec_ring_funcs(struct amdgpu_device *adev) } static const struct amdgpu_irq_src_funcs jpeg_v4_0_5_irq_funcs = { - .set = jpeg_v4_0_5_set_interrupt_state, .process = jpeg_v4_0_5_process_interrupt, }; From 280df4996c2bfc0e340ae758ab6da35748853a7e Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Tue, 23 Jan 2024 12:20:06 -0500 Subject: [PATCH 187/270] drm/amd/display: Increase eval/entry delay for DCN35 [Why] To match firmware measurements and avoid hanging when accessing HW that's in idle. [How] Increase the delays to what we've measured. Reviewed-by: Ovidiu Bunea Acked-by: Hamza Mahfooz Signed-off-by: Nicholas Kazlauskas Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c index 761ec9891875..2142cee6cac2 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c @@ -780,8 +780,8 @@ static const struct dc_debug_options debug_defaults_drv = { .disable_z10 = false, .ignore_pg = true, .psp_disabled_wa = true, - .ips2_eval_delay_us = 200, - .ips2_entry_delay_us = 400, + .ips2_eval_delay_us = 1650, + .ips2_entry_delay_us = 800, .static_screen_wait_frames = 2, }; From 2dcf82a8e8dc930655787797ef8a3692b527c7a9 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Mon, 5 Feb 2024 15:55:48 +0800 Subject: [PATCH 188/270] drm/amdgpu: Fix shared buff copy to user ta if invoke node buffer |-------- ta type ----------| |-------- ta id ----------| |-------- cmd id ----------| |------ shared buf len -----| |------ shared buffer ------| ta if invoke node buffer is as above, copy shared buffer data to correct location Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c index 468a67b302d4..ca5c86e5f7cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c @@ -362,7 +362,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size } } - if (copy_to_user((char *)buf, context->mem_context.shared_buf, shared_buf_len)) + if (copy_to_user((char *)&buf[copy_pos], context->mem_context.shared_buf, shared_buf_len)) ret = -EFAULT; err_free_shared_buf: From e6a7df96facdcf5b1f71eb3ec26f2f9f6ad61e57 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Mon, 22 Jan 2024 13:43:46 -0500 Subject: [PATCH 189/270] drm/amd/display: Fix MST Null Ptr for RV The change try to fix below error specific to RV platform: BUG: kernel NULL pointer dereference, address: 0000000000000008 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 4 PID: 917 Comm: sway Not tainted 6.3.9-arch1-1 #1 124dc55df4f5272ccb409f39ef4872fc2b3376a2 Hardware name: LENOVO 20NKS01Y00/20NKS01Y00, BIOS R12ET61W(1.31 ) 07/28/2022 RIP: 0010:drm_dp_atomic_find_time_slots+0x5e/0x260 [drm_display_helper] Code: 01 00 00 48 8b 85 60 05 00 00 48 63 80 88 00 00 00 3b 43 28 0f 8d 2e 01 00 00 48 8b 53 30 48 8d 04 80 48 8d 04 c2 48 8b 40 18 <48> 8> RSP: 0018:ffff960cc2df77d8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8afb87e81280 RCX: 0000000000000224 RDX: ffff8afb9ee37c00 RSI: ffff8afb8da1a578 RDI: ffff8afb87e81280 RBP: ffff8afb83d67000 R08: 0000000000000001 R09: ffff8afb9652f850 R10: ffff960cc2df7908 R11: 0000000000000002 R12: 0000000000000000 R13: ffff8afb8d7688a0 R14: ffff8afb8da1a578 R15: 0000000000000224 FS: 00007f4dac35ce00(0000) GS:ffff8afe30b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000010ddc6000 CR4: 00000000003506e0 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x171/0x4e0 ? plist_add+0xbe/0x100 ? exc_page_fault+0x7c/0x180 ? asm_exc_page_fault+0x26/0x30 ? drm_dp_atomic_find_time_slots+0x5e/0x260 [drm_display_helper 0e67723696438d8e02b741593dd50d80b44c2026] ? drm_dp_atomic_find_time_slots+0x28/0x260 [drm_display_helper 0e67723696438d8e02b741593dd50d80b44c2026] compute_mst_dsc_configs_for_link+0x2ff/0xa40 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] ? fill_plane_buffer_attributes+0x419/0x510 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] compute_mst_dsc_configs_for_state+0x1e1/0x250 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] amdgpu_dm_atomic_check+0xecd/0x1190 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] drm_atomic_check_only+0x5c5/0xa40 drm_mode_atomic_ioctl+0x76e/0xbc0 ? _copy_to_user+0x25/0x30 ? drm_ioctl+0x296/0x4b0 ? __pfx_drm_mode_atomic_ioctl+0x10/0x10 drm_ioctl_kernel+0xcd/0x170 drm_ioctl+0x26d/0x4b0 ? __pfx_drm_mode_atomic_ioctl+0x10/0x10 amdgpu_drm_ioctl+0x4e/0x90 [amdgpu 62e600d2a75e9158e1cd0a243bdc8e6da040c054] __x64_sys_ioctl+0x94/0xd0 do_syscall_64+0x60/0x90 ? do_syscall_64+0x6c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f4dad17f76f Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c> RSP: 002b:00007ffd9ae859f0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 000055e255a55900 RCX: 00007f4dad17f76f RDX: 00007ffd9ae85a90 RSI: 00000000c03864bc RDI: 000000000000000b RBP: 00007ffd9ae85a90 R08: 0000000000000003 R09: 0000000000000003 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000c03864bc R13: 000000000000000b R14: 000055e255a7fc60 R15: 000055e255a01eb0 Modules linked in: rfcomm snd_seq_dummy snd_hrtimer snd_seq snd_seq_device ccm cmac algif_hash algif_skcipher af_alg joydev mousedev bnep > typec libphy k10temp ipmi_msghandler roles i2c_scmi acpi_cpufreq mac_hid nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_mas> CR2: 0000000000000008 ---[ end trace 0000000000000000 ]--- RIP: 0010:drm_dp_atomic_find_time_slots+0x5e/0x260 [drm_display_helper] Code: 01 00 00 48 8b 85 60 05 00 00 48 63 80 88 00 00 00 3b 43 28 0f 8d 2e 01 00 00 48 8b 53 30 48 8d 04 80 48 8d 04 c2 48 8b 40 18 <48> 8> RSP: 0018:ffff960cc2df77d8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8afb87e81280 RCX: 0000000000000224 RDX: ffff8afb9ee37c00 RSI: ffff8afb8da1a578 RDI: ffff8afb87e81280 RBP: ffff8afb83d67000 R08: 0000000000000001 R09: ffff8afb9652f850 R10: ffff960cc2df7908 R11: 0000000000000002 R12: 0000000000000000 R13: ffff8afb8d7688a0 R14: ffff8afb8da1a578 R15: 0000000000000224 FS: 00007f4dac35ce00(0000) GS:ffff8afe30b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000010ddc6000 CR4: 00000000003506e0 With a second DP monitor connected, drm_atomic_state in dm atomic check sequence does not include the connector state for the old/existing/first DP monitor. In such case, dsc determination policy would hit a null ptr when it tries to iterate the old/existing stream that does not have a valid connector state attached to it. When that happens, dm atomic check should call drm_atomic_get_connector_state for a new connector state. Existing dm has already done that, except for RV due to it does not have official support of dsc where .num_dsc is not defined in dcn10 resource cap, that prevent from getting drm_atomic_get_connector_state called. So, skip dsc determination policy for ASICs that don't have DSC support. Cc: stable@vger.kernel.org # 6.1+ Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2314 Reviewed-by: Wayne Lin Acked-by: Hamza Mahfooz Signed-off-by: Fangzhi Zuo Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d292f290cd6e..59d2eee72a32 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10731,11 +10731,13 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, goto fail; } - ret = compute_mst_dsc_configs_for_state(state, dm_state->context, vars); - if (ret) { - DRM_DEBUG_DRIVER("compute_mst_dsc_configs_for_state() failed\n"); - ret = -EINVAL; - goto fail; + if (dc_resource_is_dsc_encoding_supported(dc)) { + ret = compute_mst_dsc_configs_for_state(state, dm_state->context, vars); + if (ret) { + DRM_DEBUG_DRIVER("compute_mst_dsc_configs_for_state() failed\n"); + ret = -EINVAL; + goto fail; + } } ret = dm_update_mst_vcpi_slots_for_dsc(state, dm_state->context, vars); From ca8179ba11f211cdcb6c12ddd83814eaec999738 Mon Sep 17 00:00:00 2001 From: Alvin Lee Date: Fri, 26 Jan 2024 16:47:20 -0500 Subject: [PATCH 190/270] drm/amd/display: Update phantom pipe enable / disable sequence Previously we would call apply_ctx_to_hw to enable and disable phantom pipes. However, apply_ctx_to_hw can potentially update non-phantom pipes as well which is undesired. Instead of calling apply_ctx_to_hw as a whole, call the relevant helpers for each phantom pipe when enabling / disabling which will avoid us modifying hardware state for non-phantom pipes unknowingly. The use case is for an FRL display where FRL_Update is requested by the display. In this case link_state_valid flag is cleared in a passive callback thread and should be handled in the next stream / link update. However, due to the call to apply_ctx_to_hw for the phantom pipes during a flip, the main pipes were modified outside of the desired sequence (driver does not handle link_state_valid = 0 on flips). Cc: stable@vger.kernel.org # 6.6+ Reviewed-by: Samson Tam Acked-by: Hamza Mahfooz Signed-off-by: Alvin Lee Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 +- .../amd/display/dc/hwss/dce110/dce110_hwseq.c | 4 +- .../amd/display/dc/hwss/dce110/dce110_hwseq.h | 4 + .../amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 2 +- .../amd/display/dc/hwss/dcn20/dcn20_hwseq.h | 4 + .../amd/display/dc/hwss/dcn32/dcn32_hwseq.c | 74 +++++++++++++++++-- .../amd/display/dc/hwss/dcn32/dcn32_hwseq.h | 2 + .../amd/display/dc/hwss/dcn32/dcn32_init.c | 3 + .../drm/amd/display/dc/hwss/hw_sequencer.h | 1 + .../display/dc/hwss/hw_sequencer_private.h | 7 ++ 10 files changed, 93 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index aa7c02ba948e..2c424e435962 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3817,7 +3817,9 @@ static void commit_planes_for_stream(struct dc *dc, * programming has completed (we turn on phantom OTG in order * to complete the plane disable for phantom pipes). */ - dc->hwss.apply_ctx_to_hw(dc, context); + + if (dc->hwss.disable_phantom_streams) + dc->hwss.disable_phantom_streams(dc, context); } if (update_type != UPDATE_TYPE_FAST) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index 2352428bcea3..01493c49bd7a 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -1476,7 +1476,7 @@ static enum dc_status dce110_enable_stream_timing( return DC_OK; } -static enum dc_status apply_single_controller_ctx_to_hw( +enum dc_status dce110_apply_single_controller_ctx_to_hw( struct pipe_ctx *pipe_ctx, struct dc_state *context, struct dc *dc) @@ -2302,7 +2302,7 @@ enum dc_status dce110_apply_ctx_to_hw( if (pipe_ctx->top_pipe || pipe_ctx->prev_odm_pipe) continue; - status = apply_single_controller_ctx_to_hw( + status = dce110_apply_single_controller_ctx_to_hw( pipe_ctx, context, dc); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.h index 08028a1779ae..ed3cc3648e8e 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.h +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.h @@ -39,6 +39,10 @@ enum dc_status dce110_apply_ctx_to_hw( struct dc *dc, struct dc_state *context); +enum dc_status dce110_apply_single_controller_ctx_to_hw( + struct pipe_ctx *pipe_ctx, + struct dc_state *context, + struct dc *dc); void dce110_enable_stream(struct pipe_ctx *pipe_ctx); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 4853ecac53f9..931ac8ed7069 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -2561,7 +2561,7 @@ void dcn20_setup_vupdate_interrupt(struct dc *dc, struct pipe_ctx *pipe_ctx) tg->funcs->setup_vertical_interrupt2(tg, start_line); } -static void dcn20_reset_back_end_for_pipe( +void dcn20_reset_back_end_for_pipe( struct dc *dc, struct pipe_ctx *pipe_ctx, struct dc_state *context) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h index b94c85340abf..d950b3e54ec2 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.h @@ -84,6 +84,10 @@ enum dc_status dcn20_enable_stream_timing( void dcn20_disable_stream_gating(struct dc *dc, struct pipe_ctx *pipe_ctx); void dcn20_enable_stream_gating(struct dc *dc, struct pipe_ctx *pipe_ctx); void dcn20_setup_vupdate_interrupt(struct dc *dc, struct pipe_ctx *pipe_ctx); +void dcn20_reset_back_end_for_pipe( + struct dc *dc, + struct pipe_ctx *pipe_ctx, + struct dc_state *context); void dcn20_init_blank( struct dc *dc, struct timing_generator *tg); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c index 6c9299c7683d..aa36d7a56ca8 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c @@ -1474,9 +1474,44 @@ void dcn32_update_dsc_pg(struct dc *dc, } } +void dcn32_disable_phantom_streams(struct dc *dc, struct dc_state *context) +{ + struct dce_hwseq *hws = dc->hwseq; + int i; + + for (i = dc->res_pool->pipe_count - 1; i >= 0 ; i--) { + struct pipe_ctx *pipe_ctx_old = + &dc->current_state->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i]; + + if (!pipe_ctx_old->stream) + continue; + + if (dc_state_get_pipe_subvp_type(dc->current_state, pipe_ctx_old) != SUBVP_PHANTOM) + continue; + + if (pipe_ctx_old->top_pipe || pipe_ctx_old->prev_odm_pipe) + continue; + + if (!pipe_ctx->stream || pipe_need_reprogram(pipe_ctx_old, pipe_ctx) || + (pipe_ctx->stream && dc_state_get_pipe_subvp_type(context, pipe_ctx) != SUBVP_PHANTOM)) { + struct clock_source *old_clk = pipe_ctx_old->clock_source; + + if (hws->funcs.reset_back_end_for_pipe) + hws->funcs.reset_back_end_for_pipe(dc, pipe_ctx_old, dc->current_state); + if (hws->funcs.enable_stream_gating) + hws->funcs.enable_stream_gating(dc, pipe_ctx_old); + if (old_clk) + old_clk->funcs->cs_power_down(old_clk); + } + } +} + void dcn32_enable_phantom_streams(struct dc *dc, struct dc_state *context) { unsigned int i; + enum dc_status status = DC_OK; + struct dce_hwseq *hws = dc->hwseq; for (i = 0; i < dc->res_pool->pipe_count; i++) { struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; @@ -1497,16 +1532,39 @@ void dcn32_enable_phantom_streams(struct dc *dc, struct dc_state *context) } } for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *new_pipe = &context->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe_ctx_old = + &dc->current_state->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i]; - if (new_pipe->stream && dc_state_get_pipe_subvp_type(context, new_pipe) == SUBVP_PHANTOM) { - // If old context or new context has phantom pipes, apply - // the phantom timings now. We can't change the phantom - // pipe configuration safely without driver acquiring - // the DMCUB lock first. - dc->hwss.apply_ctx_to_hw(dc, context); - break; + if (pipe_ctx->stream == NULL) + continue; + + if (dc_state_get_pipe_subvp_type(context, pipe_ctx) != SUBVP_PHANTOM) + continue; + + if (pipe_ctx->stream == pipe_ctx_old->stream && + pipe_ctx->stream->link->link_state_valid) { + continue; } + + if (pipe_ctx_old->stream && !pipe_need_reprogram(pipe_ctx_old, pipe_ctx)) + continue; + + if (pipe_ctx->top_pipe || pipe_ctx->prev_odm_pipe) + continue; + + if (hws->funcs.apply_single_controller_ctx_to_hw) + status = hws->funcs.apply_single_controller_ctx_to_hw( + pipe_ctx, + context, + dc); + + ASSERT(status == DC_OK); + +#ifdef CONFIG_DRM_AMD_DC_FP + if (hws->funcs.resync_fifo_dccg_dio) + hws->funcs.resync_fifo_dccg_dio(hws, dc, context); +#endif } } diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h index cecf7f0f5671..069e20bc87c0 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h @@ -111,6 +111,8 @@ void dcn32_update_dsc_pg(struct dc *dc, void dcn32_enable_phantom_streams(struct dc *dc, struct dc_state *context); +void dcn32_disable_phantom_streams(struct dc *dc, struct dc_state *context); + void dcn32_init_blank( struct dc *dc, struct timing_generator *tg); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c index 427cfc8c24a4..e8ac94a005b8 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c @@ -109,6 +109,7 @@ static const struct hw_sequencer_funcs dcn32_funcs = { .get_dcc_en_bits = dcn10_get_dcc_en_bits, .commit_subvp_config = dcn32_commit_subvp_config, .enable_phantom_streams = dcn32_enable_phantom_streams, + .disable_phantom_streams = dcn32_disable_phantom_streams, .subvp_pipe_control_lock = dcn32_subvp_pipe_control_lock, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .subvp_pipe_control_lock_fast = dcn32_subvp_pipe_control_lock_fast, @@ -159,6 +160,8 @@ static const struct hwseq_private_funcs dcn32_private_funcs = { .set_pixels_per_cycle = dcn32_set_pixels_per_cycle, .resync_fifo_dccg_dio = dcn32_resync_fifo_dccg_dio, .is_dp_dig_pixel_rate_div_policy = dcn32_is_dp_dig_pixel_rate_div_policy, + .apply_single_controller_ctx_to_hw = dce110_apply_single_controller_ctx_to_hw, + .reset_back_end_for_pipe = dcn20_reset_back_end_for_pipe, }; void dcn32_hw_sequencer_init_functions(struct dc *dc) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h b/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h index a54399383318..64ca7c66509b 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h +++ b/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer.h @@ -379,6 +379,7 @@ struct hw_sequencer_funcs { struct dc_cursor_attributes *cursor_attr); void (*commit_subvp_config)(struct dc *dc, struct dc_state *context); void (*enable_phantom_streams)(struct dc *dc, struct dc_state *context); + void (*disable_phantom_streams)(struct dc *dc, struct dc_state *context); void (*subvp_pipe_control_lock)(struct dc *dc, struct dc_state *context, bool lock, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h b/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h index 6137cf09aa54..b3c62a82cb1c 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h +++ b/drivers/gpu/drm/amd/display/dc/hwss/hw_sequencer_private.h @@ -165,8 +165,15 @@ struct hwseq_private_funcs { void (*set_pixels_per_cycle)(struct pipe_ctx *pipe_ctx); void (*resync_fifo_dccg_dio)(struct dce_hwseq *hws, struct dc *dc, struct dc_state *context); + enum dc_status (*apply_single_controller_ctx_to_hw)( + struct pipe_ctx *pipe_ctx, + struct dc_state *context, + struct dc *dc); bool (*is_dp_dig_pixel_rate_div_policy)(struct pipe_ctx *pipe_ctx); #endif + void (*reset_back_end_for_pipe)(struct dc *dc, + struct pipe_ctx *pipe_ctx, + struct dc_state *context); }; struct dce_hwseq { From 29c5da1a124671caa87c4a936c625432c16ad8ca Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 11 Jan 2024 14:25:27 -0700 Subject: [PATCH 191/270] drm/amd/display: Disable ODM by default for DCN35 Just ensure that ODM optimization is disabled by default. Acked-by: Hamza Mahfooz Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c index 2142cee6cac2..1c3d89264ef7 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c @@ -2130,6 +2130,7 @@ static bool dcn35_resource_construct( dc->dml2_options.dcn_pipe_count = pool->base.pipe_count; dc->dml2_options.use_native_pstate_optimization = true; dc->dml2_options.use_native_soc_bb_construction = true; + dc->dml2_options.minimize_dispclk_using_odm = false; if (dc->config.EnableMinDispClkODM) dc->dml2_options.minimize_dispclk_using_odm = true; dc->dml2_options.enable_windowed_mpo_odm = dc->config.enable_windowed_mpo_odm; From 55173942a63668bdc1d61812c7c9e0406aefb5bf Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 31 Jan 2024 16:34:32 +0530 Subject: [PATCH 192/270] drm/amdgpu: Avoid fetching VRAM vendor info The present way to fetch VRAM vendor information turns out to be not reliable on GFX 9.4.3 dGPUs as well. Avoid using the data. Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 40a00ea0009f..e67a62db9e12 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1947,14 +1947,6 @@ static int gmc_v9_0_init_mem_ranges(struct amdgpu_device *adev) static void gmc_v9_4_3_init_vram_info(struct amdgpu_device *adev) { - static const u32 regBIF_BIOS_SCRATCH_4 = 0x50; - u32 vram_info; - - /* Only for dGPU, vendor informaton is reliable */ - if (!amdgpu_sriov_vf(adev) && !(adev->flags & AMD_IS_APU)) { - vram_info = RREG32(regBIF_BIOS_SCRATCH_4); - adev->gmc.vram_vendor = vram_info & 0xF; - } adev->gmc.vram_type = AMDGPU_VRAM_TYPE_HBM; adev->gmc.vram_width = 128 * 64; } From da48914e1fcdbf57f6b95d4552fcc088e6547ce4 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 2 Feb 2024 18:30:59 -0600 Subject: [PATCH 193/270] drm/amd/display: Clear phantom stream count and plane count When dc_state_destruct() was refactored the new phantom_stream_count and phantom_plane_count members weren't cleared. Fixes: 012a04b1d6af ("drm/amd/display: Refactor phantom resource allocation") Acked-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 88c6436b28b6..180ac47868c2 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -291,11 +291,14 @@ void dc_state_destruct(struct dc_state *state) dc_stream_release(state->phantom_streams[i]); state->phantom_streams[i] = NULL; } + state->phantom_stream_count = 0; for (i = 0; i < state->phantom_plane_count; i++) { dc_plane_state_release(state->phantom_planes[i]); state->phantom_planes[i] = NULL; } + state->phantom_plane_count = 0; + state->stream_mask = 0; memset(&state->res_ctx, 0, sizeof(state->res_ctx)); memset(&state->pp_display_cfg, 0, sizeof(state->pp_display_cfg)); From e63e35f0164c43fbc1adb481d6604f253b9f9667 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 5 Feb 2024 14:54:05 -0700 Subject: [PATCH 194/270] drm/amd/display: Increase frame-larger-than for all display_mode_vba files After a recent change in LLVM, allmodconfig (which has CONFIG_KCSAN=y and CONFIG_WERROR=y enabled) has a few new instances of -Wframe-larger-than for the mode support and system configuration functions: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn20/display_mode_vba_20v2.c:3393:6: error: stack frame size (2144) exceeds limit (2048) in 'dml20v2_ModeSupportAndSystemConfigurationFull' [-Werror,-Wframe-larger-than] 3393 | void dml20v2_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_lib) | ^ 1 error generated. drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn21/display_mode_vba_21.c:3520:6: error: stack frame size (2192) exceeds limit (2048) in 'dml21_ModeSupportAndSystemConfigurationFull' [-Werror,-Wframe-larger-than] 3520 | void dml21_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_lib) | ^ 1 error generated. drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn20/display_mode_vba_20.c:3286:6: error: stack frame size (2128) exceeds limit (2048) in 'dml20_ModeSupportAndSystemConfigurationFull' [-Werror,-Wframe-larger-than] 3286 | void dml20_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_lib) | ^ 1 error generated. Without the sanitizers enabled, there are no warnings. This was the catalyst for commit 6740ec97bcdb ("drm/amd/display: Increase frame warning limit with KASAN or KCSAN in dml2") and that same change was made to dml in commit 5b750b22530f ("drm/amd/display: Increase frame warning limit with KASAN or KCSAN in dml") but the frame_warn_flag variable was not applied to all files. Do so now to clear up the warnings and make all these files consistent. Cc: stable@vger.kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issue/1990 Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 6042a5a6a44f..59ade76ffb18 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -72,11 +72,11 @@ CFLAGS_$(AMDDALPATH)/dc/dml/display_mode_lib.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/display_mode_vba.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn10/dcn10_fpu.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/dcn20_fpu.o := $(dml_ccflags) -CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/display_mode_vba_20.o := $(dml_ccflags) +CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/display_mode_vba_20.o := $(dml_ccflags) $(frame_warn_flag) CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/display_rq_dlg_calc_20.o := $(dml_ccflags) -CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/display_mode_vba_20v2.o := $(dml_ccflags) +CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/display_mode_vba_20v2.o := $(dml_ccflags) $(frame_warn_flag) CFLAGS_$(AMDDALPATH)/dc/dml/dcn20/display_rq_dlg_calc_20v2.o := $(dml_ccflags) -CFLAGS_$(AMDDALPATH)/dc/dml/dcn21/display_mode_vba_21.o := $(dml_ccflags) +CFLAGS_$(AMDDALPATH)/dc/dml/dcn21/display_mode_vba_21.o := $(dml_ccflags) $(frame_warn_flag) CFLAGS_$(AMDDALPATH)/dc/dml/dcn21/display_rq_dlg_calc_21.o := $(dml_ccflags) CFLAGS_$(AMDDALPATH)/dc/dml/dcn30/display_mode_vba_30.o := $(dml_ccflags) $(frame_warn_flag) CFLAGS_$(AMDDALPATH)/dc/dml/dcn30/display_rq_dlg_calc_30.o := $(dml_ccflags) From 58fca355ad37dcb5f785d9095db5f748b79c5dc2 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Wed, 7 Feb 2024 10:20:57 +0530 Subject: [PATCH 195/270] drm/amd/display: Implement bounds check for stream encoder creation in DCN301 'stream_enc_regs' array is an array of dcn10_stream_enc_registers structures. The array is initialized with four elements, corresponding to the four calls to stream_enc_regs() in the array initializer. This means that valid indices for this array are 0, 1, 2, and 3. The error message 'stream_enc_regs' 4 <= 5 below, is indicating that there is an attempt to access this array with an index of 5, which is out of bounds. This could lead to undefined behavior Here, eng_id is used as an index to access the stream_enc_regs array. If eng_id is 5, this would result in an out-of-bounds access on the stream_enc_regs array. Thus fixing Buffer overflow error in dcn301_stream_encoder_create reported by Smatch: drivers/gpu/drm/amd/amdgpu/../display/dc/resource/dcn301/dcn301_resource.c:1011 dcn301_stream_encoder_create() error: buffer overflow 'stream_enc_regs' 4 <= 5 Fixes: 3a83e4e64bb1 ("drm/amd/display: Add dcn3.01 support to DC (v2)") Cc: Roman Li Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Signed-off-by: Srinivasan Shanmugam Reviewed-by: Roman Li Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/resource/dcn301/dcn301_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn301/dcn301_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn301/dcn301_resource.c index 511ff6b5b985..7538b548c572 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn301/dcn301_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn301/dcn301_resource.c @@ -999,7 +999,7 @@ static struct stream_encoder *dcn301_stream_encoder_create(enum engine_id eng_id vpg = dcn301_vpg_create(ctx, vpg_inst); afmt = dcn301_afmt_create(ctx, afmt_inst); - if (!enc1 || !vpg || !afmt) { + if (!enc1 || !vpg || !afmt || eng_id >= ARRAY_SIZE(stream_enc_regs)) { kfree(enc1); kfree(vpg); kfree(afmt); From 534c8a5b9d5d41d30cdcac93cfa1bca5e17be009 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 8 Dec 2023 13:48:22 +0530 Subject: [PATCH 196/270] drm/amdgpu: Fix HDP flush for VFs on nbio v7.9 HDP flush remapping is not done for VFs. Keep the original offsets in VF environment. Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index e90f33780803..b4723d68eab0 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -431,6 +431,12 @@ static void nbio_v7_9_init_registers(struct amdgpu_device *adev) u32 inst_mask; int i; + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = + SOC15_REG_OFFSET( + NBIO, 0, + regBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) + << 2; WREG32_SOC15(NBIO, 0, regXCC_DOORBELL_FENCE, 0xff & ~(adev->gfx.xcc_mask)); From 4054705215ad7e6dd8e4e6ff27c39abd7667f700 Mon Sep 17 00:00:00 2001 From: Francis Pravin Date: Wed, 7 Feb 2024 05:04:17 +0530 Subject: [PATCH 197/270] nvme: use ns->head->pi_size instead of t10_pi_tuple structure size Currently kernel supports 8 byte and 16 byte protection information. So, use ns->head->pi_size instead of sizeof(struct t10_pi_tuple). Signed-off-by: Francis Pravin Signed-off-by: Sathyavathi M Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 18f5c1be5d67..3dfd5ae99ae0 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -228,7 +228,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = (io.nblocks + 1) << ns->head->lba_shift; if ((io.control & NVME_RW_PRINFO_PRACT) && - ns->head->ms == sizeof(struct t10_pi_tuple)) { + (ns->head->ms == ns->head->pi_size)) { /* * Protection information is stripped/inserted by the * controller. From a12bc36032a2f7917068f9ce9eb26d869e54b31a Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Feb 2024 16:13:17 +0800 Subject: [PATCH 198/270] ksmbd: Add kernel-doc for ksmbd_extract_sharename() function The ksmbd_extract_sharename() function lacked a complete kernel-doc comment. This patch adds parameter descriptions and detailed function behavior to improve code readability and maintainability. Signed-off-by: Yang Li Acked-by: Randy Dunlap Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/misc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c index 9e8afaa686e3..1a5faa6f6e7b 100644 --- a/fs/smb/server/misc.c +++ b/fs/smb/server/misc.c @@ -261,6 +261,7 @@ out_ascii: /** * ksmbd_extract_sharename() - get share name from tree connect request + * @um: pointer to a unicode_map structure for character encoding handling * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error From 108a020c64434fed4b69762879d78cd24088b4c7 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 5 Feb 2024 14:19:16 +0300 Subject: [PATCH 199/270] ksmbd: free aux buffer if ksmbd_iov_pin_rsp_read fails ksmbd_iov_pin_rsp_read() doesn't free the provided aux buffer if it fails. Seems to be the caller's responsibility to clear the buffer in error case. Found by Linux Verification Center (linuxtesting.org). Fixes: e2b76ab8b5c9 ("ksmbd: add support for read compound") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index ba7a72a6a4f4..0c97d3c86072 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6173,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } kvfree(rpc_resp); } else { err = ksmbd_iov_pin_rsp(work, (void *)rsp, @@ -6384,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work) err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); - if (err) + if (err) { + kvfree(aux_payload_buf); goto out; + } ksmbd_fd_put(work, fp); return 0; From e656c7a9e59607d1672d85ffa9a89031876ffe67 Mon Sep 17 00:00:00 2001 From: Prakash Sangappa Date: Tue, 23 Jan 2024 12:04:42 -0800 Subject: [PATCH 200/270] mm: hugetlb pages should not be reserved by shmat() if SHM_NORESERVE For shared memory of type SHM_HUGETLB, hugetlb pages are reserved in shmget() call. If SHM_NORESERVE flags is specified then the hugetlb pages are not reserved. However when the shared memory is attached with the shmat() call the hugetlb pages are getting reserved incorrectly for SHM_HUGETLB shared memory created with SHM_NORESERVE which is a bug. ------------------------------- Following test shows the issue. $cat shmhtb.c int main() { int shmflags = 0660 | IPC_CREAT | SHM_HUGETLB | SHM_NORESERVE; int shmid; shmid = shmget(SKEY, SHMSZ, shmflags); if (shmid < 0) { printf("shmat: shmget() failed, %d\n", errno); return 1; } printf("After shmget()\n"); system("cat /proc/meminfo | grep -i hugepages_"); shmat(shmid, NULL, 0); printf("\nAfter shmat()\n"); system("cat /proc/meminfo | grep -i hugepages_"); shmctl(shmid, IPC_RMID, NULL); return 0; } #sysctl -w vm.nr_hugepages=20 #./shmhtb After shmget() HugePages_Total: 20 HugePages_Free: 20 HugePages_Rsvd: 0 HugePages_Surp: 0 After shmat() HugePages_Total: 20 HugePages_Free: 20 HugePages_Rsvd: 5 <-- HugePages_Surp: 0 -------------------------------- Fix is to ensure that hugetlb pages are not reserved for SHM_HUGETLB shared memory in the shmat() call. Link: https://lkml.kernel.org/r/1706040282-12388-1-git-send-email-prakash.sangappa@oracle.com Signed-off-by: Prakash Sangappa Acked-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 671664fed307..ee13c2ca8ad2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); + vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; + + vm_flags = vma->vm_flags; + /* + * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip + * reserving here. Note: only for SHM hugetlbfs file, the inode + * flag S_PRIVATE is set. + */ + if (inode->i_flags & S_PRIVATE) + vm_flags |= VM_NORESERVE; + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vma->vm_flags)) + vm_flags)) goto out; ret = 0; From daa694e4137571b4ebec330f9a9b4d54aa8b8089 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 16:50:50 +0100 Subject: [PATCH 201/270] getrusage: move thread_group_cputime_adjusted() outside of lock_task_sighand() Patch series "getrusage: use sig->stats_lock", v2. This patch (of 2): thread_group_cputime() does its own locking, we can safely shift thread_group_cputime_adjusted() which does another for_each_thread loop outside of ->siglock protected section. This is also preparation for the next patch which changes getrusage() to use stats_lock instead of siglock, thread_group_cputime() takes the same lock. With the current implementation recursive read_seqbegin_or_lock() is fine, thread_group_cputime() can't enter the slow mode if the caller holds stats_lock, yet this looks more safe and better performance-wise. Link: https://lkml.kernel.org/r/20240122155023.GA26169@redhat.com Link: https://lkml.kernel.org/r/20240122155050.GA26205@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dylan Hatch Tested-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index e219fcfa112d..70ad06ad852e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1785,17 +1785,19 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; - unsigned long maxrss = 0; + unsigned long maxrss; + struct mm_struct *mm; struct signal_struct *sig = p->signal; - memset((char *)r, 0, sizeof (*r)); + memset(r, 0, sizeof(*r)); utime = stime = 0; + maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = sig->maxrss; - goto out; + goto out_thread; } if (!lock_task_sighand(p, &flags)) @@ -1819,9 +1821,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) fallthrough; case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; r->ru_nvcsw += sig->nvcsw; r->ru_nivcsw += sig->nivcsw; r->ru_minflt += sig->min_flt; @@ -1839,19 +1838,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) } unlock_task_sighand(p, &flags); -out: + if (who == RUSAGE_CHILDREN) + goto out_children; + + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + +out_thread: + mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + +out_children: + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ r->ru_utime = ns_to_kernel_old_timeval(utime); r->ru_stime = ns_to_kernel_old_timeval(stime); - - if (who != RUSAGE_CHILDREN) { - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - setmax_mm_hiwater_rss(&maxrss, mm); - mmput(mm); - } - } - r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) From f7ec1cd5cc7ef3ad964b677ba82b8b77f1c93009 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 22 Jan 2024 16:50:53 +0100 Subject: [PATCH 202/270] getrusage: use sig->stats_lock rather than lock_task_sighand() lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call getrusage() at the same time and the process has NR_THREADS, spin_lock_irq will spin with irqs disabled O(NR_CPUS * NR_THREADS) time. Change getrusage() to use sig->stats_lock, it was specifically designed for this type of use. This way it runs lockless in the likely case. TODO: - Change do_task_stat() to use sig->stats_lock too, then we can remove spin_lock_irq(siglock) in wait_task_zombie(). - Turn sig->stats_lock into seqcount_rwlock_t, this way the readers in the slow mode won't exclude each other. See https://lore.kernel.org/all/20230913154907.GA26210@redhat.com/ - stats_lock has to disable irqs because ->siglock can be taken in irq context, it would be very nice to change __exit_signal() to avoid the siglock->stats_lock dependency. Link: https://lkml.kernel.org/r/20240122155053.GA26214@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dylan Hatch Tested-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 70ad06ad852e..f8e543f1e38a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,9 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long maxrss; struct mm_struct *mm; struct signal_struct *sig = p->signal; + unsigned int seq = 0; +retry: memset(r, 0, sizeof(*r)); utime = stime = 0; maxrss = 0; @@ -1800,8 +1802,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) goto out_thread; } - if (!lock_task_sighand(p, &flags)) - return; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: @@ -1829,14 +1830,23 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += sig->oublock; if (maxrss < sig->maxrss) maxrss = sig->maxrss; + + rcu_read_lock(); __for_each_thread(sig, t) accumulate_thread_rusage(t, r); + rcu_read_unlock(); + break; default: BUG(); } - unlock_task_sighand(p, &flags); + + if (need_seqretry(&sig->stats_lock, seq)) { + seq = 1; + goto retry; + } + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (who == RUSAGE_CHILDREN) goto out_children; From 60f92acb60a989b14e4b744501a0df0f82ef30a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jan 2024 16:33:55 +0100 Subject: [PATCH 203/270] fs/proc: do_task_stat: move thread_group_cputime_adjusted() outside of lock_task_sighand() Patch series "fs/proc: do_task_stat: use sig->stats_". do_task_stat() has the same problem as getrusage() had before "getrusage: use sig->stats_lock rather than lock_task_sighand()": a hard lockup. If NR_CPUS threads call lock_task_sighand() at the same time and the process has NR_THREADS, spin_lock_irq will spin with irqs disabled O(NR_CPUS * NR_THREADS) time. This patch (of 3): thread_group_cputime() does its own locking, we can safely shift thread_group_cputime_adjusted() which does another for_each_thread loop outside of ->siglock protected section. Not only this removes for_each_thread() from the critical section with irqs disabled, this removes another case when stats_lock is taken with siglock held. We want to remove this dependency, then we can change the users of stats_lock to not disable irqs. Link: https://lkml.kernel.org/r/20240123153313.GA21832@redhat.com Link: https://lkml.kernel.org/r/20240123153355.GA21854@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- fs/proc/array.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ff08a8957552..45ba91863808 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = 0; + cutime = cstime = 0; cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { @@ -546,7 +546,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) @@ -562,10 +561,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - if (!whole) { + + if (whole) { + thread_group_cputime_adjusted(task, &utime, &stime); + } else { + task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; - task_cputime_adjusted(task, &utime, &stime); gtime = task_gtime(task); } From 7601df8031fd67310af891897ef6cc0df4209305 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jan 2024 16:33:57 +0100 Subject: [PATCH 204/270] fs/proc: do_task_stat: use sig->stats_lock to gather the threads/children stats lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call do_task_stat() at the same time and the process has NR_THREADS, it will spin with irqs disabled O(NR_CPUS * NR_THREADS) time. Change do_task_stat() to use sig->stats_lock to gather the statistics outside of ->siglock protected section, in the likely case this code will run lockless. Link: https://lkml.kernel.org/r/20240123153357.GA21857@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- fs/proc/array.c | 58 +++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 45ba91863808..34a47fb0c57f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, int permitted; struct mm_struct *mm; unsigned long long start_time; - unsigned long cmin_flt = 0, cmaj_flt = 0; - unsigned long min_flt = 0, maj_flt = 0; - u64 cutime, cstime, utime, stime; - u64 cgtime, gtime; + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; + u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; + struct signal_struct *sig = task->signal; + unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = 0; - cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); @@ -527,27 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); - cmin_flt = sig->cmin_flt; - cmaj_flt = sig->cmaj_flt; - cutime = sig->cutime; - cstime = sig->cstime; - cgtime = sig->cgtime; rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); - /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t; - - __for_each_thread(sig, t) { - min_flt += t->min_flt; - maj_flt += t->maj_flt; - gtime += task_gtime(t); - } - - min_flt += sig->min_flt; - maj_flt += sig->maj_flt; - gtime += sig->gtime; - if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } @@ -562,6 +540,34 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + + if (whole) { + struct task_struct *t; + + min_flt = sig->min_flt; + maj_flt = sig->maj_flt; + gtime = sig->gtime; + + rcu_read_lock(); + __for_each_thread(sig, t) { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } + rcu_read_unlock(); + } + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + if (whole) { thread_group_cputime_adjusted(task, &utime, &stime); } else { From c1be35a16b2f1fe21f4f26f9de030ad6eaaf6a25 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jan 2024 16:34:00 +0100 Subject: [PATCH 205/270] exit: wait_task_zombie: kill the no longer necessary spin_lock_irq(siglock) After the recent changes nobody use siglock to read the values protected by stats_lock, we can kill spin_lock_irq(¤t->sighand->siglock) and update the comment. With this patch only __exit_signal() and thread_group_start_cputime() take stats_lock under siglock. Link: https://lkml.kernel.org/r/20240123153359.GA21866@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Dylan Hatch Cc: Eric W. Biederman Cc: Signed-off-by: Andrew Morton --- kernel/exit.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 3988a02efaef..dfb963d2f862 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1127,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads - * which can reap other children at the same time. Until - * we change k_getrusage()-like users to rely on this lock - * we have to take ->siglock as well. + * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(¤t->sighand->siglock); - write_seqlock(&psig->stats_lock); + write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1160,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - write_sequnlock(&psig->stats_lock); - spin_unlock_irq(¤t->sighand->siglock); + write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) From 56ae10cf628b02279980d17439c6241a643959c2 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 23 Jan 2024 14:17:55 +0000 Subject: [PATCH 206/270] mm/userfaultfd: UFFDIO_MOVE implementation should use ptep_get() Commit c33c794828f2 ("mm: ptep_get() conversion") converted all (non-arch) call sites to use ptep_get() instead of doing a direct dereference of the pte. Full rationale can be found in that commit's log. Since then, UFFDIO_MOVE has been implemented which does 7 direct pte dereferences. Let's fix those up to use ptep_get(). I've asserted in the past that there is no reliable automated mechanism to catch these; I'm relying on a combination of Coccinelle (which throws up a lot of false positives) and some compiler magic to force a compiler error on dereference. But given the frequency with which new issues are coming up, I'll add it to my todo list to try to find an automated solution. Link: https://lkml.kernel.org/r/20240123141755.3836179-1-ryan.roberts@arm.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Ryan Roberts Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 75fcf1f783bc..7cf7d4384259 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -902,8 +902,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(*src_pte, orig_src_pte) || - !pte_same(*dst_pte, orig_dst_pte)) { + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { err = -EAGAIN; goto out; } @@ -946,8 +946,8 @@ static int move_swap_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(*src_pte, orig_src_pte) || - !pte_same(*dst_pte, orig_dst_pte)) { + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1016,7 +1016,7 @@ retry: } spin_lock(dst_ptl); - orig_dst_pte = *dst_pte; + orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { err = -EEXIST; @@ -1024,7 +1024,7 @@ retry: } spin_lock(src_ptl); - orig_src_pte = *src_pte; + orig_src_pte = ptep_get(src_pte); spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) @@ -1054,7 +1054,7 @@ retry: * page isn't freed under us */ spin_lock(src_ptl); - if (!pte_same(orig_src_pte, *src_pte)) { + if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); err = -EAGAIN; goto out; From 67b8bcbaed4777871bb0dcc888fb02a614a98ab1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 24 Jan 2024 21:19:36 +0900 Subject: [PATCH 207/270] nilfs2: fix data corruption in dsync block recovery for small block sizes The helper function nilfs_recovery_copy_block() of nilfs_recovery_dsync_blocks(), which recovers data from logs created by data sync writes during a mount after an unclean shutdown, incorrectly calculates the on-page offset when copying repair data to the file's page cache. In environments where the block size is smaller than the page size, this flaw can cause data corruption and leak uninitialized memory bytes during the recovery process. Fix these issues by correcting this byte offset calculation on the page. Link: https://lkml.kernel.org/r/20240124121936.10575-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 0955b657938f..a9b8d77c8c1d 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - struct page *page) + loff_t pos, struct page *page) { struct buffer_head *bh_org; + size_t from = pos & ~PAGE_MASK; void *kaddr; bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); @@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, return -EIO; kaddr = kmap_atomic(page); - memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); kunmap_atomic(kaddr); brelse(bh_org); return 0; @@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, page); if (unlikely(err)) goto failed_page; From 9cee7e8ef3e31ca25b40ca52b8585dc6935deff2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 10:00:22 +0000 Subject: [PATCH 208/270] mm: memcg: optimize parent iteration in memcg_rstat_updated() In memcg_rstat_updated(), we iterate the memcg being updated and its parents to update memcg->vmstats_percpu->stats_updates in the fast path (i.e. no atomic updates). According to my math, this is 3 memory loads (and potentially 3 cache misses) per memcg: - Load the address of memcg->vmstats_percpu. - Load vmstats_percpu->stats_updates (based on some percpu calculation). - Load the address of the parent memcg. Avoid most of the cache misses by caching a pointer from each struct memcg_vmstats_percpu to its parent on the corresponding CPU. In this case, for the first memcg we have 2 memory loads (same as above): - Load the address of memcg->vmstats_percpu. - Load vmstats_percpu->stats_updates (based on some percpu calculation). Then for each additional memcg, we need a single load to get the parent's stats_updates directly. This reduces the number of loads from O(3N) to O(2+N) -- where N is the number of memcgs we need to iterate. Additionally, stash a pointer to memcg->vmstats in each struct memcg_vmstats_percpu such that we can access the atomic counter that all CPUs fold into, memcg->vmstats->stats_updates. memcg_should_flush_stats() is changed to memcg_vmstats_needs_flush() to accept a struct memcg_vmstats pointer accordingly. In struct memcg_vmstats_percpu, make sure both pointers together with stats_updates live on the same cacheline. Finally, update mem_cgroup_alloc() to take in a parent pointer and initialize the new cache pointers on each CPU. The percpu loop in mem_cgroup_alloc() may look concerning, but there are multiple similar loops in the cgroup creation path (e.g. cgroup_rstat_init()), most of which are hidden within alloc_percpu(). According to Oliver's testing [1], this fixes multiple 30-38% regressions in vm-scalability, will-it-scale-tlb_flush2, and will-it-scale-fallocate1. This comes at a cost of 2 more pointers per CPU (<2KB on a machine with 128 CPUs). [1] https://lore.kernel.org/lkml/ZbDJsfsZt2ITyo61@xsang-OptiPlex-9020/ [yosryahmed@google.com: fix struct memcg_vmstats_percpu size and alignment] Link: https://lkml.kernel.org/r/20240203044612.1234216-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20240124100023.660032-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Fixes: 8d59d2214c23 ("mm: memcg: make stats flushing threshold per-memcg") Tested-by: kernel test robot Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202401221624.cb53a8ca-oliver.sang@intel.com Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Greg Thelen Signed-off-by: Andrew Morton --- mm/memcontrol.c | 56 ++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46d8d02114cf..1ed40f9d3a27 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -621,6 +621,15 @@ static inline int memcg_events_index(enum vm_event_item idx) } struct memcg_vmstats_percpu { + /* Stats updates since the last flush */ + unsigned int stats_updates; + + /* Cached pointers for fast iteration in memcg_rstat_updated() */ + struct memcg_vmstats_percpu *parent; + struct memcg_vmstats *vmstats; + + /* The above should fit a single cacheline for memcg_rstat_updated() */ + /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; unsigned long events[NR_MEMCG_EVENTS]; @@ -632,10 +641,7 @@ struct memcg_vmstats_percpu { /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; - - /* Stats updates since the last flush */ - unsigned int stats_updates; -}; +} ____cacheline_aligned; struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ @@ -698,36 +704,35 @@ static void memcg_stats_unlock(void) } -static bool memcg_should_flush_stats(struct mem_cgroup *memcg) +static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic64_read(&memcg->vmstats->stats_updates) > + return atomic64_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + struct memcg_vmstats_percpu *statc; int cpu = smp_processor_id(); - unsigned int x; if (!val) return; cgroup_rstat_updated(memcg->css.cgroup, cpu); - - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates, - abs(val)); - - if (x < MEMCG_CHARGE_BATCH) + statc = this_cpu_ptr(memcg->vmstats_percpu); + for (; statc; statc = statc->parent) { + statc->stats_updates += abs(val); + if (statc->stats_updates < MEMCG_CHARGE_BATCH) continue; /* * If @memcg is already flush-able, increasing stats_updates is * redundant. Avoid the overhead of the atomic update. */ - if (!memcg_should_flush_stats(memcg)) - atomic64_add(x, &memcg->vmstats->stats_updates); - __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0); + if (!memcg_vmstats_needs_flush(statc->vmstats)) + atomic64_add(statc->stats_updates, + &statc->vmstats->stats_updates); + statc->stats_updates = 0; } } @@ -756,7 +761,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg) if (!memcg) memcg = root_mem_cgroup; - if (memcg_should_flush_stats(memcg)) + if (memcg_vmstats_needs_flush(memcg->vmstats)) do_flush_stats(memcg); } @@ -770,7 +775,7 @@ void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) static void flush_memcg_stats_dwork(struct work_struct *w) { /* - * Deliberately ignore memcg_should_flush_stats() here so that flushing + * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing * in latency-sensitive paths is as cheap as possible. */ do_flush_stats(root_mem_cgroup); @@ -5477,10 +5482,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) __mem_cgroup_free(memcg); } -static struct mem_cgroup *mem_cgroup_alloc(void) +static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { + struct memcg_vmstats_percpu *statc, *pstatc; struct mem_cgroup *memcg; - int node; + int node, cpu; int __maybe_unused i; long error = -ENOMEM; @@ -5504,6 +5510,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg->vmstats_percpu) goto fail; + for_each_possible_cpu(cpu) { + if (parent) + pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + statc->parent = parent ? pstatc : NULL; + statc->vmstats = memcg->vmstats; + } + for_each_node(node) if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; @@ -5549,7 +5563,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg, *old_memcg; old_memcg = set_active_memcg(parent); - memcg = mem_cgroup_alloc(); + memcg = mem_cgroup_alloc(parent); set_active_memcg(old_memcg); if (IS_ERR(memcg)) return ERR_CAST(memcg); From 2fde9e7f9e6dc38e1d7091b9705c22be945c8697 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Jan 2024 16:40:14 +0800 Subject: [PATCH 209/270] mm/memory-failure: fix crash in split_huge_page_to_list from soft_offline_page When I did soft offline stress test, a machine was observed to crash with the following message: kernel BUG at include/linux/memcontrol.h:554! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 5 PID: 3837 Comm: hwpoison.sh Not tainted 6.7.0-next-20240112-00001-g8ecf3e7fb7c8-dirty #97 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 RIP: 0010:folio_memcg+0xaf/0xd0 Code: 10 5b 5d c3 cc cc cc cc 48 c7 c6 08 b1 f2 b2 48 89 ef e8 b4 c5 f8 ff 90 0f 0b 48 c7 c6 d0 b0 f2 b2 48 89 ef e8 a2 c5 f8 ff 90 <0f> 0b 48 c7 c6 08 b1 f2 b2 48 89 ef e8 90 c5 f8 ff 90 0f 0b 66 66 RSP: 0018:ffffb6c043657c98 EFLAGS: 00000296 RAX: 000000000000004b RBX: ffff932bc1d1e401 RCX: ffff933abfb5c908 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff933abfb5c900 RBP: ffffea6f04019080 R08: ffffffffb3338ce8 R09: 0000000000009ffb R10: 00000000000004dd R11: ffffffffb3308d00 R12: ffffea6f04019080 R13: ffffea6f04019080 R14: 0000000000000001 R15: ffffb6c043657da0 FS: 00007f6c60f6b740(0000) GS:ffff933abfb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559c3bc8b980 CR3: 0000000107f1c000 CR4: 00000000000006f0 Call Trace: split_huge_page_to_list+0x4d/0x1380 try_to_split_thp_page+0x3a/0xf0 soft_offline_page+0x1ea/0x8a0 soft_offline_page_store+0x52/0x90 kernfs_fop_write_iter+0x118/0x1b0 vfs_write+0x30b/0x430 ksys_write+0x5e/0xe0 do_syscall_64+0xb0/0x1b0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7f6c60d14697 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffe9b72b8d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f6c60d14697 RDX: 000000000000000c RSI: 0000559c3bc8b980 RDI: 0000000000000001 RBP: 0000559c3bc8b980 R08: 00007f6c60dd1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007f6c60e1a780 R14: 00007f6c60e16600 R15: 00007f6c60e15a00 The problem is that page->mapping is overloaded with slab->slab_list or slabs fields now, so slab pages could be taken as non-LRU movable pages if field slabs contains PAGE_MAPPING_MOVABLE or slab_list->prev is set to LIST_POISON2. These slab pages will be treated as thp later leading to crash in split_huge_page_to_list(). Link: https://lkml.kernel.org/r/20240126065837.2100184-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20240124084014.1772906-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Fixes: 130d4df57390 ("mm/sl[au]b: rearrange struct slab fields to allow larger rcu_head") Reviewed-by: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 636280d04008..9349948f1abf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1377,6 +1377,9 @@ void ClearPageHWPoisonTakenOff(struct page *page) */ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { + if (PageSlab(page)) + return false; + /* Soft offline could migrate non-LRU movable pages */ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) return true; From 01c1484ac04790fe27a37f89dd3a350f99646815 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Tue, 24 Oct 2023 20:51:25 +0500 Subject: [PATCH 210/270] selftests: core: include linux/close_range.h for CLOSE_RANGE_* macros Correct header file is needed for getting CLOSE_RANGE_* macros. Previously it was tested with newer glibc which didn't show the need to include the header which was a mistake. Link: https://lkml.kernel.org/r/20231024155137.219700-1-usama.anjum@collabora.com Fixes: ec54424923cf ("selftests: core: remove duplicate defines") Reported-by: Aishwarya TCV Link: https://lore.kernel.org/all/7161219e-0223-d699-d6f3-81abd9abf13b@arm.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/core/close_range_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 534576f06df1..c59e4adb905d 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../kselftest_harness.h" #include "../clone3/clone3_selftests.h" From e870920bbe68e52335a4c31a059e6af6a9a59dbb Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 22 Jan 2024 22:43:05 -0800 Subject: [PATCH 211/270] arch/arm/mm: fix major fault accounting when retrying under per-VMA lock The change [1] missed ARM architecture when fixing major fault accounting for page fault retry under per-VMA lock. The user-visible effects is that it restores correct major fault accounting that was broken after [2] was merged in 6.7 kernel. The more detailed description is in [3] and this patch simply adds the same fix to ARM architecture which I missed in [3]. Add missing code to fix ARM architecture fault accounting. [1] 46e714c729c8 ("arch/mm/fault: fix major fault accounting when retrying under per-VMA lock") [2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/ [3] https://lore.kernel.org/all/20231226214610.109282-1-surenb@google.com/ Link: https://lkml.kernel.org/r/20240123064305.2829244-1-surenb@google.com Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock") Reported-by: Russell King (Oracle) Signed-off-by: Suren Baghdasaryan Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/arm/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index e96fb40b9cc3..07565b593ed6 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -298,6 +298,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { From 4c2da3188b848d33c26d7f0f8b14f3150331c923 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 26 Jan 2024 12:25:48 +0900 Subject: [PATCH 212/270] mm/madvise: don't forget to leave lazy MMU mode in madvise_cold_or_pageout_pte_range() We need to leave lazy MMU mode before unlocking. Link: https://lkml.kernel.org/r/20240126032608.355899-1-senozhatsky@chromium.org Fixes: b2f557a21bc8 ("mm/madvise: add cond_resched() in madvise_cold_or_pageout_pte_range()") Signed-off-by: Sergey Senozhatsky Cc: Jiexun Wang Signed-off-by: Andrew Morton --- mm/madvise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/madvise.c b/mm/madvise.c index 912155a94ed5..cfa5e7288261 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -429,6 +429,7 @@ restart: if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; From 2e601e1e8e4b330020a346c55ba111d49e0b188e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:34:38 -0500 Subject: [PATCH 213/270] mm: zswap: fix objcg use-after-free in entry destruction In the per-memcg LRU universe, LRU removal uses entry->objcg to determine which list count needs to be decreased. Drop the objcg reference after updating the LRU, to fix a possible use-after-free. Link: https://lkml.kernel.org/r/20240130013438.565167-1-hannes@cmpxchg.org Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware") Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ca25b676048e..0a94b197ed32 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -536,10 +536,6 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); - } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { @@ -548,6 +544,10 @@ static void zswap_free_entry(struct zswap_entry *entry) atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); From f2076032096775d1bb1af16b6eddbc6534575328 Mon Sep 17 00:00:00 2001 From: John Moon Date: Wed, 31 Jan 2024 03:43:18 +0000 Subject: [PATCH 214/270] mailmap: switch email address for John Moon Add current email address as QUIC email is no longer active. Link: https://lkml.kernel.org/r/20240131034311.46706-1-john@jmoon.dev Signed-off-by: John Moon Acked-by: Trilok Soni Cc: Elliot Berman Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 04998f7bda81..8ae00bd3708a 100644 --- a/.mailmap +++ b/.mailmap @@ -289,6 +289,7 @@ Johan Hovold John Crispin John Fastabend John Keeping +John Moon John Paul Adrian Glaubitz John Stultz From 79d72c68c58784a3e1cd2378669d51bfd0cb7498 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 30 Jan 2024 22:04:18 +0100 Subject: [PATCH 215/270] fs,hugetlb: fix NULL pointer dereference in hugetlbs_fill_super When configuring a hugetlb filesystem via the fsconfig() syscall, there is a possible NULL dereference in hugetlbfs_fill_super() caused by assigning NULL to ctx->hstate in hugetlbfs_parse_param() when the requested pagesize is non valid. E.g: Taking the following steps: fd = fsopen("hugetlbfs", FSOPEN_CLOEXEC); fsconfig(fd, FSCONFIG_SET_STRING, "pagesize", "1024", 0); fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); Given that the requested "pagesize" is invalid, ctxt->hstate will be replaced with NULL, losing its previous value, and we will print an error: ... ... case Opt_pagesize: ps = memparse(param->string, &rest); ctx->hstate = h; if (!ctx->hstate) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } return 0; ... ... This is a problem because later on, we will dereference ctxt->hstate in hugetlbfs_fill_super() ... ... sb->s_blocksize = huge_page_size(ctx->hstate); ... ... Causing below Oops. Fix this by replacing cxt->hstate value only when then pagesize is known to be valid. kernel: hugetlbfs: Unsupported page size 0 MB kernel: BUG: kernel NULL pointer dereference, address: 0000000000000028 kernel: #PF: supervisor read access in kernel mode kernel: #PF: error_code(0x0000) - not-present page kernel: PGD 800000010f66c067 P4D 800000010f66c067 PUD 1b22f8067 PMD 0 kernel: Oops: 0000 [#1] PREEMPT SMP PTI kernel: CPU: 4 PID: 5659 Comm: syscall Tainted: G E 6.8.0-rc2-default+ #22 5a47c3fef76212addcc6eb71344aabc35190ae8f kernel: Hardware name: Intel Corp. GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017 kernel: RIP: 0010:hugetlbfs_fill_super+0xb4/0x1a0 kernel: Code: 48 8b 3b e8 3e c6 ed ff 48 85 c0 48 89 45 20 0f 84 d6 00 00 00 48 b8 ff ff ff ff ff ff ff 7f 4c 89 e7 49 89 44 24 20 48 8b 03 <8b> 48 28 b8 00 10 00 00 48 d3 e0 49 89 44 24 18 48 8b 03 8b 40 28 kernel: RSP: 0018:ffffbe9960fcbd48 EFLAGS: 00010246 kernel: RAX: 0000000000000000 RBX: ffff9af5272ae780 RCX: 0000000000372004 kernel: RDX: ffffffffffffffff RSI: ffffffffffffffff RDI: ffff9af555e9b000 kernel: RBP: ffff9af52ee66b00 R08: 0000000000000040 R09: 0000000000370004 kernel: R10: ffffbe9960fcbd48 R11: 0000000000000040 R12: ffff9af555e9b000 kernel: R13: ffffffffa66b86c0 R14: ffff9af507d2f400 R15: ffff9af507d2f400 kernel: FS: 00007ffbc0ba4740(0000) GS:ffff9b0bd7000000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000028 CR3: 00000001b1ee0000 CR4: 00000000001506f0 kernel: Call Trace: kernel: kernel: ? __die_body+0x1a/0x60 kernel: ? page_fault_oops+0x16f/0x4a0 kernel: ? search_bpf_extables+0x65/0x70 kernel: ? fixup_exception+0x22/0x310 kernel: ? exc_page_fault+0x69/0x150 kernel: ? asm_exc_page_fault+0x22/0x30 kernel: ? __pfx_hugetlbfs_fill_super+0x10/0x10 kernel: ? hugetlbfs_fill_super+0xb4/0x1a0 kernel: ? hugetlbfs_fill_super+0x28/0x1a0 kernel: ? __pfx_hugetlbfs_fill_super+0x10/0x10 kernel: vfs_get_super+0x40/0xa0 kernel: ? __pfx_bpf_lsm_capable+0x10/0x10 kernel: vfs_get_tree+0x25/0xd0 kernel: vfs_cmd_create+0x64/0xe0 kernel: __x64_sys_fsconfig+0x395/0x410 kernel: do_syscall_64+0x80/0x160 kernel: ? syscall_exit_to_user_mode+0x82/0x240 kernel: ? do_syscall_64+0x8d/0x160 kernel: ? syscall_exit_to_user_mode+0x82/0x240 kernel: ? do_syscall_64+0x8d/0x160 kernel: ? exc_page_fault+0x69/0x150 kernel: entry_SYSCALL_64_after_hwframe+0x6e/0x76 kernel: RIP: 0033:0x7ffbc0cb87c9 kernel: Code: 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 96 0d 00 f7 d8 64 89 01 48 kernel: RSP: 002b:00007ffc29d2f388 EFLAGS: 00000206 ORIG_RAX: 00000000000001af kernel: RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ffbc0cb87c9 kernel: RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003 kernel: RBP: 00007ffc29d2f3b0 R08: 0000000000000000 R09: 0000000000000000 kernel: R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 kernel: R13: 00007ffc29d2f4c0 R14: 0000000000000000 R15: 0000000000000000 kernel: kernel: Modules linked in: rpcsec_gss_krb5(E) auth_rpcgss(E) nfsv4(E) dns_resolver(E) nfs(E) lockd(E) grace(E) sunrpc(E) netfs(E) af_packet(E) bridge(E) stp(E) llc(E) iscsi_ibft(E) iscsi_boot_sysfs(E) intel_rapl_msr(E) intel_rapl_common(E) iTCO_wdt(E) intel_pmc_bxt(E) sb_edac(E) iTCO_vendor_support(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) rfkill(E) ipmi_ssif(E) kvm(E) acpi_ipmi(E) irqbypass(E) pcspkr(E) igb(E) ipmi_si(E) mei_me(E) i2c_i801(E) joydev(E) intel_pch_thermal(E) i2c_smbus(E) dca(E) lpc_ich(E) mei(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_pad(E) tiny_power_button(E) button(E) fuse(E) efi_pstore(E) configfs(E) ip_tables(E) x_tables(E) ext4(E) mbcache(E) jbd2(E) hid_generic(E) usbhid(E) sd_mod(E) t10_pi(E) crct10dif_pclmul(E) crc32_pclmul(E) crc32c_intel(E) polyval_clmulni(E) ahci(E) xhci_pci(E) polyval_generic(E) gf128mul(E) ghash_clmulni_intel(E) sha512_ssse3(E) sha256_ssse3(E) xhci_pci_renesas(E) libahci(E) ehci_pci(E) sha1_ssse3(E) xhci_hcd(E) ehci_hcd(E) libata(E) kernel: mgag200(E) i2c_algo_bit(E) usbcore(E) wmi(E) sg(E) dm_multipath(E) dm_mod(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) scsi_mod(E) scsi_common(E) aesni_intel(E) crypto_simd(E) cryptd(E) kernel: Unloaded tainted modules: acpi_cpufreq(E):1 fjes(E):1 kernel: CR2: 0000000000000028 kernel: ---[ end trace 0000000000000000 ]--- kernel: RIP: 0010:hugetlbfs_fill_super+0xb4/0x1a0 kernel: Code: 48 8b 3b e8 3e c6 ed ff 48 85 c0 48 89 45 20 0f 84 d6 00 00 00 48 b8 ff ff ff ff ff ff ff 7f 4c 89 e7 49 89 44 24 20 48 8b 03 <8b> 48 28 b8 00 10 00 00 48 d3 e0 49 89 44 24 18 48 8b 03 8b 40 28 kernel: RSP: 0018:ffffbe9960fcbd48 EFLAGS: 00010246 kernel: RAX: 0000000000000000 RBX: ffff9af5272ae780 RCX: 0000000000372004 kernel: RDX: ffffffffffffffff RSI: ffffffffffffffff RDI: ffff9af555e9b000 kernel: RBP: ffff9af52ee66b00 R08: 0000000000000040 R09: 0000000000370004 kernel: R10: ffffbe9960fcbd48 R11: 0000000000000040 R12: ffff9af555e9b000 kernel: R13: ffffffffa66b86c0 R14: ffff9af507d2f400 R15: ffff9af507d2f400 kernel: FS: 00007ffbc0ba4740(0000) GS:ffff9b0bd7000000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000028 CR3: 00000001b1ee0000 CR4: 00000000001506f0 Link: https://lkml.kernel.org/r/20240130210418.3771-1-osalvador@suse.de Fixes: 32021982a324 ("hugetlbfs: Convert to fs_context") Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Acked-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ee13c2ca8ad2..d746866ae3b6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1365,6 +1365,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; + struct hstate *h; char *rest; unsigned long ps; int opt; @@ -1409,11 +1410,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_pagesize: ps = memparse(param->string, &rest); - ctx->hstate = size_to_hstate(ps); - if (!ctx->hstate) { + h = size_to_hstate(ps); + if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } + ctx->hstate = h; return 0; case Opt_min_size: From 27d3969b47cc38810b3fd65d72231940e8671e6c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:49 +0000 Subject: [PATCH 216/270] mm/zswap: don't return LRU_SKIP if we have dropped lru lock LRU_SKIP can only be returned if we don't ever dropped lru lock, or we need to return LRU_RETRY to restart from the head of lru list. Otherwise, the iteration might continue from a cursor position that was freed while the locks were dropped. Actually we may need to introduce another LRU_STOP to really terminate the ongoing shrinking scan process, when we encounter a warm page already in the swap cache. The current list_lru implementation doesn't have this function to early break from __list_lru_walk_one. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-1-b10479847099@bytedance.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0a94b197ed32..350dd2fc8159 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -895,10 +895,8 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) { - ret = LRU_SKIP; + if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - } goto put_unlock; } From 6f1f15a5e492082c8082bea56d87852b65588b5c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 1 Feb 2024 10:10:08 +0800 Subject: [PATCH 217/270] MAINTAINERS: Leo Yan has moved I will lose access to my @linaro.org email address next week, update the MAINTAINERS file and map it in .mailmap with the new email address. Link: https://lkml.kernel.org/r/20240201021022.886-1-leo.yan@linux.dev Signed-off-by: Leo Yan Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: James Clark Cc: Mike Leach Cc: Namhyung Kim Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 8ae00bd3708a..ee8f03cc7f72 100644 --- a/.mailmap +++ b/.mailmap @@ -345,6 +345,7 @@ Leonid I Ananiev Leon Romanovsky Leon Romanovsky Leon Romanovsky +Leo Yan Liam Mark Linas Vepstas Linus Lüssing diff --git a/MAINTAINERS b/MAINTAINERS index 960512bec428..95b47e3cee0a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17183,7 +17183,7 @@ R: John Garry R: Will Deacon R: James Clark R: Mike Leach -R: Leo Yan +R: Leo Yan L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: tools/build/feature/test-libopencsd.c From 38296afe3c6ee07319e01bb249aa4bb47c07b534 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 31 Jan 2024 23:56:57 +0900 Subject: [PATCH 218/270] nilfs2: fix hang in nilfs_lookup_dirty_data_buffers() Syzbot reported a hang issue in migrate_pages_batch() called by mbind() and nilfs_lookup_dirty_data_buffers() called in the log writer of nilfs2. While migrate_pages_batch() locks a folio and waits for the writeback to complete, the log writer thread that should bring the writeback to completion picks up the folio being written back in nilfs_lookup_dirty_data_buffers() that it calls for subsequent log creation and was trying to lock the folio. Thus causing a deadlock. In the first place, it is unexpected that folios/pages in the middle of writeback will be updated and become dirty. Nilfs2 adds a checksum to verify the validity of the log being written and uses it for recovery at mount, so data changes during writeback are suppressed. Since this is broken, an unclean shutdown could potentially cause recovery to fail. Investigation revealed that the root cause is that the wait for writeback completion in nilfs_page_mkwrite() is conditional, and if the backing device does not require stable writes, data may be modified without waiting. Fix these issues by making nilfs_page_mkwrite() wait for writeback to finish regardless of the stable write requirement of the backing device. Link: https://lkml.kernel.org/r/20240131145657.4209-1-konishi.ryusuke@gmail.com Fixes: 1d1d1a767206 ("mm: only enforce stable page writes if the backing device requires it") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+ee2ae68da3b22d04cd8d@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/00000000000047d819061004ad6c@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index bec33b89a075..0e3fc5ba33c7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -107,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - folio_wait_stable(folio); + /* + * Since checksumming including data blocks is performed to determine + * the validity of the log to be written and used for recovery, it is + * necessary to wait for writeback to finish here, regardless of the + * stable write requirement of the backing device. + */ + folio_wait_writeback(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); From b9e4bc1046d20e0623a80660ef8627448056f817 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 2 Feb 2024 11:19:56 -0800 Subject: [PATCH 219/270] mm/damon/sysfs-schemes: fix wrong DAMOS tried regions update timeout setup DAMON sysfs interface's update_schemes_tried_regions command has a timeout of two apply intervals of the DAMOS scheme. Having zero value DAMOS scheme apply interval means it will use the aggregation interval as the value. However, the timeout setup logic is mistakenly using the sampling interval insted of the aggregartion interval for the case. This could cause earlier-than-expected timeout of the command. Fix it. Link: https://lkml.kernel.org/r/20240202191956.88791-1-sj@kernel.org Fixes: 7d6fa31a2fd7 ("mm/damon/sysfs-schemes: add timeout for update_schemes_tried_regions") Signed-off-by: SeongJae Park Cc: # 6.7.x Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 8dbaac6e5c2d..dd2fb5127009 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2194,7 +2194,7 @@ static void damos_tried_regions_init_upd_status( sysfs_regions->upd_timeout_jiffies = jiffies + 2 * usecs_to_jiffies(scheme->apply_interval_us ? scheme->apply_interval_us : - ctx->attrs.sample_interval); + ctx->attrs.aggr_interval); } } From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 4 Feb 2024 01:16:45 +0900 Subject: [PATCH 220/270] nilfs2: fix potential bug in end_buffer_async_write According to a syzbot report, end_buffer_async_write(), which handles the completion of block device writes, may detect abnormal condition of the buffer async_write flag and cause a BUG_ON failure when using nilfs2. Nilfs2 itself does not use end_buffer_async_write(). But, the async_write flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks") as a means of resolving double list insertion of dirty blocks in nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the resulting crash. This modification is safe as long as it is used for file data and b-tree node blocks where the page caches are independent. However, it was irrelevant and redundant to also introduce async_write for segment summary and super root blocks that share buffers with the backing device. This led to the possibility that the BUG_ON check in end_buffer_async_write would fail as described above, if independent writebacks of the backing device occurred in parallel. The use of async_write for segment summary buffers has already been removed in a previous change. Fix this issue by removing the manipulation of the async_write flag for the remaining super root block buffer. Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+5c04210f7c7f897c1e7f@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2590a0860eab..2bfb08052d39 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - set_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); @@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } break; } + set_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_begin_folio_io(fs_folio); fs_folio = bh->b_folio; @@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); if (bh->b_folio != bd_folio) { @@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) } break; } + clear_buffer_async_write(bh); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, err); fs_folio = bh->b_folio; @@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Redirected)); - set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); if (bh->b_folio != bd_folio) { folio_end_writeback(bd_folio); bd_folio = bh->b_folio; @@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) update_sr = true; break; } + set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh->b_folio != fs_folio) { nilfs_end_folio_io(fs_folio, 0); fs_folio = bh->b_folio; From ddc7d4c584704666fe7088bbd9ec2d72d0f63e65 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 1 Feb 2024 09:55:32 -0800 Subject: [PATCH 221/270] drm/xe: Fix loop in vm_bind_ioctl_ops_unwind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic for the unwind loop is incorrect resulting in an infinite loop. Fix to unwind to go from the last operations list to he first. Fixes: 617eebb9c480 ("drm/xe: Fix array of binds") Signed-off-by: Matthew Brost Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20240201175532.2303168-1-matthew.brost@intel.com (cherry picked from commit 3acc1ff1a72fce00cdbd3ef1c27108a967fd5616) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 30db264d34a3..d45cbf75d203 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2669,7 +2669,7 @@ static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm, { int i; - for (i = num_ops_list - 1; i; ++i) { + for (i = num_ops_list - 1; i >= 0; --i) { struct drm_gpuva_ops *__ops = ops[i]; struct drm_gpuva_op *__op; From fc29b6d5ab5395dcb9f35de71e0347f3a6bca542 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 31 Jan 2024 16:48:48 -0800 Subject: [PATCH 222/270] drm/xe: Take a reference in xe_exec_queue_last_fence_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Take a reference in xe_exec_queue_last_fence_get(). Also fix a reference counting underflow bug VM bind and unbind. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20240201004849.2219558-2-matthew.brost@intel.com (cherry picked from commit a856b67a84169e065ebbeee50258936b1eacc9eb) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec_queue.c | 8 ++++++-- drivers/gpu/drm/xe/xe_migrate.c | 5 ++++- drivers/gpu/drm/xe/xe_sched_job.c | 1 - drivers/gpu/drm/xe/xe_sync.c | 2 -- drivers/gpu/drm/xe/xe_vm.c | 1 + 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index bcfc4127c7c5..254b1d3af4cb 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -926,20 +926,24 @@ void xe_exec_queue_last_fence_put_unlocked(struct xe_exec_queue *q) * @q: The exec queue * @vm: The VM the engine does a bind or exec for * - * Get last fence, does not take a ref + * Get last fence, takes a ref * * Returns: last fence if not signaled, dma fence stub if signaled */ struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *q, struct xe_vm *vm) { + struct dma_fence *fence; + xe_exec_queue_last_fence_lockdep_assert(q, vm); if (q->last_fence && test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &q->last_fence->flags)) xe_exec_queue_last_fence_put(q, vm); - return q->last_fence ? q->last_fence : dma_fence_get_stub(); + fence = q->last_fence ? q->last_fence : dma_fence_get_stub(); + dma_fence_get(fence); + return fence; } /** diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 5c6c54624252..0cc31837ef26 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1204,8 +1204,11 @@ static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q, } if (q) { fence = xe_exec_queue_last_fence_get(q, vm); - if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { + dma_fence_put(fence); return false; + } + dma_fence_put(fence); } return true; diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 01106a1156ad..4e2ccad0e52f 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -274,7 +274,6 @@ int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm) struct dma_fence *fence; fence = xe_exec_queue_last_fence_get(job->q, vm); - dma_fence_get(fence); return drm_sched_job_add_dependency(&job->drm, fence); } diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index e4c220cf9115..aab92bee1d7c 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -307,7 +307,6 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync, /* Easy case... */ if (!num_in_fence) { fence = xe_exec_queue_last_fence_get(q, vm); - dma_fence_get(fence); return fence; } @@ -322,7 +321,6 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync, } } fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm); - dma_fence_get(fences[current_fence - 1]); cf = dma_fence_array_create(num_in_fence, fences, vm->composite_fence_ctx, vm->composite_fence_seqno++, diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index d45cbf75d203..96b1833c0e8c 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1984,6 +1984,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma, xe_exec_queue_last_fence_get(wait_exec_queue, vm); xe_sync_entry_signal(&syncs[i], NULL, fence); + dma_fence_put(fence); } } From 21abf108a062fa0323077b5ba3d26e2c0bba9232 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Wed, 31 Jan 2024 16:48:49 -0800 Subject: [PATCH 223/270] drm/xe: Pick correct userptr VMA to repin on REMAP op failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A REMAP op is composed of 3 VMA's - unmap, prev map, and next map. When op_execute fails with -EAGAIN we need to update the local VMA pointer to the current op state and then repin the VMA if it is a userptr. Fixes a failure seen in xe_vm.munmap-style-unbind-userptr-one-partial. Fixes: b06d47be7c83 ("drm/xe: Port Xe to GPUVA") Signed-off-by: Matthew Brost Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20240201004849.2219558-3-matthew.brost@intel.com (cherry picked from commit 447f74d223b4f6cbab74963bf1099050c15374ce) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 96b1833c0e8c..ca11f8381115 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2531,13 +2531,25 @@ retry_userptr: } drm_exec_fini(&exec); - if (err == -EAGAIN && xe_vma_is_userptr(vma)) { + if (err == -EAGAIN) { lockdep_assert_held_write(&vm->lock); - err = xe_vma_userptr_pin_pages(to_userptr_vma(vma)); - if (!err) - goto retry_userptr; - trace_xe_vma_fail(vma); + if (op->base.op == DRM_GPUVA_OP_REMAP) { + if (!op->remap.unmap_done) + vma = gpuva_to_vma(op->base.remap.unmap->va); + else if (op->remap.prev) + vma = op->remap.prev; + else + vma = op->remap.next; + } + + if (xe_vma_is_userptr(vma)) { + err = xe_vma_userptr_pin_pages(to_userptr_vma(vma)); + if (!err) + goto retry_userptr; + + trace_xe_vma_fail(vma); + } } return err; From 90773aaf9129ea6f47915bd3c47da261abe6a447 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 12:48:02 +0100 Subject: [PATCH 224/270] drm/xe: circumvent bogus stringop-overflow warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-13 warns about an array overflow that it sees but that is prevented by the "asid % NUM_PF_QUEUE" calculation: drivers/gpu/drm/xe/xe_gt_pagefault.c: In function 'xe_guc_pagefault_handler': include/linux/fortify-string.h:57:33: error: writing 16 bytes into a region of size 0 [-Werror=stringop-overflow=] include/linux/fortify-string.h:689:26: note: in expansion of macro '__fortify_memcpy_chk' 689 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/xe/xe_gt_pagefault.c:341:17: note: in expansion of macro 'memcpy' 341 | memcpy(pf_queue->data + pf_queue->tail, msg, len * sizeof(u32)); | ^~~~~~ drivers/gpu/drm/xe/xe_gt_types.h:102:25: note: at offset [1144, 265324] into destination object 'tile' of size 8 I found that rewriting the assignment using pointer addition rather than the equivalent array index calculation prevents the warning, so use that instead. I sent a bug report against gcc for the false positive warning. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113214 Signed-off-by: Arnd Bergmann Signed-off-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240103114819.2913937-1-arnd@kernel.org (cherry picked from commit 774ef5dfc95578a9079426d5106076dcd59c4dfa) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 9c2fe1697d6e..73f08f1924df 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -335,7 +335,7 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len) return -EPROTO; asid = FIELD_GET(PFD_ASID, msg[1]); - pf_queue = >->usm.pf_queue[asid % NUM_PF_QUEUE]; + pf_queue = gt->usm.pf_queue + (asid % NUM_PF_QUEUE); spin_lock_irqsave(&pf_queue->lock, flags); full = pf_queue_full(pf_queue); From 3aa3c5c249086ffc920e8f6d6a15bdd441153d45 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 1 Feb 2024 19:34:40 -0800 Subject: [PATCH 225/270] drm/xe: Map both mem.kernel_bb_pool and usm.bb_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For integrated devices we need to map both mem.kernel_bb_pool and usm.bb_pool to be able to run batches from both pools. Fixes: a682b6a42d4d ("drm/xe: Support device page faults on integrated platforms") Tested-by: Brian Welty Signed-off-by: Matthew Brost Reviewed-by: Brian Welty Link: https://patchwork.freedesktop.org/patch/msgid/20240202033440.2351862-1-matthew.brost@intel.com (cherry picked from commit 72f86ed3c88933d6fa09b036de93621ea71097a7) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt.c | 5 ++++- drivers/gpu/drm/xe/xe_migrate.c | 23 ++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 3af2adec1295..35474ddbaf97 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -437,7 +437,10 @@ static int all_fw_domain_init(struct xe_gt *gt) * USM has its only SA pool to non-block behind user operations */ if (gt_to_xe(gt)->info.has_usm) { - gt->usm.bb_pool = xe_sa_bo_manager_init(gt_to_tile(gt), SZ_1M, 16); + struct xe_device *xe = gt_to_xe(gt); + + gt->usm.bb_pool = xe_sa_bo_manager_init(gt_to_tile(gt), + IS_DGFX(xe) ? SZ_1M : SZ_512K, 16); if (IS_ERR(gt->usm.bb_pool)) { err = PTR_ERR(gt->usm.bb_pool); goto err_force_wake; diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 0cc31837ef26..70480c305602 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -170,11 +170,6 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, if (!IS_DGFX(xe)) { /* Write out batch too */ m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE; - if (xe->info.has_usm) { - batch = tile->primary_gt->usm.bb_pool->bo; - m->usm_batch_base_ofs = m->batch_base_ofs; - } - for (i = 0; i < batch->size; i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : XE_PAGE_SIZE) { @@ -185,6 +180,24 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, entry); level++; } + if (xe->info.has_usm) { + xe_tile_assert(tile, batch->size == SZ_1M); + + batch = tile->primary_gt->usm.bb_pool->bo; + m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M; + xe_tile_assert(tile, batch->size == SZ_512K); + + for (i = 0; i < batch->size; + i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : + XE_PAGE_SIZE) { + entry = vm->pt_ops->pte_encode_bo(batch, i, + pat_index, 0); + + xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, + entry); + level++; + } + } } else { u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); From 11572b3f68d9933fef5c1afef4c20041701d8025 Mon Sep 17 00:00:00 2001 From: Xiaoming Wang Date: Fri, 2 Feb 2024 13:56:58 -0800 Subject: [PATCH 226/270] drm/xe/display: Fix memleak in display initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit intel_power_domains_init is called twice in xe_device_probe: 1) intel_power_domains_init() xe_display_init_nommio() xe_device_probe() 2) intel_power_domains_init() intel_display_driver_probe_noirq() xe_display_init_noirq() xe_device_probe() It needs remove one to avoid power_domains->power_wells double malloc. unreferenced object 0xffff88811150ee00 (size 512): comm "systemd-udevd", pid 506, jiffies 4294674198 (age 3605.560s) hex dump (first 32 bytes): 10 b4 9d a0 ff ff ff ff ff ff ff ff ff ff ff ff ................ ff ff ff ff ff ff ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [] __kmem_cache_alloc_node+0x1c1/0x2b0 [] __kmalloc+0x52/0x150 [] __set_power_wells+0xc3/0x360 [xe] [] xe_display_init_nommio+0x4c/0x70 [xe] [] xe_device_probe+0x3c/0x5a0 [xe] [] xe_pci_probe+0x33f/0x5a0 [xe] [] local_pci_probe+0x47/0xa0 [] pci_device_probe+0xc3/0x1f0 [] really_probe+0x1a2/0x410 [] __driver_probe_device+0x78/0x160 [] driver_probe_device+0x1e/0x90 [] __driver_attach+0xda/0x1d0 [] bus_for_each_dev+0x7c/0xd0 [] bus_add_driver+0x119/0x220 [] driver_register+0x60/0x120 [] 0xffffffffa05e50a0 The call to intel_power_domains_cleanup() needs to stay where it is for now. The main issue is that while the init is called by the display side, shared by i915 and xe, the cleanup is called by a non-shared code path. Fixing that will be done as a separate commit. Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Signed-off-by: Xiaoming Wang [ reword commit message and explain why the fini needs to stay where it is ] Reviewed-by: Lucas De Marchi Signed-off-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240202215658.561298-1-lucas.demarchi@intel.com (cherry picked from commit 86c99abb5f1b6fcd69fb268eeb2e34cb7c4f355c) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_display.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_display.c b/drivers/gpu/drm/xe/xe_display.c index 74391d9b11ae..e4db069f0db3 100644 --- a/drivers/gpu/drm/xe/xe_display.c +++ b/drivers/gpu/drm/xe/xe_display.c @@ -134,8 +134,6 @@ static void xe_display_fini_nommio(struct drm_device *dev, void *dummy) int xe_display_init_nommio(struct xe_device *xe) { - int err; - if (!xe->info.enable_display) return 0; @@ -145,10 +143,6 @@ int xe_display_init_nommio(struct xe_device *xe) /* This must be called before any calls to HAS_PCH_* */ intel_detect_pch(xe); - err = intel_power_domains_init(xe); - if (err) - return err; - return drmm_add_action_or_reset(&xe->drm, xe_display_fini_nommio, xe); } From 95c058c8ef1d5d9e39ab2039a5eea4d5b93f4117 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 5 Feb 2024 15:17:14 -0800 Subject: [PATCH 227/270] drm/xe: Assume large page size if VMA not yet bound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The calculation to determine max page size of a VMA during a REMAP operations assumes the VMA has been bound. This assumption is not true if the VMA is from an eariler operation in an array of binds. If a VMA has not been bound use the maximum page size which will ensure the previous / next REMAP operations are not incorrectly skipped. Fixes: 8f33b4f054fc ("drm/xe: Avoid doing rebinds") Signed-off-by: Matthew Brost Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240205231714.2956225-1-matthew.brost@intel.com (cherry picked from commit 5ad6af5c91e9b942c44b657122270d935db3a813) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index ca11f8381115..99f151d73154 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2200,8 +2200,10 @@ static u64 xe_vma_max_pte_size(struct xe_vma *vma) return SZ_1G; else if (vma->gpuva.flags & XE_VMA_PTE_2M) return SZ_2M; + else if (vma->gpuva.flags & XE_VMA_PTE_4K) + return SZ_4K; - return SZ_4K; + return SZ_1G; /* Uninitialized, used max size */ } static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size) From 9e3fc1d65d4e8cf302e289847ab165ad9358fdb2 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 2 Feb 2024 17:14:36 +0000 Subject: [PATCH 228/270] drm/xe/vm: don't ignore error when in_kthread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If GUP fails and we are in_kthread, we can have pinned = 0 and ret = 0. If that happens we call sg_alloc_append_table_from_pages() with n_pages = 0, which is not well behaved and can trigger: kernel BUG at include/linux/scatterlist.h:115! depending on if the pages array happens to be zeroed or not. Even if we don't hit that it crashes later when trying to dma_map the returned table. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240202171435.427630-2-matthew.auld@intel.com (cherry picked from commit 8087199cd5951c1eba26003b3e4296dbb2110adf) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 99f151d73154..a9df89413155 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -114,11 +114,8 @@ retry: num_pages - pinned, read_only ? 0 : FOLL_WRITE, &pages[pinned]); - if (ret < 0) { - if (in_kthread) - ret = 0; + if (ret < 0) break; - } pinned += ret; ret = 0; From bf4c27b8267d7848bb81fd41e6aa07aa662f07fb Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 5 Feb 2024 20:50:10 -0800 Subject: [PATCH 229/270] drm/xe: Remove TEST_VM_ASYNC_OPS_ERROR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TEST_VM_ASYNC_OPS_ERROR is broken and unused. Remove for now and will pull back in a later time when it is used, fixed, and properly hidden behind a Kconfig option. Also fixup the supported flags value. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20240206045010.2981051-1-matthew.brost@intel.com (cherry picked from commit d9890c028d66a9e1ee3cccaa081ab5aedcbfe431) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 28 +--------------------------- drivers/gpu/drm/xe/xe_vm_types.h | 8 -------- 2 files changed, 1 insertion(+), 35 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index a9df89413155..865e10d0a06a 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -37,8 +37,6 @@ #include "generated/xe_wa_oob.h" #include "xe_wa.h" -#define TEST_VM_ASYNC_OPS_ERROR - static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm) { return vm->gpuvm.r_obj; @@ -2062,7 +2060,6 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo, struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL; struct drm_gpuva_ops *ops; struct drm_gpuva_op *__op; - struct xe_vma_op *op; struct drm_gpuvm_bo *vm_bo; int err; @@ -2109,15 +2106,6 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo, if (IS_ERR(ops)) return ops; -#ifdef TEST_VM_ASYNC_OPS_ERROR - if (operation & FORCE_ASYNC_OP_ERROR) { - op = list_first_entry_or_null(&ops->list, struct xe_vma_op, - base.entry); - if (op) - op->inject_error = true; - } -#endif - drm_gpuva_for_each_op(__op, ops) { struct xe_vma_op *op = gpuva_op_to_vma_op(__op); @@ -2560,13 +2548,6 @@ static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op) lockdep_assert_held_write(&vm->lock); -#ifdef TEST_VM_ASYNC_OPS_ERROR - if (op->inject_error) { - op->inject_error = false; - return -ENOMEM; - } -#endif - switch (op->base.op) { case DRM_GPUVA_OP_MAP: ret = __xe_vma_op_execute(vm, op->map.vma, op); @@ -2726,16 +2707,9 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm, return 0; } -#ifdef TEST_VM_ASYNC_OPS_ERROR -#define SUPPORTED_FLAGS \ - (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \ - DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff) -#else #define SUPPORTED_FLAGS \ (DRM_XE_VM_BIND_FLAG_READONLY | \ - DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \ - 0xffff) -#endif + DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL) #define XE_64K_PAGE_MASK 0xffffull #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP) diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 1fec66ae2eb2..5ac9c5bebabc 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -21,9 +21,6 @@ struct xe_bo; struct xe_sync_entry; struct xe_vm; -#define TEST_VM_ASYNC_OPS_ERROR -#define FORCE_ASYNC_OP_ERROR BIT(31) - #define XE_VMA_READ_ONLY DRM_GPUVA_USERBITS #define XE_VMA_DESTROYED (DRM_GPUVA_USERBITS << 1) #define XE_VMA_ATOMIC_PTE_BIT (DRM_GPUVA_USERBITS << 2) @@ -360,11 +357,6 @@ struct xe_vma_op { /** @flags: operation flags */ enum xe_vma_op_flags flags; -#ifdef TEST_VM_ASYNC_OPS_ERROR - /** @inject_error: inject error to test async op error handling */ - bool inject_error; -#endif - union { /** @map: VMA map operation specific data */ struct xe_vma_op_map map; From ab0beafd52b98dfb8b8244b2c6794efbc87478db Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 2 Feb 2024 10:09:34 +0100 Subject: [PATCH 230/270] netfilter: nft_set_pipapo: remove static in nft_pipapo_get() This has slipped through when reducing memory footprint for set elements, remove it. Fixes: 9dad402b89e8 ("netfilter: nf_tables: expose opaque set element as struct nft_elem_priv") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index efd523496be4..f24ecdaa1c1e 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -603,7 +603,7 @@ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - static struct nft_pipapo_elem *e; + struct nft_pipapo_elem *e; e = pipapo_get(net, set, (const u8 *)elem->key.val.data, nft_genmask_cur(net)); From 2526dffc6d65cffa32b88556bd68e4e72e889a55 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 5 Feb 2024 11:22:29 +0100 Subject: [PATCH 231/270] gpio: remove GPIO device from the list unconditionally in error path Since commit 48e1b4d369cf ("gpiolib: remove the GPIO device from the list when it's unregistered") we remove the GPIO device entry from the global list (used to order devices by their GPIO ranges) when unregistering the chip, not when releasing the device. It will not happen when the last reference is put anymore. This means, we need to remove it in error path in gpiochip_add_data_with_key() unconditionally, without checking if the device's .release() callback is set. Fixes: 48e1b4d369cf ("gpiolib: remove the GPIO device from the list when it's unregistered") Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 44c8f5743a24..8b3a0f45b574 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1005,15 +1005,15 @@ err_remove_of_chip: err_free_gpiochip_mask: gpiochip_remove_pin_ranges(gc); gpiochip_free_valid_mask(gc); +err_remove_from_list: + spin_lock_irqsave(&gpio_lock, flags); + list_del(&gdev->list); + spin_unlock_irqrestore(&gpio_lock, flags); if (gdev->dev.release) { /* release() has been registered by gpiochip_setup_dev() */ gpio_device_put(gdev); goto err_print_message; } -err_remove_from_list: - spin_lock_irqsave(&gpio_lock, flags); - list_del(&gdev->list); - spin_unlock_irqrestore(&gpio_lock, flags); err_free_label: kfree_const(gdev->label); err_free_descs: From 9def04e759caa5a3d741891037ae99f81e2fff01 Mon Sep 17 00:00:00 2001 From: Sinthu Raja Date: Tue, 6 Feb 2024 06:29:27 +0530 Subject: [PATCH 232/270] net: ethernet: ti: cpsw_new: enable mac_managed_pm to fix mdio The below commit introduced a WARN when phy state is not in the states: PHY_HALTED, PHY_READY and PHY_UP. commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") When cpsw_new resumes, there have port in PHY_NOLINK state, so the below warning comes out. Set mac_managed_pm be true to tell mdio that the phy resume/suspend is managed by the mac, to fix the following warning: WARNING: CPU: 0 PID: 965 at drivers/net/phy/phy_device.c:326 mdio_bus_phy_resume+0x140/0x144 CPU: 0 PID: 965 Comm: sh Tainted: G O 6.1.46-g247b2535b2 #1 Hardware name: Generic AM33XX (Flattened Device Tree) unwind_backtrace from show_stack+0x18/0x1c show_stack from dump_stack_lvl+0x24/0x2c dump_stack_lvl from __warn+0x84/0x15c __warn from warn_slowpath_fmt+0x1a8/0x1c8 warn_slowpath_fmt from mdio_bus_phy_resume+0x140/0x144 mdio_bus_phy_resume from dpm_run_callback+0x3c/0x140 dpm_run_callback from device_resume+0xb8/0x2b8 device_resume from dpm_resume+0x144/0x314 dpm_resume from dpm_resume_end+0x14/0x20 dpm_resume_end from suspend_devices_and_enter+0xd0/0x924 suspend_devices_and_enter from pm_suspend+0x2e0/0x33c pm_suspend from state_store+0x74/0xd0 state_store from kernfs_fop_write_iter+0x104/0x1ec kernfs_fop_write_iter from vfs_write+0x1b8/0x358 vfs_write from ksys_write+0x78/0xf8 ksys_write from ret_fast_syscall+0x0/0x54 Exception stack(0xe094dfa8 to 0xe094dff0) dfa0: 00000004 005c3fb8 00000001 005c3fb8 00000004 00000001 dfc0: 00000004 005c3fb8 b6f6bba0 00000004 00000004 0059edb8 00000000 00000000 dfe0: 00000004 bed918f0 b6f09bd3 b6e89a66 Cc: # v6.0+ Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Fixes: fba863b81604 ("net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM") Signed-off-by: Sinthu Raja Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/cpsw_new.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 498c50c6d1a7..087dcb67505a 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -773,6 +773,9 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) slave->slave_num); return; } + + phy->mac_managed_pm = true; + slave->phy = phy; phy_attached_info(slave->phy); From bc4ce46b1e3d1da4309405cd4afc7c0fcddd0b90 Mon Sep 17 00:00:00 2001 From: Sinthu Raja Date: Tue, 6 Feb 2024 06:29:28 +0530 Subject: [PATCH 233/270] net: ethernet: ti: cpsw: enable mac_managed_pm to fix mdio The below commit introduced a WARN when phy state is not in the states: PHY_HALTED, PHY_READY and PHY_UP. commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") When cpsw resumes, there have port in PHY_NOLINK state, so the below warning comes out. Set mac_managed_pm be true to tell mdio that the phy resume/suspend is managed by the mac, to fix the following warning: WARNING: CPU: 0 PID: 965 at drivers/net/phy/phy_device.c:326 mdio_bus_phy_resume+0x140/0x144 CPU: 0 PID: 965 Comm: sh Tainted: G O 6.1.46-g247b2535b2 #1 Hardware name: Generic AM33XX (Flattened Device Tree) unwind_backtrace from show_stack+0x18/0x1c show_stack from dump_stack_lvl+0x24/0x2c dump_stack_lvl from __warn+0x84/0x15c __warn from warn_slowpath_fmt+0x1a8/0x1c8 warn_slowpath_fmt from mdio_bus_phy_resume+0x140/0x144 mdio_bus_phy_resume from dpm_run_callback+0x3c/0x140 dpm_run_callback from device_resume+0xb8/0x2b8 device_resume from dpm_resume+0x144/0x314 dpm_resume from dpm_resume_end+0x14/0x20 dpm_resume_end from suspend_devices_and_enter+0xd0/0x924 suspend_devices_and_enter from pm_suspend+0x2e0/0x33c pm_suspend from state_store+0x74/0xd0 state_store from kernfs_fop_write_iter+0x104/0x1ec kernfs_fop_write_iter from vfs_write+0x1b8/0x358 vfs_write from ksys_write+0x78/0xf8 ksys_write from ret_fast_syscall+0x0/0x54 Exception stack(0xe094dfa8 to 0xe094dff0) dfa0: 00000004 005c3fb8 00000001 005c3fb8 00000004 00000001 dfc0: 00000004 005c3fb8 b6f6bba0 00000004 00000004 0059edb8 00000000 00000000 dfe0: 00000004 bed918f0 b6f09bd3 b6e89a66 Cc: # v6.0+ Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Fixes: fba863b81604 ("net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM") Signed-off-by: Sinthu Raja Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/cpsw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index ea85c6dd5484..c0a5abd8d9a8 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -631,6 +631,8 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) } } + phy->mac_managed_pm = true; + slave->phy = phy; phy_attached_info(slave->phy); From db010ff6700f24f96e91ed56ca29cb69335008ef Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Tue, 6 Feb 2024 08:10:00 +0530 Subject: [PATCH 234/270] octeontx2-af: Initialize maps. kmalloc_array() without __GFP_ZERO flag does not initialize memory to zero. This causes issues. Use kcalloc() for maps and bitmap_zalloc() for bitmaps. Fixes: dd7842878633 ("octeontx2-af: Add new devlink param to configure maximum usable NIX block LFs") Signed-off-by: Ratheesh Kannoth Reviewed-by: Brett Creeley Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240206024000.1070260-1-rkannoth@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 167145bdcb75..8cfd74ad991c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -1850,8 +1850,8 @@ void npc_mcam_rsrcs_deinit(struct rvu *rvu) { struct npc_mcam *mcam = &rvu->hw->mcam; - kfree(mcam->bmap); - kfree(mcam->bmap_reverse); + bitmap_free(mcam->bmap); + bitmap_free(mcam->bmap_reverse); kfree(mcam->entry2pfvf_map); kfree(mcam->cntr2pfvf_map); kfree(mcam->entry2cntr_map); @@ -1904,21 +1904,20 @@ int npc_mcam_rsrcs_init(struct rvu *rvu, int blkaddr) mcam->pf_offset = mcam->nixlf_offset + nixlf_count; /* Allocate bitmaps for managing MCAM entries */ - mcam->bmap = kmalloc_array(BITS_TO_LONGS(mcam->bmap_entries), - sizeof(long), GFP_KERNEL); + mcam->bmap = bitmap_zalloc(mcam->bmap_entries, GFP_KERNEL); if (!mcam->bmap) return -ENOMEM; - mcam->bmap_reverse = kmalloc_array(BITS_TO_LONGS(mcam->bmap_entries), - sizeof(long), GFP_KERNEL); + mcam->bmap_reverse = bitmap_zalloc(mcam->bmap_entries, GFP_KERNEL); if (!mcam->bmap_reverse) goto free_bmap; mcam->bmap_fcnt = mcam->bmap_entries; /* Alloc memory for saving entry to RVU PFFUNC allocation mapping */ - mcam->entry2pfvf_map = kmalloc_array(mcam->bmap_entries, - sizeof(u16), GFP_KERNEL); + mcam->entry2pfvf_map = kcalloc(mcam->bmap_entries, sizeof(u16), + GFP_KERNEL); + if (!mcam->entry2pfvf_map) goto free_bmap_reverse; @@ -1941,21 +1940,21 @@ int npc_mcam_rsrcs_init(struct rvu *rvu, int blkaddr) if (err) goto free_entry_map; - mcam->cntr2pfvf_map = kmalloc_array(mcam->counters.max, - sizeof(u16), GFP_KERNEL); + mcam->cntr2pfvf_map = kcalloc(mcam->counters.max, sizeof(u16), + GFP_KERNEL); if (!mcam->cntr2pfvf_map) goto free_cntr_bmap; /* Alloc memory for MCAM entry to counter mapping and for tracking * counter's reference count. */ - mcam->entry2cntr_map = kmalloc_array(mcam->bmap_entries, - sizeof(u16), GFP_KERNEL); + mcam->entry2cntr_map = kcalloc(mcam->bmap_entries, sizeof(u16), + GFP_KERNEL); if (!mcam->entry2cntr_map) goto free_cntr_map; - mcam->cntr_refcnt = kmalloc_array(mcam->counters.max, - sizeof(u16), GFP_KERNEL); + mcam->cntr_refcnt = kcalloc(mcam->counters.max, sizeof(u16), + GFP_KERNEL); if (!mcam->cntr_refcnt) goto free_entry_cntr_map; @@ -1988,9 +1987,9 @@ free_cntr_bmap: free_entry_map: kfree(mcam->entry2pfvf_map); free_bmap_reverse: - kfree(mcam->bmap_reverse); + bitmap_free(mcam->bmap_reverse); free_bmap: - kfree(mcam->bmap); + bitmap_free(mcam->bmap); return -ENOMEM; } From 27c5a095e2518975e20a10102908ae8231699879 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sun, 4 Feb 2024 16:26:42 +0100 Subject: [PATCH 235/270] netfilter: ipset: Missing gc cancellations fixed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch fdb8e12cc2cc ("netfilter: ipset: fix performance regression in swap operation") missed to add the calls to gc cancellations at the error path of create operations and at module unload. Also, because the half of the destroy operations now executed by a function registered by call_rcu(), neither NFNL_SUBSYS_IPSET mutex or rcu read lock is held and therefore the checking of them results false warnings. Fixes: 97f7cf1cd80e ("netfilter: ipset: fix performance regression in swap operation") Reported-by: syzbot+52bbc0ad036f6f0d4a25@syzkaller.appspotmail.com Reported-by: Brad Spengler Reported-by: Стас Ничипорович Tested-by: Brad Spengler Tested-by: Стас Ничипорович Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 2 ++ net/netfilter/ipset/ip_set_hash_gen.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bcaad9c009fe..3184cc6be4c9 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1154,6 +1154,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, return ret; cleanup: + set->variant->cancel_gc(set); set->variant->destroy(set); put_out: module_put(set->type->me); @@ -2378,6 +2379,7 @@ ip_set_net_exit(struct net *net) set = ip_set(inst, i); if (set) { ip_set(inst, i) = NULL; + set->variant->cancel_gc(set); ip_set_destroy_set(set); } } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1136510521a8..cf3ce72c3de6 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -432,7 +432,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference(hbucket(t, i)); + n = (__force struct hbucket *)hbucket(t, i); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -452,7 +452,7 @@ mtype_destroy(struct ip_set *set) struct htype *h = set->data; struct list_head *l, *lt; - mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + mtype_ahash_destroy(set, (__force struct htable *)h->table, true); list_for_each_safe(l, lt, &h->ad) { list_del(l); kfree(l); From 2fe8a236436fe40d8d26a1af8d150fc80f04ee1a Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Tue, 6 Feb 2024 09:58:49 +0100 Subject: [PATCH 236/270] s390/qeth: Fix potential loss of L3-IP@ in case of network issues Symptom: In case of a bad cable connection (e.g. dirty optics) a fast sequence of network DOWN-UP-DOWN-UP could happen. UP triggers recovery of the qeth interface. In case of a second DOWN while recovery is still ongoing, it can happen that the IP@ of a Layer3 qeth interface is lost and will not be recovered by the second UP. Problem: When registration of IP addresses with Layer 3 qeth devices fails, (e.g. because of bad address format) the respective IP address is deleted from its hash-table in the driver. If registration fails because of a ENETDOWN condition, the address should stay in the hashtable, so a subsequent recovery can restore it. 3caa4af834df ("qeth: keep ip-address after LAN_OFFLINE failure") fixes this for registration failures during normal operation, but not during recovery. Solution: Keep L3-IP address in case of ENETDOWN in qeth_l3_recover_ip(). For consistency with qeth_l3_add_ip() we also keep it in case of EADDRINUSE, i.e. for some reason the card already/still has this address registered. Fixes: 4a71df50047f ("qeth: new qeth device driver") Cc: stable@vger.kernel.org Signed-off-by: Alexandra Winter Link: https://lore.kernel.org/r/20240206085849.2902775-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/qeth_l3_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index b92a32b4b114..04c64ce0a1ca 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -255,9 +255,10 @@ static void qeth_l3_clear_ip_htable(struct qeth_card *card, int recover) if (!recover) { hash_del(&addr->hnode); kfree(addr); - continue; + } else { + /* prepare for recovery */ + addr->disp_flag = QETH_DISP_ADDR_ADD; } - addr->disp_flag = QETH_DISP_ADDR_ADD; } mutex_unlock(&card->ip_lock); @@ -278,9 +279,11 @@ static void qeth_l3_recover_ip(struct qeth_card *card) if (addr->disp_flag == QETH_DISP_ADDR_ADD) { rc = qeth_l3_register_addr_entry(card, addr); - if (!rc) { + if (!rc || rc == -EADDRINUSE || rc == -ENETDOWN) { + /* keep it in the records */ addr->disp_flag = QETH_DISP_ADDR_DO_NOTHING; } else { + /* bad address */ hash_del(&addr->hnode); kfree(addr); } From fa173a1b4e3fd1ab5451cbc57de6fc624c824b0a Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Mon, 5 Feb 2024 09:59:59 +0000 Subject: [PATCH 237/270] netfilter: ctnetlink: fix filtering for zone 0 previously filtering for the default zone would actually skip the zone filter and flush all zones. Fixes: eff3c558bb7e ("netfilter: ctnetlink: support filtering by zone") Reported-by: Ilya Maximets Closes: https://lore.kernel.org/netdev/2032238f-31ac-4106-8f22-522e76df5a12@ovn.org/ Signed-off-by: Felix Huettner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 12 ++++-- .../netfilter/conntrack_dump_flush.c | 43 ++++++++++++++++++- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 0c22a02c2035..3b846cbdc050 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -876,6 +876,7 @@ struct ctnetlink_filter_u32 { struct ctnetlink_filter { u8 family; + bool zone_filter; u_int32_t orig_flags; u_int32_t reply_flags; @@ -992,9 +993,12 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) if (err) goto err_filter; - err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); - if (err < 0) - goto err_filter; + if (cda[CTA_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + goto err_filter; + filter->zone_filter = true; + } if (!cda[CTA_FILTER]) return filter; @@ -1148,7 +1152,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; - if (filter->zone.id != NF_CT_DEFAULT_ZONE_ID && + if (filter->zone_filter && !nf_ct_zone_equal_any(ct, &filter->zone)) goto ignore_entry; diff --git a/tools/testing/selftests/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/netfilter/conntrack_dump_flush.c index f18c6db13bbf..b11ea8ee6719 100644 --- a/tools/testing/selftests/netfilter/conntrack_dump_flush.c +++ b/tools/testing/selftests/netfilter/conntrack_dump_flush.c @@ -13,7 +13,7 @@ #include "../kselftest_harness.h" #define TEST_ZONE_ID 123 -#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) +#define NF_CT_DEFAULT_ZONE_ID 0 static int reply_counter; @@ -336,6 +336,9 @@ FIXTURE_SETUP(conntrack_dump_flush) ret = conntrack_data_generate_v4(self->sock, 0xf4f4f4f4, 0xf5f5f5f5, TEST_ZONE_ID + 2); EXPECT_EQ(ret, 0); + ret = conntrack_data_generate_v4(self->sock, 0xf6f6f6f6, 0xf7f7f7f7, + NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); src = (struct in6_addr) {{ .__u6_addr32 = { @@ -395,6 +398,26 @@ FIXTURE_SETUP(conntrack_dump_flush) TEST_ZONE_ID + 2); EXPECT_EQ(ret, 0); + src = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x07000000 + } + }}; + dst = (struct in6_addr) {{ + .__u6_addr32 = { + 0xb80d0120, + 0x00000000, + 0x00000000, + 0x08000000 + } + }}; + ret = conntrack_data_generate_v6(self->sock, src, dst, + NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); EXPECT_GE(ret, 2); if (ret > 2) @@ -425,6 +448,24 @@ TEST_F(conntrack_dump_flush, test_flush_by_zone) EXPECT_EQ(ret, 2); ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 2); +} + +TEST_F(conntrack_dump_flush, test_flush_by_zone_default) +{ + int ret; + + ret = conntrack_flush_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 1); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, TEST_ZONE_ID + 2); + EXPECT_EQ(ret, 2); + ret = conntracK_count_zone(self->sock, NF_CT_DEFAULT_ZONE_ID); + EXPECT_EQ(ret, 0); } TEST_HARNESS_MAIN From 38ed1c7062ada30d7c11e7a7acc749bf27aa14aa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 5 Feb 2024 14:59:24 +0100 Subject: [PATCH 238/270] netfilter: nft_ct: reject direction for ct id Direction attribute is ignored, reject it in case this ever needs to be supported Fixes: 3087c3f7c23b ("netfilter: nft_ct: Add ct id support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index aac98a3c966e..bfd3e5a14dab 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -476,6 +476,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; #endif case NFT_CT_ID: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); break; default: From 7395dfacfff65e9938ac0889dafa1ab01e987d15 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Feb 2024 00:11:40 +0100 Subject: [PATCH 239/270] netfilter: nf_tables: use timestamp to check for set element timeout Add a timestamp field at the beginning of the transaction, store it in the nftables per-netns area. Update set backend .insert, .deactivate and sync gc path to use the timestamp, this avoids that an element expires while control plane transaction is still unfinished. .lookup and .update, which are used from packet path, still use the current time to check if the element has expired. And .get path and dump also since this runs lockless under rcu read size lock. Then, there is async gc which also needs to check the current time since it runs asynchronously from a workqueue. Fixes: c3e1b005ed1c ("netfilter: nf_tables: add set element timeout support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 16 ++++++++++++++-- net/netfilter/nf_tables_api.c | 4 +++- net/netfilter/nft_set_hash.c | 8 +++++++- net/netfilter/nft_set_pipapo.c | 18 +++++++++++------- net/netfilter/nft_set_rbtree.c | 11 +++++++---- 5 files changed, 42 insertions(+), 15 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 001226c34621..510244cc0f8f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -808,10 +808,16 @@ static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ex return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } -static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) +static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, + u64 tstamp) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); + time_after_eq64(tstamp, *nft_set_ext_expiration(ext)); +} + +static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) +{ + return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, @@ -1779,6 +1785,7 @@ struct nftables_pernet { struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; + u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; @@ -1791,6 +1798,11 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net) return net_generic(net, nf_tables_net_id); } +static inline u64 nft_net_tstamp(const struct net *net) +{ + return nft_pernet(net)->tstamp; +} + #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fc016befb46f..f8e3f70c35bd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9827,6 +9827,7 @@ dead_elem: struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { struct nft_set_elem_catchall *catchall, *next; + u64 tstamp = nft_net_tstamp(gc->net); const struct nft_set *set = gc->set; struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; @@ -9836,7 +9837,7 @@ struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_expired(ext)) + if (!__nft_set_elem_expired(ext, tstamp)) continue; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); @@ -10622,6 +10623,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) bool genid_ok; mutex_lock(&nft_net->commit_mutex); + nft_net->tstamp = get_jiffies_64(); genid_ok = genid == 0 || nft_net->base_seq == genid; if (!genid_ok) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6c2061bfdae6..6968a3b34236 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -36,6 +36,7 @@ struct nft_rhash_cmp_arg { const struct nft_set *set; const u32 *key; u8 genmask; + u64 tstamp; }; static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed) @@ -62,7 +63,7 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, return 1; if (nft_set_elem_is_dead(&he->ext)) return 1; - if (nft_set_elem_expired(&he->ext)) + if (__nft_set_elem_expired(&he->ext, x->tstamp)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) return 1; @@ -87,6 +88,7 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); @@ -106,6 +108,7 @@ nft_rhash_get(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = elem->key.val.data, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); @@ -131,6 +134,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, .genmask = NFT_GENMASK_ANY, .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); @@ -175,6 +179,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; struct nft_rhash_elem *prev; @@ -216,6 +221,7 @@ nft_rhash_deactivate(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; rcu_read_lock(); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index f24ecdaa1c1e..b17c18203416 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -504,6 +504,7 @@ out: * @set: nftables API set representation * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask + * @tstamp: timestamp to check for expired elements * * This is essentially the same as the lookup function, except that it matches * key data against the uncommitted copy and doesn't use preallocated maps for @@ -513,7 +514,8 @@ out: */ static struct nft_pipapo_elem *pipapo_get(const struct net *net, const struct nft_set *set, - const u8 *data, u8 genmask) + const u8 *data, u8 genmask, + u64 tstamp) { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); struct nft_pipapo *priv = nft_set_priv(set); @@ -566,7 +568,7 @@ next_match: goto out; if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext)) + if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp)) goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) @@ -606,7 +608,7 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set, struct nft_pipapo_elem *e; e = pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + nft_genmask_cur(net), get_jiffies_64()); if (IS_ERR(e)) return ERR_CAST(e); @@ -1173,6 +1175,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, struct nft_pipapo_match *m = priv->clone; u8 genmask = nft_genmask_next(net); struct nft_pipapo_elem *e, *dup; + u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_field *f; const u8 *start_p, *end_p; int i, bsize_max, err = 0; @@ -1182,7 +1185,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else end = start; - dup = pipapo_get(net, set, start, genmask); + dup = pipapo_get(net, set, start, genmask, tstamp); if (!IS_ERR(dup)) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1204,7 +1207,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (PTR_ERR(dup) == -ENOENT) { /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net)); + dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp); } if (PTR_ERR(dup) != -ENOENT) { @@ -1560,6 +1563,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); + u64 tstamp = nft_net_tstamp(net); int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; struct nft_trans_gc *gc; @@ -1594,7 +1598,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) /* synchronous gc never fails, there is no need to set on * NFT_SET_ELEM_DEAD_BIT. */ - if (nft_set_elem_expired(&e->ext)) { + if (__nft_set_elem_expired(&e->ext, tstamp)) { priv->dirty = true; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); @@ -1769,7 +1773,7 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, { struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net)); + e = pipapo_get(net, set, data, nft_genmask_next(net), nft_net_tstamp(net)); if (IS_ERR(e)) return NULL; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index baa3fea4fe65..5fd74f993988 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -313,6 +313,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); + u64 tstamp = nft_net_tstamp(net); int d; /* Descend the tree to search for an existing element greater than the @@ -360,7 +361,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, /* perform garbage collection to avoid bogus overlap reports * but skip new elements in this transaction. */ - if (nft_set_elem_expired(&rbe->ext) && + if (__nft_set_elem_expired(&rbe->ext, tstamp) && nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; @@ -551,6 +552,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; u8 genmask = nft_genmask_next(net); + u64 tstamp = nft_net_tstamp(net); int d; while (parent != NULL) { @@ -571,7 +573,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; - } else if (nft_set_elem_expired(&rbe->ext)) { + } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) { break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; @@ -624,9 +626,10 @@ static void nft_rbtree_gc(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct net *net = read_pnet(&set->net); + u64 tstamp = nft_net_tstamp(net); struct rb_node *node, *next; struct nft_trans_gc *gc; - struct net *net; set = nft_set_container_of(priv); net = read_pnet(&set->net); @@ -648,7 +651,7 @@ static void nft_rbtree_gc(struct nft_set *set) rbe_end = rbe; continue; } - if (!nft_set_elem_expired(&rbe->ext)) + if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); From f82777e8ce6c039cdcacbcf1eb8619b99a20c06d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Feb 2024 17:54:18 +0100 Subject: [PATCH 240/270] netfilter: nfnetlink_queue: un-break NF_REPEAT Only override userspace verdict if the ct hook returns something other than ACCEPT. Else, this replaces NF_REPEAT (run all hooks again) with NF_ACCEPT (move to next hook). Fixes: 6291b3a67ad5 ("netfilter: conntrack: convert nf_conntrack_update to netfilter verdicts") Reported-by: l.6diay@passmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 171d1f52d3dd..5cf38fc0a366 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -232,18 +232,25 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { + unsigned int ct_verdict = verdict; + rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) - verdict = ct_hook->update(entry->state.net, entry->skb); + ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); - switch (verdict & NF_VERDICT_MASK) { + switch (ct_verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + /* follow userspace verdict, could be REPEAT */ + break; case NF_STOLEN: nf_queue_entry_free(entry); return; + default: + verdict = ct_verdict & NF_VERDICT_MASK; + break; } - } nf_reinject(entry, verdict); } From 60c0c230c6f046da536d3df8b39a20b9a9fd6af0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Feb 2024 18:49:51 +0100 Subject: [PATCH 241/270] netfilter: nft_set_rbtree: skip end interval element from gc rbtree lazy gc on insert might collect an end interval element that has been just added in this transactions, skip end interval elements that are not yet active. Fixes: f718863aca46 ("netfilter: nft_set_rbtree: fix overlap expiration walk") Cc: stable@vger.kernel.org Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 5fd74f993988..9944fe479e53 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -234,7 +234,7 @@ static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, static const struct nft_rbtree_elem * nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, u8 genmask) + struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -253,7 +253,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev) && - nft_set_elem_active(&rbe_prev->ext, genmask)) + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); @@ -365,7 +365,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; - removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + removed_end = nft_rbtree_gc_elem(set, priv, rbe); if (IS_ERR(removed_end)) return PTR_ERR(removed_end); From 76313d1a4aa9e30d5b43dee5efd8bcd4d8250006 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Feb 2024 21:52:46 +0100 Subject: [PATCH 242/270] netfilter: nft_set_pipapo: store index in scratch maps Pipapo needs a scratchpad area to keep state during matching. This state can be large and thus cannot reside on stack. Each set preallocates percpu areas for this. On each match stage, one scratchpad half starts with all-zero and the other is inited to all-ones. At the end of each stage, the half that starts with all-ones is always zero. Before next field is tested, pointers to the two halves are swapped, i.e. resmap pointer turns into fill pointer and vice versa. After the last field has been processed, pipapo stashes the index toggle in a percpu variable, with assumption that next packet will start with the all-zero half and sets all bits in the other to 1. This isn't reliable. There can be multiple sets and we can't be sure that the upper and lower half of all set scratch map is always in sync (lookups can be conditional), so one set might have swapped, but other might not have been queried. Thus we need to keep the index per-set-and-cpu, just like the scratchpad. Note that this bug fix is incomplete, there is a related issue. avx2 and normal implementation might use slightly different areas of the map array space due to the avx2 alignment requirements, so m->scratch (generic/fallback implementation) and ->scratch_aligned (avx) may partially overlap. scratch and scratch_aligned are not distinct objects, the latter is just the aligned address of the former. After this change, write to scratch_align->map_index may write to scratch->map, so this issue becomes more prominent, we can set to 1 a bit in the supposedly-all-zero area of scratch->map[]. A followup patch will remove the scratch_aligned and makes generic and avx code use the same (aligned) area. Its done in a separate change to ease review. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 41 ++++++++++++++++++----------- net/netfilter/nft_set_pipapo.h | 14 ++++++++-- net/netfilter/nft_set_pipapo_avx2.c | 15 +++++------ 3 files changed, 44 insertions(+), 26 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index b17c18203416..54d0bac140a3 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -342,9 +342,6 @@ #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); - /** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits @@ -412,6 +409,7 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_scratch *scratch; unsigned long *res_map, *fill_map; u8 genmask = nft_genmask_cur(net); const u8 *rp = (const u8 *)key; @@ -422,15 +420,17 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, local_bh_disable(); - map_index = raw_cpu_read(nft_pipapo_scratch_index); - m = rcu_dereference(priv->match); if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) goto out; - res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); - fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + scratch = *raw_cpu_ptr(m->scratch); + + map_index = scratch->map_index; + + res_map = scratch->map + (map_index ? m->bsize_max : 0); + fill_map = scratch->map + (map_index ? 0 : m->bsize_max); memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); @@ -460,7 +460,7 @@ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) { - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; local_bh_enable(); return false; @@ -477,7 +477,7 @@ next_match: * current inactive bitmap is clean and can be reused as * *next* bitmap (not initial) for the next packet. */ - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; local_bh_enable(); return true; @@ -1123,12 +1123,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, int i; for_each_possible_cpu(i) { - unsigned long *scratch; + struct nft_pipapo_scratch *scratch; #ifdef NFT_PIPAPO_ALIGN - unsigned long *scratch_aligned; + void *scratch_aligned; #endif - - scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + + scratch = kzalloc_node(struct_size(scratch, map, + bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL, cpu_to_node(i)); if (!scratch) { @@ -1147,7 +1147,16 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, *per_cpu_ptr(clone->scratch, i) = scratch; #ifdef NFT_PIPAPO_ALIGN - scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); + /* Align &scratch->map (not the struct itself): the extra + * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() + * above guarantee we can waste up to those bytes in order + * to align the map field regardless of its offset within + * the struct. + */ + BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); + + scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); + scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; #endif } @@ -2136,7 +2145,7 @@ static int nft_pipapo_init(const struct nft_set *set, m->field_count = field_count; m->bsize_max = 0; - m->scratch = alloc_percpu(unsigned long *); + m->scratch = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch) { err = -ENOMEM; goto out_scratch; @@ -2145,7 +2154,7 @@ static int nft_pipapo_init(const struct nft_set *set, *per_cpu_ptr(m->scratch, i) = NULL; #ifdef NFT_PIPAPO_ALIGN - m->scratch_aligned = alloc_percpu(unsigned long *); + m->scratch_aligned = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch_aligned) { err = -ENOMEM; goto out_free; diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 1040223da5fa..d3bc1551694f 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -130,6 +130,16 @@ struct nft_pipapo_field { union nft_pipapo_map_bucket *mt; }; +/** + * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @map_index: Current working bitmap index, toggled between field matches + * @map: store partial matching results during lookup + */ +struct nft_pipapo_scratch { + u8 map_index; + unsigned long map[]; +}; + /** * struct nft_pipapo_match - Data used for lookup and matching * @field_count Amount of fields in set @@ -142,9 +152,9 @@ struct nft_pipapo_field { struct nft_pipapo_match { int field_count; #ifdef NFT_PIPAPO_ALIGN - unsigned long * __percpu *scratch_aligned; + struct nft_pipapo_scratch * __percpu *scratch_aligned; #endif - unsigned long * __percpu *scratch; + struct nft_pipapo_scratch * __percpu *scratch; size_t bsize_max; struct rcu_head rcu; struct nft_pipapo_field f[] __counted_by(field_count); diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 52e0d026d30a..78213c73af2e 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -71,9 +71,6 @@ #define NFT_PIPAPO_AVX2_ZERO(reg) \ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); - /** * nft_pipapo_avx2_prepare() - Prepare before main algorithm body * @@ -1120,11 +1117,12 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); - unsigned long *res, *fill, *scratch; + struct nft_pipapo_scratch *scratch; u8 genmask = nft_genmask_cur(net); const u8 *rp = (const u8 *)key; struct nft_pipapo_match *m; struct nft_pipapo_field *f; + unsigned long *res, *fill; bool map_index; int i, ret = 0; @@ -1146,10 +1144,11 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, kernel_fpu_end(); return false; } - map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index); - res = scratch + (map_index ? m->bsize_max : 0); - fill = scratch + (map_index ? 0 : m->bsize_max); + map_index = scratch->map_index; + + res = scratch->map + (map_index ? m->bsize_max : 0); + fill = scratch->map + (map_index ? 0 : m->bsize_max); /* Starting map doesn't need to be set for this implementation */ @@ -1221,7 +1220,7 @@ next_match: out: if (i % 2) - raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index); + scratch->map_index = !map_index; kernel_fpu_end(); return ret >= 0; From 47b1c03c3c1a119435480a1e73f27197dc59131d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 7 Feb 2024 21:52:47 +0100 Subject: [PATCH 243/270] netfilter: nft_set_pipapo: add helper to release pcpu scratch area After next patch simple kfree() is not enough anymore, so add a helper for it. Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 54d0bac140a3..5094d4c439c3 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1110,6 +1110,24 @@ static void pipapo_map(struct nft_pipapo_match *m, f->mt[map[i].to + j].e = e; } +/** + * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address + * @m: Matching data + * @cpu: CPU number + */ +static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) +{ + struct nft_pipapo_scratch *s; + void *mem; + + s = *per_cpu_ptr(m->scratch, cpu); + if (!s) + return; + + mem = s; + kfree(mem); +} + /** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions @@ -1142,7 +1160,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return -ENOMEM; } - kfree(*per_cpu_ptr(clone->scratch, i)); + pipapo_free_scratch(clone, i); *per_cpu_ptr(clone->scratch, i) = scratch; @@ -1369,7 +1387,7 @@ out_lt: } out_scratch_realloc: for_each_possible_cpu(i) - kfree(*per_cpu_ptr(new->scratch, i)); + pipapo_free_scratch(new, i); #ifdef NFT_PIPAPO_ALIGN free_percpu(new->scratch_aligned); #endif @@ -1653,7 +1671,7 @@ static void pipapo_free_match(struct nft_pipapo_match *m) int i; for_each_possible_cpu(i) - kfree(*per_cpu_ptr(m->scratch, i)); + pipapo_free_scratch(m, i); #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); @@ -2253,7 +2271,7 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, free_percpu(m->scratch_aligned); #endif for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(m->scratch, cpu)); + pipapo_free_scratch(m, cpu); free_percpu(m->scratch); pipapo_free_fields(m); kfree(m); @@ -2270,7 +2288,7 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, free_percpu(priv->clone->scratch_aligned); #endif for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); + pipapo_free_scratch(priv->clone, cpu); free_percpu(priv->clone->scratch); pipapo_free_fields(priv->clone); From 5a8cdf6fd860ac5e6d08d72edbcecee049a7fec4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 8 Feb 2024 10:31:29 +0100 Subject: [PATCH 244/270] netfilter: nft_set_pipapo: remove scratch_aligned pointer use ->scratch for both avx2 and the generic implementation. After previous change the scratch->map member is always aligned properly for AVX2, so we can just use scratch->map in AVX2 too. The alignoff delta is stored in the scratchpad so we can reconstruct the correct address to free the area again. Fixes: 7400b063969b ("nft_set_pipapo: Introduce AVX2-based lookup implementation") Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 41 +++++------------------------ net/netfilter/nft_set_pipapo.h | 6 ++--- net/netfilter/nft_set_pipapo_avx2.c | 2 +- 3 files changed, 10 insertions(+), 39 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 5094d4c439c3..aa1d9e93a9a0 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1125,6 +1125,7 @@ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int c return; mem = s; + mem -= s->align_off; kfree(mem); } @@ -1144,6 +1145,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, struct nft_pipapo_scratch *scratch; #ifdef NFT_PIPAPO_ALIGN void *scratch_aligned; + u32 align_off; #endif scratch = kzalloc_node(struct_size(scratch, map, bsize_max * 2) + @@ -1162,8 +1164,6 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, pipapo_free_scratch(clone, i); - *per_cpu_ptr(clone->scratch, i) = scratch; - #ifdef NFT_PIPAPO_ALIGN /* Align &scratch->map (not the struct itself): the extra * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() @@ -1175,8 +1175,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); - *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; + align_off = scratch_aligned - (void *)scratch; + + scratch = scratch_aligned; + scratch->align_off = align_off; #endif + *per_cpu_ptr(clone->scratch, i) = scratch; } return 0; @@ -1331,11 +1335,6 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new->scratch) goto out_scratch; -#ifdef NFT_PIPAPO_ALIGN - new->scratch_aligned = alloc_percpu(*new->scratch_aligned); - if (!new->scratch_aligned) - goto out_scratch; -#endif for_each_possible_cpu(i) *per_cpu_ptr(new->scratch, i) = NULL; @@ -1388,9 +1387,6 @@ out_lt: out_scratch_realloc: for_each_possible_cpu(i) pipapo_free_scratch(new, i); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(new->scratch_aligned); -#endif out_scratch: free_percpu(new->scratch); kfree(new); @@ -1673,11 +1669,7 @@ static void pipapo_free_match(struct nft_pipapo_match *m) for_each_possible_cpu(i) pipapo_free_scratch(m, i); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif free_percpu(m->scratch); - pipapo_free_fields(m); kfree(m); @@ -2171,16 +2163,6 @@ static int nft_pipapo_init(const struct nft_set *set, for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; -#ifdef NFT_PIPAPO_ALIGN - m->scratch_aligned = alloc_percpu(struct nft_pipapo_scratch *); - if (!m->scratch_aligned) { - err = -ENOMEM; - goto out_free; - } - for_each_possible_cpu(i) - *per_cpu_ptr(m->scratch_aligned, i) = NULL; -#endif - rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { @@ -2211,9 +2193,6 @@ static int nft_pipapo_init(const struct nft_set *set, return 0; out_free: -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif free_percpu(m->scratch); out_scratch: kfree(m); @@ -2267,9 +2246,6 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, nft_set_pipapo_match_destroy(ctx, set, m); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif for_each_possible_cpu(cpu) pipapo_free_scratch(m, cpu); free_percpu(m->scratch); @@ -2284,9 +2260,6 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, if (priv->dirty) nft_set_pipapo_match_destroy(ctx, set, m); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(priv->clone->scratch_aligned); -#endif for_each_possible_cpu(cpu) pipapo_free_scratch(priv->clone, cpu); free_percpu(priv->clone->scratch); diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index d3bc1551694f..f59a0cd81105 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -133,10 +133,12 @@ struct nft_pipapo_field { /** * struct nft_pipapo_scratch - percpu data used for lookup and matching * @map_index: Current working bitmap index, toggled between field matches + * @align_off: Offset to get the originally allocated address * @map: store partial matching results during lookup */ struct nft_pipapo_scratch { u8 map_index; + u32 align_off; unsigned long map[]; }; @@ -144,16 +146,12 @@ struct nft_pipapo_scratch { * struct nft_pipapo_match - Data used for lookup and matching * @field_count Amount of fields in set * @scratch: Preallocated per-CPU maps for partial matching results - * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes * @bsize_max: Maximum lookup table bucket size of all fields, in longs * @rcu Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { int field_count; -#ifdef NFT_PIPAPO_ALIGN - struct nft_pipapo_scratch * __percpu *scratch_aligned; -#endif struct nft_pipapo_scratch * __percpu *scratch; size_t bsize_max; struct rcu_head rcu; diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 78213c73af2e..90e275bb3e5d 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1139,7 +1139,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, */ kernel_fpu_begin_mask(0); - scratch = *raw_cpu_ptr(m->scratch_aligned); + scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) { kernel_fpu_end(); return false; From 8c427cc2fa73684ea140999e121b7b6c1c717632 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 24 Jan 2024 00:02:34 +0900 Subject: [PATCH 245/270] tracing/probes: Fix to show a parse error for bad type for $comm Fix to show a parse error for bad type (non-string) for $comm/$COMM and immediate-string. With this fix, error_log file shows appropriate error message as below. /sys/kernel/tracing # echo 'p vfs_read $comm:u32' >> kprobe_events sh: write error: Invalid argument /sys/kernel/tracing # echo 'p vfs_read \"hoge":u32' >> kprobe_events sh: write error: Invalid argument /sys/kernel/tracing # cat error_log [ 30.144183] trace_kprobe: error: $comm and immediate-string only accepts string type Command: p vfs_read $comm:u32 ^ [ 62.618500] trace_kprobe: error: $comm and immediate-string only accepts string type Command: p vfs_read \"hoge":u32 ^ Link: https://lore.kernel.org/all/170602215411.215583.2238016352271091852.stgit@devnote2/ Fixes: 3dd1f7f24f8c ("tracing: probeevent: Fix to make the type of $comm string") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 7 +++++-- kernel/trace/trace_probe.h | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4dc74d73fc1d..c6da5923e5b9 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { - /* The type of $comm must be "string", and not an array. */ - if (parg->count || (t && strcmp(t, "string"))) + /* The type of $comm must be "string", and not an array type. */ + if (parg->count || (t && strcmp(t, "string"))) { + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + NEED_STRING_TYPE); goto out; + } parg->type = find_fetch_type("string", ctx->flags); } else parg->type = find_fetch_type(t, ctx->flags); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 850d9ecb6765..c1877d018269 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -515,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \ C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ - C(BAD_TYPE4STR, "This type does not fit for string."), + C(BAD_TYPE4STR, "This type does not fit for string."),\ + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), #undef C #define C(a, b) TP_ERR_##a From 9a571c1e275cedacd48c66a6bddd0c23f1dffdbf Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 24 Jan 2024 00:03:02 +0900 Subject: [PATCH 246/270] tracing/probes: Fix to set arg size and fmt after setting type from BTF Since the BTF type setting updates probe_arg::type, the type size calculation and setting print-fmt should be done after that. Without this fix, the argument size and print-fmt can be wrong. Link: https://lore.kernel.org/all/170602218196.215583.6417859469540955777.stgit@devnote2/ Fixes: b576e09701c7 ("tracing/probes: Support function parameters if BTF is available") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c6da5923e5b9..34289f9c6707 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1172,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } - parg->offset = *size; - *size += parg->type->size * (parg->count ?: 1); - - ret = -ENOMEM; - if (parg->count) { - len = strlen(parg->type->fmttype) + 6; - parg->fmt = kmalloc(len, GFP_KERNEL); - if (!parg->fmt) - goto out; - snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, - parg->count); - } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) @@ -1207,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto fail; } } + parg->offset = *size; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) { + ret = -ENOMEM; + goto out; + } + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } ret = -EINVAL; /* Store operation */ From 9efd24ec5599ed485b7c4d9aeb731141f6285167 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 19 Sep 2023 09:28:23 +0800 Subject: [PATCH 247/270] kprobes: Remove unnecessary initial values of variables ri and sym is assigned first, so it does not need to initialize the assignment. Link: https://lore.kernel.org/all/20230919012823.7815-1-zeming@nfschina.com/ Signed-off-by: Li zeming Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d5a0ee40bf66..9d9095e81792 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1993,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr); unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, struct llist_node **cur) { - struct kretprobe_instance *ri = NULL; + struct kretprobe_instance *ri; kprobe_opcode_t *ret; if (WARN_ON_ONCE(!cur)) @@ -2802,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; - const char *sym = NULL; + const char *sym; unsigned int i = *(loff_t *) v; unsigned long offset = 0; char *modname, namebuf[KSYM_NAME_LEN]; From 45be0882c5f91e1b92e645001dd1a53b3bd58c97 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 5 Feb 2024 14:43:17 -0600 Subject: [PATCH 248/270] smb3: add missing null server pointer check Address static checker warning in cifs_ses_get_chan_index(): warn: variable dereferenced before check 'server' To be consistent, and reduce risk, we should add another check for null server pointer. Fixes: 88675b22d34e ("cifs: do not search for channel if server is terminating") Reported-by: Dan Carpenter Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index ed4bd88dd528..476d54fceb50 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -76,7 +76,7 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, unsigned int i; /* if the channel is waiting for termination */ - if (server->terminate) + if (server && server->terminate) return CIFS_INVAL_CHAN_INDEX; for (i = 0; i < ses->chan_count; i++) { From 55c7788c37242702868bfac7861cdf0c358d6c3d Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 2 Feb 2024 12:38:24 -0300 Subject: [PATCH 249/270] smb: client: set correct d_type for reparse points under DFS mounts Send query dir requests with an info level of SMB_FIND_FILE_FULL_DIRECTORY_INFO rather than SMB_FIND_FILE_DIRECTORY_INFO when the client is generating its own inode numbers (e.g. noserverino) so that reparse tags still can be parsed directly from the responses, but server won't send UniqueId (server inode number) Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/readdir.c | 15 ++++++++------- fs/smb/client/smb2pdu.c | 6 ++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 3b1b01d10f7d..b520eea7bfce 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -307,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, } static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, - SEARCH_ID_FULL_DIR_INFO *info, + const void *info, struct cifs_sb_info *cifs_sb) { + const FILE_FULL_DIRECTORY_INFO *di = info; + __dir_info_to_fattr(fattr, info); - /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */ + /* See MS-FSCC 2.4.14, 2.4.19 */ if (fattr->cf_cifsattrs & ATTR_REPARSE) - fattr->cf_cifstag = le32_to_cpu(info->EaSize); + fattr->cf_cifstag = le32_to_cpu(di->EaSize); cifs_fill_common_info(fattr, cifs_sb); } @@ -396,7 +398,7 @@ ffirst_retry: } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; } else /* not srvinos - BB fixme add check for backlevel? */ { - cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; } search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME; @@ -987,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file, (FIND_FILE_STANDARD_INFO *)find_entry, cifs_sb); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: case SMB_FIND_FILE_ID_FULL_DIR_INFO: - cifs_fulldir_info_to_fattr(&fattr, - (SEARCH_ID_FULL_DIR_INFO *)find_entry, - cifs_sb); + cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb); break; default: cifs_dir_info_to_fattr(&fattr, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c58fa44dd6b0..4085ce27fd38 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -5206,6 +5206,9 @@ int SMB2_query_directory_init(const unsigned int xid, case SMB_FIND_FILE_POSIX_INFO: req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO; break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION; + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", info_level); @@ -5275,6 +5278,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, /* note that posix payload are variable size */ info_buf_size = sizeof(struct smb2_posix_info); break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO); + break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", srch_inf->info_level); From 2a427b49d02995ea4a6ff93a1432c40fa4d36821 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Nov 2023 12:25:56 -1000 Subject: [PATCH 250/270] blk-iocost: Fix an UBSAN shift-out-of-bounds warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When iocg_kick_delay() is called from a CPU different than the one which set the delay, @now may be in the past of @iocg->delay_at leading to the following warning: UBSAN: shift-out-of-bounds in block/blk-iocost.c:1359:23 shift exponent 18446744073709 is too large for 64-bit type 'u64' (aka 'unsigned long long') ... Call Trace: dump_stack_lvl+0x79/0xc0 __ubsan_handle_shift_out_of_bounds+0x2ab/0x300 iocg_kick_delay+0x222/0x230 ioc_rqos_merge+0x1d7/0x2c0 __rq_qos_merge+0x2c/0x80 bio_attempt_back_merge+0x83/0x190 blk_attempt_plug_merge+0x101/0x150 blk_mq_submit_bio+0x2b1/0x720 submit_bio_noacct_nocheck+0x320/0x3e0 __swap_writepage+0x2ab/0x9d0 The underflow itself doesn't really affect the behavior in any meaningful way; however, the past timestamp may exaggerate the delay amount calculated later in the code, which shouldn't be a material problem given the nature of the delay mechanism. If @now is in the past, this CPU is racing another CPU which recently set up the delay and there's nothing this CPU can contribute w.r.t. the delay. Let's bail early from iocg_kick_delay() in such cases. Reported-by: Breno Leitão Signed-off-by: Tejun Heo Fixes: 5160a5a53c0c ("blk-iocost: implement delay adjustment hysteresis") Link: https://lore.kernel.org/r/ZVvc9L_CYk5LO1fT@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c8beec6d7df0..04d44f0bcbc8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1353,6 +1353,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) lockdep_assert_held(&iocg->waitq.lock); + /* + * If the delay is set by another CPU, we may be in the past. No need to + * change anything if so. This avoids decay calculation underflow. + */ + if (time_before64(now->now, iocg->delay_at)) + return false; + /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; if (iocg->delay) From 4ce6e2db00de8103a0687fb0f65fd17124a51aaa Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Mon, 29 Jan 2024 16:52:50 +0800 Subject: [PATCH 251/270] virtio-blk: Ensure no requests in virtqueues before deleting vqs. Ensure no remaining requests in virtqueues before resetting vdev and deleting virtqueues. Otherwise these requests will never be completed. It may cause the system to become unresponsive. Function blk_mq_quiesce_queue() can ensure that requests have become in_flight status, but it cannot guarantee that requests have been processed by the device. Virtqueues should never be deleted before all requests become complete status. Function blk_mq_freeze_queue() ensure that all requests in virtqueues become complete status. And no requests can enter in virtqueues. Signed-off-by: Yi Sun Reviewed-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20240129085250.1550594-1-yi.sun@unisoc.com Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5bf98fd6a651..2bf14a0e2815 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1593,14 +1593,15 @@ static int virtblk_freeze(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; + /* Ensure no requests in virtqueues before deleting vqs. */ + blk_mq_freeze_queue(vblk->disk->queue); + /* Ensure we don't receive any more interrupts */ virtio_reset_device(vdev); /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); - blk_mq_quiesce_queue(vblk->disk->queue); - vdev->config->del_vqs(vdev); kfree(vblk->vqs); @@ -1618,7 +1619,7 @@ static int virtblk_restore(struct virtio_device *vdev) virtio_device_ready(vdev); - blk_mq_unquiesce_queue(vblk->disk->queue); + blk_mq_unfreeze_queue(vblk->disk->queue); return 0; } #endif From 17c8e9ac95d81d10e9619a845a68b83c9384be64 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 31 Jan 2024 13:05:13 +0100 Subject: [PATCH 252/270] RISC-V: paravirt: steal_time should be static steal_time is not used outside paravirt.c, make it static, as sparse suggested. Fixes: fdf68acccfc6 ("RISC-V: paravirt: Implement steal-time support") Signed-off-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kernel/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c index 8e114f5930ce..15c5d4048db5 100644 --- a/arch/riscv/kernel/paravirt.c +++ b/arch/riscv/kernel/paravirt.c @@ -41,7 +41,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -DEFINE_PER_CPU(struct sbi_sta_struct, steal_time) __aligned(64); +static DEFINE_PER_CPU(struct sbi_sta_struct, steal_time) __aligned(64); static bool __init has_pv_steal_clock(void) { From 3752219b6007ee0421cc228f442f33b31f85addd Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 31 Jan 2024 13:05:14 +0100 Subject: [PATCH 253/270] RISC-V: paravirt: Use correct restricted types __le32 and __le64 types should be used with le32_to_cpu() and le64_to_cpu(), as sparse helpfully points out. Fixes: fdf68acccfc6 ("RISC-V: paravirt: Implement steal-time support") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401011933.hL9zqmKo-lkp@intel.com/ Signed-off-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kernel/paravirt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c index 15c5d4048db5..0d6225fd3194 100644 --- a/arch/riscv/kernel/paravirt.c +++ b/arch/riscv/kernel/paravirt.c @@ -91,8 +91,8 @@ static int pv_time_cpu_down_prepare(unsigned int cpu) static u64 pv_time_steal_clock(int cpu) { struct sbi_sta_struct *st = per_cpu_ptr(&steal_time, cpu); - u32 sequence; - u64 steal; + __le32 sequence; + __le64 steal; /* * Check the sequence field before and after reading the steal From f072b272aa27d57cf7fe6fdedb30fb50f391974e Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 31 Jan 2024 13:05:15 +0100 Subject: [PATCH 254/270] RISC-V: KVM: Use correct restricted types __le32 and __le64 types should be used with le32_to_cpu() and le64_to_cpu() and __user is needed for pointers referencing guest memory, as sparse helpfully points out. Fixes: e9f12b5fff8a ("RISC-V: KVM: Implement SBI STA extension") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401020142.lwFEDK5v-lkp@intel.com/ Signed-off-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_sta.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index 01f09fe8c3b0..d8cf9ca28c61 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -26,8 +26,12 @@ void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) { gpa_t shmem = vcpu->arch.sta.shmem; u64 last_steal = vcpu->arch.sta.last_steal; - u32 *sequence_ptr, sequence; - u64 *steal_ptr, steal; + __le32 __user *sequence_ptr; + __le64 __user *steal_ptr; + __le32 sequence_le; + __le64 steal_le; + u32 sequence; + u64 steal; unsigned long hva; gfn_t gfn; @@ -47,22 +51,22 @@ void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) return; } - sequence_ptr = (u32 *)(hva + offset_in_page(shmem) + + sequence_ptr = (__le32 __user *)(hva + offset_in_page(shmem) + offsetof(struct sbi_sta_struct, sequence)); - steal_ptr = (u64 *)(hva + offset_in_page(shmem) + + steal_ptr = (__le64 __user *)(hva + offset_in_page(shmem) + offsetof(struct sbi_sta_struct, steal)); - if (WARN_ON(get_user(sequence, sequence_ptr))) + if (WARN_ON(get_user(sequence_le, sequence_ptr))) return; - sequence = le32_to_cpu(sequence); + sequence = le32_to_cpu(sequence_le); sequence += 1; if (WARN_ON(put_user(cpu_to_le32(sequence), sequence_ptr))) return; - if (!WARN_ON(get_user(steal, steal_ptr))) { - steal = le64_to_cpu(steal); + if (!WARN_ON(get_user(steal_le, steal_ptr))) { + steal = le64_to_cpu(steal_le); vcpu->arch.sta.last_steal = READ_ONCE(current->sched_info.run_delay); steal += vcpu->arch.sta.last_steal - last_steal; WARN_ON(put_user(cpu_to_le64(steal), steal_ptr)); From a8b9cf62ade1bf17261a979fc97e40c2d7842353 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 10 Jan 2024 09:13:06 +0900 Subject: [PATCH 255/270] ftrace: Fix DIRECT_CALLS to use SAVE_REGS by default The commit 60c8971899f3 ("ftrace: Make DIRECT_CALLS work WITH_ARGS and !WITH_REGS") changed DIRECT_CALLS to use SAVE_ARGS when there are multiple ftrace_ops at the same function, but since the x86 only support to jump to direct_call from ftrace_regs_caller, when we set the function tracer on the same target function on x86, ftrace-direct does not work as below (this actually works on arm64.) At first, insmod ftrace-direct.ko to put a direct_call on 'wake_up_process()'. # insmod kernel/samples/ftrace/ftrace-direct.ko # less trace ... -0 [006] ..s1. 564.686958: my_direct_func: waking up rcu_preempt-17 -0 [007] ..s1. 564.687836: my_direct_func: waking up kcompactd0-63 -0 [006] ..s1. 564.690926: my_direct_func: waking up rcu_preempt-17 -0 [006] ..s1. 564.696872: my_direct_func: waking up rcu_preempt-17 -0 [007] ..s1. 565.191982: my_direct_func: waking up kcompactd0-63 Setup a function filter to the 'wake_up_process' too, and enable it. # cd /sys/kernel/tracing/ # echo wake_up_process > set_ftrace_filter # echo function > current_tracer # less trace ... -0 [006] ..s3. 686.180972: wake_up_process <-call_timer_fn -0 [006] ..s3. 686.186919: wake_up_process <-call_timer_fn -0 [002] ..s3. 686.264049: wake_up_process <-call_timer_fn -0 [002] d.h6. 686.515216: wake_up_process <-kick_pool -0 [002] d.h6. 686.691386: wake_up_process <-kick_pool Then, only function tracer is shown on x86. But if you enable 'kprobe on ftrace' event (which uses SAVE_REGS flag) on the same function, it is shown again. # echo 'p wake_up_process' >> dynamic_events # echo 1 > events/kprobes/p_wake_up_process_0/enable # echo > trace # less trace ... -0 [006] ..s2. 2710.345919: p_wake_up_process_0: (wake_up_process+0x4/0x20) -0 [006] ..s3. 2710.345923: wake_up_process <-call_timer_fn -0 [006] ..s1. 2710.345928: my_direct_func: waking up rcu_preempt-17 -0 [006] ..s2. 2710.349931: p_wake_up_process_0: (wake_up_process+0x4/0x20) -0 [006] ..s3. 2710.349934: wake_up_process <-call_timer_fn -0 [006] ..s1. 2710.349937: my_direct_func: waking up rcu_preempt-17 To fix this issue, use SAVE_REGS flag for multiple ftrace_ops flag of direct_call by default. Link: https://lore.kernel.org/linux-trace-kernel/170484558617.178953.1590516949390270842.stgit@devnote2 Fixes: 60c8971899f3 ("ftrace: Make DIRECT_CALLS work WITH_ARGS and !WITH_REGS") Cc: stable@vger.kernel.org Cc: Florent Revest Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Mark Rutland Tested-by: Mark Rutland [arm64] Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b01ae7d36021..c060d5b47910 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5325,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs); static int register_ftrace_function_nolock(struct ftrace_ops *ops); +/* + * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct + * call will be jumped from ftrace_regs_caller. Only if the architecture does + * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it + * jumps from ftrace_caller for multiple ftrace_ops. + */ +#ifndef HAVE_DYNAMIC_FTRACE_WITH_REGS #define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS) +#else +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) +#endif static int check_direct_multi(struct ftrace_ops *ops) { From 44dc5c41b5b1267d4dd037d26afc0c4d3a568acb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 9 Feb 2024 06:36:22 -0500 Subject: [PATCH 256/270] tracing: Fix wasted memory in saved_cmdlines logic While looking at improving the saved_cmdlines cache I found a huge amount of wasted memory that should be used for the cmdlines. The tracing data saves pids during the trace. At sched switch, if a trace occurred, it will save the comm of the task that did the trace. This is saved in a "cache" that maps pids to comms and exposed to user space via the /sys/kernel/tracing/saved_cmdlines file. Currently it only caches by default 128 comms. The structure that uses this creates an array to store the pids using PID_MAX_DEFAULT (which is usually set to 32768). This causes the structure to be of the size of 131104 bytes on 64 bit machines. In hex: 131104 = 0x20020, and since the kernel allocates generic memory in powers of two, the kernel would allocate 0x40000 or 262144 bytes to store this structure. That leaves 131040 bytes of wasted space. Worse, the structure points to an allocated array to store the comm names, which is 16 bytes times the amount of names to save (currently 128), which is 2048 bytes. Instead of allocating a separate array, make the structure end with a variable length string and use the extra space for that. This is similar to a recommendation that Linus had made about eventfs_inode names: https://lore.kernel.org/all/20240130190355.11486-5-torvalds@linux-foundation.org/ Instead of allocating a separate string array to hold the saved comms, have the structure end with: char saved_cmdlines[]; and round up to the next power of two over sizeof(struct saved_cmdline_buffers) + num_cmdlines * TASK_COMM_LEN It will use this extra space for the saved_cmdline portion. Now, instead of saving only 128 comms by default, by using this wasted space at the end of the structure it can save over 8000 comms and even saves space by removing the need for allocating the other array. Link: https://lore.kernel.org/linux-trace-kernel/20240209063622.1f7b6d5f@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Cc: Sven Schnelle Cc: Mete Durlu Fixes: 939c7a4f04fcd ("tracing: Introduce saved_cmdlines_size file") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 75 ++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a7c6fd934e9..9ff8a439d674 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2320,7 +2320,7 @@ struct saved_cmdlines_buffer { unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; - char *saved_cmdlines; + char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; @@ -2334,47 +2334,58 @@ static inline void set_cmdline(int idx, const char *cmdline) strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } -static int allocate_cmdlines_buffer(unsigned int val, - struct saved_cmdlines_buffer *s) +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kfree(s->map_cmdline_to_pid); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * TASK_COMM_LEN; + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / TASK_COMM_LEN; + s->cmdline_num = val; + s->map_cmdline_to_pid = kmalloc_array(val, sizeof(*s->map_cmdline_to_pid), GFP_KERNEL); - if (!s->map_cmdline_to_pid) - return -ENOMEM; - - s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); - if (!s->saved_cmdlines) { - kfree(s->map_cmdline_to_pid); - return -ENOMEM; + if (!s->map_cmdline_to_pid) { + free_saved_cmdlines_buffer(s); + return NULL; } s->cmdline_idx = 0; - s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); - return 0; + return s; } static int trace_create_savedcmd(void) { - int ret; + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); - if (!savedcmd) - return -ENOMEM; - - ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); - if (ret < 0) { - kfree(savedcmd); - savedcmd = NULL; - return -ENOMEM; - } - - return 0; + return savedcmd ? 0 : -ENOMEM; } int is_tracing_stopped(void) @@ -6056,26 +6067,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - kfree(s->saved_cmdlines); - kfree(s->map_cmdline_to_pid); - kfree(s); -} - static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; - if (allocate_cmdlines_buffer(val, s) < 0) { - kfree(s); - return -ENOMEM; - } - preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; From f6a1892585cd19e63c4ef2334e26cd536d5b678d Mon Sep 17 00:00:00 2001 From: Aleksander Mazur Date: Tue, 23 Jan 2024 14:43:00 +0100 Subject: [PATCH 257/270] x86/Kconfig: Transmeta Crusoe is CPU family 5, not 6 The kernel built with MCRUSOE is unbootable on Transmeta Crusoe. It shows the following error message: This kernel requires an i686 CPU, but only detected an i586 CPU. Unable to boot - please use a kernel appropriate for your CPU. Remove MCRUSOE from the condition introduced in commit in Fixes, effectively changing X86_MINIMUM_CPU_FAMILY back to 5 on that machine, which matches the CPU family given by CPUID. [ bp: Massage commit message. ] Fixes: 25d76ac88821 ("x86/Kconfig: Explicitly enumerate i686-class CPUs in Kconfig") Signed-off-by: Aleksander Mazur Signed-off-by: Borislav Petkov (AMD) Acked-by: H. Peter Anvin Cc: Link: https://lore.kernel.org/r/20240123134309.1117782-1-deweloper@wp.pl --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b9224cf2ee4d..2a7279d80460 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -379,7 +379,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) default "5" if X86_32 && X86_CMPXCHG64 default "4" From c6e02eefd6ace3da3369c764f15429f5647056af Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 6 Feb 2024 15:00:46 +0000 Subject: [PATCH 258/270] cifs: change tcon status when need_reconnect is set on it When a tcon is marked for need_reconnect, the intention is to have it reconnected. This change adjusts tcon->status in cifs_tree_connect when need_reconnect is set. Also, this change has a minor correction in resetting need_reconnect on success. It makes sure that it is done with tc_lock held. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 5 +++++ fs/smb/client/dfs.c | 7 ++++++- fs/smb/client/file.c | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index bfd568f89710..01c49b29c499 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4228,6 +4228,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index a8a1d386da65..449c59830039 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + + /* if tcon is marked for needing reconnect, update state */ + if (tcon->need_reconnect) + tcon->status = TID_NEED_TCON; + if (tcon->status == TID_GOOD) { spin_unlock(&tcon->tc_lock); return 0; @@ -625,8 +630,8 @@ out: spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) tcon->status = TID_GOOD; - spin_unlock(&tcon->tc_lock); tcon->need_reconnect = false; + spin_unlock(&tcon->tc_lock); } return rc; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b75282c204da..f391c9b803d8 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->need_reconnect) + tcon->status = TID_NEED_RECON; + if (tcon->status != TID_NEED_RECON) { spin_unlock(&tcon->tc_lock); return; From a39c757bf0596b17482a507f31c3ef0af0d1d2b4 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 6 Feb 2024 15:00:47 +0000 Subject: [PATCH 259/270] cifs: handle cases where multiple sessions share connection Based on our implementation of multichannel, it is entirely possible that a server struct may not be found in any channel of an SMB session. In such cases, we should be prepared to move on and search for the server struct in the next session. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 6 ++++++ fs/smb/client/sess.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 01c49b29c499..d03253f8f145 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { /* check if iface is still active */ spin_lock(&ses->chan_lock); + if (cifs_ses_get_chan_index(ses, server) == + CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + continue; + } + if (!cifs_chan_is_iface_active(ses, server)) { spin_unlock(&ses->chan_lock); cifs_chan_update_iface(ses, server); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 476d54fceb50..8f37373fd333 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -88,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, if (server) cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", server->conn_id); - WARN_ON(1); return CIFS_INVAL_CHAN_INDEX; } From a5cc98eba2592d6e3c5a4351319595ddde2a5901 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 6 Feb 2024 23:57:18 -0600 Subject: [PATCH 260/270] smb3: clarify mount warning When a user tries to use the "sec=krb5p" mount parameter to encrypt data on connection to a server (when authenticating with Kerberos), we indicate that it is not supported, but do not note the equivalent recommended mount parameter ("sec=krb5,seal") which turns on encryption for that mount (and uses Kerberos for auth). Update the warning message. Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 52cbef2eeb28..aec8dbd1f9db 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -211,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_errorf(fc, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; From 4356e9f841f7fbb945521cef3577ba394c65f3fc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 9 Feb 2024 12:39:31 -0800 Subject: [PATCH 261/270] work around gcc bugs with 'asm goto' with outputs We've had issues with gcc and 'asm goto' before, and we created a 'asm_volatile_goto()' macro for that in the past: see commits 3f0116c3238a ("compiler/gcc4: Add quirk for 'asm goto' miscompilation bug") and a9f180345f53 ("compiler/gcc4: Make quirk for asm_volatile_goto() unconditional"). Then, much later, we ended up removing the workaround in commit 43c249ea0b1e ("compiler-gcc.h: remove ancient workaround for gcc PR 58670") because we no longer supported building the kernel with the affected gcc versions, but we left the macro uses around. Now, Sean Christopherson reports a new version of a very similar problem, which is fixed by re-applying that ancient workaround. But the problem in question is limited to only the 'asm goto with outputs' cases, so instead of re-introducing the old workaround as-is, let's rename and limit the workaround to just that much less common case. It looks like there are at least two separate issues that all hit in this area: (a) some versions of gcc don't mark the asm goto as 'volatile' when it has outputs: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 which is easy to work around by just adding the 'volatile' by hand. (b) Internal compiler errors: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 which are worked around by adding the extra empty 'asm' as a barrier, as in the original workaround. but the problem Sean sees may be a third thing since it involves bad code generation (not an ICE) even with the manually added 'volatile'. but the same old workaround works for this case, even if this feels a bit like voodoo programming and may only be hiding the issue. Reported-and-tested-by: Sean Christopherson Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/ Cc: Nick Desaulniers Cc: Uros Bizjak Cc: Jakub Jelinek Cc: Andrew Pinski Signed-off-by: Linus Torvalds --- arch/arc/include/asm/jump_label.h | 4 ++-- arch/arm/include/asm/jump_label.h | 4 ++-- arch/arm64/include/asm/alternative-macros.h | 4 ++-- arch/arm64/include/asm/jump_label.h | 4 ++-- arch/csky/include/asm/jump_label.h | 4 ++-- arch/loongarch/include/asm/jump_label.h | 4 ++-- arch/mips/include/asm/jump_label.h | 4 ++-- arch/parisc/include/asm/jump_label.h | 4 ++-- arch/powerpc/include/asm/jump_label.h | 4 ++-- arch/powerpc/include/asm/uaccess.h | 12 ++++++------ arch/powerpc/kernel/irq_64.c | 2 +- arch/riscv/include/asm/arch_hweight.h | 4 ++-- arch/riscv/include/asm/bitops.h | 8 ++++---- arch/riscv/include/asm/checksum.h | 2 +- arch/riscv/include/asm/cpufeature.h | 4 ++-- arch/riscv/include/asm/jump_label.h | 4 ++-- arch/riscv/lib/csum.c | 10 +++++----- arch/s390/include/asm/jump_label.h | 4 ++-- arch/sparc/include/asm/jump_label.h | 4 ++-- arch/um/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +++--- arch/x86/include/asm/rmwcc.h | 2 +- arch/x86/include/asm/special_insns.h | 2 +- arch/x86/include/asm/uaccess.h | 10 +++++----- arch/x86/kvm/svm/svm_ops.h | 6 +++--- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/vmx/vmx_ops.h | 6 +++--- arch/xtensa/include/asm/jump_label.h | 4 ++-- include/linux/compiler-gcc.h | 19 +++++++++++++++++++ include/linux/compiler_types.h | 4 ++-- net/netfilter/nft_set_pipapo_avx2.c | 2 +- samples/bpf/asm_goto_workaround.h | 8 ++++---- tools/arch/x86/include/asm/rmwcc.h | 2 +- tools/include/linux/compiler_types.h | 4 ++-- 35 files changed, 96 insertions(+), 77 deletions(-) diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h index 9d9618079739..a339223d9e05 100644 --- a/arch/arc/include/asm/jump_label.h +++ b/arch/arc/include/asm/jump_label.h @@ -31,7 +31,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" + asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" "1: \n" "nop \n" ".pushsection __jump_table, \"aw\" \n" @@ -47,7 +47,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" + asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)" \n" "1: \n" "b %l[l_yes] \n" ".pushsection __jump_table, \"aw\" \n" diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h index e12d7d096fc0..e4eb54f6cd9f 100644 --- a/arch/arm/include/asm/jump_label.h +++ b/arch/arm/include/asm/jump_label.h @@ -11,7 +11,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" WASM(nop) "\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" @@ -25,7 +25,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" WASM(b) " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 210bb43cff2c..d328f549b1a6 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -229,7 +229,7 @@ alternative_has_cap_likely(const unsigned long cpucap) if (!cpucap_is_possible(cpucap)) return false; - asm_volatile_goto( + asm goto( ALTERNATIVE_CB("b %l[l_no]", %[cpucap], alt_cb_patch_nops) : : [cpucap] "i" (cpucap) @@ -247,7 +247,7 @@ alternative_has_cap_unlikely(const unsigned long cpucap) if (!cpucap_is_possible(cpucap)) return false; - asm_volatile_goto( + asm goto( ALTERNATIVE("nop", "b %l[l_yes]", %[cpucap]) : : [cpucap] "i" (cpucap) diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 48ddc0f45d22..6aafbb789991 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -18,7 +18,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: nop \n\t" " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" @@ -35,7 +35,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: b %l[l_yes] \n\t" " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" diff --git a/arch/csky/include/asm/jump_label.h b/arch/csky/include/asm/jump_label.h index 98a3f4b168bd..ef2e37a10a0f 100644 --- a/arch/csky/include/asm/jump_label.h +++ b/arch/csky/include/asm/jump_label.h @@ -12,7 +12,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto( + asm goto( "1: nop32 \n" " .pushsection __jump_table, \"aw\" \n" " .align 2 \n" @@ -29,7 +29,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto( + asm goto( "1: bsr32 %l[label] \n" " .pushsection __jump_table, \"aw\" \n" " .align 2 \n" diff --git a/arch/loongarch/include/asm/jump_label.h b/arch/loongarch/include/asm/jump_label.h index 3cea299a5ef5..29acfe3de3fa 100644 --- a/arch/loongarch/include/asm/jump_label.h +++ b/arch/loongarch/include/asm/jump_label.h @@ -22,7 +22,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: nop \n\t" JUMP_TABLE_ENTRY : : "i"(&((char *)key)[branch]) : : l_yes); @@ -35,7 +35,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( "1: b %l[l_yes] \n\t" JUMP_TABLE_ENTRY : : "i"(&((char *)key)[branch]) : : l_yes); diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 081be98c71ef..ff5d388502d4 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -39,7 +39,7 @@ extern void jump_label_apply_nops(struct module *mod); static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" B_INSN " 2f\n\t" + asm goto("1:\t" B_INSN " 2f\n\t" "2:\t.insn\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" @@ -53,7 +53,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\t" J_INSN " %l[l_yes]\n\t" + asm goto("1:\t" J_INSN " %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" ".popsection\n\t" diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h index 94428798b6aa..317ebc5edc9f 100644 --- a/arch/parisc/include/asm/jump_label.h +++ b/arch/parisc/include/asm/jump_label.h @@ -12,7 +12,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".align %1\n\t" @@ -29,7 +29,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b,n %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".align %1\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 93ce3ec25387..2f2a86ed2280 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop # arch_static_branch\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".long 1b - ., %l[l_yes] - .\n\t" @@ -32,7 +32,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b %l[l_yes] # arch_static_branch_jump\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".long 1b - ., %l[l_yes] - .\n\t" diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index f1f9890f50d3..de10437fd206 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -74,7 +74,7 @@ __pu_failed: \ /* -mprefixed can generate offsets beyond range, fall back hack */ #ifdef CONFIG_PPC_KERNEL_PREFIXED #define __put_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm goto( \ "1: " op " %0,0(%1) # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -83,7 +83,7 @@ __pu_failed: \ : label) #else #define __put_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -97,7 +97,7 @@ __pu_failed: \ __put_user_asm_goto(x, ptr, label, "std") #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ - asm_volatile_goto( \ + asm goto( \ "1: stw%X1 %0, %1\n" \ "2: stw%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ @@ -146,7 +146,7 @@ do { \ /* -mprefixed can generate offsets beyond range, fall back hack */ #ifdef CONFIG_PPC_KERNEL_PREFIXED #define __get_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: "op" %0,0(%1) # get_user\n" \ EX_TABLE(1b, %l2) \ : "=r" (x) \ @@ -155,7 +155,7 @@ do { \ : label) #else #define __get_user_asm_goto(x, addr, label, op) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: "op"%U1%X1 %0, %1 # get_user\n" \ EX_TABLE(1b, %l2) \ : "=r" (x) \ @@ -169,7 +169,7 @@ do { \ __get_user_asm_goto(x, addr, label, "ld") #else /* __powerpc64__ */ #define __get_user_asm2_goto(x, addr, label) \ - asm_volatile_goto( \ + asm_goto_output( \ "1: lwz%X1 %0, %1\n" \ "2: lwz%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c index 938e66829eae..d5c48d1b0a31 100644 --- a/arch/powerpc/kernel/irq_64.c +++ b/arch/powerpc/kernel/irq_64.c @@ -230,7 +230,7 @@ again: * This allows interrupts to be unmasked without hard disabling, and * also without new hard interrupts coming in ahead of pending ones. */ - asm_volatile_goto( + asm goto( "1: \n" " lbz 9,%0(13) \n" " cmpwi 9,0 \n" diff --git a/arch/riscv/include/asm/arch_hweight.h b/arch/riscv/include/asm/arch_hweight.h index c20236a0725b..85b2c443823e 100644 --- a/arch/riscv/include/asm/arch_hweight.h +++ b/arch/riscv/include/asm/arch_hweight.h @@ -20,7 +20,7 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { #ifdef CONFIG_RISCV_ISA_ZBB - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -51,7 +51,7 @@ static inline unsigned int __arch_hweight8(unsigned int w) static __always_inline unsigned long __arch_hweight64(__u64 w) { # ifdef CONFIG_RISCV_ISA_ZBB - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 9ffc35537024..329d8244a9b3 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -39,7 +39,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) { int num; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -95,7 +95,7 @@ static __always_inline unsigned long variable__fls(unsigned long word) { int num; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -154,7 +154,7 @@ static __always_inline int variable_ffs(int x) if (!x) return 0; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -209,7 +209,7 @@ static __always_inline int variable_fls(unsigned int x) if (!x) return 0; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h index a5b60b54b101..88e6f1499e88 100644 --- a/arch/riscv/include/asm/checksum.h +++ b/arch/riscv/include/asm/checksum.h @@ -53,7 +53,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { unsigned long fold_temp; - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 5a626ed2c47a..0bd11862b760 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -80,7 +80,7 @@ riscv_has_extension_likely(const unsigned long ext) "ext must be < RISCV_ISA_EXT_MAX"); if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { - asm_volatile_goto( + asm goto( ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1) : : [ext] "i" (ext) @@ -103,7 +103,7 @@ riscv_has_extension_unlikely(const unsigned long ext) "ext must be < RISCV_ISA_EXT_MAX"); if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { - asm_volatile_goto( + asm goto( ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1) : : [ext] "i" (ext) diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 14a5ea8d8ef0..4a35d787c019 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -17,7 +17,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( " .align 2 \n\t" " .option push \n\t" " .option norelax \n\t" @@ -39,7 +39,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto( + asm goto( " .align 2 \n\t" " .option push \n\t" " .option norelax \n\t" diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c index af3df5274ccb..74af3ab520b6 100644 --- a/arch/riscv/lib/csum.c +++ b/arch/riscv/lib/csum.c @@ -53,7 +53,7 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, * support, so nop when Zbb is available and jump when Zbb is * not available. */ - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : @@ -170,7 +170,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) * support, so nop when Zbb is available and jump when Zbb is * not available. */ - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : @@ -178,7 +178,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) : no_zbb); #ifdef CONFIG_32BIT - asm_volatile_goto(".option push \n\ + asm_goto_output(".option push \n\ .option arch,+zbb \n\ rori %[fold_temp], %[csum], 16 \n\ andi %[offset], %[offset], 1 \n\ @@ -193,7 +193,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) return (unsigned short)csum; #else /* !CONFIG_32BIT */ - asm_volatile_goto(".option push \n\ + asm_goto_output(".option push \n\ .option arch,+zbb \n\ rori %[fold_temp], %[csum], 32 \n\ add %[csum], %[fold_temp], %[csum] \n\ @@ -257,7 +257,7 @@ do_csum_no_alignment(const unsigned char *buff, int len) * support, so nop when Zbb is available and jump when Zbb is * not available. */ - asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 895f774bbcc5..bf78cf381dfc 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -25,7 +25,7 @@ */ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 0,%l[label]\n" + asm goto("0: brcl 0,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" @@ -39,7 +39,7 @@ label: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("0: brcl 15,%l[label]\n" + asm goto("0: brcl 15,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" ".balign 8\n" ".long 0b-.,%l[label]-.\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 94eb529dcb77..2718cbea826a 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -10,7 +10,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "nop\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" @@ -26,7 +26,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "b %l[l_yes]\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h index 4b6d1b526bc1..66fe06db872f 100644 --- a/arch/um/include/asm/cpufeature.h +++ b/arch/um/include/asm/cpufeature.h @@ -75,7 +75,7 @@ extern void setup_clear_cpu_cap(unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" + asm goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a26bebbdff87..a1273698fc43 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,7 +168,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto( + asm goto( ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 071572e23d3a..cbbef32517f0 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -24,7 +24,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes] # objtool NOPs this \n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (2 | branch) : : l_yes); @@ -38,7 +38,7 @@ l_yes: static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); @@ -52,7 +52,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 4b081e0d3306..363266cbcada 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -13,7 +13,7 @@ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index d6cd9344f6c7..48f8dd47cf68 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p) #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { - asm_volatile_goto("1: wrussq %[val], (%[addr])\n" + asm goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5c367c1290c3..237dc8cdd12b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -133,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -295,7 +295,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -375,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -394,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 36c8af87a707..4e725854c63a 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -8,7 +8,7 @@ #define svm_asm(insn, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) "\n\t" \ + asm goto("1: " __stringify(insn) "\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ ::: clobber : fault); \ return; \ @@ -18,7 +18,7 @@ fault: \ #define svm_asm1(insn, op1, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1 : clobber : fault); \ return; \ @@ -28,7 +28,7 @@ fault: \ #define svm_asm2(insn, op1, op2, clobber...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ :: op1, op2 : clobber : fault); \ return; \ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e262bc2ba4e5..1111d9d08903 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -738,7 +738,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, */ static int kvm_cpu_vmxoff(void) { - asm_volatile_goto("1: vmxoff\n\t" + asm goto("1: vmxoff\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "cc", "memory" : fault); @@ -2784,7 +2784,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) cr4_set_bits(X86_CR4_VMXE); - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + asm goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) : : [vmxon_pointer] "m"(vmxon_pointer) : : fault); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f41ce3c24123..8060e5fc6dbd 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT - asm_volatile_goto("1: vmread %[field], %[output]\n\t" + asm_goto_output("1: vmread %[field], %[output]\n\t" "jna %l[do_fail]\n\t" _ASM_EXTABLE(1b, %l[do_exception]) @@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) #define vmx_asm1(insn, op1, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + asm goto("1: " __stringify(insn) " %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ @@ -205,7 +205,7 @@ fault: \ #define vmx_asm2(insn, op1, op2, error_args...) \ do { \ - asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + asm goto("1: " __stringify(insn) " %1, %0\n\t" \ ".byte 0x2e\n\t" /* branch not taken hint */ \ "jna %l[error]\n\t" \ _ASM_EXTABLE(1b, %l[fault]) \ diff --git a/arch/xtensa/include/asm/jump_label.h b/arch/xtensa/include/asm/jump_label.h index c812bf85021c..46c8596259d2 100644 --- a/arch/xtensa/include/asm/jump_label.h +++ b/arch/xtensa/include/asm/jump_label.h @@ -13,7 +13,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" "_nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" @@ -38,7 +38,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, * make it reachable and wrap both into a no-transform block * to avoid any assembler interference with this. */ - asm_volatile_goto("1:\n\t" + asm goto("1:\n\t" ".begin no-transform\n\t" "_j %l[l_yes]\n\t" "2:\n\t" diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index aebb65bf95a7..c1a963be7d28 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -64,6 +64,25 @@ __builtin_unreachable(); \ } while (0) +/* + * GCC 'asm goto' with outputs miscompiles certain code sequences: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 + * + * Work it around via the same compiler barrier quirk that we used + * to use for the old 'asm goto' workaround. + * + * Also, always mark such 'asm goto' statements as volatile: all + * asm goto statements are supposed to be volatile as per the + * documentation, but some versions of gcc didn't actually do + * that for asms with outputs: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 + */ +#define asm_goto_output(x...) \ + do { asm volatile goto(x); asm (""); } while (0) + #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6f1ca49306d2..663d8791c871 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -362,8 +362,8 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +#ifndef asm_goto_output +#define asm_goto_output(x...) asm goto(x) #endif #ifdef CONFIG_CC_HAS_ASM_INLINE diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 90e275bb3e5d..a3a8ddca9918 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -57,7 +57,7 @@ /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ - asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h index 7048bb3594d6..634e81d83efd 100644 --- a/samples/bpf/asm_goto_workaround.h +++ b/samples/bpf/asm_goto_workaround.h @@ -4,14 +4,14 @@ #define __ASM_GOTO_WORKAROUND_H /* - * This will bring in asm_volatile_goto and asm_inline macro definitions + * This will bring in asm_goto_output and asm_inline macro definitions * if enabled by compiler and config options. */ #include -#ifdef asm_volatile_goto -#undef asm_volatile_goto -#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto") +#ifdef asm_goto_output +#undef asm_goto_output +#define asm_goto_output(x...) asm volatile("invalid use of asm_goto_output") #endif /* diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h index 11ff975242ca..e2ff22b379a4 100644 --- a/tools/arch/x86/include/asm/rmwcc.h +++ b/tools/arch/x86/include/asm/rmwcc.h @@ -4,7 +4,7 @@ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ - asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + asm goto (fullop "; j" cc " %l[cc_label]" \ : : "m" (var), ## __VA_ARGS__ \ : "memory" : cc_label); \ return 0; \ diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 1bdd834bdd57..d09f9dc172a4 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -36,8 +36,8 @@ #include #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +#ifndef asm_goto_output +#define asm_goto_output(x...) asm goto(x) #endif #endif /* __LINUX_COMPILER_TYPES_H */ From 841c35169323cd833294798e58b9bf63fa4fa1de Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Feb 2024 12:18:13 -0800 Subject: [PATCH 262/270] Linux 6.8-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a171eafce2a3..7e0b2ad98905 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 8 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From c60d847be7b8e69e419e02a2b3d19c2842a3c35d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 12 Feb 2024 19:30:52 +0000 Subject: [PATCH 263/270] KVM: arm64: Fix double-free following kvm_pgtable_stage2_free_unlinked() kvm_pgtable_stage2_free_unlinked() does the final put_page() on the root page of the sub-tree before returning, so remove the additional put_page() invocations in the callers. Cc: Ricardo Koller Fixes: f6a27d6dc51b2 ("KVM: arm64: Drop last page ref in kvm_pgtable_stage2_free_removed()") Signed-off-by: Will Deacon Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240212193052.27765-1-will@kernel.org --- arch/arm64/kvm/hyp/pgtable.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c651df904fe3..ab9d05fcf98b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1419,7 +1419,6 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, level + 1); if (ret) { kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); - mm_ops->put_page(pgtable); return ERR_PTR(ret); } @@ -1502,7 +1501,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_try_break_pte(ctx, mmu)) { kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); - mm_ops->put_page(childp); return -EAGAIN; } From 8d3a7dfb801d157ac423261d7cd62c33e95375f8 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 21 Feb 2024 09:27:31 +0000 Subject: [PATCH 264/270] KVM: arm64: vgic-its: Test for valid IRQ in its_sync_lpi_pending_table() vgic_get_irq() may not return a valid descriptor if there is no ITS that holds a valid translation for the specified INTID. If that is the case, it is safe to silently ignore it and continue processing the LPI pending table. Cc: stable@vger.kernel.org Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table") Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240221092732.4126848-2-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-its.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index e2764d0ffa9f..082448de27ed 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) } irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + if (!irq) + continue; + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); From 85a71ee9a0700f6c18862ef3b0011ed9dad99aca Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 21 Feb 2024 09:27:32 +0000 Subject: [PATCH 265/270] KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler It is possible that an LPI mapped in a different ITS gets unmapped while handling the MOVALL command. If that is the case, there is no state that can be migrated to the destination. Silently ignore it and continue migrating other LPIs. Cc: stable@vger.kernel.org Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE") Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20240221092732.4126848-3-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-its.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 082448de27ed..28a93074eca1 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, for (i = 0; i < irq_count; i++) { irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; update_affinity(irq, vcpu2); From e563592224e02f87048edee3ce3f0da16cceee88 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:08 -0800 Subject: [PATCH 266/270] KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY Disallow creating read-only memslots that support GUEST_MEMFD, as GUEST_MEMFD is fundamentally incompatible with KVM's semantics for read-only memslots. Read-only memslots allow the userspace VMM to emulate option ROMs by filling the backing memory with readable, executable code and data, while triggering emulated MMIO on writes. GUEST_MEMFD doesn't currently support writes from userspace and KVM doesn't support emulated MMIO on private accesses, i.e. the guest can only ever read zeros, and writes will always be treated as errors. Cc: Fuad Tabba Cc: Michael Roth Cc: Isaku Yamahata Cc: Yu Zhang Cc: Chao Peng Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Link: https://lore.kernel.org/r/20240222190612.2942589-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 10bfc88a69f7..0f50960b0e3a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1615,7 +1615,13 @@ static int check_memory_region_flags(struct kvm *kvm, valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; #ifdef __KVM_HAVE_READONLY_MEM - valid_flags |= KVM_MEM_READONLY; + /* + * GUEST_MEMFD is incompatible with read-only memslots, as writes to + * read-only memslots have emulated MMIO, not page fault, semantics, + * and KVM doesn't allow emulated MMIO for private memory. + */ + if (!(mem->flags & KVM_MEM_GUEST_MEMFD)) + valid_flags |= KVM_MEM_READONLY; #endif if (mem->flags & ~valid_flags) From 422692098c4c53a6b65c2ef235621aee6a38721f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:09 -0800 Subject: [PATCH 267/270] KVM: x86: Update KVM_SW_PROTECTED_VM docs to make it clear they're a WIP Rewrite the help message for KVM_SW_PROTECTED_VM to make it clear that software-protected VMs are a development and testing vehicle for guest_memfd(), and that attempting to use KVM_SW_PROTECTED_VM for anything remotely resembling a "real" VM will fail. E.g. any memory accesses from KVM will incorrectly access shared memory, nested TDP is wildly broken, and so on and so forth. Update KVM's API documentation with similar warnings to discourage anyone from attempting to run anything but selftests with KVM_X86_SW_PROTECTED_VM. Fixes: 89ea60c2c7b5 ("KVM: x86: Add support for "protected VMs" that can utilize private memory") Link: https://lore.kernel.org/r/20240222190612.2942589-3-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 5 +++++ arch/x86/kvm/Kconfig | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3ec0b7a455a0..09c7e585ff58 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8791,6 +8791,11 @@ means the VM type with value @n is supported. Possible values of @n are:: #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +Note, KVM_X86_SW_PROTECTED_VM is currently only for development and testing. +Do not use KVM_X86_SW_PROTECTED_VM for "real" VMs, and especially not in +production. The behavior and effective ABI for software-protected VMs is +unstable. + 9. Known KVM API problems ========================= diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 87e3da7b0439..65ed14b6540b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -80,9 +80,10 @@ config KVM_SW_PROTECTED_VM depends on KVM && X86_64 select KVM_GENERIC_PRIVATE_MEM help - Enable support for KVM software-protected VMs. Currently "protected" - means the VM can be backed with memory provided by - KVM_CREATE_GUEST_MEMFD. + Enable support for KVM software-protected VMs. Currently, software- + protected VMs are purely a development and testing vehicle for + KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a + software-protected VM will fail miserably. If unsure, say "N". From a1176ef5c92aa58e63ecf184b7cac2e311b2b233 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:10 -0800 Subject: [PATCH 268/270] KVM: x86/mmu: Restrict KVM_SW_PROTECTED_VM to the TDP MMU Advertise and support software-protected VMs if and only if the TDP MMU is enabled, i.e. disallow KVM_SW_PROTECTED_VM if TDP is enabled for KVM's legacy/shadow MMU. TDP support for the shadow MMU is maintenance-only, e.g. support for TDX and SNP will also be restricted to the TDP MMU. Fixes: 89ea60c2c7b5 ("KVM: x86: Add support for "protected VMs" that can utilize private memory") Link: https://lore.kernel.org/r/20240222190612.2942589-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48a61d283406..3638a104bcf7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4580,7 +4580,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) { return type == KVM_X86_DEFAULT_VM || (type == KVM_X86_SW_PROTECTED_VM && - IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled); + IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) From 63e5c5a10559077bb5f32edf783084e7164af9c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:11 -0800 Subject: [PATCH 269/270] KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases Actually create a GUEST_MEMFD instance and pass it to KVM when doing negative tests for KVM_SET_USER_MEMORY_REGION2 + KVM_MEM_GUEST_MEMFD. Without a valid GUEST_MEMFD file descriptor, KVM_SET_USER_MEMORY_REGION2 will always fail with -EINVAL, resulting in false passes for any and all tests of illegal combinations of KVM_MEM_GUEST_MEMFD and other flags. Fixes: 5d74316466f4 ("KVM: selftests: Add a memory region subtest to validate invalid flags") Link: https://lore.kernel.org/r/20240222190612.2942589-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/set_memory_region_test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 40337f566eeb..9df4b61116bc 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -367,11 +367,15 @@ static void test_invalid_memory_region_flags(void) } if (supported_flags & KVM_MEM_GUEST_MEMFD) { + int guest_memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); + r = __vm_set_user_memory_region2(vm, 0, KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, - 0, MEM_REGION_SIZE, NULL, 0, 0); + 0, MEM_REGION_SIZE, NULL, guest_memfd, 0); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); + + close(guest_memfd); } } From 2dfd2383034421101300a3b7325cf339a182d218 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:12 -0800 Subject: [PATCH 270/270] KVM: selftests: Add a testcase to verify GUEST_MEMFD and READONLY are exclusive Extend set_memory_region_test's invalid flags subtest to verify that GUEST_MEMFD is incompatible with READONLY. GUEST_MEMFD doesn't currently support writes from userspace and KVM doesn't support emulated MMIO on private accesses, and so KVM is supposed to reject the GUEST_MEMFD+READONLY in order to avoid configuration that KVM can't support. Link: https://lore.kernel.org/r/20240222190612.2942589-6-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/set_memory_region_test.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 9df4b61116bc..06b43ed23580 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -375,6 +375,12 @@ static void test_invalid_memory_region_flags(void) TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); + r = __vm_set_user_memory_region2(vm, 0, + KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD, + 0, MEM_REGION_SIZE, NULL, guest_memfd, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed, read-only GUEST_MEMFD memslots are unsupported"); + close(guest_memfd); } }