From 4b9748055457ac3a0710bf210c229d01ea1b01b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 18 Jan 2026 19:48:01 -0700 Subject: [PATCH 1/5] io_uring/rw: free potentially allocated iovec on cache put failure If a read/write request goes through io_req_rw_cleanup() and has an allocated iovec attached and fails to put to the rw_cache, then it may end up with an unaccounted iovec pointer. Have io_rw_recycle() return whether it recycled the request or not, and use that to gauge whether to free a potential iovec or not. Reviewed-by: Nitesh Shetty Signed-off-by: Jens Axboe --- io_uring/rw.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 331af6bf4234..2b7521129f8b 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -144,19 +144,22 @@ static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, return 0; } -static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) +static bool io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) - return; + return false; io_alloc_cache_vec_kasan(&rw->vec); if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) io_vec_free(&rw->vec); - if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { io_req_async_data_clear(req, 0); + return true; + } + return false; } static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) @@ -190,7 +193,11 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) */ if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; - io_rw_recycle(req, issue_flags); + if (!io_rw_recycle(req, issue_flags)) { + struct io_async_rw *rw = req->async_data; + + io_vec_free(&rw->vec); + } } } From b994ace83a2bc7699420f6a4c6b860c8da133159 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Jan 2026 19:46:26 -0700 Subject: [PATCH 2/5] io_uring/waitid: fix KCSAN warning on io_waitid->head Storing of the iw->head entry inside the wait_queue callback, or when removing a waitid item, really should use proper load/store acquire/release semantics, and KCSAN correctly warns of that. Ensure that they do so. Reported-by: syzbot+eb441775f4f948a0902f@syzkaller.appspotmail.com Fixes: a48c0cbf28c0 ("io_uring/waitid: have io_waitid_complete() remove wait queue entry") Signed-off-by: Jens Axboe --- io_uring/waitid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 2d4cbd47c67c..d25d60aed6af 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -114,11 +114,11 @@ static void io_waitid_remove_wq(struct io_kiocb *req) struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct wait_queue_head *head; - head = READ_ONCE(iw->head); + head = smp_load_acquire(&iw->head); if (head) { struct io_waitid_async *iwa = req->async_data; - iw->head = NULL; + smp_store_release(&iw->head, NULL); spin_lock_irq(&head->lock); list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&head->lock); @@ -246,7 +246,7 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, return 0; list_del_init(&wait->entry); - iw->head = NULL; + smp_store_release(&iw->head, NULL); /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) From 10dc959398175736e495f71c771f8641e1ca1907 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Jan 2026 07:42:50 -0700 Subject: [PATCH 3/5] io_uring/io-wq: check IO_WQ_BIT_EXIT inside work run loop Currently this is checked before running the pending work. Normally this is quite fine, as work items either end up blocking (which will create a new worker for other items), or they complete fairly quickly. But syzbot reports an issue where io-wq takes seemingly forever to exit, and with a bit of debugging, this turns out to be because it queues a bunch of big (2GB - 4096b) reads with a /dev/msr* file. Since this file type doesn't support ->read_iter(), loop_rw_iter() ends up handling them. Each read returns 16MB of data read, which takes 20 (!!) seconds. With a bunch of these pending, processing the whole chain can take a long time. Easily longer than the syzbot uninterruptible sleep timeout of 140 seconds. This then triggers a complaint off the io-wq exit path: INFO: task syz.4.135:6326 blocked for more than 143 seconds. Not tainted syzkaller #0 Blocked by coredump. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz.4.135 state:D stack:26824 pid:6326 tgid:6324 ppid:5957 task_flags:0x400548 flags:0x00080000 Call Trace: context_switch kernel/sched/core.c:5256 [inline] __schedule+0x1139/0x6150 kernel/sched/core.c:6863 __schedule_loop kernel/sched/core.c:6945 [inline] schedule+0xe7/0x3a0 kernel/sched/core.c:6960 schedule_timeout+0x257/0x290 kernel/time/sleep_timeout.c:75 do_wait_for_common kernel/sched/completion.c:100 [inline] __wait_for_common+0x2fc/0x4e0 kernel/sched/completion.c:121 io_wq_exit_workers io_uring/io-wq.c:1328 [inline] io_wq_put_and_exit+0x271/0x8a0 io_uring/io-wq.c:1356 io_uring_clean_tctx+0x10d/0x190 io_uring/tctx.c:203 io_uring_cancel_generic+0x69c/0x9a0 io_uring/cancel.c:651 io_uring_files_cancel include/linux/io_uring.h:19 [inline] do_exit+0x2ce/0x2bd0 kernel/exit.c:911 do_group_exit+0xd3/0x2a0 kernel/exit.c:1112 get_signal+0x2671/0x26d0 kernel/signal.c:3034 arch_do_signal_or_restart+0x8f/0x7e0 arch/x86/kernel/signal.c:337 __exit_to_user_mode_loop kernel/entry/common.c:41 [inline] exit_to_user_mode_loop+0x8c/0x540 kernel/entry/common.c:75 __exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline] syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:159 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:194 [inline] do_syscall_64+0x4ee/0xf80 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fa02738f749 RSP: 002b:00007fa0281ae0e8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 00007fa0275e6098 RCX: 00007fa02738f749 RDX: 0000000000000000 RSI: 0000000000000080 RDI: 00007fa0275e6098 RBP: 00007fa0275e6090 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fa0275e6128 R14: 00007fff14e4fcb0 R15: 00007fff14e4fd98 There's really nothing wrong here, outside of processing these reads will take a LONG time. However, we can speed up the exit by checking the IO_WQ_BIT_EXIT inside the io_worker_handle_work() loop, as syzbot will exit the ring after queueing up all of these reads. Then once the first item is processed, io-wq will simply cancel the rest. That should avoid syzbot running into this complaint again. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/68a2decc.050a0220.e29e5.0099.GAE@google.com/ Reported-by: syzbot+4eb282331cab6d5b6588@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 9fd9f6ab722c..2fa7d3601edb 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -598,9 +598,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, __releases(&acct->lock) { struct io_wq *wq = worker->wq; - bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { + bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); struct io_wq_work *work; /* From 73061dbeca783aaf311e1af9610f8cba1c1176cd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 21:11:44 +0000 Subject: [PATCH 4/5] selftests/io_uring: add io_uring_queue_init_params Add a ring init variant taking struct io_uring_params, which mimicks liburing API. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- tools/include/io_uring/mini_liburing.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/include/io_uring/mini_liburing.h b/tools/include/io_uring/mini_liburing.h index 9ccb16074eb5..a55407b09dbb 100644 --- a/tools/include/io_uring/mini_liburing.h +++ b/tools/include/io_uring/mini_liburing.h @@ -126,21 +126,18 @@ static inline int io_uring_enter(int fd, unsigned int to_submit, flags, sig, _NSIG / 8); } -static inline int io_uring_queue_init(unsigned int entries, - struct io_uring *ring, - unsigned int flags) +static inline int io_uring_queue_init_params(unsigned int entries, + struct io_uring *ring, + struct io_uring_params *p) { - struct io_uring_params p; int fd, ret; memset(ring, 0, sizeof(*ring)); - memset(&p, 0, sizeof(p)); - p.flags = flags; - fd = io_uring_setup(entries, &p); + fd = io_uring_setup(entries, p); if (fd < 0) return fd; - ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq); + ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); if (!ret) ring->ring_fd = fd; else @@ -148,6 +145,18 @@ static inline int io_uring_queue_init(unsigned int entries, return ret; } +static inline int io_uring_queue_init(unsigned int entries, + struct io_uring *ring, + unsigned int flags) +{ + struct io_uring_params p; + + memset(&p, 0, sizeof(p)); + p.flags = flags; + + return io_uring_queue_init_params(entries, ring, &p); +} + /* Get a sqe */ static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) { From 145e0074392587606aa5df353d0e761f0b8357d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 21:11:45 +0000 Subject: [PATCH 5/5] selftests/io_uring: support NO_SQARRAY in miniliburing Add support for IORING_SETUP_NO_SQARRAY in miniliburing. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- tools/include/io_uring/mini_liburing.h | 34 ++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/tools/include/io_uring/mini_liburing.h b/tools/include/io_uring/mini_liburing.h index a55407b09dbb..44be4446feda 100644 --- a/tools/include/io_uring/mini_liburing.h +++ b/tools/include/io_uring/mini_liburing.h @@ -6,6 +6,7 @@ #include #include #include +#include struct io_sq_ring { unsigned int *head; @@ -55,6 +56,7 @@ struct io_uring { struct io_uring_sq sq; struct io_uring_cq cq; int ring_fd; + unsigned flags; }; #if defined(__x86_64) || defined(__i386__) @@ -72,7 +74,14 @@ static inline int io_uring_mmap(int fd, struct io_uring_params *p, void *ptr; int ret; - sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned int); + if (p->flags & IORING_SETUP_NO_SQARRAY) { + sq->ring_sz = p->cq_off.cqes; + sq->ring_sz += p->cq_entries * sizeof(struct io_uring_cqe); + } else { + sq->ring_sz = p->sq_off.array; + sq->ring_sz += p->sq_entries * sizeof(unsigned int); + } + ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); if (ptr == MAP_FAILED) @@ -83,7 +92,8 @@ static inline int io_uring_mmap(int fd, struct io_uring_params *p, sq->kring_entries = ptr + p->sq_off.ring_entries; sq->kflags = ptr + p->sq_off.flags; sq->kdropped = ptr + p->sq_off.dropped; - sq->array = ptr + p->sq_off.array; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + sq->array = ptr + p->sq_off.array; size = p->sq_entries * sizeof(struct io_uring_sqe); sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, @@ -138,10 +148,12 @@ static inline int io_uring_queue_init_params(unsigned int entries, if (fd < 0) return fd; ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); - if (!ret) + if (!ret) { ring->ring_fd = fd; - else + ring->flags = p->flags; + } else { close(fd); + } return ret; } @@ -208,10 +220,18 @@ static inline int io_uring_submit(struct io_uring *ring) ktail = *sq->ktail; to_submit = sq->sqe_tail - sq->sqe_head; - for (submitted = 0; submitted < to_submit; submitted++) { - read_barrier(); - sq->array[ktail++ & mask] = sq->sqe_head++ & mask; + + if (!(ring->flags & IORING_SETUP_NO_SQARRAY)) { + for (submitted = 0; submitted < to_submit; submitted++) { + read_barrier(); + sq->array[ktail++ & mask] = sq->sqe_head++ & mask; + } + } else { + ktail += to_submit; + sq->sqe_head += to_submit; + submitted = to_submit; } + if (!submitted) return 0;