Merge tag 'for-6.15/io_uring-epoll-wait-20250325' of git://git.kernel.dk/linux

Pull io_uring epoll support from Jens Axboe:
 "This adds support for reading epoll events via io_uring.

  While this may seem counter-intuitive (and/or productive), the
  reasoning here is that quite a few existing epoll event loops can
  easily do a partial conversion to a completion based model, but are
  still stuck with one (or few) event types that remain readiness based.

  For that case, they then need to add the io_uring fd to the epoll
  context, and continue to rely on epoll_wait(2) for waiting on events.
  This misses out on the finer grained waiting that io_uring can do, to
  reduce context switches and wait for multiple events in one batch
  reliably.

  By adding support for reaping epoll events via io_uring, all of the
  legacy readiness based event types can still be reaped via epoll, with
  the overall waiting in the loop being driven by io_uring"

* tag 'for-6.15/io_uring-epoll-wait-20250325' of git://git.kernel.dk/linux:
  io_uring/epoll: add support for IORING_OP_EPOLL_WAIT
  io_uring/epoll: remove CONFIG_EPOLL guards
This commit is contained in:
Linus Torvalds
2025-03-28 14:55:32 -07:00
5 changed files with 55 additions and 6 deletions

View File

@@ -280,6 +280,7 @@ enum io_uring_op {
IORING_OP_BIND,
IORING_OP_LISTEN,
IORING_OP_RECV_ZC,
IORING_OP_EPOLL_WAIT,
/* this goes last, obviously */
IORING_OP_LAST,

View File

@@ -11,10 +11,11 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
epoll.o statx.o timeout.o fdinfo.o \
cancel.o waitid.o register.o \
truncate.o memmap.o alloc_cache.o
statx.o timeout.o fdinfo.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o

View File

@@ -12,7 +12,6 @@
#include "io_uring.h"
#include "epoll.h"
#if defined(CONFIG_EPOLL)
struct io_epoll {
struct file *file;
int epfd;
@@ -21,6 +20,12 @@ struct io_epoll {
struct epoll_event event;
};
/*
 * Per-request command data for IORING_OP_EPOLL_WAIT. maxevents and events
 * are filled in by io_epoll_wait_prep() and consumed by io_epoll_wait().
 */
struct io_epoll_wait {
/*
 * Not touched by the handlers in this file (they use req->file).
 * NOTE(review): presumably has to stay the first member so this struct can
 * overlay the request's command area - confirm against io_kiocb_to_cmd().
 */
struct file *file;
/* Copied from sqe->len; handed to epoll_sendevents() as its last argument. */
int maxevents;
/* User pointer built from sqe->addr; handed to epoll_sendevents(). */
struct epoll_event __user *events;
};
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
@@ -58,4 +63,30 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
#endif
/*
 * Prep handler for IORING_OP_EPOLL_WAIT.
 *
 * Fails with -EINVAL if any of the SQE fields off, rw_flags, buf_index or
 * splice_fd_in is non-zero. Otherwise records sqe->len as the event count
 * and sqe->addr as the userspace event buffer in the request's command
 * data, and returns 0.
 */
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_epoll_wait *ew;

	/* These SQE fields are not used by this opcode and must be zero. */
	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;

	ew = io_kiocb_to_cmd(req, struct io_epoll_wait);
	ew->maxevents = READ_ONCE(sqe->len);
	ew->events = u64_to_user_ptr(READ_ONCE(sqe->addr));

	return 0;
}
/*
 * Issue handler for IORING_OP_EPOLL_WAIT.
 *
 * Calls epoll_sendevents() on req->file with the buffer and count stored
 * by io_epoll_wait_prep(). If that returns 0, -EAGAIN is returned without
 * setting a result. Otherwise the return value (marking the request failed
 * when it is negative) becomes the request's result and IOU_OK is returned.
 *
 * issue_flags is not used here.
 */
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_epoll_wait *ew = io_kiocb_to_cmd(req, struct io_epoll_wait);
	int nr_events;

	nr_events = epoll_sendevents(req->file, ew->events, ew->maxevents);

	/*
	 * NOTE(review): presumably -EAGAIN makes the io_uring core arm poll
	 * on the epoll file and retry later - confirm against the core issue
	 * path and the opcode's issue definition.
	 */
	if (!nr_events)
		return -EAGAIN;

	if (nr_events < 0)
		req_set_fail(req);

	io_req_set_res(req, nr_events, 0);
	return IOU_OK;
}

View File

@@ -3,4 +3,6 @@
#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
/* Prep and issue handlers for IORING_OP_EPOLL_WAIT. */
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags);
#endif

View File

@@ -527,6 +527,17 @@ const struct io_issue_def io_issue_defs[] = {
.issue = io_recvzc,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_EPOLL_WAIT] = {
.needs_file = 1,
.audit_skip = 1,
.pollin = 1,
#if defined(CONFIG_EPOLL)
.prep = io_epoll_wait_prep,
.issue = io_epoll_wait,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
@@ -761,6 +772,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_RECV_ZC] = {
.name = "RECV_ZC",
},
[IORING_OP_EPOLL_WAIT] = {
.name = "EPOLL_WAIT",
},
};
const char *io_uring_get_opcode(u8 opcode)