From b5f96cb719d8ba220b565ddd3ba4ac0d8bcfb130 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 13 Dec 2022 09:58:07 +0100 Subject: [PATCH 1/4] nvme-pci: fix doorbell buffer value endianness When using shadow doorbells, the event index and the doorbell values are written to host memory. Prior to this patch, the values written would erroneously be written in host endianness. This causes trouble on big-endian platforms. Fix this by adding missing endian conversions. This issue was noticed by Guenter while testing various big-endian platforms under QEMU[1]. A similar fix required for hw/nvme in QEMU is up for review as well[2]. [1]: https://lore.kernel.org/qemu-devel/20221209110022.GA3396194@roeck-us.net/ [2]: https://lore.kernel.org/qemu-devel/20221212114409.34972-4-its@irrelevant.dk/ Fixes: f9f38e33389c ("nvme: improve performance for virtual NVMe devices") Reported-by: Guenter Roeck Signed-off-by: Klaus Jensen Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f0f8027644bb..017442858054 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -144,9 +144,9 @@ struct nvme_dev { mempool_t *iod_mempool; /* shadow doorbell buffer support: */ - u32 *dbbuf_dbs; + __le32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; - u32 *dbbuf_eis; + __le32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; /* host memory buffer support: */ @@ -208,10 +208,10 @@ struct nvme_queue { #define NVMEQ_SQ_CMB 1 #define NVMEQ_DELETE_ERROR 2 #define NVMEQ_POLLED 3 - u32 *dbbuf_sq_db; - u32 *dbbuf_cq_db; - u32 *dbbuf_sq_ei; - u32 *dbbuf_cq_ei; + __le32 *dbbuf_sq_db; + __le32 *dbbuf_cq_db; + __le32 *dbbuf_sq_ei; + __le32 *dbbuf_cq_ei; struct completion delete_done; }; @@ -343,11 +343,11 @@ static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) } /* Update dbbuf and return true if an MMIO is required */ -static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, - volatile u32 *dbbuf_ei) +static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, + volatile __le32 *dbbuf_ei) { if (dbbuf_db) { - u16 old_value; + u16 old_value, event_idx; /* * Ensure that the queue is written before updating @@ -355,8 +355,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ wmb(); - old_value = *dbbuf_db; - *dbbuf_db = value; + old_value = le32_to_cpu(*dbbuf_db); + *dbbuf_db = cpu_to_le32(value); /* * Ensure that the doorbell is updated before reading the event @@ -366,7 +366,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ mb(); - if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + event_idx = le32_to_cpu(*dbbuf_ei); + if (!nvme_dbbuf_need_event(event_idx, value, old_value)) return false; } From c89a529e823d51dd23c7ec0c047c7a454a428541 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Dec 2022 10:59:06 -0800 Subject: [PATCH 2/4] nvme-pci: fix mempool alloc size Convert the max size to bytes to match the units of the divisor that calculates the worst-case number of PRP entries. The result is used to determine how many PRP Lists are required. The code was previously rounding this to 1 list, but we can require 2 in the worst case. In that scenario, the driver would corrupt memory beyond the size provided by the mempool. While unlikely to occur (you'd need a 4MB in exactly 127 phys segments on a queue that doesn't support SGLs), this memory corruption has been observed by kfence. Cc: Jens Axboe Fixes: 943e942e6266f ("nvme-pci: limit max IO size and segments to avoid high order allocations") Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 017442858054..6e9d1c7409a9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -381,8 +381,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, */ static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE); + unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; + unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } From 841734234a28fd5cd0889b84bd4d93a0988fa11e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Dec 2022 13:54:55 -0800 Subject: [PATCH 3/4] nvme-pci: fix page size checks The size allocated out of the dma pool is at most NVME_CTRL_PAGE_SIZE, which may be smaller than the PAGE_SIZE. Fixes: c61b82c7b7134 ("nvme-pci: fix PRP pool size") Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6e9d1c7409a9..804b6a6cb43a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -36,7 +36,7 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) /* * These can be higher, but we need to ensure that any command doesn't @@ -383,7 +383,7 @@ static int nvme_pci_npages_prp(void) { unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } /* @@ -393,7 +393,7 @@ static int nvme_pci_npages_prp(void) static int nvme_pci_npages_sgl(void) { return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), - PAGE_SIZE); + NVME_CTRL_PAGE_SIZE); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -709,7 +709,7 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->length = cpu_to_le32(entries * sizeof(*sge)); sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } else { - sge->length = cpu_to_le32(PAGE_SIZE); + sge->length = cpu_to_le32(NVME_CTRL_PAGE_SIZE); sge->type = NVME_SGL_FMT_SEG_DESC << 4; } } From 3659fb5ac29a5e6102bebe494ac789fd47fb78f4 Mon Sep 17 00:00:00 2001 From: Yanjun Zhang Date: Thu, 22 Dec 2022 09:57:21 +0800 Subject: [PATCH 4/4] nvme: fix multipath crash caused by flush request when blktrace is enabled The flush request initialized by blk_kick_flush has NULL bio, and it may be dealt with nvme_end_req during io completion. When blktrace is enabled, nvme_trace_bio_complete with multipath activated trying to access NULL pointer bio from flush request results in the following crash: [ 2517.831677] BUG: kernel NULL pointer dereference, address: 000000000000001a [ 2517.835213] #PF: supervisor read access in kernel mode [ 2517.838724] #PF: error_code(0x0000) - not-present page [ 2517.842222] PGD 7b2d51067 P4D 0 [ 2517.845684] Oops: 0000 [#1] SMP NOPTI [ 2517.849125] CPU: 2 PID: 732 Comm: kworker/2:1H Kdump: loaded Tainted: G S 5.15.67-0.cl9.x86_64 #1 [ 2517.852723] Hardware name: XFUSION 2288H V6/BC13MBSBC, BIOS 1.13 07/27/2022 [ 2517.856358] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [ 2517.859993] RIP: 0010:blk_add_trace_bio_complete+0x6/0x30 [ 2517.863628] Code: 1f 44 00 00 48 8b 46 08 31 c9 ba 04 00 10 00 48 8b 80 50 03 00 00 48 8b 78 50 e9 e5 fe ff ff 0f 1f 44 00 00 41 54 49 89 f4 55 <0f> b6 7a 1a 48 89 d5 e8 3e 1c 2b 00 48 89 ee 4c 89 e7 5d 89 c1 ba [ 2517.871269] RSP: 0018:ff7f6a008d9dbcd0 EFLAGS: 00010286 [ 2517.875081] RAX: ff3d5b4be00b1d50 RBX: 0000000002040002 RCX: ff3d5b0a270f2000 [ 2517.878966] RDX: 0000000000000000 RSI: ff3d5b0b021fb9f8 RDI: 0000000000000000 [ 2517.882849] RBP: ff3d5b0b96a6fa00 R08: 0000000000000001 R09: 0000000000000000 [ 2517.886718] R10: 000000000000000c R11: 000000000000000c R12: ff3d5b0b021fb9f8 [ 2517.890575] R13: 0000000002000000 R14: ff3d5b0b021fb1b0 R15: 0000000000000018 [ 2517.894434] FS: 0000000000000000(0000) GS:ff3d5b42bfc80000(0000) knlGS:0000000000000000 [ 2517.898299] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2517.902157] CR2: 000000000000001a CR3: 00000004f023e005 CR4: 0000000000771ee0 [ 2517.906053] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2517.909930] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2517.913761] PKRU: 55555554 [ 2517.917558] Call Trace: [ 2517.921294] [ 2517.924982] nvme_complete_rq+0x1c3/0x1e0 [nvme_core] [ 2517.928715] nvme_tcp_recv_pdu+0x4d7/0x540 [nvme_tcp] [ 2517.932442] nvme_tcp_recv_skb+0x4f/0x240 [nvme_tcp] [ 2517.936137] ? nvme_tcp_recv_pdu+0x540/0x540 [nvme_tcp] [ 2517.939830] tcp_read_sock+0x9c/0x260 [ 2517.943486] nvme_tcp_try_recv+0x65/0xa0 [nvme_tcp] [ 2517.947173] nvme_tcp_io_work+0x64/0x90 [nvme_tcp] [ 2517.950834] process_one_work+0x1e8/0x390 [ 2517.954473] worker_thread+0x53/0x3c0 [ 2517.958069] ? process_one_work+0x390/0x390 [ 2517.961655] kthread+0x10c/0x130 [ 2517.965211] ? set_kthread_struct+0x40/0x40 [ 2517.968760] ret_from_fork+0x1f/0x30 [ 2517.972285] To avoid this situation, add a NULL check for req->bio before calling trace_block_bio_complete. Signed-off-by: Yanjun Zhang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6bbb73ef8b25..424c8a467a0c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -893,7 +893,7 @@ static inline void nvme_trace_bio_complete(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - if (req->cmd_flags & REQ_NVME_MPATH) + if ((req->cmd_flags & REQ_NVME_MPATH) && req->bio) trace_block_bio_complete(ns->head->disk->queue, req->bio); }