From bc7acc0bd0f94c26bc0defc902311794a3d0fae9 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 20 Nov 2024 15:31:16 -0800 Subject: [PATCH 001/337] of: property: fw_devlink: Do not use interrupt-parent directly commit 7f00be96f125 ("of: property: Add device link support for interrupt-parent, dmas and -gpio(s)") started adding device links for the interrupt-parent property. commit 4104ca776ba3 ("of: property: Add fw_devlink support for interrupts") and commit f265f06af194 ("of: property: Fix fw_devlink handling of interrupts/interrupts-extended") later added full support for parsing the interrupts and interrupts-extended properties, which includes looking up the node of the parent domain. This made the handler for the interrupt-parent property redundant. In fact, creating device links based solely on interrupt-parent is problematic, because it can create spurious cycles. A node may have this property without itself being an interrupt controller or consumer. For example, this property is often present in the root node or a /soc bus node to set the default interrupt parent for child nodes. However, it is incorrect for the bus to depend on the interrupt controller, as some of the bus's children may not be interrupt consumers at all or may have a different interrupt parent. Resolving these spurious dependency cycles can cause an incorrect probe order for interrupt controller drivers. This was observed on a RISC-V system with both an APLIC and IMSIC under /soc, where interrupt-parent in /soc points to the APLIC, and the APLIC msi-parent points to the IMSIC. fw_devlink found three dependency cycles and attempted to probe the APLIC before the IMSIC. After applying this patch, there were no dependency cycles and the probe order was correct. Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Fixes: 4104ca776ba3 ("of: property: Add fw_devlink support for interrupts") Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20241120233124.3649382-1-samuel.holland@sifive.com Signed-off-by: Rob Herring (Arm) --- drivers/of/property.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index 519bf9229e61..cfc8aea002e4 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1286,7 +1286,6 @@ DEFINE_SIMPLE_PROP(iommus, "iommus", "#iommu-cells") DEFINE_SIMPLE_PROP(mboxes, "mboxes", "#mbox-cells") DEFINE_SIMPLE_PROP(io_channels, "io-channels", "#io-channel-cells") DEFINE_SIMPLE_PROP(io_backends, "io-backends", "#io-backend-cells") -DEFINE_SIMPLE_PROP(interrupt_parent, "interrupt-parent", NULL) DEFINE_SIMPLE_PROP(dmas, "dmas", "#dma-cells") DEFINE_SIMPLE_PROP(power_domains, "power-domains", "#power-domain-cells") DEFINE_SIMPLE_PROP(hwlocks, "hwlocks", "#hwlock-cells") @@ -1432,7 +1431,6 @@ static const struct supplier_bindings of_supplier_bindings[] = { { .parse_prop = parse_mboxes, }, { .parse_prop = parse_io_channels, }, { .parse_prop = parse_io_backends, }, - { .parse_prop = parse_interrupt_parent, }, { .parse_prop = parse_dmas, .optional = true, }, { .parse_prop = parse_power_domains, }, { .parse_prop = parse_hwlocks, }, From 1a75e81baf4f1b322f3498ffd373eaada8e60589 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Sun, 24 Nov 2024 11:05:36 +0100 Subject: [PATCH 002/337] of/unittest: Add empty dma-ranges address translation tests Intermediate DT PCI nodes dynamically generated by enabling CONFIG_PCI_DYNAMIC_OF_NODES have empty dma-ranges property. 
PCI address specifiers have 3 cells and when dma-ranges is missing or empty, of_translate_one() is currently dropping the flag portion of PCI addresses which are subnodes of the aforementioned ones, failing the translation. Add new tests covering this case. With this test, we get 1 new failure which is fixed in subsequent commit: FAIL of_unittest_pci_empty_dma_ranges():1245 for_each_of_pci_range wrong CPU addr (ffffffffffffffff) on node /testcase-data/address-tests2/pcie@d1070000/pci@0,0/dev@0,0/local-bus@0 Signed-off-by: Andrea della Porta Link: https://lore.kernel.org/r/08f8fee4fdc0379240fda2f4a0e6f11ebf9647a8.1732441813.git.andrea.porta@suse.com Signed-off-by: Rob Herring (Arm) --- drivers/of/unittest-data/tests-address.dtsi | 2 ++ drivers/of/unittest.c | 39 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/drivers/of/unittest-data/tests-address.dtsi b/drivers/of/unittest-data/tests-address.dtsi index 3344f15c3755..f02a181bb125 100644 --- a/drivers/of/unittest-data/tests-address.dtsi +++ b/drivers/of/unittest-data/tests-address.dtsi @@ -114,6 +114,7 @@ device_type = "pci"; ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x7f00000>, <0x81000000 0 0x00000000 0 0xefff0000 0 0x0010000>; + dma-ranges = <0x43000000 0x10 0x00 0x00 0x00 0x00 0x10000000>; reg = <0x00000000 0xd1070000 0x20000>; pci@0,0 { @@ -142,6 +143,7 @@ #size-cells = <0x01>; ranges = <0xa0000000 0 0 0 0x2000000>, <0xb0000000 1 0 0 0x1000000>; + dma-ranges = <0xc0000000 0x43000000 0x10 0x00 0x10000000>; dev@e0000000 { reg = <0xa0001000 0x1000>, diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index daf9a2dddd7e..80483e38d7b4 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -1213,6 +1213,44 @@ static void __init of_unittest_pci_dma_ranges(void) of_node_put(np); } +static void __init of_unittest_pci_empty_dma_ranges(void) +{ + struct device_node *np; + struct of_pci_range range; + struct of_pci_range_parser parser; + + if (!IS_ENABLED(CONFIG_PCI)) + return; + + np = of_find_node_by_path("/testcase-data/address-tests2/pcie@d1070000/pci@0,0/dev@0,0/local-bus@0"); + if (!np) { + pr_err("missing testcase data\n"); + return; + } + + if (of_pci_dma_range_parser_init(&parser, np)) { + pr_err("missing dma-ranges property\n"); + return; + } + + /* + * Get the dma-ranges from the device tree + */ + for_each_of_pci_range(&parser, &range) { + unittest(range.size == 0x10000000, + "for_each_of_pci_range wrong size on node %pOF size=%llx\n", + np, range.size); + unittest(range.cpu_addr == 0x00000000, + "for_each_of_pci_range wrong CPU addr (%llx) on node %pOF", + range.cpu_addr, np); + unittest(range.pci_addr == 0xc0000000, + "for_each_of_pci_range wrong DMA addr (%llx) on node %pOF", + range.pci_addr, np); + } + + of_node_put(np); +} + static void __init of_unittest_bus_ranges(void) { struct device_node *np; @@ -4272,6 +4310,7 @@ static int __init of_unittest(void) of_unittest_dma_get_max_cpu_address(); of_unittest_parse_dma_ranges(); of_unittest_pci_dma_ranges(); + of_unittest_pci_empty_dma_ranges(); of_unittest_bus_ranges(); of_unittest_bus_3cell_ranges(); of_unittest_reg(); From 7f05e20b989ac33c9c0f8c2028ec0a566493548f Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Sun, 24 Nov 2024 11:05:37 +0100 Subject: [PATCH 003/337] of: address: Preserve the flags portion on 1:1 dma-ranges mapping A missing or empty dma-ranges in a DT node implies a 1:1 mapping for dma translations. 
In this specific case, the current behaviour is to zero out the entire specifier so that the translation could be carried on as an offset from zero. This includes address specifier that has flags (e.g. PCI ranges). Once the flags portion has been zeroed, the translation chain is broken since the mapping functions will check the upcoming address specifier against mismatching flags, always failing the 1:1 mapping and its entire purpose of always succeeding. Set to zero only the address portion while passing the flags through. Fixes: dbbdee94734b ("of/address: Merge all of the bus translation code") Cc: stable@vger.kernel.org Signed-off-by: Andrea della Porta Tested-by: Herve Codina Link: https://lore.kernel.org/r/e51ae57874e58a9b349c35e2e877425ebc075d7a.1732441813.git.andrea.porta@suse.com Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index c5b925ac469f..5b7ee3ed5296 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -459,7 +459,8 @@ static int of_translate_one(const struct device_node *parent, const struct of_bu } if (ranges == NULL || rlen == 0) { offset = of_read_number(addr, na); - memset(addr, 0, pna * 4); + /* set address to zero, pass flags through */ + memset(addr + pbus->flag_cells, 0, (pna - pbus->flag_cells) * 4); pr_debug("empty ranges; 1:1 translation\n"); goto finish; } From 61a6ba233fe1198e9eacc9ca1d1cbdb27f70cee5 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 13 Nov 2024 16:56:13 -0600 Subject: [PATCH 004/337] dt-bindings: Unify "fsl,liodn" type definitions The type definition of "fsl,liodn" is defined as uint32 in crypto/fsl,sec-v4.0.yaml and uint32-array in soc/fsl/fsl,bman.yaml, soc/fsl/fsl,qman-portal.yaml, and soc/fsl/fsl,qman.yaml. Unify the type to be uint32-array and constraint the single entry cases. Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20241113225614.1782862-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/crypto/fsl,sec-v4.0.yaml | 10 ++++++---- .../devicetree/bindings/soc/fsl/fsl,qman-portal.yaml | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml index 9c8c9991f29a..f0c4a7c83568 100644 --- a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml +++ b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml @@ -114,8 +114,9 @@ patternProperties: table that specifies the PPID to LIODN mapping. Needed if the PAMU is used. Value is a 12 bit value where value is a LIODN ID for this JR. This property is normally set by boot firmware. - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 0xfff + $ref: /schemas/types.yaml#/definitions/uint32-array + items: + - maximum: 0xfff '^rtic@[0-9a-f]+$': type: object @@ -186,8 +187,9 @@ patternProperties: Needed if the PAMU is used. Value is a 12 bit value where value is a LIODN ID for this JR. This property is normally set by boot firmware. 
- $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 0xfff + $ref: /schemas/types.yaml#/definitions/uint32-array + items: + - maximum: 0xfff fsl,rtic-region: description: diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml index 17016184143f..e459fec02ba8 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml @@ -35,6 +35,7 @@ properties: fsl,liodn: $ref: /schemas/types.yaml#/definitions/uint32-array + maxItems: 2 description: See pamu.txt. Two LIODN(s). DQRR LIODN (DLIODN) and Frame LIODN (FLIODN) @@ -69,6 +70,7 @@ patternProperties: type: object properties: fsl,liodn: + $ref: /schemas/types.yaml#/definitions/uint32-array description: See pamu.txt, PAMU property used for static LIODN assignment fsl,iommu-parent: From 60bc447c85f80d3184c7ac327e1d29e0b0a11d46 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 2 Dec 2024 14:15:17 +0100 Subject: [PATCH 005/337] of: Add #address-cells/#size-cells in the device-tree root empty node On systems where ACPI is enabled or when a device-tree is not passed to the kernel by the bootloader, a device-tree root empty node is created. This device-tree root empty node does not have the #address-cells and the #size-cells properties This leads to the use of the default address cells and size cells values which are defined in the code to 1 for the address cells value and 1 for the size cells value. According to the devicetree specification and the OpenFirmware standard (IEEE 1275-1994) the default value for #address-cells should be 2. Also, according to the devicetree specification, the #address-cells and the #size-cells are required properties in the root node. The device tree compiler already uses 2 as default value for address cells and 1 for size cells. The powerpc PROM code also uses 2 as default value for address cells and 1 for size cells. Modern implementation should have the #address-cells and the #size-cells properties set and should not rely on default values. On x86, this root empty node is used and the code default values are used. In preparation of the support for device-tree overlay on PCI devices feature on x86 (i.e. the creation of the PCI root bus device-tree node), the default value for #address-cells needs to be updated. Indeed, on x86_64, addresses are on 64bits and the upper part of an address is needed for correct address translations. On x86_32 having the default value updated does not lead to issues while the upper part of a 64-bit value is zero. Changing the default value for all architectures may break device-tree compatibility. Indeed, existing dts file without the #address-cells property set in the root node will not be compatible with this modification. Instead of updating default values, add both required #address-cells and #size-cells properties in the device-tree empty node. Use 2 for both properties value in order to fully support 64-bit addresses and sizes on systems using this empty root node. 
Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20241202131522.142268-6-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- drivers/of/empty_root.dts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/of/empty_root.dts b/drivers/of/empty_root.dts index cf9e97a60f48..cbe169ba3db5 100644 --- a/drivers/of/empty_root.dts +++ b/drivers/of/empty_root.dts @@ -2,5 +2,12 @@ /dts-v1/; / { - + /* + * #address-cells/#size-cells are required properties at root node. + * Use 2 cells for both address cells and size cells in order to fully + * support 64-bit addresses and sizes on systems using this empty root + * node. + */ + #address-cells = <0x02>; + #size-cells = <0x02>; }; From c43ec96e8d34399bd9dab2f2dc316b904892133f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 29 Oct 2024 08:28:45 +0000 Subject: [PATCH 006/337] dmaengine: at_xdmac: avoid null_prt_deref in at_xdmac_prep_dma_memset The at_xdmac_memset_create_desc may return NULL, which will lead to a null pointer dereference. For example, the len input is error, or the atchan->free_descs_list is empty and memory is exhausted. Therefore, add check to avoid this. Fixes: b206d9a23ac7 ("dmaengine: xdmac: Add memset support") Signed-off-by: Chen Ridong Link: https://lore.kernel.org/r/20241029082845.1185380-1-chenridong@huaweicloud.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 9c7b40220004..ba25c23164e7 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1363,6 +1363,8 @@ at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value, return NULL; desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value); + if (!desc) + return NULL; list_add_tail(&desc->desc_node, &desc->descs_list); desc->tx_dma_desc.cookie = -EBUSY; From f0e870a0e9c5521f2952ea9f3ea9d3d122631a89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 11:50:50 +0200 Subject: [PATCH 007/337] dmaengine: dw: Select only supported masters for ACPI devices The recently submitted fix-commit revealed a problem in the iDMA 32-bit platform code. Even though the controller supported only a single master the dw_dma_acpi_filter() method hard-coded two master interfaces with IDs 0 and 1. As a result the sanity check implemented in the commit b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") got incorrect interface data width and thus prevented the client drivers from configuring the DMA-channel with the EINVAL error returned. E.g., the next error was printed for the PXA2xx SPI controller driver trying to configure the requested channels: > [ 164.525604] pxa2xx_spi_pci 0000:00:07.1: DMA slave config failed > [ 164.536105] pxa2xx_spi_pci 0000:00:07.1: failed to get DMA TX descriptor > [ 164.543213] spidev spi-SPT0001:00: SPI transfer failed: -16 The problem would have been spotted much earlier if the iDMA 32-bit controller supported more than one master interfaces. But since it supports just a single master and the iDMA 32-bit specific code just ignores the master IDs in the CTLLO preparation method, the issue has been gone unnoticed so far. Fix the problem by specifying the default master ID for both memory and peripheral devices in the driver data. Thus the issue noticed for the iDMA 32-bit controllers will be eliminated and the ACPI-probed DW DMA controllers will be configured with the correct master ID by default. 
Cc: stable@vger.kernel.org Fixes: b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") Fixes: 199244d69458 ("dmaengine: dw: add support of iDMA 32-bit hardware") Reported-by: Ferry Toth Closes: https://lore.kernel.org/dmaengine/ZuXbCKUs1iOqFu51@black.fi.intel.com/ Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/dmaengine/ZuXgI-VcHpMgbZ91@black.fi.intel.com/ Tested-by: Ferry Toth Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241104095142.157925-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- drivers/dma/dw/acpi.c | 6 ++++-- drivers/dma/dw/internal.h | 8 ++++++++ drivers/dma/dw/pci.c | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw/acpi.c b/drivers/dma/dw/acpi.c index c510c109d2c3..b6452fffa657 100644 --- a/drivers/dma/dw/acpi.c +++ b/drivers/dma/dw/acpi.c @@ -8,13 +8,15 @@ static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param) { + struct dw_dma *dw = to_dw_dma(chan->device); + struct dw_dma_chip_pdata *data = dev_get_drvdata(dw->dma.dev); struct acpi_dma_spec *dma_spec = param; struct dw_dma_slave slave = { .dma_dev = dma_spec->dev, .src_id = dma_spec->slave_id, .dst_id = dma_spec->slave_id, - .m_master = 0, - .p_master = 1, + .m_master = data->m_master, + .p_master = data->p_master, }; return dw_dma_filter(chan, &slave); diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h index 563ce73488db..f1bd06a20cd6 100644 --- a/drivers/dma/dw/internal.h +++ b/drivers/dma/dw/internal.h @@ -51,11 +51,15 @@ struct dw_dma_chip_pdata { int (*probe)(struct dw_dma_chip *chip); int (*remove)(struct dw_dma_chip *chip); struct dw_dma_chip *chip; + u8 m_master; + u8 p_master; }; static __maybe_unused const struct dw_dma_chip_pdata dw_dma_chip_pdata = { .probe = dw_dma_probe, .remove = dw_dma_remove, + .m_master = 0, + .p_master = 1, }; static const struct dw_dma_platform_data idma32_pdata = { @@ -72,6 +76,8 @@ static __maybe_unused const struct dw_dma_chip_pdata idma32_chip_pdata = { .pdata = &idma32_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; static const struct dw_dma_platform_data xbar_pdata = { @@ -88,6 +94,8 @@ static __maybe_unused const struct dw_dma_chip_pdata xbar_chip_pdata = { .pdata = &xbar_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; #endif /* _DMA_DW_INTERNAL_H */ diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c index ad2d4d012cf7..e8a0eb81726a 100644 --- a/drivers/dma/dw/pci.c +++ b/drivers/dma/dw/pci.c @@ -56,10 +56,10 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid) if (ret) return ret; - dw_dma_acpi_controller_register(chip->dw); - pci_set_drvdata(pdev, data); + dw_dma_acpi_controller_register(chip->dw); + return 0; } From 4b65d5322e1d8994acfdb9b867aa00bdb30d177b Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Mon, 28 Oct 2024 17:34:13 +0800 Subject: [PATCH 008/337] dmaengine: loongson2-apb: Change GENMASK to GENMASK_ULL Fix the following smatch static checker warning: drivers/dma/loongson2-apb-dma.c:189 ls2x_dma_write_cmd() warn: was expecting a 64 bit value instead of '~(((0)) + (((~((0))) - (((1)) << (0)) + 1) & (~((0)) >> ((8 * 4) - 1 - (4)))))' The GENMASK macro used "unsigned long", which caused build issues when using a 32-bit toolchain because it would try to access bits > 31. This patch switches GENMASK to GENMASK_ULL, which uses "unsigned long long". 
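As a standalone illustration of the type difference (the register value below is hypothetical and the constants merely mirror what GENMASK()/GENMASK_ULL() evaluate to for bits 4:0), masking a 64-bit value with the complement of a 32-bit-wide mask silently clears the upper half on a 32-bit build:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long mask_ul = 0x1fUL;        /* value of GENMASK(4, 0) */
            unsigned long long mask_ull = 0x1fULL; /* value of GENMASK_ULL(4, 0) */
            uint64_t reg = 0xaabbccdd11223344ULL;  /* hypothetical 64-bit register */

            /* On a 32-bit build, ~mask_ul is 0xffffffe0 and zero-extends to
             * 0x00000000ffffffe0, so bits 63:32 of reg are lost: 11223340 */
            printf("%llx\n", (unsigned long long)(reg & ~mask_ul));
            /* ~mask_ull is 0xffffffffffffffe0 on any build: aabbccdd11223340 */
            printf("%llx\n", (unsigned long long)(reg & ~mask_ull));
            return 0;
    }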
Fixes: 71e7d3cb6e55 ("dmaengine: ls2x-apb: New driver for the Loongson LS2X APB DMA controller") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/87cdc025-7246-4548-85ca-3d36fdc2be2d@stanley.mountain/ Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/20241028093413.1145820-1-zhoubinbin@loongson.cn Signed-off-by: Vinod Koul --- drivers/dma/loongson2-apb-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/loongson2-apb-dma.c b/drivers/dma/loongson2-apb-dma.c index 367ed34ce4da..c528f02b9f84 100644 --- a/drivers/dma/loongson2-apb-dma.c +++ b/drivers/dma/loongson2-apb-dma.c @@ -31,7 +31,7 @@ #define LDMA_ASK_VALID BIT(2) #define LDMA_START BIT(3) /* DMA start operation */ #define LDMA_STOP BIT(4) /* DMA stop operation */ -#define LDMA_CONFIG_MASK GENMASK(4, 0) /* DMA controller config bits mask */ +#define LDMA_CONFIG_MASK GENMASK_ULL(4, 0) /* DMA controller config bits mask */ /* Bitfields in ndesc_addr field of HW descriptor */ #define LDMA_DESC_EN BIT(0) /*1: The next descriptor is valid */ From b77bd3ba762f34e5eb731134cf50e233d1060053 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 27 Nov 2024 16:06:05 -0300 Subject: [PATCH 009/337] ARM: imx: Re-introduce the PINCTRL selection Since commit 17d210018914 ("ARM: imx: Allow user to disable pinctrl"), the CONFIG_PINCTRL option is no longer implicitly selected, causing several i.MX SoC pinctrl drivers no longer getting selected by default. This causes boot regressions on the ARMv4, ARMv5, ARMv6 and ARMv7 i.MX SoCs. Fix it by selecting CONFIG_PINCTRL as before. This defeats the purpose of 7d210018914 ("ARM: imx: Allow user to disable pinctrl"), but it is the less invasive fix for the boot regressions. The attempt to build Layerscape without pinctrl can still be explored later as suggested by Arnd: "Overall, my best advice here is still to not change the way i.MX pinctrl works at all, but just fix Layerscape to not depend on i.MX. The reason for the 'select' here is clearly that the i.MX machines would fail to boot without pinctrl, and changing that because of Layerscape seems backwards." Fixes: 17d210018914 ("ARM: imx: Allow user to disable pinctrl") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/linux-arm-kernel/49ff070a-ce67-42d7-84ec-8b54fd7e9742@roeck-us.net/ Signed-off-by: Fabio Estevam Acked-by: Arnd Bergmann Reviewed-by: Linus Walleij Tested-by: Guenter Roeck Link: https://lore.kernel.org/20241127190605.1367157-1-festevam@gmail.com Signed-off-by: Linus Walleij --- arch/arm/mach-imx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index e4fe059cd861..dc47b2312127 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -6,6 +6,7 @@ menuconfig ARCH_MXC select CLKSRC_IMX_GPT select GENERIC_IRQ_CHIP select GPIOLIB + select PINCTRL select PM_OPP if PM select SOC_BUS select SRAM From 239521712b2b568b99d5f0ef7c1f874d797f4a29 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 13 Nov 2024 16:56:31 -0600 Subject: [PATCH 010/337] dt-bindings: mtd: fixed-partitions: Fix "compression" typo The example erroneously has "compress" property rather than the documented "compression" property. 
Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20241113225632.1783241-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/mtd/partitions/fixed-partitions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml index 058253d6d889..62086366837c 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml @@ -82,7 +82,7 @@ examples: uimage@100000 { reg = <0x0100000 0x200000>; - compress = "lzma"; + compression = "lzma"; }; }; From d7dfa7fde63dde4d2ec0083133efe2c6686c03ff Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 2 Dec 2024 17:58:19 +0100 Subject: [PATCH 011/337] of: Fix error path in of_parse_phandle_with_args_map() The current code uses some 'goto put;' to cancel the parsing operation and can lead to a return code value of 0 even on error cases. Indeed, some goto calls are done from a loop without setting the ret value explicitly before the goto call and so the ret value can be set to 0 due to operation done in previous loop iteration. For instance match can be set to 0 in the previous loop iteration (leading to a new iteration) but ret can also be set to 0 it the of_property_read_u32() call succeed. In that case if no match are found or if an error is detected the new iteration, the return value can be wrongly 0. Avoid those cases setting the ret value explicitly before the goto calls. Fixes: bd6f2fd5a1d5 ("of: Support parsing phandle argument lists through a nexus node") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20241202165819.158681-1-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index a8b0c42bdc8e..44b1c8bf9cc0 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1471,8 +1471,10 @@ int of_parse_phandle_with_args_map(const struct device_node *np, map_len--; /* Check if not found */ - if (!new) + if (!new) { + ret = -EINVAL; goto put; + } if (!of_device_is_available(new)) match = 0; @@ -1482,17 +1484,20 @@ int of_parse_phandle_with_args_map(const struct device_node *np, goto put; /* Check for malformed properties */ - if (WARN_ON(new_size > MAX_PHANDLE_ARGS)) - goto put; - if (map_len < new_size) + if (WARN_ON(new_size > MAX_PHANDLE_ARGS) || + map_len < new_size) { + ret = -EINVAL; goto put; + } /* Move forward by new node's #-cells amount */ map += new_size; map_len -= new_size; } - if (!match) + if (!match) { + ret = -ENOENT; goto put; + } /* Get the -map-pass-thru property (optional) */ pass = of_get_property(cur, pass_name, NULL); From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject: [PATCH 012/337] linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. 
This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093..346251bf1026 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. @@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. */ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. 
The callback function will From 8d55e8a16f019211163f1180fd9f9fbe05901900 Mon Sep 17 00:00:00 2001 From: Sasha Finkelstein Date: Sun, 24 Nov 2024 16:48:28 +0100 Subject: [PATCH 013/337] dmaengine: apple-admac: Avoid accessing registers in probe The ADMAC attached to the AOP has complex power sequencing, and is power gated when the probe callback runs. Move the register reads to other functions, where we can guarantee that the hardware is switched on. Fixes: 568aa6dd641f ("dmaengine: apple-admac: Allocate cache SRAM to channels") Signed-off-by: Sasha Finkelstein Link: https://lore.kernel.org/r/20241124-admac-power-v1-1-58f2165a4d55@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/apple-admac.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/dma/apple-admac.c b/drivers/dma/apple-admac.c index c499173d80b2..bd49f0374291 100644 --- a/drivers/dma/apple-admac.c +++ b/drivers/dma/apple-admac.c @@ -153,6 +153,8 @@ static int admac_alloc_sram_carveout(struct admac_data *ad, { struct admac_sram *sram; int i, ret = 0, nblocks; + ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); + ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); if (dir == DMA_MEM_TO_DEV) sram = &ad->txcache; @@ -912,12 +914,7 @@ static int admac_probe(struct platform_device *pdev) goto free_irq; } - ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); - ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); - dev_info(&pdev->dev, "Audio DMA Controller\n"); - dev_info(&pdev->dev, "imprint %x TX cache %u RX cache %u\n", - readl_relaxed(ad->base + REG_IMPRINT), ad->txcache.size, ad->rxcache.size); return 0; From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: [PATCH 014/337] dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. 
Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- drivers/dma/amd/qdma/qdma.c | 28 +++++++++++--------------- include/linux/platform_data/amd_qdma.h | 2 ++ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/dma/amd/qdma/qdma.c b/drivers/dma/amd/qdma/qdma.c index 6d9079458fe9..66f00ad67351 100644 --- a/drivers/dma/amd/qdma/qdma.c +++ b/drivers/dma/amd/qdma/qdma.c @@ -7,9 +7,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -492,18 +492,9 @@ static int qdma_device_verify(struct qdma_device *qdev) static int qdma_device_setup(struct qdma_device *qdev) { - struct device *dev = &qdev->pdev->dev; u32 ring_sz = QDMA_DEFAULT_RING_SIZE; int ret = 0; - while (dev && get_dma_ops(dev)) - dev = dev->parent; - if (!dev) { - qdma_err(qdev, "dma device not found"); - return -EINVAL; - } - set_dma_ops(&qdev->pdev->dev, get_dma_ops(dev)); - ret = qdma_setup_fmap_context(qdev); if (ret) { qdma_err(qdev, "Failed setup fmap context"); @@ -548,11 +539,12 @@ static void qdma_free_queue_resources(struct dma_chan *chan) { struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; - struct device *dev = qdev->dma_dev.dev; + struct qdma_platdata *pdata; qdma_clear_queue_context(queue); vchan_free_chan_resources(&queue->vchan); - dma_free_coherent(dev, queue->ring_size * QDMA_MM_DESC_SIZE, + pdata = dev_get_platdata(&qdev->pdev->dev); + dma_free_coherent(pdata->dma_dev, queue->ring_size * QDMA_MM_DESC_SIZE, queue->desc_base, queue->dma_desc_base); } @@ -565,6 +557,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; struct qdma_ctxt_sw_desc desc; + struct qdma_platdata *pdata; size_t size; int ret; @@ -572,8 +565,9 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) return ret; + pdata = dev_get_platdata(&qdev->pdev->dev); size = queue->ring_size * QDMA_MM_DESC_SIZE; - queue->desc_base = dma_alloc_coherent(qdev->dma_dev.dev, size, + queue->desc_base = dma_alloc_coherent(pdata->dma_dev, size, &queue->dma_desc_base, GFP_KERNEL); if (!queue->desc_base) { @@ -588,7 +582,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) { qdma_err(qdev, "Failed to setup SW desc ctxt for %s", chan->name); - dma_free_coherent(qdev->dma_dev.dev, size, queue->desc_base, + dma_free_coherent(pdata->dma_dev, size, queue->desc_base, queue->dma_desc_base); return ret; } @@ -948,8 +942,9 @@ static int qdma_init_error_irq(struct qdma_device *qdev) static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) { - u32 ctxt[QDMA_CTXT_REGMAP_LEN]; + struct qdma_platdata *pdata = dev_get_platdata(&qdev->pdev->dev); struct device *dev = &qdev->pdev->dev; + u32 ctxt[QDMA_CTXT_REGMAP_LEN]; struct qdma_intr_ring *ring; struct qdma_ctxt_intr intr_ctxt; u32 vector; @@ -969,7 +964,8 @@ static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) ring->msix_id = qdev->err_irq_idx + i + 1; ring->ridx = i; ring->color = 1; - ring->base = dmam_alloc_coherent(dev, QDMA_INTR_RING_SIZE, + ring->base = dmam_alloc_coherent(pdata->dma_dev, + QDMA_INTR_RING_SIZE, &ring->dev_base, GFP_KERNEL); if (!ring->base) { qdma_err(qdev, "Failed to alloc intr ring %d", i); diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h 
index 576d952f97ed..967a6ef31cf9 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map *device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ From eb867d797d294a00a092b5027d08439da68940b2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 Nov 2024 15:10:31 +0200 Subject: [PATCH 015/337] RDMA/bnxt_re: Remove always true dattr validity check res->dattr is always valid at this point as it was initialized during device addition in bnxt_re_add_device(). This change is fixing the following smatch error: drivers/infiniband/hw/bnxt_re/qplib_fp.c:1090 bnxt_qplib_create_qp() error: we previously assumed 'res->dattr' could be null (see line 985) Fixes: 07f830ae4913 ("RDMA/bnxt_re: Adds MSN table capability for Gen P7 adapters") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411222329.YTrwonWi-lkp@intel.com/ Link: https://patch.msgid.link/be0d8836b64cba3e479fbcbca717acad04aae02e.1732626579.git.leonro@nvidia.com Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e42abf5be6c0..9af8aaadc99a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1000,9 +1000,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) u32 tbl_indx; u16 nsge; - if (res->dattr) - qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); - + qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); sq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_QP, From 0a92ea87bdd6f77ca4e17fe19649882cf5209edd Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Thu, 24 Oct 2024 14:35:40 -0700 Subject: [PATCH 016/337] phy: usb: Toggle the PHY power during init When bringing up the PHY, it might be in a bad state if left powered. One case is we lose the PLL lock if the PLL is gated while the PHY is powered. Toggle the PHY power so we can start from a known state. Fixes: 4e5b9c9a73b3 ("phy: usb: Add support for new Synopsys USB controller on the 7216") Signed-off-by: Justin Chen Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20241024213540.1059412-1-justin.chen@broadcom.com Signed-off-by: Vinod Koul --- drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c b/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c index 950b7ae1d1a8..dc452610934a 100644 --- a/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c +++ b/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c @@ -325,6 +325,12 @@ static void usb_init_common_7216(struct brcm_usb_init_params *params) void __iomem *ctrl = params->regs[BRCM_REGS_CTRL]; USB_CTRL_UNSET(ctrl, USB_PM, XHC_S2_CLK_SWITCH_EN); + + /* + * The PHY might be in a bad state if it is already powered + * up. Toggle the power just in case. 
+ */ + USB_CTRL_SET(ctrl, USB_PM, USB_PWRDN); USB_CTRL_UNSET(ctrl, USB_PM, USB_PWRDN); /* 1 millisecond - for USB clocks to settle down */ From fbcbffbac994aca1264e3c14da96ac9bfd90466e Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Fri, 22 Nov 2024 15:30:06 +0800 Subject: [PATCH 017/337] phy: rockchip: naneng-combphy: fix phy reset Currently, the USB port via combophy on the RK3528/RK3588 SoC is broken. usb usb8-port1: Cannot enable. Maybe the USB cable is bad? This is due to the combphy of RK3528/RK3588 SoC has multiple resets, but only "phy resets" need assert and deassert, "apb resets" don't need. So change the driver to only match the phy resets, which is also what the vendor kernel does. Fixes: 7160820d742a ("phy: rockchip: add naneng combo phy for RK3568") Cc: FUKAUMI Naoki Cc: Michael Zimmermann Signed-off-by: Chukun Pan Reviewed-by: Heiko Stuebner Tested-by: FUKAUMI Naoki Link: https://lore.kernel.org/r/20241122073006.99309-2-amadeus@jmu.edu.cn Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-naneng-combphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c index 0a9989e41237..2eb3329ca23f 100644 --- a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c +++ b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c @@ -309,7 +309,7 @@ static int rockchip_combphy_parse_dt(struct device *dev, struct rockchip_combphy priv->ext_refclk = device_property_present(dev, "rockchip,ext-refclk"); - priv->phy_rst = devm_reset_control_array_get_exclusive(dev); + priv->phy_rst = devm_reset_control_get(dev, "phy"); if (IS_ERR(priv->phy_rst)) return dev_err_probe(dev, PTR_ERR(priv->phy_rst), "failed to get phy reset\n"); From d0257e089d1bbd35c69b6c97ff73e3690ab149a9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 Nov 2024 13:06:41 +0300 Subject: [PATCH 018/337] RDMA/uverbs: Prevent integer overflow issue In the expression "cmd.wqe_size * cmd.wr_count", both variables are u32 values that come from the user so the multiplication can lead to integer wrapping. Then we pass the result to uverbs_request_next_ptr() which also could potentially wrap. The "cmd.sge_count * sizeof(struct ib_uverbs_sge)" multiplication can also overflow on 32bit systems although it's fine on 64bit systems. This patch does two things. First, I've re-arranged the condition in uverbs_request_next_ptr() so that the use controlled variable "len" is on one side of the comparison by itself without any math. Then I've modified all the callers to use size_mul() for the multiplications. 
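For illustration, a simplified standalone sketch of the rearranged check (the struct and names here are hypothetical, not the driver's): subtracting two pointers into the same valid buffer cannot wrap, so putting the user-controlled length alone on one side makes the comparison safe; size_mul() from <linux/overflow.h> additionally saturates at SIZE_MAX on overflow, so an oversized product is then rejected by this same length check.

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    struct req_iter {
            const char *cur;
            const char *end;   /* end >= cur for a valid iterator */
    };

    static int check_avail(const struct req_iter *iter, size_t len)
    {
            /* "iter->cur + len > iter->end" could wrap for a huge len;
             * comparing len against the remaining bytes cannot. */
            if (len > (size_t)(iter->end - iter->cur))
                    return -ENOSPC;
            return 0;
    }

    int main(void)
    {
            char buf[16];
            struct req_iter iter = { buf, buf + sizeof(buf) };

            printf("%d\n", check_avail(&iter, 8));        /* 0 */
            printf("%d\n", check_avail(&iter, SIZE_MAX)); /* -ENOSPC */
            return 0;
    }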
Fixes: 67cdb40ca444 ("[IB] uverbs: Implement more commands") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/b8765ab3-c2da-4611-aae0-ddd6ba173d23@stanley.mountain Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 66b02fbf077a..5ad14c39d48c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -161,7 +161,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, { const void __user *res = iter->cur; - if (iter->cur + len > iter->end) + if (len > iter->end - iter->cur) return (void __force __user *)ERR_PTR(-ENOSPC); iter->cur += len; return res; @@ -2008,11 +2008,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); if (ret) return ret; - wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count); + wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size, + cmd.wr_count)); if (IS_ERR(wqes)) return PTR_ERR(wqes); - sgls = uverbs_request_next_ptr( - &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(&iter, + size_mul(cmd.sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return PTR_ERR(sgls); ret = uverbs_request_finish(&iter); @@ -2198,11 +2200,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); - wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); + wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count)); if (IS_ERR(wqes)) return ERR_CAST(wqes); - sgls = uverbs_request_next_ptr( - iter, sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(iter, size_mul(sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return ERR_CAST(sgls); ret = uverbs_request_finish(iter); From 8886fb3240931a0afce82dea87edfe46bcb0a586 Mon Sep 17 00:00:00 2001 From: Krishna Kurapati Date: Tue, 12 Nov 2024 14:58:31 +0530 Subject: [PATCH 019/337] phy: qcom-qmp: Fix register name in RX Lane config of SC8280XP In RX Lane configuration sequence of SC8280XP, the register V5_RX_UCDR_FO_GAIN is incorrectly spelled as RX_UCDR_SO_GAIN and hence the programming sequence is wrong. Fix the register sequence accordingly to avoid any compliance failures. This has been tested on SA8775P by checking device mode enumeration in SuperSpeed. 
Cc: stable@vger.kernel.org Fixes: c0c7769cdae2 ("phy: qcom-qmp: Add SC8280XP USB3 UNI phy") Signed-off-by: Krishna Kurapati Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241112092831.4110942-1-quic_kriskura@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c index acd6075bf6d9..c9c337840715 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c @@ -1052,7 +1052,7 @@ static const struct qmp_phy_init_tbl sc8280xp_usb3_uniphy_rx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_FO_GAIN, 0x2f), QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_COUNT_LOW, 0xff), QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_COUNT_HIGH, 0x0f), - QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_SO_GAIN, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FO_GAIN, 0x0a), QMP_PHY_INIT_CFG(QSERDES_V5_RX_VGA_CAL_CNTRL1, 0x54), QMP_PHY_INIT_CFG(QSERDES_V5_RX_VGA_CAL_CNTRL2, 0x0f), QMP_PHY_INIT_CFG(QSERDES_V5_RX_RX_EQU_ADAPTOR_CNTRL2, 0x0f), From 2de679ecd724b823c2cb58caab8508c7eec8aefc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Nov 2024 11:37:02 +0100 Subject: [PATCH 020/337] phy: stm32: work around constant-value overflow assertion FIELD_PREP() checks that a constant fits into the available bitfield, but if one of the two lookup tables in stm32_impedance_tune() does not find a matching entry, the index is out of range, which gcc correctly complains about: In file included from : In function 'stm32_impedance_tune', inlined from 'stm32_combophy_pll_init' at drivers/phy/st/phy-stm32-combophy.c:247:9: include/linux/compiler_types.h:517:38: error: call to '__compiletime_assert_447' declared with attribute error: FIELD_PREP: value too large for the field 517 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ include/linux/bitfield.h:68:3: note: in expansion of macro 'BUILD_BUG_ON_MSG' 68 | BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ 115 | __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ | ^~~~~~~~~~~~~~~~ drivers/phy/st/phy-stm32-combophy.c:162:8: note: in expansion of macro 'FIELD_PREP' 162 | FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); | ^~~~~~~~~~ Rework this so the field value gets set inside of the loop and otherwise set to zero. 
Fixes: 47e1bb6b4ba0 ("phy: stm32: Add support for STM32MP25 COMBOPHY.") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241111103712.3520611-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/st/phy-stm32-combophy.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/phy/st/phy-stm32-combophy.c b/drivers/phy/st/phy-stm32-combophy.c index 765bb34fe358..49e9fa90a681 100644 --- a/drivers/phy/st/phy-stm32-combophy.c +++ b/drivers/phy/st/phy-stm32-combophy.c @@ -122,6 +122,7 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) u32 max_vswing = imp_lookup[imp_size - 1].vswing[vswing_size - 1]; u32 min_vswing = imp_lookup[0].vswing[0]; u32 val; + u32 regval; if (!of_property_read_u32(combophy->dev->of_node, "st,output-micro-ohms", &val)) { if (val < min_imp || val > max_imp) { @@ -129,16 +130,20 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) - if (imp_lookup[imp_of].microohm <= val) + regval = 0; + for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) { + if (imp_lookup[imp_of].microohm <= val) { + regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of); break; + } + } dev_dbg(combophy->dev, "Set %u micro-ohms output impedance\n", imp_lookup[imp_of].microohm); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_OHM, - FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of)); + regval); } else { regmap_read(combophy->regmap, SYSCFG_PCIEPRGCR, &val); imp_of = FIELD_GET(STM32MP25_PCIEPRG_IMPCTRL_OHM, val); @@ -150,16 +155,20 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) - if (imp_lookup[imp_of].vswing[vswing_of] >= val) + regval = 0; + for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) { + if (imp_lookup[imp_of].vswing[vswing_of] >= val) { + regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of); break; + } + } dev_dbg(combophy->dev, "Set %u microvolt swing\n", imp_lookup[imp_of].vswing[vswing_of]); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_VSWING, - FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); + regval); } return 0; From e05feab22fd7dabcd6d272c4e2401ec1acdfdb9b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 3 Dec 2024 15:45:37 +0200 Subject: [PATCH 021/337] RDMA/mlx5: Enforce same type port association for multiport RoCE Different core device types such as PFs and VFs shouldn't be affiliated together since they have different capabilities, fix that by enforcing type check before doing the affiliation. 
Fixes: 32f69e4be269 ("{net, IB}/mlx5: Manage port association for multiport RoCE") Reviewed-by: Mark Bloch Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/88699500f690dff1c1852c1ddb71f8a1cc8b956e.1733233480.git.leonro@nvidia.com Reviewed-by: Mateusz Polchlopek Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 6 ++++-- include/linux/mlx5/driver.h | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc7930d0c564..c2314797afc9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3639,7 +3639,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -4785,7 +4786,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..4f9e6f6dbaab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1202,6 +1202,12 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } +static inline bool mlx5_core_same_coredev_type(const struct mlx5_core_dev *dev1, + const struct mlx5_core_dev *dev2) +{ + return dev1->coredev_type == dev2->coredev_type; +} + static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; From 79d330fbdffd8cee06d8bdf38d82cb62d8363a27 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 4 Dec 2024 13:24:12 +0530 Subject: [PATCH 022/337] RDMA/bnxt_re: Fix max SGEs for the Work Request Gen P7 supports up to 13 SGEs for now. WQE software structure can hold only 6 now. Since the max send sge is reported as 13, the stack can give requests up to 13 SGEs. This is causing traffic failures and system crashes. Use the define for max SGE supported for variable size. This will work for both static and variable WQEs. 
Fixes: 227f51743b61 ("RDMA/bnxt_re: Fix the max WQE size for static WQE support") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index ef3424c81345..19e279871f10 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -114,7 +114,6 @@ struct bnxt_qplib_sge { u32 size; }; -#define BNXT_QPLIB_QP_MAX_SGL 6 struct bnxt_qplib_swq { u64 wr_id; int next_idx; @@ -154,7 +153,7 @@ struct bnxt_qplib_swqe { #define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) #define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) #define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) - struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + struct bnxt_qplib_sge sg_list[BNXT_VAR_MAX_SGE]; int num_sge; /* Max inline data is 96 bytes */ u32 inline_len; From 5effcacc8a8f3eb2a9f069d7e81a9ac793598dfb Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 4 Dec 2024 13:24:13 +0530 Subject: [PATCH 023/337] RDMA/bnxt_re: Avoid initializing the software queue for user queues Software Queues to hold the WRs needs to be created for only kernel queues. Avoid allocating the unnecessary memory for user Queues. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Fixes: 159fb4ceacd7 ("RDMA/bnxt_re: introduce a function to allocate swq") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 42 +++++++++++++----------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 9af8aaadc99a..72f35070f671 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -659,13 +659,6 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) return rc; - - srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), - GFP_KERNEL); - if (!srq->swq) { - rc = -ENOMEM; - goto fail; - } srq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_SRQ, @@ -694,9 +687,17 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, spin_lock_init(&srq->lock); srq->start_idx = 0; srq->last_idx = srq->hwq.max_elements - 1; - for (idx = 0; idx < srq->hwq.max_elements; idx++) - srq->swq[idx].next_idx = idx + 1; - srq->swq[srq->last_idx].next_idx = -1; + if (!srq->hwq.is_user) { + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) { + rc = -ENOMEM; + goto fail; + } + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + } srq->id = le32_to_cpu(resp.xid); srq->dbinfo.hwq = &srq->hwq; @@ -1042,13 +1043,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rc) return rc; - rc = bnxt_qplib_alloc_init_swq(sq); - if (rc) - goto fail_sq; - - if (psn_sz) - bnxt_qplib_init_psn_ptr(qp, psn_sz); + if (!sq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(sq); + if (rc) + goto fail_sq; + if (psn_sz) + bnxt_qplib_init_psn_ptr(qp, psn_sz); + } req.sq_size = 
cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); @@ -1074,9 +1076,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) goto sq_swq; - rc = bnxt_qplib_alloc_init_swq(rq); - if (rc) - goto fail_rq; + if (!rq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(rq); + if (rc) + goto fail_rq; + } req.rq_size = cpu_to_le32(rq->max_wqe); pbl = &rq->hwq.pbl[PBL_LVL_0]; From 064c22408a73b9e945139b64614c534cbbefb591 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 4 Dec 2024 13:24:14 +0530 Subject: [PATCH 024/337] RDMA/bnxt_re: Avoid sending the modify QP workaround for latest adapters The workaround to modify the UD QP from RTS to RTS is required only for older adapters. Issuing this for latest adapters can caus some unexpected behavior. Fix it Fixes: 1801d87b3598 ("RDMA/bnxt_re: Support new 5760X P7 devices") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 82023394e330..5428a1408cee 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2824,7 +2824,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; } @@ -2936,7 +2937,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; From d507d29bfde3fee6a74d098a9ac640b8fc1a549b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 4 Dec 2024 13:24:16 +0530 Subject: [PATCH 025/337] RDMA/bnxt_re: Don't fail destroy QP and cleanup debugfs earlier Change bnxt_re_destroy_qp to always return 0 and don't fail in case of error during destroy. In addition, delete debugfs QP to earlier stage. 
Fixes: d7d54769c042 ("RDMA/bnxt_re: Add debugfs hook in the driver") Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5428a1408cee..215074c0860b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -967,13 +967,13 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) unsigned int flags; int rc; + bnxt_re_debug_rem_qpinfo(rdev, qp); + bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); - if (rc) { + if (rc) ibdev_err(&rdev->ibdev, "Failed to destroy HW QP"); - return rc; - } if (rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -983,11 +983,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); - if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) { - rc = bnxt_re_destroy_gsi_sqp(qp); - if (rc) - return rc; - } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) + bnxt_re_destroy_gsi_sqp(qp); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -998,8 +995,6 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) else if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD) atomic_dec(&rdev->stats.res.ud_qp_count); - bnxt_re_debug_rem_qpinfo(rdev, qp); - ib_umem_release(qp->rumem); ib_umem_release(qp->sumem); From d8e4771f99c0400a1873235704b28bb803c83d17 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 23 Oct 2024 11:40:56 +0300 Subject: [PATCH 026/337] mtd: rawnand: fix double free in atmel_pmecc_create_user() The "user" pointer was converted from being allocated with kzalloc() to being allocated by devm_kzalloc(). Calling kfree(user) will lead to a double free. Fixes: 6d734f1bfc33 ("mtd: rawnand: atmel: Fix possible memory leak") Signed-off-by: Dan Carpenter Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/atmel/pmecc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/atmel/pmecc.c b/drivers/mtd/nand/raw/atmel/pmecc.c index a22aab4ed4e8..3c7dee1be21d 100644 --- a/drivers/mtd/nand/raw/atmel/pmecc.c +++ b/drivers/mtd/nand/raw/atmel/pmecc.c @@ -380,10 +380,8 @@ atmel_pmecc_create_user(struct atmel_pmecc *pmecc, user->delta = user->dmu + req->ecc.strength + 1; gf_tables = atmel_pmecc_get_gf_tables(req); - if (IS_ERR(gf_tables)) { - kfree(user); + if (IS_ERR(gf_tables)) return ERR_CAST(gf_tables); - } user->gf_tables = gf_tables; From 9b458e8be0d13e81ed03fffa23f8f9b528bbd786 Mon Sep 17 00:00:00 2001 From: Zichen Xie Date: Wed, 23 Oct 2024 16:13:10 -0500 Subject: [PATCH 027/337] mtd: diskonchip: Cast an operand to prevent potential overflow There may be a potential integer overflow issue in inftl_partscan(). parts[0].size is defined as "uint64_t" while mtd->erasesize and ip->firstUnit are defined as 32-bit unsigned integer. The result of the calculation will be limited to 32 bits without correct casting. 
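The overflow described above is easy to reproduce in isolation. The following stand-alone sketch uses invented values (erasesize and first_unit are illustrative, not taken from any real device): the 32-bit product wraps before it is widened, while casting one operand to a 64-bit type first gives the intended result.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t erasesize = 0x40000;	/* 256 KiB eraseblock, illustrative only */
	uint32_t first_unit = 0x5000;	/* illustrative unit number */

	uint64_t wrong = erasesize * first_unit;		/* multiplied in 32 bits, wraps to 0x40000000 */
	uint64_t right = (uint64_t)erasesize * first_unit;	/* widened first, yields 0x140000000 */

	printf("without cast: 0x%llx\n", (unsigned long long)wrong);
	printf("with cast:    0x%llx\n", (unsigned long long)right);
	return 0;
}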
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zichen Xie Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/diskonchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c index 8db7fc424571..70d6c2250f32 100644 --- a/drivers/mtd/nand/raw/diskonchip.c +++ b/drivers/mtd/nand/raw/diskonchip.c @@ -1098,7 +1098,7 @@ static inline int __init inftl_partscan(struct mtd_info *mtd, struct mtd_partiti (i == 0) && (ip->firstUnit > 0)) { parts[0].name = " DiskOnChip IPL / Media Header partition"; parts[0].offset = 0; - parts[0].size = mtd->erasesize * ip->firstUnit; + parts[0].size = (uint64_t)mtd->erasesize * ip->firstUnit; numparts = 1; } From b086a46dae48829e11c0c02580e30d920b76743c Mon Sep 17 00:00:00 2001 From: Maciej Andrzejewski Date: Mon, 2 Dec 2024 13:51:07 +0100 Subject: [PATCH 028/337] mtd: rawnand: arasan: Fix double assertion of chip-select When two chip-selects are configured in the device tree, and the second is a non-native GPIO, both the GPIO-based chip-select and the first native chip-select may be asserted simultaneously. This double assertion causes incorrect read and write operations. The issue occurs because when nfc->ncs <= 2, nfc->spare_cs is always initialized to 0 due to static initialization. Consequently, when the second chip-select (GPIO-based) is selected in anfc_assert_cs(), it is detected by anfc_is_gpio_cs(), and nfc->native_cs is assigned the value 0. This results in both the GPIO-based chip-select being asserted and the NAND controller register receiving 0, erroneously selecting the native chip-select. This patch resolves the issue, as confirmed by oscilloscope testing with configurations involving two or more chip-selects in the device tree. Fixes: acbd3d0945f9 ("mtd: rawnand: arasan: Leverage additional GPIO CS") Cc: stable@vger.kernel.org Signed-off-by: Maciej Andrzejewski Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/arasan-nand-controller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/arasan-nand-controller.c b/drivers/mtd/nand/raw/arasan-nand-controller.c index db42aa0c7b6b..26b506107a1a 100644 --- a/drivers/mtd/nand/raw/arasan-nand-controller.c +++ b/drivers/mtd/nand/raw/arasan-nand-controller.c @@ -1409,8 +1409,8 @@ static int anfc_parse_cs(struct arasan_nfc *nfc) * case, the "not" chosen CS is assigned to nfc->spare_cs and selected * whenever a GPIO CS must be asserted. */ - if (nfc->cs_array && nfc->ncs > 2) { - if (!nfc->cs_array[0] && !nfc->cs_array[1]) { + if (nfc->cs_array) { + if (nfc->ncs > 2 && !nfc->cs_array[0] && !nfc->cs_array[1]) { dev_err(nfc->dev, "Assign a single native CS when using GPIOs\n"); return -EINVAL; From 11e6831fd81468cf48155b9b3c11295c391da723 Mon Sep 17 00:00:00 2001 From: Maciej Andrzejewski Date: Mon, 2 Dec 2024 19:58:36 +0100 Subject: [PATCH 029/337] mtd: rawnand: arasan: Fix missing de-registration of NAND The NAND chip-selects are registered for the Arasan driver during initialization but are not de-registered when the driver is unloaded. As a result, if the driver is loaded again, the chip-selects remain registered and busy, making them unavailable for use. 
Fixes: 197b88fecc50 ("mtd: rawnand: arasan: Add new Arasan NAND controller") Cc: stable@vger.kernel.org Signed-off-by: Maciej Andrzejewski ICEYE Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/arasan-nand-controller.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mtd/nand/raw/arasan-nand-controller.c b/drivers/mtd/nand/raw/arasan-nand-controller.c index 26b506107a1a..865754737f5f 100644 --- a/drivers/mtd/nand/raw/arasan-nand-controller.c +++ b/drivers/mtd/nand/raw/arasan-nand-controller.c @@ -1478,8 +1478,15 @@ static int anfc_probe(struct platform_device *pdev) static void anfc_remove(struct platform_device *pdev) { + int i; struct arasan_nfc *nfc = platform_get_drvdata(pdev); + for (i = 0; i < nfc->ncs; i++) { + if (nfc->cs_array[i]) { + gpiod_put(nfc->cs_array[i]); + } + } + anfc_chips_cleanup(nfc); } From 140054a25f85036ec847e722c76cc1bfaf3f0d96 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 3 Dec 2024 15:33:17 +0200 Subject: [PATCH 030/337] mtd: rawnand: omap2: Fix build warnings with W=1 Add kernel-doc for functions to get rid of below warnings when built with W=1. drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_out_dma_pref' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412031716.JfNIh1Uu-lkp@intel.com/ Signed-off-by: Roger Quadros Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/omap2.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/nand/raw/omap2.c 
b/drivers/mtd/nand/raw/omap2.c index d9141f3c0dd1..b8af3a3533fc 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -254,6 +254,10 @@ static int omap_prefetch_reset(int cs, struct omap_nand_info *info) /** * omap_nand_data_in_pref - NAND data in using prefetch engine + * @chip: NAND chip + * @buf: output buffer where NAND data is placed into + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_in_pref(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit) @@ -297,6 +301,10 @@ static void omap_nand_data_in_pref(struct nand_chip *chip, void *buf, /** * omap_nand_data_out_pref - NAND data out using Write Posting engine + * @chip: NAND chip + * @buf: input buffer that is sent to NAND + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_out_pref(struct nand_chip *chip, const void *buf, unsigned int len, @@ -440,6 +448,10 @@ out_copy: /** * omap_nand_data_in_dma_pref - NAND data in using DMA and Prefetch + * @chip: NAND chip + * @buf: output buffer where NAND data is placed into + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_in_dma_pref(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit) @@ -460,6 +472,10 @@ static void omap_nand_data_in_dma_pref(struct nand_chip *chip, void *buf, /** * omap_nand_data_out_dma_pref - NAND data out using DMA and write posting + * @chip: NAND chip + * @buf: input buffer that is sent to NAND + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_out_dma_pref(struct nand_chip *chip, const void *buf, unsigned int len, From 48808b55b07c3cea64805267a5547f03e6452a9f Mon Sep 17 00:00:00 2001 From: Valentina Fernandez Date: Mon, 18 Nov 2024 15:53:54 +0000 Subject: [PATCH 031/337] firmware: microchip: fix UL_IAP lock check in mpfs_auto_update_state() To verify that Auto Update is possible, the mpfs_auto_update_state() function performs a "Query Security Service Request" to the system controller. Previously, the check was performed on the first element of the response message, which was accessed using a 32-bit pointer. This caused the bitwise operation to reference incorrect data, as the response should be inspected at the byte level. Fixed this by casting the response to a u8 * pointer, ensuring the check correctly inspects the appropriate byte of the response message. Additionally, rename "UL_Auto Update" to "UL_IAP" to match the PolarFire Family System Services User Guide. Signed-off-by: Valentina Fernandez Signed-off-by: Conor Dooley --- drivers/firmware/microchip/mpfs-auto-update.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/microchip/mpfs-auto-update.c b/drivers/firmware/microchip/mpfs-auto-update.c index 38a03698cec9..e194f7acb2a9 100644 --- a/drivers/firmware/microchip/mpfs-auto-update.c +++ b/drivers/firmware/microchip/mpfs-auto-update.c @@ -402,10 +402,10 @@ static int mpfs_auto_update_available(struct mpfs_auto_update_priv *priv) return -EIO; /* - * Bit 5 of byte 1 is "UL_Auto Update" & if it is set, Auto Update is + * Bit 5 of byte 1 is "UL_IAP" & if it is set, Auto Update is * not possible. 
*/ - if (response_msg[1] & AUTO_UPDATE_FEATURE_ENABLED) + if ((((u8 *)response_msg)[1] & AUTO_UPDATE_FEATURE_ENABLED)) return -EPERM; return 0; From 9d23e48654620fdccfcc74cc2cef04eaf7353d07 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Wed, 23 Oct 2024 20:29:54 +0300 Subject: [PATCH 032/337] phy: rockchip: samsung-hdptx: Set drvdata before enabling runtime PM In some cases, rk_hdptx_phy_runtime_resume() may be invoked before platform_set_drvdata() is executed in ->probe(), leading to a NULL pointer dereference when using the return of dev_get_drvdata(). Ensure platform_set_drvdata() is called before devm_pm_runtime_enable(). Reported-by: Dmitry Osipenko Fixes: 553be2830c5f ("phy: rockchip: Add Samsung HDMI/eDP Combo PHY driver") Signed-off-by: Cristian Ciocaltea Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241023-phy-sam-hdptx-rpm-fix-v1-1-87f4c994e346@collabora.com Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c index ceab9c71d3b5..0965b9d4f9cf 100644 --- a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c +++ b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c @@ -1101,6 +1101,8 @@ static int rk_hdptx_phy_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hdptx->grf), "Could not get GRF syscon\n"); + platform_set_drvdata(pdev, hdptx); + ret = devm_pm_runtime_enable(dev); if (ret) return dev_err_probe(dev, ret, "Failed to enable runtime PM\n"); @@ -1110,7 +1112,6 @@ static int rk_hdptx_phy_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hdptx->phy), "Failed to create HDMI PHY\n"); - platform_set_drvdata(pdev, hdptx); phy_set_drvdata(hdptx->phy, hdptx); phy_set_bus_width(hdptx->phy, 8); From afc6e39e824ad0e44b2af50a97885caec8d213d1 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Mon, 9 Dec 2024 11:46:15 +0100 Subject: [PATCH 033/337] power: supply: gpio-charger: Fix set charge current limits Fix set charge current limits for devices which allow to set the lowest charge current limit to be greater zero. If requested charge current limit is below lowest limit, the index equals current_limit_map_size which leads to accessing memory beyond allocated memory. Fixes: be2919d8355e ("power: supply: gpio-charger: add charge-current-limit feature") Cc: stable@vger.kernel.org Signed-off-by: Dimitri Fedrau Link: https://lore.kernel.org/r/20241209-fix-charge-current-limit-v1-1-760d9b8f2af3@liebherr.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/gpio-charger.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/supply/gpio-charger.c b/drivers/power/supply/gpio-charger.c index 68212b39785b..6139f736ecbe 100644 --- a/drivers/power/supply/gpio-charger.c +++ b/drivers/power/supply/gpio-charger.c @@ -67,6 +67,14 @@ static int set_charge_current_limit(struct gpio_charger *gpio_charger, int val) if (gpio_charger->current_limit_map[i].limit_ua <= val) break; } + + /* + * If a valid charge current limit isn't found, default to smallest + * current limitation for safety reasons. 
+ */ + if (i >= gpio_charger->current_limit_map_size) + i = gpio_charger->current_limit_map_size - 1; + mapping = gpio_charger->current_limit_map[i]; for (i = 0; i < ndescs; i++) { From e5f84d1cf562f7b45e28d6e5f6490626f870f81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:26 +0100 Subject: [PATCH 034/337] power: supply: cros_charge-control: add mutex for driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concurrent accesses through sysfs may lead to inconsistent state in the priv data. Introduce a mutex to avoid this. Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-1-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 17c53591ce19..58ca6d9ed613 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -49,6 +51,7 @@ struct cros_chctl_priv { struct attribute *attributes[_CROS_CHCTL_ATTR_COUNT]; struct attribute_group group; + struct mutex lock; /* protects fields below and cros_ec */ enum power_supply_charge_behaviour current_behaviour; u8 current_start_threshold, current_end_threshold; }; @@ -85,6 +88,8 @@ static int cros_chctl_configure_ec(struct cros_chctl_priv *priv) { struct ec_params_charge_control req = {}; + lockdep_assert_held(&priv->lock); + req.cmd = EC_CHARGE_CONTROL_CMD_SET; switch (priv->current_behaviour) { @@ -159,6 +164,7 @@ static ssize_t charge_control_start_threshold_show(struct device *dev, struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_START_THRESHOLD); + guard(mutex)(&priv->lock); return sysfs_emit(buf, "%u\n", (unsigned int)priv->current_start_threshold); } @@ -169,6 +175,7 @@ static ssize_t charge_control_start_threshold_store(struct device *dev, struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_START_THRESHOLD); + guard(mutex)(&priv->lock); return cros_chctl_store_threshold(dev, priv, 0, buf, count); } @@ -178,6 +185,7 @@ static ssize_t charge_control_end_threshold_show(struct device *dev, struct devi struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_END_THRESHOLD); + guard(mutex)(&priv->lock); return sysfs_emit(buf, "%u\n", (unsigned int)priv->current_end_threshold); } @@ -187,6 +195,7 @@ static ssize_t charge_control_end_threshold_store(struct device *dev, struct dev struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_END_THRESHOLD); + guard(mutex)(&priv->lock); return cros_chctl_store_threshold(dev, priv, 1, buf, count); } @@ -195,6 +204,7 @@ static ssize_t charge_behaviour_show(struct device *dev, struct device_attribute struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_CHARGE_BEHAVIOUR); + guard(mutex)(&priv->lock); return power_supply_charge_behaviour_show(dev, EC_CHARGE_CONTROL_BEHAVIOURS, priv->current_behaviour, buf); } @@ -210,6 +220,7 @@ static ssize_t charge_behaviour_store(struct device *dev, struct device_attribut if (ret < 0) return ret; + 
guard(mutex)(&priv->lock); priv->current_behaviour = ret; ret = cros_chctl_configure_ec(priv); @@ -290,6 +301,10 @@ static int cros_chctl_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + ret = devm_mutex_init(dev, &priv->lock); + if (ret) + return ret; + ret = cros_ec_get_cmd_versions(cros_ec, EC_CMD_CHARGE_CONTROL); if (ret < 0) return ret; @@ -327,7 +342,8 @@ static int cros_chctl_probe(struct platform_device *pdev) priv->current_end_threshold = 100; /* Bring EC into well-known state */ - ret = cros_chctl_configure_ec(priv); + scoped_guard(mutex, &priv->lock) + ret = cros_chctl_configure_ec(priv); if (ret < 0) return ret; From e65a1b7fad0e112573eea7d64d4ab4fc513b8695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:27 +0100 Subject: [PATCH 035/337] power: supply: cros_charge-control: allow start_threshold == end_threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow setting the start and stop thresholds to the same value. There is no reason to disallow it. Suggested-by: Thomas Koch Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-2-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 58ca6d9ed613..108b121db442 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -139,11 +139,11 @@ static ssize_t cros_chctl_store_threshold(struct device *dev, struct cros_chctl_ return -EINVAL; if (is_end_threshold) { - if (val <= priv->current_start_threshold) + if (val < priv->current_start_threshold) return -EINVAL; priv->current_end_threshold = val; } else { - if (val >= priv->current_end_threshold) + if (val > priv->current_end_threshold) return -EINVAL; priv->current_start_threshold = val; } From c28dc9fc24f5fa802d44ef7620a511035bdd803e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:28 +0100 Subject: [PATCH 036/337] power: supply: cros_charge-control: hide start threshold on v2 cmd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ECs implementing the v2 command will not stop charging when the end threshold is reached. Instead they will begin discharging until the start threshold is reached, leading to permanent charge and discharge cycles. This defeats the point of the charge control mechanism. Avoid the issue by hiding the start threshold on v2 systems. Instead on those systems program the EC with start == end which forces the EC to reach and stay at that level. v1 does not support thresholds and v3 works correctly, at least judging from the code. 
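To make the resulting threshold rules concrete, here is a small, self-contained check that accepts start == end and rejects only start > end, which is the behaviour the two changes above converge on. check_thresholds() is a made-up helper for illustration, not the driver's code.

#include <errno.h>
#include <stdio.h>

/* made-up helper: only start > end, or values above 100, are rejected */
static int check_thresholds(unsigned int start, unsigned int end)
{
	if (start > 100 || end > 100)
		return -EINVAL;
	if (start > end)
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("start=80 end=80 -> %d\n", check_thresholds(80, 80));	/* 0: holding at exactly 80 percent is allowed */
	printf("start=90 end=80 -> %d\n", check_thresholds(90, 80));	/* rejected: start above end */
	return 0;
}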
Reported-by: Thomas Koch Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-3-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 108b121db442..9b0a7500296b 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -139,6 +139,10 @@ static ssize_t cros_chctl_store_threshold(struct device *dev, struct cros_chctl_ return -EINVAL; if (is_end_threshold) { + /* Start threshold is not exposed, use fixed value */ + if (priv->cmd_version == 2) + priv->current_start_threshold = val == 100 ? 0 : val; + if (val < priv->current_start_threshold) return -EINVAL; priv->current_end_threshold = val; @@ -234,12 +238,10 @@ static umode_t cros_chtl_attr_is_visible(struct kobject *kobj, struct attribute { struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(attr, n); - if (priv->cmd_version < 2) { - if (n == CROS_CHCTL_ATTR_START_THRESHOLD) - return 0; - if (n == CROS_CHCTL_ATTR_END_THRESHOLD) - return 0; - } + if (n == CROS_CHCTL_ATTR_START_THRESHOLD && priv->cmd_version < 3) + return 0; + else if (n == CROS_CHCTL_ATTR_END_THRESHOLD && priv->cmd_version < 2) + return 0; return attr->mode; } From 919bfa9b2dbf3bc0c478afd4e44445836381dacb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 10 Dec 2024 03:25:57 +0000 Subject: [PATCH 037/337] cpufreq/amd-pstate: Detect preferred core support before driver registration Booting with amd-pstate on a 3rd Generation EPYC system incorrectly enabled ITMT support despite the system not supporting Preferred Core ranking. amd_pstate_init_prefcore(), called during amd_pstate*_cpu_init(), requires "amd_pstate_prefcore" to be set correctly; however, preferred core support is detected only after driver registration, which is too late. Swap the function calls around to detect preferred core support before registering the driver via amd_pstate_register_driver(). This ensures amd_pstate*_cpu_init() sees the correct value of "amd_pstate_prefcore" given the platform support.
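The ordering problem described above is generic: if registration triggers per-CPU init, anything that init reads must be settled first. A minimal, self-contained simulation of that ordering follows; detect_prefcore(), register_driver() and cpu_init() are invented stand-ins, not the amd-pstate API.

#include <stdbool.h>
#include <stdio.h>

static bool prefcore_supported;	/* settled by detection */

/* stand-in for the per-CPU init that runs as a side effect of registration */
static void cpu_init(void)
{
	printf("cpu_init sees prefcore_supported=%d\n", prefcore_supported);
}

static void register_driver(void)
{
	cpu_init();		/* registration immediately runs per-CPU init */
}

static void detect_prefcore(void)
{
	prefcore_supported = true;	/* pretend the platform supports it */
}

int main(void)
{
	/* buggy order: init runs first and latches the default (false) value */
	register_driver();
	detect_prefcore();

	/* fixed order: detect first, then register */
	prefcore_supported = false;
	detect_prefcore();
	register_driver();
	return 0;
}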
Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") Fixes: ff2653ded4d9 ("cpufreq/amd-pstate: Move registration after static function call update") Signed-off-by: K Prateek Nayak Acked-by: Mario Limonciello Link: https://lore.kernel.org/r/20241210032557.754-1-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d7630bab2516..8b36450bbdf6 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1869,18 +1869,18 @@ static int __init amd_pstate_init(void) static_call_update(amd_pstate_update_perf, shmem_update_perf); } - ret = amd_pstate_register_driver(cppc_state); - if (ret) { - pr_err("failed to register with return %d\n", ret); - return ret; - } - if (amd_pstate_prefcore) { ret = amd_detect_prefcore(&amd_pstate_prefcore); if (ret) return ret; } + ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); + return ret; + } + dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group); From 8644b48714dca8bf2f42a4ff8311de8efc9bd8c3 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 14 May 2024 10:15:14 +0300 Subject: [PATCH 038/337] thunderbolt: Add support for Intel Panther Lake-M/P Intel Panther Lake-M/P has the same integrated Thunderbolt/USB4 controller as Lunar Lake. Add these PCI IDs to the driver list of supported devices. Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/nhi.c | 8 ++++++++ drivers/thunderbolt/nhi.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 1257dd3ce7e6..f3a2264e012b 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1520,6 +1520,14 @@ static struct pci_device_id nhi_ids[] = { .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_LNL_NHI1), .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_M_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_M_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI) }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI) }, diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 7a07c7c1a9c2..16744f25a9a0 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -92,6 +92,10 @@ extern const struct tb_nhi_ops icl_nhi_ops; #define PCI_DEVICE_ID_INTEL_RPL_NHI1 0xa76d #define PCI_DEVICE_ID_INTEL_LNL_NHI0 0xa833 #define PCI_DEVICE_ID_INTEL_LNL_NHI1 0xa834 +#define PCI_DEVICE_ID_INTEL_PTL_M_NHI0 0xe333 +#define PCI_DEVICE_ID_INTEL_PTL_M_NHI1 0xe334 +#define PCI_DEVICE_ID_INTEL_PTL_P_NHI0 0xe433 +#define PCI_DEVICE_ID_INTEL_PTL_P_NHI1 0xe434 #define PCI_CLASS_SERIAL_USB_USB4 0x0c0340 From a4048c83fd87c65657a4acb17d639092d4b6133d Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Tue, 3 Dec 2024 19:30:53 +0530 Subject: [PATCH 039/337] RDMA/core: Fix ENODEV error for iWARP test over vlan 
If traffic is over vlan, cma_validate_port() fails to match net_device ifindex with bound_if_index and results in ENODEV error. As iWARP gid table is static, it contains entry corresponding to only one net device which is either real netdev or vlan netdev for cases like siw attached to a vlan interface. This patch fixes the issue by assigning bound_if_index with net device index, if real net device obtained from bound if index matches with net device retrieved from gid table Fixes: f8ef1be816bf ("RDMA/cma: Avoid GID lookups on iWARP devices") Link: https://lore.kernel.org/all/ZzNgdrjo1kSCGbRz@chelsio.com/ Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20241203140052.3985-1-anumula@chelsio.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 64ace0b968f0..91db10515d74 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -690,6 +690,7 @@ cma_validate_port(struct ib_device *device, u32 port, int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + struct net_device *pdev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; @@ -714,6 +715,21 @@ cma_validate_port(struct ib_device *device, u32 port, rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); + if (ndev->ifindex != bound_if_index) { + pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); + if (pdev) { + if (is_vlan_dev(pdev)) { + pdev = vlan_dev_real_dev(pdev); + if (ndev->ifindex == pdev->ifindex) + bound_if_index = pdev->ifindex; + } + if (is_vlan_dev(ndev)) { + pdev = vlan_dev_real_dev(ndev); + if (bound_if_index == pdev->ifindex) + bound_if_index = ndev->ifindex; + } + } + } if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) { rdma_put_gid_attr(sgid_attr); From efb113fc30e7b805f7375d269b93bb4593d11d97 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Nov 2024 17:23:10 +0100 Subject: [PATCH 040/337] drm: rework FB_CORE dependency The 'select FB_CORE' statement moved from CONFIG_DRM to DRM_CLIENT_LIB, but there are now configurations that have code calling into fb_core as built-in even though the client_lib itself is a loadable module: x86_64-linux-ld: drivers/gpu/drm/drm_fbdev_shmem.o: in function `drm_fbdev_shmem_driver_fbdev_probe': drm_fbdev_shmem.c:(.text+0x1fc): undefined reference to `fb_deferred_io_init' x86_64-linux-ld: drivers/gpu/drm/drm_fbdev_shmem.o: in function `drm_fbdev_shmem_fb_destroy': drm_fbdev_shmem.c:(.text+0x2e1): undefined reference to `fb_deferred_io_cleanup' In addition to DRM_CLIENT_LIB, the 'select' needs to be at least in two more parts, DRM_KMS_HELPER and DRM_GEM_SHMEM_HELPER, so add those here. 
v3: - Remove FB_CORE from DRM_KMS_HELPER to avoid circular dependency Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20241115162323.3555229-1-arnd@kernel.org --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 5504721007cc..a0690049b292 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -372,6 +372,7 @@ config DRM_GEM_DMA_HELPER config DRM_GEM_SHMEM_HELPER tristate depends on DRM && MMU + select FB_CORE if DRM_FBDEV_EMULATION select FB_SYSMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Choose this if you need the GEM shmem helper functions From 50a062a7620051c09adacd6d140ebd56881a333b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:34 -0600 Subject: [PATCH 041/337] cpufreq/amd-pstate: Store the boost numerator as highest perf again commit ad4caad58d91d ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") changed the semantics for highest perf and commit 18d9b52271213 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") worked around those semantic changes. This however is a confusing result and furthermore makes it awkward to change frequency limits and boost due to the scaling differences. Restore the boost numerator to highest perf again. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") Link: https://lore.kernel.org/r/20241209185248.16301-2-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +--- drivers/cpufreq/amd-pstate.c | 25 ++++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 210a808b74ec..412423c54f25 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -251,9 +251,7 @@ performance supported in `AMD CPPC Performance Capability `_). In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` table, so we need to expose it to sysfs. If boost is not active, but still supported, this maximum frequency will be larger than the one in -``cpuinfo``. On systems that support preferred core, the driver will have -different values for some cores than others and this will reflect the values -advertised by the platform at bootup. +``cpuinfo``. This attribute is read-only. 
``amd_pstate_lowest_nonlinear_freq`` diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 8b36450bbdf6..ab6fe9c2150c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -374,15 +374,19 @@ static inline int amd_pstate_cppc_enable(bool enable) static int msr_init_perf(struct amd_cpudata *cpudata) { - u64 cap1; + u64 cap1, numerator; int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &cap1); if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, numerator); + WRITE_ONCE(cpudata->max_limit_perf, numerator); WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); @@ -394,13 +398,18 @@ static int msr_init_perf(struct amd_cpudata *cpudata) static int shmem_init_perf(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; + u64 numerator; int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); + ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, numerator); + WRITE_ONCE(cpudata->max_limit_perf, numerator); WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); WRITE_ONCE(cpudata->lowest_nonlinear_perf, cppc_perf.lowest_nonlinear_perf); @@ -889,7 +898,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; u32 min_freq, max_freq; - u64 numerator; u32 nominal_perf, nominal_freq; u32 lowest_nonlinear_perf, lowest_nonlinear_freq; u32 boost_ratio, lowest_nonlinear_ratio; @@ -911,10 +919,7 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) nominal_perf = READ_ONCE(cpudata->nominal_perf); - ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); - if (ret) - return ret; - boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); + boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); From 2993b29b2a98f2bc9d55dfd37ef39f56a2908748 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:35 -0600 Subject: [PATCH 042/337] cpufreq/amd-pstate: Use boost numerator for upper bound of frequencies commit 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") introduced different semantics for min/max limits based upon whether the user turned off boost from sysfs. This however is not necessary when the highest perf value is the boost numerator. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Fixes: 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") Link: https://lore.kernel.org/r/20241209185248.16301-3-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ab6fe9c2150c..66e5dfc711c0 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -570,16 +570,13 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; + u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; struct amd_cpudata *cpudata = policy->driver_data; - if (cpudata->boost_supported && !policy->boost_enabled) - max_perf = READ_ONCE(cpudata->nominal_perf); - else - max_perf = READ_ONCE(cpudata->highest_perf); - - max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); - min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + max_perf = READ_ONCE(cpudata->highest_perf); + max_freq = READ_ONCE(cpudata->max_freq); + max_limit_perf = div_u64(policy->max * max_perf, max_freq); + min_limit_perf = div_u64(policy->min * max_perf, max_freq); lowest_perf = READ_ONCE(cpudata->lowest_perf); if (min_limit_perf < lowest_perf) From 5d009e024056ded20c5bb1583146b833b23bbd5a Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 6 Dec 2024 08:52:30 +0800 Subject: [PATCH 043/337] of: Fix refcount leakage for OF node returned by __of_get_dma_parent() __of_get_dma_parent() returns OF device node @args.np, but the node's refcount is increased twice, by both of_parse_phandle_with_args() and of_node_get(), so causes refcount leakage for the node. Fix by directly returning the node got by of_parse_phandle_with_args(). Fixes: f83a6e5dea6c ("of: address: Add support for the parent DMA bus") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241206-of_core_fix-v1-4-dc28ed56bec3@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 5b7ee3ed5296..c1f1c810e810 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -620,7 +620,7 @@ struct device_node *__of_get_dma_parent(const struct device_node *np) if (ret < 0) return of_get_parent(np); - return of_node_get(args.np); + return args.np; } #endif From fec3edc47d5cfc2dd296a5141df887bf567944db Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 9 Dec 2024 21:24:59 +0800 Subject: [PATCH 044/337] of/irq: Fix interrupt-map cell length check in of_irq_parse_imap_parent() On a malformed interrupt-map property which is shorter than expected by 1 cell, we may read bogus data past the end of the property instead of returning an error in of_irq_parse_imap_parent(). Decrement the remaining length when skipping over the interrupt parent phandle cell. 
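The one-line fix above is the usual cursor-and-length bookkeeping rule: whenever the cursor advances, the remaining length must shrink with it. A stand-alone sketch of that rule follows; skip_cell() is a made-up helper and the counts here are simplified to whole cells rather than the units the kernel code tracks.

#include <stdio.h>
#include <stdint.h>

/* made-up helper: consume one cell and keep the remaining length in sync */
static const uint32_t *skip_cell(const uint32_t *p, int *len)
{
	(*len)--;	/* the equivalent of the len-- added above */
	return p + 1;
}

int main(void)
{
	/* malformed property: a phandle plus only one of the two expected argument cells */
	uint32_t prop[2] = { 1, 2 };
	const uint32_t *p = prop;
	int len = 2;

	p = skip_cell(p, &len);		/* skip the parent phandle cell */
	if (len < 2) {			/* with the decrement, the truncation is caught here */
		fprintf(stderr, "malformed property: %d cell(s) left, need 2\n", len);
		return 1;
	}
	/* without the decrement, len would still be 2 and p[1] would read past the property */
	printf("args: %u %u\n", (unsigned)p[0], (unsigned)p[1]);
	return 0;
}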
Fixes: 935df1bd40d4 ("of/irq: Factor out parsing of interrupt-map parent phandle+args from of_irq_parse_raw()") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241209-of_irq_fix-v1-1-782f1419c8a1@quicinc.com [rh: reword commit msg] Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 67fc0ceaa5f5..43cf60479b9e 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -111,6 +111,7 @@ const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_ph else np = of_find_node_by_phandle(be32_to_cpup(imap)); imap++; + len--; /* Check if not found */ if (!np) { From 0f7ca6f69354e0c3923bbc28c92d0ecab4d50a3e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 9 Dec 2024 21:25:02 +0800 Subject: [PATCH 045/337] of/irq: Fix using uninitialized variable @addr_len in API of_irq_parse_one() of_irq_parse_one() may use uninitialized variable @addr_len as shown below: // @addr_len is uninitialized int addr_len; // This operation does not touch @addr_len if it fails. addr = of_get_property(device, "reg", &addr_len); // Use uninitialized @addr_len if the operation fails. if (addr_len > sizeof(addr_buf)) addr_len = sizeof(addr_buf); // Check the operation result here. if (addr) memcpy(addr_buf, addr, addr_len); Fix by initializing @addr_len before the operation. Fixes: b739dffa5d57 ("of/irq: Prevent device address out-of-bounds read in interrupt map walk") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241209-of_irq_fix-v1-4-782f1419c8a1@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 43cf60479b9e..98b1cf78ecac 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -355,6 +355,7 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar return of_irq_parse_oldworld(device, index, out_irq); /* Get the reg property (if any) */ + addr_len = 0; addr = of_get_property(device, "reg", &addr_len); /* Prevent out-of-bounds read in case of longer interrupt parent address size */ From 2872e21c47c359b902e53faf7e749c8ea682f7f7 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 4 Dec 2024 16:22:47 +0100 Subject: [PATCH 046/337] MAINTAINERS: align Danilo's maintainer entries Some entries use my kernel.org address, while others use my Red Hat one. Since this is a bit of an inconvinience for me, align them to all use the same (kernel.org) address. 
Acked-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20241204152248.8644-1-dakr@kernel.org --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..c669c5bd61e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7345,7 +7345,7 @@ F: drivers/gpu/drm/panel/panel-novatek-nt36672a.c DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS M: Karol Herbst M: Lyude Paul -M: Danilo Krummrich +M: Danilo Krummrich L: dri-devel@lists.freedesktop.org L: nouveau@lists.freedesktop.org S: Supported @@ -8922,7 +8922,7 @@ F: include/linux/arm_ffa.h FIRMWARE LOADER (request_firmware) M: Luis Chamberlain M: Russ Weight -M: Danilo Krummrich +M: Danilo Krummrich L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/firmware_class/ From e34f1717ef0632fcec5cb827e5e0e9f223d70c9b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 10:25:51 -0600 Subject: [PATCH 047/337] thunderbolt: Don't display nvm_version unless upgrade supported The read will never succeed if NVM wasn't initialized due to an unknown format. Add a new callback for visibility to only show when supported. Cc: stable@vger.kernel.org Fixes: aef9c693e7e5 ("thunderbolt: Move vendor specific NVM handling into nvm.c") Reported-by: Richard Hughes Closes: https://github.com/fwupd/fwupd/issues/8200 Signed-off-by: Mario Limonciello Signed-off-by: Mika Westerberg --- drivers/thunderbolt/retimer.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/retimer.c b/drivers/thunderbolt/retimer.c index 89d2919d0193..eeb64433ebbc 100644 --- a/drivers/thunderbolt/retimer.c +++ b/drivers/thunderbolt/retimer.c @@ -103,6 +103,7 @@ static int tb_retimer_nvm_add(struct tb_retimer *rt) err_nvm: dev_dbg(&rt->dev, "NVM upgrade disabled\n"); + rt->no_nvm_upgrade = true; if (!IS_ERR(nvm)) tb_nvm_free(nvm); @@ -182,8 +183,6 @@ static ssize_t nvm_authenticate_show(struct device *dev, if (!rt->nvm) ret = -EAGAIN; - else if (rt->no_nvm_upgrade) - ret = -EOPNOTSUPP; else ret = sysfs_emit(buf, "%#x\n", rt->auth_status); @@ -323,8 +322,6 @@ static ssize_t nvm_version_show(struct device *dev, if (!rt->nvm) ret = -EAGAIN; - else if (rt->no_nvm_upgrade) - ret = -EOPNOTSUPP; else ret = sysfs_emit(buf, "%x.%x\n", rt->nvm->major, rt->nvm->minor); @@ -342,6 +339,19 @@ static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(vendor); +static umode_t retimer_is_visible(struct kobject *kobj, struct attribute *attr, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct tb_retimer *rt = tb_to_retimer(dev); + + if (attr == &dev_attr_nvm_authenticate.attr || + attr == &dev_attr_nvm_version.attr) + return rt->no_nvm_upgrade ? 
0 : attr->mode; + + return attr->mode; +} + static struct attribute *retimer_attrs[] = { &dev_attr_device.attr, &dev_attr_nvm_authenticate.attr, @@ -351,6 +361,7 @@ static struct attribute *retimer_attrs[] = { }; static const struct attribute_group retimer_group = { + .is_visible = retimer_is_visible, .attrs = retimer_attrs, }; From fdad4fb7c506bea8b419f70ff2163d99962e8ede Mon Sep 17 00:00:00 2001 From: Daniel Swanemar Date: Mon, 4 Nov 2024 14:42:17 +0100 Subject: [PATCH 048/337] USB: serial: option: add TCL IK512 MBIM & ECM Add the following TCL IK512 compositions: 0x0530: Modem + Diag + AT + MBIM T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 3 Spd=10000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=1bbb ProdID=0530 Rev=05.04 S: Manufacturer=TCL S: Product=TCL 5G USB Dongle S: SerialNumber=3136b91a C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 4 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms 0x0640: ECM + Modem + Diag + AT T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 4 Spd=10000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=1bbb ProdID=0640 Rev=05.04 S: Manufacturer=TCL S: Product=TCL 5G USB Dongle S: SerialNumber=3136b91a C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=03(Int.) 
MxPS= 10 Ivl=32ms Signed-off-by: Daniel Swanemar Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 9ba5584061c8..437960002bc3 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2385,6 +2385,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, + { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ + .driver_info = NCTRL(1) }, + { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ + .driver_info = NCTRL(3) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From 724d461e44dfc0815624d2a9792f2f2beb7ee46d Mon Sep 17 00:00:00 2001 From: Michal Hrusecky Date: Tue, 19 Nov 2024 14:00:18 +0100 Subject: [PATCH 049/337] USB: serial: option: add MeiG Smart SLM770A Update the USB serial option driver to support MeiG Smart SLM770A. ID 2dee:4d57 Marvell Mobile Composite Device Bus T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2dee ProdID=4d57 Rev= 1.00 S: Manufacturer=Marvell S: Product=Mobile Composite Device Bus C:* #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0c(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0b(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0a(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0e(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Tested successfully connecting to the Internet via rndis interface after dialing via AT commands on If#=3 or If#=4. Not sure of the purpose of the other serial interfaces. 
Signed-off-by: Michal Hrusecky Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 437960002bc3..a807101548e7 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -625,6 +625,8 @@ static void option_instat_callback(struct urb *urb); #define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 +/* MeiG Smart SLM770A based on ASR1803 */ +#define MEIGSMART_PRODUCT_SLM770A 0x4d57 /* Device flags */ @@ -2382,6 +2384,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, From aa954ae08262bb5cd6ab18dd56a0b58c1315db8b Mon Sep 17 00:00:00 2001 From: Mank Wang Date: Fri, 22 Nov 2024 09:06:00 +0000 Subject: [PATCH 050/337] USB: serial: option: add Netprisma LCUK54 modules for WWAN Ready LCUK54-WRD's pid/vid 0x3731/0x010a 0x3731/0x010c LCUK54-WWD's pid/vid 0x3731/0x010b 0x3731/0x010d Above products use the exact same interface layout and option driver: MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=3731 ProdID=0101 Rev= 5.04 S: Manufacturer=NetPrisma S: Product=LCUK54-WRD S: SerialNumber=feeba631 C:* #Ifs= 8 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) 
Sub=ff Prot=80 Driver=(none) E: Ad=8f(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Mank Wang [ johan: use lower case hex notation ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a807101548e7..e897c723b041 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2377,6 +2377,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Golbal EDU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, From f07dfa6a1b65034a5c3ba3a555950d972f252757 Mon Sep 17 00:00:00 2001 From: Jack Wu Date: Thu, 28 Nov 2024 10:22:27 +0800 Subject: [PATCH 051/337] USB: serial: option: add MediaTek T7XX compositions Add the MediaTek T7XX compositions: T: Bus=03 Lev=01 Prnt=01 Port=05 Cnt=01 Dev#= 74 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0e8d ProdID=7129 Rev= 0.01 S: Manufacturer=MediaTek Inc. S: Product=USB DATA CARD S: SerialNumber=004402459035402 C:* #Ifs=10 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 8 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=08(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=8a(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=09(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms ------------------------------- | If Number | Function | ------------------------------- | 2 | USB AP Log Port | ------------------------------- | 3 | USB AP GNSS Port| ------------------------------- | 4 | USB AP META Port| ------------------------------- | 5 | ADB port | ------------------------------- | 6 | USB MD AT Port | ------------------------------ | 7 | USB MD META Port| ------------------------------- | 8 | USB NTZ Port | ------------------------------- | 9 | USB Debug port | ------------------------------- Signed-off-by: Jack Wu Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e897c723b041..dcedb88ad7c1 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2249,6 +2249,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7127, 0xff, 0x00, 0x00), .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7129, 0xff, 0x00, 0x00), /* MediaTek T7XX */ + .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MPL200), .driver_info = RSVD(1) | RSVD(4) }, From 8366e64a4454481339e7c56a8ad280161f2e441d Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 9 Dec 2024 16:32:54 +0100 Subject: [PATCH 052/337] USB: serial: option: add Telit FE910C04 rmnet compositions Add the following Telit FE910C04 compositions: 0x10c0: rmnet + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 13 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10c4: rmnet + tty (AT) + tty (AT) + tty (diag) T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 14 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c4 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10c8: rmnet + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c8 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index dcedb88ad7c1..64317b390d22 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1397,6 +1397,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10aa, 0xff), /* Telit FN920C04 (MBIM) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c0, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c4, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c8, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From 2dd59fe0e19e1ab955259978082b62e5751924c7 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 17 May 2024 08:58:00 -0700 Subject: [PATCH 053/337] media: dvb-frontends: dib3000mb: fix uninit-value in dib3000_write_reg Syzbot reports [1] an uninitialized value issue found by KMSAN in dib3000_read_reg(). Local u8 rb[2] is used in i2c_transfer() as a read buffer; in case that call fails, the buffer may end up with some undefined values. Since no elaborate error handling is expected in dib3000_write_reg(), simply zero out rb buffer to mitigate the problem. [1] Syzkaller report dvb-usb: bulk message failed: -22 (6/0) ===================================================== BUG: KMSAN: uninit-value in dib3000mb_attach+0x2d8/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 dib3000mb_attach+0x2d8/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 dibusb_dib3000mb_frontend_attach+0x155/0x2f0 drivers/media/usb/dvb-usb/dibusb-mb.c:31 dvb_usb_adapter_frontend_init+0xed/0x9a0 drivers/media/usb/dvb-usb/dvb-usb-dvb.c:290 dvb_usb_adapter_init drivers/media/usb/dvb-usb/dvb-usb-init.c:90 [inline] dvb_usb_init drivers/media/usb/dvb-usb/dvb-usb-init.c:186 [inline] dvb_usb_device_init+0x25a8/0x3760 drivers/media/usb/dvb-usb/dvb-usb-init.c:310 dibusb_probe+0x46/0x250 drivers/media/usb/dvb-usb/dibusb-mb.c:110 ... Local variable rb created at: dib3000_read_reg+0x86/0x4e0 drivers/media/dvb-frontends/dib3000mb.c:54 dib3000mb_attach+0x123/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 ... 
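The propagation is easiest to see in a reduced sketch of the read helper (illustrative only, not the exact driver source): when i2c_transfer() fails, the read buffer is never written, yet its bytes still end up in the value returned to dib3000mb_attach().

  static int read_reg_sketch(struct dib3000_state *state, u16 reg)
  {
          u8 wb[] = { ((reg >> 8) | 0x80) & 0xff, reg & 0xff };
          u8 rb[2] = {};  /* the fix: defined contents even if the transfer fails */
          struct i2c_msg msg[] = {
                  { .addr = state->config.demod_address, .flags = 0,
                    .buf = wb, .len = 2 },
                  { .addr = state->config.demod_address, .flags = I2C_M_RD,
                    .buf = rb, .len = 2 },
          };

          if (i2c_transfer(state->i2c, msg, 2) != 2)
                  ;   /* only a debug message in the real driver -- no early return */

          return (rb[0] << 8) | rb[1];    /* previously could be built from
                                             uninitialized stack bytes */
  }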
Fixes: 74340b0a8bc6 ("V4L/DVB (4457): Remove dib3000-common-module") Reported-by: syzbot+c88fc0ebe0d5935c70da@syzkaller.appspotmail.com Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240517155800.9881-1-n.zhandarovich@fintech.ru Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-frontends/dib3000mb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/dvb-frontends/dib3000mb.c b/drivers/media/dvb-frontends/dib3000mb.c index 822639f11c04..63bc7b74bc8b 100644 --- a/drivers/media/dvb-frontends/dib3000mb.c +++ b/drivers/media/dvb-frontends/dib3000mb.c @@ -51,7 +51,7 @@ MODULE_PARM_DESC(debug, "set debugging level (1=info,2=xfer,4=setfe,8=getfe (|-a static int dib3000_read_reg(struct dib3000_state *state, u16 reg) { u8 wb[] = { ((reg >> 8) | 0x80) & 0xff, reg & 0xff }; - u8 rb[2]; + u8 rb[2] = {}; struct i2c_msg msg[] = { { .addr = state->config.demod_address, .flags = 0, .buf = wb, .len = 2 }, { .addr = state->config.demod_address, .flags = I2C_M_RD, .buf = rb, .len = 2 }, From 9cb189a882738c1d28b349d4e7c6a1ef9b3d8f87 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:19 +0100 Subject: [PATCH 054/337] udmabuf: fix racy memfd sealing check The current check_memfd_seals() is racy: Since we first do check_memfd_seals() and then udmabuf_pin_folios() without holding any relevant lock across both, F_SEAL_WRITE can be set in between. This is problematic because we can end up holding pins to pages in a write-sealed memfd. Fix it using the inode lock, that's probably the easiest way. In the future, we might want to consider moving this logic into memfd, especially if anyone else wants to use memfd_pin_folios(). Reported-by: Julian Orth Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219106 Closes: https://lore.kernel.org/r/CAG48ez0w8HrFEZtJkfmkVKFDhE5aP7nz=obrimeTgpD+StkV9w@mail.gmail.com Fixes: fbb0de795078 ("Add udmabuf misc device") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Joel Fernandes (Google) Acked-by: Vivek Kasireddy Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-1-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 8ce1f074c2d3..c1d8c2766d6d 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -436,14 +436,19 @@ static long udmabuf_create(struct miscdevice *device, goto err; } + /* + * Take the inode lock to protect against concurrent + * memfd_add_seals(), which takes this lock in write mode. + */ + inode_lock_shared(file_inode(memfd)); ret = check_memfd_seals(memfd); - if (ret < 0) { - fput(memfd); - goto err; - } + if (ret) + goto out_unlock; ret = udmabuf_pin_folios(ubuf, memfd, list[i].offset, list[i].size, folios); +out_unlock: + inode_unlock_shared(file_inode(memfd)); fput(memfd); if (ret) goto err; From 0a16e24e34f28210f68195259456c73462518597 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:20 +0100 Subject: [PATCH 055/337] udmabuf: also check for F_SEAL_FUTURE_WRITE When F_SEAL_FUTURE_WRITE was introduced, it was overlooked that udmabuf must reject memfds with this flag, just like ones with F_SEAL_WRITE. Fix it by adding F_SEAL_FUTURE_WRITE to SEALS_DENIED. 
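For illustration, a minimal user-space sketch of the case this closes (hypothetical test, assuming the udmabuf misc device and <linux/udmabuf.h> are available; not part of the patch):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
  #include <unistd.h>
  #include <linux/udmabuf.h>

  int main(void)
  {
          int memfd = memfd_create("buf", MFD_ALLOW_SEALING);
          int dev = open("/dev/udmabuf", O_RDWR);
          struct udmabuf_create create = { .offset = 0, .size = 4096 };

          ftruncate(memfd, 4096);
          /* F_SEAL_SHRINK is required by udmabuf; F_SEAL_FUTURE_WRITE is
             what is now rejected, just like F_SEAL_WRITE. */
          fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_FUTURE_WRITE);

          create.memfd = memfd;
          if (ioctl(dev, UDMABUF_CREATE, &create) < 0)
                  perror("UDMABUF_CREATE");   /* expected to fail with this fix */
          return 0;
  }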
Fixes: ab3948f58ff8 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") Cc: stable@vger.kernel.org Acked-by: Vivek Kasireddy Signed-off-by: Jann Horn Reviewed-by: Joel Fernandes (Google) Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-2-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index c1d8c2766d6d..b330b99fcc76 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -297,7 +297,7 @@ static const struct dma_buf_ops udmabuf_ops = { }; #define SEALS_WANTED (F_SEAL_SHRINK) -#define SEALS_DENIED (F_SEAL_WRITE) +#define SEALS_DENIED (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) static int check_memfd_seals(struct file *memfd) { From f49856f525acd5bef52ae28b7da2e001bbe7439e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:21 +0100 Subject: [PATCH 056/337] udmabuf: fix memory leak on last export_udmabuf() error path In export_udmabuf(), if dma_buf_fd() fails because the FD table is full, a dma_buf owning the udmabuf has already been created; but the error handling in udmabuf_create() will tear down the udmabuf without doing anything about the containing dma_buf. This leaves a dma_buf in memory that contains a dangling pointer; though that doesn't seem to lead to anything bad except a memory leak. Fix it by moving the dma_buf_fd() call out of export_udmabuf() so that we can give it different error handling. Note that the shape of this code changed a lot in commit 5e72b2b41a21 ("udmabuf: convert udmabuf driver to use folios"); but the memory leak seems to have existed since the introduction of udmabuf. Fixes: fbb0de795078 ("Add udmabuf misc device") Acked-by: Vivek Kasireddy Signed-off-by: Jann Horn Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-3-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index b330b99fcc76..cc7398cc17d6 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -317,12 +317,10 @@ static int check_memfd_seals(struct file *memfd) return 0; } -static int export_udmabuf(struct udmabuf *ubuf, - struct miscdevice *device, - u32 flags) +static struct dma_buf *export_udmabuf(struct udmabuf *ubuf, + struct miscdevice *device) { DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - struct dma_buf *buf; ubuf->device = device; exp_info.ops = &udmabuf_ops; @@ -330,11 +328,7 @@ static int export_udmabuf(struct udmabuf *ubuf, exp_info.priv = ubuf; exp_info.flags = O_RDWR; - buf = dma_buf_export(&exp_info); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - return dma_buf_fd(buf, flags); + return dma_buf_export(&exp_info); } static long udmabuf_pin_folios(struct udmabuf *ubuf, struct file *memfd, @@ -391,6 +385,7 @@ static long udmabuf_create(struct miscdevice *device, struct folio **folios = NULL; pgoff_t pgcnt = 0, pglimit; struct udmabuf *ubuf; + struct dma_buf *dmabuf; long ret = -EINVAL; u32 i, flags; @@ -455,9 +450,20 @@ out_unlock: } flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0; - ret = export_udmabuf(ubuf, device, flags); - if (ret < 0) + dmabuf = export_udmabuf(ubuf, device); + if (IS_ERR(dmabuf)) { + ret = PTR_ERR(dmabuf); goto err; + } + /* + * Ownership of ubuf is held by the dmabuf from here. 
+ * If the following dma_buf_fd() fails, dma_buf_put() cleans up both the + * dmabuf and the ubuf (through udmabuf_ops.release). + */ + + ret = dma_buf_fd(dmabuf, flags); + if (ret < 0) + dma_buf_put(dmabuf); kvfree(folios); return ret; From 0cff90dec63da908fb16d9ea2872ebbcd2d18e6a Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Sun, 17 Nov 2024 17:03:25 +0000 Subject: [PATCH 057/337] dma-buf: Fix __dma_buf_debugfs_list_del argument for !CONFIG_DEBUG_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arguments for __dma_buf_debugfs_list_del do not match for both the CONFIG_DEBUG_FS case and the !CONFIG_DEBUG_FS case. The !CONFIG_DEBUG_FS case should take a struct dma_buf *, but it's currently struct file *. This can lead to the build error: error: passing argument 1 of ‘__dma_buf_debugfs_list_del’ from incompatible pointer type [-Werror=incompatible-pointer-types] dma-buf.c:63:53: note: expected ‘struct file *’ but argument is of type ‘struct dma_buf *’ 63 | static void __dma_buf_debugfs_list_del(struct file *file) Fixes: bfc7bc539392 ("dma-buf: Do not build debugfs related code when !CONFIG_DEBUG_FS") Signed-off-by: T.J. Mercier Reviewed-by: Tvrtko Ursulin Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20241117170326.1971113-1-tjmercier@google.com --- drivers/dma-buf/dma-buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 5ad0e9e2e1b9..84bc32134862 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -60,7 +60,7 @@ static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { } -static void __dma_buf_debugfs_list_del(struct file *file) +static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf) { } #endif From e1e1af9148dc4c866eda3fb59cd6ec3c7ea34b1d Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Fri, 25 Oct 2024 15:34:08 +0800 Subject: [PATCH 058/337] drm/panel: himax-hx83102: Add a check to prevent NULL pointer dereference drm_mode_duplicate() could return NULL due to lack of memory, which will then call NULL pointer dereference. Add a check to prevent it. Fixes: 0ef94554dc40 ("drm/panel: himax-hx83102: Break out as separate driver") Signed-off-by: Zhang Zekun Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20241025073408.27481-3-zhangzekun11@huawei.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241025073408.27481-3-zhangzekun11@huawei.com --- drivers/gpu/drm/panel/panel-himax-hx83102.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/panel-himax-hx83102.c b/drivers/gpu/drm/panel/panel-himax-hx83102.c index 8b48bba18131..3644a7544b93 100644 --- a/drivers/gpu/drm/panel/panel-himax-hx83102.c +++ b/drivers/gpu/drm/panel/panel-himax-hx83102.c @@ -565,6 +565,8 @@ static int hx83102_get_modes(struct drm_panel *panel, struct drm_display_mode *mode; mode = drm_mode_duplicate(connector->dev, m); + if (!mode) + return -ENOMEM; mode->type = DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED; drm_mode_set_name(mode); From f8fd0968eff52cf092c0d517d17507ea2f6e5ea5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 29 Oct 2024 20:39:57 +0800 Subject: [PATCH 059/337] drm/panel: novatek-nt35950: fix return value check in nt35950_probe() mipi_dsi_device_register_full() never returns NULL pointer, it will return ERR_PTR() when it fails, so replace the check with IS_ERR(). 
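The general pattern, shown as a generic sketch (names hypothetical, not the driver code): APIs that return ERR_PTR() values are checked with IS_ERR()/PTR_ERR(), never compared against NULL.

  ptr = register_something(dev);          /* hypothetical ERR_PTR()-returning call */
  if (IS_ERR(ptr)) {
          dev_err(dev, "registration failed\n");
          return PTR_ERR(ptr);            /* propagate the real error code
                                             instead of a hard-coded -ENODEV */
  }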
Fixes: 623a3531e9cf ("drm/panel: Add driver for Novatek NT35950 DSI DriverIC panels") Signed-off-by: Yang Yingliang Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20241029123957.1588-1-yangyingliang@huaweicloud.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241029123957.1588-1-yangyingliang@huaweicloud.com --- drivers/gpu/drm/panel/panel-novatek-nt35950.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-novatek-nt35950.c b/drivers/gpu/drm/panel/panel-novatek-nt35950.c index b036208f9356..08b22b592ab0 100644 --- a/drivers/gpu/drm/panel/panel-novatek-nt35950.c +++ b/drivers/gpu/drm/panel/panel-novatek-nt35950.c @@ -481,9 +481,9 @@ static int nt35950_probe(struct mipi_dsi_device *dsi) return dev_err_probe(dev, -EPROBE_DEFER, "Cannot get secondary DSI host\n"); nt->dsi[1] = mipi_dsi_device_register_full(dsi_r_host, info); - if (!nt->dsi[1]) { + if (IS_ERR(nt->dsi[1])) { dev_err(dev, "Cannot get secondary DSI node\n"); - return -ENODEV; + return PTR_ERR(nt->dsi[1]); } num_dsis++; } From 406dd4c7984a457567ca652455d5efad81983f02 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 24 Nov 2024 23:48:07 +0100 Subject: [PATCH 060/337] drm/panel: st7701: Add prepare_prev_first flag to drm_panel The DSI host must be enabled for the panel to be initialized in prepare(). Set the prepare_prev_first flag to guarantee this. This fixes the panel operation on NXP i.MX8MP SoC / Samsung DSIM DSI host. Fixes: 849b2e3ff969 ("drm/panel: Add Sitronix ST7701 panel driver") Signed-off-by: Marek Vasut Reviewed-by: Jessica Zhang Link: https://lore.kernel.org/r/20241124224812.150263-1-marex@denx.de Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241124224812.150263-1-marex@denx.de --- drivers/gpu/drm/panel/panel-sitronix-st7701.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7701.c b/drivers/gpu/drm/panel/panel-sitronix-st7701.c index eef03d04e0cd..1f72ef7ca74c 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7701.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7701.c @@ -1177,6 +1177,7 @@ static int st7701_probe(struct device *dev, int connector_type) return dev_err_probe(dev, ret, "Failed to get orientation\n"); drm_panel_init(&st7701->panel, dev, &st7701_funcs, connector_type); + st7701->panel.prepare_prev_first = true; /** * Once sleep out has been issued, ST7701 IC required to wait 120ms From d2bd3fcb825725a59c8880070b1206b1710922bd Mon Sep 17 00:00:00 2001 From: Michael Trimarchi Date: Thu, 5 Dec 2024 17:29:58 +0100 Subject: [PATCH 061/337] drm/panel: synaptics-r63353: Fix regulator unbalance The shutdown function can be called when the display is already unprepared. For example during reboot this trigger a kernel backlog. Calling the drm_panel_unprepare, allow us to avoid to trigger the kernel warning. 
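A rough sketch of the difference (assuming, per the description above, that the drm_panel core tracks the prepared state and only invokes the driver callback for a panel that is still prepared):

  static void r63353_panel_shutdown(struct mipi_dsi_device *dsi)
  {
          struct r63353_panel *rpanel = mipi_dsi_get_drvdata(dsi);

          /* Calling the driver callback directly disables the regulators
             unconditionally, so a reboot with the panel already unprepared
             trips the regulator unbalance warning: */
          /* r63353_panel_unprepare(&rpanel->base); */

          /* The drm_panel helper is safe to call in either state: */
          drm_panel_unprepare(&rpanel->base);
  }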
Fixes: 2e87bad7cd33 ("drm/panel: Add Synaptics R63353 panel driver") Tested-by: Dario Binacchi Signed-off-by: Michael Trimarchi Signed-off-by: Dario Binacchi Reviewed-by: Neil Armstrong Reviewed-by: Jessica Zhang Link: https://lore.kernel.org/r/20241205163002.1804784-1-dario.binacchi@amarulasolutions.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241205163002.1804784-1-dario.binacchi@amarulasolutions.com --- drivers/gpu/drm/panel/panel-synaptics-r63353.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-synaptics-r63353.c b/drivers/gpu/drm/panel/panel-synaptics-r63353.c index 169c629746c7..17349825543f 100644 --- a/drivers/gpu/drm/panel/panel-synaptics-r63353.c +++ b/drivers/gpu/drm/panel/panel-synaptics-r63353.c @@ -325,7 +325,7 @@ static void r63353_panel_shutdown(struct mipi_dsi_device *dsi) { struct r63353_panel *rpanel = mipi_dsi_get_drvdata(dsi); - r63353_panel_unprepare(&rpanel->base); + drm_panel_unprepare(&rpanel->base); } static const struct r63353_desc sharp_ls068b3sx02_data = { From 0e8c52091633b354b12d0c29a27a22077584c111 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 12 Dec 2024 13:29:42 +0200 Subject: [PATCH 062/337] wifi: iwlwifi: fix CRF name for Bz We had BE201 hard coded. Look at the RF_ID and decide based on its value. Fixes: 6795a37161fb ("wifi: iwlwifi: Print a specific device name.") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241212132940.b9eebda1ca60.I36791a134ed5e538e059418eb6520761da97b44c@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/cfg/bz.c | 1 + .../net/wireless/intel/iwlwifi/iwl-config.h | 1 + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 41 +++++++++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c index cd1fe8490ae5..1c43f283ac4a 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c @@ -161,6 +161,7 @@ const struct iwl_cfg_trans_params iwl_gl_trans_cfg = { const char iwl_bz_name[] = "Intel(R) TBD Bz device"; const char iwl_fm_name[] = "Intel(R) Wi-Fi 7 BE201 320MHz"; +const char iwl_wh_name[] = "Intel(R) Wi-Fi 7 BE211 320MHz"; const char iwl_gl_name[] = "Intel(R) Wi-Fi 7 BE200 320MHz"; const char iwl_mtp_name[] = "Intel(R) Wi-Fi 7 BE202 160MHz"; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 34c91deca57b..17721bb47e25 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -545,6 +545,7 @@ extern const char iwl_ax231_name[]; extern const char iwl_ax411_name[]; extern const char iwl_bz_name[]; extern const char iwl_fm_name[]; +extern const char iwl_wh_name[]; extern const char iwl_gl_name[]; extern const char iwl_mtp_name[]; extern const char iwl_sc_name[]; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 805fb249a0c6..8fb2aa282242 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1106,18 +1106,53 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), /* Bz */ -/* FIXME: need to change the naming according to the actual CRF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, 
IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_fm_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_wh_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - iwl_cfg_bz, iwl_fm_name), + iwl_cfg_bz, iwl_wh_name), /* Ga (Gl) */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, From b83accfec0811421df065f820e73ca8df7f6439a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 12 Dec 2024 08:27:05 -0800 Subject: [PATCH 063/337] MAINTAINERS: wifi: ath: add Jeff Johnson as maintainer The "ATHEROS ATH GENERIC UTILITIES" entry shares the same git tree as the ATH10K, ATH11K, and ATH12K entries which I already maintain, so add me to that entry as well. Signed-off-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241212-ath-maintainer-v1-1-7ea5e86780a8@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e6e71b05710b..158740a571b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3608,6 +3608,7 @@ F: drivers/phy/qualcomm/phy-ath79-usb.c ATHEROS ATH GENERIC UTILITIES M: Kalle Valo +M: Jeff Johnson L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/ath/* From 8b55f8818900c99dd4f55a59a103f5b29e41eb2c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 18 Oct 2024 15:14:42 +0000 Subject: [PATCH 064/337] media: mediatek: vcodec: mark vdec_vp9_slice_map_counts_eob_coef noinline With KASAN enabled, clang fails to optimize the inline version of vdec_vp9_slice_map_counts_eob_coef() properly, leading to kilobytes of temporary values spilled to the stack: drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c:1526:12: error: stack frame size (2160) exceeds limit (2048) in 'vdec_vp9_slice_update_prob' [-Werror,-Wframe-larger-than] This seems to affect all versions of clang including the latest (clang-20), but the degree of stack overhead is different per release. Marking the function as noinline_for_stack is harmless here and avoids the problem completely. 
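The shape of the fix, reduced to a generic sketch (type and function names are illustrative): noinline_for_stack simply expands to noinline, so the helper keeps its own stack frame and its KASAN-instrumented temporaries are no longer accumulated in the caller.

  /* Before: plain 'static' -- clang inlines the helper under KASAN and the
     caller's frame exceeds the 2048-byte -Wframe-larger-than limit. */
  static noinline_for_stack
  void map_counts_eob_coef(unsigned int i, unsigned int j, unsigned int k,
                           struct frame_counts *counts,
                           struct v4l2_vp9_frame_symbol_counts *counts_helper)
  {
          /* large body now confined to its own frame */
  }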
Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: Sebastian Fricke Signed-off-by: Mauro Carvalho Chehab --- .../mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c index eea709d93820..47c302745c1d 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c @@ -1188,7 +1188,8 @@ err: return ret; } -static +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack void vdec_vp9_slice_map_counts_eob_coef(unsigned int i, unsigned int j, unsigned int k, struct vdec_vp9_slice_frame_counts *counts, struct v4l2_vp9_frame_symbol_counts *counts_helper) From 080b2e7b5e9ad23343e4b11f0751e4c724a78958 Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 12 Dec 2024 11:00:41 +0000 Subject: [PATCH 065/337] drm/display: use ERR_PTR on DP tunnel manager creation fail Instead of returning a generic NULL on error from drm_dp_tunnel_mgr_create(), use error pointers with informative codes to align the function with stub that is executed when CONFIG_DRM_DISPLAY_DP_TUNNEL is unset. This will also trigger IS_ERR() in current caller (intel_dp_tunnerl_mgr_init()) instead of bypassing it via NULL pointer. v2: use error codes inside drm_dp_tunnel_mgr_create() instead of handling on caller's side (Michal, Imre) v3: fixup commit message and add "CC"/"Fixes" lines (Andi), mention aligning function code with stub Fixes: 91888b5b1ad2 ("drm/i915/dp: Add support for DP tunnel BW allocation") Cc: Imre Deak Cc: # v6.9+ Signed-off-by: Krzysztof Karas Reviewed-by: Andi Shyti Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/7q4fpnmmztmchczjewgm6igy55qt6jsm7tfd4fl4ucfq6yg2oy@q4lxtsu6445c --- drivers/gpu/drm/display/drm_dp_tunnel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/display/drm_dp_tunnel.c b/drivers/gpu/drm/display/drm_dp_tunnel.c index 48b2df120086..90fe07a89260 100644 --- a/drivers/gpu/drm/display/drm_dp_tunnel.c +++ b/drivers/gpu/drm/display/drm_dp_tunnel.c @@ -1896,8 +1896,8 @@ static void destroy_mgr(struct drm_dp_tunnel_mgr *mgr) * * Creates a DP tunnel manager for @dev. * - * Returns a pointer to the tunnel manager if created successfully or NULL in - * case of an error. + * Returns a pointer to the tunnel manager if created successfully or error + * pointer in case of failure. 
*/ struct drm_dp_tunnel_mgr * drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) @@ -1907,7 +1907,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); if (!mgr) - return NULL; + return ERR_PTR(-ENOMEM); mgr->dev = dev; init_waitqueue_head(&mgr->bw_req_queue); @@ -1916,7 +1916,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) if (!mgr->groups) { kfree(mgr); - return NULL; + return ERR_PTR(-ENOMEM); } #ifdef CONFIG_DRM_DISPLAY_DP_TUNNEL_STATE_DEBUG @@ -1927,7 +1927,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) if (!init_group(mgr, &mgr->groups[i])) { destroy_mgr(mgr); - return NULL; + return ERR_PTR(-ENOMEM); } mgr->group_count++; From 9398332f23fab10c5ec57c168b44e72997d6318e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 29 Nov 2024 06:26:28 +0200 Subject: [PATCH 066/337] drm/modes: Avoid divide by zero harder in drm_mode_vrefresh() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_mode_vrefresh() is trying to avoid divide by zero by checking whether htotal or vtotal are zero. But we may still end up with a div-by-zero of vtotal*htotal*... Cc: stable@vger.kernel.org Reported-by: syzbot+622bba18029bcde672e1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=622bba18029bcde672e1 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20241129042629.18280-2-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula --- drivers/gpu/drm/drm_modes.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 6ba167a33461..71573b85d924 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -1287,14 +1287,11 @@ EXPORT_SYMBOL(drm_mode_set_name); */ int drm_mode_vrefresh(const struct drm_display_mode *mode) { - unsigned int num, den; + unsigned int num = 1, den = 1; if (mode->htotal == 0 || mode->vtotal == 0) return 0; - num = mode->clock; - den = mode->htotal * mode->vtotal; - if (mode->flags & DRM_MODE_FLAG_INTERLACE) num *= 2; if (mode->flags & DRM_MODE_FLAG_DBLSCAN) @@ -1302,6 +1299,12 @@ int drm_mode_vrefresh(const struct drm_display_mode *mode) if (mode->vscan > 1) den *= mode->vscan; + if (check_mul_overflow(mode->clock, num, &num)) + return 0; + + if (check_mul_overflow(mode->htotal * mode->vtotal, den, &den)) + return 0; + return DIV_ROUND_CLOSEST_ULL(mul_u32_u32(num, 1000), den); } EXPORT_SYMBOL(drm_mode_vrefresh); From 24740385cb0d6d22ab7fa7adf36546d5b3cdcf73 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 15 Nov 2024 11:54:40 +0200 Subject: [PATCH 067/337] thunderbolt: Improve redrive mode handling When USB-C monitor is connected directly to Intel Barlow Ridge host, it goes into "redrive" mode that basically routes the DisplayPort signals directly from the GPU to the USB-C monitor without any tunneling needed. However, the host router must be powered on for this to work. Aaron reported that there are a couple of cases where this will not work with the current code: - Booting with USB-C monitor plugged in. - Plugging in USB-C monitor when the host router is in sleep state (runtime suspended). - Plugging in USB-C device while the system is in system sleep state. In all these cases once the host router is runtime suspended the picture on the connected USB-C display disappears too. This is certainly not what the user expected. 
For this reason improve the redrive mode handling to keep the host router from runtime suspending when detect that any of the above cases is happening. Fixes: a75e0684efe5 ("thunderbolt: Keep the domain powered when USB4 port is in redrive mode") Reported-by: Aaron Rainbolt Closes: https://lore.kernel.org/linux-usb/20241009220118.70bfedd0@kf-ir16/ Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 4f777788e917..a7c6919fbf97 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -2059,6 +2059,37 @@ static void tb_exit_redrive(struct tb_port *port) } } +static void tb_switch_enter_redrive(struct tb_switch *sw) +{ + struct tb_port *port; + + tb_switch_for_each_port(sw, port) + tb_enter_redrive(port); +} + +/* + * Called during system and runtime suspend to forcefully exit redrive + * mode without querying whether the resource is available. + */ +static void tb_switch_exit_redrive(struct tb_switch *sw) +{ + struct tb_port *port; + + if (!(sw->quirks & QUIRK_KEEP_POWER_IN_DP_REDRIVE)) + return; + + tb_switch_for_each_port(sw, port) { + if (!tb_port_is_dpin(port)) + continue; + + if (port->redrive) { + port->redrive = false; + pm_runtime_put(&sw->dev); + tb_port_dbg(port, "exit redrive mode\n"); + } + } +} + static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port) { struct tb_port *in, *out; @@ -2909,6 +2940,7 @@ static int tb_start(struct tb *tb, bool reset) tb_create_usb3_tunnels(tb->root_switch); /* Add DP IN resources for the root switch */ tb_add_dp_resources(tb->root_switch); + tb_switch_enter_redrive(tb->root_switch); /* Make the discovered switches available to the userspace */ device_for_each_child(&tb->root_switch->dev, NULL, tb_scan_finalize_switch); @@ -2924,6 +2956,7 @@ static int tb_suspend_noirq(struct tb *tb) tb_dbg(tb, "suspending...\n"); tb_disconnect_and_release_dp(tb); + tb_switch_exit_redrive(tb->root_switch); tb_switch_suspend(tb->root_switch, false); tcm->hotplug_active = false; /* signal tb_handle_hotplug to quit */ tb_dbg(tb, "suspend finished\n"); @@ -3016,6 +3049,7 @@ static int tb_resume_noirq(struct tb *tb) tb_dbg(tb, "tunnels restarted, sleeping for 100ms\n"); msleep(100); } + tb_switch_enter_redrive(tb->root_switch); /* Allow tb_handle_hotplug to progress events */ tcm->hotplug_active = true; tb_dbg(tb, "resume finished\n"); @@ -3079,6 +3113,12 @@ static int tb_runtime_suspend(struct tb *tb) struct tb_cm *tcm = tb_priv(tb); mutex_lock(&tb->lock); + /* + * The below call only releases DP resources to allow exiting and + * re-entering redrive mode. 
+ */ + tb_disconnect_and_release_dp(tb); + tb_switch_exit_redrive(tb->root_switch); tb_switch_suspend(tb->root_switch, true); tcm->hotplug_active = false; mutex_unlock(&tb->lock); @@ -3110,6 +3150,7 @@ static int tb_runtime_resume(struct tb *tb) tb_restore_children(tb->root_switch); list_for_each_entry_safe(tunnel, n, &tcm->tunnel_list, list) tb_tunnel_restart(tunnel); + tb_switch_enter_redrive(tb->root_switch); tcm->hotplug_active = true; mutex_unlock(&tb->lock); From a60b990798eb17433d0283788280422b1bd94b18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Dec 2024 12:50:18 +0100 Subject: [PATCH 068/337] PCI/MSI: Handle lack of irqdomain gracefully Alexandre observed a warning emitted from pci_msi_setup_msi_irqs() on a RISCV platform which does not provide PCI/MSI support: WARNING: CPU: 1 PID: 1 at drivers/pci/msi/msi.h:121 pci_msi_setup_msi_irqs+0x2c/0x32 __pci_enable_msix_range+0x30c/0x596 pci_msi_setup_msi_irqs+0x2c/0x32 pci_alloc_irq_vectors_affinity+0xb8/0xe2 RISCV uses hierarchical interrupt domains and correctly does not implement the legacy fallback. The warning triggers from the legacy fallback stub. That warning is bogus as the PCI/MSI layer knows whether a PCI/MSI parent domain is associated with the device or not. There is a check for MSI-X, which has a legacy assumption. But that legacy fallback assumption is only valid when legacy support is enabled, but otherwise the check should simply return -ENOTSUPP. Loongarch tripped over the same problem and blindly enabled legacy support without implementing the legacy fallbacks. There are weak implementations which return an error, so the problem was papered over. Correct pci_msi_domain_supports() to evaluate the legacy mode and add the missing supported check into the MSI enable path to complete it. Fixes: d2a463b29741 ("PCI/MSI: Reject multi-MSI early") Reported-by: Alexandre Ghiti Signed-off-by: Thomas Gleixner Tested-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ed2a8ow5.ffs@tglx --- drivers/pci/msi/irqdomain.c | 7 +++++-- drivers/pci/msi/msi.c | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 569125726b3e..d7ba8795d60f 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -350,8 +350,11 @@ bool pci_msi_domain_supports(struct pci_dev *pdev, unsigned int feature_mask, domain = dev_get_msi_domain(&pdev->dev); - if (!domain || !irq_domain_is_hierarchy(domain)) - return mode == ALLOW_LEGACY; + if (!domain || !irq_domain_is_hierarchy(domain)) { + if (IS_ENABLED(CONFIG_PCI_MSI_ARCH_FALLBACKS)) + return mode == ALLOW_LEGACY; + return false; + } if (!irq_domain_is_msi_parent(domain)) { /* diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 3a45879d85db..2f647cac4cae 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -433,6 +433,10 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, if (WARN_ON_ONCE(dev->msi_enabled)) return -EINVAL; + /* Test for the availability of MSI support */ + if (!pci_msi_domain_supports(dev, 0, ALLOW_LEGACY)) + return -ENOTSUPP; + nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; From 88438444fdddd0244c8b2697713adcca3e71599e Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Fri, 13 Dec 2024 11:41:46 +0530 Subject: [PATCH 069/337] ASoC: amd: ps: Fix for enabling DMIC on acp63 platform via _DSD entry Add condition check to register ACP PDM sound card by reading _WOV acpi entry. 
Fixes: 0386d765f27a ("ASoC: amd: ps: refactor acp device configuration read logic") Signed-off-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20241213061147.1060451-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/ps/pci-ps.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/sound/soc/amd/ps/pci-ps.c b/sound/soc/amd/ps/pci-ps.c index 823a69bf778b..4575326d0635 100644 --- a/sound/soc/amd/ps/pci-ps.c +++ b/sound/soc/amd/ps/pci-ps.c @@ -375,11 +375,18 @@ static int get_acp63_device_config(struct pci_dev *pci, struct acp63_dev_data *a { struct acpi_device *pdm_dev; const union acpi_object *obj; + acpi_handle handle; + acpi_integer dmic_status; u32 config; bool is_dmic_dev = false; bool is_sdw_dev = false; + bool wov_en, dmic_en; int ret; + /* IF WOV entry not found, enable dmic based on acp-audio-device-type entry*/ + wov_en = true; + dmic_en = false; + config = readl(acp_data->acp63_base + ACP_PIN_CONFIG); switch (config) { case ACP_CONFIG_4: @@ -412,10 +419,18 @@ static int get_acp63_device_config(struct pci_dev *pci, struct acp63_dev_data *a if (!acpi_dev_get_property(pdm_dev, "acp-audio-device-type", ACPI_TYPE_INTEGER, &obj) && obj->integer.value == ACP_DMIC_DEV) - is_dmic_dev = true; + dmic_en = true; } + + handle = ACPI_HANDLE(&pci->dev); + ret = acpi_evaluate_integer(handle, "_WOV", NULL, &dmic_status); + if (!ACPI_FAILURE(ret)) + wov_en = dmic_status; } + if (dmic_en && wov_en) + is_dmic_dev = true; + if (acp_data->is_sdw_config) { ret = acp_scan_sdw_devices(&pci->dev, ACP63_SDW_ADDR); if (!ret && acp_data->info.link_mask) From 38651476e46e088598354510502c383e932e2297 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 11 Dec 2024 14:09:27 +0530 Subject: [PATCH 070/337] RDMA/bnxt_re: Fix the check for 9060 condition The check for 9060 condition should only be made for legacy chips. Fixes: 9152e0b722b2 ("RDMA/bnxt_re: HW workarounds for handling specific conditions") Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 72f35070f671..093bfb748cdf 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2669,10 +2669,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } else { /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sq->swq_last, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->cctx)) { + if (do_wa9060(qp, cq, cq_cons, sq->swq_last, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { cqe->status = CQ_REQ_STATUS_OK; From 798653a0ee30d3cd495099282751c0f248614ae7 Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Wed, 11 Dec 2024 14:09:28 +0530 Subject: [PATCH 071/337] RDMA/bnxt_re: Add check for path mtu in modify_qp When RDMA app configures path MTU, add a check in modify_qp verb to make sure that it doesn't go beyond interface MTU. If this check fails, driver will fail the modify_qp verb. 
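From the application side this surfaces through the normal verbs flow; a rough libibverbs sketch (hypothetical helper, RoCE addressing details reduced to a minimum):

  #include <infiniband/verbs.h>

  /* Request a path MTU larger than the port's netdev MTU (say 1500 bytes):
     with this fix ibv_modify_qp() returns an error instead of programming
     an unusable value into the QP. */
  static int move_to_rtr(struct ibv_qp *qp, uint32_t dest_qpn,
                         union ibv_gid *dgid, uint32_t rq_psn)
  {
          struct ibv_qp_attr attr = {
                  .qp_state           = IBV_QPS_RTR,
                  .path_mtu           = IBV_MTU_4096,
                  .dest_qp_num        = dest_qpn,
                  .rq_psn             = rq_psn,
                  .max_dest_rd_atomic = 1,
                  .min_rnr_timer      = 12,
                  .ah_attr = {
                          .is_global = 1,
                          .grh       = { .dgid = *dgid, .hop_limit = 1 },
                          .port_num  = 1,
                  },
          };

          return ibv_modify_qp(qp, &attr,
                               IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
                               IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
                               IBV_QP_MAX_DEST_RD_ATOMIC |
                               IBV_QP_MIN_RNR_TIMER);
  }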
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Kalesh AP Signed-off-by: Saravanan Vajravel Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 26 +++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 215074c0860b..a609e1635a3d 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2162,18 +2162,20 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, } } - if (qp_attr_mask & IB_QP_PATH_MTU) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); - qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); - } else if (qp_attr->qp_state == IB_QPS_RTR) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = - __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); - qp->qplib_qp.mtu = - ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + if (qp_attr->qp_state == IB_QPS_RTR) { + enum ib_mtu qpmtu; + + qpmtu = iboe_get_mtu(rdev->netdev->mtu); + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (ib_mtu_enum_to_int(qp_attr->path_mtu) > + ib_mtu_enum_to_int(qpmtu)) + return -EINVAL; + qpmtu = qp_attr->path_mtu; + } + + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qpmtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qpmtu); } if (qp_attr_mask & IB_QP_TIMEOUT) { From da2132e683954e7ddda3cd674e866a847b7389eb Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Wed, 11 Dec 2024 14:09:29 +0530 Subject: [PATCH 072/337] RDMA/bnxt_re: Fix setting mandatory attributes for modify_qp Firmware expects "min_rnr_timer" as a mandatory attribute in MODIFY_QP command during the RTR-RTS transition. This needs to be enforced by the driver which is missing while setting bnxt_set_mandatory_attributes that sends these flags as part of modify_qp optimization. 
Fixes: 82c32d219272 ("RDMA/bnxt_re: Add support for optimized modify QP") Reviewed-by: Rukhsana Ansari Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 13 +++++++++++-- drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 +++++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 093bfb748cdf..5169804e6f12 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1285,7 +1285,8 @@ static void __filter_modify_flags(struct bnxt_qplib_qp *qp) } } -static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, +static void bnxt_set_mandatory_attributes(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp, struct cmdq_modify_qp *req) { u32 mandatory_flags = 0; @@ -1300,6 +1301,14 @@ static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; } + if (_is_min_rnr_in_rtr_rts_mandatory(res->dattr->dev_cap_flags2) && + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + } + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_UD || qp->type == CMDQ_MODIFY_QP_QP_TYPE_GSI) mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; @@ -1340,7 +1349,7 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) /* Set mandatory attributes for INIT -> RTR and RTR -> RTS transition */ if (_is_optimize_modify_qp_supported(res->dattr->dev_cap_flags2) && is_optimized_state_transition(qp)) - bnxt_set_mandatory_attributes(qp, &req); + bnxt_set_mandatory_attributes(res, qp, &req); } bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 21fb148713a6..cbfc49a1a56d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -584,6 +584,11 @@ static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; } +static inline bool _is_min_rnr_in_rtr_rts_mandatory(u16 dev_cap_ext_flags2) +{ + return !!(dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED); +} + static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) { return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index a98fc9c2313e..0ee60fdc18b3 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2215,6 +2215,7 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL __le16 max_xp_qp_size; __le16 create_qp_batch_size; __le16 destroy_qp_batch_size; From 34db8ec931b84d1426423f263b1927539e73b397 Mon Sep 17 00:00:00 2001 
From: Hongguang Gao Date: Wed, 11 Dec 2024 14:09:30 +0530 Subject: [PATCH 073/337] RDMA/bnxt_re: Fix to export port num to ib_query_qp Current driver implementation doesn't populate the port_num field in query_qp. Adding the code to convert internal firmware port id to ibv defined port number and export it. Reviewed-by: Saravanan Vajravel Reviewed-by: Kalesh AP Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + drivers/infiniband/hw/bnxt_re/ib_verbs.h | 4 ++++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + 4 files changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index a609e1635a3d..bcb7cfc63d09 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2325,6 +2325,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->retry_cnt = qplib_qp->retry_cnt; qp_attr->rnr_retry = qplib_qp->rnr_retry; qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->port_num = __to_ib_port_num(qplib_qp->port_id); qp_attr->rq_psn = qplib_qp->rq.psn; qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; qp_attr->sq_psn = qplib_qp->sq.psn; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index ac59f1d73b15..fbb16a411d6a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -268,6 +268,10 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +static inline u32 __to_ib_port_num(u16 port_id) +{ + return (u32)port_id + 1; +} unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 5169804e6f12..d8a2a929bbe3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1532,6 +1532,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); memcpy(qp->smac, sb->src_mac, 6); qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); + qp->port_id = le16_to_cpu(sb->port_id); bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 19e279871f10..0660101b5310 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -298,6 +298,7 @@ struct bnxt_qplib_qp { u32 dest_qpn; u8 smac[6]; u16 vlan_id; + u16 port_id; u8 nw_type; struct bnxt_qplib_ah ah; From 7179fe0074a3c962e43a9e51169304c4911989ed Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 11 Dec 2024 14:09:31 +0530 Subject: [PATCH 074/337] RDMA/bnxt_re: Fix reporting hw_ver in query_device Driver currently populates subsystem_device id in the "hw_ver" field of ib_attr structure in query_device. Updated to populate PCI revision ID. 
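The field in question is what applications read back via ibv_query_device(); a small hypothetical illustration, given an open struct ibv_context *ctx:

  struct ibv_device_attr attr;

  if (!ibv_query_device(ctx, &attr))
          /* With this change attr.hw_ver reports the PCI revision ID
             (as in /sys/bus/pci/devices/<bdf>/revision) rather than the
             subsystem device id. */
          printf("hw_ver: 0x%x\n", attr.hw_ver);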
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Preethi G Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bcb7cfc63d09..e3d26bd6de05 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -199,7 +199,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; - ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->hw_ver = rdev->en_dev->pdev->revision; ib_attr->max_qp = dev_attr->max_qp; ib_attr->max_qp_wr = dev_attr->max_qp_wqes; ib_attr->device_cap_flags = From 7c449ef0fdce540bfb235a2d93e7184864c3388b Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 16 Dec 2024 22:08:20 +0800 Subject: [PATCH 075/337] ASoC: Intel: sof_sdw: Fix DMI match for Lenovo 21Q6 and 21Q7 Update the DMI match for a Lenovo laptop to the new DMI identifier. This laptop ships with a different DMI identifier to what was expected, and now has two identifiers. Signed-off-by: Richard Fitzgerald Fixes: 83c062ae81e8 ("ASoC: Intel: sof_sdw: Add quirks for some new Lenovo laptops") Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241216140821.153670-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 810be7c949a5..e20ab6bfa5dd 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -641,9 +641,17 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "233B") + DMI_MATCH(DMI_PRODUCT_NAME, "21Q6") }, - .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS), + .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS | SOC_SDW_CODEC_MIC), + }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21Q7") + }, + .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS | SOC_SDW_CODEC_MIC), }, /* ArrowLake devices */ From ba7d47a54bf23a7201bdd2978e16b04fc1cb1f6e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 16 Dec 2024 22:08:21 +0800 Subject: [PATCH 076/337] ASoC: Intel: sof_sdw: Fix DMI match for Lenovo 21QA and 21QB Update the DMI match for a Lenovo laptop to the new DMI identifier. This laptop ships with a different DMI identifier to what was expected, and now has two identifiers. 
Signed-off-by: Richard Fitzgerald Fixes: ea657f6b24e1 ("ASoC: Intel: sof_sdw: Add quirk for cs42l43 system using host DMICs") Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241216140821.153670-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index e20ab6bfa5dd..667103027f7e 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -632,7 +632,16 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "233C") + DMI_MATCH(DMI_PRODUCT_NAME, "21QB") + }, + /* Note this quirk excludes the CODEC mic */ + .driver_data = (void *)(SOC_SDW_CODEC_MIC), + }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21QA") }, /* Note this quirk excludes the CODEC mic */ .driver_data = (void *)(SOC_SDW_CODEC_MIC), From 6f4a0fd03ce856c6d9811429b9969b4f27e2eaee Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 11 Dec 2024 11:54:02 +0800 Subject: [PATCH 077/337] ASoC: dt-bindings: realtek,rt5645: Fix CPVDD voltage comment Both the ALC5645 and ALC5650 datasheets specify a recommended voltage of 1.8V for CPVDD, not 3.5V. Fix the comment. Cc: Matthias Brugger Fixes: 26aa19174f0d ("ASoC: dt-bindings: rt5645: add suppliers") Fixes: 83d43ab0a1cb ("ASoC: dt-bindings: realtek,rt5645: Convert to dtschema") Signed-off-by: Chen-Yu Tsai Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241211035403.4157760-1-wenst@chromium.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/realtek,rt5645.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml b/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml index 13f09f1bc800..0a698798c22b 100644 --- a/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml +++ b/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml @@ -51,7 +51,7 @@ properties: description: Power supply for AVDD, providing 1.8V. cpvdd-supply: - description: Power supply for CPVDD, providing 3.5V. + description: Power supply for CPVDD, providing 1.8V. hp-detect-gpios: description: From 65c8c78cc74d5bcbc43f1f785a004796a2d78360 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 12 Dec 2024 21:13:10 +0100 Subject: [PATCH 078/337] thermal/thresholds: Fix uapi header macros leading to a compilation error The macros giving the direction of the crossing thresholds use the BIT macro which is not exported to the userspace. Consequently when an userspace program includes the header, it fails to compile. Replace the macros by their litteral to allow the compilation of userspace program using this header. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241212201311.4143196-1-daniel.lezcano@linaro.org [ rjw: Add Fixes: ] Signed-off-by: Rafael J. 
Wysocki --- include/uapi/linux/thermal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index ba8604bdf206..349718c271eb 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -3,8 +3,8 @@ #define _UAPI_LINUX_THERMAL_H #define THERMAL_NAME_LENGTH 20 -#define THERMAL_THRESHOLD_WAY_UP BIT(0) -#define THERMAL_THRESHOLD_WAY_DOWN BIT(1) +#define THERMAL_THRESHOLD_WAY_UP 0x1 +#define THERMAL_THRESHOLD_WAY_DOWN 0x2 enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, From d6fd6f8280f0257ba93f16900a0d3d3912f32c79 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 5 Dec 2024 16:49:51 +0100 Subject: [PATCH 079/337] ceph: fix memory leaks in __ceph_sync_read() In two `break` statements, the call to ceph_release_page_vector() was missing, leaking the allocation from ceph_alloc_page_vector(). Instead of adding the missing ceph_release_page_vector() calls, the Ceph maintainers preferred to transfer page ownership to the `ceph_osd_request` by passing `own_pages=true` to osd_req_op_extent_osd_data_pages(). This requires postponing the ceph_osdc_put_request() call until after the block that accesses the `pages`. Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Fixes: f0fe1e54cfcf ("ceph: plumb in decryption during reads") Signed-off-by: Max Kellermann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4b8d59ebda00..ce342a5d4b8b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1127,7 +1127,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, offset_in_page(read_off), - false, false); + false, true); op = &req->r_ops[0]; if (sparse) { @@ -1186,8 +1186,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret = min_t(ssize_t, fret, len); } - ceph_osdc_put_request(req); - /* Short read but not EOF? Zero out the remainder. */ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); @@ -1221,7 +1219,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, break; } } - ceph_release_page_vector(pages, num_pages); + + ceph_osdc_put_request(req); if (ret < 0) { if (ret == -EBLOCKLISTED) From 550f7ca98ee028a606aa75705a7e77b1bd11720f Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 18 Nov 2024 23:28:28 +0100 Subject: [PATCH 080/337] ceph: give up on paths longer than PATH_MAX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the full path to be built by ceph_mdsc_build_path() happens to be longer than PATH_MAX, then this function will enter an endless (retry) loop, effectively blocking the whole task. Most of the machine becomes unusable, making this a very simple and effective DoS vulnerability. I cannot imagine why this retry was ever implemented, but it seems rather useless and harmful to me. Let's remove it and fail with ENAMETOOLONG instead. 
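Callers see the failure through the usual pointer-encoded errno convention from <linux/err.h>, which is also what the hunk below uses on the producer side; a minimal sketch of both sides, with the caller's arguments elided and the consumer shown only schematically:

    /* producer: encode the errno in the returned pointer */
    if (pos < 0)
            return ERR_PTR(-ENAMETOOLONG);

    /* consumer (schematic, not an actual ceph caller): */
    char *path = ceph_mdsc_build_path(/* ... */);
    if (IS_ERR(path))
            return PTR_ERR(path);   /* e.g. -ENAMETOOLONG */
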
Cc: stable@vger.kernel.org Reported-by: Dario Weißer Signed-off-by: Max Kellermann Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 219a2cc2bf3c..785fe489ef4b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2800,12 +2800,11 @@ retry: if (pos < 0) { /* - * A rename didn't occur, but somehow we didn't end up where - * we thought we would. Throw a warning and try again. + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. */ - pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", - pos); - goto retry; + return ERR_PTR(-ENAMETOOLONG); } *pbase = base; From 12eb22a5a609421b380c3c6ca887474fb2089b2c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Nov 2024 16:43:51 +0100 Subject: [PATCH 081/337] ceph: validate snapdirname option length when mounting It becomes a path component, so it shouldn't exceed NAME_MAX characters. This was hardened in commit c152737be22b ("ceph: Use strscpy() instead of strcpy() in __get_snap_name()"), but no actual check was put in place. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index de03cd6eb86e..4344e1f11806 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -431,6 +431,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, switch (token) { case Opt_snapdirname: + if (strlen(param->string) > NAME_MAX) + return invalfc(fc, "snapdirname too long"); kfree(fsopt->snapdir_name); fsopt->snapdir_name = param->string; param->string = NULL; From 9abee475803fab6ad59d4f4fc59c6a75374a7d9d Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Wed, 27 Nov 2024 15:34:10 +0200 Subject: [PATCH 082/337] ceph: improve error handling and short/overflow-read logic in __ceph_sync_read() This patch refines the read logic in __ceph_sync_read() to ensure more predictable and efficient behavior in various edge cases. - Return early if the requested read length is zero or if the file size (`i_size`) is zero. - Initialize the index variable (`idx`) where needed and reorder some code to ensure it is always set before use. - Improve error handling by checking for negative return values earlier. - Remove redundant encrypted file checks after failures. Only attempt filesystem-level decryption if the read succeeded. - Simplify leftover calculations to correctly handle cases where the read extends beyond the end of the file or stops short. This can be hit by continuously reading a file while, on another client, we keep truncating and writing new data into it. 
- This resolves multiple issues caused by integer and consequent buffer overflow (`pages` array being accessed beyond `num_pages`): - https://tracker.ceph.com/issues/67524 - https://tracker.ceph.com/issues/68980 - https://tracker.ceph.com/issues/68981 Cc: stable@vger.kernel.org Fixes: 1065da21e5df ("ceph: stop copying to iter at EOF on sync reads") Reported-by: Luis Henriques (SUSE) Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce342a5d4b8b..8e0400d461a2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, if (ceph_inode_is_shutdown(inode)) return -EIO; - if (!len) + if (!len || !i_size) return 0; /* * flush any page cache pages in this range. this @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, int num_pages; size_t page_off; bool more; - int idx; + int idx = 0; size_t left; struct ceph_osd_req_op *op; u64 read_off = off; @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, else if (ret == -ENOENT) ret = 0; - if (ret > 0 && IS_ENCRYPTED(inode)) { + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { int fret; fret = ceph_fscrypt_decrypt_extents(inode, pages, @@ -1187,7 +1194,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } /* Short read but not EOF? Zero out the remainder. */ - if (ret >= 0 && ret < len && (off + ret < i_size)) { + if (ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; @@ -1197,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret += zlen; } - idx = 0; - if (ret <= 0) - left = 0; - else if (off + ret > i_size) - left = i_size - off; + if (off + ret > i_size) + left = (i_size > off) ? i_size - off : 0; else left = ret; + while (left > 0) { size_t plen, copied; @@ -1222,12 +1227,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ceph_osdc_put_request(req); - if (ret < 0) { - if (ret == -EBLOCKLISTED) - fsc->blocklisted = true; - break; - } - if (off >= i_size || !more) break; } From 66e0c4f91461d17d48071695271c824620bed4ef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 6 Dec 2024 17:32:59 +0100 Subject: [PATCH 083/337] ceph: fix memory leak in ceph_direct_read_write() The bvecs array which is allocated in iter_get_bvecs_alloc() is leaked and pages remain pinned if ceph_alloc_sparse_ext_map() fails. There is no need to delay the allocation of sparse_ext map until after the bvecs array is set up, so fix this by moving sparse_ext allocation a bit earlier. Also, make a similar adjustment in __ceph_sync_read() for consistency (a leak of the same kind in __ceph_sync_read() has been addressed differently). 
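The general shape of the fix is an ordering rule for acquisition paths: take the resource whose failure needs no unwinding first, and only then the one that pins pages or otherwise needs explicit release. A schematic fragment of that ordering with made-up helper names (this is not the ceph code; only ceph_osdc_put_request() is from the patch):

    err = alloc_small_map(req);           /* fails cleanly, nothing extra to undo */
    if (err) {
            ceph_osdc_put_request(req);   /* only the request to drop             */
            return err;
    }

    err = pin_pages_for_io(iter, &pages); /* from here on, pages must be released */
    if (err) {                            /* on every error path                  */
            ceph_osdc_put_request(req);
            return err;
    }
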
Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8e0400d461a2..67468d88f139 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1116,6 +1116,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, len = read_off + read_len - off; more = len < iov_iter_count(to); + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + num_pages = calc_pages_for(read_off, read_len); page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1129,16 +1139,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, offset_in_page(read_off), false, true); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); @@ -1551,6 +1551,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); if (len < 0) { ceph_osdc_put_request(req); @@ -1560,6 +1570,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (len != size) osd_req_op_extent_update(req, 0, len); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + /* * To simplify error handling, allow AIO when IO within i_size * or IO can be satisfied by single OSD request. @@ -1591,17 +1603,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, size); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; From 18d44c5d062b97b97bb0162d9742440518958dc1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 7 Dec 2024 17:33:25 +0100 Subject: [PATCH 084/337] ceph: allocate sparse_ext map only for sparse reads If mounted with sparseread option, ceph_direct_read_write() ends up making an unnecessarily allocation for O_DIRECT writes. 
Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 2 +- net/ceph/osd_client.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 67468d88f139..851d70200c6b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1552,7 +1552,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } op = &req->r_ops[0]; - if (sparse) { + if (!write && sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, size); ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9b1168eb77ab..b24afec24138 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1173,6 +1173,8 @@ EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { + WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); + op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), From 74d7e038fd072635d21e4734e3223378e09168d3 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:46 +0300 Subject: [PATCH 085/337] hwmon: (tmp513) Fix interpretation of values of Shunt Voltage and Limit Registers The values returned by the driver after processing the contents of the Shunt Voltage Register and the Shunt Limit Registers do not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. Moreover, the PGA shift calculated with the tmp51x_get_pga_shift function is relevant only to the Shunt Voltage Register, but is also applied to the Shunt Limit Registers. According to the TMP512 and TMP513 datasheets, the Shunt Voltage Register (04h) is 13 to 16 bit two's complement integer value, depending on the PGA setting. The Shunt Positive (0Ch) and Negative (0Dh) Limit Registers are 16-bit two's complement integer values. Below are some examples: * Shunt Voltage Register If PGA = 8, and regval = 1000 0011 0000 0000, then the decimal value must be -32000, but the value calculated by the driver will be 33536. * Shunt Limit Register If regval = 1000 0011 0000 0000, then the decimal value must be -32000, but the value calculated by the driver will be 768, if PGA = 1. Fix sign bit index, and also correct misleading comment describing the tmp51x_get_pga_shift function. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
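The off-by-one is easiest to see with the helper itself: sign_extend32() (from <linux/bitops.h>) takes the 0-based index of the sign bit as its second argument. A small userspace illustration, re-implementing the helper with the same logic, reproduces the numbers from the example above:

    #include <stdint.h>
    #include <stdio.h>

    /* same logic as the kernel's sign_extend32(): "index" is the 0-based
     * position of the field's sign bit */
    static int32_t sign_extend32(uint32_t value, int index)
    {
            uint8_t shift = 31 - index;
            return (int32_t)(value << shift) >> shift;
    }

    int main(void)
    {
            /* 0x8300 as read from a 16-bit two's complement register */
            printf("%d\n", sign_extend32(0x8300, 15)); /* -32000, correct        */
            printf("%d\n", sign_extend32(0x8300, 16)); /*  33536, the old result */
            return 0;
    }
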
Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-2-m.masimov@maxima.ru [groeck: Fixed description and multi-line alignments] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 926d28cd3fab..d87fcea3ef24 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -182,7 +182,7 @@ struct tmp51x_data { struct regmap *regmap; }; -// Set the shift based on the gain 8=4, 4=3, 2=2, 1=1 +// Set the shift based on the gain: 8 -> 1, 4 -> 2, 2 -> 3, 1 -> 4 static inline u8 tmp51x_get_pga_shift(struct tmp51x_data *data) { return 5 - ffs(data->pga_gain); @@ -204,7 +204,9 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, * 2's complement number shifted by one to four depending * on the pga gain setting. 1lsb = 10uV */ - *val = sign_extend32(regval, 17 - tmp51x_get_pga_shift(data)); + *val = sign_extend32(regval, + reg == TMP51X_SHUNT_CURRENT_RESULT ? + 16 - tmp51x_get_pga_shift(data) : 15); *val = DIV_ROUND_CLOSEST(*val * 10 * MILLI, data->shunt_uohms); break; case TMP51X_BUS_VOLTAGE_RESULT: From da1d0e6ba211baf6747db74c07700caddfd8a179 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:47 +0300 Subject: [PATCH 086/337] hwmon: (tmp513) Fix Current Register value interpretation The value returned by the driver after processing the contents of the Current Register does not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. Moreover, negative values will be reported as large positive due to missing sign extension from u32 to long. According to the TMP512 and TMP513 datasheets, the Current Register (07h) is a 16-bit two's complement integer value. E.g., if regval = 1000 0011 0000 0000, then the value must be (-32000 * lsb), but the driver will return (33536 * lsb). Fix off-by-one bug, and also cast data->curr_lsb_ua (which is of type u32) to long to prevent incorrect cast for negative values. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
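The added (long) cast matters because of C's usual arithmetic conversions: a negative int multiplied by an unsigned int is computed in unsigned arithmetic. A small standalone illustration for an LP64 target; the lsb value is made up, only the types mirror the driver:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int32_t  raw = -32000; /* e.g. sign_extend32(regval, 15) */
            uint32_t lsb = 61;     /* stand-in for data->curr_lsb_ua */

            long bad  = raw * lsb;       /* int * unsigned int: wraps to
                                          * 4293015296 before widening   */
            long good = raw * (long)lsb; /* widened first: -1952000      */

            printf("%ld %ld\n", bad, good);
            return 0;
    }
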
Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-3-m.masimov@maxima.ru [groeck: Fixed description line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index d87fcea3ef24..2846b1cc515d 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -222,7 +222,7 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, break; case TMP51X_BUS_CURRENT_RESULT: // Current = (ShuntVoltage * CalibrationRegister) / 4096 - *val = sign_extend32(regval, 16) * data->curr_lsb_ua; + *val = sign_extend32(regval, 15) * (long)data->curr_lsb_ua; *val = DIV_ROUND_CLOSEST(*val, MILLI); break; case TMP51X_LOCAL_TEMP_RESULT: From dd471e25770e7e632f736b90db1e2080b2171668 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:48 +0300 Subject: [PATCH 087/337] hwmon: (tmp513) Fix interpretation of values of Temperature Result and Limit Registers The values returned by the driver after processing the contents of the Temperature Result and the Temperature Limit Registers do not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. According to the TMP512 and TMP513 datasheets, the Temperature Result (08h to 0Bh) and Limit (11h to 14h) Registers are 13-bit two's complement integer values, shifted left by 3 bits. The value is scaled by 0.0625 degrees Celsius per bit. E.g., if regval = 1 1110 0111 0000 000, the output should be -25 degrees, but the driver will return +487 degrees. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-4-m.masimov@maxima.ru [groeck: fixed description line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 2846b1cc515d..1c2cb12071b8 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -234,7 +234,7 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, case TMP51X_REMOTE_TEMP_LIMIT_2: case TMP513_REMOTE_TEMP_LIMIT_3: // 1lsb = 0.0625 degrees centigrade - *val = sign_extend32(regval, 16) >> TMP51X_TEMP_SHIFT; + *val = sign_extend32(regval, 15) >> TMP51X_TEMP_SHIFT; *val = DIV_ROUND_CLOSEST(*val * 625, 10); break; case TMP51X_N_FACTOR_AND_HYST_1: From 0efbca0fec7d1665e19fa008ba95daab71f76f4d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 9 Dec 2024 12:06:39 +0100 Subject: [PATCH 088/337] nios2: Use str_yes_no() helper in show_cpuinfo() Remove hard-coded strings by using the str_yes_no() helper function. 
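For reference, the helper adopted here is a trivial one-liner provided by the kernel's string-choice helpers (<linux/string_choices.h> in current kernels), roughly:

    static inline const char *str_yes_no(bool v)
    {
            return v ? "yes" : "no";
    }
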
Signed-off-by: Thorsten Blum Signed-off-by: Dinh Nguyen --- arch/nios2/kernel/cpuinfo.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index 338849c430a5..7b1e8f9128e9 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -143,11 +143,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) " DIV:\t\t%s\n" " BMX:\t\t%s\n" " CDX:\t\t%s\n", - cpuinfo.has_mul ? "yes" : "no", - cpuinfo.has_mulx ? "yes" : "no", - cpuinfo.has_div ? "yes" : "no", - cpuinfo.has_bmx ? "yes" : "no", - cpuinfo.has_cdx ? "yes" : "no"); + str_yes_no(cpuinfo.has_mul), + str_yes_no(cpuinfo.has_mulx), + str_yes_no(cpuinfo.has_div), + str_yes_no(cpuinfo.has_bmx), + str_yes_no(cpuinfo.has_cdx)); seq_printf(m, "Icache:\t\t%ukB, line length: %u\n", From abcc2ddae5f82aa6cfca162e3db643dd33f0a2e8 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:04 -0800 Subject: [PATCH 089/337] i915/guc: Reset engine utilization buffer before registration On GT reset, we store total busyness counts for all engines and re-register the utilization buffer with GuC. At that time we should reset the buffer, so that we don't get spurious busyness counts on subsequent queries. To repro this issue, run igt@perf_pmu@busy-hang followed by igt@perf_pmu@most-busy-idle-check-all for a couple iterations. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit abd318237fa6556c1e5225529af145ef15d5ff0d) Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9ede6f240d79..b1d0c66e166f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1243,6 +1243,21 @@ static void __get_engine_usage_record(struct intel_engine_cs *engine, } while (++i < 6); } +static void __set_engine_usage_record(struct intel_engine_cs *engine, + u32 last_in, u32 id, u32 total) +{ + struct iosys_map rec_map = intel_guc_engine_usage_record_map(engine); + +#define record_write(map_, field_, val_) \ + iosys_map_wr_field(map_, 0, struct guc_engine_usage_record, field_, val_) + + record_write(&rec_map, last_switch_in_stamp, last_in); + record_write(&rec_map, current_context_index, id); + record_write(&rec_map, total_runtime, total); + +#undef record_write +} + static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) { struct intel_engine_guc_stats *stats = &engine->stats.guc; @@ -1543,6 +1558,9 @@ err_trylock: static int guc_action_enable_usage_stats(struct intel_guc *guc) { + struct intel_gt *gt = guc_to_gt(guc); + struct intel_engine_cs *engine; + enum intel_engine_id id; u32 offset = intel_guc_engine_usage_offset(guc); u32 action[] = { INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF, @@ -1550,6 +1568,9 @@ static int guc_action_enable_usage_stats(struct intel_guc *guc) 0, }; + for_each_engine(engine, gt, id) + __set_engine_usage_record(engine, 0, 0xffffffff, 0); + return intel_guc_send(guc, action, ARRAY_SIZE(action)); } From 59a0b46788d58fdcee8d2f6b4e619d264a1799bf Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:05 -0800 
Subject: [PATCH 090/337] i915/guc: Ensure busyness counter increases motonically Active busyness of an engine is calculated using gt timestamp and the context switch in time. While capturing the gt timestamp, it's possible that the context switches out. This race could result in an active busyness value that is greater than the actual context runtime value by a small amount. This leads to a negative delta and throws off busyness calculations for the user. If a subsequent count is smaller than the previous one, just return the previous one, since we expect the busyness to catch up. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-3-umesh.nerlige.ramappa@intel.com (cherry picked from commit cf907f6d294217985e9dafd9985dce874e04ca37) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 5 +++++ drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index ba55c059063d..fe1f85e5dda3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -343,6 +343,11 @@ struct intel_engine_guc_stats { * @start_gt_clk: GT clock time of last idle to active transition. */ u64 start_gt_clk; + + /** + * @total: The last value of total returned + */ + u64 total; }; union intel_engine_tlb_inv_reg { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index b1d0c66e166f..9dcf76d440d8 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1378,9 +1378,12 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) total += intel_gt_clock_interval_to_ns(gt, clk); } + if (total > stats->total) + stats->total = total; + spin_unlock_irqrestore(&guc->timestamp.lock, flags); - return ns_to_ktime(total); + return ns_to_ktime(stats->total); } static void guc_enable_busyness_worker(struct intel_guc *guc) From 1622ed27d26ab4c234476be746aa55bcd39159dd Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:06 -0800 Subject: [PATCH 091/337] i915/guc: Accumulate active runtime on gt reset On gt reset, if a context is running, then accumulate it's active time into the busyness counter since there will be no chance for the context to switch out and update it's run time. 
v2: Move comment right above the if (John) Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-4-umesh.nerlige.ramappa@intel.com (cherry picked from commit 7ed047da59cfa1acb558b95169d347acc8d85da1) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9dcf76d440d8..c0bd730383f2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1449,8 +1449,21 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) { + struct intel_engine_guc_stats *stats = &engine->stats.guc; + guc_update_engine_gt_clks(engine); - engine->stats.guc.prev_total = 0; + + /* + * If resetting a running context, accumulate the active + * time as well since there will be no context switch. + */ + if (stats->running) { + u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk; + + stats->total_gt_clks += clk; + } + stats->prev_total = 0; + stats->running = 0; } spin_unlock_irqrestore(&guc->timestamp.lock, flags); From e21ebe51af688eb98fd6269240212a3c7300deea Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 17 Dec 2024 12:21:21 +0200 Subject: [PATCH 092/337] xhci: Turn NEC specific quirk for handling Stop Endpoint errors generic xHC hosts from several vendors have the same issue where endpoints start so slowly that a later queued 'Stop Endpoint' command may complete before endpoint is up and running. The 'Stop Endpoint' command fails with context state error as the endpoint still appears as stopped. See commit 42b758137601 ("usb: xhci: Limit Stop Endpoint retries") for details CC: stable@vger.kernel.org Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241217102122.2316814-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 4cf5363875c7..09b05a62375e 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1199,8 +1199,6 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, * Keep retrying until the EP starts and stops again, on * chips where this is known to help. Wait for 100ms. */ - if (!(xhci->quirks & XHCI_NEC_HOST)) - break; if (time_is_before_jiffies(ep->stop_time + msecs_to_jiffies(100))) break; fallthrough; From b9252f80b807801056e67e3a672fb1be0ecb81d8 Mon Sep 17 00:00:00 2001 From: Niklas Neronin Date: Tue, 17 Dec 2024 12:21:22 +0200 Subject: [PATCH 093/337] usb: xhci: fix ring expansion regression in 6.13-rc1 The source and destination rings were incorrectly assigned during the ring linking process. The "source" ring, which contains the new segments, was not spliced into the "destination" ring, leading to incorrect ring expansion. 
Fixes: fe688e500613 ("usb: xhci: refactor xhci_link_rings() to use source and destination rings") Reported-by: Jeff Chua Closes: https://lore.kernel.org/lkml/CAAJw_ZtppNqC9XA=-WVQDr+vaAS=di7jo15CzSqONeX48H75MA@mail.gmail.com/ Signed-off-by: Niklas Neronin Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241217102122.2316814-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 15db90c54a45..92703efda1f7 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -436,7 +436,7 @@ int xhci_ring_expansion(struct xhci_hcd *xhci, struct xhci_ring *ring, goto free_segments; } - xhci_link_rings(xhci, ring, &new_ring); + xhci_link_rings(xhci, &new_ring, ring); trace_xhci_ring_expansion(ring); xhci_dbg_trace(xhci, trace_xhci_dbg_ring_expansion, "ring expansion succeed, now has %d segments", From e8d0ba147d901022bcb69da8d8fd817f84e9f3ca Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 17 Dec 2024 11:10:19 +0200 Subject: [PATCH 094/337] ASoC: SOF: Intel: hda-dai: Do not release the link DMA on STOP The linkDMA should not be released on stop trigger since a stream re-start might happen without closing of the stream. This leaves a short time for other streams to 'steal' the linkDMA since it has been released. This issue is not easy to reproduce under normal conditions as usually after stop the stream is closed, or the same stream is restarted, but if another stream got in between the stop and start, like this: aplay -Dhw:0,3 -c2 -r48000 -fS32_LE /dev/zero -d 120 CTRL+z aplay -Dhw:0,0 -c2 -r48000 -fS32_LE /dev/zero -d 120 then the link DMA channels will be mixed up, resulting firmware error or crash. Fixes: ab5593793e90 ("ASoC: SOF: Intel: hda: Always clean up link DMA during stop") Cc: stable@vger.kernel.org Closes: https://github.com/thesofproject/sof/issues/9695 Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Reviewed-by: Liam Girdwood Reviewed-by: Bard Liao Link: https://patch.msgid.link/20241217091019.31798-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 25 +++++++++++++++++++------ sound/soc/sof/intel/hda.h | 2 -- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index c13f89b7065e..0db2a3e554fb 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -103,8 +103,10 @@ hda_dai_get_ops(struct snd_pcm_substream *substream, struct snd_soc_dai *cpu_dai return sdai->platform_private; } -int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_stream *hext_stream, - struct snd_soc_dai *cpu_dai) +static int +hda_link_dma_cleanup(struct snd_pcm_substream *substream, + struct hdac_ext_stream *hext_stream, + struct snd_soc_dai *cpu_dai, bool release) { const struct hda_dai_widget_dma_ops *ops = hda_dai_get_ops(substream, cpu_dai); struct sof_intel_hda_stream *hda_stream; @@ -128,6 +130,17 @@ int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_st snd_hdac_ext_bus_link_clear_stream_id(hlink, stream_tag); } + if (!release) { + /* + * Force stream reconfiguration without releasing the channel on + * subsequent stream restart (without free), including LinkDMA + * reset. 
+ * The stream is released via hda_dai_hw_free() + */ + hext_stream->link_prepared = 0; + return 0; + } + if (ops->release_hext_stream) ops->release_hext_stream(sdev, cpu_dai, substream); @@ -211,7 +224,7 @@ static int __maybe_unused hda_dai_hw_free(struct snd_pcm_substream *substream, if (!hext_stream) return 0; - return hda_link_dma_cleanup(substream, hext_stream, cpu_dai); + return hda_link_dma_cleanup(substream, hext_stream, cpu_dai, true); } static int __maybe_unused hda_dai_hw_params_data(struct snd_pcm_substream *substream, @@ -304,7 +317,8 @@ static int __maybe_unused hda_dai_trigger(struct snd_pcm_substream *substream, i switch (cmd) { case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: - ret = hda_link_dma_cleanup(substream, hext_stream, dai); + ret = hda_link_dma_cleanup(substream, hext_stream, dai, + cmd == SNDRV_PCM_TRIGGER_STOP ? false : true); if (ret < 0) { dev_err(sdev->dev, "%s: failed to clean up link DMA\n", __func__); return ret; @@ -660,8 +674,7 @@ static int hda_dai_suspend(struct hdac_bus *bus) } ret = hda_link_dma_cleanup(hext_stream->link_substream, - hext_stream, - cpu_dai); + hext_stream, cpu_dai, true); if (ret < 0) return ret; } diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index 22bd9c3c8216..ee4ccc1a5490 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -1038,8 +1038,6 @@ const struct hda_dai_widget_dma_ops * hda_select_dai_widget_ops(struct snd_sof_dev *sdev, struct snd_sof_widget *swidget); int hda_dai_config(struct snd_soc_dapm_widget *w, unsigned int flags, struct snd_sof_dai_config_data *data); -int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_stream *hext_stream, - struct snd_soc_dai *cpu_dai); static inline struct snd_sof_dev *widget_to_sdev(struct snd_soc_dapm_widget *w) { From a37eecb705f33726f1fb7cd2a67e514a15dfe693 Mon Sep 17 00:00:00 2001 From: Evgenii Shatokhin Date: Mon, 9 Dec 2024 10:46:59 +0300 Subject: [PATCH 095/337] pinctrl: mcp23s08: Fix sleeping in atomic context due to regmap locking If a device uses MCP23xxx IO expander to receive IRQs, the following bug can happen: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:283 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, ... preempt_count: 1, expected: 0 ... Call Trace: ... __might_resched+0x104/0x10e __might_sleep+0x3e/0x62 mutex_lock+0x20/0x4c regmap_lock_mutex+0x10/0x18 regmap_update_bits_base+0x2c/0x66 mcp23s08_irq_set_type+0x1ae/0x1d6 __irq_set_trigger+0x56/0x172 __setup_irq+0x1e6/0x646 request_threaded_irq+0xb6/0x160 ... We observed the problem while experimenting with a touchscreen driver which used MCP23017 IO expander (I2C). The regmap in the pinctrl-mcp23s08 driver uses a mutex for protection from concurrent accesses, which is the default for regmaps without .fast_io, .disable_locking, etc. mcp23s08_irq_set_type() calls regmap_update_bits_base(), and the latter locks the mutex. However, __setup_irq() locks desc->lock spinlock before calling these functions. As a result, the system tries to lock the mutex whole holding the spinlock. It seems, the internal regmap locks are not needed in this driver at all. mcp->lock seems to protect the regmap from concurrent accesses already, except, probably, in mcp_pinconf_get/set. mcp23s08_irq_set_type() and mcp23s08_irq_mask/unmask() are called under chip_bus_lock(), which calls mcp23s08_irq_bus_lock(). 
The latter takes mcp->lock and enables regmap caching, so that the potentially slow I2C accesses are deferred until chip_bus_unlock(). The accesses to the regmap from mcp23s08_probe_one() do not need additional locking. In all remaining places where the regmap is accessed, except mcp_pinconf_get/set(), the driver already takes mcp->lock. This patch adds locking in mcp_pinconf_get/set() and disables internal locking in the regmap config. Among other things, it fixes the sleeping in atomic context described above. Fixes: 8f38910ba4f6 ("pinctrl: mcp23s08: switch to regmap caching") Cc: stable@vger.kernel.org Signed-off-by: Evgenii Shatokhin Link: https://lore.kernel.org/20241209074659.1442898-1-e.shatokhin@yadro.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c index d66c3a3e8429..b96e6368a956 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08.c +++ b/drivers/pinctrl/pinctrl-mcp23s08.c @@ -86,6 +86,7 @@ const struct regmap_config mcp23x08_regmap = { .num_reg_defaults = ARRAY_SIZE(mcp23x08_defaults), .cache_type = REGCACHE_FLAT, .max_register = MCP_OLAT, + .disable_locking = true, /* mcp->lock protects the regmap */ }; EXPORT_SYMBOL_GPL(mcp23x08_regmap); @@ -132,6 +133,7 @@ const struct regmap_config mcp23x17_regmap = { .num_reg_defaults = ARRAY_SIZE(mcp23x17_defaults), .cache_type = REGCACHE_FLAT, .val_format_endian = REGMAP_ENDIAN_LITTLE, + .disable_locking = true, /* mcp->lock protects the regmap */ }; EXPORT_SYMBOL_GPL(mcp23x17_regmap); @@ -228,7 +230,9 @@ static int mcp_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: + mutex_lock(&mcp->lock); ret = mcp_read(mcp, MCP_GPPU, &data); + mutex_unlock(&mcp->lock); if (ret < 0) return ret; status = (data & BIT(pin)) ? 1 : 0; @@ -257,7 +261,9 @@ static int mcp_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: + mutex_lock(&mcp->lock); ret = mcp_set_bit(mcp, MCP_GPPU, pin, arg); + mutex_unlock(&mcp->lock); break; default: dev_dbg(mcp->dev, "Invalid config param %04x\n", param); From 69d803c40edeaf94089fbc8751c9b746cdc35044 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Mon, 16 Dec 2024 22:21:52 +0800 Subject: [PATCH 096/337] nfsd: Revert "nfsd: release svc_expkey/svc_export with rcu_work" This reverts commit f8c989a0c89a75d30f899a7cabdc14d72522bb8d. Before this commit, svc_export_put or expkey_put will call path_put with sync mode. After this commit, path_put will be called with async mode. And this can lead the unexpected results show as follow. mkfs.xfs -f /dev/sda echo "/ *(rw,no_root_squash,fsid=0)" > /etc/exports echo "/mnt *(rw,no_root_squash,fsid=1)" >> /etc/exports exportfs -ra service nfs-server start mount -t nfs -o vers=4.0 127.0.0.1:/mnt /mnt1 mount /dev/sda /mnt/sda touch /mnt1/sda/file exportfs -r umount /mnt/sda # failed unexcepted The touch will finally call nfsd_cross_mnt, add refcount to mount, and then add cache_head. Before this commit, exportfs -r will call cache_flush to cleanup all cache_head, and path_put in svc_export_put/expkey_put will be finished with sync mode. So, the latter umount will always success. However, after this commit, path_put will be called with async mode, the latter umount may failed, and if we add some delay, umount will success too. Personally I think this bug and should be fixed. 
We first revert before bugfix patch, and then fix the original bug with a different way. Fixes: f8c989a0c89a ("nfsd: release svc_expkey/svc_export with rcu_work") Signed-off-by: Yang Erkun Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 31 ++++++------------------------- fs/nfsd/export.h | 4 ++-- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index eacafe46e3b6..aa4712362b3b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -40,24 +40,15 @@ #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) -static void expkey_put_work(struct work_struct *work) +static void expkey_put(struct kref *ref) { - struct svc_expkey *key = - container_of(to_rcu_work(work), struct svc_expkey, ek_rcu_work); + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); - kfree(key); -} - -static void expkey_put(struct kref *ref) -{ - struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); - - INIT_RCU_WORK(&key->ek_rcu_work, expkey_put_work); - queue_rcu_work(system_wq, &key->ek_rcu_work); + kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) @@ -364,26 +355,16 @@ static void export_stats_destroy(struct export_stats *stats) EXP_STATS_COUNTERS_NUM); } -static void svc_export_put_work(struct work_struct *work) +static void svc_export_put(struct kref *ref) { - struct svc_export *exp = - container_of(to_rcu_work(work), struct svc_export, ex_rcu_work); - + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); - kfree(exp); -} - -static void svc_export_put(struct kref *ref) -{ - struct svc_export *exp = container_of(ref, struct svc_export, h.ref); - - INIT_RCU_WORK(&exp->ex_rcu_work, svc_export_put_work); - queue_rcu_work(system_wq, &exp->ex_rcu_work); + kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 6f2fbaae01fa..4d92b99c1ffd 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -75,7 +75,7 @@ struct svc_export { u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; - struct rcu_work ex_rcu_work; + struct rcu_head ex_rcu; unsigned long ex_xprtsec_modes; struct export_stats *ex_stats; }; @@ -92,7 +92,7 @@ struct svc_expkey { u32 ek_fsid[6]; struct path ek_path; - struct rcu_work ek_rcu_work; + struct rcu_head ek_rcu; }; #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: [PATCH 097/337] io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complaints about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: 
io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. 
Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 10 ++++----- io_uring/timeout.c | 40 +++++++++++++++++----------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268..fd4cdb0860a2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..605625e932eb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -215,9 +215,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -333,7 +333,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); @@ -498,10 +498,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f3d502717aeb..bbe58638eca7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -74,10 +74,10 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } @@ -109,7 +109,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; struct io_timeout *timeout, *tmp; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -134,7 +134,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -200,9 +200,9 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); - 
spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -238,11 +238,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -285,9 +285,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -330,7 +330,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -345,7 +345,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -472,12 +472,12 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) @@ -572,7 +572,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -611,7 +611,7 @@ add: list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -620,7 +620,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -633,7 +633,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } @@ -668,7 +668,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx * timeout_lockfirst to keep locking ordering. 
*/ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -676,7 +676,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx io_kill_timeout(req, -ECANCELED)) canceled++; } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return canceled != 0; } From 62e2a47ceab8f3f7d2e3f0e03fdd1c5e0059fd8b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Dec 2024 19:28:06 -0500 Subject: [PATCH 098/337] NFS/pnfs: Fix a live lock between recalled layouts and layoutget When the server is recalling a layout, we should ignore the count of outstanding layoutget calls, since the server is expected to return either NFS4ERR_RECALLCONFLICT or NFS4ERR_RETURNCONFLICT for as long as the recall is outstanding. Currently, we may end up livelocking, causing the layout to eventually be forcibly revoked. Fixes: bf0291dd2267 ("pNFS: Ensure LAYOUTGET and LAYOUTRETURN are properly serialised") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0d16b383a452..5f582713bf05 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1308,7 +1308,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; From bedb4e6088a886f587d2ea44e0c198c8ce2182c9 Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:19:21 +0000 Subject: [PATCH 099/337] fs/nfs: fix missing declaration of nfs_idmap_cache_timeout fs/nfs/super.c should include fs/nfs/nfs4idmap.h for declaration of nfs_idmap_cache_timeout. This fixes the sparse warning: fs/nfs/super.c:1397:14: warning: symbol 'nfs_idmap_cache_timeout' was not declared. Should it be static? Signed-off-by: Zhang Kunbo Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ae5c5e39afa0..aeb715b4a690 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,6 +73,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS From 185e1b1d91e419445d3fd99c1c0376a970438acf Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Mon, 16 Dec 2024 11:25:38 +0900 Subject: [PATCH 100/337] platform/x86: mlx-platform: call pci_dev_put() to balance the refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mlxplat_pci_fpga_device_init() calls pci_get_device() but does not release the refcount on error path. Call pci_dev_put() on the error path and in mlxplat_pci_fpga_device_exit() to fix this. This bug was found by an experimental static analysis tool that I am developing. 
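pci_get_device() hands back the device with an elevated reference count, so every error path and the eventual teardown must drop it with pci_dev_put(). A schematic of the balanced pattern; the function name and the ID parameters are placeholders rather than the mlx-platform code:

    static int fpga_bridge_init(unsigned int vendor, unsigned int device)
    {
            struct pci_dev *pdev;
            int err;

            pdev = pci_get_device(vendor, device, NULL); /* takes a reference */
            if (!pdev)
                    return -ENODEV;

            err = pci_enable_device(pdev);
            if (err)
                    goto err_put;

            /* ... further setup; the matching exit path must also call
             * pci_disable_device() and pci_dev_put() ... */
            return 0;

    err_put:
            pci_dev_put(pdev);  /* balance the reference on failure */
            return err;
    }
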
Fixes: 02daa222fbdd ("platform: mellanox: Add initial support for PCIe based programming logic device") Signed-off-by: Joe Hattori Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20241216022538.381209-1-joe@pf.is.s.u-tokyo.ac.jp Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/mlx-platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c index 671021cd1f59..9c7f30a47f1f 100644 --- a/drivers/platform/x86/mlx-platform.c +++ b/drivers/platform/x86/mlx-platform.c @@ -6237,6 +6237,7 @@ fail_pci_set_dma_mask: fail_pci_request_regions: pci_disable_device(pci_dev); fail_pci_enable_device: + pci_dev_put(pci_dev); return err; } @@ -6247,6 +6248,7 @@ mlxplat_pci_fpga_device_exit(struct pci_dev *pci_bridge, iounmap(pci_bridge_addr); pci_release_regions(pci_bridge); pci_disable_device(pci_bridge); + pci_dev_put(pci_bridge); } static int From b6ccddd6fe1fd49c7a82b6fbed01cccad21a29c7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:11:46 -0800 Subject: [PATCH 101/337] perf/x86/intel/uncore: Add Clearwater Forest support From the perspective of the uncore PMU, the Clearwater Forest is the same as the previous Sierra Forest. The only difference is the event list, which will be supported in the perf tool later. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241211161146.235253-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d98fac567684..e7aba7349231 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1910,6 +1910,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); From b8c3a2502a205321fe66c356f4b70cabd8e1a5fc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 12:45:02 -0800 Subject: [PATCH 102/337] perf/x86/intel/ds: Add PEBS format 6 The only difference between 5 and 6 is the new counters snapshotting group, without the following counters snapshotting enabling patches, it's impossible to utilize the feature in a PEBS record. It's safe to share the same code path with format 5. Add format 6, so the end user can at least utilize the legacy PEBS features. 
Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241216204505.748363-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1a4b326ca2ce..6ba6549f26fa 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2517,6 +2517,7 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 6: case 5: x86_pmu.pebs_ept = 1; fallthrough;
From 4a077914578183ec397ad09f7156a357e00e5d72 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 Dec 2024 14:21:33 -0800 Subject: [PATCH 103/337] locking/rtmutex: Make sure we wake anything on the wake_q when we release the lock->wait_lock Bert reported seeing occasional boot hangs when running with PREEMPT_RT and bisected it down to commit 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock"). It looks like I missed a few spots where we drop the wait_lock and potentially call into schedule without waking up the tasks on the wake_q structure. Since the tasks being woken are ww_mutex tasks they need to be able to run to release the mutex and unblock the task that currently is planning to wake them. Thus we can deadlock. So make sure we wake the wake_q tasks when we unlock the wait_lock. Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock") Reported-by: Bert Karwatzki Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241212222138.2400498-1-jstultz@google.com --- kernel/locking/rtmutex.c | 18 ++++++++++++++++-- kernel/locking/rtmutex_api.c | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index e858de203eb6..697a56d3d949 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1292,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ + wake_up_q(wake_q); + wake_q_init(wake_q); + preempt_enable(); + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1596,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter + * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock * * Must be called with lock->wait_lock held and interrupts disabled */ @@ -1603,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); @@ -1634,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +
preempt_enable(); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) rt_mutex_schedule(); @@ -1708,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); if (likely(!ret)) - ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q); if (likely(!ret)) { /* acquired the lock */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 33ea31d6a7b3..191e4720e546 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. From 8fc38062be3f692ff8816da84fde71972530bcc4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 16 Dec 2024 08:42:47 +0100 Subject: [PATCH 104/337] fbdev: Fix recursive dependencies wrt BACKLIGHT_CLASS_DEVICE Do not select BACKLIGHT_CLASS_DEVICE from FB_BACKLIGHT. The latter only controls backlight support within fbdev core code and data structures. Make fbdev drivers depend on BACKLIGHT_CLASS_DEVICE and let users select it explicitly. Fixes warnings about recursive dependencies, such as error: recursive dependency detected! symbol BACKLIGHT_CLASS_DEVICE is selected by FB_BACKLIGHT symbol FB_BACKLIGHT is selected by FB_SH_MOBILE_LCDC symbol FB_SH_MOBILE_LCDC depends on FB_DEVICE symbol FB_DEVICE depends on FB_CORE symbol FB_CORE is selected by DRM_GEM_DMA_HELPER symbol DRM_GEM_DMA_HELPER is selected by DRM_PANEL_ILITEK_ILI9341 symbol DRM_PANEL_ILITEK_ILI9341 depends on BACKLIGHT_CLASS_DEVICE BACKLIGHT_CLASS_DEVICE is user-selectable, so making drivers adapt to it is the correct approach in any case. For most drivers, backlight support is also configurable separately. 
v3: - Select BACKLIGHT_CLASS_DEVICE in PowerMac defconfigs (Christophe) - Fix PMAC_BACKLIGHT module dependency corner cases (Christophe) v2: - s/BACKLIGHT_DEVICE_CLASS/BACKLIGHT_CLASS_DEVICE (Helge) - Fix fbdev driver-dependency corner case (Arnd) Signed-off-by: Thomas Zimmermann Reviewed-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-2-tzimmermann@suse.de --- arch/powerpc/configs/pmac32_defconfig | 1 + arch/powerpc/configs/ppc6xx_defconfig | 1 + drivers/auxdisplay/Kconfig | 2 +- drivers/macintosh/Kconfig | 1 + drivers/staging/fbtft/Kconfig | 1 + drivers/video/fbdev/Kconfig | 18 +++++++++++++----- drivers/video/fbdev/core/Kconfig | 3 +-- 7 files changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 57ded82c2840..e8b3f67bf3f5 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -208,6 +208,7 @@ CONFIG_FB_ATY=y CONFIG_FB_ATY_CT=y CONFIG_FB_ATY_GX=y CONFIG_FB_3DFX=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 4d77e17541e9..ca0c90e95837 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -716,6 +716,7 @@ CONFIG_FB_TRIDENT=m CONFIG_FB_SM501=m CONFIG_FB_IBM_GXT4500=y CONFIG_LCD_PLATFORM=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_LOGO=y diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 21545ffba065..8934e6ad5772 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -489,7 +489,7 @@ config IMG_ASCII_LCD config HT16K33 tristate "Holtek Ht16K33 LED controller with keyscan" - depends on FB && I2C && INPUT + depends on FB && I2C && INPUT && BACKLIGHT_CLASS_DEVICE select FB_SYSMEM_HELPERS select INPUT_MATRIXKMAP select FB_BACKLIGHT diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index fb38f684444f..d00e713c1092 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -120,6 +120,7 @@ config PMAC_MEDIABAY config PMAC_BACKLIGHT bool "Backlight control for LCD screens" depends on PPC_PMAC && ADB_PMU && FB = y && (BROKEN || !PPC64) + depends on BACKLIGHT_CLASS_DEVICE=y select FB_BACKLIGHT help Say Y here to enable Macintosh specific extensions of the generic diff --git a/drivers/staging/fbtft/Kconfig b/drivers/staging/fbtft/Kconfig index 77ab44362f16..dcf6a70455cc 100644 --- a/drivers/staging/fbtft/Kconfig +++ b/drivers/staging/fbtft/Kconfig @@ -3,6 +3,7 @@ menuconfig FB_TFT tristate "Support for small TFT LCD display modules" depends on FB && SPI depends on FB_DEVICE + depends on BACKLIGHT_CLASS_DEVICE depends on GPIOLIB || COMPILE_TEST select FB_BACKLIGHT select FB_SYSMEM_HELPERS_DEFERRED diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index de035071fedb..55c6686f091e 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -649,6 +649,7 @@ config FB_S1D13XXX config FB_ATMEL tristate "AT91 LCD Controller support" depends on FB && OF && HAVE_CLK && HAS_IOMEM + depends on BACKLIGHT_CLASS_DEVICE depends on HAVE_FB_ATMEL || COMPILE_TEST select FB_BACKLIGHT select FB_IOMEM_HELPERS @@ -660,7 +661,6 @@ config FB_ATMEL config FB_NVIDIA tristate "nVidia Framebuffer Support" depends on FB && PCI - select FB_BACKLIGHT if FB_NVIDIA_BACKLIGHT select 
FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -700,6 +700,8 @@ config FB_NVIDIA_DEBUG config FB_NVIDIA_BACKLIGHT bool "Support for backlight control" depends on FB_NVIDIA + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_NVIDIA + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -707,7 +709,6 @@ config FB_NVIDIA_BACKLIGHT config FB_RIVA tristate "nVidia Riva support" depends on FB && PCI - select FB_BACKLIGHT if FB_RIVA_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -747,6 +748,8 @@ config FB_RIVA_DEBUG config FB_RIVA_BACKLIGHT bool "Support for backlight control" depends on FB_RIVA + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_RIVA + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -934,7 +937,6 @@ config FB_MATROX_MAVEN config FB_RADEON tristate "ATI Radeon display support" depends on FB && PCI - select FB_BACKLIGHT if FB_RADEON_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -960,6 +962,8 @@ config FB_RADEON_I2C config FB_RADEON_BACKLIGHT bool "Support for backlight control" depends on FB_RADEON + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_RADEON + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -975,7 +979,6 @@ config FB_RADEON_DEBUG config FB_ATY128 tristate "ATI Rage128 display support" depends on FB && PCI - select FB_BACKLIGHT if FB_ATY128_BACKLIGHT select FB_IOMEM_HELPERS select FB_MACMODES if PPC_PMAC help @@ -989,6 +992,8 @@ config FB_ATY128 config FB_ATY128_BACKLIGHT bool "Support for backlight control" depends on FB_ATY128 + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_ATY128 + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -999,7 +1004,6 @@ config FB_ATY select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT - select FB_BACKLIGHT if FB_ATY_BACKLIGHT select FB_IOMEM_FOPS select FB_MACMODES if PPC select FB_ATY_CT if SPARC64 && PCI @@ -1040,6 +1044,8 @@ config FB_ATY_GX config FB_ATY_BACKLIGHT bool "Support for backlight control" depends on FB_ATY + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_ATY + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. 
@@ -1528,6 +1534,7 @@ config FB_SH_MOBILE_LCDC depends on FB && HAVE_CLK && HAS_IOMEM depends on SUPERH || COMPILE_TEST depends on FB_DEVICE + depends on BACKLIGHT_CLASS_DEVICE select FB_BACKLIGHT select FB_DEFERRED_IO select FB_DMAMEM_HELPERS @@ -1793,6 +1800,7 @@ config FB_SSD1307 tristate "Solomon SSD1307 framebuffer support" depends on FB && I2C depends on GPIOLIB || COMPILE_TEST + depends on BACKLIGHT_CLASS_DEVICE select FB_BACKLIGHT select FB_SYSMEM_HELPERS_DEFERRED help diff --git a/drivers/video/fbdev/core/Kconfig b/drivers/video/fbdev/core/Kconfig index 0ab8848ba2f1..d554d8c543d4 100644 --- a/drivers/video/fbdev/core/Kconfig +++ b/drivers/video/fbdev/core/Kconfig @@ -183,9 +183,8 @@ config FB_SYSMEM_HELPERS_DEFERRED select FB_SYSMEM_HELPERS config FB_BACKLIGHT - tristate + bool depends on FB - select BACKLIGHT_CLASS_DEVICE config FB_MODE_HELPERS bool "Enable Video Mode Handling Helpers" From 8ce35bf0ef5a659f3a15237152770a7c1d13c996 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 16 Dec 2024 08:42:48 +0100 Subject: [PATCH 105/337] drm/fbdev: Select FB_CORE dependency for fbdev on DMA and TTM Select FB_CORE if GEM's DMA and TTM implementations support fbdev emulation. Fixes linker errors about missing symbols from the fbdev subsystem. Also see [1] for a related SHMEM fix. Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/series/141411/ # 1 Reviewed-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-3-tzimmermann@suse.de --- drivers/gpu/drm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index a0690049b292..ccee570eab7d 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -358,6 +358,7 @@ config DRM_TTM_HELPER tristate depends on DRM select DRM_TTM + select FB_CORE if DRM_FBDEV_EMULATION select FB_SYSMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Helpers for ttm-based gem objects @@ -365,6 +366,7 @@ config DRM_TTM_HELPER config DRM_GEM_DMA_HELPER tristate depends on DRM + select FB_CORE if DRM_FBDEV_EMULATION select FB_DMAMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Choose this if you need the GEM DMA helper functions From 2182e0f200d097805f2f6bc0042de8695c60f386 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 08:42:49 +0100 Subject: [PATCH 106/337] drm: rework FB_CORE dependency The 'select FB_CORE' statement moved from CONFIG_DRM to DRM_CLIENT_LIB, but there are now configurations that have code calling into fb_core as built-in even though the client_lib itself is a loadable module: x86_64-linux-ld: drivers/gpu/drm/drm_fb_helper.o: in function `drm_fb_helper_set_suspend': drm_fb_helper.c:(.text+0x2c6): undefined reference to `fb_set_suspend' x86_64-linux-ld: drivers/gpu/drm/drm_fb_helper.o: in function `drm_fb_helper_resume_worker': drm_fb_helper.c:(.text+0x2e1): undefined reference to `fb_set_suspend' In addition to DRM_CLIENT_LIB, the 'select' needs to be at least in DRM_KMS_HELPER and DRM_GEM_SHMEM_HELPER, so add it here. This patch is the KMS_HELPER part of [1]. 
Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/series/141411/ # 1 Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-4-tzimmermann@suse.de Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index ccee570eab7d..772fc7625639 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -99,6 +99,7 @@ config DRM_KUNIT_TEST config DRM_KMS_HELPER tristate depends on DRM + select FB_CORE if DRM_FBDEV_EMULATION help CRTC helpers for KMS drivers. From 058387d9c6b70e225da82492e1e193635c3fac3f Mon Sep 17 00:00:00 2001 From: Willow Cunningham Date: Mon, 7 Oct 2024 17:29:54 -0400 Subject: [PATCH 107/337] arm64: dts: broadcom: Fix L2 linesize for Raspberry Pi 5 Set the cache-line-size parameter of the L2 cache for each core to the correct value of 64 bytes. Previously, the L2 cache line size was incorrectly set to 128 bytes for the Broadcom BCM2712. This causes validation tests for the Performance Application Programming Interface (PAPI) tool to fail as they depend on sysfs accurately reporting cache line sizes. The correct value of 64 bytes is stated in the official documentation of the ARM Cortex A-72, which is linked in the comments of arm64/boot/dts/broadcom/bcm2712.dtsi as the source for cache-line-size. Fixes: faa3381267d0 ("arm64: dts: broadcom: Add minimal support for Raspberry Pi 5") Signed-off-by: Willow Cunningham Link: https://lore.kernel.org/r/20241007212954.214724-1-willow.e.cunningham@maine.edu Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm2712.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi index 6e5a984c1d4e..26a29e5e5078 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi +++ b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi @@ -67,7 +67,7 @@ l2_cache_l0: l2-cache-l0 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -91,7 +91,7 @@ l2_cache_l1: l2-cache-l1 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -115,7 +115,7 @@ l2_cache_l2: l2-cache-l2 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -139,7 +139,7 @@ l2_cache_l3: l2-cache-l3 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; From 9048cf05a17a7bc26f0b8e2e53750b1237303970 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 17 Dec 2024 16:18:12 -0500 Subject: [PATCH 108/337] NFSD: fix management of pending async copies Currently the pending_async_copies count is decremented just before a struct nfsd4_copy is destroyed. 
After commit aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live"), nfsd4_copy structures stick around for 10 lease periods after the COPY itself has completed, so the pending_async_copies count stays high for a long time. This causes NFSD to avoid the use of background copy even though the actual background copy workload might no longer be running. In this patch, decrement pending_async_copies once the async copy thread is done processing the copy work. Fixes: aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live") Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f8a10f90bc7a..ad44ad49274f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1347,7 +1347,6 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) { if (!refcount_dec_and_test(&copy->refcount)) return; - atomic_dec(&copy->cp_nn->pending_async_copies); kfree(copy->cp_src); kfree(copy); } @@ -1870,6 +1869,7 @@ do_callback: set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags); trace_nfsd_copy_async_done(copy); nfsd4_send_cb_offload(copy); + atomic_dec(&copy->cp_nn->pending_async_copies); return 0; } @@ -1927,19 +1927,19 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > (int)rqstp->rq_pool->sp_nrthreads) - goto out_err; + goto out_dec_async_copy_err; async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) - goto out_err; + goto out_dec_async_copy_err; if (!nfs4_init_copy_state(nn, copy)) - goto out_err; + goto out_dec_async_copy_err; memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) - goto out_err; + goto out_dec_async_copy_err; spin_lock(&async_copy->cp_clp->async_lock); list_add(&async_copy->copies, &async_copy->cp_clp->async_copies); @@ -1954,6 +1954,9 @@ out: trace_nfsd_copy_done(copy, status); release_copy_files(copy); return status; +out_dec_async_copy_err: + if (async_copy) + atomic_dec(&nn->pending_async_copies); out_err: if (nfsd4_ssc_is_inter(copy)) { /*
From 23579010cf0a12476e96a5f1acdf78a9c5843657 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 17 Dec 2024 20:58:13 +0100 Subject: [PATCH 109/337] bpf: Fix bpf_get_smp_processor_id() on !CONFIG_SMP On x86-64 calling bpf_get_smp_processor_id() in a kernel with CONFIG_SMP disabled can trigger the following bug, as pcpu_hot is unavailable: [ 8.471774] BUG: unable to handle page fault for address: 00000000936a290c [ 8.471849] #PF: supervisor read access in kernel mode [ 8.471881] #PF: error_code(0x0000) - not-present page Fix by inlining a return 0 in the !CONFIG_SMP case.
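On a kernel built without CONFIG_SMP, raw_smp_processor_id() is defined as 0, so the helper degenerates to a constant. Conceptually the inlined replacement behaves like the following (sketch; the function name is illustrative, not an existing symbol):

	static u32 bpf_get_smp_processor_id_up(void)
	{
		/* only one possible CPU without SMP */
		return 0;
	}

which the verifier can emit as a single "r0 ^= r0" instruction instead of the per-CPU pcpu_hot load.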
Fixes: 1ae6921009e5 ("bpf: inline bpf_get_smp_processor_id() helper") Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241217195813.622568-1-arighi@nvidia.com --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f7f892a52a37..77f56674aaa9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21281,11 +21281,15 @@ patch_map_ops_generic: * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ +#ifdef CONFIG_SMP insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; - +#else + insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + cnt = 1; +#endif new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; From 05aa156e156ef3168e7ab8a68721945196495c17 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Fri, 13 Dec 2024 21:17:58 -0800 Subject: [PATCH 110/337] powerpc/pseries/vas: Add close() callback in vas_vm_ops struct The mapping VMA address is saved in VAS window struct when the paste address is mapped. This VMA address is used during migration to unmap the paste address if the window is active. The paste address mapping will be removed when the window is closed or with the munmap(). But the VMA address in the VAS window is not updated with munmap() which is causing invalid access during migration. The KASAN report shows: [16386.254991] BUG: KASAN: slab-use-after-free in reconfig_close_windows+0x1a0/0x4e8 [16386.255043] Read of size 8 at addr c00000014a819670 by task drmgr/696928 [16386.255096] CPU: 29 UID: 0 PID: 696928 Comm: drmgr Kdump: loaded Tainted: G B 6.11.0-rc5-nxgzip #2 [16386.255128] Tainted: [B]=BAD_PAGE [16386.255148] Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.00 (NH1110_016) hv:phyp pSeries [16386.255181] Call Trace: [16386.255202] [c00000016b297660] [c0000000018ad0ac] dump_stack_lvl+0x84/0xe8 (unreliable) [16386.255246] [c00000016b297690] [c0000000006e8a90] print_report+0x19c/0x764 [16386.255285] [c00000016b297760] [c0000000006e9490] kasan_report+0x128/0x1f8 [16386.255309] [c00000016b297880] [c0000000006eb5c8] __asan_load8+0xac/0xe0 [16386.255326] [c00000016b2978a0] [c00000000013f898] reconfig_close_windows+0x1a0/0x4e8 [16386.255343] [c00000016b297990] [c000000000140e58] vas_migration_handler+0x3a4/0x3fc [16386.255368] [c00000016b297a90] [c000000000128848] pseries_migrate_partition+0x4c/0x4c4 ... [16386.256136] Allocated by task 696554 on cpu 31 at 16377.277618s: [16386.256149] kasan_save_stack+0x34/0x68 [16386.256163] kasan_save_track+0x34/0x80 [16386.256175] kasan_save_alloc_info+0x58/0x74 [16386.256196] __kasan_slab_alloc+0xb8/0xdc [16386.256209] kmem_cache_alloc_noprof+0x200/0x3d0 [16386.256225] vm_area_alloc+0x44/0x150 [16386.256245] mmap_region+0x214/0x10c4 [16386.256265] do_mmap+0x5fc/0x750 [16386.256277] vm_mmap_pgoff+0x14c/0x24c [16386.256292] ksys_mmap_pgoff+0x20c/0x348 [16386.256303] sys_mmap+0xd0/0x160 ... 
[16386.256350] Freed by task 0 on cpu 31 at 16386.204848s: [16386.256363] kasan_save_stack+0x34/0x68 [16386.256374] kasan_save_track+0x34/0x80 [16386.256384] kasan_save_free_info+0x64/0x10c [16386.256396] __kasan_slab_free+0x120/0x204 [16386.256415] kmem_cache_free+0x128/0x450 [16386.256428] vm_area_free_rcu_cb+0xa8/0xd8 [16386.256441] rcu_do_batch+0x2c8/0xcf0 [16386.256458] rcu_core+0x378/0x3c4 [16386.256473] handle_softirqs+0x20c/0x60c [16386.256495] do_softirq_own_stack+0x6c/0x88 [16386.256509] do_softirq_own_stack+0x58/0x88 [16386.256521] __irq_exit_rcu+0x1a4/0x20c [16386.256533] irq_exit+0x20/0x38 [16386.256544] interrupt_async_exit_prepare.constprop.0+0x18/0x2c ... [16386.256717] Last potentially related work creation: [16386.256729] kasan_save_stack+0x34/0x68 [16386.256741] __kasan_record_aux_stack+0xcc/0x12c [16386.256753] __call_rcu_common.constprop.0+0x94/0xd04 [16386.256766] vm_area_free+0x28/0x3c [16386.256778] remove_vma+0xf4/0x114 [16386.256797] do_vmi_align_munmap.constprop.0+0x684/0x870 [16386.256811] __vm_munmap+0xe0/0x1f8 [16386.256821] sys_munmap+0x54/0x6c [16386.256830] system_call_exception+0x1a0/0x4a0 [16386.256841] system_call_vectored_common+0x15c/0x2ec [16386.256868] The buggy address belongs to the object at c00000014a819670 which belongs to the cache vm_area_struct of size 168 [16386.256887] The buggy address is located 0 bytes inside of freed 168-byte region [c00000014a819670, c00000014a819718) [16386.256915] The buggy address belongs to the physical page: [16386.256928] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14a81 [16386.256950] memcg:c0000000ba430001 [16386.256961] anon flags: 0x43ffff800000000(node=4|zone=0|lastcpupid=0x7ffff) [16386.256975] page_type: 0xfdffffff(slab) [16386.256990] raw: 043ffff800000000 c00000000501c080 0000000000000000 5deadbee00000001 [16386.257003] raw: 0000000000000000 00000000011a011a 00000001fdffffff c0000000ba430001 [16386.257018] page dumped because: kasan: bad access detected This patch adds close() callback in vas_vm_ops vm_operations_struct which will be executed during munmap() before freeing VMA. The VMA address in the VAS window is set to NULL after holding the window mmap_mutex. Fixes: 37e6764895ef ("powerpc/pseries/vas: Add VAS migration handler") Signed-off-by: Haren Myneni Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241214051758.997759-1-haren@linux.ibm.com --- arch/powerpc/platforms/book3s/vas-api.c | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index f381b177ea06..0b6365d85d11 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -464,7 +464,43 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +/* + * During mmap() paste address, mapping VMA is saved in VAS window + * struct which is used to unmap during migration if the window is + * still open. But the user space can remove this mapping with + * munmap() before closing the window and the VMA address will + * be invalid. Set VAS window VMA to NULL in this function which + * is called before VMA free. 
+ */ +static void vas_mmap_close(struct vm_area_struct *vma) +{ + struct file *fp = vma->vm_file; + struct coproc_instance *cp_inst = fp->private_data; + struct vas_window *txwin; + + /* Should not happen */ + if (!cp_inst || !cp_inst->txwin) { + pr_err("No attached VAS window for the paste address mmap\n"); + return; + } + + txwin = cp_inst->txwin; + /* + * task_ref.vma is set in coproc_mmap() during mmap paste + * address. So it has to be the same VMA that is getting freed. + */ + if (WARN_ON(txwin->task_ref.vma != vma)) { + pr_err("Invalid paste address mmaping\n"); + return; + } + + mutex_lock(&txwin->task_ref.mmap_mutex); + txwin->task_ref.vma = NULL; + mutex_unlock(&txwin->task_ref.mmap_mutex); +} + static const struct vm_operations_struct vas_vm_ops = { + .close = vas_mmap_close, .fault = vas_mmap_fault, };
From 4feaedf7d243f1a9af36dfb2711a5641fe3559dc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 16 Dec 2024 22:26:44 +0100 Subject: [PATCH 111/337] thermal/thresholds: Fix boundaries and detection routine The current implementation does not work if the thermal zone is interrupt driven only. The boundaries are not correctly checked and computed, as that only happens when the temperature is increasing or decreasing. The problem arises because the routine that detects when we cross a threshold is correlated with the computation of the boundaries. We assume we have to recompute the boundaries when a threshold is crossed, but actually we should do that even if that is not the case. Mixing the boundary computation and the threshold detection for the sake of optimizing the routine is much more complex than it intuitively appears, and prone to errors. This fix separates the boundary computation and the threshold crossing detection into different routines. The result is code that is much simpler to understand and thus easier to maintain. The drawback is that we browse the thresholds list several times, but that can be considered negligible because it only happens when the temperature is updated. There are certainly some areas to improve in the temperature update routine, but doing that here would not be appropriate as this change aims to fix the thresholds for v6.13. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Tested-by: Daniel Lezcano # rock5b, Lenovo x13s Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241216212644.1145122-1-daniel.lezcano@linaro.org Signed-off-by: Rafael J.
Wysocki --- drivers/thermal/thermal_thresholds.c | 76 +++++++++++++++------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c index d9b2a0bb44fc..38f5fd0e8930 100644 --- a/drivers/thermal/thermal_thresholds.c +++ b/drivers/thermal/thermal_thresholds.c @@ -69,40 +69,18 @@ static struct user_threshold *__thermal_thresholds_find(const struct list_head * return NULL; } -static bool __thermal_threshold_is_crossed(struct user_threshold *threshold, int temperature, - int last_temperature, int direction, - int *low, int *high) -{ - - if (temperature >= threshold->temperature) { - if (threshold->temperature > *low && - THERMAL_THRESHOLD_WAY_DOWN & threshold->direction) - *low = threshold->temperature; - - if (last_temperature < threshold->temperature && - threshold->direction & direction) - return true; - } else { - if (threshold->temperature < *high && THERMAL_THRESHOLD_WAY_UP - & threshold->direction) - *high = threshold->temperature; - - if (last_temperature >= threshold->temperature && - threshold->direction & direction) - return true; - } - - return false; -} - static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature, - int last_temperature, int *low, int *high) + int last_temperature) { struct user_threshold *t; list_for_each_entry(t, thresholds, list_node) { - if (__thermal_threshold_is_crossed(t, temperature, last_temperature, - THERMAL_THRESHOLD_WAY_UP, low, high)) + + if (!(t->direction & THERMAL_THRESHOLD_WAY_UP)) + continue; + + if (temperature >= t->temperature && + last_temperature < t->temperature) return true; } @@ -110,19 +88,43 @@ static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int } static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature, - int last_temperature, int *low, int *high) + int last_temperature) { struct user_threshold *t; list_for_each_entry_reverse(t, thresholds, list_node) { - if (__thermal_threshold_is_crossed(t, temperature, last_temperature, - THERMAL_THRESHOLD_WAY_DOWN, low, high)) + + if (!(t->direction & THERMAL_THRESHOLD_WAY_DOWN)) + continue; + + if (temperature <= t->temperature && + last_temperature > t->temperature) return true; } return false; } +static void thermal_threshold_find_boundaries(struct list_head *thresholds, int temperature, + int *low, int *high) +{ + struct user_threshold *t; + + list_for_each_entry(t, thresholds, list_node) { + if (temperature < t->temperature && + (t->direction & THERMAL_THRESHOLD_WAY_UP) && + *high > t->temperature) + *high = t->temperature; + } + + list_for_each_entry_reverse(t, thresholds, list_node) { + if (temperature > t->temperature && + (t->direction & THERMAL_THRESHOLD_WAY_DOWN) && + *low < t->temperature) + *low = t->temperature; + } +} + void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high) { struct list_head *thresholds = &tz->user_thresholds; @@ -132,6 +134,8 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi lockdep_assert_held(&tz->lock); + thermal_threshold_find_boundaries(thresholds, temperature, low, high); + /* * We need a second update in order to detect a threshold being crossed */ @@ -151,12 +155,12 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi * - decreased : thresholds are crossed the way down */ if (temperature > last_temperature) { - if (thermal_thresholds_handle_raising(thresholds, temperature, - 
last_temperature, low, high)) + if (thermal_thresholds_handle_raising(thresholds, + temperature, last_temperature)) thermal_notify_threshold_up(tz); } else { - if (thermal_thresholds_handle_dropping(thresholds, temperature, - last_temperature, low, high)) + if (thermal_thresholds_handle_dropping(thresholds, + temperature, last_temperature)) thermal_notify_threshold_down(tz); } }
From c9e3ebdc52ebe028f238c9df5162ae92483bedd5 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Wed, 18 Dec 2024 17:13:07 +0800 Subject: [PATCH 112/337] ASoC: rt722: add delay time to wait for the calibration procedure The calibration procedure needs some time to finish. This patch adds the delay time to ensure the calibration procedure is completed correctly. Signed-off-by: Shuming Fan Link: https://patch.msgid.link/20241218091307.96656-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/sound/soc/codecs/rt722-sdca.c b/sound/soc/codecs/rt722-sdca.c index 908846e994df..e17a142d03b9 100644 --- a/sound/soc/codecs/rt722-sdca.c +++ b/sound/soc/codecs/rt722-sdca.c @@ -1468,13 +1468,18 @@ static void rt722_sdca_jack_preset(struct rt722_sdca_priv *rt722) 0x008d); /* check HP calibration FSM status */ for (loop_check = 0; loop_check < chk_cnt; loop_check++) { + usleep_range(10000, 11000); ret = rt722_sdca_index_read(rt722, RT722_VENDOR_CALI, RT722_DAC_DC_CALI_CTL3, &calib_status); - if (ret < 0 || loop_check == chk_cnt) + if (ret < 0) dev_dbg(&rt722->slave->dev, "calibration failed!, ret=%d\n", ret); if ((calib_status & 0x0040) == 0x0) break; } + + if (loop_check == chk_cnt) + dev_dbg(&rt722->slave->dev, "%s, calibration time-out!\n", __func__); + /* Set ADC09 power entity floating control */ rt722_sdca_index_write(rt722, RT722_VENDOR_HDA_CTL, RT722_ADC0A_08_PDE_FLOAT_CTL, 0x2a12);
From 26fff8a4432ffd03409346b7dae1e1a2c5318b7c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:02:11 -0800 Subject: [PATCH 113/337] block/bdev: use helper for max block size check We already have a helper for checking the limits on the block size both low and high, just use that. No functional changes. Reviewed-by: John Garry Signed-off-by: Luis Chamberlain Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241218020212.3657139-2-mcgrof@kernel.org Signed-off-by: Jens Axboe --- block/bdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c index 738e3c8457e7..9d73a8fbf7f9 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -155,8 +155,7 @@ int set_blocksize(struct file *file, int size) struct inode *inode = file->f_mapping->host; struct block_device *bdev = I_BDEV(inode); - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + if (blk_validate_block_size(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */
From 51588b1b77b65cd0fb3440f78f37bef7178a2715 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:02:12 -0800 Subject: [PATCH 114/337] nvme: use blk_validate_block_size() for max LBA check The block layer already has support to validate proper block sizes with blk_validate_block_size(), so we can leverage that as well. No functional changes.
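For reference, the helper centralizes the same constraint that set_blocksize() open-coded above: a block size must be a power of two between 512 and PAGE_SIZE. A rough sketch of the check (the exact return convention of the real helper may differ):

	static inline int blk_validate_block_size(unsigned long bsize)
	{
		if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
			return -EINVAL;

		return 0;
	}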
Signed-off-by: Luis Chamberlain Reviewed-by: John Garry Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241218020212.3657139-3-mcgrof@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d169a30eb935..a970168a3014 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2034,7 +2034,7 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, * or smaller than a sector size yet, so catch this early and don't * allow block I/O. */ - if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { + if (blk_validate_block_size(bs)) { bs = (1 << 9); valid = false; } From 224749be6c23efe7fb8a030854f4fc5d1dd813b3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Dec 2024 18:16:14 +0800 Subject: [PATCH 115/337] block: Revert "block: Fix potential deadlock while freezing queue and acquiring sysfs_lock" This reverts commit be26ba96421ab0a8fa2055ccf7db7832a13c44d2. Commit be26ba96421a ("block: Fix potential deadlock while freezing queue and acquiring sysfs_loc") actually reverts commit 22465bbac53c ("blk-mq: move cpuhp callback registering out of q->sysfs_lock"), and causes the original resctrl lockdep warning. So revert it and we need to fix the issue in another way. Cc: Nilay Shroff Fixes: be26ba96421a ("block: Fix potential deadlock while freezing queue and acquiring sysfs_loc") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241218101617.3275704-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 16 ++++++++++------ block/blk-mq.c | 29 +++++++++++------------------ block/blk-sysfs.c | 4 ++-- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index cd5ea6eaa76b..156e9bb07abf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -275,13 +275,15 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) - return; + goto unlock; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); + +unlock: + mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) @@ -290,10 +292,9 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) unsigned long i; int ret = 0; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) - return ret; + goto unlock; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -301,5 +302,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) break; } +unlock: + mutex_unlock(&q->sysfs_dir_lock); + return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6b6111513986..92e8ddf34575 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4453,8 +4453,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - lockdep_assert_held(&q->sysfs_lock); - + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4487,6 +4486,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); + mutex_unlock(&q->sysfs_lock); /* unregister cpuhp callbacks for exited hctxs */ 
blk_mq_remove_hw_queues_cpuhp(q); @@ -4518,14 +4518,10 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); - mutex_lock(&q->sysfs_lock); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; - mutex_unlock(&q->sysfs_lock); - INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); @@ -4544,7 +4540,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - mutex_unlock(&q->sysfs_lock); blk_mq_release(q); err_exit: q->mq_ops = NULL; @@ -4925,12 +4920,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head, return false; /* q->elevator needs protection from ->sysfs_lock */ - lockdep_assert_held(&q->sysfs_lock); + mutex_lock(&q->sysfs_lock); /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); - goto out; + goto unlock; } INIT_LIST_HEAD(&qe->node); @@ -4940,7 +4935,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); -out: +unlock: + mutex_unlock(&q->sysfs_lock); + return true; } @@ -4969,9 +4966,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); + mutex_lock(&q->sysfs_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); + mutex_unlock(&q->sysfs_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -4991,11 +4990,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; - list_for_each_entry(q, &set->tag_list, tag_set_list) { - mutex_lock(&q->sysfs_dir_lock); - mutex_lock(&q->sysfs_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done @@ -5051,11 +5047,8 @@ switch_back: list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(&head, q); - list_for_each_entry(q, &set->tag_list, tag_set_list) { + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - } /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 64f70c713d2f..767598e719ab 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -706,11 +706,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - mutex_lock(&q->sysfs_lock); blk_mq_freeze_queue(q); + mutex_lock(&q->sysfs_lock); res = entry->store(disk, page, length); - blk_mq_unfreeze_queue(q); mutex_unlock(&q->sysfs_lock); + blk_mq_unfreeze_queue(q); return res; } From 85672ca9ceeaa1dcf2777a7048af5f4aee3fd02b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Dec 2024 18:16:15 +0800 Subject: [PATCH 116/337] block: avoid to reuse `hctx` not removed from cpuhp callback list If the 'hctx' isn't removed from cpuhp callback list, we can't reuse it, otherwise use-after-free may be triggered. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412172217.b906db7c-lkp@intel.com Tested-by: kernel test robot Fixes: 22465bbac53c ("blk-mq: move cpuhp callback registering out of q->sysfs_lock") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241218101617.3275704-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c index 92e8ddf34575..8ac19d4ae3c0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4412,6 +4412,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); +/* + * Only hctx removed from cpuhp list can be reused + */ +static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) +{ + return hlist_unhashed(&hctx->cpuhp_online) && + hlist_unhashed(&hctx->cpuhp_dead); +} + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) @@ -4421,7 +4430,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { - if (tmp->numa_node == node) { + if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; }
From 7f9a1eed1ad8b274ed9163a02cef891a90427237 Mon Sep 17 00:00:00 2001 From: Jon Lin Date: Wed, 18 Dec 2024 23:47:41 +0800 Subject: [PATCH 117/337] spi: rockchip-sfc: Fix error in remove process Fix error in remove process: [ 43.026148] Call trace: [ 43.026370] klist_next+0x1c/0x1d4 [ 43.026671] device_for_each_child+0x48/0xac [ 43.027049] spi_unregister_controller+0x30/0x130 [ 43.027469] rockchip_sfc_remove+0x48/0x80 [spi_rockchip_sfc] Signed-off-by: Jon Lin Link: https://patch.msgid.link/20241218154741.901591-1-jon.lin@rock-chips.com Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip-sfc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/spi/spi-rockchip-sfc.c b/drivers/spi/spi-rockchip-sfc.c index 69d0f2175568..70bbb459caa4 100644 --- a/drivers/spi/spi-rockchip-sfc.c +++ b/drivers/spi/spi-rockchip-sfc.c @@ -182,6 +182,7 @@ struct rockchip_sfc { bool use_dma; u32 max_iosize; u16 version; + struct spi_controller *host; }; static int rockchip_sfc_reset(struct rockchip_sfc *sfc) @@ -574,6 +575,7 @@ static int rockchip_sfc_probe(struct platform_device *pdev) sfc = spi_controller_get_devdata(host); sfc->dev = dev; + sfc->host = host; sfc->regbase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(sfc->regbase)) @@ -651,8 +653,8 @@ err_hclk: static void rockchip_sfc_remove(struct platform_device *pdev) { - struct spi_controller *host = platform_get_drvdata(pdev); struct rockchip_sfc *sfc = platform_get_drvdata(pdev); + struct spi_controller *host = sfc->host; spi_unregister_controller(host);
From cc0c53f4fac562efb3aca2bc493515e77642ae33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jun 2024 14:12:45 -0700 Subject: [PATCH 118/337] wifi: iwlwifi: mvm: Fix __counted_by usage in cfg80211_wowlan_nd_* Both struct cfg80211_wowlan_nd_match and struct cfg80211_wowlan_nd_info pre-allocate space for channels and matches, but then may end up using fewer than the full allocation. Shrink the associated counter (n_channels and n_matches) after counting the results. This avoids compile-time (and run-time) warnings from __counted_by. (The counter member needs to be updated _before_ accessing the array index.)
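The rule behind the warning is that an array annotated with __counted_by(n) must not be indexed at or above the current value of n, so the counter has to be valid before each store. A minimal sketch of the pattern used by the fix (the struct and helper names here are hypothetical, for illustration only):

	struct nd_match {
		int n_channels;
		u32 channels[] __counted_by(n_channels);
	};

	m = kzalloc(struct_size(m, channels, max), GFP_KERNEL);
	if (!m)
		return -ENOMEM;
	m->n_channels = max;		/* counter covers the whole allocation */

	for (i = 0, n = 0; i < max; i++)
		if (channel_matches(i))			/* hypothetical predicate */
			m->channels[n++] = channel_freq(i);	/* hypothetical lookup */

	m->n_channels = n;		/* shrink to what was actually used */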
Seen with coming GCC 15: drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_set_freqs': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2877:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2877 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2885:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2885 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_netdetect_reasons': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2982:58: warning: operation on 'net_detect->n_matches' may be undefined [-Wsequence-point] 2982 | net_detect->matches[net_detect->n_matches++] = match; | ~~~~~~~~~~~~~~~~~~~~~^~ Cc: stable@vger.kernel.org Fixes: aa4ec06c455d ("wifi: cfg80211: use __counted_by where appropriate") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20240619211233.work.355-kees@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index f85c01e04ebf..7d973546c9fb 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2954,6 +2954,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, int idx) { int i; + int n_channels = 0; if (fw_has_api(&mvm->fw->ucode_capa, IWL_UCODE_TLV_API_SCAN_OFFLOAD_CHANS)) { @@ -2962,7 +2963,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } else { struct iwl_scan_offload_profile_match_v1 *matches = @@ -2970,9 +2971,11 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN_V1 * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } + /* We may have ended up with fewer channels than we allocated. */ + match->n_channels = n_channels; } /** @@ -3053,6 +3056,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!net_detect || !n_matches) goto out_report_nd; + net_detect->n_matches = n_matches; + n_matches = 0; for_each_set_bit(i, &matched_profiles, mvm->n_nd_match_sets) { struct cfg80211_wowlan_nd_match *match; @@ -3066,8 +3071,9 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!match) goto out_report_nd; + match->n_channels = n_channels; - net_detect->matches[net_detect->n_matches++] = match; + net_detect->matches[n_matches++] = match; /* We inverted the order of the SSIDs in the scan * request, so invert the index here. @@ -3082,6 +3088,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, iwl_mvm_query_set_freqs(mvm, d3_data->nd_results, match, i); } + /* We may have fewer matches than we allocated. 
*/ + net_detect->n_matches = n_matches; out_report_nd: wakeup.net_detect = net_detect; From 536ae08d7b6ae16872f0b3c2679e656a7fc9d5e2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Dec 2024 09:56:01 -0600 Subject: [PATCH 119/337] drm/amd: Require CONFIG_HOTPLUG_PCI_PCIE for BOCO If the kernel hasn't been compiled with PCIe hotplug support this can lead to problems with dGPUs that use BOCO because they effectively drop off the bus. To prevent issues, disable BOCO support when compiled without PCIe hotplug. Reported-by: Gabriel Marcano Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1707#note_2696862 Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211155601.3585256-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 1ad5bdc28bafa66db0f041cc6cdd278a80426aae) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d272d95dd5b2..cd4fac120834 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -417,6 +417,9 @@ bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); + if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + return false; + if (adev->has_pr3 || ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) return true; From a93b1020eb9386d7da11608477121b10079c076a Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Fri, 6 Dec 2024 13:17:45 +0100 Subject: [PATCH 120/337] drm/amdgpu: don't access invalid sched MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") accessing job->base.sched can produce unexpected results as the initialisation of (*job)->base.sched done in amdgpu_job_alloc is overwritten by the memset. This commit fixes an issue when a CS would fail validation and would be rejected after job->num_ibs is incremented. In this case, amdgpu_ib_free(ring->adev, ...) will be called, which would crash the machine because the ring value is bogus. To fix this, pass a NULL pointer to amdgpu_ib_free(): we can do this because the device is actually not used in this function. The next commit will remove the ring argument completely. 
Fixes: 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") Signed-off-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 2ae520cb12831d264ceb97c61f72c59d33c0dbd7) --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index b9d08bc96581..a21c510c408e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -255,7 +255,6 @@ void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, void amdgpu_job_free_resources(struct amdgpu_job *job) { - struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); struct dma_fence *f; unsigned i; @@ -268,7 +267,7 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) f = NULL; for (i = 0; i < job->num_ibs; ++i) - amdgpu_ib_free(ring->adev, &job->ibs[i], f); + amdgpu_ib_free(NULL, &job->ibs[i], f); } static void amdgpu_job_free_cb(struct drm_sched_job *s_job) From 146b6057e1fd28fb1a38d300bf76a38dfba7f9fb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 17 Dec 2024 13:55:48 +0100 Subject: [PATCH 121/337] wifi: cw1200: Fix potential NULL dereference A recent refactoring was identified by smatch to cause another potential NULL dereference: drivers/net/wireless/st/cw1200/cw1200_spi.c:440 cw1200_spi_disconnect() error: we previously assumed 'self' could be null (see line 433) Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411271742.Xa7CNVh1-lkp@intel.com/ Fixes: 2719a9e7156c ("wifi: cw1200: Convert to GPIO descriptors") Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241217-cw1200-fix-v1-1-911e6b5823ec@linaro.org --- drivers/net/wireless/st/cw1200/cw1200_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/st/cw1200/cw1200_spi.c b/drivers/net/wireless/st/cw1200/cw1200_spi.c index 862964a8cc87..52386dfb5f4a 100644 --- a/drivers/net/wireless/st/cw1200/cw1200_spi.c +++ b/drivers/net/wireless/st/cw1200/cw1200_spi.c @@ -442,8 +442,8 @@ static void cw1200_spi_disconnect(struct spi_device *func) cw1200_core_release(self->core); self->core = NULL; } + cw1200_spi_off(self, dev_get_platdata(&func->dev)); } - cw1200_spi_off(self, dev_get_platdata(&func->dev)); } static int __maybe_unused cw1200_spi_suspend(struct device *dev) From 458600da793da12e0f3724ecbea34a80703f4d5b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:47:48 -0500 Subject: [PATCH 122/337] drm/amdgpu/nbio7.7: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 22b9555bc90df22b585bdd1f161b61584b13af51) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index 1ac730328516..3fb6d2aa7e3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,7 +247,7 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 7, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); From 8c1ecc7197a88c6ae62de56e1c0887f220712a32 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:00:07 -0500 Subject: [PATCH 123/337] drm/amdgpu/nbio7.11: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 2c8eeaaa0fe5841ccf07a0eb51b1426f34ef39f7) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c index 814ab59fdd4a..41421da63a08 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c @@ -275,7 +275,7 @@ static void nbio_v7_11_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 11, 0): case IP_VERSION(7, 11, 1): case IP_VERSION(7, 11, 2): From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:03:20 -0500 Subject: [PATCH 124/337] drm/amdgpu/mmhub4.1: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c index 0fbc3be81f14..f2ab5001b492 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c @@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev, dev_err(adev->dev, "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n", status); - switch (adev->ip_versions[MMHUB_HWIP][0]) { + switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { case IP_VERSION(4, 1, 0): mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw]; break; From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:04:58 -0500 Subject: [PATCH 125/337] drm/amdgpu/gfx12: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index fe7c48f2fb2a..da327ab48a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -4123,7 +4123,7 @@ static int gfx_v12_0_set_clockgating_state(void *handle, if (amdgpu_sriov_vf(adev)) return 0; - switch (adev->ip_versions[GC_HWIP][0]) { + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): gfx_v12_0_update_gfx_clock_gating(adev, From 9e752ee26c1031312a01d2afc281f5f6fdfca176 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:06:26 -0500 Subject: [PATCH 126/337] drm/amdgpu/smu14.0.2: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 8f2cd1067afe68372a1723e05e19b68ed187676a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 6a565ce74d5b..5cad09c5f2ff 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -2096,7 +2096,7 @@ static int smu_v14_0_2_enable_gfx_features(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(14, 0, 2)) + if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 2)) return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnableAllSmuFeatures, FEATURE_PWR_GFX, NULL); else From 8d1a13816e59254bd3b18f5ae0895230922bd120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:29:18 +0100 Subject: [PATCH 127/337] drm/amdgpu: fix amdgpu_coredump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM pointer might already be outdated when that function is called. Use the PASID instead to gather the information instead. 
Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 57f812d171af4ba233d3ed7c94dfa5b8e92dcc04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index 946c48829f19..824f9da5b6ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -343,11 +343,10 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; - if (job && job->vm) { - struct amdgpu_vm *vm = job->vm; + if (job && job->pasid) { struct amdgpu_task_info *ti; - ti = amdgpu_vm_get_task_info_vm(vm); + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); if (ti) { coredump->reset_task_info = *ti; amdgpu_vm_put_task_info(ti); From 85230ee36d88e7a09fb062d43203035659dd10a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 17 Dec 2024 18:22:56 +0100 Subject: [PATCH 128/337] drm/amdgpu: Handle NULL bo->tbo.resource (again) in amdgpu_vm_bo_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third time's the charm, I hope? Fixes: d3116756a710 ("drm/ttm: rename bo->mem and make it a pointer") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3837 Reviewed-by: Christian König Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher (cherry picked from commit 695c2c745e5dff201b75da8a1d237ce403600d04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ddd7f05e4db9..c9c48b782ec1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1266,10 +1266,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, * next command submission. */ if (amdgpu_vm_is_bo_always_valid(vm, bo)) { - uint32_t mem_type = bo->tbo.resource->mem_type; - - if (!(bo->preferred_domains & - amdgpu_mem_type_to_domain(mem_type))) + if (bo->tbo.resource && + !(bo->preferred_domains & + amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type))) amdgpu_vm_bo_evicted(&bo_va->base); else amdgpu_vm_bo_idle(&bo_va->base); From c58a812c8e49ad688f94f4b050ad5c5b388fc5d2 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 18 Dec 2024 21:36:55 +0800 Subject: [PATCH 129/337] ring-buffer: Fix overflow in __rb_map_vma An overflow occurred when performing the following calculation: nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; Add a check before the calculation to avoid this problem. 
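
To illustrate with example numbers only: with subbuf_order == 0 and nr_subbufs == 2, the mapping spans 3 pages including the meta-page, so an mmap() offset of pgoff == 5 makes the (unsigned) subtraction wrap around to a huge nr_pages value; the later nr_vma_pages checks then pass and the mapping loop runs past the end of the subbuffer array, producing the out-of-bounds access reported below.
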
syzbot reported this as a slab-out-of-bounds in __rb_map_vma: BUG: KASAN: slab-out-of-bounds in __rb_map_vma+0x9ab/0xae0 kernel/trace/ring_buffer.c:7058 Read of size 8 at addr ffff8880767dd2b8 by task syz-executor187/5836 CPU: 0 UID: 0 PID: 5836 Comm: syz-executor187 Not tainted 6.13.0-rc2-syzkaller-00159-gf932fb9b4074 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 __rb_map_vma+0x9ab/0xae0 kernel/trace/ring_buffer.c:7058 ring_buffer_map+0x56e/0x9b0 kernel/trace/ring_buffer.c:7138 tracing_buffers_mmap+0xa6/0x120 kernel/trace/trace.c:8482 call_mmap include/linux/fs.h:2183 [inline] mmap_file mm/internal.h:124 [inline] __mmap_new_file_vma mm/vma.c:2291 [inline] __mmap_new_vma mm/vma.c:2355 [inline] __mmap_region+0x1786/0x2670 mm/vma.c:2456 mmap_region+0x127/0x320 mm/mmap.c:1348 do_mmap+0xc00/0xfc0 mm/mmap.c:496 vm_mmap_pgoff+0x1ba/0x360 mm/util.c:580 ksys_mmap_pgoff+0x32c/0x5c0 mm/mmap.c:542 __do_sys_mmap arch/x86/kernel/sys_x86_64.c:89 [inline] __se_sys_mmap arch/x86/kernel/sys_x86_64.c:82 [inline] __x64_sys_mmap+0x125/0x190 arch/x86/kernel/sys_x86_64.c:82 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The reproducer for this bug is: ------------------------8<------------------------- #include #include #include #include #include int main(int argc, char **argv) { int page_size = getpagesize(); int fd; void *meta; system("echo 1 > /sys/kernel/tracing/buffer_size_kb"); fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY); meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, page_size * 5); } ------------------------>8------------------------- Cc: stable@vger.kernel.org Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Link: https://lore.kernel.org/tencent_06924B6674ED771167C23CC336C097223609@qq.com Reported-by: syzbot+345e4443a21200874b18@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=345e4443a21200874b18 Signed-off-by: Edward Adam Davis Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7e257e855dd1..60210fb5b211 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7019,7 +7019,11 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ + if (nr_pages <= pgoff) + return -EINVAL; + + nr_pages -= pgoff; nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) From 8cd63406d08110c8098e1efda8aef7ddab4db348 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Dec 2024 14:15:07 -0500 Subject: [PATCH 130/337] trace/ring-buffer: Do not use TP_printk() formatting for boot mapped buffers The TP_printk() of a TRACE_EVENT() is a generic printf format that any developer can create for their event. It may include pointers to strings and such. 
A boot mapped buffer may contain data from a previous kernel where the strings addresses are different. One solution is to copy the event content and update the pointers by the recorded delta, but a simpler solution (for now) is to just use the print_fields() function to print these events. The print_fields() function just iterates the fields and prints them according to what type they are, and ignores the TP_printk() format from the event itself. To understand the difference, when printing via TP_printk() the output looks like this: 4582.696626: kmem_cache_alloc: call_site=getname_flags+0x47/0x1f0 ptr=00000000e70e10e0 bytes_req=4096 bytes_alloc=4096 gfp_flags=GFP_KERNEL node=-1 accounted=false 4582.696629: kmem_cache_alloc: call_site=alloc_empty_file+0x6b/0x110 ptr=0000000095808002 bytes_req=360 bytes_alloc=384 gfp_flags=GFP_KERNEL node=-1 accounted=false 4582.696630: kmem_cache_alloc: call_site=security_file_alloc+0x24/0x100 ptr=00000000576339c3 bytes_req=16 bytes_alloc=16 gfp_flags=GFP_KERNEL|__GFP_ZERO node=-1 accounted=false 4582.696653: kmem_cache_free: call_site=do_sys_openat2+0xa7/0xd0 ptr=00000000e70e10e0 name=names_cache But when printing via print_fields() (echo 1 > /sys/kernel/tracing/options/fields) the same event output looks like this: 4582.696626: kmem_cache_alloc: call_site=0xffffffff92d10d97 (-1831793257) ptr=0xffff9e0e8571e000 (-107689771147264) bytes_req=0x1000 (4096) bytes_alloc=0x1000 (4096) gfp_flags=0xcc0 (3264) node=0xffffffff (-1) accounted=(0) 4582.696629: kmem_cache_alloc: call_site=0xffffffff92d0250b (-1831852789) ptr=0xffff9e0e8577f800 (-107689770747904) bytes_req=0x168 (360) bytes_alloc=0x180 (384) gfp_flags=0xcc0 (3264) node=0xffffffff (-1) accounted=(0) 4582.696630: kmem_cache_alloc: call_site=0xffffffff92efca74 (-1829778828) ptr=0xffff9e0e8d35d3b0 (-107689640864848) bytes_req=0x10 (16) bytes_alloc=0x10 (16) gfp_flags=0xdc0 (3520) node=0xffffffff (-1) accounted=(0) 4582.696653: kmem_cache_free: call_site=0xffffffff92cfbea7 (-1831879001) ptr=0xffff9e0e8571e000 (-107689771147264) name=names_cache Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Link: https://lore.kernel.org/20241218141507.28389a1d@gandalf.local.home Fixes: 07714b4bb3f98 ("tracing: Handle old buffer mappings for event strings and functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814..6581cb2bc67f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4353,6 +4353,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); + /* + * For TRACE_EVENT() events, the print_fmt is not + * safe to use if the array has delta offsets + * Force printing via the fields. + */ + if ((tr->text_delta || tr->data_delta) && + event->type > __TRACE_LAST_TYPE) + return print_event_fields(iter, event); + return event->funcs->trace(iter, sym_flags, event); } From 0674188f2f4d38d74aa863f17373d76256f2ed09 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 17 Dec 2024 15:37:04 +0800 Subject: [PATCH 131/337] ACPI: EC: Enable EC support on LoongArch by default Commit a6021aa24f6417416d933 ("ACPI: EC: make EC support compile-time conditional") only enable ACPI_EC on X86 by default, but the embedded controller is also widely used on LoongArch laptops so we also enable ACPI_EC for LoongArch. 
The laptop driver cannot work without EC, so also update the dependency of LOONGSON_LAPTOP to let it depend on APCI_EC. Fixes: a6021aa24f6417416d933 ("ACPI: EC: make EC support compile-time conditional") Reported-by: Xiaotian Wu Tested-by: Binbin Zhou Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20241217073704.3339587-1-chenhuacai@loongson.cn [ rjw: Added Fixes: ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 4 ++-- drivers/platform/loongarch/Kconfig | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index d65cd08ba8e1..d81b55f5068c 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -135,10 +135,10 @@ config ACPI_REV_OVERRIDE_POSSIBLE config ACPI_EC bool "Embedded Controller" depends on HAS_IOPORT - default X86 + default X86 || LOONGARCH help This driver handles communication with the microcontroller - on many x86 laptops and other machines. + on many x86/LoongArch laptops and other machines. config ACPI_EC_DEBUGFS tristate "EC read/write access through /sys/kernel/debug/ec" diff --git a/drivers/platform/loongarch/Kconfig b/drivers/platform/loongarch/Kconfig index 5633e4d73991..447528797d07 100644 --- a/drivers/platform/loongarch/Kconfig +++ b/drivers/platform/loongarch/Kconfig @@ -18,7 +18,7 @@ if LOONGARCH_PLATFORM_DEVICES config LOONGSON_LAPTOP tristate "Generic Loongson-3 Laptop Driver" - depends on ACPI + depends on ACPI_EC depends on BACKLIGHT_CLASS_DEVICE depends on INPUT depends on MACH_LOONGSON64 From a7f9d98eb1202132014ba760c26ad8608ffc9caf Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Dec 2024 20:44:14 -0600 Subject: [PATCH 132/337] drm/amd: Update strapping for NBIO 2.5.0 This helps to avoid a spurious PME event on hotplug to Azalia. Cc: Vijendar Mukunda Reported-and-tested-by: ionut_n2001@yahoo.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215884 Tested-by: Gabriel Marcano Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211024414.7840-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 3f6f237b9dd189e1fb85b8a3f7c97a8f27c1e49a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index b1b57dcc5a73..49e953f86ced 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -271,8 +271,19 @@ const struct nbio_hdp_flush_reg nbio_v7_0_hdp_flush_reg = { .ref_and_mask_sdma1 = GPU_HDP_FLUSH_DONE__SDMA1_MASK, }; +#define regRCC_DEV0_EPF6_STRAP4 0xd304 +#define regRCC_DEV0_EPF6_STRAP4_BASE_IDX 5 + static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { + uint32_t data; + + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(2, 5, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); + break; + } } #define MMIO_REG_HOLE_OFFSET (0x80000 - PAGE_SIZE) From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:49:20 -0500 Subject: [PATCH 133/337] drm/amdgpu/nbio7.0: fix IP version check Use the helper function rather than reading it directly. 
Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index 49e953f86ced..d1032e9992b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { uint32_t data; - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(2, 5, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); From b3ded6072c5600704cfa3ce3a8dc8718d34bda66 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 16 Nov 2024 21:36:47 +0100 Subject: [PATCH 134/337] power: supply: bq24190: Fix BQ24296 Vbus regulator support There are 2 issues with bq24296_set_otg_vbus(): 1. When writing the OTG_CONFIG bit it uses POC_CHG_CONFIG_SHIFT which should be POC_OTG_CONFIG_SHIFT. 2. When turning the regulator off it never turns charging back on. Note this must be done through bq24190_charger_set_charge_type(), to ensure that the charge_type property value of none/trickle/fast is honored. Resolve both issues to fix BQ24296 Vbus regulator support not working. Fixes: b150a703b56f ("power: supply: bq24190_charger: Add support for BQ24296") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20241116203648.169100-2-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index 2b393eb5c282..c47f32f152e6 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -567,6 +567,7 @@ static int bq24190_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) { + union power_supply_propval val = { .intval = bdi->charge_type }; int ret; ret = pm_runtime_resume_and_get(bdi->dev); @@ -587,13 +588,18 @@ static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) ret = bq24190_write_mask(bdi, BQ24190_REG_POC, BQ24296_REG_POC_OTG_CONFIG_MASK, - BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, BQ24296_REG_POC_OTG_CONFIG_OTG); - } else + } else { ret = bq24190_write_mask(bdi, BQ24190_REG_POC, BQ24296_REG_POC_OTG_CONFIG_MASK, - BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, BQ24296_REG_POC_OTG_CONFIG_DISABLE); + if (ret < 0) + goto out; + + ret = bq24190_charger_set_charge_type(bdi, &val); + } out: pm_runtime_mark_last_busy(bdi->dev); From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: [PATCH 135/337] io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). 
This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- io_uring/io_uring.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b52..85fe4e6b275c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 605625e932eb..432b95ca9c85 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,6 +3214,7 @@ end_wait: void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } From dbf8be8218e7ff2ee2bbeebc91bf0e0c58a8c60b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 8 Nov 2024 13:57:06 +0000 Subject: [PATCH 136/337] docs/mm: add VMA locks documentation Locking around VMAs is complicated and confusing. While we have a number of disparate comments scattered around the place, we seem to be reaching a level of complexity that justifies a serious effort at clearly documenting how locks are expected to be used when it comes to interacting with mm_struct and vm_area_struct objects. This is especially pertinent as regards the efforts to find sensible abstractions for these fundamental objects in kernel rust code whose compiler strictly requires some means of expressing these rules (and through this expression, self-document these requirements as well as enforce them). The document limits scope to mmap and VMA locks and those that are immediately adjacent and relevant to them - so additionally covers page table locking as this is so very closely tied to VMA operations (and relies upon us handling these correctly). The document tries to cover some of the nastier and more confusing edge cases and concerns especially around lock ordering and page table teardown. The document is split between generally useful information for users of mm interfaces, and separately a section intended for mm kernel developers providing a discussion around internal implementation details. 
[lorenzo.stoakes@oracle.com: v3] Link: https://lkml.kernel.org/r/20241114205402.859737-1-lorenzo.stoakes@oracle.com [lorenzo.stoakes@oracle.com: docs/mm: minor corrections] Link: https://lkml.kernel.org/r/d3de735a-25ae-4eb2-866c-a9624fe6f795@lucifer.local [jannh@google.com: docs/mm: add more warnings around page table access] Link: https://lkml.kernel.org/r/20241118-vma-docs-addition1-onv3-v2-1-c9d5395b72ee@google.com Link: https://lkml.kernel.org/r/20241108135708.48567-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Qi Zheng Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Bagas Sanjaya Reviewed-by: Jann Horn Cc: Alice Ryhl Cc: Boqun Feng Cc: Hillf Danton Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 850 +++++++++++++++++++++++++++++ 1 file changed, 850 insertions(+) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index e8618fbc62c9..1d416658d7f5 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -3,3 +3,853 @@ ================= Process Addresses ================= + +.. toctree:: + :maxdepth: 3 + + +Userland memory ranges are tracked by the kernel via Virtual Memory Areas or +'VMA's of type :c:struct:`!struct vm_area_struct`. + +Each VMA describes a virtually contiguous memory range with identical +attributes, each described by a :c:struct:`!struct vm_area_struct` +object. Userland access outside of VMAs is invalid except in the case where an +adjacent stack VMA could be extended to contain the accessed address. + +All VMAs are contained within one and only one virtual address space, described +by a :c:struct:`!struct mm_struct` object which is referenced by all tasks (that is, +threads) which share the virtual address space. We refer to this as the +:c:struct:`!mm`. + +Each mm object contains a maple tree data structure which describes all VMAs +within the virtual address space. + +.. note:: An exception to this is the 'gate' VMA which is provided by + architectures which use :c:struct:`!vsyscall` and is a global static + object which does not belong to any specific mm. + +------- +Locking +------- + +The kernel is designed to be highly scalable against concurrent read operations +on VMA **metadata** so a complicated set of locks are required to ensure memory +corruption does not occur. + +.. note:: Locking VMAs for their metadata does not have any impact on the memory + they describe nor the page tables that map them. + +Terminology +----------- + +* **mmap locks** - Each MM has a read/write semaphore :c:member:`!mmap_lock` + which locks at a process address space granularity which can be acquired via + :c:func:`!mmap_read_lock`, :c:func:`!mmap_write_lock` and variants. +* **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves + as a read/write semaphore in practice. A VMA read lock is obtained via + :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a + write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked + automatically when the mmap write lock is released). To take a VMA write lock + you **must** have already acquired an :c:func:`!mmap_write_lock`. +* **rmap locks** - When trying to access VMAs through the reverse mapping via a + :c:struct:`!struct address_space` or :c:struct:`!struct anon_vma` object + (reachable from a folio via :c:member:`!folio->mapping`). 
VMAs must be stabilised via + :c:func:`!anon_vma_[try]lock_read` or :c:func:`!anon_vma_[try]lock_write` for + anonymous memory and :c:func:`!i_mmap_[try]lock_read` or + :c:func:`!i_mmap_[try]lock_write` for file-backed memory. We refer to these + locks as the reverse mapping locks, or 'rmap locks' for brevity. + +We discuss page table locks separately in the dedicated section below. + +The first thing **any** of these locks achieve is to **stabilise** the VMA +within the MM tree. That is, guaranteeing that the VMA object will not be +deleted from under you nor modified (except for some specific fields +described below). + +Stabilising a VMA also keeps the address space described by it around. + +Lock usage +---------- + +If you want to **read** VMA metadata fields or just keep the VMA stable, you +must do one of the following: + +* Obtain an mmap read lock at the MM granularity via :c:func:`!mmap_read_lock` (or a + suitable variant), unlocking it with a matching :c:func:`!mmap_read_unlock` when + you're done with the VMA, *or* +* Try to obtain a VMA read lock via :c:func:`!lock_vma_under_rcu`. This tries to + acquire the lock atomically so might fail, in which case fall-back logic is + required to instead obtain an mmap read lock if this returns :c:macro:`!NULL`, + *or* +* Acquire an rmap lock before traversing the locked interval tree (whether + anonymous or file-backed) to obtain the required VMA. + +If you want to **write** VMA metadata fields, then things vary depending on the +field (we explore each VMA field in detail below). For the majority you must: + +* Obtain an mmap write lock at the MM granularity via :c:func:`!mmap_write_lock` (or a + suitable variant), unlocking it with a matching :c:func:`!mmap_write_unlock` when + you're done with the VMA, *and* +* Obtain a VMA write lock via :c:func:`!vma_start_write` for each VMA you wish to + modify, which will be released automatically when :c:func:`!mmap_write_unlock` is + called. +* If you want to be able to write to **any** field, you must also hide the VMA + from the reverse mapping by obtaining an **rmap write lock**. + +VMA locks are special in that you must obtain an mmap **write** lock **first** +in order to obtain a VMA **write** lock. A VMA **read** lock however can be +obtained without any other lock (:c:func:`!lock_vma_under_rcu` will acquire then +release an RCU lock to lookup the VMA for you). + +This constrains the impact of writers on readers, as a writer can interact with +one VMA while a reader interacts with another simultaneously. + +.. note:: The primary users of VMA read locks are page fault handlers, which + means that without a VMA write lock, page faults will run concurrent with + whatever you are doing. + +Examining all valid lock states: + +.. table:: + + ========= ======== ========= ======= ===== =========== ========== + mmap lock VMA lock rmap lock Stable? Read? Write most? Write all? + ========= ======== ========= ======= ===== =========== ========== + \- \- \- N N N N + \- R \- Y Y N N + \- \- R/W Y Y N N + R/W \-/R \-/R/W Y Y N N + W W \-/R Y Y Y N + W W W Y Y Y Y + ========= ======== ========= ======= ===== =========== ========== + +.. warning:: While it's possible to obtain a VMA lock while holding an mmap read lock, + attempting to do the reverse is invalid as it can result in deadlock - if + another task already holds an mmap write lock and attempts to acquire a VMA + write lock that will deadlock on the VMA read lock. 
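+
+Putting the read-side rules above together, a minimal sketch of the fallback
+pattern (``mm`` and ``addr`` are placeholders, error handling is omitted):
+
+.. code-block:: c
+
+   struct vm_area_struct *vma;
+
+   /* Optimistically try the per-VMA read lock first. */
+   vma = lock_vma_under_rcu(mm, addr);
+   if (vma) {
+           /* ... read VMA metadata ... */
+           vma_end_read(vma);
+   } else {
+           /* The optimistic lock was not available - fall back. */
+           mmap_read_lock(mm);
+           vma = vma_lookup(mm, addr);
+           if (vma) {
+                   /* ... read VMA metadata ... */
+           }
+           mmap_read_unlock(mm);
+   }
+
+Because the optimistic path can fail at any time, the mmap read lock fallback
+must always be available.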
+ +All of these locks behave as read/write semaphores in practice, so you can +obtain either a read or a write lock for each of these. + +.. note:: Generally speaking, a read/write semaphore is a class of lock which + permits concurrent readers. However a write lock can only be obtained + once all readers have left the critical region (and pending readers + made to wait). + + This renders read locks on a read/write semaphore concurrent with other + readers and write locks exclusive against all others holding the semaphore. + +VMA fields +^^^^^^^^^^ + +We can subdivide :c:struct:`!struct vm_area_struct` fields by their purpose, which makes it +easier to explore their locking characteristics: + +.. note:: We exclude VMA lock-specific fields here to avoid confusion, as these + are in effect an internal implementation detail. + +.. table:: Virtual layout fields + + ===================== ======================================== =========== + Field Description Write lock + ===================== ======================================== =========== + :c:member:`!vm_start` Inclusive start virtual address of range mmap write, + VMA describes. VMA write, + rmap write. + :c:member:`!vm_end` Exclusive end virtual address of range mmap write, + VMA describes. VMA write, + rmap write. + :c:member:`!vm_pgoff` Describes the page offset into the file, mmap write, + the original page offset within the VMA write, + virtual address space (prior to any rmap write. + :c:func:`!mremap`), or PFN if a PFN map + and the architecture does not support + :c:macro:`!CONFIG_ARCH_HAS_PTE_SPECIAL`. + ===================== ======================================== =========== + +These fields describes the size, start and end of the VMA, and as such cannot be +modified without first being hidden from the reverse mapping since these fields +are used to locate VMAs within the reverse mapping interval trees. + +.. table:: Core fields + + ============================ ======================================== ========================= + Field Description Write lock + ============================ ======================================== ========================= + :c:member:`!vm_mm` Containing mm_struct. None - written once on + initial map. + :c:member:`!vm_page_prot` Architecture-specific page table mmap write, VMA write. + protection bits determined from VMA + flags. + :c:member:`!vm_flags` Read-only access to VMA flags describing N/A + attributes of the VMA, in union with + private writable + :c:member:`!__vm_flags`. + :c:member:`!__vm_flags` Private, writable access to VMA flags mmap write, VMA write. + field, updated by + :c:func:`!vm_flags_*` functions. + :c:member:`!vm_file` If the VMA is file-backed, points to a None - written once on + struct file object describing the initial map. + underlying file, if anonymous then + :c:macro:`!NULL`. + :c:member:`!vm_ops` If the VMA is file-backed, then either None - Written once on + the driver or file-system provides a initial map by + :c:struct:`!struct vm_operations_struct` :c:func:`!f_ops->mmap()`. + object describing callbacks to be + invoked on VMA lifetime events. + :c:member:`!vm_private_data` A :c:member:`!void *` field for Handled by driver. + driver-specific metadata. + ============================ ======================================== ========================= + +These are the core fields which describe the MM the VMA belongs to and its attributes. + +.. 
table:: Config-specific fields + + ================================= ===================== ======================================== =============== + Field Configuration option Description Write lock + ================================= ===================== ======================================== =============== + :c:member:`!anon_name` CONFIG_ANON_VMA_NAME A field for storing a mmap write, + :c:struct:`!struct anon_vma_name` VMA write. + object providing a name for anonymous + mappings, or :c:macro:`!NULL` if none + is set or the VMA is file-backed. The + underlying object is reference counted + and can be shared across multiple VMAs + for scalability. + :c:member:`!swap_readahead_info` CONFIG_SWAP Metadata used by the swap mechanism mmap read, + to perform readahead. This field is swap-specific + accessed atomically. lock. + :c:member:`!vm_policy` CONFIG_NUMA :c:type:`!mempolicy` object which mmap write, + describes the NUMA behaviour of the VMA write. + VMA. The underlying object is reference + counted. + :c:member:`!numab_state` CONFIG_NUMA_BALANCING :c:type:`!vma_numab_state` object which mmap read, + describes the current state of numab-specific + NUMA balancing in relation to this VMA. lock. + Updated under mmap read lock by + :c:func:`!task_numa_work`. + :c:member:`!vm_userfaultfd_ctx` CONFIG_USERFAULTFD Userfaultfd context wrapper object of mmap write, + type :c:type:`!vm_userfaultfd_ctx`, VMA write. + either of zero size if userfaultfd is + disabled, or containing a pointer + to an underlying + :c:type:`!userfaultfd_ctx` object which + describes userfaultfd metadata. + ================================= ===================== ======================================== =============== + +These fields are present or not depending on whether the relevant kernel +configuration option is set. + +.. table:: Reverse mapping fields + + =================================== ========================================= ============================ + Field Description Write lock + =================================== ========================================= ============================ + :c:member:`!shared.rb` A red/black tree node used, if the mmap write, VMA write, + mapping is file-backed, to place the VMA i_mmap write. + in the + :c:member:`!struct address_space->i_mmap` + red/black interval tree. + :c:member:`!shared.rb_subtree_last` Metadata used for management of the mmap write, VMA write, + interval tree if the VMA is file-backed. i_mmap write. + :c:member:`!anon_vma_chain` List of pointers to both forked/CoW’d mmap read, anon_vma write. + :c:type:`!anon_vma` objects and + :c:member:`!vma->anon_vma` if it is + non-:c:macro:`!NULL`. + :c:member:`!anon_vma` :c:type:`!anon_vma` object used by When :c:macro:`NULL` and + anonymous folios mapped exclusively to setting non-:c:macro:`NULL`: + this VMA. Initially set by mmap read, page_table_lock. + :c:func:`!anon_vma_prepare` serialised + by the :c:macro:`!page_table_lock`. This When non-:c:macro:`NULL` and + is set as soon as any page is faulted in. setting :c:macro:`NULL`: + mmap write, VMA write, + anon_vma write. + =================================== ========================================= ============================ + +These fields are used to both place the VMA within the reverse mapping, and for +anonymous mappings, to be able to access both related :c:struct:`!struct anon_vma` objects +and the :c:struct:`!struct anon_vma` in which folios mapped exclusively to this VMA should +reside. + +.. 
note:: If a file-backed mapping is mapped with :c:macro:`!MAP_PRIVATE` set + then it can be in both the :c:type:`!anon_vma` and :c:type:`!i_mmap` + trees at the same time, so all of these fields might be utilised at + once. + +Page tables +----------- + +We won't speak exhaustively on the subject but broadly speaking, page tables map +virtual addresses to physical ones through a series of page tables, each of +which contain entries with physical addresses for the next page table level +(along with flags), and at the leaf level the physical addresses of the +underlying physical data pages or a special entry such as a swap entry, +migration entry or other special marker. Offsets into these pages are provided +by the virtual address itself. + +In Linux these are divided into five levels - PGD, P4D, PUD, PMD and PTE. Huge +pages might eliminate one or two of these levels, but when this is the case we +typically refer to the leaf level as the PTE level regardless. + +.. note:: In instances where the architecture supports fewer page tables than + five the kernel cleverly 'folds' page table levels, that is stubbing + out functions related to the skipped levels. This allows us to + conceptually act as if there were always five levels, even if the + compiler might, in practice, eliminate any code relating to missing + ones. + +There are four key operations typically performed on page tables: + +1. **Traversing** page tables - Simply reading page tables in order to traverse + them. This only requires that the VMA is kept stable, so a lock which + establishes this suffices for traversal (there are also lockless variants + which eliminate even this requirement, such as :c:func:`!gup_fast`). +2. **Installing** page table mappings - Whether creating a new mapping or + modifying an existing one in such a way as to change its identity. This + requires that the VMA is kept stable via an mmap or VMA lock (explicitly not + rmap locks). +3. **Zapping/unmapping** page table entries - This is what the kernel calls + clearing page table mappings at the leaf level only, whilst leaving all page + tables in place. This is a very common operation in the kernel performed on + file truncation, the :c:macro:`!MADV_DONTNEED` operation via + :c:func:`!madvise`, and others. This is performed by a number of functions + including :c:func:`!unmap_mapping_range` and :c:func:`!unmap_mapping_pages`. + The VMA need only be kept stable for this operation. +4. **Freeing** page tables - When finally the kernel removes page tables from a + userland process (typically via :c:func:`!free_pgtables`) extreme care must + be taken to ensure this is done safely, as this logic finally frees all page + tables in the specified range, ignoring existing leaf entries (it assumes the + caller has both zapped the range and prevented any further faults or + modifications within it). + +.. note:: Modifying mappings for reclaim or migration is performed under rmap + lock as it, like zapping, does not fundamentally modify the identity + of what is being mapped. + +**Traversing** and **zapping** ranges can be performed holding any one of the +locks described in the terminology section above - that is the mmap lock, the +VMA lock or either of the reverse mapping locks. 
+ +That is - as long as you keep the relevant VMA **stable** - you are good to go +ahead and perform these operations on page tables (though internally, kernel +operations that perform writes also acquire internal page table locks to +serialise - see the page table implementation detail section for more details). + +When **installing** page table entries, the mmap or VMA lock must be held to +keep the VMA stable. We explore why this is in the page table locking details +section below. + +.. warning:: Page tables are normally only traversed in regions covered by VMAs. + If you want to traverse page tables in areas that might not be + covered by VMAs, heavier locking is required. + See :c:func:`!walk_page_range_novma` for details. + +**Freeing** page tables is an entirely internal memory management operation and +has special requirements (see the page freeing section below for more details). + +.. warning:: When **freeing** page tables, it must not be possible for VMAs + containing the ranges those page tables map to be accessible via + the reverse mapping. + + The :c:func:`!free_pgtables` function removes the relevant VMAs + from the reverse mappings, but no other VMAs can be permitted to be + accessible and span the specified range. + +Lock ordering +------------- + +As we have multiple locks across the kernel which may or may not be taken at the +same time as explicit mm or VMA locks, we have to be wary of lock inversion, and +the **order** in which locks are acquired and released becomes very important. + +.. note:: Lock inversion occurs when two threads need to acquire multiple locks, + but in doing so inadvertently cause a mutual deadlock. + + For example, consider thread 1 which holds lock A and tries to acquire lock B, + while thread 2 holds lock B and tries to acquire lock A. + + Both threads are now deadlocked on each other. However, had they attempted to + acquire locks in the same order, one would have waited for the other to + complete its work and no deadlock would have occurred. + +The opening comment in :c:macro:`!mm/rmap.c` describes in detail the required +ordering of locks within memory management code: + +.. code-block:: + + inode->i_rwsem (while writing or truncating, not reading or faulting) + mm->mmap_lock + mapping->invalidate_lock (in filemap_fault) + folio_lock + hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) + vma_start_write + mapping->i_mmap_rwsem + anon_vma->rwsem + mm->page_table_lock or pte_lock + swap_lock (in swap_duplicate, swap_info_get) + mmlist_lock (in mmput, drain_mmlist and others) + mapping->private_lock (in block_dirty_folio) + i_pages lock (widely used) + lruvec->lru_lock (in folio_lruvec_lock_irq) + inode->i_lock (in set_page_dirty's __mark_inode_dirty) + bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + sb_lock (within inode_lock in fs/fs-writeback.c) + i_pages lock (widely used, in set_page_dirty, + in arch-dependent flush_dcache_mmap_lock, + within bdi.wb->list_lock in __sync_single_inode) + +There is also a file-system specific lock ordering comment located at the top of +:c:macro:`!mm/filemap.c`: + +.. 
code-block:: + + ->i_mmap_rwsem (truncate_pagecache) + ->private_lock (__free_pte->block_dirty_folio) + ->swap_lock (exclusive_swap_page, others) + ->i_pages lock + + ->i_rwsem + ->invalidate_lock (acquired by fs in truncate path) + ->i_mmap_rwsem (truncate->unmap_mapping_range) + + ->mmap_lock + ->i_mmap_rwsem + ->page_table_lock or pte_lock (various, mainly in memory.c) + ->i_pages lock (arch-dependent flush_dcache_mmap_lock) + + ->mmap_lock + ->invalidate_lock (filemap_fault) + ->lock_page (filemap_fault, access_process_vm) + + ->i_rwsem (generic_perform_write) + ->mmap_lock (fault_in_readable->do_page_fault) + + bdi->wb.list_lock + sb_lock (fs/fs-writeback.c) + ->i_pages lock (__sync_single_inode) + + ->i_mmap_rwsem + ->anon_vma.lock (vma_merge) + + ->anon_vma.lock + ->page_table_lock or pte_lock (anon_vma_prepare and various) + + ->page_table_lock or pte_lock + ->swap_lock (try_to_unmap_one) + ->private_lock (try_to_unmap_one) + ->i_pages lock (try_to_unmap_one) + ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) + ->private_lock (folio_remove_rmap_pte->set_page_dirty) + ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) + bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) + ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) + bdi.wb->list_lock (zap_pte_range->set_page_dirty) + ->inode->i_lock (zap_pte_range->set_page_dirty) + ->private_lock (zap_pte_range->block_dirty_folio) + +Please check the current state of these comments which may have changed since +the time of writing of this document. + +------------------------------ +Locking Implementation Details +------------------------------ + +.. warning:: Locking rules for PTE-level page tables are very different from + locking rules for page tables at other levels. + +Page table locking details +-------------------------- + +In addition to the locks described in the terminology section above, we have +additional locks dedicated to page tables: + +* **Higher level page table locks** - Higher level page tables, that is PGD, P4D + and PUD each make use of the process address space granularity + :c:member:`!mm->page_table_lock` lock when modified. + +* **Fine-grained page table locks** - PMDs and PTEs each have fine-grained locks + either kept within the folios describing the page tables or allocated + separated and pointed at by the folios if :c:macro:`!ALLOC_SPLIT_PTLOCKS` is + set. The PMD spin lock is obtained via :c:func:`!pmd_lock`, however PTEs are + mapped into higher memory (if a 32-bit system) and carefully locked via + :c:func:`!pte_offset_map_lock`. + +These locks represent the minimum required to interact with each page table +level, but there are further requirements. + +Importantly, note that on a **traversal** of page tables, sometimes no such +locks are taken. However, at the PTE level, at least concurrent page table +deletion must be prevented (using RCU) and the page table must be mapped into +high memory, see below. + +Whether care is taken on reading the page table entries depends on the +architecture, see the section on atomicity below. + +Locking rules +^^^^^^^^^^^^^ + +We establish basic locking rules when interacting with page tables: + +* When changing a page table entry the page table lock for that page table + **must** be held, except if you can safely assume nobody can access the page + tables concurrently (such as on invocation of :c:func:`!free_pgtables`). 
+* Reads from and writes to page table entries must be *appropriately* + atomic. See the section on atomicity below for details. +* Populating previously empty entries requires that the mmap or VMA locks are + held (read or write), doing so with only rmap locks would be dangerous (see + the warning below). +* As mentioned previously, zapping can be performed while simply keeping the VMA + stable, that is holding any one of the mmap, VMA or rmap locks. + +.. warning:: Populating previously empty entries is dangerous as, when unmapping + VMAs, :c:func:`!vms_clear_ptes` has a window of time between + zapping (via :c:func:`!unmap_vmas`) and freeing page tables (via + :c:func:`!free_pgtables`), where the VMA is still visible in the + rmap tree. :c:func:`!free_pgtables` assumes that the zap has + already been performed and removes PTEs unconditionally (along with + all other page tables in the freed range), so installing new PTE + entries could leak memory and also cause other unexpected and + dangerous behaviour. + +There are additional rules applicable when moving page tables, which we discuss +in the section on this topic below. + +PTE-level page tables are different from page tables at other levels, and there +are extra requirements for accessing them: + +* On 32-bit architectures, they may be in high memory (meaning they need to be + mapped into kernel memory to be accessible). +* When empty, they can be unlinked and RCU-freed while holding an mmap lock or + rmap lock for reading in combination with the PTE and PMD page table locks. + In particular, this happens in :c:func:`!retract_page_tables` when handling + :c:macro:`!MADV_COLLAPSE`. + So accessing PTE-level page tables requires at least holding an RCU read lock; + but that only suffices for readers that can tolerate racing with concurrent + page table updates such that an empty PTE is observed (in a page table that + has actually already been detached and marked for RCU freeing) while another + new page table has been installed in the same location and filled with + entries. Writers normally need to take the PTE lock and revalidate that the + PMD entry still refers to the same PTE-level page table. + +To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or +:c:func:`!pte_offset_map` can be used depending on stability requirements. +These map the page table into kernel memory if required, take the RCU lock, and +depending on variant, may also look up or acquire the PTE lock. +See the comment on :c:func:`!__pte_offset_map_lock`. + +Atomicity +^^^^^^^^^ + +Regardless of page table locks, the MMU hardware concurrently updates accessed +and dirty bits (perhaps more, depending on architecture). Additionally, page +table traversal operations in parallel (though holding the VMA stable) and +functionality like GUP-fast locklessly traverses (that is reads) page tables, +without even keeping the VMA stable at all. + +When performing a page table traversal and keeping the VMA stable, whether a +read must be performed once and only once or not depends on the architecture +(for instance x86-64 does not require any special precautions). + +If a write is being performed, or if a read informs whether a write takes place +(on an installation of a page table entry say, for instance in +:c:func:`!__pud_install`), special care must always be taken. In these cases we +can never assume that page table locks give us entirely exclusive access, and +must retrieve page table entries once and only once. 
+ +If we are reading page table entries, then we need only ensure that the compiler +does not rearrange our loads. This is achieved via :c:func:`!pXXp_get` +functions - :c:func:`!pgdp_get`, :c:func:`!p4dp_get`, :c:func:`!pudp_get`, +:c:func:`!pmdp_get`, and :c:func:`!ptep_get`. + +Each of these uses :c:func:`!READ_ONCE` to guarantee that the compiler reads +the page table entry only once. + +However, if we wish to manipulate an existing page table entry and care about +the previously stored data, we must go further and use an hardware atomic +operation as, for example, in :c:func:`!ptep_get_and_clear`. + +Equally, operations that do not rely on the VMA being held stable, such as +GUP-fast (see :c:func:`!gup_fast` and its various page table level handlers like +:c:func:`!gup_fast_pte_range`), must very carefully interact with page table +entries, using functions such as :c:func:`!ptep_get_lockless` and equivalent for +higher level page table levels. + +Writes to page table entries must also be appropriately atomic, as established +by :c:func:`!set_pXX` functions - :c:func:`!set_pgd`, :c:func:`!set_p4d`, +:c:func:`!set_pud`, :c:func:`!set_pmd`, and :c:func:`!set_pte`. + +Equally functions which clear page table entries must be appropriately atomic, +as in :c:func:`!pXX_clear` functions - :c:func:`!pgd_clear`, +:c:func:`!p4d_clear`, :c:func:`!pud_clear`, :c:func:`!pmd_clear`, and +:c:func:`!pte_clear`. + +Page table installation +^^^^^^^^^^^^^^^^^^^^^^^ + +Page table installation is performed with the VMA held stable explicitly by an +mmap or VMA lock in read or write mode (see the warning in the locking rules +section for details as to why). + +When allocating a P4D, PUD or PMD and setting the relevant entry in the above +PGD, P4D or PUD, the :c:member:`!mm->page_table_lock` must be held. This is +acquired in :c:func:`!__p4d_alloc`, :c:func:`!__pud_alloc` and +:c:func:`!__pmd_alloc` respectively. + +.. note:: :c:func:`!__pmd_alloc` actually invokes :c:func:`!pud_lock` and + :c:func:`!pud_lockptr` in turn, however at the time of writing it ultimately + references the :c:member:`!mm->page_table_lock`. + +Allocating a PTE will either use the :c:member:`!mm->page_table_lock` or, if +:c:macro:`!USE_SPLIT_PMD_PTLOCKS` is defined, a lock embedded in the PMD +physical page metadata in the form of a :c:struct:`!struct ptdesc`, acquired by +:c:func:`!pmd_ptdesc` called from :c:func:`!pmd_lock` and ultimately +:c:func:`!__pte_alloc`. + +Finally, modifying the contents of the PTE requires special treatment, as the +PTE page table lock must be acquired whenever we want stable and exclusive +access to entries contained within a PTE, especially when we wish to modify +them. + +This is performed via :c:func:`!pte_offset_map_lock` which carefully checks to +ensure that the PTE hasn't changed from under us, ultimately invoking +:c:func:`!pte_lockptr` to obtain a spin lock at PTE granularity contained within +the :c:struct:`!struct ptdesc` associated with the physical PTE page. The lock +must be released via :c:func:`!pte_unmap_unlock`. + +.. note:: There are some variants on this, such as + :c:func:`!pte_offset_map_rw_nolock` when we know we hold the PTE stable but + for brevity we do not explore this. See the comment for + :c:func:`!__pte_offset_map_lock` for more details. 
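+
+As a brief sketch of the above (``mm``, ``pmd`` and ``addr`` are placeholders,
+and the VMA is assumed to already be held stable), a leaf-level update might be
+structured like this:
+
+.. code-block:: c
+
+   spinlock_t *ptl;
+   pte_t *ptep, entry;
+
+   ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+   if (ptep) {
+           /* The PTE lock is held - the entry is stable and exclusive. */
+           entry = ptep_get(ptep);
+           /* ... examine entry, update it via the set/clear helpers above ... */
+           pte_unmap_unlock(ptep, ptl);
+   }
+   /* A NULL return means the PTE table was concurrently removed - retry or fail. */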
+ +When modifying data in ranges we typically only wish to allocate higher page +tables as necessary, using these locks to avoid races or overwriting anything, +and set/clear data at the PTE level as required (for instance when page faulting +or zapping). + +A typical pattern taken when traversing page table entries to install a new +mapping is to optimistically determine whether the page table entry in the table +above is empty, if so, only then acquiring the page table lock and checking +again to see if it was allocated underneath us. + +This allows for a traversal with page table locks only being taken when +required. An example of this is :c:func:`!__pud_alloc`. + +At the leaf page table, that is the PTE, we can't entirely rely on this pattern +as we have separate PMD and PTE locks and a THP collapse for instance might have +eliminated the PMD entry as well as the PTE from under us. + +This is why :c:func:`!__pte_offset_map_lock` locklessly retrieves the PMD entry +for the PTE, carefully checking it is as expected, before acquiring the +PTE-specific lock, and then *again* checking that the PMD entry is as expected. + +If a THP collapse (or similar) were to occur then the lock on both pages would +be acquired, so we can ensure this is prevented while the PTE lock is held. + +Installing entries this way ensures mutual exclusion on write. + +Page table freeing +^^^^^^^^^^^^^^^^^^ + +Tearing down page tables themselves is something that requires significant +care. There must be no way that page tables designated for removal can be +traversed or referenced by concurrent tasks. + +It is insufficient to simply hold an mmap write lock and VMA lock (which will +prevent racing faults, and rmap operations), as a file-backed mapping can be +truncated under the :c:struct:`!struct address_space->i_mmap_rwsem` alone. + +As a result, no VMA which can be accessed via the reverse mapping (either +through the :c:struct:`!struct anon_vma->rb_root` or the :c:member:`!struct +address_space->i_mmap` interval trees) can have its page tables torn down. + +The operation is typically performed via :c:func:`!free_pgtables`, which assumes +either the mmap write lock has been taken (as specified by its +:c:member:`!mm_wr_locked` parameter), or that the VMA is already unreachable. + +It carefully removes the VMA from all reverse mappings, however it's important +that no new ones overlap these or any route remain to permit access to addresses +within the range whose page tables are being torn down. + +Additionally, it assumes that a zap has already been performed and steps have +been taken to ensure that no further page table entries can be installed between +the zap and the invocation of :c:func:`!free_pgtables`. + +Since it is assumed that all such steps have been taken, page table entries are +cleared without page table locks (in the :c:func:`!pgd_clear`, :c:func:`!p4d_clear`, +:c:func:`!pud_clear`, and :c:func:`!pmd_clear` functions. + +.. note:: It is possible for leaf page tables to be torn down independent of + the page tables above it as is done by + :c:func:`!retract_page_tables`, which is performed under the i_mmap + read lock, PMD, and PTE page table locks, without this level of care. + +Page table moving +^^^^^^^^^^^^^^^^^ + +Some functions manipulate page table levels above PMD (that is PUD, P4D and PGD +page tables). Most notable of these is :c:func:`!mremap`, which is capable of +moving higher level page tables. 
+ +In these instances, it is required that **all** locks are taken, that is +the mmap lock, the VMA lock and the relevant rmap locks. + +You can observe this in the :c:func:`!mremap` implementation in the functions +:c:func:`!take_rmap_locks` and :c:func:`!drop_rmap_locks` which perform the rmap +side of lock acquisition, invoked ultimately by :c:func:`!move_page_tables`. + +VMA lock internals +------------------ + +Overview +^^^^^^^^ + +VMA read locking is entirely optimistic - if the lock is contended or a competing +write has started, then we do not obtain a read lock. + +A VMA **read** lock is obtained by :c:func:`!lock_vma_under_rcu`, which first +calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU +critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, +before releasing the RCU lock via :c:func:`!rcu_read_unlock`. + +VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for +their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it +via :c:func:`!vma_end_read`. + +VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a +VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always +acquired. An mmap write lock **must** be held for the duration of the VMA write +lock, releasing or downgrading the mmap write lock also releases the VMA write +lock so there is no :c:func:`!vma_end_write` function. + +Note that a semaphore write lock is not held across a VMA lock. Rather, a +sequence number is used for serialisation, and the write semaphore is only +acquired at the point of write lock to update this. + +This ensures the semantics we require - VMA write locks provide exclusive write +access to the VMA. + +Implementation details +^^^^^^^^^^^^^^^^^^^^^^ + +The VMA lock mechanism is designed to be a lightweight means of avoiding the use +of the heavily contended mmap lock. It is implemented using a combination of a +read/write semaphore and sequence numbers belonging to the containing +:c:struct:`!struct mm_struct` and the VMA. + +Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic +operation, i.e. it tries to acquire a read lock but returns false if it is +unable to do so. At the end of the read operation, :c:func:`!vma_end_read` is +called to release the VMA read lock. + +Invoking :c:func:`!vma_start_read` requires that :c:func:`!rcu_read_lock` has +been called first, establishing that we are in an RCU critical section upon VMA +read lock acquisition. Once acquired, the RCU lock can be released as it is only +required for lookup. This is abstracted by :c:func:`!lock_vma_under_rcu` which +is the interface a user should use. + +Writing requires the mmap to be write-locked and the VMA lock to be acquired via +:c:func:`!vma_start_write`, however the write lock is released by the termination or +downgrade of the mmap write lock so no :c:func:`!vma_end_write` is required. + +All this is achieved by the use of per-mm and per-VMA sequence counts, which are +used in order to reduce complexity, especially for operations which write-lock +multiple VMAs at once. + +If the mm sequence count, :c:member:`!mm->mm_lock_seq` is equal to the VMA +sequence count :c:member:`!vma->vm_lock_seq` then the VMA is write-locked. If +they differ, then it is not. 
+
+Each time the mmap write lock is released in :c:func:`!mmap_write_unlock` or :c:func:`!mmap_write_downgrade`, :c:func:`!vma_end_write_all` is invoked which also increments :c:member:`!mm->mm_lock_seq` via :c:func:`!mm_lock_seqcount_end`.
+
+This way, we ensure that, regardless of the VMA's sequence number, a write lock is never incorrectly indicated and that when we release an mmap write lock we efficiently release **all** VMA write locks contained within the mmap at the same time.
+
+Since the mmap write lock is exclusive against others who hold it, the automatic release of any VMA locks on its release makes sense, as you would never want to keep VMAs locked across entirely separate write operations. It also maintains correct lock ordering.
+
+Each time a VMA read lock is acquired, we acquire a read lock on the :c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that the sequence count of the VMA does not match that of the mm.
+
+If it does, the read lock fails. If it does not, we hold the lock, excluding writers, but permitting other readers, who will also obtain this lock under RCU.
+
+Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` are also RCU safe, so the whole read lock operation is guaranteed to function correctly.
+
+On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` read/write semaphore, before setting the VMA's sequence number under this lock, also simultaneously holding the mmap write lock.
+
+This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep until these are finished and mutual exclusion is achieved.
+
+After setting the VMA's sequence number, the lock is released, avoiding complexity with a long-term held write lock.
+
+This clever combination of a read/write semaphore and sequence count allows for fast RCU-based per-VMA lock acquisition (especially on page fault, though utilised elsewhere) with minimal complexity around lock ordering.
+
+mmap write lock downgrading
+---------------------------
+
+When an mmap write lock is held one has exclusive access to resources within the mmap (with the usual caveats about requiring VMA write locks to avoid races with tasks holding VMA read locks).
+
+It is then possible to **downgrade** from a write lock to a read lock via :c:func:`!mmap_write_downgrade` which, similar to :c:func:`!mmap_write_unlock`, implicitly terminates all VMA write locks via :c:func:`!vma_end_write_all`, but importantly does not relinquish the mmap lock while downgrading, therefore keeping the locked virtual address space stable.
+
+An interesting consequence of this is that downgraded locks are exclusive against any other task possessing a downgraded lock (since a racing task would have to acquire a write lock first to downgrade it, and the downgraded lock prevents a new write lock from being obtained until the original lock is released).
+
+For clarity, we map read (R)/downgraded write (D)/write (W) locks against one another showing which locks exclude the others:
+
+.. list-table:: Lock exclusivity
+   :widths: 5 5 5 5
+   :header-rows: 1
+   :stub-columns: 1
+
+   * -
+     - R
+     - D
+     - W
+   * - R
+     - N
+     - N
+     - Y
+   * - D
+     - N
+     - Y
+     - Y
+   * - W
+     - Y
+     - Y
+     - Y
+
+Here a Y indicates the locks in the matching row/column are mutually exclusive, and N indicates that they are not.
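As a usage sketch of the downgrade pattern described above (the lock calls are the real mmap lock API; the work in between is hypothetical):

    mmap_write_lock(mm);
    /* ... modify the address space; VMA write locks are taken via vma_start_write() ... */
    mmap_write_downgrade(mm);  /* ends all VMA write locks, keeps the mmap lock held for read */
    /* ... read-only work over a now-stable set of VMAs ... */
    mmap_read_unlock(mm);      /* a downgraded lock is released as a read lock */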
+ +Stack expansion +--------------- + +Stack expansion throws up additional complexities in that we cannot permit there +to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to +prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`. From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Thu, 5 Dec 2024 11:29:41 -0800 Subject: [PATCH 137/337] selftests/memfd: run sysctl tests when PID namespace support is enabled The sysctl tests for vm.memfd_noexec rely on the kernel to support PID namespaces (i.e. the kernel is built with CONFIG_PID_NS=y). If the kernel the test runs on does not support PID namespaces, the first sysctl test will fail when attempting to spawn a new thread in a new PID namespace, abort the test, preventing the remaining tests from being run. This is not desirable, as not all kernels need PID namespaces, but can still use the other features provided by memfd. Therefore, only run the sysctl tests if the kernel supports PID namespaces. Otherwise, skip those tests and emit an informative message to let the user know why the sysctl tests are not being run. Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC") Signed-off-by: Isaac J. Manjarres Reviewed-by: Jeff Xu Cc: Suren Baghdasaryan Cc: Kalesh Singh Cc: [6.6+] Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 95af2d78fd31..0a0b55516028 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner, char *b_suffix) close(fd); } +static bool pid_ns_supported(void) +{ + return access("/proc/self/ns/pid", F_OK) == 0; +} + int main(int argc, char **argv) { pid_t pid; @@ -1591,8 +1597,12 @@ int main(int argc, char **argv) test_seal_grow(); test_seal_resize(); - test_sysctl_simple(); - test_sysctl_nested(); + if (pid_ns_supported()) { + test_sysctl_simple(); + test_sysctl_nested(); + } else { + printf("PID namespaces are not supported; skipping sysctl tests\n"); + } test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); From da5bd7fa789ae212ac18ebc3ac52b7f2ce1781da Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 5 Dec 2024 20:42:01 +0800 Subject: [PATCH 138/337] mailmap: add entry for Ying Huang Map my old company email to my personal email. Link: https://lkml.kernel.org/r/20241205124201.529308-1-huang.ying.caritas@gmail.com Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 5ff0e5d681e7..7efe43237ca8 100644 --- a/.mailmap +++ b/.mailmap @@ -735,6 +735,7 @@ Wolfram Sang Wolfram Sang Yakir Yang Yanteng Si +Ying Huang Yusuke Goda Zack Rusin Zhu Yanjun From 1a72d2ebeec51f10e5b0f0609c6754e92b11ee9d Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:32 +0800 Subject: [PATCH 139/337] ocfs2: revert "ocfs2: fix the la space leak when unmounting an ocfs2 volume" Patch series "Revert ocfs2 commit dfe6c5692fb5 and provide a new fix". 
SUSE QA team detected a mistake in my commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). I am very sorry for my error. (If my eyes are correct) From the mailling list mails, this patch shouldn't be applied to 4.19 5.4 5.10 5.15 6.1 6.6, and these branches should perform a revert operation. Reason for revert: In commit dfe6c5692fb5, I mistakenly wrote: "This bug has existed since the initial OCFS2 code.". The statement is wrong. The correct introduction commit is 30dd3478c3cd. IOW, if the branch doesn't include 30dd3478c3cd, dfe6c5692fb5 should also not be included. This reverts commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). In commit dfe6c5692fb5, the commit log "This bug has existed since the initial OCFS2 code." is wrong. The correct introduction commit is 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()"). The influence of commit dfe6c5692fb5 is that it provides a correct fix for the latest kernel. however, it shouldn't be pushed to stable branches. Let's use this commit to revert all branches that include dfe6c5692fb5 and use a new fix method to fix commit 30dd3478c3cd. Link: https://lkml.kernel.org/r/20241205104835.18223-1-heming.zhao@suse.com Link: https://lkml.kernel.org/r/20241205104835.18223-2-heming.zhao@suse.com Fixes: dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 8ac42ea81a17..5df34561c551 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1002,25 +1002,6 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = bit_off + 1; } - /* clear the contiguous bits until the end boundary */ - if (count) { - blkno = la_start_blk + - ocfs2_clusters_to_blocks(osb->sb, - start - count); - - trace_ocfs2_sync_local_to_main_free( - count, start - count, - (unsigned long long)la_start_blk, - (unsigned long long)blkno); - - status = ocfs2_release_clusters(handle, - main_bm_inode, - main_bm_bh, blkno, - count); - if (status < 0) - mlog_errno(status); - } - bail: if (status) mlog_errno(status); From 7782e3b3b004e8cb94a88621a22cc3c2f33e5b90 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:33 +0800 Subject: [PATCH 140/337] ocfs2: fix the space leak in LA when releasing LA Commit 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") introduced an issue, the ocfs2_sync_local_to_main() ignores the last contiguous free bits, which causes an OCFS2 volume to lose the last free clusters of LA window during the release routine. Please note, because commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") was reverted, this commit is a replacement fix for commit dfe6c5692fb5. 
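A small worked example of the leak being fixed (hypothetical 8-bit local alloc bitmap, 1 = used, 0 = free, i_total = 8):

    bitmap: 1 1 1 1 1 0 0 0

    old loop: ocfs2_find_next_zero_bit() returns 5, 6, 7 and the free run grows to 3 bits,
              but the next call returns 8 (>= i_total), the while condition fails and the
              loop exits without releasing those last three free clusters to the main bitmap.
    new loop: on that final iteration the accumulated run is released first, and only then
              does the added "if (bit_off >= left) break;" (see the diff below) end the loop.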
Link: https://lkml.kernel.org/r/20241205104835.18223-3-heming.zhao@suse.com Fixes: 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") Signed-off-by: Heming Zhao Suggested-by: Joseph Qi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5df34561c551..d1aa04a5af1b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -971,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < - left) { - if (bit_off == start) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); + if ((bit_off < left) && (bit_off == start)) { count++; start++; continue; @@ -998,6 +998,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, } } + if (bit_off >= left) + break; count = 1; start = bit_off + 1; } From dad2dc9c92e0f93f33cebcb0595b8daa3d57473f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Dec 2024 22:50:06 -0800 Subject: [PATCH 141/337] mm: shmem: fix ShmemHugePages at swapout /proc/meminfo ShmemHugePages has been showing overlarge amounts (more than Shmem) after swapping out THPs: we forgot to update NR_SHMEM_THPS. Add shmem_update_stats(), to avoid repetition, and risk of making that mistake again: the call from shmem_delete_from_page_cache() is the bugfix; the call from shmem_replace_folio() is reassuring, but not really a bugfix (replace corrects misplaced swapin readahead, but huge swapin readahead would be a mistake). Link: https://lkml.kernel.org/r/5ba477c8-a569-70b5-923e-09ab221af45b@google.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Hugh Dickins Reviewed-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70..f6fb053ac50d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -787,6 +787,14 @@ static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* * Somewhat like filemap_add_folio, but error if expected item has gone. 
*/ @@ -821,10 +829,7 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); @@ -852,8 +857,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); @@ -1969,10 +1973,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); - __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); From 8aca2bc96c833ba695ede7a45ad7784c836a262e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:55 +0800 Subject: [PATCH 142/337] mm: use aligned address in clear_gigantic_page() In current kernel, hugetlb_no_page() calls folio_zero_user() with the fault address. Where the fault address may be not aligned with the huge page size. Then, folio_zero_user() may call clear_gigantic_page() with the address, while clear_gigantic_page() requires the address to be huge page size aligned. So, this may cause memory corruption or information leak, addtional, use more obvious naming 'addr_hint' instead of 'addr' for clear_gigantic_page(). 
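A worked illustration with hypothetical numbers (1 GiB hugetlb folio), matching the ALIGN_DOWN() added by the hunk below:

    addr_hint = 0x45678000                        /* faulting address, not folio aligned */
    addr      = ALIGN_DOWN(addr_hint, folio_size(folio))
              = ALIGN_DOWN(0x45678000, 0x40000000) = 0x40000000

Clearing then iterates from the folio's base user address, so each per-page clear is handed an address that actually corresponds to the page being cleared, which matters on architectures with aliasing caches.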
Link: https://lkml.kernel.org/r/20241028145656.932941-1-wangkefeng.wang@huawei.com Fixes: 78fefd04c123 ("mm: memory: convert clear_huge_page() to folio_zero_user()") Signed-off-by: Kefeng Wang Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- mm/memory.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 90f883d6b8fd..fc1ae5132127 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -825,7 +825,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, error = PTR_ERR(folio); goto out; } - folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { diff --git a/mm/memory.c b/mm/memory.c index 75c2dfd04f72..84864387f965 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6815,9 +6815,10 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); From f5d09de9f1bf9674c6418ff10d0a40cfe29268e1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:56 +0800 Subject: [PATCH 143/337] mm: use aligned address in copy_user_gigantic_page() In current kernel, hugetlb_wp() calls copy_user_large_folio() with the fault address. Where the fault address may be not aligned with the huge page size. Then, copy_user_large_folio() may call copy_user_gigantic_page() with the address, while copy_user_gigantic_page() requires the address to be huge page size aligned. So, this may cause memory corruption or information leak, addtional, use more obvious naming 'addr_hint' instead of 'addr' for copy_user_gigantic_page(). 
Link: https://lkml.kernel.org/r/20241028145656.932941-2-wangkefeng.wang@huawei.com Fixes: 530dd9926dc1 ("mm: memory: improve copy_user_large_folio()") Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Huang Ying Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 ++--- mm/memory.c | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea2ed8e301ef..cec4b121193f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5340,7 +5340,7 @@ again: break; } ret = copy_user_large_folio(new_folio, pte_folio, - ALIGN_DOWN(addr, sz), dst_vma); + addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); @@ -6643,8 +6643,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, *foliop = NULL; goto out; } - ret = copy_user_large_folio(folio, *foliop, - ALIGN_DOWN(dst_addr, size), dst_vma); + ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { diff --git a/mm/memory.c b/mm/memory.c index 84864387f965..209885a4134f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6852,13 +6852,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, - unsigned long addr, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { - int i; + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; + int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); From 42c4e4b20d9c4651903c4afc53a4ff18b7451b3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:52:29 +0000 Subject: [PATCH 144/337] mm: correctly reference merged VMA On second merge attempt on mmap() we incorrectly discard the possibly merged VMA, resulting in a possible use-after-free (and most certainly a reference to the wrong VMA) in this instance in the subsequent __mmap_complete() invocation. Correct this mistake by reassigning vma correctly if a merge succeeds in this case. Link: https://lkml.kernel.org/r/20241206215229.244413-1-lorenzo.stoakes@oracle.com Fixes: 5ac87a885aec ("mm: defer second attempt at merge on mmap()") Signed-off-by: Lorenzo Stoakes Suggested-by: Jann Horn Reported-by: syzbot+91cf8da9401355f946c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67536a25.050a0220.a30f1.0149.GAE@google.com/ Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vma.c b/mm/vma.c index 8e31b7e25aeb..bb2119e5a0d0 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2460,10 +2460,13 @@ unsigned long __mmap_region(struct file *file, unsigned long addr, /* If flags changed, we might be able to merge, so try again. */ if (map.retry_merge) { + struct vm_area_struct *merged; VMG_MMAP_STATE(vmg, &map, vma); vma_iter_config(map.vmi, map.addr, map.end); - vma_merge_existing_range(&vmg); + merged = vma_merge_existing_range(&vmg); + if (merged) + vma = merged; } __mmap_complete(&map, vma); From 31c5629920b82ddf66059f20f79be2bc00c4197b Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 10 Dec 2024 01:06:04 +0100 Subject: [PATCH 145/337] mm: add RCU annotation to pte_offset_map(_lock) RCU lock is taken by ___pte_offset_map() unless it returns NULL. Add this information to its inline callers to avoid sparse warning about context imbalance in pte_unmap(). 
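For context, the calling pattern sparse has to reason about looks roughly like this (caller code is illustrative only):

    pte_t *pte = pte_offset_map(pmd, addr);  /* takes rcu_read_lock() unless it returns NULL */
    if (pte) {
            /* ... inspect the PTE ... */
            pte_unmap(pte);                  /* drops the RCU read lock */
    }

Without the __cond_lock(RCU, ...) annotation sparse cannot tell that the lock is held only in the non-NULL case, and reports a context imbalance in pte_unmap().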
Link: https://lkml.kernel.org/r/20241210000604.700710-1-oss@malat.biz Signed-off-by: Petr Malat Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- mm/pgtable-generic.c | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..3a6ee6a05aa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3010,7 +3010,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -3023,7 +3031,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5297dcc38c37..5a882f2b10f9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; From 5c0541e11c16bd2f162e23a22d07c09d58017e5a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:25 -0500 Subject: [PATCH 146/337] mm: introduce cpu_icache_is_aliasing() across all architectures In commit eacd0e950dc2 ("ARC: [mm] Lazy D-cache flush (non aliasing VIPT)"), arc adds the need to flush dcache to make icache see the code page change. This also requires special handling for clear_user_(high)page(). Introduce cpu_icache_is_aliasing() to make MM code query special clear_user_(high)page() easier. This will be used by the following commit. 
Link: https://lkml.kernel.org/r/20241209182326.2955963-1-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Suggested-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Geert Uytterhoeven Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/cachetype.h | 8 ++++++++ include/linux/cacheinfo.h | 6 ++++++ 3 files changed, 15 insertions(+) create mode 100644 arch/arc/include/asm/cachetype.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ea5a1dcb133b..4f2eeda907ec 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h new file mode 100644 index 000000000000..acd3b6cb4bf5 --- /dev/null +++ b/arch/arc/include/asm/cachetype.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ARC_CACHETYPE_H +#define __ASM_ARC_CACHETYPE_H + +#define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() true + +#endif diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..7ad736538649 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING #define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() #else #include + +#ifndef cpu_icache_is_aliasing +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() +#endif + #endif #endif /* _LINUX_CACHEINFO_H */ From c51a4f11e6d8246590b5e64908c1ed84b33e8ba2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:26 -0500 Subject: [PATCH 147/337] mm: use clear_user_(high)page() for arch with special user folio handling Some architectures have special handling after clearing user folios: architectures, which set cpu_dcache_is_aliasing() to true, require flushing dcache; arc, which sets cpu_icache_is_aliasing() to true, changes folio->flags to make icache coherent to dcache. So __GFP_ZERO using only clear_page() is not enough to zero user folios and clear_user_(high)page() must be used. Otherwise, user data will be corrupted. Fix it by always clearing user folios with clear_user_(high)page() when cpu_dcache_is_aliasing() is true or cpu_icache_is_aliasing() is true. Rename alloc_zeroed() to user_alloc_needs_zeroing() and invert the logic to clarify its intend. 
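Condensed sketch of the resulting allocation pattern (mirrors the vma_alloc_zeroed_movable_folio() hunk below):

    folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);  /* no __GFP_ZERO */
    if (folio && user_alloc_needs_zeroing())
            /* false only when init_on_alloc already zeroed it and no cache aliasing */
            clear_user_highpage(&folio->page, vaddr);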
Link: https://lkml.kernel.org/r/20241209182326.2955963-2-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV1hRp_NtR5YnJo=HsfgKQeH91J537Gh4gKk3PFZhSkbA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++++++- include/linux/mm.h | 18 ++++++++++++++++++ mm/huge_memory.c | 9 +++++---- mm/internal.h | 6 ------ mm/memory.c | 10 +++++----- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6e452bd8e7e3..5c6bea81a90e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,7 +224,13 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); + struct folio *folio; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); + if (folio && user_alloc_needs_zeroing()) + clear_user_highpage(&folio->page, vaddr); + + return folio; } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a6ee6a05aa0..338a76ce9083 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4184,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee335d96fc39..9bb351caa619 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1176,11 +1176,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, folio_throttle_swaprate(folio, gfp); /* - * When a folio is not zeroed during allocation (__GFP_ZERO not used), - * folio_zero_user() is used to make sure that the page corresponding - * to the faulting address will be hot in the cache after zeroing. + * When a folio is not zeroed during allocation (__GFP_ZERO not used) + * or user folios require special handling, folio_zero_user() is used to + * make sure that the page corresponding to the faulting address will be + * hot in the cache after zeroing. 
*/ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that diff --git a/mm/internal.h b/mm/internal.h index cb8d8e8e3ffa..3bd08bafad04 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1285,12 +1285,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -static inline bool alloc_zeroed(void) -{ - return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, - &init_on_alloc); -} - /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. diff --git a/mm/memory.c b/mm/memory.c index 209885a4134f..398c031be9ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4733,12 +4733,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation - * (__GFP_ZERO not used), folio_zero_user() is used - * to make sure that the page corresponding to the - * faulting address will be hot in the cache after - * zeroing. + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } From be48c412f6ebf38849213c19547bc6d5b692b5e5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Dec 2024 00:57:15 +0800 Subject: [PATCH 148/337] zram: refuse to use zero sized block device as backing device Patch series "zram: fix backing device setup issue", v2. This series fixes two bugs of backing device setting: - ZRAM should reject using a zero sized (or the uninitialized ZRAM device itself) as the backing device. - Fix backing device leaking when removing a uninitialized ZRAM device. This patch (of 2): Setting a zero sized block device as backing device is pointless, and one can easily create a recursive loop by setting the uninitialized ZRAM device itself as its own backing device by (zram0 is uninitialized): echo /dev/zram0 > /sys/block/zram0/backing_dev It's definitely a wrong config, and the module will pin itself, kernel should refuse doing so in the first place. By refusing to use zero sized device we avoided misuse cases including this one above. 
Link: https://lkml.kernel.org/r/20241209165717.94215-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20241209165717.94215-2-ryncsn@gmail.com Fixes: 013bf95a83ec ("zram: add interface to specif backing device") Signed-off-by: Kairui Song Reported-by: Desheng Wu Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3dee026988dc..e86cc3d2f4d2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -614,6 +614,12 @@ static ssize_t backing_dev_store(struct device *dev, } nr_pages = i_size_read(inode) >> PAGE_SHIFT; + /* Refuse to use zero sized device (also prevents self reference) */ + if (!nr_pages) { + err = -EINVAL; + goto out; + } + bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); bitmap = kvzalloc(bitmap_sz, GFP_KERNEL); if (!bitmap) { From 74363ec674cb172d8856de25776c8f3103f05e2f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Dec 2024 00:57:16 +0800 Subject: [PATCH 149/337] zram: fix uninitialized ZRAM not releasing backing device Setting backing device is done before ZRAM initialization. If we set the backing device, then remove the ZRAM module without initializing the device, the backing device reference will be leaked and the device will be hold forever. Fix this by always reset the ZRAM fully on rmmod or reset store. Link: https://lkml.kernel.org/r/20241209165717.94215-3-ryncsn@gmail.com Fixes: 013bf95a83ec ("zram: add interface to specif backing device") Signed-off-by: Kairui Song Reported-by: Desheng Wu Suggested-by: Sergey Senozhatsky Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e86cc3d2f4d2..45df5eeabc5e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1444,12 +1444,16 @@ static void zram_meta_free(struct zram *zram, u64 disksize) size_t num_pages = disksize >> PAGE_SHIFT; size_t index; + if (!zram->table) + return; + /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) zram_free_page(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); + zram->table = NULL; } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -2326,11 +2330,6 @@ static void zram_reset_device(struct zram *zram) zram->limit_pages = 0; - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); From 901ce9705fbb9f330ff1f19600e5daf9770b0175 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Mon, 9 Dec 2024 15:56:52 +0900 Subject: [PATCH 150/337] nilfs2: prevent use of deleted inode syzbot reported a WARNING in nilfs_rmdir. [1] Because the inode bitmap is corrupted, an inode with an inode number that should exist as a ".nilfs" file was reassigned by nilfs_mkdir for "file0", causing an inode duplication during execution. And this causes an underflow of i_nlink in rmdir operations. The inode is used twice by the same task to unmount and remove directories ".nilfs" and "file0", it trigger warning in nilfs_rmdir. Avoid to this issue, check i_nlink in nilfs_iget(), if it is 0, it means that this inode has been deleted, and iput is executed to reclaim it. 
[1] WARNING: CPU: 1 PID: 5824 at fs/inode.c:407 drop_nlink+0xc4/0x110 fs/inode.c:407 ... Call Trace: nilfs_rmdir+0x1b0/0x250 fs/nilfs2/namei.c:342 vfs_rmdir+0x3a3/0x510 fs/namei.c:4394 do_rmdir+0x3b5/0x580 fs/namei.c:4453 __do_sys_rmdir fs/namei.c:4472 [inline] __se_sys_rmdir fs/namei.c:4470 [inline] __x64_sys_rmdir+0x47/0x50 fs/namei.c:4470 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Link: https://lkml.kernel.org/r/20241209065759.6781-1-konishi.ryusuke@gmail.com Fixes: d25006523d0b ("nilfs2: pathname operations") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9260555647a5132edd48 Tested-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 8 +++++++- fs/nilfs2/namei.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf9ba481ae37..b7d4105f37bf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -544,8 +544,14 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, inode = nilfs_iget_locked(sb, root, ino); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + if (!inode->i_nlink) { + iput(inode); + return ERR_PTR(-ESTALE); + } return inode; + } err = __nilfs_read_inode(sb, root, ino, inode); if (unlikely(err)) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9b108052d9f7..1d836a5540f3 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -67,6 +67,11 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) inode = NULL; } else { inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (inode == ERR_PTR(-ESTALE)) { + nilfs_error(dir->i_sb, + "deleted inode referenced: %lu", ino); + return ERR_PTR(-EIO); + } } return d_splice_alias(inode, dentry); From 8ac662f5da19f5873fdd94c48a5cdb45b2e1b58f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 10 Dec 2024 17:24:12 +0000 Subject: [PATCH 151/337] fork: avoid inappropriate uprobe access to invalid mm If dup_mmap() encounters an issue, currently uprobe is able to access the relevant mm via the reverse mapping (in build_map_info()), and if we are very unlucky with a race window, observe invalid XA_ZERO_ENTRY state which we establish as part of the fork error path. This occurs because uprobe_write_opcode() invokes anon_vma_prepare() which in turn invokes find_mergeable_anon_vma() that uses a VMA iterator, invoking vma_iter_load() which uses the advanced maple tree API and thus is able to observe XA_ZERO_ENTRY entries added to dup_mmap() in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()"). This change was made on the assumption that only process tear-down code would actually observe (and make use of) these values. However this very unlikely but still possible edge case with uprobes exists and unfortunately does make these observable. The uprobe operation prevents races against the dup_mmap() operation via the dup_mmap_sem semaphore, which is acquired via uprobe_start_dup_mmap() and dropped via uprobe_end_dup_mmap(), and held across register_for_each_vma() prior to invoking build_map_info() which does the reverse mapping lookup. 
Currently these are acquired and dropped within dup_mmap(), which exposes the race window prior to error handling in the invoking dup_mm() which tears down the mm. We can avoid all this by just moving the invocation of uprobe_start_dup_mmap() and uprobe_end_dup_mmap() up a level to dup_mm() and only release this lock once the dup_mmap() operation succeeds or clean up is done. This means that the uprobe code can never observe an incompletely constructed mm and resolves the issue in this case. Link: https://lkml.kernel.org/r/20241210172412.52995-1-lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+2d788f4f7cb660dac4b7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6756d273.050a0220.2477f.003d.GAE@google.com/ Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peng Zhang Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Andrew Morton --- kernel/fork.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 1450b461d196..9b301180fd41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,11 +639,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); - uprobe_start_dup_mmap(); - if (mmap_write_lock_killable(oldmm)) { - retval = -EINTR; - goto fail_uprobe_end; - } + if (mmap_write_lock_killable(oldmm)) + return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -782,8 +779,6 @@ out: dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: @@ -1692,9 +1687,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; + uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; + uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; @@ -1709,6 +1706,8 @@ free_pt: mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); + if (err) + uprobe_end_dup_mmap(); fail_nomem: return NULL; From faeec8e23c10bd30e8aa759a2eb3018dae00f924 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 Dec 2024 10:34:37 +0100 Subject: [PATCH 152/337] mm/page_alloc: don't call pfn_to_page() on possibly non-existent PFN in split_large_buddy() In split_large_buddy(), we might call pfn_to_page() on a PFN that might not exist. In corner cases, such as when freeing the highest pageblock in the last memory section, this could result with CONFIG_SPARSEMEM && !CONFIG_SPARSEMEM_EXTREME in __pfn_to_section() returning NULL and and __section_mem_map_addr() dereferencing that NULL pointer. Let's fix it, and avoid doing a pfn_to_page() call for the first iteration, where we already have the page. So far this was found by code inspection, but let's just CC stable as the fix is easy. 
Link: https://lkml.kernel.org/r/20241210093437.174413-1-david@redhat.com Fixes: fd919a85cd55 ("mm: page_isolation: prepare for hygienic freelists") Signed-off-by: David Hildenbrand Reported-by: Vlastimil Babka Closes: https://lkml.kernel.org/r/e1a898ba-a717-4d20-9144-29df1a6c8813@suse.cz Reviewed-by: Vlastimil Babka Reviewed-by: Zi Yan Acked-by: Johannes Weiner Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1cb4b8c8886d..cae7b93864c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1238,13 +1238,15 @@ static void split_large_buddy(struct zone *zone, struct page *page, if (order > pageblock_order) order = pageblock_order; - while (pfn != end) { + do { int mt = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, mt, fpi); pfn += 1 << order; + if (pfn == end) + break; page = pfn_to_page(pfn); - } + } while (1); } static void free_one_page(struct zone *zone, struct page *page, From a2e740e216f5bf49ccb83b6d490c72a340558a43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Dec 2024 20:25:37 +0000 Subject: [PATCH 153/337] vmalloc: fix accounting with i915 If the caller of vmap() specifies VM_MAP_PUT_PAGES (currently only the i915 driver), we will decrement nr_vmalloc_pages and MEMCG_VMALLOC in vfree(). These counters are incremented by vmalloc() but not by vmap() so this will cause an underflow. Check the VM_MAP_PUT_PAGES flag before decrementing either counter. Link: https://lkml.kernel.org/r/20241211202538.168311-1-willy@infradead.org Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap") Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Balbir Singh Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Muchun Song Cc: Roman Gushchin Cc: "Uladzislau Rezki (Sony)" Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f009b21705c1..5c88d0e90c20 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3374,7 +3374,8 @@ void vfree(const void *addr) struct page *page = vm->pages[i]; BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3382,7 +3383,8 @@ void vfree(const void *addr) __free_page(page); cond_resched(); } - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } From 6309b8ce98e9a18390b9fd8f03fc412f3c17aee9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 13 Dec 2024 01:43:28 +0900 Subject: [PATCH 154/337] nilfs2: fix buffer head leaks in calls to truncate_inode_pages() When block_invalidatepage was converted to block_invalidate_folio, the fallback to block_invalidatepage in folio_invalidate() if the address_space_operations method invalidatepage (currently invalidate_folio) was not set, was removed. Unfortunately, some pseudo-inodes in nilfs2 use empty_aops set by inode_init_always_gfp() as is, or explicitly set it to address_space_operations. 
Therefore, with this change, block_invalidatepage() is no longer called from folio_invalidate(), and as a result, the buffer_head structures attached to these pages/folios are no longer freed via try_to_free_buffers(). Thus, these buffer heads are now leaked by truncate_inode_pages(), which cleans up the page cache from inode evict(), etc. Three types of caches use empty_aops: gc inode caches and the DAT shadow inode used by GC, and b-tree node caches. Of these, b-tree node caches explicitly call invalidate_mapping_pages() during cleanup, which involves calling try_to_free_buffers(), so the leak was not visible during normal operation but worsened when GC was performed. Fix this issue by using address_space_operations with invalidate_folio set to block_invalidate_folio instead of empty_aops, which will ensure the same behavior as before. Link: https://lkml.kernel.org/r/20241212164556.21338-1-konishi.ryusuke@gmail.com Fixes: 7ba13abbd31e ("fs: Turn block_invalidatepage into block_invalidate_folio") Signed-off-by: Ryusuke Konishi Cc: [5.18+] Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 1 + fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/inode.c | 5 +++++ fs/nilfs2/nilfs.h | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 501ad7be5174..54a3fa0cf67e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,6 +35,7 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode) ii->i_flags = 0; memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); + btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; } void nilfs_btnode_cache_clear(struct address_space *btnc) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index ace22253fed0..2dbb15767df1 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -163,7 +163,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b7d4105f37bf..23f3a75edd50 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -276,6 +276,10 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; +const struct address_space_operations nilfs_buffer_cache_aops = { + .invalidate_folio = block_invalidate_folio, +}; + static int nilfs_insert_inode_locked(struct inode *inode, struct nilfs_root *root, unsigned long ino) @@ -681,6 +685,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) NILFS_I(s_inode)->i_flags = 0; memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; err = nilfs_attach_btree_node_cache(s_inode); if (unlikely(err)) { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 45d03826eaf1..dff241c53fc5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -401,6 +401,7 @@ extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; +extern const struct address_space_operations nilfs_buffer_cache_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations 
nilfs_special_inode_operations; extern const struct inode_operations nilfs_symlink_inode_operations; From 42b2eb69835b0fda797f70eb5b4fc213dbe3a7ea Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Dec 2024 18:33:51 +0000 Subject: [PATCH 155/337] mm: convert partially_mapped set/clear operations to be atomic Other page flags in the 2nd page, like PG_hwpoison and PG_anon_exclusive can get modified concurrently. Changes to other page flags might be lost if they are happening at the same time as non-atomic partially_mapped operations. Hence, make partially_mapped operations atomic. Link: https://lkml.kernel.org/r/20241212183351.1345389-1-usamaarif642@gmail.com Fixes: 8422acdc97ed ("mm: introduce a pageflag for partially mapped folios") Reported-by: David Hildenbrand Link: https://lore.kernel.org/all/e53b04ad-1827-43a2-a1ab-864c7efecf6e@redhat.com/ Signed-off-by: Usama Arif Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Barry Song Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++---------- mm/huge_memory.c | 8 ++++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cf46ac720802..691506bdf2c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -/* - * PG_partially_mapped is protected by deferred_split split_queue_lock, - * so its safe to use non-atomic set/clear. 
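To illustrate the lost-update race being closed (simplified sketch, not the real page-flag helpers or flag names):

    /*
     * A non-atomic flag update is a plain read-modify-write of folio->flags:
     *
     *   CPU A (non-atomic set)                  CPU B (atomic, e.g. PG_hwpoison)
     *   old = folio->flags;
     *                                           set_bit(PG_hwpoison, &folio->flags);
     *   folio->flags = old | PARTIALLY_MAPPED;  <- stores the stale copy, CPU B's bit is lost
     *
     * The atomic folio_set_partially_mapped() (set_bit() underneath) cannot lose a
     * concurrent update, whatever the interleaving.
     */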
- */ -__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) -FOLIO_TEST_FLAG_FALSE(partially_mapped) -__FOLIO_SET_FLAG_NOOP(partially_mapped) -__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) +FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9bb351caa619..df0c4988dd88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3577,7 +3577,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3689,7 +3689,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3733,7 +3733,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { - __folio_set_partially_mapped(folio); + folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); @@ -3826,7 +3826,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } From 30c2de0a267c04046d89e678cc0067a9cfb455df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:31:26 -0800 Subject: [PATCH 156/337] mm/vmstat: fix a W=1 clang compiler warning Fix the following clang compiler warning that is reported if the kernel is built with W=1: ./include/linux/vmstat.h:518:36: error: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Werror,-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ Link: https://lkml.kernel.org/r/20241212213126.1269116-1-bvanassche@acm.org Fixes: 9d7ea9a297e6 ("mm/vmstat: add helpers to get vmstat item names for each enum type") Signed-off-by: Bart Van Assche Cc: Konstantin Khlebnikov Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2761bf8ff32..9f3a04345b86 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) From 640a603943a7659340c10044c0a1c98ae4e13189 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> 
Date: Fri, 13 Dec 2024 09:33:32 +0800 Subject: [PATCH 157/337] mm/codetag: clear tags before swap When CONFIG_MEM_ALLOC_PROFILING_DEBUG is set, kernel WARN would be triggered when calling __alloc_tag_ref_set() during swap: alloc_tag was not cleared (got tag for mm/filemap.c:1951) WARNING: CPU: 0 PID: 816 at ./include/linux/alloc_tag.h... Clear code tags before swap can fix the warning. And this patch also fix a potential invalid address dereference in alloc_tag_add_check() when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set and ref->ct is CODETAG_EMPTY, which is defined as ((void *)1). Link: https://lkml.kernel.org/r/20241213013332.89910-1-00107082@163.com Fixes: 51f43d5d82ed ("mm/codetag: swap tags when migrate pages") Signed-off-by: David Wang <00107082@163.com> Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412112227.df61ebb-lkp@intel.com Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- lib/alloc_tag.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 7c0786bdf9af..cba024bf2db3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,7 +135,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) { - WARN_ONCE(ref && ref->ct, + WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref), "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 35f7560a309a..3a0413462e9f 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -209,6 +209,13 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old) return; } + /* + * Clear tag references to avoid debug warning when using + * __alloc_tag_ref_set() with non-empty reference. + */ + set_codetag_empty(&ref_old); + set_codetag_empty(&ref_new); + /* swap tags */ __alloc_tag_ref_set(&ref_old, tag_new); update_page_tag_ref(handle_old, &ref_old); From e269b5d2916d7a696c2d2ed370cea95d95a0675a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:22 -0800 Subject: [PATCH 158/337] alloc_tag: fix module allocation tags populated area calculation vm_module_tags_populate() calculation of the populated area assumes that area starts at a page boundary and therefore when new pages are allocation, the end of the area is page-aligned as well. If the start of the area is not page-aligned then allocating a page and incrementing the end of the area by PAGE_SIZE leads to an area at the end but within the area boundary which is not populated. Accessing this are will lead to a kernel panic. Fix the calculation by down-aligning the start of the area and using that as the location allocated pages are mapped to. 
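A worked example with hypothetical numbers (PAGE_SIZE = 0x1000) may help:

    module_tags.start_addr = 0x1800 (not page aligned), vm_module_tags->nr_pages = 1,
    so the one mapped page covers [0x1000, 0x2000).

    old calculation: populated end = start_addr + (nr_pages << PAGE_SHIFT) = 0x2800
                     -> [0x2000, 0x2800) is treated as populated but has no page behind it
    new calculation: phys_end = ALIGN_DOWN(start_addr, PAGE_SIZE) + (nr_pages << PAGE_SHIFT) = 0x2000
                     -> further pages are correctly mapped starting at 0x2000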
[gehao@kylinos.cn: fix vm_module_tags_populate's KASAN poisoning logic] Link: https://lkml.kernel.org/r/20241205170528.81000-1-hao.ge@linux.dev [gehao@kylinos.cn: fix panic when CONFIG_KASAN enabled and CONFIG_KASAN_VMALLOC not enabled] Link: https://lkml.kernel.org/r/20241212072126.134572-1-hao.ge@linux.dev Link: https://lkml.kernel.org/r/20241130001423.1114965-1-surenb@google.com Fixes: 0f9b685626da ("alloc_tag: populate memory for module tags as needed") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202411132111.6a221562-lkp@intel.com Acked-by: Yu Zhao Tested-by: Adrian Huang Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 3a0413462e9f..7dcebf118a3e 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -408,28 +408,52 @@ repeat: static int vm_module_tags_populate(void) { - unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT; + unsigned long phys_end = ALIGN_DOWN(module_tags.start_addr, PAGE_SIZE) + + (vm_module_tags->nr_pages << PAGE_SHIFT); + unsigned long new_end = module_tags.start_addr + module_tags.size; - if (phys_size < module_tags.size) { + if (phys_end < new_end) { struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages; - unsigned long addr = module_tags.start_addr + phys_size; + unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN); + unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN); unsigned long more_pages; unsigned long nr; - more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT; + more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, NUMA_NO_NODE, more_pages, next_page); if (nr < more_pages || - vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL, + vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { /* Clean up and error out */ for (int i = 0; i < nr; i++) __free_page(next_page[i]); return -ENOMEM; } + vm_module_tags->nr_pages += nr; + + /* + * Kasan allocates 1 byte of shadow for every 8 bytes of data. + * When kasan_alloc_module_shadow allocates shadow memory, + * its unit of allocation is a page. + * Therefore, here we need to align to MODULE_ALIGN. + */ + if (old_shadow_end < new_shadow_end) + kasan_alloc_module_shadow((void *)old_shadow_end, + new_shadow_end - old_shadow_end, + GFP_KERNEL); } + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + kasan_unpoison_vmalloc((void *)module_tags.start_addr, + new_end - module_tags.start_addr, + KASAN_VMALLOC_PROT_NORMAL); + return 0; } From 60da7445a142bd15e67f3cda915497781c3f781f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:23 -0800 Subject: [PATCH 159/337] alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG It was recently noticed that set_codetag_empty() might be used not only to mark NULL alloctag references as empty to avoid warnings but also to reset valid tags (in clear_page_tag_ref()). 
Since set_codetag_empty() is defined as NOOP for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n, such use of set_codetag_empty() leads to subtle bugs. Fix set_codetag_empty() for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n to reset the tag reference. Link: https://lkml.kernel.org/r/20241130001423.1114965-2-surenb@google.com Fixes: a8fc28dad6d5 ("alloc_tag: introduce clear_page_tag_ref() helper function") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index cba024bf2db3..0bbbe537c5f9 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -static inline void set_codetag_empty(union codetag_ref *ref) {} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = NULL; +} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ From d3ac65d274b3a93cf9cf9559fd1473ab65e00e10 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Sun, 15 Dec 2024 20:27:51 -0800 Subject: [PATCH 160/337] mm: huge_memory: handle strsep not finding delimiter split_huge_pages_write() does not handle the case where strsep finds no delimiter in the given string and sets the input buffer to NULL, which allows this reproducer to trigger a protection fault. Link: https://lkml.kernel.org/r/20241216042752.257090-2-leocstone@gmail.com Signed-off-by: Leo Stone Reported-by: syzbot+8a3da2f1bbf59227c289@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a3da2f1bbf59227c289 Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index df0c4988dd88..e53d83b3e5cf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4169,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, size_t input_len = strlen(input_buf); tok = strsep(&buf, ","); - if (tok) { + if (tok && buf) { strscpy(file_path, tok); } else { ret = -EINVAL; From 13a6691910cc23ea9ba4066e098603088673d5b0 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 10 Dec 2024 09:33:10 +0200 Subject: [PATCH 161/337] RDMA/nldev: Set error code in rdma_nl_notify_event In case of error set the error code before the goto. 
Fixes: 6ff57a2ea7c2 ("RDMA/nldev: Fix NULL pointer dereferences issue in rdma_nl_notify_event") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-rdma/a84a2fc3-33b6-46da-a1bd-3343fa07eaf9@stanley.mountain/ Signed-off-by: Chiara Meiohas Reviewed-by: Maher Sanalla Link: https://patch.msgid.link/13eb25961923f5de9eb9ecbbc94e26113d6049ef.1733815944.git.leonro@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ff121e59b9c0..cb987ab0177c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2833,8 +2833,8 @@ int rdma_nl_notify_event(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct sk_buff *skb; + int ret = -EMSGSIZE; struct net *net; - int ret = 0; void *nlh; net = read_pnet(&device->coredev.rdma_net); From 16b87037b48889d21854c8e97aec8a1baf2642b3 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 12 Dec 2024 16:18:48 +0100 Subject: [PATCH 162/337] RDMA/siw: Remove direct link to net_device Do not manage a per device direct link to net_device. Rely on associated ib_devices net_device management, not doubling the effort locally. A badly managed local link to net_device was causing a 'KASAN: slab-use-after-free' exception during siw_query_port() call. Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface") Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf Signed-off-by: Bernard Metzler Link: https://patch.msgid.link/20241212151848.564872-1-bmt@zurich.ibm.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw.h | 7 +++--- drivers/infiniband/sw/siw/siw_cm.c | 27 ++++++++++++++++----- drivers/infiniband/sw/siw/siw_main.c | 15 +----------- drivers/infiniband/sw/siw/siw_verbs.c | 35 ++++++++++++++++++--------- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 86d4d6a2170e..ea5eee50dc39 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -46,6 +46,9 @@ */ #define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 +/* There is always only a port 1 per siw device */ +#define SIW_PORT 1 + struct siw_dev_cap { int max_qp; int max_qp_wr; @@ -69,16 +72,12 @@ struct siw_pd { struct siw_device { struct ib_device base_dev; - struct net_device *netdev; struct siw_dev_cap attrs; u32 vendor_part_id; int numa_node; char raw_gid[ETH_ALEN]; - /* physical port state (only one port per device) */ - enum ib_port_state state; - spinlock_t lock; struct xarray qp_xa; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 86323918a570..708b13993fdf 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1759,6 +1759,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; + struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; @@ -1779,9 +1780,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ - if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) - s->sk->sk_bound_dev_if = 
sdev->netdev->ifindex; - + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { @@ -1797,9 +1804,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } /* For wildcard addr, limit binding to current device only */ - if (ipv6_addr_any(&laddr->sin6_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv6_addr_any(&laddr->sin6_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } @@ -1860,6 +1873,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; + dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); @@ -1879,6 +1893,7 @@ error: siw_cep_set_free_and_put(cep); } sock_release(s); + dev_put(ndev); return rv; } diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 17abef48abcd..14d3103aee6f 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -287,7 +287,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) return NULL; base_dev = &sdev->base_dev; - sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, @@ -381,12 +380,10 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, switch (event) { case NETDEV_UP: - sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_DOWN: - sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; @@ -407,12 +404,8 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* - * Todo: Below netdev events are currently not handled. 
+ * All other events are not handled */ - case NETDEV_CHANGEMTU: - case NETDEV_CHANGE: - break; - default: break; } @@ -442,12 +435,6 @@ static int siw_newlink(const char *basedev_name, struct net_device *netdev) sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); - - if (netif_running(netdev) && netif_carrier_ok(netdev)) - sdev->state = IB_PORT_ACTIVE; - else - sdev->state = IB_PORT_DOWN; - ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 986666c19378..7ca0297d68a4 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -171,21 +171,29 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr) { - struct siw_device *sdev = to_siw_dev(base_dev); + struct net_device *ndev; int rv; memset(attr, 0, sizeof(*attr)); rv = ib_get_eth_speed(base_dev, port, &attr->active_speed, &attr->active_width); + if (rv) + return rv; + + ndev = ib_device_get_netdev(base_dev, SIW_PORT); + if (!ndev) + return -ENODEV; + attr->gid_tbl_len = 1; attr->max_msg_sz = -1; - attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->phys_state = sdev->state == IB_PORT_ACTIVE ? + attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu); + attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); + attr->phys_state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; + attr->state = attr->phys_state == IB_PORT_PHYS_STATE_LINK_UP ? + IB_PORT_ACTIVE : IB_PORT_DOWN; attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; - attr->state = sdev->state; /* * All zero * @@ -199,6 +207,7 @@ int siw_query_port(struct ib_device *base_dev, u32 port, * attr->subnet_timeout = 0; * attr->init_type_repy = 0; */ + dev_put(ndev); return rv; } @@ -505,21 +514,24 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct siw_qp *qp; - struct siw_device *sdev; + struct net_device *ndev; - if (base_qp && qp_attr && qp_init_attr) { + if (base_qp && qp_attr && qp_init_attr) qp = to_siw_qp(base_qp); - sdev = to_siw_dev(base_qp->device); - } else { + else return -EINVAL; - } + + ndev = ib_device_get_netdev(base_qp->device, SIW_PORT); + if (!ndev) + return -ENODEV; + qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; qp_attr->cap.max_inline_data = SIW_MAX_INLINE; qp_attr->cap.max_send_wr = qp->attrs.sq_size; qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; qp_attr->cap.max_recv_wr = qp->attrs.rq_size; qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; - qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); qp_attr->max_rd_atomic = qp->attrs.irq_size; qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; @@ -534,6 +546,7 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + dev_put(ndev); return 0; } From 32c9c06adb5b157ef259233775a063a43746d699 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 19 Dec 2024 18:53:02 +0800 Subject: [PATCH 163/337] ASoC: mediatek: disable buffer pre-allocation On Chromebooks based on Mediatek MT8195 or MT8188, the audio frontend (AFE) is limited to 
accessing a very small window (1 MiB) of memory, which is described as a reserved memory region in the device tree. On these two platforms, the maximum buffer size is given as 512 KiB. The MediaTek common code uses the same value for preallocations. This means that only the first two PCM substreams get preallocations, and then the whole space is exhausted, barring any other substreams from working. Since the substreams used are not always the first two, this means audio won't work correctly. This is observed on the MT8188 Geralt Chromebooks, on which the "mediatek,dai-link" property was dropped when it was upstreamed. That property causes the driver to only register the PCM substreams listed in the property, and in the order given. Instead of trying to compute an optimal value and figuring out which streams are used, simply disable preallocation. The PCM buffers are managed by the core and are allocated and released on the fly. There should be no impact to any of the other MediaTek platforms. Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20241219105303.548437-1-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/common/mtk-afe-platform-driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/mediatek/common/mtk-afe-platform-driver.c b/sound/soc/mediatek/common/mtk-afe-platform-driver.c index 9b72b2a7ae91..6b6330583941 100644 --- a/sound/soc/mediatek/common/mtk-afe-platform-driver.c +++ b/sound/soc/mediatek/common/mtk-afe-platform-driver.c @@ -120,8 +120,8 @@ int mtk_afe_pcm_new(struct snd_soc_component *component, struct mtk_base_afe *afe = snd_soc_component_get_drvdata(component); size = afe->mtk_afe_hardware->buffer_bytes_max; - snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_DEV, - afe->dev, size, size); + snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_DEV, afe->dev, 0, size); + return 0; } EXPORT_SYMBOL_GPL(mtk_afe_pcm_new); From 13221496065fa12fac4f8a8e725444679ffddb78 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Wed, 18 Dec 2024 20:54:53 +0100 Subject: [PATCH 164/337] regulator: rename regulator-uv-survival-time-ms according to DT binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regulator bindings don't document regulator-uv-survival-time-ms, but the more descriptive regulator-uv-less-critical-window-ms instead. Looking back at v3[1] and v4[2] of the series adding the support, the property was indeed renamed between these patch series, but unfortunately the rename only made it into the DT bindings with the driver code still using the old name. Let's therefore rename the property in the driver code to follow suit. This will break backwards compatibility, but there are no upstream device trees using the property and we never documented the old name of the property anyway. 
¯\_(ツ)_/¯" [1]: https://lore.kernel.org/all/20231025084614.3092295-7-o.rempel@pengutronix.de/ [2]: https://lore.kernel.org/all/20231026144824.4065145-5-o.rempel@pengutronix.de/ Signed-off-by: Ahmad Fatoum Link: https://patch.msgid.link/20241218-regulator-uv-survival-time-ms-rename-v1-1-6cac9c3c75da@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/of_regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 3d85762beda6..e5b4b93c07e3 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -175,7 +175,7 @@ static int of_get_regulation_constraints(struct device *dev, if (!ret) constraints->enable_time = pval; - ret = of_property_read_u32(np, "regulator-uv-survival-time-ms", &pval); + ret = of_property_read_u32(np, "regulator-uv-less-critical-window-ms", &pval); if (!ret) constraints->uv_less_critical_window_ms = pval; else From 40be32303ec829ea12f9883e499bfd3fe9e52baf Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 17 Dec 2024 15:56:45 +0530 Subject: [PATCH 165/337] RDMA/bnxt_re: Fix max_qp_wrs reported While creating qps, driver adds one extra entry to the sq size passed by the ULPs in order to avoid queue full condition. When ULPs creates QPs with max_qp_wr reported, driver creates QP with 1 more than the max_wqes supported by HW. Create QP fails in this case. To avoid this error, reduce 1 entry in max_qp_wqes and report it to the stack. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 7e20ae3d2c4f..73c9baaebb4e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -129,7 +129,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_init_rd_atom = sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; - attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; /* * 128 WQEs needs to be reserved for the HW (8916). Prevent * reporting the max number From d5a38bf2f35979537c526acbc56bc435ed40685f Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 17 Dec 2024 15:56:46 +0530 Subject: [PATCH 166/337] RDMA/bnxt_re: Disable use of reserved wqes Disabling the reserved wqes logic for Gen P5/P7 devices because this workaround is required only for legacy devices. Fixes: ecb53febfcad ("RDMA/bnxt_en: Enable RDMA driver support for 57500 chip") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 73c9baaebb4e..776f8f1f1432 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -130,11 +130,13 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? 
BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; - /* - * 128 WQEs needs to be reserved for the HW (8916). Prevent - * reporting the max number - */ - attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + if (!bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) { + /* + * 128 WQEs needs to be reserved for the HW (8916). Prevent + * reporting the max number on legacy devices + */ + attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + } attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; From d13be54dc18baee7a3e44349b80755a8c8205d3f Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Tue, 17 Dec 2024 15:56:47 +0530 Subject: [PATCH 167/337] RDMA/bnxt_re: Add send queue size check for variable wqe For the fixed WQE case, HW supports 0xFFFF WQEs. For variable Size WQEs, HW treats this number as the 16 bytes slots. The maximum supported WQEs needs to be adjusted based on the number of slots. Set a maximum WQE limit for variable WQE scenario. Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 776f8f1f1432..9df3e3271577 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -138,6 +138,10 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; } + /* Adjust for max_qp_wqes for variable wqe */ + if (cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + attr->max_qp_wqes = BNXT_VAR_MAX_WQE - 1; + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); From bb839f3ace0fee532a0487b692cc4d868fccb7cf Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Tue, 17 Dec 2024 15:56:48 +0530 Subject: [PATCH 168/337] RDMA/bnxt_re: Fix MSN table size for variable wqe mode For variable size wqe mode, the MSN table size should be half the size of the SQ depth. Fixing this to avoid wrap around problems in the retransmission path. 
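For illustration (the numbers are hypothetical): if bnxt_qplib_set_sq_size()
returns 1024 slots for a variable-WQE SQ, the MSN table is now sized to
roundup_pow_of_two(1024) / 2 = 512 entries, while fixed (static) WQE mode
keeps the full 1024.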
Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Reviewed-by: Kashyap Desai Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index d8a2a929bbe3..951ad90f5aa9 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1033,7 +1033,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) : 0; /* Update msn tbl size */ if (qp->is_host_msn_tbl && psn_sz) { - hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + else + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)) / 2; qp->msn_tbl_sz = hwq_attr.aux_depth; qp->msn = 0; } From 9272cba0ded71b5a2084da3004ec7806b8cb7fd2 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 17 Dec 2024 15:56:49 +0530 Subject: [PATCH 169/337] RDMA/bnxt_re: Fix the locking while accessing the QP table QP table handling is synchronized with destroy QP and Async event from the HW. The same needs to be synchronized during create_qp also. Use the same lock in create_qp also. Fixes: 76d3ddff7153 ("RDMA/bnxt_re: synchronize the qp-handle table array") Fixes: f218d67ef004 ("RDMA/bnxt_re: Allow posting when QPs are in error") Fixes: 84cf229f4001 ("RDMA/bnxt_re: Fix the qp table indexing") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 951ad90f5aa9..5336f74297f8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1181,9 +1181,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rq->dbinfo.db = qp->dpi->dbr; rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size); } + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp; + spin_unlock_bh(&rcfw->tbl_lock); return 0; fail: From 29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 19 Dec 2024 19:15:06 +0800 Subject: [PATCH 170/337] selftests/bpf: Use asm constraint "m" for LoongArch Currently, LoongArch LLVM does not support the constraint "o" and no plan to support it, it only supports the similar constraint "m", so change the constraints from "nor" in the "else" case to arch-specific "nmr" to avoid the build error such as "unexpected asm memory constraint" for LoongArch. 
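For reference, the constraint letters involved (as documented for GCC/LLVM
inline assembly): "n" accepts an immediate integer, "r" a general-purpose
register, "o" an offsettable memory operand and "m" any memory operand the
target supports. LoongArch LLVM implements "m" but not "o", hence the
arch-specific "nmr" in place of the generic "nor".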
Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests") Suggested-by: Weining Lu Suggested-by: Li Chen Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Reviewed-by: Huacai Chen Cc: stable@vger.kernel.org Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172 Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn --- tools/testing/selftests/bpf/sdt.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h index ca0162b4dc57..1fcfa5160231 100644 --- a/tools/testing/selftests/bpf/sdt.h +++ b/tools/testing/selftests/bpf/sdt.h @@ -102,6 +102,8 @@ # define STAP_SDT_ARG_CONSTRAINT nZr # elif defined __arm__ # define STAP_SDT_ARG_CONSTRAINT g +# elif defined __loongarch__ +# define STAP_SDT_ARG_CONSTRAINT nmr # else # define STAP_SDT_ARG_CONSTRAINT nor # endif From 4b2efb9db0c22a130bbd1275e489b42c02d08050 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:37 +0100 Subject: [PATCH 171/337] accel/ivpu: Fix general protection fault in ivpu_bo_list() Check if ctx is not NULL before accessing its fields. Fixes: 37dee2a2f433 ("accel/ivpu: Improve buffer object debug logs") Cc: stable@vger.kernel.org # v6.8 Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index d8e97a760fbc..16178054e629 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -409,7 +409,7 @@ static void ivpu_bo_print_info(struct ivpu_bo *bo, struct drm_printer *p) mutex_lock(&bo->lock); drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u", - bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size, + bo, bo->ctx ? bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size, bo->flags, kref_read(&bo->base.base.refcount)); if (bo->base.pages) From 6c9ba75f147b24b5c59aac7356a38a0fef664afa Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:38 +0100 Subject: [PATCH 172/337] accel/ivpu: Fix memory leak in ivpu_mmu_reserved_context_init() Add appropriate error handling to ensure all allocated resources are released upon encountering an error. 
Fixes: a74f4d991352 ("accel/ivpu: Defer MMU root page table allocation") Cc: Karol Wachowski Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu_context.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c index 891967a95bc3..0af614dfb6f9 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.c +++ b/drivers/accel/ivpu/ivpu_mmu_context.c @@ -612,18 +612,22 @@ int ivpu_mmu_reserved_context_init(struct ivpu_device *vdev) if (!ivpu_mmu_ensure_pgd(vdev, &vdev->rctx.pgtable)) { ivpu_err(vdev, "Failed to allocate root page table for reserved context\n"); ret = -ENOMEM; - goto unlock; + goto err_ctx_fini; } ret = ivpu_mmu_cd_set(vdev, vdev->rctx.id, &vdev->rctx.pgtable); if (ret) { ivpu_err(vdev, "Failed to set context descriptor for reserved context\n"); - goto unlock; + goto err_ctx_fini; } -unlock: mutex_unlock(&vdev->rctx.lock); return ret; + +err_ctx_fini: + mutex_unlock(&vdev->rctx.lock); + ivpu_mmu_context_fini(vdev, &vdev->rctx); + return ret; } void ivpu_mmu_reserved_context_fini(struct ivpu_device *vdev) From 0f6482caa6acdfdfc744db7430771fe7e6c4e787 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:39 +0100 Subject: [PATCH 173/337] accel/ivpu: Fix WARN in ivpu_ipc_send_receive_internal() Move pm_runtime_set_active() to ivpu_pm_init() so when ivpu_ipc_send_receive_internal() is executed before ivpu_pm_enable() it already has correct runtime state, even if last resume was not successful. Fixes: 8ed520ff4682 ("accel/ivpu: Move set autosuspend delay to HW specific code") Cc: stable@vger.kernel.org # v6.7+ Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index dbc0711e28d1..949f4233946c 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -378,6 +378,7 @@ void ivpu_pm_init(struct ivpu_device *vdev) pm_runtime_use_autosuspend(dev); pm_runtime_set_autosuspend_delay(dev, delay); + pm_runtime_set_active(dev); ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay); } @@ -392,7 +393,6 @@ void ivpu_pm_enable(struct ivpu_device *vdev) { struct device *dev = vdev->drm.dev; - pm_runtime_set_active(dev); pm_runtime_allow(dev); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); From 716f2bca1ce93bb95364f1fc0555c1650507b588 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 18 Dec 2024 18:57:24 +0100 Subject: [PATCH 174/337] selftests/bpf: Fix compilation error in get_uprobe_offset() In get_uprobe_offset(), the call to procmap_query() use the constant PROCMAP_QUERY_VMA_EXECUTABLE, even if PROCMAP_QUERY is not defined. Define PROCMAP_QUERY_VMA_EXECUTABLE when PROCMAP_QUERY isn't. 
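Note that the fallback value only needs to exist so that the call site in
get_uprobe_offset() compiles; in this configuration the procmap_query()
stub returns -EOPNOTSUPP and ignores its flags argument. The chosen 0x04
also matches the flag's value in the UAPI enum procmap_query_flags.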
Fixes: 4e9e07603ecd ("selftests/bpf: make use of PROCMAP_QUERY ioctl if available") Signed-off-by: Jerome Marchand Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20241218175724.578884-1-jmarchan@redhat.com --- tools/testing/selftests/bpf/trace_helpers.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 2d742fdac6b9..81943c6254e6 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -293,6 +293,10 @@ static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *st return 0; } #else +# ifndef PROCMAP_QUERY_VMA_EXECUTABLE +# define PROCMAP_QUERY_VMA_EXECUTABLE 0x04 +# endif + static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; From 8d90a86ed053226a297ce062f4d9f4f521e05c4c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:48 -0800 Subject: [PATCH 175/337] mmc: sdhci-msm: fix crypto key eviction Commit c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API") introduced an incorrect check of the algorithm ID into the key eviction path, and thus qcom_ice_evict_key() is no longer ever called. Fix it. Fixes: c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API") Cc: stable@vger.kernel.org Cc: Abel Vesa Signed-off-by: Eric Biggers Message-ID: <20241213041958.202565-6-ebiggers@kernel.org> Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index e00208535bd1..319f0ebbe652 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1867,20 +1867,20 @@ static int sdhci_msm_program_key(struct cqhci_host *cq_host, struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); union cqhci_crypto_cap_entry cap; + if (!(cfg->config_enable & CQHCI_CRYPTO_CONFIGURATION_ENABLE)) + return qcom_ice_evict_key(msm_host->ice, slot); + /* Only AES-256-XTS has been tested so far. */ cap = cq_host->crypto_cap_array[cfg->crypto_cap_idx]; if (cap.algorithm_id != CQHCI_CRYPTO_ALG_AES_XTS || cap.key_size != CQHCI_CRYPTO_KEY_SIZE_256) return -EINVAL; - if (cfg->config_enable & CQHCI_CRYPTO_CONFIGURATION_ENABLE) - return qcom_ice_program_key(msm_host->ice, - QCOM_ICE_CRYPTO_ALG_AES_XTS, - QCOM_ICE_CRYPTO_KEY_SIZE_256, - cfg->crypto_key, - cfg->data_unit_size, slot); - else - return qcom_ice_evict_key(msm_host->ice, slot); + return qcom_ice_program_key(msm_host->ice, + QCOM_ICE_CRYPTO_ALG_AES_XTS, + QCOM_ICE_CRYPTO_KEY_SIZE_256, + cfg->crypto_key, + cfg->data_unit_size, slot); } #else /* CONFIG_MMC_CRYPTO */ From 469c0682e03d67d8dc970ecaa70c2d753057c7c0 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sun, 15 Dec 2024 12:01:59 +0900 Subject: [PATCH 176/337] pmdomain: imx: gpcv2: fix an OF node reference leak in imx_gpcv2_probe() imx_gpcv2_probe() leaks an OF node reference obtained by of_get_child_by_name(). Fix it by declaring the device node with the __free(device_node) cleanup construct. This bug was found by an experimental static analysis tool that I am developing. 
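For readers unfamiliar with the scope-based cleanup helpers, a minimal
sketch of the construct used below (the surrounding function is
illustrative, not the gpcv2 code):

  #include <linux/cleanup.h>
  #include <linux/errno.h>
  #include <linux/of.h>

  static int example_init(struct device_node *parent)
  {
          /* of_node_put() runs automatically when np goes out of scope,
           * on every return path, so the reference cannot leak. */
          struct device_node *np __free(device_node) =
                  of_get_child_by_name(parent, "pgc");

          if (!np)
                  return -EINVAL;

          /* ... use np ... */
          return 0;
  }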
Fixes: 03aa12629fc4 ("soc: imx: Add GPCv2 power gating driver") Signed-off-by: Joe Hattori Cc: stable@vger.kernel.org Message-ID: <20241215030159.1526624-1-joe@pf.is.s.u-tokyo.ac.jp> Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpcv2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c index e67ecf99ef84..9bdb80fd7210 100644 --- a/drivers/pmdomain/imx/gpcv2.c +++ b/drivers/pmdomain/imx/gpcv2.c @@ -1458,12 +1458,12 @@ static int imx_gpcv2_probe(struct platform_device *pdev) .max_register = SZ_4K, }; struct device *dev = &pdev->dev; - struct device_node *pgc_np; + struct device_node *pgc_np __free(device_node) = + of_get_child_by_name(dev->of_node, "pgc"); struct regmap *regmap; void __iomem *base; int ret; - pgc_np = of_get_child_by_name(dev->of_node, "pgc"); if (!pgc_np) { dev_err(dev, "No power domains specified in DT\n"); return -EINVAL; From f64f610ec6ab59dd0391b03842cea3a4cd8ee34f Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 18 Dec 2024 19:44:33 +0100 Subject: [PATCH 177/337] pmdomain: core: add dummy release function to genpd device The genpd device, which is really only used as a handle to lookup OPP, but not even registered to the device core otherwise and thus lifetime linked to the genpd struct it is contained in, is missing a release function. After b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()") the device will be cleaned up going through the driver core device_release() function, which will warn when no release callback is present for the device. Add a dummy release function to shut up the warning. Signed-off-by: Lucas Stach Tested-by: Luca Ceresoli Fixes: b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()") Cc: stable@vger.kernel.org Message-ID: <20241218184433.1930532-1-l.stach@pengutronix.de> Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index bb11f467dc78..20a9efebbcb7 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -2142,6 +2142,11 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd) return 0; } +static void genpd_provider_release(struct device *dev) +{ + /* nothing to be done here */ +} + static int genpd_alloc_data(struct generic_pm_domain *genpd) { struct genpd_governor_data *gd = NULL; @@ -2173,6 +2178,7 @@ static int genpd_alloc_data(struct generic_pm_domain *genpd) genpd->gd = gd; device_initialize(&genpd->dev); + genpd->dev.release = genpd_provider_release; if (!genpd_is_dev_name_fw(genpd)) { dev_set_name(&genpd->dev, "%s", genpd->name); From 1b684ca15f9d78f45de3cdba7e19611387e16aa7 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 17 Dec 2024 10:49:15 +0700 Subject: [PATCH 178/337] drm/sched: Fix drm_sched_fini() docu generation Commit baf4afc5831438 ("drm/sched: Improve teardown documentation") added a list of drm_sched_fini()'s problems. The list triggers htmldocs warning (but renders correctly in htmldocs output): Documentation/gpu/drm-mm:571: ./drivers/gpu/drm/scheduler/sched_main.c:1359: ERROR: Unexpected indentation. Separate the list from the preceding paragraph by a blank line to fix the warning. While at it, also end the aforementioned paragraph by a colon. 
Fixes: baf4afc58314 ("drm/sched: Improve teardown documentation") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20241108175655.6d3fcfb7@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya [phasta: Adjust commit message] Signed-off-by: Philipp Stanner Link: https://patchwork.freedesktop.org/patch/msgid/20241217034915.62594-1-bagasdotme@gmail.com --- drivers/gpu/drm/scheduler/sched_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 7ce25281c74c..57da84908752 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1355,7 +1355,8 @@ EXPORT_SYMBOL(drm_sched_init); * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job() * will not be called for all jobs still in drm_gpu_scheduler.pending_list. * There is no solution for this currently. Thus, it is up to the driver to make - * sure that + * sure that: + * * a) drm_sched_fini() is only called after for all submitted jobs * drm_sched_backend_ops.free_job() has been called or that * b) the jobs for which drm_sched_backend_ops.free_job() has not been called From a769bee5f9fbca47efd4fa6bc3d726d370cedebe Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Tue, 17 Dec 2024 00:09:36 +0530 Subject: [PATCH 179/337] smb: use macros instead of constants for leasekey size and default cifsattrs value Replace default hardcoded value for cifsAttrs with ATTR_ARCHIVE macro Use SMB2_LEASE_KEY_SIZE macro for leasekey size in smb2_lease_break Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 +- fs/smb/client/smb2pdu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9d96b833015c..b800c9f585d8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -398,7 +398,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; - cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->cifsAttrs = ATTR_ARCHIVE; /* default */ cifs_inode->time = 0; /* * Until the file is open and we have gotten oplock info back from the diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47..c945b94318f8 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -6204,7 +6204,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, req->StructureSize = cpu_to_le16(36); total_len += 12; - memcpy(req->LeaseKey, lease_key, 16); + memcpy(req->LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); req->LeaseState = lease_state; flags |= CIFS_NO_RSP_BUF; From ee1c8e6b2931811a906b8c78006bfe0a3386fa60 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Tue, 17 Dec 2024 10:25:10 +0100 Subject: [PATCH 180/337] smb: client: Deduplicate "select NETFS_SUPPORT" in Kconfig Repeating automatically selected options in Kconfig files is redundant, so let's delete repeated "select NETFS_SUPPORT" that was added accidentally. 
Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: Dragan Simic Signed-off-by: Steve French --- fs/smb/client/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2aff6d1395ce..9f05f94e265a 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET - select NETFS_SUPPORT select NLS select NLS_UCS2_UTILS select CRYPTO From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 10 Dec 2024 18:15:12 -0300 Subject: [PATCH 181/337] smb: client: fix TCP timers deadlock after rmmod Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") fixed a netns UAF by manually enabled socket refcounting (sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)). The reason the patch worked for that bug was because we now hold references to the netns (get_net_track() gets a ref internally) and they're properly released (internally, on __sk_destruct()), but only because sk->sk_net_refcnt was set. Problem: (this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless if init_net or other) Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not only out of cifs scope, but also technically wrong -- it's set conditionally based on user (=1) vs kernel (=0) sockets. And net/ implementations seem to base their user vs kernel space operations on it. e.g. upon TCP socket close, the TCP timers are not cleared because sk->sk_net_refcnt=1: (cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets")) net/ipv4/tcp.c: void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } Which will throw a lockdep warning and then, as expected, deadlock on tcp_write_timer(). A way to reproduce this is by running the reproducer from ef7134c7fc48 and then 'rmmod cifs'. A few seconds later, the deadlock/lockdep warning shows up. Fix: We shouldn't mess with socket internals ourselves, so do not set sk_net_refcnt manually. Also change __sock_create() to sock_create_kern() for explicitness. As for non-init_net network namespaces, we deal with it the best way we can -- hold an extra netns reference for server->ssocket and drop it when it's released. This ensures that the netns still exists whenever we need to create/destroy server->ssocket, but is not directly tied to it. Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") Cc: stable@vger.kernel.org Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/connect.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 2372538a1211..ddcc9e514a0e 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); + if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; + + /* Release netns reference for the socket. */ + put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + /* Release netns reference for this server. 
*/ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server); @@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; + + /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); @@ -1844,6 +1851,7 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); + /* Release netns reference for this server. */ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1852,8 +1860,10 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) + if (tcp_ses->ssocket) { sock_release(tcp_ses->ssocket); + put_net(cifs_net_ns(tcp_ses)); + } kfree(tcp_ses); } return ERR_PTR(rc); @@ -3131,20 +3141,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); - struct sock *sk; - rc = __sock_create(net, sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - sk = server->ssocket->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + /* + * Grab netns reference for the socket. + * + * It'll be released here, on error, or in clean_demultiplex_info() upon server + * teardown. + */ + get_net(net); /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); @@ -3158,8 +3168,10 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) + if (rc < 0) { + put_net(cifs_net_ns(server)); return rc; + } /* * Eventually check for other socket options to change from @@ -3196,6 +3208,7 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); + put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; @@ -3204,6 +3217,9 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); + if (rc < 0) + put_net(cifs_net_ns(server)); + return rc; } From c261e4f1dd29fabab54b325bc1da8769a3998be1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2024 09:32:26 -0700 Subject: [PATCH 182/337] io_uring/register: limit ring resizing to DEFER_TASKRUN With DEFER_TASKRUN, we know the ring can't be both waited upon and resized at the same time. This is important for CQ resizing. Allowing SQ ring resizing is more trivial, but isn't the interesting use case. Hence limit ring resizing in general to DEFER_TASKRUN only for now. This isn't a huge problem as CQ ring resizing is generally the most useful on networking type of workloads where it can be hard to size the ring appropriately upfront, and those should be using DEFER_TASKRUN for better performance. 
Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/register.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/register.c b/io_uring/register.c index 1e99c783abdf..fdd44914c39c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -414,6 +414,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && current != ctx->submitter_task) return -EEXIST; + /* limited to DEFER_TASKRUN for now */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags & ~RESIZE_FLAGS) From 3202ca221578850f34e0fea39dc6cfa745ed7aac Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 17 Dec 2024 10:51:01 +0100 Subject: [PATCH 183/337] PCI: Honor Max Link Speed when determining supported speeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Supported Link Speeds Vector in the Link Capabilities 2 Register indicates the *supported* link speeds. The Max Link Speed field in the Link Capabilities Register indicates the *maximum* of those speeds. pcie_get_supported_speeds() neglects to honor the Max Link Speed field and will thus incorrectly deem higher speeds as supported. Fix it. One user-visible issue addressed here is an incorrect value in the sysfs attribute "max_link_speed". But the main motivation is a boot hang reported by Niklas: Intel JHL7540 "Titan Ridge 2018" Thunderbolt controllers supports 2.5-8 GT/s speeds, but indicate 2.5 GT/s as maximum. Ilpo recalls seeing this on more devices. It can be explained by the controller's Downstream Ports supporting 8 GT/s if an Endpoint is attached, but limiting to 2.5 GT/s if the port interfaces to a PCIe Adapter, in accordance with USB4 v2 sec 11.2.1: "This section defines the functionality of an Internal PCIe Port that interfaces to a PCIe Adapter. [...] The Logical sub-block shall update the PCIe configuration registers with the following characteristics: [...] Max Link Speed field in the Link Capabilities Register set to 0001b (data rate of 2.5 GT/s only). Note: These settings do not represent actual throughput. Throughput is implementation specific and based on the USB4 Fabric performance." The present commit is not sufficient on its own to fix Niklas' boot hang, but it is a prerequisite: A subsequent commit will fix the boot hang by enabling bandwidth control only if more than one speed is supported. The GENMASK() macro used herein specifies 0 as lowest bit, even though the Supported Link Speeds Vector ends at bit 1. This is done on purpose to avoid a GENMASK(0, 1) macro if Max Link Speed is zero. That macro would be invalid as the lowest bit is greater than the highest bit. Ilpo has witnessed a zero Max Link Speed on Root Complex Integrated Endpoints in particular, so it does occur in practice. (The Link Capabilities Register is optional on RCiEPs per PCIe r6.2 sec 7.5.3.) 
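A worked example of the new mask (register encodings per PCIe r6.2; the
device is hypothetical): with Max Link Speed = 0011b (8.0 GT/s),
GENMASK(3, 0) = 0b1111, so the Supported Link Speeds Vector keeps bits 1-3
(2.5, 5.0 and 8.0 GT/s) and any advertised 16.0 GT/s or higher bits are
cleared. With Max Link Speed = 0, GENMASK(0, 0) = 0b1 clears the entire
vector, since the vector occupies bits 7:1 and bit 0 is reserved.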
Fixes: d2bd39c0456b ("PCI: Store all PCIe Supported Link Speeds") Closes: https://lore.kernel.org/r/70829798889c6d779ca0f6cd3260a765780d1369.camel@kernel.org Link: https://lore.kernel.org/r/fe03941e3e1cc42fb9bf4395e302bff53ee2198b.1734428762.git.lukas@wunner.de Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Jonathan Cameron Reviewed-by: Ilpo Järvinen --- drivers/pci/pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0b29ec6e8e5e..661f98c6c63a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6232,12 +6232,14 @@ u8 pcie_get_supported_speeds(struct pci_dev *dev) pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS; + /* Ignore speeds higher than Max Link Speed */ + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + speeds &= GENMASK(lnkcap & PCI_EXP_LNKCAP_SLS, 0); + /* PCIe r3.0-compliant */ if (speeds) return speeds; - pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); - /* Synthesize from the Max Link Speed field */ if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB) speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB; From 774c71c52aa487001c7da9f93b10cedc9985c371 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 17 Dec 2024 10:51:02 +0100 Subject: [PATCH 184/337] PCI/bwctrl: Enable only if more than one speed is supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a PCIe port only supports a single speed, enabling bandwidth control is pointless: There's no need to monitor autonomous speed changes, nor can the speed be changed. Not enabling it saves a small amount of memory and compute resources, but also fixes a boot hang reported by Niklas: It occurs when enabling bandwidth control on Downstream Ports of Intel JHL7540 "Titan Ridge 2018" Thunderbolt controllers. The ports only support 2.5 GT/s in accordance with USB4 v2 sec 11.2.1, so the present commit works around the issue. PCIe r6.2 sec 8.2.1 prescribes that: "A device must support 2.5 GT/s and is not permitted to skip support for any data rates between 2.5 GT/s and the highest supported rate." Consequently, bandwidth control is currently only disabled if a port doesn't support higher speeds than 2.5 GT/s. However the Implementation Note in PCIe r6.2 sec 7.5.3.18 cautions: "It is strongly encouraged that software primarily utilize the Supported Link Speeds Vector instead of the Max Link Speed field, so that software can determine the exact set of supported speeds on current and future hardware. This can avoid software being confused if a future specification defines Links that do not require support for all slower speeds." In other words, future revisions of the PCIe Base Spec may allow gaps in the Supported Link Speeds Vector. To be future-proof, don't just check whether speeds above 2.5 GT/s are supported, but rather check whether *more than one* speed is supported. 
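Concretely (using the vector format stored by pcie_get_supported_speeds()):
a Downstream Port that advertises only 2.5 GT/s has supported_speeds ==
PCI_EXP_LNKCAP2_SLS_2_5GB, a single bit, so hweight8() returns 1 and
PCIE_PORT_SERVICE_BWCTRL is not requested; a port supporting 2.5, 5.0 and
8.0 GT/s has three bits set and keeps the service.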
Fixes: 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller") Closes: https://lore.kernel.org/r/db8e457fcd155436449b035e8791a8241b0df400.camel@kernel.org Link: https://lore.kernel.org/r/3564908a9c99fc0d2a292473af7a94ebfc8f5820.1734428762.git.lukas@wunner.de Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Jonathan Cameron Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen --- drivers/pci/pcie/portdrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index 5e10306b6308..02e73099bad0 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -265,12 +265,14 @@ static int get_port_device_capability(struct pci_dev *dev) (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; + /* Enable bandwidth control if more than one speed is supported. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { u32 linkcap; pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap); - if (linkcap & PCI_EXP_LNKCAP_LBNC) + if (linkcap & PCI_EXP_LNKCAP_LBNC && + hweight8(dev->supported_speeds) > 1) services |= PCIE_PORT_SERVICE_BWCTRL; } From 92941c7f2c9529fac1b2670482d0ced3b46eac70 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Dec 2024 23:28:50 +0530 Subject: [PATCH 185/337] smb: fix bytes written value in /proc/fs/cifs/Stats With recent netfs apis changes, the bytes written value was not getting updated in /proc/fs/cifs/Stats. Fix this by updating tcon->bytes in write operations. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c945b94318f8..959359301250 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4840,6 +4840,8 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; + cifs_stats_bytes_written(tcon, written); + if (written < wdata->subreq.len) wdata->result = -ENOSPC; else @@ -5156,6 +5158,7 @@ replay_again: cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); + cifs_stats_bytes_written(io_parms->tcon, *nbytes); trace_smb3_write_done(0, 0, xid, req->PersistentFileId, io_parms->tcon->tid, From dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 19 Dec 2024 19:52:58 +0000 Subject: [PATCH 186/337] io_uring: check if iowq is killed before queuing task work can be executed after the task has gone through io_uring termination, whether it's the final task_work run or the fallback path. In this case, task work will find ->io_wq being already killed and null'ed, which is a problem if it then tries to forward the request to io_queue_iowq(). Make io_queue_iowq() fail requests in this case. Note that it also checks PF_KTHREAD, because the user can first close a DEFER_TASKRUN ring and shortly after kill the task, in which case ->iowq check would race. 
Cc: stable@vger.kernel.org Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd") Fixes: 773af69121ecc ("io_uring: always reissue from task_work context") Reported-by: Will Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 432b95ca9c85..d3403c8216db 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -514,7 +514,11 @@ static void io_queue_iowq(struct io_kiocb *req) struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); From 25c6a5ab151fb9c886552bf5aa7cbf2a5c6e96af Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 17 Dec 2024 14:35:00 +0800 Subject: [PATCH 187/337] net: phy: micrel: Dynamically control external clock of KSZ PHY On the i.MX6ULL-14x14-EVK board, enet1_ref and enet2_ref are used as the clock sources for two external KSZ PHYs. However, after closing the two FEC ports, the clk_enable_count of the enet1_ref and enet2_ref clocks is not 0. The root cause is that since the commit 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock"), the external clock of KSZ PHY has been enabled when the PHY driver probes, and it can only be disabled when the PHY driver is removed. This causes the clock to continue working when the system is suspended or the network port is down. Although Heiko explained in the commit message that the patch was because some clock suppliers need to enable the clock to get the valid clock rate , it seems that the simple fix is to disable the clock after getting the clock rate to solve the current problem. This is indeed true, but we need to admit that Heiko's patch has been applied for more than a year, and we cannot guarantee whether there are platforms that only enable rmii-ref in the KSZ PHY driver during this period. If this is the case, disabling rmii-ref will cause RMII on these platforms to not work. Secondly, commit 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") just simply enables the generic clock permanently, which seems like the generic clock may only be enabled in the PHY driver. If we simply disable the generic clock, RMII may not work. If we keep it as it is, the platform using the generic clock will have the same problem as the i.MX6ULL platform. To solve this problem, the clock is enabled when phy_driver::resume() is called, and the clock is disabled when phy_driver::suspend() is called. Since phy_driver::resume() and phy_driver::suspend() are not called in pairs, an additional clk_enable flag is added. When phy_driver::suspend() is called, the clock is disabled only if clk_enable is true. Conversely, when phy_driver::resume() is called, the clock is enabled if clk_enable is false. The changes that introduced the problem were only a few lines, while the current fix is about a hundred lines, which seems out of proportion, but it is necessary because kszphy_probe() is used by multiple KSZ PHYs and we need to fix all of them. 
Fixes: 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock") Fixes: 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") Signed-off-by: Wei Fang Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241217063500.1424011-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 114 ++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3ef508840674..eeb33eb181ac 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -432,10 +432,12 @@ struct kszphy_ptp_priv { struct kszphy_priv { struct kszphy_ptp_priv ptp_priv; const struct kszphy_type *type; + struct clk *clk; int led_mode; u16 vct_ctrl1000; bool rmii_ref_clk_sel; bool rmii_ref_clk_sel_val; + bool clk_enable; u64 stats[ARRAY_SIZE(kszphy_hw_stats)]; }; @@ -2050,6 +2052,46 @@ static void kszphy_get_stats(struct phy_device *phydev, data[i] = kszphy_get_stat(phydev, i); } +static void kszphy_enable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (!priv->clk_enable && priv->clk) { + clk_prepare_enable(priv->clk); + priv->clk_enable = true; + } +} + +static void kszphy_disable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (priv->clk_enable && priv->clk) { + clk_disable_unprepare(priv->clk); + priv->clk_enable = false; + } +} + +static int kszphy_generic_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return genphy_resume(phydev); +} + +static int kszphy_generic_suspend(struct phy_device *phydev) +{ + int ret; + + ret = genphy_suspend(phydev); + if (ret) + return ret; + + kszphy_disable_clk(phydev); + + return 0; +} + static int kszphy_suspend(struct phy_device *phydev) { /* Disable PHY Interrupts */ @@ -2059,7 +2101,7 @@ static int kszphy_suspend(struct phy_device *phydev) phydev->drv->config_intr(phydev); } - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static void kszphy_parse_led_mode(struct phy_device *phydev) @@ -2090,7 +2132,9 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; /* After switching from power-down to normal mode, an internal global * reset is automatically generated. Wait a minimum of 1 ms before @@ -2112,6 +2156,24 @@ static int kszphy_resume(struct phy_device *phydev) return 0; } +/* Because of errata DS80000700A, receiver error following software + * power down. Suspend and resume callbacks only disable and enable + * external rmii reference clock. + */ +static int ksz8041_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return 0; +} + +static int ksz8041_suspend(struct phy_device *phydev) +{ + kszphy_disable_clk(phydev); + + return 0; +} + static int ksz9477_resume(struct phy_device *phydev) { int ret; @@ -2159,7 +2221,10 @@ static int ksz8061_resume(struct phy_device *phydev) if (!(ret & BMCR_PDOWN)) return 0; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; + usleep_range(1000, 2000); /* Re-program the value after chip is reset. 
*/ @@ -2177,6 +2242,11 @@ static int ksz8061_resume(struct phy_device *phydev) return 0; } +static int ksz8061_suspend(struct phy_device *phydev) +{ + return kszphy_suspend(phydev); +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -2217,10 +2287,14 @@ static int kszphy_probe(struct phy_device *phydev) } else if (!clk) { /* unnamed clock from the generic ethernet-phy binding */ clk = devm_clk_get_optional_enabled(&phydev->mdio.dev, NULL); - if (IS_ERR(clk)) - return PTR_ERR(clk); } + if (IS_ERR(clk)) + return PTR_ERR(clk); + + clk_disable_unprepare(clk); + priv->clk = clk; + if (ksz8041_fiber_mode(phydev)) phydev->port = PORT_FIBRE; @@ -5290,6 +5364,21 @@ static int lan8841_probe(struct phy_device *phydev) return 0; } +static int lan8804_resume(struct phy_device *phydev) +{ + return kszphy_resume(phydev); +} + +static int lan8804_suspend(struct phy_device *phydev) +{ + return kszphy_generic_suspend(phydev); +} + +static int lan8841_resume(struct phy_device *phydev) +{ + return kszphy_generic_resume(phydev); +} + static int lan8841_suspend(struct phy_device *phydev) { struct kszphy_priv *priv = phydev->priv; @@ -5298,7 +5387,7 @@ static int lan8841_suspend(struct phy_device *phydev) if (ptp_priv->ptp_clock) ptp_cancel_worker_sync(ptp_priv->ptp_clock); - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static struct phy_driver ksphy_driver[] = { @@ -5358,9 +5447,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - /* No suspend/resume callbacks because of errata DS80000700A, - * receiver error following software power down. - */ + .suspend = ksz8041_suspend, + .resume = ksz8041_resume, }, { .phy_id = PHY_ID_KSZ8041RNLI, .phy_id_mask = MICREL_PHY_ID_MASK, @@ -5436,7 +5524,7 @@ static struct phy_driver ksphy_driver[] = { .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = kszphy_suspend, + .suspend = ksz8061_suspend, .resume = ksz8061_resume, }, { .phy_id = PHY_ID_KSZ9021, @@ -5507,8 +5595,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = kszphy_resume, + .suspend = lan8804_suspend, + .resume = lan8804_resume, .config_intr = lan8804_config_intr, .handle_interrupt = lan8804_handle_interrupt, }, { @@ -5526,7 +5614,7 @@ static struct phy_driver ksphy_driver[] = { .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, .suspend = lan8841_suspend, - .resume = genphy_resume, + .resume = lan8841_resume, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, }, { From d81cadbe164265337f149cf31c9462d7217c1eed Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 4 Nov 2024 07:58:45 +0000 Subject: [PATCH 188/337] KVM: SVM: Disable AVIC on SNP-enabled system without HvInUseWrAllowed feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On SNP-enabled system, VMRUN marks AVIC Backing Page as in-use while the guest is running for both secure and non-secure guest. Any hypervisor write to the in-use vCPU's AVIC backing page (e.g. to inject an interrupt) will generate unexpected #PF in the host. 
Currently, attempt to run AVIC guest would result in the following error: BUG: unable to handle page fault for address: ff3a442e549cc270 #PF: supervisor write access in kernel mode #PF: error_code(0x80000003) - RMP violation PGD b6ee01067 P4D b6ee02067 PUD 10096d063 PMD 11c540063 PTE 80000001149cc163 SEV-SNP: PFN 0x1149cc unassigned, dumping non-zero entries in 2M PFN region: [0x114800 - 0x114a00] ... Newer AMD system is enhanced to allow hypervisor to modify the backing page for non-secure guest on SNP-enabled system. This enhancement is available when the CPUID Fn8000_001F_EAX bit 30 is set (HvInUseWrAllowed). This table describes AVIC support matrix w.r.t. SNP enablement: | Non-SNP system | SNP system ----------------------------------------------------- Non-SNP guest | AVIC Activate | AVIC Activate iff | | HvInuseWrAllowed=1 ----------------------------------------------------- SNP guest | N/A | Secure AVIC Therefore, check and disable AVIC in kvm_amd driver when the feature is not available on SNP-enabled system. See the AMD64 Architecture Programmer’s Manual (APM) Volume 2 for detail. (https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/ programmer-references/40332.pdf) Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241104075845.7583-1-suravee.suthikulpanit@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/svm/avic.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..645aa360628d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -452,6 +452,7 @@ #define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ +#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4b74ea91f4e6..65fd245a9953 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1199,6 +1199,12 @@ bool avic_hardware_setup(void) return false; } + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { + pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { pr_info("AVIC enabled\n"); } else if (force_avic) { From 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:43:39 -0800 Subject: [PATCH 189/337] KVM: x86: Play nice with protected guests in complete_hypercall_exit() Use is_64_bit_hypercall() instead of is_64_bit_mode() to detect a 64-bit hypercall when completing said hypercall. For guests with protected state, e.g. SEV-ES and SEV-SNP, KVM must assume the hypercall was made in 64-bit mode as the vCPU state needed to detect 64-bit mode is unavailable. 
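The reason the substitution is safe: for vCPUs with protected register state KVM cannot read the segment state needed by is_64_bit_mode(), so the hypercall helper simply assumes 64-bit in that case. Roughly (a sketch of the helper's logic, not a verbatim copy of arch/x86/kvm/x86.h):

	static bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
	{
		/* Protected guests (SEV-ES/SEV-SNP) are assumed to be 64-bit;
		 * their segment state is not visible to KVM.
		 */
		return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
	}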
Hacking the sev_smoke_test selftest to generate a KVM_HC_MAP_GPA_RANGE hypercall via VMGEXIT trips the WARN: ------------[ cut here ]------------ WARNING: CPU: 273 PID: 326626 at arch/x86/kvm/x86.h:180 complete_hypercall_exit+0x44/0xe0 [kvm] Modules linked in: kvm_amd kvm ... [last unloaded: kvm] CPU: 273 UID: 0 PID: 326626 Comm: sev_smoke_test Not tainted 6.12.0-smp--392e932fa0f3-feat #470 Hardware name: Google Astoria/astoria, BIOS 0.20240617.0-0 06/17/2024 RIP: 0010:complete_hypercall_exit+0x44/0xe0 [kvm] Call Trace: kvm_arch_vcpu_ioctl_run+0x2400/0x2720 [kvm] kvm_vcpu_ioctl+0x54f/0x630 [kvm] __se_sys_ioctl+0x6b/0xc0 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e ---[ end trace 0000000000000000 ]--- Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state") Cc: stable@vger.kernel.org Cc: Tom Lendacky Reviewed-by: Xiaoyao Li Reviewed-by: Nikunj A Dadhania Reviewed-by: Tom Lendacky Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20241128004344.4072099-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e713480933a..0b2fe4aa04a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9976,7 +9976,7 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) { u64 ret = vcpu->run->hypercall.ret; - if (!is_64_bit_mode(vcpu)) + if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); ++vcpu->stat.hypercalls; From 4d5163cba43fe96902165606fa54e1aecbbb32de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Dec 2024 09:29:52 -0800 Subject: [PATCH 190/337] KVM: SVM: Allow guest writes to set MSR_AMD64_DE_CFG bits Drop KVM's arbitrary behavior of making DE_CFG.LFENCE_SERIALIZE read-only for the guest, as rejecting writes can lead to guest crashes, e.g. Windows in particular doesn't gracefully handle unexpected #GPs on the WRMSR, and nothing in the AMD manuals suggests that LFENCE_SERIALIZE is read-only _if it exists_. KVM only allows LFENCE_SERIALIZE to be set, by the guest or host, if the underlying CPU has X86_FEATURE_LFENCE_RDTSC, i.e. if LFENCE is guaranteed to be serializing. So if the guest sets LFENCE_SERIALIZE, KVM will provide the desired/correct behavior without any additional action (the guest's value is never stuffed into hardware). And having LFENCE be serializing even when it's not _required_ to be is a-ok from a functional perspective. Fixes: 74a0e79df68a ("KVM: SVM: Disallow guest from changing userspace's MSR_AMD64_DE_CFG value") Fixes: d1d93fa90f1a ("KVM: SVM: Add MSR-based feature support for serializing LFENCE") Reported-by: Simon Pilkington Closes: https://lore.kernel.org/all/52914da7-a97b-45ad-86a0-affdf8266c61@mailbox.org Cc: Tom Lendacky Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20241211172952.1477605-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dd15cc635655..21dacd312779 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3201,15 +3201,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~supported_de_cfg) return 1; - /* - * Don't let the guest change the host-programmed value. The - * MSR is very model specific, i.e. 
contains multiple bits that - * are completely unknown to KVM, and the one bit known to KVM - * is simply a reflection of hardware capabilities. - */ - if (!msr->host_initiated && data != svm->msr_decfg) - return 1; - svm->msr_decfg = data; break; } From 386d69f9f29b0814881fa4f92ac7b8dfa9b4f44a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2024 13:36:11 -0800 Subject: [PATCH 191/337] KVM: x86/mmu: Treat TDP MMU faults as spurious if access is already allowed Treat slow-path TDP MMU faults as spurious if the access is allowed given the existing SPTE to fix a benign warning (other than the WARN itself) due to replacing a writable SPTE with a read-only SPTE, and to avoid the unnecessary LOCK CMPXCHG and subsequent TLB flush. If a read fault races with a write fault, fast GUP fails for any reason when trying to "promote" the read fault to a writable mapping, and KVM resolves the write fault first, then KVM will end up trying to install a read-only SPTE (for a !map_writable fault) overtop a writable SPTE. Note, it's not entirely clear why fast GUP fails, or if that's even how KVM ends up with a !map_writable fault with a writable SPTE. If something else is going awry, e.g. due to a bug in mmu_notifiers, then treating read faults as spurious in this scenario could effectively mask the underlying problem. However, retrying the faulting access instead of overwriting an existing SPTE is functionally correct and desirable irrespective of the WARN, and fast GUP _can_ legitimately fail with a writable VMA, e.g. if the Accessed bit in primary MMU's PTE is toggled and causes a PTE value mismatch. The WARN was also recently added, specifically to track down scenarios where KVM is unnecessarily overwrites SPTEs, i.e. treating the fault as spurious doesn't regress KVM's bug-finding capabilities in any way. In short, letting the WARN linger because there's a tiny chance it's due to a bug elsewhere would be excessively paranoid. Fixes: 1a175082b190 ("KVM: x86/mmu: WARN and flush if resolving a TDP MMU fault clears MMU-writable") Reported-by: Lei Yang Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219588 Tested-by: Lei Yang Link: https://lore.kernel.org/r/20241218213611.3181643-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 12 ------------ arch/x86/kvm/mmu/spte.h | 17 +++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.c | 5 +++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 22e7ad235123..2401606db260 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3364,18 +3364,6 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, return true; } -static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) -{ - if (fault->exec) - return is_executable_pte(spte); - - if (fault->write) - return is_writable_pte(spte); - - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; -} - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f332b33bc817..af10bc0380a3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -461,6 +461,23 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * Returns true if the access indicated by @fault is allowed by the existing + * SPTE protections. 
Note, the caller is responsible for checking that the + * SPTE is a shadow-present, leaf SPTE (either before or after). + */ +static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +{ + if (fault->exec) + return is_executable_pte(spte); + + if (fault->write) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4508d868f1cd..2f15e0e33903 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -985,6 +985,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) return RET_PF_SPURIOUS; + if (is_shadow_present_pte(iter->old_spte) && + is_access_allowed(fault, iter->old_spte) && + is_last_spte(iter->old_spte, iter->level)) + return RET_PF_SPURIOUS; + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else From 902806baf3c1e8383c1fe3ff0b6042b8cb5c2707 Mon Sep 17 00:00:00 2001 From: Stefan Ekenberg Date: Tue, 19 Nov 2024 08:40:29 +0100 Subject: [PATCH 192/337] drm/bridge: adv7511_audio: Update Audio InfoFrame properly AUDIO_UPDATE bit (Bit 5 of MAIN register 0x4A) needs to be set to 1 while updating Audio InfoFrame information and then set to 0 when done. Otherwise partially updated Audio InfoFrames could be sent out. Two cases where this rule were not followed are fixed: - In adv7511_hdmi_hw_params() make sure AUDIO_UPDATE bit is updated before/after setting ADV7511_REG_AUDIO_INFOFRAME. - In audio_startup() use the correct register for clearing AUDIO_UPDATE bit. The problem with corrupted audio infoframes were discovered by letting a HDMI logic analyser check the output of ADV7535. Note that this patchs replaces writing REG_GC(1) with REG_INFOFRAME_UPDATE. Bit 5 of REG_GC(1) is positioned within field GC_PP[3:0] and that field doesn't control audio infoframe and is read- only. My conclusion therefore was that the author if this code meant to clear bit 5 of REG_INFOFRAME_UPDATE from the very beginning. 
Tested-by: Biju Das Fixes: 53c515befe28 ("drm/bridge: adv7511: Add Audio support") Signed-off-by: Stefan Ekenberg Reviewed-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20241119-adv7511-audio-info-frame-v4-1-4ae68e76c89c@axis.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7511_audio.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c b/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c index 61f4a38e7d2b..8f786592143b 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c @@ -153,7 +153,16 @@ static int adv7511_hdmi_hw_params(struct device *dev, void *data, ADV7511_AUDIO_CFG3_LEN_MASK, len); regmap_update_bits(adv7511->regmap, ADV7511_REG_I2C_FREQ_ID_CFG, ADV7511_I2C_FREQ_ID_CFG_RATE_MASK, rate << 4); - regmap_write(adv7511->regmap, 0x73, 0x1); + + /* send current Audio infoframe values while updating */ + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, + BIT(5), BIT(5)); + + regmap_write(adv7511->regmap, ADV7511_REG_AUDIO_INFOFRAME(0), 0x1); + + /* use Audio infoframe updated info */ + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, + BIT(5), 0); return 0; } @@ -184,8 +193,9 @@ static int audio_startup(struct device *dev, void *data) regmap_update_bits(adv7511->regmap, ADV7511_REG_GC(0), BIT(7) | BIT(6), BIT(7)); /* use Audio infoframe updated info */ - regmap_update_bits(adv7511->regmap, ADV7511_REG_GC(1), + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, BIT(5), 0); + /* enable SPDIF receiver */ if (adv7511->audio_source == ADV7511_AUDIO_SOURCE_SPDIF) regmap_update_bits(adv7511->regmap, ADV7511_REG_AUDIO_CONFIG, From 81adbd3ff21c1182e06aa02c6be0bfd9ea02d8e8 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:29 +0000 Subject: [PATCH 193/337] drm: adv7511: Fix use-after-free in adv7533_attach_dsi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host_node pointer was assigned and freed in adv7533_parse_dt(), and later, adv7533_attach_dsi() uses the same. Fix this use-after-free issue by dropping of_node_put() in adv7533_parse_dt() and calling of_node_put() in error path of probe() and also in the remove(). Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Cc: stable@vger.kernel.org Reviewed-by: Laurent Pinchart Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-2-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 10 ++++++++-- drivers/gpu/drm/bridge/adv7511/adv7533.c | 2 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index eb5919b38263..a13b3d8ab6ac 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -1241,8 +1241,10 @@ static int adv7511_probe(struct i2c_client *i2c) return ret; ret = adv7511_init_regulators(adv7511); - if (ret) - return dev_err_probe(dev, ret, "failed to init regulators\n"); + if (ret) { + dev_err_probe(dev, ret, "failed to init regulators\n"); + goto err_of_node_put; + } /* * The power down GPIO is optional. 
If present, toggle it from active to @@ -1363,6 +1365,8 @@ err_i2c_unregister_edid: i2c_unregister_device(adv7511->i2c_edid); uninit_regulators: adv7511_uninit_regulators(adv7511); +err_of_node_put: + of_node_put(adv7511->host_node); return ret; } @@ -1371,6 +1375,8 @@ static void adv7511_remove(struct i2c_client *i2c) { struct adv7511 *adv7511 = i2c_get_clientdata(i2c); + of_node_put(adv7511->host_node); + adv7511_uninit_regulators(adv7511); drm_bridge_remove(&adv7511->bridge); diff --git a/drivers/gpu/drm/bridge/adv7511/adv7533.c b/drivers/gpu/drm/bridge/adv7511/adv7533.c index 4481489aaf5e..5f195e91b3e6 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7533.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7533.c @@ -181,8 +181,6 @@ int adv7533_parse_dt(struct device_node *np, struct adv7511 *adv) if (!adv->host_node) return -ENODEV; - of_node_put(adv->host_node); - adv->use_timing_gen = !of_property_read_bool(np, "adi,disable-timing-generator"); From ee8f9ed57a397605434caeef351bafa3ec4dfdd4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:30 +0000 Subject: [PATCH 194/337] dt-bindings: display: adi,adv7533: Drop single lane support As per [1] and [2], ADV7535/7533 supports only 2-, 3-, or 4-lane. Drop unsupported 1-lane from bindings. [1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7535.pdf [2] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7533.pdf Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Cc: stable@vger.kernel.org Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-3-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- .../devicetree/bindings/display/bridge/adi,adv7533.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml index df20a3c9c744..ec89115c74e4 100644 --- a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml +++ b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml @@ -90,7 +90,7 @@ properties: adi,dsi-lanes: description: Number of DSI data lanes connected to the DSI host. $ref: /schemas/types.yaml#/definitions/uint32 - enum: [ 1, 2, 3, 4 ] + enum: [ 2, 3, 4 ] "#sound-dai-cells": const: 0 From 79d67c499c3f886202a40c5cb27e747e4fa4d738 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:31 +0000 Subject: [PATCH 195/337] drm: adv7511: Drop dsi single lane support As per [1] and [2], ADV7535/7533 supports only 2-, 3-, or 4-lane. Drop unsupported 1-lane. 
[1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7535.pdf [2] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7533.pdf Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Reported-by: Hien Huynh Cc: stable@vger.kernel.org Reviewed-by: Laurent Pinchart Reviewed-by: Adam Ford Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-4-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7533.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7533.c b/drivers/gpu/drm/bridge/adv7511/adv7533.c index 5f195e91b3e6..122ad91e8a32 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7533.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7533.c @@ -172,7 +172,7 @@ int adv7533_parse_dt(struct device_node *np, struct adv7511 *adv) of_property_read_u32(np, "adi,dsi-lanes", &num_lanes); - if (num_lanes < 1 || num_lanes > 4) + if (num_lanes < 2 || num_lanes > 4) return -EINVAL; adv->num_dsi_lanes = num_lanes; From 262bfba8ab820641c8cfbbf03b86d6c00242c078 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:23 -0800 Subject: [PATCH 196/337] net: dsa: microchip: Fix KSZ9477 set_ageing_time function The aging count is not a simple 11-bit value but comprises a 3-bit multiplier and an 8-bit second count. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-2-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz9477.c | 47 +++++++++++++++++++------ drivers/net/dsa/microchip/ksz9477_reg.h | 4 +-- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index d16817e0476f..29fe79ea74cd 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 switch driver main logic * - * Copyright (C) 2017-2019 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #include @@ -983,26 +983,51 @@ void ksz9477_get_caps(struct ksz_device *dev, int port, int ksz9477_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { u32 secs = msecs / 1000; - u8 value; - u8 data; + u8 data, mult, value; + u32 max_val; int ret; - value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); +#define MAX_TIMER_VAL ((1 << 8) - 1) - ret = ksz_write8(dev, REG_SW_LUE_CTRL_3, value); - if (ret < 0) - return ret; + /* The aging timer comprises a 3-bit multiplier and an 8-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 255 = 1785 seconds. + */ + if (!secs) + secs = 1; - data = FIELD_GET(SW_AGE_PERIOD_10_8_M, secs); + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value); if (ret < 0) return ret; - value &= ~SW_AGE_CNT_M; - value |= FIELD_PREP(SW_AGE_CNT_M, data); + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. 
+ */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } - return ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value &= ~SW_AGE_CNT_M; + value |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + if (ret < 0) + return ret; + } + + value = DIV_ROUND_UP(secs, data); + return ksz_write8(dev, REG_SW_LUE_CTRL_3, value); } void ksz9477_port_queue_split(struct ksz_device *dev, int port) diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index 04235c22bf40..ff579920078e 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 register definitions * - * Copyright (C) 2017-2018 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #ifndef __KSZ9477_REGS_H @@ -165,8 +165,6 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) #define SW_AGE_CNT_M GENMASK(5, 3) -#define SW_AGE_CNT_S 3 -#define SW_AGE_PERIOD_10_8_M GENMASK(10, 8) #define SW_RESV_MCAST_ENABLE BIT(2) #define SW_HASH_OPTION_M 0x03 #define SW_HASH_OPTION_CRC 1 From bb9869043438af5b94230f94fb4c39206525d758 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:24 -0800 Subject: [PATCH 197/337] net: dsa: microchip: Fix LAN937X set_ageing_time function The aging count is not a simple 20-bit value but comprises a 3-bit multiplier and a 20-bit second time. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. As the 20-bit number is now too large for practical use there is an option to interpret it as microseconds instead of seconds. Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-3-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/lan937x_main.c | 62 ++++++++++++++++++++++-- drivers/net/dsa/microchip/lan937x_reg.h | 9 ++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c index b7652efd632e..b1ae3b9de3d1 100644 --- a/drivers/net/dsa/microchip/lan937x_main.c +++ b/drivers/net/dsa/microchip/lan937x_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Microchip LAN937X switch driver main logic - * Copyright (C) 2019-2022 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #include #include @@ -461,10 +461,66 @@ int lan937x_change_mtu(struct ksz_device *dev, int port, int new_mtu) int lan937x_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { - u32 secs = msecs / 1000; - u32 value; + u8 data, mult, value8; + bool in_msec = false; + u32 max_val, value; + u32 secs = msecs; int ret; +#define MAX_TIMER_VAL ((1 << 20) - 1) + + /* The aging timer comprises a 3-bit multiplier and a 20-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 1048575 = 7340025 seconds. As this value is too large for + * practical use it can be interpreted as microseconds, making the + * maximum timer 7340 seconds with finer control. This allows for + * maximum 122 minutes compared to 29 minutes in KSZ9477 switch. 
+ */ + if (msecs % 1000) + in_msec = true; + else + secs /= 1000; + if (!secs) + secs = 1; + + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; + + /* Configure how to interpret the number value. */ + ret = ksz_rmw8(dev, REG_SW_LUE_CTRL_2, SW_AGE_CNT_IN_MICROSEC, + in_msec ? SW_AGE_CNT_IN_MICROSEC : 0); + if (ret < 0) + return ret; + + ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value8); + if (ret < 0) + return ret; + + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value8); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. + */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } + + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value8 &= ~SW_AGE_CNT_M; + value8 |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value8); + if (ret < 0) + return ret; + } + + secs = DIV_ROUND_UP(secs, data); + value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); ret = ksz_write8(dev, REG_SW_AGE_PERIOD__1, value); diff --git a/drivers/net/dsa/microchip/lan937x_reg.h b/drivers/net/dsa/microchip/lan937x_reg.h index 4ec93e421da4..72042fd64e5b 100644 --- a/drivers/net/dsa/microchip/lan937x_reg.h +++ b/drivers/net/dsa/microchip/lan937x_reg.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Microchip LAN937X switch register definitions - * Copyright (C) 2019-2021 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #ifndef __LAN937X_REG_H #define __LAN937X_REG_H @@ -56,8 +56,7 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) -#define SW_AGE_CNT_M 0x7 -#define SW_AGE_CNT_S 3 +#define SW_AGE_CNT_M GENMASK(5, 3) #define SW_RESV_MCAST_ENABLE BIT(2) #define REG_SW_LUE_CTRL_1 0x0311 @@ -70,6 +69,10 @@ #define SW_FAST_AGING BIT(1) #define SW_LINK_AUTO_AGING BIT(0) +#define REG_SW_LUE_CTRL_2 0x0312 + +#define SW_AGE_CNT_IN_MICROSEC BIT(7) + #define REG_SW_AGE_PERIOD__1 0x0313 #define SW_AGE_PERIOD_7_0_M GENMASK(7, 0) From 1ae40d5231732275c620a1c58c83884a979b6eb1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 10:33:40 +0100 Subject: [PATCH 198/337] ALSA: compress_offload: import DMA_BUF namespace The compression offload code cannot be in a loadable module unless it imports that namespace: ERROR: modpost: module snd-compress uses symbol dma_buf_get from namespace DMA_BUF, but does not import it. ERROR: modpost: module snd-compress uses symbol dma_buf_put from namespace DMA_BUF, but does not import it. ERROR: modpost: module snd-compress uses symbol dma_buf_fd from namespace DMA_BUF, but does not import it. 
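For context on why the one-line import is needed: a symbol exported into a module namespace is only linkable from modules that explicitly declare that namespace. Sketching both sides with the string-literal form this tree uses (the export line shows how the dma-buf core places these symbols in the DMA_BUF namespace and is included here only for illustration):

	/* dma-buf core: export into the DMA_BUF namespace */
	EXPORT_SYMBOL_NS_GPL(dma_buf_fd, "DMA_BUF");

	/* snd-compress: declare the namespace once, anywhere in the module */
	MODULE_IMPORT_NS("DMA_BUF");

Without the import declaration, modpost flags every namespaced symbol the module references, which is exactly the error list above.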
Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Signed-off-by: Arnd Bergmann Acked-by: Shengjiu Wang Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241216093410.377112-1-arnd@kernel.org --- sound/core/compress_offload.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 86ed2fbee0c8..ec2485c00e49 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1247,6 +1247,7 @@ void snd_compr_task_finished(struct snd_compr_stream *stream, } EXPORT_SYMBOL_GPL(snd_compr_task_finished); +MODULE_IMPORT_NS("DMA_BUF"); #endif /* CONFIG_SND_COMPRESS_ACCEL */ static long snd_compr_ioctl(struct file *f, unsigned int cmd, unsigned long arg) From 6018f2fe1089b46c6c9eb136338eca7b16a92331 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 10:33:41 +0100 Subject: [PATCH 199/337] ALSA: compress_offload: avoid 64-bit get_user() On some architectures, get_user() cannot read a 64-bit user variable: arm-linux-gnueabi-ld: sound/core/compress_offload.o: in function `snd_compr_ioctl': compress_offload.c:(.text.snd_compr_ioctl+0x538): undefined reference to `__get_user_bad' Use an equivalent copy_from_user() instead. Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Signed-off-by: Arnd Bergmann Acked-by: Shengjiu Wang Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241216093410.377112-2-arnd@kernel.org --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index ec2485c00e49..1d6769a66810 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1180,9 +1180,9 @@ static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg if (stream->runtime->state != SNDRV_PCM_STATE_SETUP) return -EPERM; - retval = get_user(seqno, (__u64 __user *)arg); - if (retval < 0) - return retval; + retval = copy_from_user(&seqno, (__u64 __user *)arg, sizeof(seqno)); + if (retval) + return -EFAULT; retval = 0; if (seqno == 0) { list_for_each_entry_reverse(task, &stream->runtime->tasks, list) From f25a51b47c61540585a9e8a4e16f91677ebcbbc4 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Tue, 17 Dec 2024 11:07:07 +0100 Subject: [PATCH 200/337] ALSA: compress_offload: use safe list iteration in snd_compr_task_seq() The sequence function can call snd_compr_task_free_one(). Use list_for_each_entry_safe_reverse() to make sure that the used pointers are safe. 
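The general rule being applied: the non-_safe iterators read the link pointer from the current entry when advancing, so a loop body that can free the current entry must use the _safe variant, which caches that pointer before the body runs. A generic sketch of the pattern (struct item and release() are placeholders, not compress-offload types):

	struct item {
		struct list_head list;
		/* ... payload ... */
	};

	static void drain(struct list_head *head)
	{
		struct item *it, *tmp;

		/* 'tmp' holds the next entry to visit before release() runs,
		 * so freeing 'it' inside the body cannot break the walk.
		 */
		list_for_each_entry_safe_reverse(it, tmp, head, list) {
			list_del(&it->list);
			release(it);	/* may kfree(it) */
		}
	}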
Link: https://lore.kernel.org/linux-sound/f2769cff-6c7a-4092-a2d1-c33a5411a182@stanley.mountain/ Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: Dan Carpenter Cc: Vinod Koul Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241217100707.732766-1-perex@perex.cz --- sound/core/compress_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 1d6769a66810..bdb6e307e453 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1174,7 +1174,7 @@ typedef void (*snd_compr_seq_func_t)(struct snd_compr_stream *stream, static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg, snd_compr_seq_func_t fcn) { - struct snd_compr_task_runtime *task; + struct snd_compr_task_runtime *task, *temp; __u64 seqno; int retval; @@ -1185,7 +1185,7 @@ static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg return -EFAULT; retval = 0; if (seqno == 0) { - list_for_each_entry_reverse(task, &stream->runtime->tasks, list) + list_for_each_entry_safe_reverse(task, temp, &stream->runtime->tasks, list) fcn(stream, task); } else { task = snd_compr_find_task(stream, seqno); From 3d3f43fab4cfb9cf245e3dbffa1736ce925bb54a Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Tue, 17 Dec 2024 11:07:26 +0100 Subject: [PATCH 201/337] ALSA: compress_offload: improve file descriptors installation for dma-buf Avoid to use single dma_buf_fd() call for both directions. This code ensures that both file descriptors are allocated before fd_install(). Link: https://lore.kernel.org/linux-sound/6a923647-4495-4cff-a253-b73f48cfd0ea@stanley.mountain/ Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: Dan Carpenter Cc: Vinod Koul Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241217100726.732863-1-perex@perex.cz --- sound/core/compress_offload.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index bdb6e307e453..edf5aadf38e5 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1025,7 +1025,7 @@ static u64 snd_compr_seqno_next(struct snd_compr_stream *stream) static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_task *utask) { struct snd_compr_task_runtime *task; - int retval; + int retval, fd_i, fd_o; if (stream->runtime->total_tasks >= stream->runtime->fragments) return -EBUSY; @@ -1039,16 +1039,24 @@ static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_ retval = stream->ops->task_create(stream, task); if (retval < 0) goto cleanup; - utask->input_fd = dma_buf_fd(task->input, O_WRONLY|O_CLOEXEC); - if (utask->input_fd < 0) { - retval = utask->input_fd; + /* similar functionality as in dma_buf_fd(), but ensure that both + file descriptors are allocated before fd_install() */ + if (!task->input || !task->input->file || !task->output || !task->output->file) { + retval = -EINVAL; goto cleanup; } - utask->output_fd = dma_buf_fd(task->output, O_RDONLY|O_CLOEXEC); - if (utask->output_fd < 0) { - retval = utask->output_fd; + fd_i = get_unused_fd_flags(O_WRONLY|O_CLOEXEC); + if (fd_i < 0) + goto cleanup; + fd_o = get_unused_fd_flags(O_RDONLY|O_CLOEXEC); + if (fd_o < 0) { + put_unused_fd(fd_i); goto cleanup; } + fd_install(fd_i, 
task->input->file); + fd_install(fd_o, task->output->file); + utask->input_fd = fd_i; + utask->output_fd = fd_o; /* keep dmabuf reference until freed with task free ioctl */ dma_buf_get(utask->input_fd); dma_buf_get(utask->output_fd); From fa0308134d26dbbeb209a1581eea46df663866b6 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 19 Dec 2024 23:33:45 +0300 Subject: [PATCH 202/337] ALSA: memalloc: prefer dma_mapping_error() over explicit address checking With CONFIG_DMA_API_DEBUG enabled, the following warning is observed: DMA-API: snd_hda_intel 0000:03:00.1: device driver failed to check map error[device address=0x00000000ffff0000] [size=20480 bytes] [mapped as single] WARNING: CPU: 28 PID: 2255 at kernel/dma/debug.c:1036 check_unmap+0x1408/0x2430 CPU: 28 UID: 42 PID: 2255 Comm: wireplumber Tainted: G W L 6.12.0-10-133577cad6bf48e5a7848c4338124081393bfe8a+ #759 debug_dma_unmap_page+0xe9/0xf0 snd_dma_wc_free+0x85/0x130 [snd_pcm] snd_pcm_lib_free_pages+0x1e3/0x440 [snd_pcm] snd_pcm_common_ioctl+0x1c9a/0x2960 [snd_pcm] snd_pcm_ioctl+0x6a/0xc0 [snd_pcm] ... Check for returned DMA addresses using specialized dma_mapping_error() helper which is generally recommended for this purpose by Documentation/core-api/dma-api.rst. Fixes: c880a5146642 ("ALSA: memalloc: Use proper DMA mapping API for x86 WC buffer allocations") Reported-by: Mikhail Gavrilov Closes: https://lore.kernel.org/r/CABXGCsNB3RsMGvCucOy3byTEOxoc-Ys+zB_HQ=Opb_GhX1ioDA@mail.gmail.com/ Tested-by: Mikhail Gavrilov Signed-off-by: Fedor Pchelkin Link: https://patch.msgid.link/20241219203345.195898-1-pchelkin@ispras.ru Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 13b71069ae18..b3853583d2ae 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -505,7 +505,7 @@ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) if (!p) return NULL; dmab->addr = dma_map_single(dmab->dev.dev, p, size, DMA_BIDIRECTIONAL); - if (dmab->addr == DMA_MAPPING_ERROR) { + if (dma_mapping_error(dmab->dev.dev, dmab->addr)) { do_free_pages(dmab->area, size, true); return NULL; } From 55853cb829dc707427c3519f6b8686682a204368 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 18 Dec 2024 10:59:31 +0800 Subject: [PATCH 203/337] selftests/alsa: Fix circular dependency involving global-timer The pattern rule `$(OUTPUT)/%: %.c` inadvertently included a circular dependency on the global-timer target due to its inclusion in $(TEST_GEN_PROGS_EXTENDED). This resulted in a circular dependency warning during the build process. To resolve this, the dependency on $(TEST_GEN_PROGS_EXTENDED) has been replaced with an explicit dependency on $(OUTPUT)/libatest.so. This change ensures that libatest.so is built before any other targets that require it, without creating a circular dependency. This fix addresses the following warning: make[4]: Entering directory 'tools/testing/selftests/alsa' make[4]: Circular default_modconfig/kselftest/alsa/global-timer <- default_modconfig/kselftest/alsa/global-timer dependency dropped. make[4]: Nothing to be done for 'all'. 
make[4]: Leaving directory 'tools/testing/selftests/alsa' Cc: Mark Brown Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Shuah Khan Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241218025931.914164-1-lizhijian@fujitsu.com Signed-off-by: Takashi Iwai --- tools/testing/selftests/alsa/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index 944279160fed..8dab90ad22bb 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -27,5 +27,5 @@ include ../lib.mk $(OUTPUT)/libatest.so: conf.c alsa-local.h $(CC) $(CFLAGS) -shared -fPIC $< $(LDLIBS) -o $@ -$(OUTPUT)/%: %.c $(TEST_GEN_PROGS_EXTENDED) alsa-local.h +$(OUTPUT)/%: %.c $(OUTPUT)/libatest.so alsa-local.h $(CC) $(CFLAGS) $< $(LDLIBS) -latest -o $@ From 66a0a2b0473c39ae85c44628d14e4366fdc0aa0d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Dec 2024 12:44:16 +0100 Subject: [PATCH 204/337] ALSA: sh: Fix wrong argument order for copy_from_iter() Fix a brown paper bag bug I introduced at converting to the standard iter helper; the arguments were wrongly passed and have to be swapped. Fixes: 9b5f8ee43e48 ("ALSA: sh: Use standard helper for buffer accesses") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412140019.jat5Dofr-lkp@intel.com/ Link: https://patch.msgid.link/20241220114417.5898-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/sh/sh_dac_audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/sh/sh_dac_audio.c b/sound/sh/sh_dac_audio.c index a4d07438ad64..3f5422145c5e 100644 --- a/sound/sh/sh_dac_audio.c +++ b/sound/sh/sh_dac_audio.c @@ -163,7 +163,7 @@ static int snd_sh_dac_pcm_copy(struct snd_pcm_substream *substream, /* channel is not used (interleaved data) */ struct snd_sh_dac *chip = snd_pcm_substream_chip(substream); - if (copy_from_iter(chip->data_buffer + pos, src, count) != count) + if (copy_from_iter(chip->data_buffer + pos, count, src) != count) return -EFAULT; chip->buffer_end = chip->data_buffer + pos + count; From 6321f5fb70d502d95de8a212a7b484c297ec9644 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:11 -0800 Subject: [PATCH 205/337] gve: clean XDP queues in gve_tx_stop_ring_gqi When stopping XDP TX rings, the XDP clean function needs to be called to clean out the entire queue, similar to what happens in the normal TX queue case. Otherwise, the FIFO won't be cleared correctly, and xsk_tx_completed won't be reported. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index e7fb7d6d283d..83ad278ec91f 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -206,7 +206,10 @@ void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx) return; gve_remove_napi(priv, ntfy_idx); - gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + if (tx->q_num < priv->tx_cfg.num_queues) + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + else + gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt); netdev_tx_reset_queue(tx->netdev_txq); gve_tx_remove_from_block(priv, idx); } From ff7c2dea9dd1a436fc79d6273adffdcc4a7ffea3 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:12 -0800 Subject: [PATCH 206/337] gve: guard XDP xmit NDO on existence of xdp queues In GVE, dedicated XDP queues only exist when an XDP program is installed and the interface is up. As such, the NDO XDP XMIT callback should return early if either of these conditions are false. In the case of no loaded XDP program, priv->num_xdp_queues=0 which can cause a divide-by-zero error, and in the case of interface down, num_xdp_queues remains untouched to persist XDP queue count for the next interface up, but the TX pointer itself would be NULL. The XDP xmit callback also needs to synchronize with a device transitioning from open to close. This synchronization will happen via the GVE_PRIV_FLAGS_NAPI_ENABLED bit along with a synchronize_net() call, which waits for any RCU critical sections at call-time to complete. Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 3 +++ drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e171ca248f9a..5d7b0cc59959 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1899,6 +1899,9 @@ static void gve_turndown(struct gve_priv *priv) gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); + + /* Make sure that all traffic is finished processing. 
*/ + synchronize_net(); } static void gve_turnup(struct gve_priv *priv) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 83ad278ec91f..852f8c7e39d2 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -837,9 +837,12 @@ int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, struct gve_tx_ring *tx; int i, err = 0, qid; - if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK) || !priv->xdp_prog) return -EINVAL; + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + qid = gve_xdp_tx_queue_id(priv, smp_processor_id() % priv->num_xdp_queues); From 40338d7987d810fcaa95c500b1068a52b08eec9b Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:13 -0800 Subject: [PATCH 207/337] gve: guard XSK operations on the existence of queues This patch predicates the enabling and disabling of XSK pools on the existence of queues. As it stands, if the interface is down, disabling or enabling XSK pools would result in a crash, as the RX queue pointer would be NULL. XSK pool registration will occur as part of the next interface up. Similarly, xsk_wakeup needs be guarded against queues disappearing while the function is executing, so a check against the GVE_PRIV_FLAGS_NAPI_ENABLED flag is added to synchronize with the disabling of the bit and the synchronize_net() in gve_turndown. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Reviewed-by: Larysa Zaremba Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5d7b0cc59959..e4e8ff4f9f80 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1623,8 +1623,8 @@ static int gve_xsk_pool_enable(struct net_device *dev, if (err) return err; - /* If XDP prog is not installed, return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, return. */ + if (!priv->xdp_prog || !netif_running(dev)) return 0; rx = &priv->rx[qid]; @@ -1669,21 +1669,16 @@ static int gve_xsk_pool_disable(struct net_device *dev, if (qid >= priv->rx_cfg.num_queues) return -EINVAL; - /* If XDP prog is not installed, unmap DMA and return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, unmap DMA and + * return. 
+ */ + if (!priv->xdp_prog || !netif_running(dev)) goto done; - tx_qid = gve_xdp_tx_queue_id(priv, qid); - if (!netif_running(dev)) { - priv->rx[qid].xsk_pool = NULL; - xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq); - priv->tx[tx_qid].xsk_pool = NULL; - goto done; - } - napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi; napi_disable(napi_rx); /* make sure current rx poll is done */ + tx_qid = gve_xdp_tx_queue_id(priv, qid); napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi; napi_disable(napi_tx); /* make sure current tx poll is done */ @@ -1711,6 +1706,9 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) struct gve_priv *priv = netdev_priv(dev); int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; From ba0925c34e0fa6fe02d3d642bc02ab099ab312c7 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:14 -0800 Subject: [PATCH 208/337] gve: process XSK TX descriptors as part of RX NAPI When busy polling is enabled, xsk_sendmsg for AF_XDP zero copy marks the NAPI ID corresponding to the memory pool allocated for the socket. In GVE, this NAPI ID will never correspond to a NAPI ID of one of the dedicated XDP TX queues registered with the umem because XDP TX is not set up to share a NAPI with a corresponding RX queue. This patch moves XSK TX descriptor processing from the TX NAPI to the RX NAPI, and the gve_xsk_wakeup callback is updated to use the RX NAPI instead of the TX NAPI, accordingly. The branch on if the wakeup is for TX is removed, as the NAPI poll should be invoked whether the wakeup is for TX or for RX. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Praveen Kaligineedi Signed-off-by: Joshua Washington Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve.h | 1 + drivers/net/ethernet/google/gve/gve_main.c | 8 +++++ drivers/net/ethernet/google/gve/gve_tx.c | 36 +++++++++++++--------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index dd92949bb214..8167cc5fb0df 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1140,6 +1140,7 @@ int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx, void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid); bool gve_tx_poll(struct gve_notify_block *block, int budget); bool gve_xdp_poll(struct gve_notify_block *block, int budget); +int gve_xsk_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings_gqi(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg); void gve_tx_free_rings_gqi(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e4e8ff4f9f80..5cab7b88610f 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -333,6 +333,14 @@ int gve_napi_poll(struct napi_struct *napi, int budget) if (block->rx) { work_done = gve_rx_poll(block, budget); + + /* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of + * TX and RX work done. 
+ */ + if (priv->xdp_prog) + work_done = max_t(int, work_done, + gve_xsk_tx_poll(block, budget)); + reschedule |= work_done == budget; } diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 852f8c7e39d2..4350ebd9c2bd 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -981,33 +981,41 @@ out: return sent; } +int gve_xsk_tx_poll(struct gve_notify_block *rx_block, int budget) +{ + struct gve_rx_ring *rx = rx_block->rx; + struct gve_priv *priv = rx->gve; + struct gve_tx_ring *tx; + int sent = 0; + + tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; + if (tx->xsk_pool) { + sent = gve_xsk_tx(priv, tx, budget); + + u64_stats_update_begin(&tx->statss); + tx->xdp_xsk_sent += sent; + u64_stats_update_end(&tx->statss); + if (xsk_uses_need_wakeup(tx->xsk_pool)) + xsk_set_tx_need_wakeup(tx->xsk_pool); + } + + return sent; +} + bool gve_xdp_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; u32 nic_done; - bool repoll; u32 to_do; /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); gve_clean_xdp_done(priv, tx, to_do); - repoll = nic_done != tx->done; - - if (tx->xsk_pool) { - int sent = gve_xsk_tx(priv, tx, budget); - - u64_stats_update_begin(&tx->statss); - tx->xdp_xsk_sent += sent; - u64_stats_update_end(&tx->statss); - repoll |= (sent == budget); - if (xsk_uses_need_wakeup(tx->xsk_pool)) - xsk_set_tx_need_wakeup(tx->xsk_pool); - } /* If we still have work we want to repoll */ - return repoll; + return nic_done != tx->done; } bool gve_tx_poll(struct gve_notify_block *block, int budget) From de63ac44a527b2c5067551dbd70d939fe151325a Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:15 -0800 Subject: [PATCH 209/337] gve: fix XDP allocation path in edge cases This patch fixes a number of consistency issues in the queue allocation path related to XDP. As it stands, the number of allocated XDP queues changes in three different scenarios. 1) Adding an XDP program while the interface is up via gve_add_xdp_queues 2) Removing an XDP program while the interface is up via gve_remove_xdp_queues 3) After queues have been allocated and the old queue memory has been removed in gve_queues_start. However, the requirement for the interface to be up for gve_(add|remove)_xdp_queues to be called, in conjunction with the fact that the number of queues stored in priv isn't updated until _after_ XDP queues have been allocated in the normal queue allocation path means that if an XDP program is added while the interface is down, XDP queues won't be added until the _second_ if_up, not the first. Given the expectation that the number of XDP queues is equal to the number of RX queues, scenario (3) has another problematic implication. When changing the number of queues while an XDP program is loaded, the number of XDP queues must be updated as well, as there is logic in the driver (gve_xdp_tx_queue_id()) which relies on every RX queue having a corresponding XDP TX queue. However, the number of XDP queues stored in priv would not be updated until _after_ a close/open leading to a mismatch in the number of XDP queues reported vs the number of XDP queues which actually exist after the queue count update completes. 
This patch remedies these issues by doing the following: 1) The allocation config getter function is set up to retrieve the _expected_ number of XDP queues to allocate instead of relying on the value stored in `priv` which is only updated once the queues have been allocated. 2) When adjusting queues, XDP queues are adjusted to match the number of RX queues when XDP is enabled. This only works in the case when queues are live, so part (1) of the fix must still be available in the case that queues are adjusted when there is an XDP program and the interface is down. Fixes: 5f08cd3d6423 ("gve: Alloc before freeing when adjusting queues") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5cab7b88610f..09fb7f16f73e 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -930,11 +930,13 @@ static void gve_init_sync_stats(struct gve_priv *priv) static void gve_tx_get_curr_alloc_cfg(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg) { + int num_xdp_queues = priv->xdp_prog ? priv->rx_cfg.num_queues : 0; + cfg->qcfg = &priv->tx_cfg; cfg->raw_addressing = !gve_is_qpl(priv); cfg->ring_size = priv->tx_desc_cnt; cfg->start_idx = 0; - cfg->num_rings = gve_num_tx_queues(priv); + cfg->num_rings = priv->tx_cfg.num_queues + num_xdp_queues; cfg->tx = priv->tx; } @@ -1843,6 +1845,7 @@ int gve_adjust_queues(struct gve_priv *priv, { struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0}; struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0}; + int num_xdp_queues; int err; gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg); @@ -1853,6 +1856,10 @@ int gve_adjust_queues(struct gve_priv *priv, rx_alloc_cfg.qcfg = &new_rx_config; tx_alloc_cfg.num_rings = new_tx_config.num_queues; + /* Add dedicated XDP TX queues if enabled. */ + num_xdp_queues = priv->xdp_prog ? new_rx_config.num_queues : 0; + tx_alloc_cfg.num_rings += num_xdp_queues; + if (netif_running(priv->dev)) { err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); return err; From 926e862058978a8f81872845715d67ad21c30f65 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 14 Dec 2024 02:12:06 +0000 Subject: [PATCH 210/337] arm64/signal: Silence sparse warning storing GCSPR_EL0 We are seeing a sparse warning in gcs_restore_signal(): arch/arm64/kernel/signal.c:1054:9: sparse: sparse: cast removes address space '__user' of expression when storing the final GCSPR_EL0 value back into the register, caused by the fact that write_sysreg_s() casts the value it writes to a u64 which sparse sees as discarding the __userness of the pointer. Avoid this by treating the address as an integer, casting to a pointer only when using it to write to userspace. While we're at it also inline gcs_signal_cap_valid() into it's one user and make equivalent updates to gcs_signal_entry(). 
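One detail the message leaves implicit: the byte offsets in the diff below change together with the type. With the old unsigned long __user * variable, arithmetic was scaled by the pointee size, so "+ 1" and "- 2" meant one and two 8-byte GCS entries; once the value is held as a plain u64 the same steps have to be written out in bytes, hence "+ 8" and "- 16". A small userspace sketch of that C arithmetic rule (variable names are illustrative only, assuming a 64-bit build):

    #include <stdio.h>

    int main(void)
    {
        unsigned long stack[4];
        unsigned long *p = stack;                  /* pointer: steps scale by sizeof(*p), 8 bytes here */
        unsigned long addr = (unsigned long)stack; /* integer: steps are plain bytes */

        printf("p + 1    -> %p\n", (void *)(p + 1));
        printf("addr + 8 -> %#lx\n", addr + 8);    /* same address as p + 1 */
        return 0;
    }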
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412082005.OBJ0BbWs-lkp@intel.com/ Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241214-arm64-gcs-signal-sparse-v3-1-5e8d18fffc0c@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 37e24f1bd227..99ea26d400ff 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -36,15 +36,8 @@ #include #include -#ifdef CONFIG_ARM64_GCS #define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) -static bool gcs_signal_cap_valid(u64 addr, u64 val) -{ - return val == GCS_SIGNAL_CAP(addr); -} -#endif - /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -1062,8 +1055,7 @@ static int restore_sigframe(struct pt_regs *regs, #ifdef CONFIG_ARM64_GCS static int gcs_restore_signal(void) { - unsigned long __user *gcspr_el0; - u64 cap; + u64 gcspr_el0, cap; int ret; if (!system_supports_gcs()) @@ -1072,7 +1064,7 @@ static int gcs_restore_signal(void) if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) return 0; - gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Ensure that any changes to the GCS done via GCS operations @@ -1087,22 +1079,23 @@ static int gcs_restore_signal(void) * then faults will be generated on GCS operations - the main * concern is to protect GCS pages. */ - ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); if (ret) return -EFAULT; /* * Check that the cap is the actual GCS before replacing it. */ - if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) return -EINVAL; /* Invalidate the token to prevent reuse */ - put_user_gcs(0, (__user void*)gcspr_el0, &ret); + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); if (ret != 0) return -EFAULT; - write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); return 0; } @@ -1421,7 +1414,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { - unsigned long __user *gcspr_el0; + u64 gcspr_el0; int ret = 0; if (!system_supports_gcs()) @@ -1434,18 +1427,20 @@ static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) * We are entering a signal handler, current register state is * active. */ - gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Push a cap and the GCS entry for the trampoline onto the GCS. 
*/ - put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); - put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); if (ret != 0) return ret; - gcspr_el0 -= 2; - write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0); + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); return 0; } From 7917f01a286ce01e9c085e24468421f596ee1a0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Dec 2024 15:28:18 +1100 Subject: [PATCH 211/337] nfsd: restore callback functionality for NFSv4.0 A recent patch inadvertently broke callbacks for NFSv4.0. In the 4.0 case we do not expect a session to be found but still need to call setup_callback_client() which will not try to dereference it. This patch moves the check for failure to find a session into the 4.1+ branch of setup_callback_client() Fixes: 1e02c641c3a4 ("NFSD: Prevent NULL dereference in nfsd4_process_cb_update()") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3877b53e429f..c083e539e898 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1100,7 +1100,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) + if (!conn->cb_xprt || !ses) return -EINVAL; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -1522,8 +1522,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) ses = c->cn_session; } spin_unlock(&clp->cl_lock); - if (!c) - return; err = setup_callback_client(clp, &conn, ses); if (err) { From aa5d2ca7c179c40669edb5e96d931bf9828dea3d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 08:02:52 -0800 Subject: [PATCH 212/337] perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC The released OCR and FRONTEND events utilized more bits on Lunar Lake p-core. The corresponding mask in the extra_regs has to be extended to unblock the extra bits. Add a dedicated intel_lnc_extra_regs. 
Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20241216160252.430858-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2e1e26846050..99c590da0ae2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -429,6 +429,16 @@ static struct event_constraint intel_lnc_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_lnc_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0xfffffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0xfffffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6422,7 +6432,7 @@ static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) intel_pmu_init_glc(pmu); hybrid(pmu, event_constraints) = intel_lnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; - hybrid(pmu, extra_regs) = intel_rwc_extra_regs; + hybrid(pmu, extra_regs) = intel_lnc_extra_regs; } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) From 54f89b3178d5448dd4457afbb98fc1ab99090a65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 10 Dec 2024 01:20:38 +0000 Subject: [PATCH 213/337] tcp_bpf: Charge receive socket buffer in bpf_tcp_ingress() When bpf_tcp_ingress() is called, the skmsg is being redirected to the ingress of the destination socket. Therefore, we should charge its receive socket buffer, instead of sending socket buffer. Because sk_rmem_schedule() tests pfmemalloc of skb, we need to introduce a wrapper and call it for skmsg. 
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-2-zijianzhang@bytedance.com --- include/net/sock.h | 10 ++++++++-- net/ipv4/tcp_bpf.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..c383126f691d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1527,7 +1527,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1535,7 +1535,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool +sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 99cef92e6290..b21ea634909c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -49,7 +49,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { + if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: [PATCH 214/337] tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. 
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- net/core/skmsg.c | 6 +++++- net/ipv4/tcp_bpf.c | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2..8ad7e6755fd6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b21ea634909c..392678ae80f4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -56,6 +56,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); From 724c6ce38bbaeb4b3f109b0e066d6c0ecd15446c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 19 Dec 2024 14:57:34 +0100 Subject: [PATCH 215/337] stddef: make __struct_group() UAPI C++-friendly For the most part of the C++ history, it couldn't have type declarations inside anonymous unions for different reasons. At the same time, __struct_group() relies on the latters, so when the @TAG argument is not empty, C++ code doesn't want to build (even under `extern "C"`): ../linux/include/uapi/linux/pkt_cls.h:25:24: error: 'struct tc_u32_sel::::tc_u32_sel_hdr,' invalid; an anonymous union may only have public non-static data members [-fpermissive] The safest way to fix this without trying to switch standards (which is impossible in UAPI anyway) etc., is to disable tag declaration for that language. This won't break anything since for now it's not buildable at all. 
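For readers who have not seen the macro, its expansion has roughly the following shape (field and tag names are made up for illustration, not the actual tc_u32_sel layout); it is the tagged struct declaration inside the anonymous union that C++ refuses to accept, even under extern "C":

    struct example {
        union {
            /* anonymous copy: members are addressed directly, e.g. x.a
             * for some struct example x
             */
            struct { int a; int b; };
            /* named copy: usable as one group, e.g. x.hdr; declaring
             * "struct example_hdr" in here is what g++ rejects
             */
            struct example_hdr { int a; int b; } hdr;
        };
    };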
Use a separate definition for __struct_group() when __cplusplus is defined to mitigate the error, including the version from tools/. Fixes: 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro") Reported-by: Christopher Ferris Closes: https://lore.kernel.org/linux-hardening/Z1HZpe3WE5As8UAz@google.com Suggested-by: Kees Cook # __struct_group_tag() Signed-off-by: Alexander Lobakin Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20241219135734.2130002-1-aleksander.lobakin@intel.com Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 13 ++++++++++--- tools/include/uapi/linux/stddef.h | 15 +++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 58154117d9b0..a6fce46aeb37 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline inline #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,13 +27,13 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ } ATTRS #ifdef __cplusplus diff --git a/tools/include/uapi/linux/stddef.h b/tools/include/uapi/linux/stddef.h index bb6ea517efb5..c53cde425406 100644 --- a/tools/include/uapi/linux/stddef.h +++ b/tools/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline __inline__ #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,14 +27,14 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ - } + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ + } ATTRS /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union From 246068b86b1c36e4590388ab8f278e21f1997dc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Dec 2024 17:54:10 +0200 Subject: [PATCH 216/337] selftests: net: local_termination: require mausezahn Since the blamed commit, we require mausezahn because send_raw() uses it. Remove the "REQUIRE_MZ=no" line, which overwrites the default of requiring it. 
Fixes: 237979504264 ("selftests: net: local_termination: add PTP frames to the mix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241219155410.1856868-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c35548767756..ecd34f364125 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -7,7 +7,6 @@ ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes -REQUIRE_MZ=no source lib.sh From 4a25201aa46ce88e8e31f9ccdec0e4e3dd6bb736 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:32 -0800 Subject: [PATCH 217/337] netdev-genl: avoid empty messages in napi get Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down we "hide" the NAPI instances. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219032833.1165433-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2d3ae0cd3ad2..b0772d135efb 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -246,8 +246,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); rtnl_unlock(); - if (err) + if (err) { goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; + goto err_free_msg; + } return genlmsg_reply(rsp, info); From 30b981796b94b083da8fdded7cb74cb493608760 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:33 -0800 Subject: [PATCH 218/337] selftests: drv-net: test empty queue and NAPI responses in netlink Make sure kernel doesn't respond to GETs for queues and NAPIs when link is down. Not with valid data, or with empty message, we want a ENOENT. 
Link: https://patch.msgid.link/20241219032833.1165433-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 9c5473abbd78..38303da957ee 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import ksft_disruptive, ksft_exit, ksft_run +from lib.py import ksft_eq, ksft_raises, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import NetDrvEnv -from lib.py import cmd +from lib.py import cmd, defer, ip +import errno import glob @@ -59,9 +61,27 @@ def addremove_queues(cfg, nl) -> None: ksft_eq(queues, expected) +@ksft_disruptive +def check_down(cfg, nl) -> None: + # Check the NAPI IDs before interface goes down and hides them + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + with ksft_raises(NlError) as cm: + nl.queue_get({'ifindex': cfg.ifindex, 'id': 0, 'type': 'rx'}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + if napis: + with ksft_raises(NlError) as cm: + nl.napi_get({'id': napis[0]['id']}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: - ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_run([get_queues, addremove_queues, check_down], args=(cfg, NetdevFamily())) ksft_exit() From 8600058ba28a7b07660ddcd150372d72fb3bc895 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 20 Dec 2024 15:06:47 -0600 Subject: [PATCH 219/337] of: Add coreboot firmware to excluded default cells list Google Juniper and other Chromebook platforms have a very old bootloader which populates /firmware node without proper address/size-cells leading to warnings: Missing '#address-cells' in /firmware WARNING: CPU: 0 PID: 1 at drivers/of/base.c:106 of_bus_n_addr_cells+0x90/0xf0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0 #1 933ab9971ff4d5dc58cb378a96f64c7f72e3454d Hardware name: Google juniper sku16 board (DT) ... Missing '#size-cells' in /firmware WARNING: CPU: 0 PID: 1 at drivers/of/base.c:133 of_bus_n_size_cells+0x90/0xf0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.12.0 #1 933ab9971ff4d5dc58cb378a96f64c7f72e3454d Tainted: [W]=WARN Hardware name: Google juniper sku16 board (DT) These platform won't receive updated bootloader/firmware, so add an exclusion for platforms with a "coreboot" compatible node. While this is wider than necessary, that's the easiest fix and it doesn't doesn't matter if we miss checking other platforms using coreboot. We may revisit this later and address with a fixup to the DT itself. 
Reported-by: Sasha Levin Closes: https://lore.kernel.org/all/Z0NUdoG17EwuCigT@sashalap/ Cc: AngeloGioacchino Del Regno Cc: Matthias Brugger Cc: Chen-Yu Tsai Cc: Krzysztof Kozlowski Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 44b1c8bf9cc0..e6ef31c4940f 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -88,7 +88,8 @@ static bool __of_node_is_type(const struct device_node *np, const char *type) } #define EXCLUDED_DEFAULT_CELLS_PLATFORMS ( \ - IS_ENABLED(CONFIG_SPARC) \ + IS_ENABLED(CONFIG_SPARC) || \ + of_find_compatible_node(NULL, NULL, "coreboot") \ ) int of_bus_n_addr_cells(struct device_node *np) From fdf478d236dcf0f1f68534df5d456ced625195bd Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:22 +0800 Subject: [PATCH 220/337] skmsg: Return copied bytes in sk_msg_memcopy_from_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously sk_msg_memcopy_from_iter returns the copied bytes from the last copy_from_iter{,_nocache} call upon success. This commit changes it to return the total number of copied bytes on success. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-1-bae583d014f3@outlook.com --- net/core/skmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8ad7e6755fd6..61f3f3d4e528 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); From 5153a75ef34b3f7478ca918044d0f05eed8fb3f9 Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:23 +0800 Subject: [PATCH 221/337] tcp_bpf: Fix copied value in tcp_bpf_sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf kselftest sockhash::test_txmsg_cork_hangs in test_sockmap.c triggers a kernel NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000008 ? __die_body+0x6e/0xb0 ? __die+0x8b/0xa0 ? page_fault_oops+0x358/0x3c0 ? local_clock+0x19/0x30 ? lock_release+0x11b/0x440 ? kernelmode_fixup_or_oops+0x54/0x60 ? __bad_area_nosemaphore+0x4f/0x210 ? mmap_read_unlock+0x13/0x30 ? bad_area_nosemaphore+0x16/0x20 ? do_user_addr_fault+0x6fd/0x740 ? prb_read_valid+0x1d/0x30 ? exc_page_fault+0x55/0xd0 ? asm_exc_page_fault+0x2b/0x30 ? splice_to_socket+0x52e/0x630 ? shmem_file_splice_read+0x2b1/0x310 direct_splice_actor+0x47/0x70 splice_direct_to_actor+0x133/0x300 ? do_splice_direct+0x90/0x90 do_splice_direct+0x64/0x90 ? 
__ia32_sys_tee+0x30/0x30 do_sendfile+0x214/0x300 __se_sys_sendfile64+0x8e/0xb0 __x64_sys_sendfile64+0x25/0x30 x64_sys_call+0xb82/0x2840 do_syscall_64+0x75/0x110 entry_SYSCALL_64_after_hwframe+0x4b/0x53 This is caused by tcp_bpf_sendmsg() returning a larger value(12289) than size (8192), which causes the while loop in splice_to_socket() to release an uninitialized pipe buf. The underlying cause is that this code assumes sk_msg_memcopy_from_iter() will copy all bytes upon success but it actually might only copy part of it. This commit changes it to use the real copied bytes. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-2-bae583d014f3@outlook.com --- net/ipv4/tcp_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 392678ae80f4..47f65b1b70ca 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -495,7 +495,7 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int copied = 0, err = 0; + int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; @@ -538,14 +538,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = msg_tx->sg.size - osize; } - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); - if (err < 0) { + if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } - copied += copy; + copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; From 9ecc4d858b92c1bb0673ad9c327298e600c55659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:54 -0800 Subject: [PATCH 222/337] bpf: Check negative offsets in __bpf_skb_min_len() skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets, this leads to a very large number which then causes bpf_skb_change_tail() failed unexpectedly. Fix this by using a signed int to get these offsets and ensure the minimum is at least zero. 
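The underlying hazard is ordinary integer conversion rather than anything networking specific; a minimal userspace illustration (the -14 is an arbitrary example value):

    #include <stdio.h>

    int main(void)
    {
        int offset = -14;               /* what a negative skb offset looks like */
        unsigned int min_len = offset;  /* implicit conversion: 4294967282 */

        printf("min_len = %u\n", min_len);
        return 0;
    }

Any requested length compared against such a bound looks too small, matching the unexpected bpf_skb_change_tail() failures described above.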
Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24..834614071727 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3734,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } From 9ee0c7b8654346d60c823babe4b3747357a30477 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:55 -0800 Subject: [PATCH 223/337] selftests/bpf: Add a BPF selftest for bpf_skb_change_tail() As requested by Daniel, we need to add a selftest to cover bpf_skb_change_tail() cases in skb_verdict. Here we test trimming, growing and error cases, and validate its expected return values and the expected sizes of the payload. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-3-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 51 +++++++++++++++++++ .../bpf/progs/test_sockmap_change_tail.c | 40 +++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 248754296d97..884ad87783d5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -12,6 +12,7 @@ #include "test_sockmap_progs_query.skel.h" #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" +#include "test_sockmap_change_tail.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -643,6 +644,54 @@ out: test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_change_tail(void) +{ + struct test_sockmap_change_tail *skel; + int err, map, verdict; + int c1, p1, sent, recvd; + int zero = 0; + char buf[2]; + + skel = test_sockmap_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) + goto out; + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) + goto out_close; + sent = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(sent, 2, "xsend(p1)"); + recvd = 
recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "G", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "E", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 1, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_change_tail__destroy(skel); +} + static void test_sockmap_skb_verdict_peek_helper(int map) { int err, c1, p1, zero = 0, sent, recvd, avail; @@ -1058,6 +1107,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop")) test_sockmap_skb_verdict_fionread(false); + if (test__start_subtest("sockmap skb_verdict change tail")) + test_sockmap_skb_verdict_change_tail(); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c new file mode 100644 index 000000000000..2796dd8545eb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 ByteDance */ +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_map_rx SEC(".maps"); + +long change_tail_ret = 1; + +SEC("sk_skb") +int prog_skb_verdict(struct __sk_buff *skb) +{ + char *data, *data_end; + + bpf_skb_pull_data(skb, 1); + data = (char *)(unsigned long)skb->data; + data_end = (char *)(unsigned long)skb->data_end; + + if (data + 1 > data_end) + return SK_PASS; + + if (data[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len - 1, 0); + return SK_PASS; + } else if (data[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len + 1, 0); + return SK_PASS; + } else if (data[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return SK_PASS; + } + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; From 472759c9f5377912c7483cca5da847888a27cecc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:56 -0800 Subject: [PATCH 224/337] selftests/bpf: Introduce socket_helpers.h for TC tests Pull socket helpers out of sockmap_helpers.h so that they can be reused for TC tests as well. This prepares for the next patch. 
Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-4-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/socket_helpers.h | 394 ++++++++++++++++++ .../bpf/prog_tests/sockmap_helpers.h | 385 +---------------- 2 files changed, 395 insertions(+), 384 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/socket_helpers.h diff --git a/tools/testing/selftests/bpf/prog_tests/socket_helpers.h b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h new file mode 100644 index 000000000000..1bdfb79ef009 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h @@ -0,0 +1,394 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SOCKET_HELPERS__ +#define __SOCKET_HELPERS__ + +#include + +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + +#define IO_TIMEOUT_SEC 30 +#define MAX_STRERR_LEN 256 + +/* workaround for older vm_sockets.h */ +#ifndef VMADDR_CID_LOCAL +#define VMADDR_CID_LOCAL 1 +#endif + +/* include/linux/cleanup.h */ +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + +/* Wrappers that fail the test on error and report it. */ + +#define _FAIL(errnum, fmt...) \ + ({ \ + error_at_line(0, (errnum), __func__, __LINE__, fmt); \ + CHECK_FAIL(true); \ + }) +#define FAIL(fmt...) _FAIL(0, fmt) +#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) +#define FAIL_LIBBPF(err, msg) \ + ({ \ + char __buf[MAX_STRERR_LEN]; \ + libbpf_strerror((err), __buf, sizeof(__buf)); \ + FAIL("%s: %s", (msg), __buf); \ + }) + + +#define xaccept_nonblock(fd, addr, len) \ + ({ \ + int __ret = \ + accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("accept"); \ + __ret; \ + }) + +#define xbind(fd, addr, len) \ + ({ \ + int __ret = bind((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("bind"); \ + __ret; \ + }) + +#define xclose(fd) \ + ({ \ + int __ret = close((fd)); \ + if (__ret == -1) \ + FAIL_ERRNO("close"); \ + __ret; \ + }) + +#define xconnect(fd, addr, len) \ + ({ \ + int __ret = connect((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("connect"); \ + __ret; \ + }) + +#define xgetsockname(fd, addr, len) \ + ({ \ + int __ret = getsockname((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockname"); \ + __ret; \ + }) + +#define xgetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = getsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockopt(" #name ")"); \ + __ret; \ + }) + +#define xlisten(fd, backlog) \ + ({ \ + int __ret = listen((fd), (backlog)); \ + if (__ret == -1) \ + FAIL_ERRNO("listen"); \ + __ret; \ + }) + +#define xsetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = setsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("setsockopt(" #name ")"); \ + __ret; \ + }) + +#define xsend(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = send((fd), (buf), (len), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("send"); \ + __ret; \ + }) + +#define xrecv_nonblock(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ + IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("recv"); \ + __ret; \ + }) + +#define xsocket(family, sotype, flags) \ + ({ \ + int __ret = socket(family, sotype, flags); \ + if (__ret == -1) \ + FAIL_ERRNO("socket"); \ + __ret; \ + }) + +static 
inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + +static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) +{ + return (struct sockaddr *)ss; +} + +static inline void init_addr_loopback4(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static inline void init_addr_loopback6(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = 0; + addr6->sin6_addr = in6addr_loopback; + *len = sizeof(*addr6); +} + +static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); + + addr->svm_family = AF_VSOCK; + addr->svm_port = VMADDR_PORT_ANY; + addr->svm_cid = VMADDR_CID_LOCAL; + *len = sizeof(*addr); +} + +static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, + socklen_t *len) +{ + switch (family) { + case AF_INET: + init_addr_loopback4(ss, len); + return; + case AF_INET6: + init_addr_loopback6(ss, len); + return; + case AF_VSOCK: + init_addr_loopback_vsock(ss, len); + return; + default: + FAIL("unsupported address family %d", family); + } +} + +static inline int enable_reuseport(int s, int progfd) +{ + int err, one = 1; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) + return -1; + err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, + sizeof(progfd)); + if (err) + return -1; + + return 0; +} + +static inline int socket_loopback_reuseport(int family, int sotype, int progfd) +{ + struct sockaddr_storage addr; + socklen_t len = 0; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return -1; + + if (progfd >= 0) + enable_reuseport(s, progfd); + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + if (sotype & SOCK_DGRAM) + return s; + + err = xlisten(s, SOMAXCONN); + if (err) + goto close; + + return s; +close: + xclose(s); + return -1; +} + +static inline int socket_loopback(int family, int sotype) +{ + return socket_loopback_reuseport(family, sotype, -1); +} + +static inline int poll_connect(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set wfds; + int r, eval; + socklen_t esize = sizeof(eval); + + FD_ZERO(&wfds); + FD_SET(fd, &wfds); + + r = select(fd + 1, NULL, &wfds, NULL, &timeout); + if (r == 0) + errno = ETIME; + if (r != 1) + return -1; + + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) + return -1; + if (eval != 0) { + errno = eval; + return -1; + } + + return 0; +} + +static inline int poll_read(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set rfds; + int r; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + + r = select(fd + 1, &rfds, NULL, NULL, &timeout); + if (r == 0) + errno = ETIME; + + return r == 1 ? 
0 : -1; +} + +static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return accept(fd, addr, len); +} + +static inline int recv_timeout(int fd, void *buf, size_t len, int flags, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return recv(fd, buf, len, flags); +} + + +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + __close_fd int s, c = -1, p = -1; + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + return err; + + c = xsocket(family, sotype, 0); + if (c < 0) + return c; + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + return err; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + return err; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + return err; + + err = xconnect(s, sockaddr(&addr), len); + if (err) + return err; + + *p0 = take_fd(s); + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + return p; + + *p0 = take_fd(p); + break; + default: + FAIL("Unsupported socket type %#x", sotype); + return -EOPNOTSUPP; + } + + *p1 = take_fd(c); + return 0; +} + +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} + +#endif // __SOCKET_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index 38e35c72bdaa..3e5571dd578d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -1,139 +1,12 @@ #ifndef __SOCKMAP_HELPERS__ #define __SOCKMAP_HELPERS__ -#include +#include "socket_helpers.h" -/* include/linux/net.h */ -#define SOCK_TYPE_MASK 0xf - -#define IO_TIMEOUT_SEC 30 -#define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 -/* workaround for older vm_sockets.h */ -#ifndef VMADDR_CID_LOCAL -#define VMADDR_CID_LOCAL 1 -#endif - #define __always_unused __attribute__((__unused__)) -/* include/linux/cleanup.h */ -#define __get_and_null(p, nullvalue) \ - ({ \ - __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = nullvalue; \ - __val; \ - }) - -#define take_fd(fd) __get_and_null(fd, -EBADF) - -#define _FAIL(errnum, fmt...) \ - ({ \ - error_at_line(0, (errnum), __func__, __LINE__, fmt); \ - CHECK_FAIL(true); \ - }) -#define FAIL(fmt...) _FAIL(0, fmt) -#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) -#define FAIL_LIBBPF(err, msg) \ - ({ \ - char __buf[MAX_STRERR_LEN]; \ - libbpf_strerror((err), __buf, sizeof(__buf)); \ - FAIL("%s: %s", (msg), __buf); \ - }) - -/* Wrappers that fail the test on error and report it. 
*/ - -#define xaccept_nonblock(fd, addr, len) \ - ({ \ - int __ret = \ - accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("accept"); \ - __ret; \ - }) - -#define xbind(fd, addr, len) \ - ({ \ - int __ret = bind((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("bind"); \ - __ret; \ - }) - -#define xclose(fd) \ - ({ \ - int __ret = close((fd)); \ - if (__ret == -1) \ - FAIL_ERRNO("close"); \ - __ret; \ - }) - -#define xconnect(fd, addr, len) \ - ({ \ - int __ret = connect((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("connect"); \ - __ret; \ - }) - -#define xgetsockname(fd, addr, len) \ - ({ \ - int __ret = getsockname((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockname"); \ - __ret; \ - }) - -#define xgetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = getsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockopt(" #name ")"); \ - __ret; \ - }) - -#define xlisten(fd, backlog) \ - ({ \ - int __ret = listen((fd), (backlog)); \ - if (__ret == -1) \ - FAIL_ERRNO("listen"); \ - __ret; \ - }) - -#define xsetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = setsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("setsockopt(" #name ")"); \ - __ret; \ - }) - -#define xsend(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = send((fd), (buf), (len), (flags)); \ - if (__ret == -1) \ - FAIL_ERRNO("send"); \ - __ret; \ - }) - -#define xrecv_nonblock(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ - IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("recv"); \ - __ret; \ - }) - -#define xsocket(family, sotype, flags) \ - ({ \ - int __ret = socket(family, sotype, flags); \ - if (__ret == -1) \ - FAIL_ERRNO("socket"); \ - __ret; \ - }) - #define xbpf_map_delete_elem(fd, key) \ ({ \ int __ret = bpf_map_delete_elem((fd), (key)); \ @@ -193,130 +66,6 @@ __ret; \ }) -static inline void close_fd(int *fd) -{ - if (*fd >= 0) - xclose(*fd); -} - -#define __close_fd __attribute__((cleanup(close_fd))) - -static inline int poll_connect(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set wfds; - int r, eval; - socklen_t esize = sizeof(eval); - - FD_ZERO(&wfds); - FD_SET(fd, &wfds); - - r = select(fd + 1, NULL, &wfds, NULL, &timeout); - if (r == 0) - errno = ETIME; - if (r != 1) - return -1; - - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) - return -1; - if (eval != 0) { - errno = eval; - return -1; - } - - return 0; -} - -static inline int poll_read(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set rfds; - int r; - - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - - r = select(fd + 1, &rfds, NULL, NULL, &timeout); - if (r == 0) - errno = ETIME; - - return r == 1 ? 
0 : -1; -} - -static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return accept(fd, addr, len); -} - -static inline int recv_timeout(int fd, void *buf, size_t len, int flags, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return recv(fd, buf, len, flags); -} - -static inline void init_addr_loopback4(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); - - addr4->sin_family = AF_INET; - addr4->sin_port = 0; - addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - *len = sizeof(*addr4); -} - -static inline void init_addr_loopback6(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); - - addr6->sin6_family = AF_INET6; - addr6->sin6_port = 0; - addr6->sin6_addr = in6addr_loopback; - *len = sizeof(*addr6); -} - -static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); - - addr->svm_family = AF_VSOCK; - addr->svm_port = VMADDR_PORT_ANY; - addr->svm_cid = VMADDR_CID_LOCAL; - *len = sizeof(*addr); -} - -static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, - socklen_t *len) -{ - switch (family) { - case AF_INET: - init_addr_loopback4(ss, len); - return; - case AF_INET6: - init_addr_loopback6(ss, len); - return; - case AF_VSOCK: - init_addr_loopback_vsock(ss, len); - return; - default: - FAIL("unsupported address family %d", family); - } -} - -static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) -{ - return (struct sockaddr *)ss; -} - static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) { u64 value; @@ -334,136 +83,4 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int enable_reuseport(int s, int progfd) -{ - int err, one = 1; - - err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); - if (err) - return -1; - err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, - sizeof(progfd)); - if (err) - return -1; - - return 0; -} - -static inline int socket_loopback_reuseport(int family, int sotype, int progfd) -{ - struct sockaddr_storage addr; - socklen_t len = 0; - int err, s; - - init_addr_loopback(family, &addr, &len); - - s = xsocket(family, sotype, 0); - if (s == -1) - return -1; - - if (progfd >= 0) - enable_reuseport(s, progfd); - - err = xbind(s, sockaddr(&addr), len); - if (err) - goto close; - - if (sotype & SOCK_DGRAM) - return s; - - err = xlisten(s, SOMAXCONN); - if (err) - goto close; - - return s; -close: - xclose(s); - return -1; -} - -static inline int socket_loopback(int family, int sotype) -{ - return socket_loopback_reuseport(family, sotype, -1); -} - -static inline int create_pair(int family, int sotype, int *p0, int *p1) -{ - __close_fd int s, c = -1, p = -1; - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int err; - - s = socket_loopback(family, sotype); - if (s < 0) - return s; - - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - c = xsocket(family, sotype, 0); - if (c < 0) - return c; - - err = connect(c, sockaddr(&addr), len); - if (err) { - if (errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - return err; - } - - err = poll_connect(c, IO_TIMEOUT_SEC); - if (err) { - FAIL_ERRNO("poll_connect"); - return err; - } 
- } - - switch (sotype & SOCK_TYPE_MASK) { - case SOCK_DGRAM: - err = xgetsockname(c, sockaddr(&addr), &len); - if (err) - return err; - - err = xconnect(s, sockaddr(&addr), len); - if (err) - return err; - - *p0 = take_fd(s); - break; - case SOCK_STREAM: - case SOCK_SEQPACKET: - p = xaccept_nonblock(s, NULL, NULL); - if (p < 0) - return p; - - *p0 = take_fd(p); - break; - default: - FAIL("Unsupported socket type %#x", sotype); - return -EOPNOTSUPP; - } - - *p1 = take_fd(c); - return 0; -} - -static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, - int *p0, int *p1) -{ - int err; - - err = create_pair(family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - - return err; -} - #endif // __SOCKMAP_HELPERS__ From 4a58963d10fa3cb654b859e3f9a8aecbcf9f4982 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:57 -0800 Subject: [PATCH 225/337] selftests/bpf: Test bpf_skb_change_tail() in TC ingress Similarly to the previous test, we also need a test case to cover positive offsets as well, TC is an excellent hook for this. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: Zijian Zhang Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-5-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/tc_change_tail.c | 62 ++++++++++ .../selftests/bpf/progs/test_tc_change_tail.c | 106 ++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_change_tail.c create mode 100644 tools/testing/selftests/bpf/progs/test_tc_change_tail.c diff --git a/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c new file mode 100644 index 000000000000..74752233e779 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "test_tc_change_tail.skel.h" +#include "socket_helpers.h" + +#define LO_IFINDEX 1 + +void test_tc_change_tail(void) +{ + LIBBPF_OPTS(bpf_tcx_opts, tcx_opts); + struct test_tc_change_tail *skel = NULL; + struct bpf_link *link; + int c1, p1; + char buf[2]; + int ret; + + skel = test_tc_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tc_change_tail__open_and_load")) + return; + + link = bpf_program__attach_tcx(skel->progs.change_tail, LO_IFINDEX, + &tcx_opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_tcx")) + goto destroy; + + skel->links.change_tail = link; + ret = create_pair(AF_INET, SOCK_DGRAM, &c1, &p1); + if (!ASSERT_OK(ret, "create_pair")) + goto destroy; + + ret = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(ret, 2, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "G", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "E", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + ret = xsend(p1, "Z", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + close(c1); + close(p1); +destroy: + 
test_tc_change_tail__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_change_tail.c b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c new file mode 100644 index 000000000000..28edafe803f0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +long change_tail_ret = 1; + +static __always_inline struct iphdr *parse_ip_header(struct __sk_buff *skb, int *ip_proto) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ethhdr *eth = data; + struct iphdr *iph; + + /* Verify Ethernet header */ + if ((void *)(data + sizeof(*eth)) > data_end) + return NULL; + + /* Skip Ethernet header to get to IP header */ + iph = (void *)(data + sizeof(struct ethhdr)); + + /* Verify IP header */ + if ((void *)(data + sizeof(struct ethhdr) + sizeof(*iph)) > data_end) + return NULL; + + /* Basic IP header validation */ + if (iph->version != 4) /* Only support IPv4 */ + return NULL; + + if (iph->ihl < 5) /* Minimum IP header length */ + return NULL; + + *ip_proto = iph->protocol; + return iph; +} + +static __always_inline struct udphdr *parse_udp_header(struct __sk_buff *skb, struct iphdr *iph) +{ + void *data_end = (void *)(long)skb->data_end; + void *hdr = (void *)iph; + struct udphdr *udp; + + /* Calculate UDP header position */ + udp = hdr + (iph->ihl * 4); + hdr = (void *)udp; + + /* Verify UDP header bounds */ + if ((void *)(hdr + sizeof(*udp)) > data_end) + return NULL; + + return udp; +} + +SEC("tc/ingress") +int change_tail(struct __sk_buff *skb) +{ + int len = skb->len; + struct udphdr *udp; + struct iphdr *iph; + void *data_end; + char *payload; + int ip_proto; + + bpf_skb_pull_data(skb, len); + + data_end = (void *)(long)skb->data_end; + iph = parse_ip_header(skb, &ip_proto); + if (!iph) + return TCX_PASS; + + if (ip_proto != IPPROTO_UDP) + return TCX_PASS; + + udp = parse_udp_header(skb, iph); + if (!udp) + return TCX_PASS; + + payload = (char *)udp + (sizeof(struct udphdr)); + if (payload + 1 > (char *)data_end) + return TCX_PASS; + + if (payload[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len - 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len + 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return TCX_PASS; + } else if (payload[0] == 'Z') { /* Zero */ + change_tail_ret = bpf_skb_change_tail(skb, 0, 0); + return TCX_PASS; + } + return TCX_DROP; +} + +char _license[] SEC("license") = "GPL"; From d67393f4d28ef0544eaf382f1123dcaf56495dc9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Dec 2024 14:20:43 +0100 Subject: [PATCH 226/337] kbuild: Drop support for include/asm- in headers_check.pl "include/asm-" was replaced by "arch//include/asm" a long time ago. All assembler header files are now included using "#include ", so there is no longer a need to rewrite paths. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 2 +- usr/include/headers_check.pl | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/usr/include/Makefile b/usr/include/Makefile index 771e32872b2a..6c6de1b1622b 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -78,7 +78,7 @@ quiet_cmd_hdrtest = HDRTEST $< cmd_hdrtest = \ $(CC) $(c_flags) -fsyntax-only -x c /dev/null \ $(if $(filter-out $(no-header-test), $*.h), -include $< -include $<); \ - $(PERL) $(src)/headers_check.pl $(obj) $(SRCARCH) $<; \ + $(PERL) $(src)/headers_check.pl $(obj) $<; \ touch $@ $(obj)/%.hdrtest: $(obj)/%.h FORCE diff --git a/usr/include/headers_check.pl b/usr/include/headers_check.pl index b6aec5e4365f..2b70bfa5558e 100755 --- a/usr/include/headers_check.pl +++ b/usr/include/headers_check.pl @@ -3,9 +3,8 @@ # # headers_check.pl execute a number of trivial consistency checks # -# Usage: headers_check.pl dir arch [files...] +# Usage: headers_check.pl dir [files...] # dir: dir to look for included files -# arch: architecture # files: list of files to check # # The script reads the supplied files line by line and: @@ -23,7 +22,7 @@ use warnings; use strict; use File::Basename; -my ($dir, $arch, @files) = @ARGV; +my ($dir, @files) = @ARGV; my $ret = 0; my $line; @@ -54,10 +53,6 @@ sub check_include my $inc = $1; my $found; $found = stat($dir . "/" . $inc); - if (!$found) { - $inc =~ s#asm/#asm-$arch/#; - $found = stat($dir . "/" . $inc); - } if (!$found) { printf STDERR "$filename:$lineno: included file '$inc' is not exported\n"; $ret = 1; From a34e92d2e831729f0ed5df20d15b4df419cd0ba4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Dec 2024 20:14:45 +0900 Subject: [PATCH 227/337] kbuild: deb-pkg: add debarch for ARCH=um 'make ARCH=um bindeb-pkg' shows the following warning. $ make ARCH=um bindeb-pkg [snip] GEN debian ** ** ** WARNING ** ** ** Your architecture doesn't have its equivalent Debian userspace architecture defined! Falling back to the current host architecture (amd64). Please add support for um to ./scripts/package/mkdebian ... This commit hard-codes i386/amd64 because UML is only supported for x86. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/mkdebian | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 4ffcc70f8e31..b038a1380b8a 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -70,6 +70,13 @@ set_debarch() { debarch=sh4$(if_enabled_echo CONFIG_CPU_BIG_ENDIAN eb) fi ;; + um) + if is_enabled CONFIG_64BIT; then + debarch=amd64 + else + debarch=i386 + fi + ;; esac if [ -z "$debarch" ]; then debarch=$(dpkg-architecture -qDEB_HOST_ARCH) From 54956567a055345d17438f08c895c68aff3f4cf2 Mon Sep 17 00:00:00 2001 From: Nicolas Schier Date: Thu, 12 Dec 2024 14:05:29 +0100 Subject: [PATCH 228/337] kbuild: deb-pkg: Do not install maint scripts for arch 'um' Stop installing Debian maintainer scripts when building a user-mode-linux Debian package. Debian maintainer scripts are used for e.g. requesting rebuilds of initrd, rebuilding DKMS modules and updating of grub configuration. As all of this is not relevant for UML but also may lead to failures while processing the kernel hooks, do no more install maintainer scripts for the UML package. 
Suggested-by: Masahiro Yamada Signed-off-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- scripts/package/builddeb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index fb686fd3266f..ad7aba0f268e 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -63,6 +63,12 @@ install_linux_image () { esac cp "$(${MAKE} -s -f ${srctree}/Makefile image_name)" "${pdir}/${installed_image_path}" + if [ "${ARCH}" != um ]; then + install_maint_scripts "${pdir}" + fi +} + +install_maint_scripts () { # Install the maintainer scripts # Note: hook scripts under /etc/kernel are also executed by official Debian # kernel packages, as well as kernel packages built using make-kpkg. From 9435dc77a33fa20afec7cd35ceaae5f7f42dbbe2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 Dec 2024 00:46:15 +0900 Subject: [PATCH 229/337] modpost: distinguish same module paths from different dump files Since commit 13b25489b6f8 ("kbuild: change working directory to external module directory with M="), module paths are always relative to the top of the external module tree. The module paths recorded in Module.symvers are no longer globally unique when they are passed via KBUILD_EXTRA_SYMBOLS for building other external modules, which may result in false-positive "exported twice" errors. Such errors should not occur because external modules should be able to override in-tree modules. To address this, record the dump file path in struct module and check it when searching for a module. Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Reported-by: Jon Hunter Closes: https://lore.kernel.org/all/eb21a546-a19c-40df-b821-bbba80f19a3d@nvidia.com/ Signed-off-by: Masahiro Yamada Tested-by: Jon Hunter --- scripts/mod/modpost.c | 17 +++++++++-------- scripts/mod/modpost.h | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index fb787a5715f5..94ee49207a45 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -155,12 +155,13 @@ char *get_line(char **stringp) /* A list of all modules we processed */ LIST_HEAD(modules); -static struct module *find_module(const char *modname) +static struct module *find_module(const char *filename, const char *modname) { struct module *mod; list_for_each_entry(mod, &modules, list) { - if (strcmp(mod->name, modname) == 0) + if (!strcmp(mod->dump_file, filename) && + !strcmp(mod->name, modname)) return mod; } return NULL; @@ -2030,10 +2031,10 @@ static void read_dump(const char *fname) continue; } - mod = find_module(modname); + mod = find_module(fname, modname); if (!mod) { mod = new_module(modname, strlen(modname)); - mod->from_dump = true; + mod->dump_file = fname; } s = sym_add_exported(symname, mod, gpl_only, namespace); sym_set_crc(s, crc); @@ -2052,7 +2053,7 @@ static void write_dump(const char *fname) struct symbol *sym; list_for_each_entry(mod, &modules, list) { - if (mod->from_dump) + if (mod->dump_file) continue; list_for_each_entry(sym, &mod->exported_symbols, list) { if (trim_unused_exports && !sym->used) @@ -2076,7 +2077,7 @@ static void write_namespace_deps_files(const char *fname) list_for_each_entry(mod, &modules, list) { - if (mod->from_dump || list_empty(&mod->missing_namespaces)) + if (mod->dump_file || list_empty(&mod->missing_namespaces)) continue; buf_printf(&ns_deps_buf, "%s.ko:", mod->name); @@ -2194,7 +2195,7 @@ int main(int argc, char **argv) 
read_symbols_from_files(files_source); list_for_each_entry(mod, &modules, list) { - if (mod->from_dump || mod->is_vmlinux) + if (mod->dump_file || mod->is_vmlinux) continue; check_modname_len(mod); @@ -2205,7 +2206,7 @@ int main(int argc, char **argv) handle_white_list_exports(unused_exports_white_list); list_for_each_entry(mod, &modules, list) { - if (mod->from_dump) + if (mod->dump_file) continue; if (mod->is_vmlinux) diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 49848fcbe2a1..8b72c227ebf4 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -95,14 +95,15 @@ struct module_alias { /** * struct module - represent a module (vmlinux or *.ko) * + * @dump_file: path to the .symvers file if loaded from a file * @aliases: list head for module_aliases */ struct module { struct list_head list; struct list_head exported_symbols; struct list_head unresolved_symbols; + const char *dump_file; bool is_gpl_compatible; - bool from_dump; /* true if module was loaded from *.symvers */ bool is_vmlinux; bool seen; bool has_init; From e84a3bf7f4aa669c05e3884497774148ac111468 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Dec 2024 10:19:04 -0500 Subject: [PATCH 230/337] staging: gpib: Fix allyesconfig build failures My tests run an allyesconfig build and it failed with the following errors: LD [M] samples/kfifo/dma-example.ko ld.lld: error: undefined symbol: nec7210_board_reset ld.lld: error: undefined symbol: nec7210_read ld.lld: error: undefined symbol: nec7210_write It appears that some modules call the function nec7210_board_reset() that is defined in nec7210.c. In an allyesconfig build, these other modules are built in. But the file that holds nec7210_board_reset() has: obj-m += nec7210.o Where that "-m" means it only gets built as a module. With the other modules built in, they have no access to nec7210_board_reset() and the build fails. This isn't the only function. After fixing that one, I hit another: ld.lld: error: undefined symbol: push_gpib_event ld.lld: error: undefined symbol: gpib_match_device_path Where push_gpib_event() was also used outside of the file it was defined in, and that file too only was built as a module. Since the directory that nec7210.c is only traversed when CONFIG_GPIB_NEC7210 is set, and the directory with gpib_common.c is only traversed when CONFIG_GPIB_COMMON is set, use those configs as the option to build those modules. When it is an allyesconfig, then they will both be built in and their functions will be available to the other modules that are also built in. 
Fixes: 3ba84ac69b53e ("staging: gpib: Add nec7210 GPIB chip driver") Fixes: 9dde4559e9395 ("staging: gpib: Add GPIB common core driver") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Palmer Dabbelt Signed-off-by: Linus Torvalds --- drivers/staging/gpib/common/Makefile | 2 +- drivers/staging/gpib/nec7210/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gpib/common/Makefile b/drivers/staging/gpib/common/Makefile index 0c4c77bea75b..460586edb574 100644 --- a/drivers/staging/gpib/common/Makefile +++ b/drivers/staging/gpib/common/Makefile @@ -1,5 +1,5 @@ -obj-m += gpib_common.o +obj-$(CONFIG_GPIB_COMMON) += gpib_common.o gpib_common-objs := gpib_os.o iblib.o diff --git a/drivers/staging/gpib/nec7210/Makefile b/drivers/staging/gpib/nec7210/Makefile index 8d4d90f21109..64330f2e89d1 100644 --- a/drivers/staging/gpib/nec7210/Makefile +++ b/drivers/staging/gpib/nec7210/Makefile @@ -1,4 +1,4 @@ -obj-m += nec7210.o +obj-$(CONFIG_GPIB_NEC7210) += nec7210.o From 37d1d99b8806b24ffe4a2b453620df932994a5c0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 17 Dec 2024 08:05:40 +0100 Subject: [PATCH 231/337] KVM: VMX: don't include '' directly The header clearly states that it does not want to be included directly, only via ''. Replace the include accordingly. Signed-off-by: Wolfram Sang Message-ID: <20241217070539.2433-2-wsa+renesas@sang-engineering.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 1715d2ab07be..ad9116a99bcc 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -2,7 +2,7 @@ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H -#include +#include #include void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); From 398b7b6cb9e046f137a188670da12f790492b56b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Dec 2024 07:43:20 -0500 Subject: [PATCH 232/337] KVM: x86: let it be known that ignore_msrs is a bad idea When running KVM with ignore_msrs=1 and report_ignored_msrs=0, the user has no clue that that the guest is being lied to. This may cause bug reports such as https://gitlab.com/qemu-project/qemu/-/issues/2571, where enabling a CPUID bit in QEMU caused Linux guests to try reading MSR_CU_DEF_ERR; and being lied about the existence of MSR_CU_DEF_ERR caused the guest to assume other things about the local APIC which were not true: Sep 14 12:02:53 kernel: mce: [Firmware Bug]: Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly. Sep 14 12:02:53 kernel: unchecked MSR access error: RDMSR from 0x852 at rIP: 0xffffffffb548ffa7 (native_read_msr+0x7/0x40) Sep 14 12:02:53 kernel: Call Trace: ... Sep 14 12:02:53 kernel: native_apic_msr_read+0x20/0x30 Sep 14 12:02:53 kernel: setup_APIC_eilvt+0x47/0x110 Sep 14 12:02:53 kernel: mce_amd_feature_init+0x485/0x4e0 ... Sep 14 12:02:53 kernel: [Firmware Bug]: cpu 0, try to use APIC520 (LVT offset 2) for vector 0xf4, but the register is already in use for vector 0x0 on this cpu Without reported_ignored_msrs=0 at least the host kernel log will contain enough information to avoid going on a wild goose chase. But if reports about individual MSR accesses are being silenced too, at least complain loudly the first time a VM is started. 
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8160baf3838..12fa68a06966 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12724,6 +12724,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); + if (ignore_msrs && !report_ignored_msrs) { + pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n" + "a supported configuration. Lying to the guest about the existence of MSRs\n" + "may cause the guest operating system to hang or produce errors. If a guest\n" + "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); + } + return 0; out_uninit_mmu: From 4bbf9020becbfd8fc2c3da790855b7042fad455b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Dec 2024 13:22:21 -0800 Subject: [PATCH 233/337] Linux 6.13-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e5b8a8832c0c..5c9b1d2d59b4 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From 0b7a66a2c864859fbf9bb16229c03172eef02c05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 5 Dec 2024 17:06:02 +0100 Subject: [PATCH 234/337] preempt: Move PREEMPT_RT before PREEMPT in vermagic. Since the dynamic preemption has been enabled for PREEMPT_RT we have now CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. This affects the vermagic strings which comes now PREEMPT with PREEMPT_RT enabled. The PREEMPT_RT module usually can not be loaded on a PREEMPT kernel because some symbols are missing. However if the symbols are fine then it continues and it crashes later. The problem is that the struct module has a different layout and the num_exentries or init members are at a different position leading to a crash later on. This is not necessary caught by the size check in elf_validity_cache_index_mod() because the mem member has an alignment requirement of __module_memory_align which is big enough keep the total size unchanged. Therefore we should keep the string accurate instead of removing it. Move the PREEMPT_RT check before the PREEMPT so that it takes precedence if both symbols are enabled. 
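For illustration, a minimal standalone C sketch of why the #ifdef ordering matters (the two CONFIG defines below are stand-ins, not taken from a real .config): in an #ifdef/#elif chain the first matching test wins, so with both symbols defined the string checked first is the one that ends up in vermagic.

#include <stdio.h>

#define CONFIG_PREEMPT_BUILD 1
#define CONFIG_PREEMPT_RT 1	/* both enabled, as with dynamic preemption on RT */

#ifdef CONFIG_PREEMPT_RT
#define MODULE_VERMAGIC_PREEMPT "preempt_rt "
#elif defined(CONFIG_PREEMPT_BUILD)
#define MODULE_VERMAGIC_PREEMPT "preempt "
#else
#define MODULE_VERMAGIC_PREEMPT ""
#endif

int main(void)
{
	/* Prints "preempt_rt " because the RT check now comes first. */
	printf("%s\n", MODULE_VERMAGIC_PREEMPT);
	return 0;
}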
Fixes: 35772d627b55c ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241205160602.3lIAsJRT@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/vermagic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a54046bf37e5..939ceabcaf06 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,10 +15,10 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT_BUILD -#define MODULE_VERMAGIC_PREEMPT "preempt " -#elif defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_PREEMPT_RT #define MODULE_VERMAGIC_PREEMPT "preempt_rt " +#elif defined(CONFIG_PREEMPT_BUILD) +#define MODULE_VERMAGIC_PREEMPT "preempt " #else #define MODULE_VERMAGIC_PREEMPT "" #endif From b8ea3b1ff544b47c1d64a22860f33b755638164e Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Fri, 13 Dec 2024 22:50:21 +0530 Subject: [PATCH 235/337] smb: enable reuse of deferred file handles for write operations Previously, deferred file handles were reused only for read operations, this commit extends to reusing deferred handles for write operations. By reusing these handles we can reduce the need for open/close operations over the wire. Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a58a3333ecc3..3b2d33291a7e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -990,7 +990,11 @@ int cifs_open(struct inode *inode, struct file *file) } /* Get the cached handle as SMB2 close is deferred */ - rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { + rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + } else { + rc = cifs_get_readable_path(tcon, full_path, &cfile); + } if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; From f17224c2a7bdc11a17c96d9d8cb2d829f54d40bb Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Dec 2024 21:59:37 +0000 Subject: [PATCH 236/337] cifs: Remove unused is_server_using_iface() The last use of is_server_using_iface() was removed in 2022 by commit aa45dadd34e4 ("cifs: change iface_list from array to sorted linked list") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 2 -- fs/smb/client/sess.c | 25 ------------------------- 2 files changed, 27 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 754417cb3294..d26f9bbb5382 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -614,8 +614,6 @@ int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); int cifs_try_adding_channels(struct cifs_ses *ses); -bool is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3306fb655136..91d4d409cb1d 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -27,31 +27,6 @@ static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); -bool -is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface) -{ - struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; - struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; - struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; - struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; - - if (server->dstaddr.ss_family != iface->sockaddr.ss_family) - return false; - if (server->dstaddr.ss_family == AF_INET) { - if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) - return false; - } else if (server->dstaddr.ss_family == AF_INET6) { - if (memcmp(&s6->sin6_addr, &i6->sin6_addr, - sizeof(i6->sin6_addr)) != 0) - return false; - } else { - /* unknown family.. */ - return false; - } - return true; -} - bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) { int i; From 8673a6c2d9e483dfeeef83a1f06f59e05636f4d1 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 20 Dec 2024 13:52:46 +0800 Subject: [PATCH 237/337] RDMA/hns: Fix mapping error of zero-hop WQE buffer Due to HW limitation, the three region of WQE buffer must be mapped and set to HW in a fixed order: SQ buffer, SGE buffer, and RQ buffer. Currently when one region is zero-hop while the other two are not, the zero-hop region will not be mapped. This violate the limitation above and leads to address error. 
Fixes: 38389eaa4db1 ("RDMA/hns: Add mtr support for mixed multihop addressing") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 43 ++++++++++++++++-------- drivers/infiniband/hw/hns/hns_roce_mr.c | 5 --- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index f84521be3bea..605562122ecc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -931,6 +931,7 @@ struct hns_roce_hem_item { size_t count; /* max ba numbers */ int start; /* start buf offset in this hem */ int end; /* end buf offset in this hem */ + bool exist_bt; }; /* All HEM items are linked in a tree structure */ @@ -959,6 +960,7 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } } + hem->exist_bt = exist_bt; hem->count = count; hem->start = start; hem->end = end; @@ -969,22 +971,22 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } static void hem_list_free_item(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, bool exist_bt) + struct hns_roce_hem_item *hem) { - if (exist_bt) + if (hem->exist_bt) dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN, hem->addr, hem->dma_addr); kfree(hem); } static void hem_list_free_all(struct hns_roce_dev *hr_dev, - struct list_head *head, bool exist_bt) + struct list_head *head) { struct hns_roce_hem_item *hem, *temp_hem; list_for_each_entry_safe(hem, temp_hem, head, list) { list_del(&hem->list); - hem_list_free_item(hr_dev, hem, exist_bt); + hem_list_free_item(hr_dev, hem); } } @@ -1084,6 +1086,10 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, for (i = 0; i < region_cnt; i++) { r = (struct hns_roce_buf_region *)®ions[i]; + /* when r->hopnum = 0, the region should not occupy root_ba. */ + if (!r->hopnum) + continue; + if (r->hopnum > 1) { step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step > 0) @@ -1177,7 +1183,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, err_exit: for (level = 1; level < hopnum; level++) - hem_list_free_all(hr_dev, &temp_list[level], true); + hem_list_free_all(hr_dev, &temp_list[level]); return ret; } @@ -1218,16 +1224,26 @@ static int alloc_fake_root_bt(struct hns_roce_dev *hr_dev, void *cpu_base, { struct hns_roce_hem_item *hem; + /* This is on the has_mtt branch, if r->hopnum + * is 0, there is no root_ba to reuse for the + * region's fake hem, so a dma_alloc request is + * necessary here. + */ hem = hem_list_alloc_item(hr_dev, r->offset, r->offset + r->count - 1, - r->count, false); + r->count, !r->hopnum); if (!hem) return -ENOMEM; - hem_list_assign_bt(hem, cpu_base, phy_base); + /* The root_ba can be reused only when r->hopnum > 0. */ + if (r->hopnum) + hem_list_assign_bt(hem, cpu_base, phy_base); list_add(&hem->list, branch_head); list_add(&hem->sibling, leaf_head); - return r->count; + /* If r->hopnum == 0, 0 is returned, + * so that the root_bt entry is not occupied. + */ + return r->hopnum ? 
r->count : 0; } static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, @@ -1271,7 +1287,7 @@ setup_root_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, return -ENOMEM; total = 0; - for (i = 0; i < region_cnt && total < max_ba_num; i++) { + for (i = 0; i < region_cnt && total <= max_ba_num; i++) { r = ®ions[i]; if (!r->count) continue; @@ -1337,9 +1353,9 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, region_cnt); if (ret) { for (i = 0; i < region_cnt; i++) - hem_list_free_all(hr_dev, &head.branch[i], false); + hem_list_free_all(hr_dev, &head.branch[i]); - hem_list_free_all(hr_dev, &head.root, true); + hem_list_free_all(hr_dev, &head.root); } return ret; @@ -1402,10 +1418,9 @@ void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) - hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j], - j != 0); + hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j]); - hem_list_free_all(hr_dev, &hem_list->root_bt, true); + hem_list_free_all(hr_dev, &hem_list->root_bt); INIT_LIST_HEAD(&hem_list->btm_bt); hem_list->root_ba = 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index bf30b3a65a9b..55b9283bfc6f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -814,11 +814,6 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, for (i = 0, mapped_cnt = 0; i < mtr->hem_cfg.region_count && mapped_cnt < page_cnt; i++) { r = &mtr->hem_cfg.region[i]; - /* if hopnum is 0, no need to map pages in this region */ - if (!r->hopnum) { - mapped_cnt += r->count; - continue; - } if (r->offset + r->count > page_cnt) { ret = -EINVAL; From 0572eccf239ce4bd89bd531767ec5ab20e249290 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:47 +0800 Subject: [PATCH 238/337] RDMA/hns: Fix accessing invalid dip_ctx during destroying QP If it fails to modify QP to RTR, dip_ctx will not be attached. And during detroying QP, the invalid dip_ctx pointer will be accessed. Fixes: faa62440a577 ("RDMA/hns: Fix different dgids mapping to the same dip_idx") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 697b17cca02e..6dddadb90e02 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5619,6 +5619,9 @@ static void put_dip_ctx_idx(struct hns_roce_dev *hr_dev, { struct hns_roce_dip *hr_dip = hr_qp->dip; + if (!hr_dip) + return; + xa_lock(&hr_dev->qp_table.dip_xa); hr_dip->qp_cnt--; From fa5c4ba8cdbfd2c2d6422e001311c8213283ebbf Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:48 +0800 Subject: [PATCH 239/337] RDMA/hns: Fix warning storm caused by invalid input in IO path WARN_ON() is called in the IO path. And it could lead to a warning storm. Use WARN_ON_ONCE() instead of WARN_ON(). 
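As a rough userspace analogy of the once-only pattern (this is not the kernel macro, which latches a flag per call site), the point is that a hot path can still reject bad input while logging it only the first time:

#include <stdbool.h>
#include <stdio.h>

static bool warn_on_once(bool cond, const char *msg)
{
	static bool warned;

	if (cond && !warned) {
		warned = true;
		fprintf(stderr, "warning: %s\n", msg);
	}
	return cond;
}

int main(void)
{
	/* Simulated per-packet fast path: the warning fires exactly once. */
	for (int i = 0; i < 1000; i++)
		if (warn_on_once(true, "invalid opcode in send WQE"))
			continue;	/* reject the request, but do not flood the log */
	return 0;
}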
Fixes: 12542f1de179 ("RDMA/hns: Refactor process about opcode in post_send()") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6dddadb90e02..d0469d27c63c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -468,7 +468,7 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, valid_num_sge = calc_wr_sge_num(wr, &msg_len); ret = set_ud_opcode(ud_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; ud_sq_wqe->msg_len = cpu_to_le32(msg_len); @@ -572,7 +572,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, rc_sq_wqe->msg_len = cpu_to_le32(msg_len); ret = set_rc_opcode(hr_dev, rc_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SO, From e3debdd48423d3d75b9d366399228d7225d902cd Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:49 +0800 Subject: [PATCH 240/337] RDMA/hns: Fix missing flush CQE for DWQE Flush CQE handler has not been called if QP state gets into errored mode in DWQE path. So, the new added outstanding WQEs will never be flushed. It leads to a hung task timeout when using NFS over RDMA: __switch_to+0x7c/0xd0 __schedule+0x350/0x750 schedule+0x50/0xf0 schedule_timeout+0x2c8/0x340 wait_for_common+0xf4/0x2b0 wait_for_completion+0x20/0x40 __ib_drain_sq+0x140/0x1d0 [ib_core] ib_drain_sq+0x98/0xb0 [ib_core] rpcrdma_xprt_disconnect+0x68/0x270 [rpcrdma] xprt_rdma_close+0x20/0x60 [rpcrdma] xprt_autoclose+0x64/0x1cc [sunrpc] process_one_work+0x1d8/0x4e0 worker_thread+0x154/0x420 kthread+0x108/0x150 ret_from_fork+0x10/0x18 Fixes: 01584a5edcc4 ("RDMA/hns: Add support of direct wqe") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d0469d27c63c..0144e7210d05 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -670,6 +670,10 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, #define HNS_ROCE_SL_SHIFT 2 struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; + if (unlikely(qp->state == IB_QPS_ERR)) { + flush_cqe(hr_dev, qp); + return; + } /* All kinds of DirectWQE have the same header field layout */ hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_FLAG); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_L, qp->sl); From d685d55dfc86b1a4bdcec77c3c1f8a83f181264e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 11 Dec 2024 09:10:55 +0900 Subject: [PATCH 241/337] tracing/kprobe: Make trace_kprobe's module callback called after jump_label update Make sure the trace_kprobe's module notifer callback function is called after jump_label's callback is called. Since the trace_kprobe's callback eventually checks jump_label address during registering new kprobe on the loading module, jump_label must be updated before this registration happens. 
Link: https://lore.kernel.org/all/173387585556.995044.3157941002975446119.stgit@devnote2/ Fixes: 614243181050 ("tracing/kprobes: Support module init function probing") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 263fac44d3ca..935a886af40c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -725,7 +725,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, - .priority = 1 /* Invoked after kprobe module callback */ + .priority = 2 /* Invoked after kprobe and jump_label module callback */ }; static int trace_kprobe_register_module_notifier(void) { From a53da2fb25a31f4fb8eaeb93c7b1134fc14fd209 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 13 Dec 2024 09:28:33 -0800 Subject: [PATCH 242/337] drm/xe: Revert some changes that break a mesa debug tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a mesa debug tool for decoding devcoredump files. Recent changes to improve the devcoredump output broke that tool. So revert the changes until the tool can be extended to support the new fields. Signed-off-by: John Harrison Fixes: c28fd6c358db ("drm/xe/devcoredump: Improve section headings and add tile info") Fixes: ec1455ce7e35 ("drm/xe/devcoredump: Add ASCII85 dump helper function") Cc: John Harrison Cc: Julia Filipchuk Cc: Lucas De Marchi Cc: Thomas Hellström Cc: Rodrigo Vivi Cc: intel-xe@lists.freedesktop.org Reviewed-by: Jonathan Cavitt Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20241213172833.1733376-1-John.C.Harrison@Intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 70fb86a85dc9fd66014d7eb2fe356f50702ceeb6) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_devcoredump.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index f8947e7e917e..21a50d539426 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -109,7 +109,11 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_puts(&p, "\n**** GuC CT ****\n"); xe_guc_ct_snapshot_print(ss->guc.ct, &p); - drm_puts(&p, "\n**** Contexts ****\n"); + /* + * Don't add a new section header here because the mesa debug decoder + * tool expects the context information to be in the 'GuC CT' section. + */ + /* drm_puts(&p, "\n**** Contexts ****\n"); */ xe_guc_exec_queue_snapshot_print(ss->ge, &p); drm_puts(&p, "\n**** Job ****\n"); @@ -363,6 +367,15 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char buff[ASCII85_BUFSZ], *line_buff; size_t line_pos = 0; + /* + * Splitting blobs across multiple lines is not compatible with the mesa + * debug decoder tool. Note that even dropping the explicit '\n' below + * doesn't help because the GuC log is so big some underlying implementation + * still splits the lines at 512K characters. So just bail completely for + * the moment. 
+ */ + return; + #define DMESG_MAX_LINE_LEN 800 #define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */ From 528cef1b4170f328d28d4e9b437380d8e5a2d18f Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 13 Dec 2024 13:24:14 +0100 Subject: [PATCH 243/337] drm/xe: Use non-interruptible wait when moving BO to system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure a non-interruptible wait is used when moving a bo to XE_PL_SYSTEM. This prevents dma_mappings from being removed prematurely while a GPU job is still in progress, even if the CPU receives a signal during the operation. Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT") Cc: Thomas Hellström Cc: Matthew Brost Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Suggested-by: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20241213122415.3880017-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit dc5e20ae1f8a7c354dc9833faa2720254e5a5443) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index ae6b337cdc54..1aec4133008e 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -724,7 +724,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, new_mem->mem_type == XE_PL_SYSTEM) { long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, - true, + false, MAX_SCHEDULE_TIMEOUT); if (timeout < 0) { ret = timeout; From 5e0a67fdb894d34c5f109e969320eef9ddae7480 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 13 Dec 2024 13:24:15 +0100 Subject: [PATCH 244/337] drm/xe: Wait for migration job before unmapping pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a potential GPU page fault during tt -> system moves by waiting for migration jobs to complete before unmapping SG. This ensures that IOMMU mappings are not prematurely torn down while a migration job is still in progress. 
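The rule being enforced is generic: whoever tears down a mapping must first wait for every asynchronous user of it. A hedged pthread sketch of that ordering, with invented names and nothing xe-specific (build with -pthread):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static void *migration_job(void *buf)
{
	memset(buf, 0, 4096);	/* stands in for the copy that uses the mapping */
	return NULL;
}

int main(void)
{
	pthread_t job;
	void *mapping = malloc(4096);

	if (!mapping)
		return 1;
	if (pthread_create(&job, NULL, migration_job, mapping) == 0)
		pthread_join(job, NULL);	/* wait for the in-flight job first */
	free(mapping);				/* only now is the teardown safe */
	return 0;
}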
v2: Use intr=false(Matt A) v3: Update commit message(Matt A) v4: s/DMA_RESV_USAGE_BOOKKEEP/DMA_RESV_USAGE_KERNEL(Thomas) Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3466 Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT") Cc: Thomas Hellström Cc: Matthew Brost Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Cc: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20241213122415.3880017-2-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit cda06412c06893a6f07a2fbf89d42a0972ec9e8e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 1aec4133008e..f61a8ef38094 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -848,8 +848,16 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, out: if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) && - ttm_bo->ttm) + ttm_bo->ttm) { + long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, + DMA_RESV_USAGE_KERNEL, + false, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) + ret = timeout; + xe_tt_unmap_sg(ttm_bo->ttm); + } return ret; } From af12ba67d09ebe2b31ab997cea1a930864028562 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 16 Dec 2024 23:32:53 +0100 Subject: [PATCH 245/337] drm/xe/pf: Use correct function to check LMEM provisioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a typo in function call and instead of VF LMEM we were looking at VF GGTT provisioning. Fix that. Fixes: 234670cea9a2 ("drm/xe/pf: Skip fair VFs provisioning if already provisioned") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241216223253.819-1-michal.wajdeczko@intel.com (cherry picked from commit a8d0aa0e7fcd20c9f1992688c0f0d07a68287403) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 192643d63d22..ca49860168f6 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -2046,7 +2046,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_any = valid_any || (valid_ggtt && is_primary); if (IS_DGFX(xe)) { - bool valid_lmem = pf_get_vf_config_ggtt(primary_gt, vfid); + bool valid_lmem = pf_get_vf_config_lmem(primary_gt, vfid); valid_any = valid_any || (valid_lmem && is_primary); valid_all = valid_all && valid_lmem; From fe39b222a4139354d32ff9d46b88757f63f71d63 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Tue, 17 Dec 2024 21:31:21 -0800 Subject: [PATCH 246/337] drm/xe: Fix fault on fd close after unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If userspace holds an fd open, unbinds the device and then closes it, the driver shouldn't try to access the hardware. Protect it by using drm_dev_enter()/drm_dev_exit(). This fixes the following page fault: <6> [IGT] xe_wedged: exiting, ret=98 <1> BUG: unable to handle page fault for address: ffffc901bc5e508c <1> #PF: supervisor read access in kernel mode <1> #PF: error_code(0x0000) - not-present page ... 
<4> xe_lrc_update_timestamp+0x1c/0xd0 [xe] <4> xe_exec_queue_update_run_ticks+0x50/0xb0 [xe] <4> xe_exec_queue_fini+0x16/0xb0 [xe] <4> __guc_exec_queue_fini_async+0xc4/0x190 [xe] <4> guc_exec_queue_fini_async+0xa0/0xe0 [xe] <4> guc_exec_queue_fini+0x23/0x40 [xe] <4> xe_exec_queue_destroy+0xb3/0xf0 [xe] <4> xe_file_close+0xd4/0x1a0 [xe] <4> drm_file_free+0x210/0x280 [drm] <4> drm_close_helper.isra.0+0x6d/0x80 [drm] <4> drm_release_noglobal+0x20/0x90 [drm] Fixes: 514447a12190 ("drm/xe: Stop accumulating LRC timestamp on job_free") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3421 Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20241218053122.2730195-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4ca1fd418338d4d135428a0eb1e16e3b3ce17ee8) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec_queue.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index fd0f3b3c9101..268cd3123be9 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -762,9 +763,11 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) */ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { + struct xe_device *xe = gt_to_xe(q->gt); struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; + int idx; /* * Jobs that are run during driver load may use an exec_queue, but are @@ -774,6 +777,10 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) if (!q->vm || !q->vm->xef) return; + /* Synchronize with unbind while holding the xe file open */ + if (!drm_dev_enter(&xe->drm, &idx)) + return; + xef = q->vm->xef; /* @@ -787,6 +794,8 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; + + drm_dev_exit(idx); } /** From a072ffd896efa6a6c8a0334c712fbc98a63c789c Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Wed, 18 Dec 2024 15:25:58 -0800 Subject: [PATCH 247/337] eth: fbnic: fix csr boundary for RPM RAM section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CSR dump support leverages the FBNIC_BOUNDS macro, which pads the end condition for each section by adding an offset of 1. However, the RPC RAM section, which is dumped differently from other sections, does not rely on this macro and instead directly uses end boundary address. Hence, subtracting 1 from the end address results in skipping a register. 
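Put differently, this is an inclusive-versus-exclusive bound mix-up. A small illustrative loop with invented register numbers (not the real CSR map) shows how treating an already-inclusive end as if it were padded drops the last register:

#include <stdio.h>

#define RPC_RAM_START	0x10U
#define RPC_RAM_END	0x13U			/* last valid register, inclusive */
#define PADDED_END	(RPC_RAM_END + 1)	/* what the +1 padding macro yields */

int main(void)
{
	unsigned int reg;

	/* Padded, exclusive bound: 0x10..0x13 are all dumped. */
	for (reg = RPC_RAM_START; reg < PADDED_END; reg++)
		printf("dump %#x\n", reg);

	/* Treating the unpadded end as if it were padded skips 0x13. */
	for (reg = RPC_RAM_START; reg < RPC_RAM_END; reg++)
		printf("buggy dump %#x\n", reg);

	return 0;
}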
Fixes 3d12862b216d (“eth: fbnic: Add support to dump registers”) Signed-off-by: Mohsin Bashir Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241218232614.439329-1-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_csr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c index 2118901b25e9..aeb9f333f4c7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c @@ -64,7 +64,7 @@ static void fbnic_csr_get_regs_rpc_ram(struct fbnic_dev *fbd, u32 **data_p) u32 i, j; *(data++) = start; - *(data++) = end - 1; + *(data++) = end; /* FBNIC_RPC_TCAM_ACT */ for (i = 0; i < FBNIC_RPC_TCAM_ACT_NUM_ENTRIES; i++) { From 2b6ffcd7873b7e8a62c3e15a6f305bfc747c466b Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Thu, 19 Dec 2024 11:41:19 +0900 Subject: [PATCH 248/337] net: stmmac: restructure the error path of stmmac_probe_config_dt() Current implementation of stmmac_probe_config_dt() does not release the OF node reference obtained by of_parse_phandle() in some error paths. The problem is that some error paths call stmmac_remove_config_dt() to clean up but others use and unwind ladder. These two types of error handling have not kept in sync and have been a recurring source of bugs. Re-write the error handling in stmmac_probe_config_dt() to use an unwind ladder. Consequently, stmmac_remove_config_dt() is not needed anymore, thus remove it. This bug was found by an experimental verification tool that I am developing. Fixes: 4838a5405028 ("net: stmmac: Fix wrapper drivers not detecting PHY") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241219024119.2017012-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/stmmac_platform.c | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3ac32444e492..dc9884130b91 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -405,22 +405,6 @@ static int stmmac_of_get_mac_mode(struct device_node *np) return -ENODEV; } -/** - * stmmac_remove_config_dt - undo the effects of stmmac_probe_config_dt() - * @pdev: platform_device structure - * @plat: driver data platform structure - * - * Release resources claimed by stmmac_probe_config_dt(). 
- */ -static void stmmac_remove_config_dt(struct platform_device *pdev, - struct plat_stmmacenet_data *plat) -{ - clk_disable_unprepare(plat->stmmac_clk); - clk_disable_unprepare(plat->pclk); - of_node_put(plat->phy_node); - of_node_put(plat->mdio_node); -} - /** * stmmac_probe_config_dt - parse device-tree driver parameters * @pdev: platform_device structure @@ -490,8 +474,10 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n"); rc = stmmac_mdio_setup(plat, np, &pdev->dev); - if (rc) - return ERR_PTR(rc); + if (rc) { + ret = ERR_PTR(rc); + goto error_put_phy; + } of_property_read_u32(np, "tx-fifo-depth", &plat->tx_fifo_size); @@ -581,8 +567,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), GFP_KERNEL); if (!dma_cfg) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(-ENOMEM); + ret = ERR_PTR(-ENOMEM); + goto error_put_mdio; } plat->dma_cfg = dma_cfg; @@ -610,8 +596,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) rc = stmmac_mtl_setup(pdev, plat); if (rc) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(rc); + ret = ERR_PTR(rc); + goto error_put_mdio; } /* clock setup */ @@ -663,6 +649,10 @@ error_hw_init: clk_disable_unprepare(plat->pclk); error_pclk_get: clk_disable_unprepare(plat->stmmac_clk); +error_put_mdio: + of_node_put(plat->mdio_node); +error_put_phy: + of_node_put(plat->phy_node); return ret; } @@ -671,16 +661,17 @@ static void devm_stmmac_remove_config_dt(void *data) { struct plat_stmmacenet_data *plat = data; - /* Platform data argument is unused */ - stmmac_remove_config_dt(NULL, plat); + clk_disable_unprepare(plat->stmmac_clk); + clk_disable_unprepare(plat->pclk); + of_node_put(plat->mdio_node); + of_node_put(plat->phy_node); } /** * devm_stmmac_probe_config_dt * @pdev: platform_device structure * @mac: MAC address to use - * Description: Devres variant of stmmac_probe_config_dt(). Does not require - * the user to call stmmac_remove_config_dt() at driver detach. + * Description: Devres variant of stmmac_probe_config_dt(). */ struct plat_stmmacenet_data * devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) From 4f4aa4aa28142d53f8b06585c478476cfe325cfc Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 19 Dec 2024 15:28:59 +0800 Subject: [PATCH 249/337] net: fix memory leak in tcp_conn_request() If inet_csk_reqsk_queue_hash_add() return false, tcp_conn_request() will return without free the dst memory, which allocated in af_ops->route_req. Here is the kmemleak stack: unreferenced object 0xffff8881198631c0 (size 240): comm "softirq", pid 0, jiffies 4299266571 (age 1802.392s) hex dump (first 32 bytes): 00 10 9b 03 81 88 ff ff 80 98 da bc ff ff ff ff ................ 81 55 18 bb ff ff ff ff 00 00 00 00 00 00 00 00 .U.............. backtrace: [] kmem_cache_alloc+0x60c/0xa80 [] dst_alloc+0x55/0x250 [] rt_dst_alloc+0x46/0x1d0 [] __mkroute_output+0x29a/0xa50 [] ip_route_output_key_hash+0x10b/0x240 [] ip_route_output_flow+0x1d/0x90 [] inet_csk_route_req+0x2c5/0x500 [] tcp_conn_request+0x691/0x12c0 [] tcp_rcv_state_process+0x3c8/0x11b0 [] tcp_v4_do_rcv+0x156/0x3b0 [] tcp_v4_rcv+0x1cf8/0x1d80 [] ip_protocol_deliver_rcu+0xf6/0x360 [] ip_local_deliver_finish+0xe6/0x1e0 [] ip_local_deliver+0xee/0x360 [] ip_rcv+0xad/0x2f0 [] __netif_receive_skb_one_core+0x123/0x140 Call dst_release() to free the dst memory when inet_csk_reqsk_queue_hash_add() return false in tcp_conn_request(). 
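The pattern behind the fix is the usual error-path rule: a reference taken earlier in the function must be dropped on every early return. A stripped-down userspace sketch, with an invented refcounted route type standing in for the dst entry:

#include <stdlib.h>

struct route {
	int refcnt;
};

static void route_release(struct route *rt)
{
	if (rt && --rt->refcnt == 0)
		free(rt);
}

/* queue_ok plays the role of the hash-add helper's success/failure. */
static int conn_request(struct route *rt, int queue_ok)
{
	if (!queue_ok) {
		route_release(rt);	/* the release the error path was missing */
		return 0;
	}
	/* Normal path: the reference is handed on and released elsewhere. */
	return 1;
}

int main(void)
{
	struct route *rt = calloc(1, sizeof(*rt));

	if (!rt)
		return 1;
	rt->refcnt = 1;
	conn_request(rt, 0);	/* failure path: rt is released, not leaked */
	return 0;
}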
Fixes: ff46e3b44219 ("Fix race for duplicate reqsk on identical SYN") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241219072859.3783576-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bdf13ac26ef..4811727b8a02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7328,6 +7328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, req->timeout))) { reqsk_free(req); + dst_release(dst); return 0; } From b5a7b661a073727219fedc35f5619f62418ffe72 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 19 Dec 2024 21:03:36 +0800 Subject: [PATCH 250/337] net: Fix netns for ip_tunnel_init_flow() The device denoted by tunnel->parms.link resides in the underlay net namespace. Therefore pass tunnel->net to ip_tunnel_init_flow(). Fixes: db53cd3d88dc ("net: Handle l3mdev in ip_tunnel_init_flow") Signed-off-by: Xiao Liang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241219130336.103839-1-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 +-- net/ipv4/ip_tunnel.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 4b5fd71c897d..32d2e61f2b82 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,8 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0, - 0); + 0, 0, tun->net, parms.link, tun->fwmark, 0, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 25505f9b724c..09b73acf037a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - iph->tos & INET_DSCP_MASK, dev_net(dev), + iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -611,7 +611,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) @@ -774,7 +774,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, - dev_net(dev), READ_ONCE(tunnel->parms.link), + tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) From a4fd163aed2edd967a244499754dec991d8b4c7d Mon Sep 17 00:00:00 2001 From: Ilya Shchipletsov Date: Thu, 19 Dec 2024 08:23:07 +0000 Subject: [PATCH 251/337] netrom: check buffer length before accessing it Syzkaller reports an uninit value read from ax25cmp when sending raw message through ieee802154 implementation. 
===================================================== BUG: KMSAN: uninit-value in ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 nr_dev_get+0x20e/0x450 net/netrom/nr_route.c:601 nr_route_frame+0x1a2/0xfc0 net/netrom/nr_route.c:774 nr_xmit+0x5a/0x1c0 net/netrom/nr_dev.c:144 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3564 __dev_queue_xmit+0x33b8/0x5130 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] raw_sendmsg+0x654/0xc10 net/ieee802154/socket.c:299 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] raw_sendmsg+0x36d/0xc10 net/ieee802154/socket.c:282 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5037 Comm: syz-executor166 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 ===================================================== This issue occurs because the skb buffer is too small, and it's actual allocation is aligned. This hides an actual issue, which is that nr_route_frame does not validate the buffer size before using it. Fix this issue by checking skb->len before accessing any fields in skb->data. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Co-developed-by: Nikita Marushkin Signed-off-by: Nikita Marushkin Signed-off-by: Ilya Shchipletsov Link: https://patch.msgid.link/20241219082308.3942-1-rabbelkin@mail.ru Signed-off-by: Jakub Kicinski --- net/netrom/nr_route.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 2b5e246b8d9a..b94cb2ffbaf8 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -754,6 +754,12 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) int ret; struct sk_buff *skbn; + /* + * Reject malformed packets early. 
Check that it contains at least 2 + * addresses and 1 byte more for Time-To-Live + */ + if (skb->len < 2 * sizeof(ax25_address) + 1) + return 0; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); From 4e86729d1ff329815a6e8a920cb554a1d4cb5b8d Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 19 Dec 2024 19:21:14 +0300 Subject: [PATCH 252/337] net/sctp: Prevent autoclose integer overflow in sctp_association_init() While by default max_autoclose equals to INT_MAX / HZ, one may set net.sctp.max_autoclose to UINT_MAX. There is code in sctp_association_init() that can consequently trigger overflow. Cc: stable@vger.kernel.org Fixes: 9f70f46bd4c7 ("sctp: properly latch and use autoclose value from sock to association") Signed-off-by: Nikolay Kuratov Acked-by: Xin Long Link: https://patch.msgid.link/20241219162114.2863827-1-kniv@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/sctp/associola.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index c45c192b7878..0b0794f164cf 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -137,7 +137,8 @@ static struct sctp_association *sctp_association_init( = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) From 4a4d38ace1fb0586bffd2aab03caaa05d6011748 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 20 Dec 2024 13:26:14 +0530 Subject: [PATCH 253/337] net: ethernet: ti: am65-cpsw: default to round-robin for host port receive The Host Port (i.e. CPU facing port) of CPSW receives traffic from Linux via TX DMA Channels which are Hardware Queues consisting of traffic categorized according to their priority. The Host Port is configured to dequeue traffic from these Hardware Queues on the basis of priority i.e. as long as traffic exists on a Hardware Queue of a higher priority, the traffic on Hardware Queues of lower priority isn't dequeued. An alternate operation is also supported wherein traffic can be dequeued by the Host Port in a Round-Robin manner. Until commit under Fixes, the am65-cpsw driver enabled a single TX DMA Channel, due to which, unless modified by user via "ethtool", all traffic from Linux is transmitted on DMA Channel 0. Therefore, configuring the Host Port for priority based dequeuing or Round-Robin operation is identical since there is a single DMA Channel. Since commit under Fixes, all 8 TX DMA Channels are enabled by default. Additionally, the default "tc mapping" doesn't take into account the possibility of different traffic profiles which various users might have. This results in traffic starvation at the Host Port due to the priority based dequeuing which has been enabled by default since the inception of the driver. The traffic starvation triggers NETDEV WATCHDOG timeout for all TX DMA Channels that haven't been serviced due to the presence of traffic on the higher priority TX DMA Channels. Fix this by defaulting to Round-Robin dequeuing at the Host Port, which shall ensure that traffic is dequeued from all TX DMA Channels irrespective of the traffic profile. This will address the NETDEV WATCHDOG timeouts. 
At the same time, users can still switch from Round-Robin to Priority based dequeuing at the Host Port with the help of the "p0-rx-ptype-rrobin" private flag of "ethtool". Users are expected to setup an appropriate "tc mapping" that suits their traffic profile when switching to priority based dequeuing at the Host Port. Fixes: be397ea3473d ("net: ethernet: am65-cpsw: Set default TX channels to maximum") Cc: Signed-off-by: Siddharth Vadapalli Link: https://patch.msgid.link/20241220075618.228202-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 14e1df721f2e..5465bf872734 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -3551,7 +3551,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) init_completion(&common->tdown_complete); common->tx_ch_num = AM65_CPSW_DEFAULT_TX_CHNS; common->rx_ch_num_flows = AM65_CPSW_DEFAULT_RX_CHN_FLOWS; - common->pf_p0_rx_ptype_rrobin = false; + common->pf_p0_rx_ptype_rrobin = true; common->default_vlan = 1; common->ports = devm_kcalloc(dev, common->port_num, From 75221e96101fa93390d3db5c23e026f5e3565d9b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 20 Dec 2024 18:04:00 +0100 Subject: [PATCH 254/337] net: pse-pd: tps23881: Fix power on/off issue An issue was present in the initial driver implementation. The driver read the power status of all channels before toggling the bit of the desired one. Using the power status register as a base value introduced a problem, because only the bit corresponding to the concerned channel ID should be set in the write-only power enable register. This led to cases where disabling power for one channel also powered off other channels. This patch removes the power status read and ensures the value is limited to the bit matching the channel index of the PI. 
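Sketched differently (the helper names below are illustrative, not part of the driver): the 16-bit register groups four channels per byte, with the power-on bits in the low nibble and the power-off bits in the high nibble, which is why only the single bit for the PI being changed may be written:

  /* Hypothetical helpers mirroring the bit layout implied by the fix. */
  static u16 tps23881_pw_on_bit(u8 chan)
  {
          return chan < 4 ? BIT(chan) : BIT(chan + 4);
  }

  static u16 tps23881_pw_off_bit(u8 chan)
  {
          return chan < 4 ? BIT(chan + 4) : BIT(chan + 8);
  }
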
Fixes: 20e6d190ffe1 ("net: pse-pd: Add TI TPS23881 PSE controller driver") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20241220170400.291705-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/tps23881.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/pse-pd/tps23881.c b/drivers/net/pse-pd/tps23881.c index 5c4e88be46ee..8797ca1a8a21 100644 --- a/drivers/net/pse-pd/tps23881.c +++ b/drivers/net/pse-pd/tps23881.c @@ -64,15 +64,11 @@ static int tps23881_pi_enable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan)); + val = BIT(chan); else - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; @@ -100,15 +96,11 @@ static int tps23881_pi_disable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); else - val = (u16)(ret | BIT(chan + 8)); + val = BIT(chan + 8); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; From 050a4c011b0dfeb91664a5d7bd3647ff38db08ce Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Fri, 20 Dec 2024 10:15:02 +0200 Subject: [PATCH 255/337] net/mlx5: DR, select MSIX vector 0 for completion queue creation When creating a software steering completion queue (CQ), an arbitrary MSIX vector n is selected. This results in the CQ sharing the same Ethernet traffic channel n associated with the chosen vector. However, the value of n is often unpredictable, which can introduce complications for interrupt monitoring and verification tools. Moreover, SW steering uses polling rather than event-driven interrupts. Therefore, there is no need to select any MSIX vector other than the existing vector 0 for CQ creation. In light of these factors, and to enhance predictability, we modify the code to consistently select MSIX vector 0 for CQ creation. 
Fixes: 297cccebdc5a ("net/mlx5: DR, Expose an internal API to issue RDMA operations") Signed-off-by: Shahar Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 6fa06ba2d346..f57c84e5128b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1067,7 +1067,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, int inlen, err, eqn; void *cqc, *in; __be64 *pas; - int vector; u32 i; cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -1096,8 +1095,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, if (!in) goto err_cqwq; - vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev); - err = mlx5_comp_eqn_get(mdev, vector, &eqn); + err = mlx5_comp_eqn_get(mdev, 0, &eqn); if (err) { kvfree(in); goto err_cqwq; From 8c6254479b3d5bd788d2b5fefaa48fb194331ed0 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 20 Dec 2024 10:15:03 +0200 Subject: [PATCH 256/337] net/mlx5e: macsec: Maintain TX SA from encoding_sa In MACsec, it is possible to create multiple active TX SAs on a SC, but only one such SA can be used at a time for transmission. This SA is selected through the encoding_sa link parameter. When there are 2 or more active TX SAs configured (encoding_sa=0): ip macsec add macsec0 tx sa 0 pn 1 on key 00 ip macsec add macsec0 tx sa 1 pn 1 on key 00 ... the traffic should be still sent via TX SA 0 as the encoding_sa was not changed. However, the driver ignores the encoding_sa and overrides it to SA 1 by installing the flow steering id of the newly created TX SA into the SCI -> flow steering id hash map. The future packet tx descriptors will point to the incorrect flow steering rule (SA 1). This patch fixes the issue by avoiding the creation of the flow steering rule for an active TX SA that is not the encoding_sa. The driver side tx_sa object and the FW side macsec object are still created. When the encoding_sa link parameter is changed to another active TX SA, only the new flow steering rule will be created in the mlx5e_macsec_upd_txsa() handler. 
Fixes: 8ff0ac5be144 ("net/mlx5: Add MACsec offload Tx command support") Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Reviewed-by: Lior Nahmanson Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index cc9bcc420032..6ab02f3fc291 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -339,9 +339,13 @@ static int mlx5e_macsec_init_sa_fs(struct macsec_context *ctx, { struct mlx5e_priv *priv = macsec_netdev_priv(ctx->netdev); struct mlx5_macsec_fs *macsec_fs = priv->mdev->macsec_fs; + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; struct mlx5_macsec_rule_attrs rule_attrs; union mlx5_macsec_rule *macsec_rule; + if (is_tx && tx_sc->encoding_sa != sa->assoc_num) + return 0; + rule_attrs.macsec_obj_id = sa->macsec_obj_id; rule_attrs.sci = sa->sci; rule_attrs.assoc_num = sa->assoc_num; From 5a03b368562a7ff5f5f1f63b5adf8309cbdbd5be Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:04 +0200 Subject: [PATCH 257/337] net/mlx5e: Skip restore TC rules for vport rep without loaded flag During driver unload, unregister_netdev is called after unloading vport rep. So, the mlx5e_rep_priv is already freed while trying to get rpriv->netdev, or walk rpriv->tc_ht, which results in use-after-free. So add the checking to make sure access the data of vport rep which is still loaded. Fixes: d1569537a837 ("net/mlx5e: Modify and restore TC rules for IPSec TX rules") Signed-off-by: Jianbo Liu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 --- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c index 5a0047bdcb51..ed977ae75fab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c @@ -150,11 +150,11 @@ void mlx5_esw_ipsec_restore_dest_uplink(struct mlx5_core_dev *mdev) unsigned long i; int err; - xa_for_each(&esw->offloads.vport_reps, i, rep) { - rpriv = rep->rep_data[REP_ETH].priv; - if (!rpriv || !rpriv->netdev) + mlx5_esw_for_each_rep(esw, i, rep) { + if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) continue; + rpriv = rep->rep_data[REP_ETH].priv; rhashtable_walk_enter(&rpriv->tc_ht, &iter); rhashtable_walk_start(&iter); while ((flow = rhashtable_walk_next(&iter)) != NULL) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index a83d41121db6..8573d36785f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -714,6 +714,9 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); MLX5_CAP_GEN_2((esw->dev), ec_vf_vport_base) +\ (last) - 1) +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + struct 
mlx5_eswitch *__must_check mlx5_devlink_eswitch_get(struct devlink *devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d5b42b3a19fd..40359f320724 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -53,9 +53,6 @@ #include "lag/lag.h" #include "en/tc/post_meter.h" -#define mlx5_esw_for_each_rep(esw, i, rep) \ - xa_for_each(&((esw)->offloads.vport_reps), i, rep) - /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. */ From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: [PATCH 258/337] net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. The same process applies to all the caes when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 15 +++++++++++++++ .../mellanox/mlx5/core/eswitch_offloads.c | 2 ++ include/linux/mlx5/driver.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c3..0ec17c276bdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6542,8 +6542,23 @@ static void _mlx5e_remove(struct auxiliary_device *adev) mlx5_core_uplink_netdev_set(mdev, NULL); mlx5e_dcbnl_delete_app(priv); - unregister_netdev(priv->netdev); - _mlx5e_suspend(adev, false); + /* When unload driver, the netdev is in registered state + * if it's from legacy mode. If from switchdev mode, it + * is already unregistered before changing to NIC profile. 
+ */ + if (priv->netdev->reg_state == NETREG_REGISTERED) { + unregister_netdev(priv->netdev); + _mlx5e_suspend(adev, false); + } else { + struct mlx5_core_dev *pos; + int i; + + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5_sd_for_each_dev(i, mdev, pos) + mlx5e_destroy_mdev_resources(pos); + else + _mlx5e_suspend(adev, true); + } /* Avoid cleanup if profile rollback failed. */ if (priv->profile) priv->profile->cleanup(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 554f9cb5b53f..fdff9fd8a89e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1509,6 +1509,21 @@ mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) priv = netdev_priv(netdev); + /* This bit is set when using devlink to change eswitch mode from + * switchdev to legacy. As need to keep uplink netdev ifindex, we + * detach uplink representor profile and attach NIC profile only. + * The netdev will be unregistered later when unload NIC auxiliary + * driver for this case. + * We explicitly block devlink eswitch mode change if any IPSec rules + * offloaded, but can't block other cases, such as driver unload + * and devlink reload. We have to unregister netdev before profile + * change for those cases. This is to avoid resource leak because + * the offloaded rules don't have the chance to be unoffloaded before + * cleanup which is triggered by detach uplink representor profile. + */ + if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_SWITCH_LEGACY)) + unregister_netdev(netdev); + mlx5e_netdev_attach_nic_profile(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 40359f320724..06076dd9ec64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3777,6 +3777,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); + if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f5991168ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. */ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { From d29662695ed7c015521e5fc9387df25aab192a2e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Nov 2024 18:16:49 +0100 Subject: [PATCH 259/337] btrfs: fix use-after-free waiting for encoded read endios Fix a use-after-free in the I/O completion path for encoded reads by using a completion instead of a wait_queue for synchronizing the destruction of 'struct btrfs_encoded_read_private'. 
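The race being closed is, roughly, the window between the endio's final
reference drop and its wake_up() call: the waiting task can observe the
zero count, return from the wait and free the private structure before
wake_up() dereferences it. A minimal sketch of the replacement pattern
(simplified names, not the exact btrfs code; assume the submitter has
already done init_completion() and set the refcount to 1 for itself):

  struct read_private {
          struct completion done;
          refcount_t pending;   /* one ref for the submitter, one per in-flight bio */
  };

  static void read_endio(struct read_private *priv)
  {
          if (refcount_dec_and_test(&priv->pending))
                  complete(&priv->done);
  }

  static void read_wait_and_free(struct read_private *priv)
  {
          if (!refcount_dec_and_test(&priv->pending))
                  wait_for_completion_io(&priv->done);
          /* complete() has finished with priv->done once we get here */
          kfree(priv);
  }

Completions are suited to this because complete() and wait_for_completion()
serialize on the completion's internal lock, so the waiter cannot free the
object while complete() is still touching it.
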
Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 94c8809e8170..6baa0269a85b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9078,9 +9078,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9099,14 +9099,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9126,8 +9126,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9140,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9155,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,8 +9168,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. */ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); From 44f52bbe96dfdbe4aca3818a2534520082a07040 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Dec 2024 16:08:07 +0000 Subject: [PATCH 260/337] btrfs: fix use-after-free when COWing tree bock and tracing is enabled When a COWing a tree block, at btrfs_cow_block(), and we have the tracepoint trace_btrfs_cow_block() enabled and preemption is also enabled (CONFIG_PREEMPT=y), we can trigger a use-after-free in the COWed extent buffer while inside the tracepoint code. This is because in some paths that call btrfs_cow_block(), such as btrfs_search_slot(), we are holding the last reference on the extent buffer @buf so btrfs_force_cow_block() drops the last reference on the @buf extent buffer when it calls free_extent_buffer_stale(buf), which schedules the release of the extent buffer with RCU. 
This means that if we are on a kernel with preemption, the current task may be preempted before calling trace_btrfs_cow_block() and the extent buffer already released by the time trace_btrfs_cow_block() is called, resulting in a use-after-free. Fix this by moving the trace_btrfs_cow_block() from btrfs_cow_block() to btrfs_force_cow_block() before the COWed extent buffer is freed. This also has a side effect of invoking the tracepoint in the tree defrag code, at defrag.c:btrfs_realloc_node(), since btrfs_force_cow_block() is called there, but this is fine and it was actually missing there. Reported-by: syzbot+8517da8635307182c8a5@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6759a9b9.050a0220.1ac542.000d.GAE@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 693dc27ffb89..185985a337b3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -654,6 +654,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +712,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +752,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:22:32 -0800 Subject: [PATCH 261/337] btrfs: check folio mapping after unlock in relocate_one_folio() When we call btrfs_read_folio() to bring a folio uptodate, we unlock the folio. The result of that is that a different thread can modify the mapping (like remove it with invalidate) before we call folio_lock(). This results in an invalid page and we need to try again. 
In particular, if we are relocating concurrently with aborting a transaction, this can result in a crash like the following: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 76 PID: 1411631 Comm: kworker/u322:5 Workqueue: events_unbound btrfs_reclaim_bgs_work RIP: 0010:set_page_extent_mapped+0x20/0xb0 RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246 RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880 RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0 R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000 R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die+0x78/0xc0 ? page_fault_oops+0x2a8/0x3a0 ? __switch_to+0x133/0x530 ? wq_worker_running+0xa/0x40 ? exc_page_fault+0x63/0x130 ? asm_exc_page_fault+0x22/0x30 ? set_page_extent_mapped+0x20/0xb0 relocate_file_extent_cluster+0x1a7/0x940 relocate_data_extent+0xaf/0x120 relocate_block_group+0x20f/0x480 btrfs_relocate_block_group+0x152/0x320 btrfs_relocate_chunk+0x3d/0x120 btrfs_reclaim_bgs_work+0x2ae/0x4e0 process_scheduled_works+0x184/0x370 worker_thread+0xc6/0x3e0 ? blk_add_timer+0xb0/0xb0 kthread+0xae/0xe0 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork+0x2f/0x40 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork_asm+0x11/0x20 This occurs because cleanup_one_transaction() calls destroy_delalloc_inodes() which calls invalidate_inode_pages2() which takes the folio_lock before setting mapping to NULL. We fail to check this, and subsequently call set_extent_mapping(), which assumes that mapping != NULL (in fact it asserts that in debug mode) Note that the "fixes" patch here is not the one that introduced the race (the very first iteration of this code from 2009) but a more recent change that made this particular crash happen in practice. Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f8..db8b42f674b7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* From 0fba7be1ca6df2881e68386e5575fe096f33c4ca Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:33:22 -0800 Subject: [PATCH 262/337] btrfs: check folio mapping after unlock in put_file_data() When we call btrfs_read_folio() we get an unlocked folio, so it is possible for a different thread to concurrently modify folio->mapping. 
We must check that this hasn't happened once we do have the lock. CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc9..498c84323253 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, From 0525064bb82e50d59543b62b9d41a606198a4a44 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Nov 2024 12:25:30 +0000 Subject: [PATCH 263/337] btrfs: fix race with memory mapped writes when activating swap file When activating the swap file we flush all delalloc and wait for ordered extent completion, so that we don't miss any delalloc and extents before we check that the file's extent layout is usable for a swap file and activate the swap file. We are called with the inode's VFS lock acquired, so we won't race with buffered and direct IO writes, however we can still race with memory mapped writes since they don't acquire the inode's VFS lock. The race window is between flushing all delalloc and locking the whole file's extent range, since memory mapped writes lock an extent range with the length of a page. Fix this by acquiring the inode's mmap lock before we flush delalloc. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6baa0269a85b..b2abc0aa5300 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9809,6 +9809,15 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, u64 isize; u64 start; + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); + /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and @@ -9816,22 +9825,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. 
*/ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* @@ -9846,7 +9858,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9860,7 +9873,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9881,7 +9895,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10036,6 +10051,8 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ret) return ret; From 03018e5d8508254534511d40fb57bc150e6a87f2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 12:54:14 +0000 Subject: [PATCH 264/337] btrfs: fix swap file activation failure due to extents that used to be shared When activating a swap file, to determine if an extent is shared we use can_nocow_extent(), which ends up at btrfs_cross_ref_exist(). That helper is meant to be quick because it's used in the NOCOW write path, when flushing delalloc and when doing a direct IO write, however it does return some false positives, meaning it may indicate that an extent is shared even if it's no longer the case. For the write path this is fine, we just do a unnecessary COW operation instead of doing a more rigorous check which would be too heavy (calling btrfs_is_data_extent_shared()). However when activating a swap file, the false positives simply result in a failure, which is confusing for users/applications. One particular case where this happens is when a data extent only has 1 reference but that reference is not inlined in the extent item located in the extent tree - this happens when we create more than 33 references for an extent and then delete those 33 references plus every other non-inline reference except one. The function check_committed_ref() assumes that if the size of an extent item doesn't match the size of struct btrfs_extent_item plus the size of an inline reference (plus an owner reference in case simple quotas are enabled), then the extent is shared - that is not the case however, we can have a single reference but it's not inlined - the reason we do this is to be fast and avoid inspecting non-inline references which may be located in another leaf of the extent tree, slowing down write paths. 
The following test script reproduces the bug: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi NUM_CLONES=50 umount $DEV &> /dev/null run_test() { local sync_after_add_reflinks=$1 local sync_after_remove_reflinks=$2 mkfs.btrfs -f $DEV > /dev/null #mkfs.xfs -f $DEV > /dev/null mount $DEV $MNT touch $MNT/foo chmod 0600 $MNT/foo # On btrfs the file must be NOCOW. chattr +C $MNT/foo &> /dev/null xfs_io -s -c "pwrite -b 1M 0 1M" $MNT/foo mkswap $MNT/foo for ((i = 1; i <= $NUM_CLONES; i++)); do touch $MNT/foo_clone_$i chmod 0600 $MNT/foo_clone_$i # On btrfs the file must be NOCOW. chattr +C $MNT/foo_clone_$i &> /dev/null cp --reflink=always $MNT/foo $MNT/foo_clone_$i done if [ $sync_after_add_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Remove the original file and all clones except the last. rm -f $MNT/foo for ((i = 1; i < $NUM_CLONES; i++)); do rm -f $MNT/foo_clone_$i done if [ $sync_after_remove_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Now use the last clone as a swap file. It should work since # its extent are not shared anymore. swapon $MNT/foo_clone_${NUM_CLONES} swapoff $MNT/foo_clone_${NUM_CLONES} umount $MNT } echo -e "\nTest without sync after creating and removing clones" run_test 0 0 echo -e "\nTest with sync after creating clones" run_test 1 0 echo -e "\nTest with sync after removing clones" run_test 0 1 echo -e "\nTest with sync after creating and removing clones" run_test 1 1 Running the test: $ ./test.sh Test without sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0017 sec (556.793 MiB/sec and 556.7929 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=a6b9c29e-5ef4-4689-a8ac-bc199c750f02 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after creating clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0036 sec (271.739 MiB/sec and 271.7391 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=5e9008d6-1f7a-4948-a1b4-3f30aba20a33 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0103 sec (96.665 MiB/sec and 96.6651 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=916c2740-fa9f-4385-9f06-29c3f89e4764 Test with sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0031 sec (314.268 MiB/sec and 314.2678 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=06aab1dd-4d90-49c0-bd9f-3a8db4e2f912 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Fix this by reworking btrfs_swap_activate() to instead of using extent maps and checking for shared extents with can_nocow_extent(), iterate over the inode's file extent items and use the accurate btrfs_is_data_extent_shared(). 
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 96 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2abc0aa5300..b87f19630b00 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9799,15 +9799,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; /* * Acquire the inode's mmap lock to prevent races with memory mapped @@ -9846,6 +9847,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out_unlock_mmap; } + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; + } + /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't @@ -9904,24 +9912,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). 
+ */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9933,23 +9956,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9984,7 +10029,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10025,20 +10069,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - - start += len; } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10053,6 +10093,8 @@ out: out_unlock_mmap: up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; From 9a45022a0efadd99bcc58f7f1cc2b6fb3b808c40 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:31:41 +0000 Subject: [PATCH 265/337] btrfs: allow swap activation to be interruptible During swap activation we iterate over the extents of a file, then do several checks for each extent, some of which may take some significant time such as checking if an extent is shared. Since a file can have many thousands of extents, this can be a very slow operation and it's currently not interruptible. 
I had a bug during development of a previous patch that resulted in an infinite loop when iterating the extents, so a core was busy looping and I couldn't cancel the operation, which is very annoying and requires a reboot. So make the loop interruptible by checking for fatal signals at the end of each iteration and stopping immediately if there is one. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b87f19630b00..c4675f4345fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10073,6 +10073,11 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, bsi.block_start = physical_block_start; bsi.block_len = len; } + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } } if (bsi.block_len) From 2c8507c63f5498d4ee4af404a8e44ceae4345056 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:43:44 +0000 Subject: [PATCH 266/337] btrfs: avoid monopolizing a core when activating a swap file During swap activation we iterate over the extents of a file and we can have many thousands of them, so we can end up in a busy loop monopolizing a core. Avoid this by doing a voluntary reschedule after processing each extent. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4675f4345fd..623d9d7ab480 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10078,6 +10078,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINTR; goto out; } + + cond_resched(); } if (bsi.block_len) From f2363e6fcc7938c5f0f6ac066fad0dd247598b51 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Wed, 11 Dec 2024 19:13:15 +0800 Subject: [PATCH 267/337] btrfs: fix transaction atomicity bug when enabling simple quotas Set squota incompat bit before committing the transaction that enables the feature. With the config CONFIG_BTRFS_ASSERT enabled, an assertion failure occurs regarding the simple quota feature. [5.596534] assertion failed: btrfs_fs_incompat(fs_info, SIMPLE_QUOTA), in fs/btrfs/qgroup.c:365 [5.597098] ------------[ cut here ]------------ [5.597371] kernel BUG at fs/btrfs/qgroup.c:365! [5.597946] CPU: 1 UID: 0 PID: 268 Comm: mount Not tainted 6.13.0-rc2-00031-gf92f4749861b #146 [5.598450] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [5.599008] RIP: 0010:btrfs_read_qgroup_config+0x74d/0x7a0 [5.604303] [5.605230] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.605538] ? exc_invalid_op+0x56/0x70 [5.605775] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606066] ? asm_exc_invalid_op+0x1f/0x30 [5.606441] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606741] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.607038] ? 
try_to_wake_up+0x317/0x760 [5.607286] open_ctree+0xd9c/0x1710 [5.607509] btrfs_get_tree+0x58a/0x7e0 [5.608002] vfs_get_tree+0x2e/0x100 [5.608224] fc_mount+0x16/0x60 [5.608420] btrfs_get_tree+0x2f8/0x7e0 [5.608897] vfs_get_tree+0x2e/0x100 [5.609121] path_mount+0x4c8/0xbc0 [5.609538] __x64_sys_mount+0x10d/0x150 The issue can be easily reproduced using the following reproducer: root@q:linux# cat repro.sh set -e mkfs.btrfs -q -f /dev/sdb mount /dev/sdb /mnt/btrfs btrfs quota enable -s /mnt/btrfs umount /mnt/btrfs mount /dev/sdb /mnt/btrfs The issue is that when enabling quotas, at btrfs_quota_enable(), we set BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE at fs_info->qgroup_flags and persist it in the quota root in the item with the key BTRFS_QGROUP_STATUS_KEY, but we only set the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA after we commit the transaction used to enable simple quotas. This means that if after that transaction commit we unmount the filesystem without starting and committing any other transaction, or we have a power failure, the next time we mount the filesystem we will find the flag BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE set in the item with the key BTRFS_QGROUP_STATUS_KEY but we will not find the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA set in the superblock, triggering an assertion failure at: btrfs_read_qgroup_config() -> qgroup_read_enable_gen() To fix this issue, set the BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA flag immediately after setting the BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE. This ensures that both flags are flushed to disk within the same transaction. Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Filipe Manana Signed-off-by: Julian Sun Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b1..f9b214992212 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1254,8 +1255,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ From fca432e73db2bec0fdbfbf6d98d3ebcd5388a977 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 18 Dec 2024 17:00:56 +1030 Subject: [PATCH 268/337] btrfs: sysfs: fix direct super block member reads The following sysfs entries are reading super block member directly, which can have a different endian and cause wrong values: - sys/fs/btrfs//nodesize - sys/fs/btrfs//sectorsize - sys/fs/btrfs//clone_alignment Thankfully those values (nodesize and sectorsize) are always aligned inside the btrfs_super_block, so it won't trigger unaligned read errors, just endian problems. Fix them by using the native cached members instead. 
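The underlying problem is byte order: the super block copy keeps the
on-disk little-endian layout, while the fs_info members were converted to
CPU byte order at mount time. A simplified illustration (not the sysfs
code itself; the function name is made up for the example):

  static u32 nodesize_example(const struct btrfs_fs_info *fs_info)
  {
          /* Raw read of a __le32 field: wrong value on big-endian hosts. */
          /* return fs_info->super_copy->nodesize; */

          /* Correct, but converts on every read. */
          /* return le32_to_cpu(fs_info->super_copy->nodesize); */

          /* What the fix uses: the cached, native-endian member. */
          return fs_info->nodesize;
  }
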
Fixes: df93589a1737 ("btrfs: export more from FS_INFO to sysfs") CC: stable@vger.kernel.org Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fdcbf650ac31..7f09b6c9cc2d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: [PATCH 269/337] tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) 
SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..58ad4ead33fc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; From 98feccbf32cfdde8c722bc4587aaa60ee5ac33f0 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 16 Dec 2024 15:32:38 +0800 Subject: [PATCH 270/337] tracing: Prevent bad count for tracing_cpumask_write If a large count is provided, it will trigger a warning in bitmap_parse_user. Also check zero for it. Cc: stable@vger.kernel.org Fixes: 9e01c1b74c953 ("cpumask: convert kernel trace functions") Link: https://lore.kernel.org/20241216073238.2573704-1-lizhi.xu@windriver.com Reported-by: syzbot+0aecfd34fb878546f3fd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0aecfd34fb878546f3fd Tested-by: syzbot+0aecfd34fb878546f3fd@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 957f941a08e7..f8aebcb01e62 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5087,6 +5087,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; + if (count == 0 || count > KMALLOC_MAX_SIZE) + return -EINVAL; + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; From ccfa3131d4a0347988e73638edea5c8281b6d2c7 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 16:57:12 +0900 Subject: [PATCH 271/337] dmaengine: fsl-edma: implement the cleanup path of fsl_edma3_attach_pd() Current implementation of fsl_edma3_attach_pd() does not provide a cleanup path, resulting in a memory leak. For example, dev_pm_domain_detach() is not called after dev_pm_domain_attach_by_id(), and the device link created with the DL_FLAG_STATELESS is not released explicitly. Therefore, provide a cleanup function fsl_edma3_detach_pd() and call it upon failure. Also add a devm_add_action_or_reset() call with this function after a successful fsl_edma3_attach_pd(). 
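In outline, the pattern looks like this (my_attach_pd()/my_detach_pd() are
stand-ins for the driver's helpers, not real functions):

  static int my_attach_pd(struct platform_device *pdev)
  {
          /* attach PM domains, create device links; undo partial work on failure */
          return 0;
  }

  static void my_detach_pd(void *data)
  {
          /* delete device links, dev_pm_domain_detach(), pm_runtime teardown */
  }

  static int my_probe(struct platform_device *pdev)
  {
          int ret;

          ret = my_attach_pd(pdev);
          if (ret)
                  return ret;

          /* runs my_detach_pd() on any later probe failure and on removal */
          ret = devm_add_action_or_reset(&pdev->dev, my_detach_pd, pdev);
          if (ret)
                  return ret;   /* the action has already been executed */

          /* ... rest of probe ... */
          return 0;
  }
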
Fixes: 72f5801a4e2b ("dmaengine: fsl-edma: integrate v3 support") Signed-off-by: Joe Hattori Link: https://lore.kernel.org/r/20241221075712.3297200-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma-common.h | 1 + drivers/dma/fsl-edma-main.c | 41 ++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drivers/dma/fsl-edma-common.h b/drivers/dma/fsl-edma-common.h index ce37e1ee9c46..fe8f103d4a63 100644 --- a/drivers/dma/fsl-edma-common.h +++ b/drivers/dma/fsl-edma-common.h @@ -166,6 +166,7 @@ struct fsl_edma_chan { struct work_struct issue_worker; struct platform_device *pdev; struct device *pd_dev; + struct device_link *pd_dev_link; u32 srcid; struct clk *clk; int priority; diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c index 60de1003193a..1a613236b3e4 100644 --- a/drivers/dma/fsl-edma-main.c +++ b/drivers/dma/fsl-edma-main.c @@ -417,10 +417,33 @@ static const struct of_device_id fsl_edma_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, fsl_edma_dt_ids); +static void fsl_edma3_detach_pd(struct fsl_edma_engine *fsl_edma) +{ + struct fsl_edma_chan *fsl_chan; + int i; + + for (i = 0; i < fsl_edma->n_chans; i++) { + if (fsl_edma->chan_masked & BIT(i)) + continue; + fsl_chan = &fsl_edma->chans[i]; + if (fsl_chan->pd_dev_link) + device_link_del(fsl_chan->pd_dev_link); + if (fsl_chan->pd_dev) { + dev_pm_domain_detach(fsl_chan->pd_dev, false); + pm_runtime_dont_use_autosuspend(fsl_chan->pd_dev); + pm_runtime_set_suspended(fsl_chan->pd_dev); + } + } +} + +static void devm_fsl_edma3_detach_pd(void *data) +{ + fsl_edma3_detach_pd(data); +} + static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_engine *fsl_edma) { struct fsl_edma_chan *fsl_chan; - struct device_link *link; struct device *pd_chan; struct device *dev; int i; @@ -436,15 +459,16 @@ static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_eng pd_chan = dev_pm_domain_attach_by_id(dev, i); if (IS_ERR_OR_NULL(pd_chan)) { dev_err(dev, "Failed attach pd %d\n", i); - return -EINVAL; + goto detach; } - link = device_link_add(dev, pd_chan, DL_FLAG_STATELESS | + fsl_chan->pd_dev_link = device_link_add(dev, pd_chan, DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE); - if (!link) { + if (!fsl_chan->pd_dev_link) { dev_err(dev, "Failed to add device_link to %d\n", i); - return -EINVAL; + dev_pm_domain_detach(pd_chan, false); + goto detach; } fsl_chan->pd_dev = pd_chan; @@ -455,6 +479,10 @@ static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_eng } return 0; + +detach: + fsl_edma3_detach_pd(fsl_edma); + return -EINVAL; } static int fsl_edma_probe(struct platform_device *pdev) @@ -544,6 +572,9 @@ static int fsl_edma_probe(struct platform_device *pdev) ret = fsl_edma3_attach_pd(pdev, fsl_edma); if (ret) return ret; + ret = devm_add_action_or_reset(&pdev->dev, devm_fsl_edma3_detach_pd, fsl_edma); + if (ret) + return ret; } if (drvdata->flags & FSL_EDMA_DRV_TCD64) From 2ac5415022d16d63d912a39a06f32f1f51140261 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 20 Dec 2024 23:23:25 +0100 Subject: [PATCH 272/337] RDMA/rxe: Remove the direct link to net_device The similar patch in siw is in the link: https://git.kernel.org/rdma/rdma/c/16b87037b48889 This problem also occurred in RXE. The following analyze this problem. 
In the following call trace: " BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 Read of size 4 at addr ffff8880554640b0 by task kworker/1:4/5295 CPU: 1 UID: 0 PID: 5295 Comm: kworker/1:4 Not tainted 6.12.0-rc3-syzkaller-00399-g9197b73fd7bb #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: infiniband ib_cache_event_task Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 rxe_query_port+0x12d/0x260 drivers/infiniband/sw/rxe/rxe_verbs.c:60 __ib_query_port drivers/infiniband/core/device.c:2111 [inline] ib_query_port+0x168/0x7d0 drivers/infiniband/core/device.c:2143 ib_cache_update+0x1a9/0xb80 drivers/infiniband/core/cache.c:1494 ib_cache_event_task+0xf3/0x1e0 drivers/infiniband/core/cache.c:1568 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa65/0x1850 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 " 1). In the link [1], " infiniband syz2: set down " This means that on 839.350575, the event ib_cache_event_task was sent and queued in ib_wq. 2). In the link [1], " team0 (unregistering): Port device team_slave_0 removed " This indicates that the net device had been freed before 843.251853. 3). In the link [1], " BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 " This means that on 850.559070, the slab-use-after-free occurred. In summary: on 839.350575 the event ib_cache_event_task was sent and queued in ib_wq; before 843.251853 the net device veth was freed; and on 850.559070 the queued event was executed and dereferenced the already freed net device. Thus, the above call trace occurred.
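As a general sketch of the safe pattern (illustration only, not the patch itself; example_port_is_up() is an invented name), the idea of the fix is to look the net_device up through the ib_device each time and drop the reference as soon as it is no longer needed, instead of keeping a raw rxe->ndev pointer around:

  #include <linux/netdevice.h>
  #include <rdma/ib_verbs.h>

  /* Hypothetical helper: check link state without caching the net_device. */
  static bool example_port_is_up(struct ib_device *ibdev, u32 port_num)
  {
          struct net_device *ndev;
          bool up;

          ndev = ib_device_get_netdev(ibdev, port_num); /* takes a reference */
          if (!ndev)
                  return false;

          up = netif_running(ndev) && netif_carrier_ok(ndev);
          dev_put(ndev); /* drop the reference once done */

          return up;
  }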
[1] https://syzkaller.appspot.com/x/log.txt?x=12e7025f980000 Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20241220222325.2487767-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe.c | 23 +++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe.h | 3 ++- drivers/infiniband/sw/rxe/rxe_mcast.c | 22 ++++++++++++++++++++-- drivers/infiniband/sw/rxe/rxe_net.c | 24 ++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 26 +++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_verbs.h | 11 ++++++++--- 6 files changed, 90 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 255677bc12b2..1ba4a0c8726a 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -40,6 +40,8 @@ void rxe_dealloc(struct ib_device *ib_dev) /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe) { + struct net_device *ndev; + rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; @@ -71,8 +73,15 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); + + dev_put(ndev); rxe->max_ucontext = RXE_MAX_UCONTEXT; } @@ -109,10 +118,15 @@ static void rxe_init_port_param(struct rxe_port *port) static void rxe_init_ports(struct rxe_dev *rxe) { struct rxe_port *port = &rxe->port; + struct net_device *ndev; rxe_init_port_param(port); + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; addrconf_addr_eui48((unsigned char *)&port->port_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); + dev_put(ndev); spin_lock_init(&port->port_lock); } @@ -167,12 +181,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. 
*/ -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev) { rxe_init(rxe); rxe_set_mtu(rxe, mtu); - return rxe_register_device(rxe, ibdev_name); + return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index d8fb2c7af30a..fe7f97066732 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -139,7 +139,8 @@ enum resp_states { void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev); void rxe_rcv(struct sk_buff *skb); diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 86cc2e18a7fd..07ff47bae31d 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -31,10 +31,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_add(rxe->ndev, ll_addr); + ret = dev_mc_add(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** @@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_del(rxe->ndev, ll_addr); + ret = dev_mc_del(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 75d1407db52d..8cc64ceeb356 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -524,7 +524,16 @@ out: */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { - return rxe->ndev->name; + struct net_device *ndev; + char *ndev_name; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return NULL; + ndev_name = ndev->name; + dev_put(ndev); + + return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) @@ -536,10 +545,9 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) if (!rxe) return -ENOMEM; - rxe->ndev = ndev; ib_mark_name_assigned_by_user(&rxe->ib_dev); - err = rxe_add(rxe, ndev->mtu, ibdev_name); + err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; @@ -587,10 +595,18 @@ void rxe_port_down(struct rxe_dev *rxe) void rxe_set_port_state(struct rxe_dev *rxe) { - if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) rxe_port_up(rxe); else rxe_port_down(rxe); + + dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 5c18f7e342f2..8a5fc20fd186 100644 --- 
a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); + struct net_device *ndev; int err, ret; if (port_num != 1) { @@ -49,6 +50,12 @@ static int rxe_query_port(struct ib_device *ibdev, goto err_out; } + ndev = rxe_ib_device_get_netdev(ibdev); + if (!ndev) { + err = -ENODEV; + goto err_out; + } + memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); @@ -57,13 +64,14 @@ static int rxe_query_port(struct ib_device *ibdev, if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; - else if (dev_get_flags(rxe->ndev) & IFF_UP) + else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); + dev_put(ndev); return ret; err_out: @@ -1425,9 +1433,16 @@ static const struct attribute_group rxe_attr_group = { static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(ib_dev); + if (!ndev) + return -ENODEV; rxe_set_port_state(rxe); - dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); + + dev_put(ndev); return 0; } @@ -1495,7 +1510,8 @@ static const struct ib_device_ops rxe_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; @@ -1507,13 +1523,13 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); - err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); + err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 3c1354f82283..6573ceec0ef5 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -370,6 +370,7 @@ struct rxe_port { u32 qp_gsi_index; }; +#define RXE_PORT 1 struct rxe_dev { struct ib_device ib_dev; struct ib_device_attr attr; @@ -377,8 +378,6 @@ struct rxe_dev { int max_inline_data; struct mutex usdev_lock; - struct net_device *ndev; - struct rxe_pool uc_pool; struct rxe_pool pd_pool; struct rxe_pool ah_pool; @@ -406,6 +405,11 @@ struct rxe_dev { struct crypto_shash *tfm; }; +static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) +{ + return ib_device_get_netdev(dev, RXE_PORT); +} + static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) { atomic64_inc(&rxe->stats_counters[index]); @@ -471,6 +475,7 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) return to_rpd(mw->ibmw.pd); } -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev); #endif /* RXE_VERBS_H */ From 
385a95cc72941c7f88630a7bc4176048cc03b395 Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 16 Dec 2024 23:45:54 +0530 Subject: [PATCH 273/337] drm/i915/cx0_phy: Fix C10 pll programming sequence According to spec VDR_CUSTOM_WIDTH register gets programmed after pll specific VDR registers and TX Lane programming registers are done. Moreover we only program into C10_VDR_CONTROL1 to update config and setup master lane once all VDR registers are written into. Bspec: 67636 Fixes: 51390cc0e00a ("drm/i915/mtl: Add Support for C10 PHY message bus and pll programming") Signed-off-by: Suraj Kandpal Reviewed-by: Ankit Nautiyal Link: https://patchwork.freedesktop.org/patch/msgid/20241216181554.2861381-1-suraj.kandpal@intel.com (cherry picked from commit f9d418552ba1e3a0e92487ff82eb515dab7516c0) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_cx0_phy.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c index 71dc659228ab..0c7aee13495a 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c @@ -2115,14 +2115,6 @@ static void intel_c10_pll_program(struct intel_display *display, 0, C10_VDR_CTRL_MSGBUS_ACCESS, MB_WRITE_COMMITTED); - /* Custom width needs to be programmed to 0 for both the phy lanes */ - intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CUSTOM_WIDTH, - C10_VDR_CUSTOM_WIDTH_MASK, C10_VDR_CUSTOM_WIDTH_8_10, - MB_WRITE_COMMITTED); - intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CONTROL(1), - 0, C10_VDR_CTRL_UPDATE_CFG, - MB_WRITE_COMMITTED); - /* Program the pll values only for the master lane */ for (i = 0; i < ARRAY_SIZE(pll_state->pll); i++) intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_PLL(i), @@ -2132,6 +2124,10 @@ static void intel_c10_pll_program(struct intel_display *display, intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_CMN(0), pll_state->cmn, MB_WRITE_COMMITTED); intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_TX(0), pll_state->tx, MB_WRITE_COMMITTED); + /* Custom width needs to be programmed to 0 for both the phy lanes */ + intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CUSTOM_WIDTH, + C10_VDR_CUSTOM_WIDTH_MASK, C10_VDR_CUSTOM_WIDTH_8_10, + MB_WRITE_COMMITTED); intel_cx0_rmw(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_CONTROL(1), 0, C10_VDR_CTRL_MASTER_LANE | C10_VDR_CTRL_UPDATE_CFG, MB_WRITE_COMMITTED); From 20e7c5313ffbf11c34a46395345677adbe890bee Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Thu, 19 Dec 2024 16:00:19 -0500 Subject: [PATCH 274/337] drm/i915/dg1: Fix power gate sequence. sub-pipe PG is not present on DG1. Setting these bits can disable other power gates and cause GPU hangs on video playbacks. 
VLK: 16314, 4304 Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13381 Fixes: 85a12d7eb8fe ("drm/i915/tgl: Fix Media power gate sequence.") Cc: Vinay Belgaumkar Cc: Himal Prasad Ghimiray Reviewed-by: Vinay Belgaumkar Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241219210019.70532-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit de7061947b4ed4be857d452c60d5fb795831d79e) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_rc6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index c864d101faf9..9378d5901c49 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -133,7 +133,7 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) GEN9_MEDIA_PG_ENABLE | GEN11_MEDIA_SAMPLER_PG_ENABLE; - if (GRAPHICS_VER(gt->i915) >= 12) { + if (GRAPHICS_VER(gt->i915) >= 12 && !IS_DG1(gt->i915)) { for (i = 0; i < I915_MAX_VCS; i++) if (HAS_ENGINE(gt, _VCS(i))) pg_enable |= (VDN_HCP_POWERGATE_ENABLE(i) | From 362f1bf98a3ecb5a2a4fcbdaa9718c8403beceb2 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Fri, 11 Oct 2024 22:57:59 +0200 Subject: [PATCH 275/337] dmaengine: mv_xor: fix child node refcount handling in early exit The for_each_child_of_node() loop requires explicit calls to of_node_put() to decrement the child's refcount upon early exits (break, goto, return). Add the missing calls in the two early exits before the goto instructions. Cc: stable@vger.kernel.org Fixes: f7d12ef53ddf ("dma: mv_xor: add Device Tree binding") Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241011-dma_mv_xor_of_node_put-v1-1-3c2de819f463@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/mv_xor.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 43efce77bb57..40b76b40bc30 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1388,6 +1388,7 @@ static int mv_xor_probe(struct platform_device *pdev) irq = irq_of_parse_and_map(np, 0); if (!irq) { ret = -ENODEV; + of_node_put(np); goto err_channel_add; } @@ -1396,6 +1397,7 @@ static int mv_xor_probe(struct platform_device *pdev) if (IS_ERR(chan)) { ret = PTR_ERR(chan); irq_dispose_mapping(irq); + of_node_put(np); goto err_channel_add; } From ebc008699fd95701c9af5ebaeb0793eef81a71d5 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Thu, 12 Dec 2024 18:14:12 +0530 Subject: [PATCH 276/337] dmaengine: tegra: Return correct DMA status when paused Currently, the driver does not return the correct DMA status when a DMA pause is issued by the client drivers. This causes GPCDMA users to assume that DMA is still running, while in reality, the DMA is paused. Return DMA_PAUSED for tx_status() if the channel is paused in the middle of a transfer. 
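For context, a hypothetical client-side sketch (example_pause_and_check() is invented, not taken from the patch) of how a dmaengine consumer would now observe the paused state through the usual status query:

  #include <linux/dmaengine.h>
  #include <linux/printk.h>

  static int example_pause_and_check(struct dma_chan *chan, dma_cookie_t cookie)
  {
          struct dma_tx_state state;
          enum dma_status status;
          int ret;

          ret = dmaengine_pause(chan);
          if (ret)
                  return ret;

          status = dmaengine_tx_status(chan, cookie, &state);
          if (status == DMA_PAUSED)
                  pr_info("transfer paused, %u bytes left\n", state.residue);

          return 0;
  }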
Fixes: ee17028009d4 ("dmaengine: tegra: Add tegra gpcdma driver") Cc: stable@vger.kernel.org Signed-off-by: Akhil R Signed-off-by: Kartik Rajput Link: https://lore.kernel.org/r/20241212124412.5650-1-kkartik@nvidia.com Signed-off-by: Vinod Koul --- drivers/dma/tegra186-gpc-dma.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/dma/tegra186-gpc-dma.c b/drivers/dma/tegra186-gpc-dma.c index cacf3757adc2..4d6fe0efa76e 100644 --- a/drivers/dma/tegra186-gpc-dma.c +++ b/drivers/dma/tegra186-gpc-dma.c @@ -231,6 +231,7 @@ struct tegra_dma_channel { bool config_init; char name[30]; enum dma_transfer_direction sid_dir; + enum dma_status status; int id; int irq; int slave_id; @@ -393,6 +394,8 @@ static int tegra_dma_pause(struct tegra_dma_channel *tdc) tegra_dma_dump_chan_regs(tdc); } + tdc->status = DMA_PAUSED; + return ret; } @@ -419,6 +422,8 @@ static void tegra_dma_resume(struct tegra_dma_channel *tdc) val = tdc_read(tdc, TEGRA_GPCDMA_CHAN_CSRE); val &= ~TEGRA_GPCDMA_CHAN_CSRE_PAUSE; tdc_write(tdc, TEGRA_GPCDMA_CHAN_CSRE, val); + + tdc->status = DMA_IN_PROGRESS; } static int tegra_dma_device_resume(struct dma_chan *dc) @@ -544,6 +549,7 @@ static void tegra_dma_xfer_complete(struct tegra_dma_channel *tdc) tegra_dma_sid_free(tdc); tdc->dma_desc = NULL; + tdc->status = DMA_COMPLETE; } static void tegra_dma_chan_decode_error(struct tegra_dma_channel *tdc, @@ -716,6 +722,7 @@ static int tegra_dma_terminate_all(struct dma_chan *dc) tdc->dma_desc = NULL; } + tdc->status = DMA_COMPLETE; tegra_dma_sid_free(tdc); vchan_get_all_descriptors(&tdc->vc, &head); spin_unlock_irqrestore(&tdc->vc.lock, flags); @@ -769,6 +776,9 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc, if (ret == DMA_COMPLETE) return ret; + if (tdc->status == DMA_PAUSED) + ret = DMA_PAUSED; + spin_lock_irqsave(&tdc->vc.lock, flags); vd = vchan_find_desc(&tdc->vc, cookie); if (vd) { From fe4bfa9b6d7bd752bfe4700c937f235aa8ce997b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:41 +0800 Subject: [PATCH 277/337] phy: core: Fix that API devm_phy_put() fails to release the phy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For devm_phy_put(), its comment says it needs to invoke phy_put() to release the phy, but it will not actually invoke the function since devres_destroy() does not call devm_phy_release(), and the missing phy_put() call will cause: - The phy fails to be released. - devm_phy_put() can not fully undo what API devm_phy_get() does. - Leak refcount of both the module and device for below typical usage: devm_phy_get(); // or its variant ... err = do_something(); if (err) goto err_out; ... err_out: devm_phy_put(); // leak refcount here The file(s) affected by this issue are shown below since they have such typical usage. drivers/pci/controller/cadence/pcie-cadence.c drivers/net/ethernet/ti/am65-cpsw-nuss.c Fix by using devres_release() instead of devres_destroy() within the API. Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Cc: stable@vger.kernel.org Cc: Lorenzo Pieralisi Cc: Krzysztof Wilczyński Cc: Bjorn Helgaas Cc: David S. 
Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-1-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index f053b525ccff..f190d7126613 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -737,7 +737,7 @@ void devm_phy_put(struct device *dev, struct phy *phy) if (!phy) return; - r = devres_destroy(dev, devm_phy_release, devm_phy_match, phy); + r = devres_release(dev, devm_phy_release, devm_phy_match, phy); dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n"); } EXPORT_SYMBOL_GPL(devm_phy_put); From c0b82ab95b4f1fbc3e3aeab9d829d012669524b6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:42 +0800 Subject: [PATCH 278/337] phy: core: Fix that API devm_of_phy_provider_unregister() fails to unregister the phy provider For devm_of_phy_provider_unregister(), its comment says it needs to invoke of_phy_provider_unregister() to unregister the phy provider, but it will not actually invoke the function since devres_destroy() does not call devm_phy_provider_release(), and the missing of_phy_provider_unregister() call will cause: - The phy provider fails to be unregistered. - Leak both memory and the OF node refcount. Fortunately, the faulty API has not been used by current kernel tree. Fix by using devres_release() instead of devres_destroy() within the API. Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/stable/20241213-phy_core_fix-v6-2-40ae28f5015a%40quicinc.com Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-2-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index f190d7126613..de07e1616b34 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -1259,12 +1259,12 @@ EXPORT_SYMBOL_GPL(of_phy_provider_unregister); * of_phy_provider_unregister to unregister the phy provider. */ void devm_of_phy_provider_unregister(struct device *dev, - struct phy_provider *phy_provider) + struct phy_provider *phy_provider) { int r; - r = devres_destroy(dev, devm_phy_provider_release, devm_phy_match, - phy_provider); + r = devres_release(dev, devm_phy_provider_release, devm_phy_match, + phy_provider); dev_WARN_ONCE(dev, r, "couldn't find PHY provider device resource\n"); } EXPORT_SYMBOL_GPL(devm_of_phy_provider_unregister); From 4dc48c88fcf82b89fdebd83a906aaa64f40fb8a9 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:43 +0800 Subject: [PATCH 279/337] phy: core: Fix that API devm_phy_destroy() fails to destroy the phy For devm_phy_destroy(), its comment says it needs to invoke phy_destroy() to destroy the phy, but it will not actually invoke the function since devres_destroy() does not call devm_phy_consume(), and the missing phy_destroy() call will cause that the phy fails to be destroyed. Fortunately, the faulty API has not been used by current kernel tree. Fix by using devres_release() instead of devres_destroy() within the API. 
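The distinction that this and the two following phy-core fixes rely on can be illustrated with a small sketch (the example_* resource, match and release functions are invented): devres_destroy() only removes the managed-resource entry, whereas devres_release() removes it and also invokes its release callback.

  #include <linux/device.h>

  struct example_res {
          void *handle; /* placeholder for whatever was acquired */
  };

  static void example_release(struct device *dev, void *res)
  {
          /* undo the acquisition here, e.g. a phy_put()-style call */
  }

  static int example_match(struct device *dev, void *res, void *match_data)
  {
          struct example_res *r = res;

          return r->handle == match_data;
  }

  static void example_devm_put(struct device *dev, void *handle)
  {
          int ret;

          /*
           * devres_destroy() would drop the bookkeeping entry but never call
           * example_release(), leaking the underlying object; devres_release()
           * removes the entry and runs the release callback.
           */
          ret = devres_release(dev, example_release, example_match, handle);
          dev_WARN_ONCE(dev, ret, "couldn't find example resource\n");
  }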
Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-3-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index de07e1616b34..52ca590a58b9 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -1121,7 +1121,7 @@ void devm_phy_destroy(struct device *dev, struct phy *phy) { int r; - r = devres_destroy(dev, devm_phy_consume, devm_phy_match, phy); + r = devres_release(dev, devm_phy_consume, devm_phy_match, phy); dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n"); } EXPORT_SYMBOL_GPL(devm_phy_destroy); From 5ebdc6be16c2000e37fcb8b4072d442d268ad492 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:44 +0800 Subject: [PATCH 280/337] phy: core: Fix an OF node refcount leakage in _of_phy_get() _of_phy_get() will directly return when suffers of_device_is_compatible() error, but it forgets to decrease refcount of OF node @args.np before error return, the refcount was increased by previous of_parse_phandle_with_args() so causes the OF node's refcount leakage. Fix by decreasing the refcount via of_node_put() before the error return. Fixes: b7563e2796f8 ("phy: work around 'phys' references to usb-nop-xceiv devices") Cc: stable@vger.kernel.org Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-4-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 52ca590a58b9..b88fbda6c046 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -629,8 +629,10 @@ static struct phy *_of_phy_get(struct device_node *np, int index) return ERR_PTR(-ENODEV); /* This phy type handled by the usb-phy subsystem for now */ - if (of_device_is_compatible(args.np, "usb-nop-xceiv")) - return ERR_PTR(-ENODEV); + if (of_device_is_compatible(args.np, "usb-nop-xceiv")) { + phy = ERR_PTR(-ENODEV); + goto out_put_node; + } mutex_lock(&phy_provider_mutex); phy_provider = of_phy_provider_lookup(args.np); @@ -652,6 +654,7 @@ out_put_module: out_unlock: mutex_unlock(&phy_provider_mutex); +out_put_node: of_node_put(args.np); return phy; From a2d633cb1421e679b56f1a9fe1f42f089706f1ed Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:45 +0800 Subject: [PATCH 281/337] phy: core: Fix an OF node refcount leakage in of_phy_provider_lookup() For macro for_each_child_of_node(parent, child), refcount of @child has been increased before entering its loop body, so normally needs to call of_node_put(@child) before returning from the loop body to avoid refcount leakage. of_phy_provider_lookup() has such usage but does not call of_node_put() before returning, so cause leakage of the OF node refcount. Fix by simply calling of_node_put() before returning from the loop body. The APIs affected by this issue are shown below since they indirectly invoke problematic of_phy_provider_lookup(). 
phy_get() of_phy_get() devm_phy_get() devm_of_phy_get() devm_of_phy_get_by_index() Fixes: 2a4c37016ca9 ("phy: core: Fix of_phy_provider_lookup to return PHY provider for sub node") Cc: stable@vger.kernel.org Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-5-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index b88fbda6c046..413f76e2d174 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -145,8 +145,10 @@ static struct phy_provider *of_phy_provider_lookup(struct device_node *node) return phy_provider; for_each_child_of_node(phy_provider->children, child) - if (child == node) + if (child == node) { + of_node_put(child); return phy_provider; + } } return ERR_PTR(-EPROBE_DEFER); From 739214dd1c209e34323814fb815fb17cccb9f95b Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Sun, 15 Dec 2024 16:05:55 -0600 Subject: [PATCH 282/337] phy: freescale: fsl-samsung-hdmi: Fix 64-by-32 division cocci warnings The Kernel test robot returns the following warning: do_div() does a 64-by-32 division, please consider using div64_ul instead. To prevent the 64-by-32 divsion, consolidate both the multiplication and the do_div into one line which explicitly uses u64 sizes. Fixes: 1951dbb41d1d ("phy: freescale: fsl-samsung-hdmi: Support dynamic integer") Signed-off-by: Adam Ford Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412091243.fSObwwPi-lkp@intel.com/ Link: https://lore.kernel.org/r/20241215220555.99113-1-aford173@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/freescale/phy-fsl-samsung-hdmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c index 2c8038864357..d3ccf547ba1c 100644 --- a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c +++ b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c @@ -424,8 +424,7 @@ static unsigned long fsl_samsung_hdmi_phy_find_pms(unsigned long fout, u8 *p, u1 * Fvco = (M * f_ref) / P, * where f_ref is 24MHz. 
*/ - tmp = (u64)_m * 24 * MHZ; - do_div(tmp, _p); + tmp = div64_ul((u64)_m * 24 * MHZ, _p); if (tmp < 750 * MHZ || tmp > 3000 * MHZ) continue; From 17194c2998d39ab366a2ecbc4d1f3281e00d6a05 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 09:30:51 +0100 Subject: [PATCH 283/337] phy: mediatek: phy-mtk-hdmi: add regulator dependency The driver no longer builds when regulator support is unavailable: arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi.o: in function `mtk_hdmi_phy_register_regulators': phy-mtk-hdmi.c:(.text.unlikely+0x3e): undefined reference to `devm_regulator_register' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_is_enabled': phy-mtk-hdmi-mt8195.c:(.text+0x326): undefined reference to `rdev_get_drvdata' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_disable': phy-mtk-hdmi-mt8195.c:(.text+0x346): undefined reference to `rdev_get_drvdata' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_enable': Fixes: 49393b2da1cd ("phy: mediatek: phy-mtk-hdmi: Register PHY provided regulator") Signed-off-by: Arnd Bergmann Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20241213083056.2596499-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/mediatek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/mediatek/Kconfig b/drivers/phy/mediatek/Kconfig index 60e00057e8bc..ba6461350951 100644 --- a/drivers/phy/mediatek/Kconfig +++ b/drivers/phy/mediatek/Kconfig @@ -65,6 +65,7 @@ config PHY_MTK_HDMI depends on ARCH_MEDIATEK || COMPILE_TEST depends on COMMON_CLK depends on OF + depends on REGULATOR select GENERIC_PHY help Support HDMI PHY for Mediatek SoCs. From 542ed8145e6f9392e3d0a86a0e9027d2ffd183e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Dec 2024 00:29:20 +0100 Subject: [PATCH 284/337] netfilter: nft_set_hash: unaligned atomic read on struct nft_set_ext Access to genmask field in struct nft_set_ext results in unaligned atomic read: [ 72.130109] Unable to handle kernel paging request at virtual address ffff0000c2bb708c [ 72.131036] Mem abort info: [ 72.131213] ESR = 0x0000000096000021 [ 72.131446] EC = 0x25: DABT (current EL), IL = 32 bits [ 72.132209] SET = 0, FnV = 0 [ 72.133216] EA = 0, S1PTW = 0 [ 72.134080] FSC = 0x21: alignment fault [ 72.135593] Data abort info: [ 72.137194] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [ 72.142351] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 72.145989] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 72.150115] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000237d27000 [ 72.154893] [ffff0000c2bb708c] pgd=0000000000000000, p4d=180000023ffff403, pud=180000023f84b403, pmd=180000023f835403, +pte=0068000102bb7707 [ 72.163021] Internal error: Oops: 0000000096000021 [#1] SMP [...] 
[ 72.170041] CPU: 7 UID: 0 PID: 54 Comm: kworker/7:0 Tainted: G E 6.13.0-rc3+ #2 [ 72.170509] Tainted: [E]=UNSIGNED_MODULE [ 72.170720] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202302-for-qemu 03/01/2023 [ 72.171192] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 72.171552] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 72.171915] pc : nft_rhash_gc+0x200/0x2d8 [nf_tables] [ 72.172166] lr : nft_rhash_gc+0x128/0x2d8 [nf_tables] [ 72.172546] sp : ffff800081f2bce0 [ 72.172724] x29: ffff800081f2bd40 x28: ffff0000c2bb708c x27: 0000000000000038 [ 72.173078] x26: ffff0000c6780ef0 x25: ffff0000c643df00 x24: ffff0000c6778f78 [ 72.173431] x23: 000000000000001a x22: ffff0000c4b1f000 x21: ffff0000c6780f78 [ 72.173782] x20: ffff0000c2bb70dc x19: ffff0000c2bb7080 x18: 0000000000000000 [ 72.174135] x17: ffff0000c0a4e1c0 x16: 0000000000003000 x15: 0000ac26d173b978 [ 72.174485] x14: ffffffffffffffff x13: 0000000000000030 x12: ffff0000c6780ef0 [ 72.174841] x11: 0000000000000000 x10: ffff800081f2bcf8 x9 : ffff0000c3000000 [ 72.175193] x8 : 00000000000004be x7 : 0000000000000000 x6 : 0000000000000000 [ 72.175544] x5 : 0000000000000040 x4 : ffff0000c3000010 x3 : 0000000000000000 [ 72.175871] x2 : 0000000000003a98 x1 : ffff0000c2bb708c x0 : 0000000000000004 [ 72.176207] Call trace: [ 72.176316] nft_rhash_gc+0x200/0x2d8 [nf_tables] (P) [ 72.176653] process_one_work+0x178/0x3d0 [ 72.176831] worker_thread+0x200/0x3f0 [ 72.176995] kthread+0xe8/0xf8 [ 72.177130] ret_from_fork+0x10/0x20 [ 72.177289] Code: 54fff984 d503201f d2800080 91003261 (f820303f) [ 72.177557] ---[ end trace 0000000000000000 ]--- Align struct nft_set_ext to word size to address this and documentation it. pahole reports that this increases the size of elements for rhash and pipapo in 8 bytes on x86_64. Fixes: 7ffc7481153b ("netfilter: nft_set_hash: skip duplicated elements pending gc run") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4afa64c81304..0027beca5cd5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -733,15 +733,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { From dcd59d0d7d51b2a4b768fc132b0d74a97dfd6d6a Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Tue, 24 Dec 2024 12:55:58 -0600 Subject: [PATCH 285/337] platform/chrome: cros_ec_lpc: fix product identity for early Framework Laptops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The product names for the Framework Laptop (12th and 13th Generation Intel Core) are incorrect as of 62be134abf42. Fixes: 62be134abf42 ("platform/chrome: cros_ec_lpc: switch primary DMI data for Framework Laptop") Cc: stable@vger.kernel.org # 6.12.x Signed-off-by: Dustin L. 
Howett Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241224-platform-chrome-cros_ec_lpc-fix-product-identity-for-early-framework-laptops-v1-1-0d31d6e1d22c@howett.net Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_lpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 924bf4d3cc77..8470b7f2b135 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -707,7 +707,7 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = { /* Framework Laptop (12th Gen Intel Core) */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Framework"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "12th Gen Intel Core"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Laptop (12th Gen Intel Core)"), }, .driver_data = (void *)&framework_laptop_mec_lpc_driver_data, }, @@ -715,7 +715,7 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = { /* Framework Laptop (13th Gen Intel Core) */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Framework"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "13th Gen Intel Core"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Laptop (13th Gen Intel Core)"), }, .driver_data = (void *)&framework_laptop_mec_lpc_driver_data, }, From 768776dd4efc681cdca33a79e29bb508d6de9bc0 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 16 Dec 2024 16:16:40 +0100 Subject: [PATCH 286/337] i2c: imx: fix missing stop condition in single-master mode A regression was introduced with the implementation of single-master mode, preventing proper stop conditions from being generated. Devices that require a valid stop condition, such as EEPROMs, fail to function correctly as a result. The issue only affects devices with the single-master property enabled. This commit resolves the issue by re-enabling I2C bus busy bit (IBB) polling for single-master mode when generating a stop condition. The fix further ensures that the i2c_imx->stopped flag is cleared at the start of each transfer, allowing the stop condition to be correctly generated in i2c_imx_stop(). According to the reference manual (IMX8MMRM, Rev. 2, 09/2019, page 5270), polling the IBB bit to determine if the bus is free is only necessary in multi-master mode. Consequently, the IBB bit is not polled for the start condition in single-master mode. 
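To illustrate why a proper stop matters (a hypothetical client-side example, not taken from the patch; example_eeprom_write_byte() is an invented helper), a typical EEPROM byte write is a single transfer that the device only commits once the adapter generates a stop condition on the bus:

  #include <linux/i2c.h>

  static int example_eeprom_write_byte(struct i2c_adapter *adap, u16 addr,
                                       u8 offset, u8 val)
  {
          u8 buf[2] = { offset, val };
          struct i2c_msg msg = {
                  .addr  = addr,
                  .flags = 0, /* write */
                  .len   = sizeof(buf),
                  .buf   = buf,
          };
          int ret;

          /* The adapter must end this transfer with a STOP, otherwise the
           * EEPROM never starts its internal write cycle. */
          ret = i2c_transfer(adap, &msg, 1);
          if (ret < 0)
                  return ret;

          return ret == 1 ? 0 : -EIO;
  }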
Fixes: 6692694aca86 ("i2c: imx: do not poll for bus busy in single master mode") Signed-off-by: Stefan Eichenberger Reviewed-by: Frank Li Reviewed-by: Francesco Dolcini Link: https://lore.kernel.org/r/20241216151829.74056-1-eichest@gmail.com Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index f751d231ded8..488ee3511314 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -532,22 +532,20 @@ static void i2c_imx_dma_free(struct imx_i2c_struct *i2c_imx) static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy, bool atomic) { + bool multi_master = i2c_imx->multi_master; unsigned long orig_jiffies = jiffies; unsigned int temp; - if (!i2c_imx->multi_master) - return 0; - while (1) { temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); /* check for arbitration lost */ - if (temp & I2SR_IAL) { + if (multi_master && (temp & I2SR_IAL)) { i2c_imx_clear_irq(i2c_imx, I2SR_IAL); return -EAGAIN; } - if (for_busy && (temp & I2SR_IBB)) { + if (for_busy && (!multi_master || (temp & I2SR_IBB))) { i2c_imx->stopped = 0; break; } From e0cec363197e41af870613e8e17b30bf0e3d41b5 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Wed, 18 Dec 2024 12:42:38 +0800 Subject: [PATCH 287/337] i2c: imx: add imx7d compatible string for applying erratum ERR007805 The compatible string "fsl,imx7d-i2c" does not exist in the i2c-imx driver's compatible string table. As a result, "fsl,imx21-i2c" is matched instead, which means erratum ERR007805 is not actually applied. So add the "fsl,imx7d-i2c" compatible string to the i2c-imx driver to apply erratum ERR007805 (https://www.nxp.com/docs/en/errata/IMX7DS_3N09P.pdf). " ERR007805 I2C: When the I2C clock speed is configured for 400 kHz, the SCL low period violates the I2C spec of 1.3 uS min Description: When the I2C module is programmed to operate at the maximum clock speed of 400 kHz (as defined by the I2C spec), the SCL clock low period violates the I2C spec of 1.3 uS min. The user must reduce the clock speed to obtain the SCL low time to meet the 1.3us I2C minimum required. This behavior means the SoC is not compliant to the I2C spec at 400kHz. Workaround: To meet the clock low period requirement in fast speed mode, SCL must be configured to 384KHz or less. " "fsl,imx7d-i2c" is already documented in the binding doc. This erratum fix is already included in imx6_i2c_hwdata, and it is the same for all i.MX6/7/8 parts, so just reuse it.
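For reference, a simplified sketch (with invented example_* names, not the real i2c-imx code) of how a compatible entry selects SoC-specific data in an OF driver; without a dedicated "fsl,imx7d-i2c" entry, the device tree's fallback compatible decides which hwdata, and hence which bus-frequency limit, gets used:

  #include <linux/of.h>
  #include <linux/of_device.h>
  #include <linux/platform_device.h>

  struct example_hwdata {
          unsigned int max_bus_freq; /* e.g. capped per ERR007805 */
  };

  static const struct example_hwdata example_imx6_hwdata = {
          .max_bus_freq = 384000, /* illustrative value from the workaround */
  };

  static const struct of_device_id example_dt_ids[] = {
          { .compatible = "fsl,imx7d-i2c", .data = &example_imx6_hwdata },
          { /* sentinel */ }
  };

  static int example_probe(struct platform_device *pdev)
  {
          const struct example_hwdata *hw = of_device_get_match_data(&pdev->dev);

          if (!hw)
                  return -ENODEV;
          /* clamp the requested bus clock to hw->max_bus_freq here */
          return 0;
  }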
Fixes: 39c025721d70 ("i2c: imx: Implement errata ERR007805 or e7805 bus frequency limit") Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: Carlos Song Signed-off-by: Haibo Chen Reviewed-by: Frank Li Fixes: 39c025721d70 ("i2c: imx: Implement errata ERR007805 or e7805 bus frequency limit") Acked-by: Oleksij Rempel Link: https://lore.kernel.org/r/20241218044238.143414-1-carlos.song@nxp.com Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 488ee3511314..5c9a8dfbc4a0 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -335,6 +335,7 @@ static const struct of_device_id i2c_imx_dt_ids[] = { { .compatible = "fsl,imx6sll-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx6sx-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx6ul-i2c", .data = &imx6_i2c_hwdata, }, + { .compatible = "fsl,imx7d-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx7s-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx8mm-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx8mn-i2c", .data = &imx6_i2c_hwdata, }, From 9a8f9320d67b27ddd7f1ee88d91820197a0e908f Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 18 Dec 2024 12:07:40 +0000 Subject: [PATCH 288/337] i2c: microchip-core: actually use repeated sends At present, where repeated sends are intended to be used, the i2c-microchip-core driver sends a stop followed by a start. Lots of i2c devices must not malfunction in the face of this behaviour, because the driver has operated like this for years! Try to keep track of whether or not a repeated send is required, and suppress sending a stop in these cases. CC: stable@vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20241218-football-composure-e56df2461461@spud Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-microchip-corei2c.c | 124 ++++++++++++++++----- 1 file changed, 96 insertions(+), 28 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index d1543e7d8380..6a124e903c66 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -93,27 +93,35 @@ * @base: pointer to register struct * @dev: device reference * @i2c_clk: clock reference for i2c input clock + * @msg_queue: pointer to the messages requiring sending * @buf: pointer to msg buffer for easier use * @msg_complete: xfer completion object * @adapter: core i2c abstraction * @msg_err: error code for completed message * @bus_clk_rate: current i2c bus clock rate * @isr_status: cached copy of local ISR status + * @total_num: total number of messages to be sent/received + * @current_num: index of the current message being sent/received * @msg_len: number of bytes transferred in msg * @addr: address of the current slave + * @restart_needed: whether or not a repeated start is required after current message */ struct mchp_corei2c_dev { void __iomem *base; struct device *dev; struct clk *i2c_clk; + struct i2c_msg *msg_queue; u8 *buf; struct completion msg_complete; struct i2c_adapter adapter; int msg_err; + int total_num; + int current_num; u32 bus_clk_rate; u32 isr_status; u16 msg_len; u8 addr; + bool restart_needed; }; static void mchp_corei2c_core_disable(struct mchp_corei2c_dev *idev) @@ -222,6 +230,47 @@ static int 
mchp_corei2c_fill_tx(struct mchp_corei2c_dev *idev) return 0; } +static void mchp_corei2c_next_msg(struct mchp_corei2c_dev *idev) +{ + struct i2c_msg *this_msg; + u8 ctrl; + + if (idev->current_num >= idev->total_num) { + complete(&idev->msg_complete); + return; + } + + /* + * If there's been an error, the isr needs to return control + * to the "main" part of the driver, so as not to keep sending + * messages once it completes and clears the SI bit. + */ + if (idev->msg_err) { + complete(&idev->msg_complete); + return; + } + + this_msg = idev->msg_queue++; + + if (idev->current_num < (idev->total_num - 1)) { + struct i2c_msg *next_msg = idev->msg_queue; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } else { + idev->restart_needed = false; + } + + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + + ctrl = readb(idev->base + CORE_I2C_CTRL); + ctrl |= CTRL_STA; + writeb(ctrl, idev->base + CORE_I2C_CTRL); + + idev->current_num++; +} + static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) { u32 status = idev->isr_status; @@ -247,10 +296,14 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) break; case STATUS_M_SLAW_ACK: case STATUS_M_TX_DATA_ACK: - if (idev->msg_len > 0) + if (idev->msg_len > 0) { mchp_corei2c_fill_tx(idev); - else - last_byte = true; + } else { + if (idev->restart_needed) + finished = true; + else + last_byte = true; + } break; case STATUS_M_TX_DATA_NACK: case STATUS_M_SLAR_NACK: @@ -287,7 +340,7 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) mchp_corei2c_stop(idev); if (last_byte || finished) - complete(&idev->msg_complete); + mchp_corei2c_next_msg(idev); return IRQ_HANDLED; } @@ -311,21 +364,48 @@ static irqreturn_t mchp_corei2c_isr(int irq, void *_dev) return ret; } -static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, - struct i2c_msg *msg) +static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num) { - u8 ctrl; + struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); + struct i2c_msg *this_msg = msgs; unsigned long time_left; - - idev->addr = i2c_8bit_addr_from_msg(msg); - idev->msg_len = msg->len; - idev->buf = msg->buf; - idev->msg_err = 0; - - reinit_completion(&idev->msg_complete); + u8 ctrl; mchp_corei2c_core_enable(idev); + /* + * The isr controls the flow of a transfer, this info needs to be saved + * to a location that it can access the queue information from. + */ + idev->restart_needed = false; + idev->msg_queue = msgs; + idev->total_num = num; + idev->current_num = 0; + + /* + * But the first entry to the isr is triggered by the start in this + * function, so the first message needs to be "dequeued". 
+ */ + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + idev->msg_err = 0; + + if (idev->total_num > 1) { + struct i2c_msg *next_msg = msgs + 1; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } + + idev->current_num++; + idev->msg_queue++; + + reinit_completion(&idev->msg_complete); + + /* + * Send the first start to pass control to the isr + */ ctrl = readb(idev->base + CORE_I2C_CTRL); ctrl |= CTRL_STA; writeb(ctrl, idev->base + CORE_I2C_CTRL); @@ -335,20 +415,8 @@ static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, if (!time_left) return -ETIMEDOUT; - return idev->msg_err; -} - -static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, - int num) -{ - struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); - int i, ret; - - for (i = 0; i < num; i++) { - ret = mchp_corei2c_xfer_msg(idev, msgs++); - if (ret) - return ret; - } + if (idev->msg_err) + return idev->msg_err; return num; } From 49e1f0fd0d4cb03a16b8526c4e683e1958f71490 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 18 Dec 2024 12:07:42 +0000 Subject: [PATCH 289/337] i2c: microchip-core: fix "ghost" detections Running i2c-detect currently produces an output akin to: 0 1 2 3 4 5 6 7 8 9 a b c d e f 00: 08 -- 0a -- 0c -- 0e -- 10: 10 -- 12 -- 14 -- 16 -- UU 19 -- 1b -- 1d -- 1f 20: -- 21 -- 23 -- 25 -- 27 -- 29 -- 2b -- 2d -- 2f 30: -- -- -- -- -- -- -- -- 38 -- 3a -- 3c -- 3e -- 40: 40 -- 42 -- 44 -- 46 -- 48 -- 4a -- 4c -- 4e -- 50: -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 60: 60 -- 62 -- 64 -- 66 -- 68 -- 6a -- 6c -- 6e -- 70: 70 -- 72 -- 74 -- 76 -- This happens because for an i2c_msg with a len of 0 the driver will mark the transmission of the message as a success once the START has been sent, without waiting for the devices on the bus to respond with an ACK/NAK. Since i2cdetect seems to run in a tight loop over all addresses the NAK is treated as part of the next test for the next address. Delete the fast path that marks a message as complete when idev->msg_len is zero after sending a START/RESTART since this isn't a valid scenario. CC: stable@vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20241218-outbid-encounter-b2e78b1cc707@spud Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-microchip-corei2c.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index 6a124e903c66..5db73429125c 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -287,8 +287,6 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) ctrl &= ~CTRL_STA; writeb(idev->addr, idev->base + CORE_I2C_DATA); writeb(ctrl, idev->base + CORE_I2C_CTRL); - if (idev->msg_len == 0) - finished = true; break; case STATUS_M_ARB_LOST: idev->msg_err = -EAGAIN; From 75cd4005da5492129917a4a4ee45e81660556104 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 25 Dec 2024 19:06:40 +0800 Subject: [PATCH 290/337] ublk: detach gendisk from ublk device if add_disk() fails Inside ublk_abort_requests(), gendisk is grabbed for aborting all inflight requests. And ublk_abort_requests() is called when exiting the uring context or handling timeout. 
If add_disk() fails, the gendisk may have been freed when calling ublk_abort_requests(), so use-after-free can be caused when getting disk's reference in ublk_abort_requests(). Fixes the bug by detaching gendisk from ublk device if add_disk() fails. Fixes: bd23f6c2c2d0 ("ublk: quiesce request queue when aborting queue") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241225110640.351531-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d4aed12dd436..934ab9332c80 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1618,6 +1618,21 @@ static void ublk_unquiesce_dev(struct ublk_device *ub) blk_mq_kick_requeue_list(ub->ub_disk->queue); } +static struct gendisk *ublk_detach_disk(struct ublk_device *ub) +{ + struct gendisk *disk; + + /* Sync with ublk_abort_queue() by holding the lock */ + spin_lock(&ub->lock); + disk = ub->ub_disk; + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ub->ub_disk = NULL; + spin_unlock(&ub->lock); + + return disk; +} + static void ublk_stop_dev(struct ublk_device *ub) { struct gendisk *disk; @@ -1631,14 +1646,7 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_unquiesce_dev(ub); } del_gendisk(ub->ub_disk); - - /* Sync with ublk_abort_queue() by holding the lock */ - spin_lock(&ub->lock); - disk = ub->ub_disk; - ub->dev_info.state = UBLK_S_DEV_DEAD; - ub->dev_info.ublksrv_pid = -1; - ub->ub_disk = NULL; - spin_unlock(&ub->lock); + disk = ublk_detach_disk(ub); put_disk(disk); unlock: mutex_unlock(&ub->mutex); @@ -2336,7 +2344,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) out_put_cdev: if (ret) { - ub->dev_info.state = UBLK_S_DEV_DEAD; + ublk_detach_disk(ub); ublk_put_device(ub); } if (ret) From e33ac68e5e21ec1292490dfe061e75c0dbdd3bd4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Dec 2024 16:49:23 +0000 Subject: [PATCH 291/337] io_uring/sqpoll: fix sqpoll error handling races BUG: KASAN: slab-use-after-free in __lock_acquire+0x370b/0x4a10 kernel/locking/lockdep.c:5089 Call Trace: ... _raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:551 [inline] try_to_wake_up+0xb5/0x23c0 kernel/sched/core.c:4205 io_sq_thread_park+0xac/0xe0 io_uring/sqpoll.c:55 io_sq_thread_finish+0x6b/0x310 io_uring/sqpoll.c:96 io_sq_offload_create+0x162/0x11d0 io_uring/sqpoll.c:497 io_uring_create io_uring/io_uring.c:3724 [inline] io_uring_setup+0x1728/0x3230 io_uring/io_uring.c:3806 ... Kun Hu reports that the SQPOLL creating error path has UAF, which happens if io_uring_alloc_task_context() fails and then io_sq_thread() manages to run and complete before the rest of error handling code, which means io_sq_thread_finish() is looking at already killed task. Note that this is mostly theoretical, requiring fault injection on the allocation side to trigger in practice. 
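A minimal sketch of the refcounting idea behind the fix (simplified; struct example_ctx and the example_* functions are illustrative only): take an extra reference on the task before publishing it, so the error path can still safely put it even if the thread has already run and exited on its own.

  #include <linux/sched.h>
  #include <linux/sched/task.h>

  struct example_ctx {
          struct task_struct *thread;
  };

  static int example_setup(struct example_ctx *ctx)
  {
          return 0; /* imagine this allocation step can fail */
  }

  static int example_start_worker(struct example_ctx *ctx,
                                  struct task_struct *tsk)
  {
          /* Extra reference: stays valid even if the thread exits early. */
          struct task_struct *to_put = get_task_struct(tsk);
          int ret;

          ctx->thread = tsk;
          ret = example_setup(ctx);
          wake_up_new_task(tsk);
          if (ret) {
                  /* error path: the thread may be gone, but to_put is not */
                  put_task_struct(to_put);
                  return ret;
          }

          put_task_struct(to_put);
          return 0;
  }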
Cc: stable@vger.kernel.org Reported-by: Kun Hu Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f2f1aa5729332612bd01fe0f2f385fd1f06ce7c.1735231717.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 6df5e649c413..9e5bd79fd2b5 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -405,6 +405,7 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -480,6 +481,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->thread = tsk; + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -490,11 +492,15 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + if (task_to_put) + put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); + if (task_to_put) + put_task_struct(task_to_put); return ret; } From 6cc45f8c1f898570916044f606be9890d295e129 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 27 Nov 2024 14:41:30 +0100 Subject: [PATCH 292/337] rtla/timerlat: Fix histogram ALL for zero samples rtla timerlat hist currently computers the minimum, maximum and average latency even in cases when there are zero samples. This leads to nonsensical values being calculated for maximum and minimum, and to divide by zero for average. A similar bug is fixed by 01b05fc0e5f3 ("rtla/timerlat: Fix histogram report when a cpu count is 0") but the bug still remains for printing the sum over all CPUs in timerlat_print_stats_all. The issue can be reproduced with this command: $ rtla timerlat hist -U -d 1s Index over: count: min: avg: max: Floating point exception (core dumped) (There are always no samples with -U unless the user workload is created.) Fix the bug by omitting max/min/avg when sample count is zero, displaying a dash instead, just like we already do for the individual CPUs. The logic is moved into a new function called format_summary_value, which is used for both the individual CPUs and for the overall summary. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20241127134130.51171-1-tglozar@redhat.com Fixes: 1462501c7a8 ("rtla/timerlat: Add a summary for hist mode") Signed-off-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/timerlat_hist.c | 177 ++++++++++++++----------- 1 file changed, 96 insertions(+), 81 deletions(-) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8b66387e5f35..4403cc4eba30 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -281,6 +281,21 @@ static void timerlat_hist_header(struct osnoise_tool *tool) trace_seq_reset(s); } +/* + * format_summary_value - format a line of summary value (min, max or avg) + * of hist data + */ +static void format_summary_value(struct trace_seq *seq, + int count, + unsigned long long val, + bool avg) +{ + if (count) + trace_seq_printf(seq, "%9llu ", avg ? 
val / count : val); + else + trace_seq_printf(seq, "%9c ", '-'); +} + /* * timerlat_print_summary - print the summary of the hist data to the output */ @@ -328,29 +343,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].min_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].min_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].min_user, + false); } trace_seq_printf(trace->seq, "\n"); @@ -364,29 +373,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_irq / data->hist[cpu].irq_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].sum_irq, + true); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_thread / data->hist[cpu].thread_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].sum_thread, + true); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_user / data->hist[cpu].user_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].sum_user, + true); } trace_seq_printf(trace->seq, "\n"); @@ -400,29 +403,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].max_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].max_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + 
data->hist[cpu].max_user, + false); } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); @@ -506,16 +503,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "min: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.min_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.min_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_user); + format_summary_value(trace->seq, + sum.user_count, + sum.min_user, + false); trace_seq_printf(trace->seq, "\n"); @@ -523,16 +526,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "avg: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_irq / sum.irq_count); + format_summary_value(trace->seq, + sum.irq_count, + sum.sum_irq, + true); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_thread / sum.thread_count); + format_summary_value(trace->seq, + sum.thread_count, + sum.sum_thread, + true); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_user / sum.user_count); + format_summary_value(trace->seq, + sum.user_count, + sum.sum_user, + true); trace_seq_printf(trace->seq, "\n"); @@ -540,16 +549,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "max: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.max_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.max_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_user); + format_summary_value(trace->seq, + sum.user_count, + sum.max_user, + false); trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); From 6b830c6a023ff6e8fe05dbe47a9e5cd276df09ee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:14 +0100 Subject: [PATCH 293/337] netlink: specs: mptcp: add missing 'server-side' attr This attribute is added with the 'created' and 'established' events, but the documentation didn't mention it. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-1-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 6 ++++-- include/uapi/linux/mptcp_pm.h | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dc190bf838fe..fc0603f51665 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -23,7 +23,8 @@ definitions: - name: created doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. 
Depending on the traffic-patterns it can take a long time until the @@ -31,7 +32,8 @@ definitions: - name: established doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A MPTCP connection is established (can start new subflows). - name: closed diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 50589e5dd6a3..b34fd95b6f84 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -13,12 +13,13 @@ * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A new MPTCP connection has been created. It is the good time - * to allocate memory and send ADD_ADDR if needed. Depending on the - * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED - * is sent. + * sport, dport, server-side A new MPTCP connection has been created. It is + * the good time to allocate memory and send ADD_ADDR if needed. Depending on + * the traffic-patterns it can take a long time until the + * MPTCP_EVENT_ESTABLISHED is sent. * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A MPTCP connection is established (can start new subflows). + * sport, dport, server-side A MPTCP connection is established (can start new + * subflows). * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A * new address has been announced by the peer. From bea87657b5ee8e6f18af2833ee4b88212ef52d28 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:15 +0100 Subject: [PATCH 294/337] netlink: specs: mptcp: clearly mention attributes The rendered version of the MPTCP events [1] looked strange, because the whole content of the 'doc' was displayed in the same block. It was then not clear that the first words, not even ended by a period, were the attributes that are defined when such events are emitted. These attributes have now been moved to the end, prefixed by 'Attributes:' and ended with a period. Note that '>-' has been added after 'doc:' to allow ':' in the text below. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Link: https://docs.kernel.org/networking/netlink_spec/mptcp_pm.html#event-type [1] Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-2-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 50 ++++++++++----------- include/uapi/linux/mptcp_pm.h | 53 ++++++++++++----------- 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index fc0603f51665..59087a230565 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -22,67 +22,67 @@ definitions: doc: unused event - name: created - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. Depending on the traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. 
- name: established - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A MPTCP connection is established (can start new subflows). + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: closed - doc: - token + doc: >- A MPTCP connection has stopped. + Attribute: token. - name: announced value: 6 - doc: - token, rem_id, family, daddr4 | daddr6 [, dport] + doc: >- A new address has been announced by the peer. + Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. - name: removed - doc: - token, rem_id + doc: >- An address has been lost by the peer. + Attributes: token, rem_id. - name: sub-established value: 10 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A new subflow has been established. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-closed - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-priority value: 13 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- The priority of a subflow has changed. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: listener-created value: 15 - doc: - family, sport, saddr4 | saddr6 + doc: >- A new PM listener is created. + Attributes: family, sport, saddr4 | saddr6. - name: listener-closed - doc: - family, sport, saddr4 | saddr6 + doc: >- A PM listener is closed. + Attributes: family, sport, saddr4 | saddr6. attribute-sets: - diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index b34fd95b6f84..84fa8a21dfd0 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -12,32 +12,33 @@ /** * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event - * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A new MPTCP connection has been created. It is - * the good time to allocate memory and send ADD_ADDR if needed. Depending on - * the traffic-patterns it can take a long time until the - * MPTCP_EVENT_ESTABLISHED is sent. - * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A MPTCP connection is established (can start new - * subflows). - * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. - * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A - * new address has been announced by the peer. - * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer. - * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new - * subflow has been established. 'error' should not be set. - * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been - * closed. 
An error (copy of sk_err) could be set if an error has been - * detected for this subflow. - * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a - * subflow has changed. 'error' should not be set. - * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM - * listener is created. - * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener - * is closed. + * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. It is the + * good time to allocate memory and send ADD_ADDR if needed. Depending on the + * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED + * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new + * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. + * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. + * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. + * @MPTCP_EVENT_REMOVED: An address has been lost by the peer. Attributes: + * token, rem_id. + * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of + * sk_err) could be set if an error has been detected for this subflow. + * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + * daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: + * family, sport, saddr4 | saddr6. + * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, + * sport, saddr4 | saddr6. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC, From 4f363fe9f6b28ed9b714cd7fe5ce880171927dab Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:16 +0100 Subject: [PATCH 295/337] netlink: specs: mptcp: fix missing doc Two operations didn't have a small description. It looks like something that has been missed in the original commit introducing this file. Replace the two "todo" by a small and simple description: Create/Destroy subflow. While at it, also uniform the capital letters, avoid double spaces, and fix the "announce" event description: a new "address" has been announced, not a new "subflow". 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-3-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 59087a230565..dfd017780d2f 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -308,8 +308,8 @@ operations: attributes: - addr - - name: flush-addrs - doc: flush addresses + name: flush-addrs + doc: Flush addresses attribute-set: endpoint dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -353,7 +353,7 @@ operations: - addr-remote - name: announce - doc: announce new sf + doc: Announce new address attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -364,7 +364,7 @@ operations: - token - name: remove - doc: announce removal + doc: Announce removal attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -375,7 +375,7 @@ operations: - loc-id - name: subflow-create - doc: todo + doc: Create subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -387,7 +387,7 @@ operations: - addr-remote - name: subflow-destroy - doc: todo + doc: Destroy subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] From a024e377efed31ecfb39210bed562932321345b3 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Tue, 24 Dec 2024 20:07:20 -0500 Subject: [PATCH 296/337] net: llc: reset skb->transport_header 802.2+LLC+SNAP frames received by napi_complete_done with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. As snap_rcv expects transport_header to point to SNAP header (OID:PID) after LLC processing advances offset over LLC header (llc_rcv & llc_fixup_skb), code doesn't find a match and packet is dropped. Between napi_complete_done and snap_rcv, transport_header is not used until __netif_receive_skb_core, where originally it was being reset. Commit fda55eca5a33 ("net: introduce skb_transport_header_was_set()") only does so if not set, on the assumption the value was set correctly by GRO (and also on assumption that "network stacks usually reset the transport header anyway"). Afterwards it is moved forward by llc_fixup_skb. Locally generated traffic shows up at __netif_receive_skb_core with no transport_header set and is processed without issue. On a setup with GRO but no DSA, transport_header and network_header are both set to point to skb->data which is also correct. As issue is LLC specific, to avoid impacting non-LLC traffic, and to follow up on original assumption made on previous code change, llc_fixup_skb to reset the offset after skb pull. llc_fixup_skb assumes the LLC header is at skb->data, and by definition SNAP header immediately follows. 
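For illustration, the SNAP lookup can only succeed if the transport header points at the OID:PID bytes once the LLC header has been pulled; a simplified sketch of that expectation (not the exact net/802/psnap.c code):

	/* After llc_fixup_skb() pulls the LLC header and resets the offset,
	 * the 5-byte SNAP protocol id (OID + PID) sits at the transport
	 * header, i.e. at skb->data.
	 */
	static bool snap_proto_matches(const struct sk_buff *skb,
				       const unsigned char *proto_id)
	{
		return !memcmp(skb_transport_header(skb), proto_id, 5);
	}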
Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241225010723.2830290-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/llc/llc_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 51bccfb00a9c..61b0159b2fbe 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -124,8 +124,8 @@ static inline int llc_fixup_skb(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; - skb->transport_header += llc_len; skb_pull(skb, llc_len); + skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; From b06a6187ef983f501e93faa56209169752d3bde3 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Sun, 29 Dec 2024 11:32:42 +0530 Subject: [PATCH 297/337] ALSA: usb-audio: US16x08: Initialize array before use Initialize meter_urb array before use in mixer_us16x08.c. CID 1410197: (#1 of 1): Uninitialized scalar variable (UNINIT) uninit_use_in_call: Using uninitialized value *meter_urb when calling get_meter_levels_from_urb. Coverity Link: https://scan7.scan.coverity.com/#/project-view/52849/11354?selectedIssue=1410197 Fixes: d2bb390a2081 ("ALSA: usb-audio: Tascam US-16x08 DSP mixer quirk") Signed-off-by: Tanya Agarwal Link: https://patch.msgid.link/20241229060240.1642-1-tanyaagarwal25699@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_us16x08.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/mixer_us16x08.c b/sound/usb/mixer_us16x08.c index 6eb7d93b358d..20ac32635f1f 100644 --- a/sound/usb/mixer_us16x08.c +++ b/sound/usb/mixer_us16x08.c @@ -687,7 +687,7 @@ static int snd_us16x08_meter_get(struct snd_kcontrol *kcontrol, struct usb_mixer_elem_info *elem = kcontrol->private_data; struct snd_usb_audio *chip = elem->head.mixer->chip; struct snd_us16x08_meter_store *store = elem->private_data; - u8 meter_urb[64]; + u8 meter_urb[64] = {0}; switch (kcontrol->private_value) { case 0: { From 31ad36a271290648e7c2288a03d7b933d20254d6 Mon Sep 17 00:00:00 2001 From: chenchangcheng Date: Fri, 20 Dec 2024 15:48:47 +0800 Subject: [PATCH 298/337] objtool: Add bch2_trans_unlocked_error() to bcachefs noreturns Fix the following objtool warning during build time: fs/bcachefs/btree_trans_commit.o: warning: objtool: bch2_trans_commit_write_locked.isra.0() falls through to next function do_bch2_trans_commit.isra.0() fs/bcachefs/btree_trans_commit.o: warning: objtool: .text: unexpected end of section ...... fs/bcachefs/btree_update.o: warning: objtool: bch2_trans_update_get_key_cache() falls through to next function flush_new_cached_update() fs/bcachefs/btree_update.o: warning: objtool: flush_new_cached_update() falls through to next function bch2_trans_update_by_path() bch2_trans_unlocked_error() is an Obviously Correct (tm) panic() wrapper, add it to the list of known noreturns. 
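For context, entries in noreturns.h are functions that never return, which objtool cannot infer on its own; a hypothetical example of the shape (not the actual bcachefs code):

	/* A panic() wrapper never returns, but without a NORETURN() entry
	 * objtool assumes execution can fall through past its call sites.
	 */
	static __noreturn void example_unlocked_error(void)
	{
		panic("btree transaction was expected to be locked\n");
	}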
[ mingo: Improved the changelog ] Fixes: fd104e2967b7 ("bcachefs: bch2_trans_verify_not_unlocked()") Signed-off-by: chenchangcheng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241220074847.3418134-1-ccc194101@163.com --- tools/objtool/noreturns.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index f37614cc2c1b..b2174894f9f7 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -19,6 +19,7 @@ NORETURN(__x64_sys_exit_group) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) +NORETURN(bch2_trans_unlocked_error) NORETURN(cpu_bringup_and_idle) NORETURN(cpu_startup_entry) NORETURN(do_exit) From f718faf3940e95d5d34af9041f279f598396ab7d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 17 Dec 2024 00:48:18 +0000 Subject: [PATCH 299/337] freezer, sched: Report frozen tasks as 'D' instead of 'R' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") the frozen task stat was reported as 'D' in cgroup v1. However, after rewriting the core freezer logic, the frozen task stat is reported as 'R'. This is confusing, especially when a task with stat of 'S' is frozen. This bug can be reproduced with these steps: $ cd /sys/fs/cgroup/freezer/ $ mkdir test $ sleep 1000 & [1] 739 // task whose stat is 'S' $ echo 739 > test/cgroup.procs $ echo FROZEN > test/freezer.state $ ps -aux | grep 739 root 739 0.1 0.0 8376 1812 pts/0 R 10:56 0:00 sleep 1000 As shown above, a task whose stat is 'S' was changed to 'R' when it was frozen. To solve this regression, simply maintain the same reported state as before the rewrite. [ mingo: Enhanced the changelog and comments ] Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Chen Ridong Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Michal Koutný Link: https://lore.kernel.org/r/20241217004818.3200515-1-chenridong@huaweicloud.com --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 66b311fbd5d6..64934e0830af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); From dc81e556f2a017d681251ace21bf06c126d5a192 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 13 Nov 2024 09:59:34 -0800 Subject: [PATCH 300/337] x86/fred: Clear WFE in missing-ENDBRANCH #CPs An indirect branch instruction sets the CPU indirect branch tracker (IBT) into WAIT_FOR_ENDBRANCH (WFE) state and WFE stays asserted across the instruction boundary. When the decoder finds an inappropriate instruction while WFE is set ENDBR, the CPU raises a #CP fault. For the "kernel IBT no ENDBR" selftest where #CPs are deliberately triggered, the WFE state of the interrupted context needs to be cleared to let execution continue. 
Otherwise when the CPU resumes from the instruction that just caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU enters a dead loop. This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't set WFE. But FRED provides space on the entry stack (in an expanded CS area) to save and restore the WFE state, thus the WFE state is no longer clobbered, so software must clear it. Clear WFE to avoid dead looping in ibt_clear_fred_wfe() and the !ibt_fatal code path when execution is allowed to continue. Clobbering WFE in any other circumstance is a security-relevant bug. [ dhansen: changelog rewording ] Fixes: a5f6c2ace997 ("x86/shstk: Add user control-protection fault handler") Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241113175934.3897541-1-xin%40zytor.com --- arch/x86/kernel/cet.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index d2c732a34e5d..303bf74d175b 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -81,6 +81,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) static __ro_after_init bool ibt_fatal = true; +/* + * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR. + * + * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered, + * the WFE state of the interrupted context needs to be cleared to let execution + * continue. Otherwise when the CPU resumes from the instruction that just + * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU + * enters a dead loop. + * + * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't + * set WFE. But FRED provides space on the entry stack (in an expanded CS area) + * to save and restore the WFE state, thus the WFE state is no longer clobbered, + * so software must clear it. + */ +static void ibt_clear_fred_wfe(struct pt_regs *regs) +{ + /* + * No need to do any FRED checks. + * + * For IDT event delivery, the high-order 48 bits of CS are pushed + * as 0s into the stack, and later IRET ignores these bits. + * + * For FRED, a test to check if fred_cs.wfe is set would be dropped + * by compilers. + */ + regs->fred_cs.wfe = 0; +} + static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) { if ((error_code & CP_EC) != CP_ENDBR) { @@ -90,6 +118,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { regs->ax = 0; + ibt_clear_fred_wfe(regs); return; } @@ -97,6 +126,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (!ibt_fatal) { printk(KERN_DEFAULT CUT_HERE); __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + ibt_clear_fred_wfe(regs); return; } BUG(); From 27834971f616c5e154423c578fa95e0444444ce1 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 19 Jun 2024 19:18:01 +0800 Subject: [PATCH 301/337] virt: tdx-guest: Just leak decrypted memory on unrecoverable errors In CoCo VMs it is possible for the untrusted host to cause set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. 
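In other words, error handling of the form below (this is the pattern being removed in the diff) is unsafe in a CoCo guest, because the pages handed back to the allocator may still be mapped shared:

	if (set_memory_decrypted((unsigned long)addr, count)) {
		/* BAD: addr may still be decrypted (shared) here */
		free_pages_exact(addr, len);
		return NULL;
	}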
Leak the decrypted memory when set_memory_decrypted() fails, and don't need to print an error since set_memory_decrypted() will call WARN_ONCE(). Fixes: f4738f56d1dc ("virt: tdx-guest: Add Quote generation support using TSM_REPORTS") Signed-off-by: Li RongQing Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Reviewed-by: Rick Edgecombe Reviewed-by: Kirill A. Shutemov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240619111801.25630-1-lirongqing%40baidu.com --- drivers/virt/coco/tdx-guest/tdx-guest.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/virt/coco/tdx-guest/tdx-guest.c b/drivers/virt/coco/tdx-guest/tdx-guest.c index d7db6c824e13..224e7dde9cde 100644 --- a/drivers/virt/coco/tdx-guest/tdx-guest.c +++ b/drivers/virt/coco/tdx-guest/tdx-guest.c @@ -124,10 +124,8 @@ static void *alloc_quote_buf(void) if (!addr) return NULL; - if (set_memory_decrypted((unsigned long)addr, count)) { - free_pages_exact(addr, len); + if (set_memory_decrypted((unsigned long)addr, count)) return NULL; - } return addr; } From 032fe9b0516702599c2dd990a4703f783d5716b8 Mon Sep 17 00:00:00 2001 From: Mingcong Bai Date: Thu, 26 Dec 2024 14:22:05 +0800 Subject: [PATCH 302/337] platform/x86: hp-wmi: mark 8A15 board for timed OMEN thermal profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HP OMEN 8 (2022), corresponding to a board ID of 8A15, supports OMEN thermal profile and requires the timed profile quirk. Upon adding this ID to both the omen_thermal_profile_boards and omen_timed_thermal_profile_boards, significant bump in performance can be observed. For instance, SilverBench (https://silver.urih.com/) results improved from ~56,000 to ~69,000, as a result of higher power draws (and thus core frequencies) whilst under load: Package Power: - Before the patch: ~65W (dropping to about 55W under sustained load). - After the patch: ~115W (dropping to about 105W under sustained load). Core Power: - Before: ~60W (ditto above). - After: ~108W (ditto above). Add 8A15 to omen_thermal_profile_boards and omen_timed_thermal_profile_boards to improve performance. Signed-off-by: Xi Xiao <1577912515@qq.com> Signed-off-by: Mingcong Bai Link: https://lore.kernel.org/r/20241226062207.3352629-1-jeffbai@aosc.io Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 81ccc96ffe40..20c55bab3b8c 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -64,7 +64,7 @@ static const char * const omen_thermal_profile_boards[] = { "874A", "8603", "8604", "8748", "886B", "886C", "878A", "878B", "878C", "88C8", "88CB", "8786", "8787", "8788", "88D1", "88D2", "88F4", "88FD", "88F5", "88F6", "88F7", "88FE", "88FF", "8900", "8901", "8902", "8912", - "8917", "8918", "8949", "894A", "89EB", "8BAD", "8A42" + "8917", "8918", "8949", "894A", "89EB", "8BAD", "8A42", "8A15" }; /* DMI Board names of Omen laptops that are specifically set to be thermal @@ -80,7 +80,7 @@ static const char * const omen_thermal_profile_force_v0_boards[] = { * "balanced" when reaching zero. 
*/ static const char * const omen_timed_thermal_profile_boards[] = { - "8BAD", "8A42" + "8BAD", "8A42", "8A15" }; /* DMI Board names of Victus laptops */ From 7e16ae558a87ac9099b6a93a43f19b42d809fd78 Mon Sep 17 00:00:00 2001 From: Vishnu Sankar Date: Sat, 28 Dec 2024 08:18:40 +0900 Subject: [PATCH 303/337] platform/x86: thinkpad-acpi: Add support for hotkey 0x1401 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F8 mode key on Lenovo 2025 platforms use a different key code. Adding support for the new keycode 0x1401. Tested on X1 Carbon Gen 13 and X1 2-in-1 Gen 10. Signed-off-by: Vishnu Sankar Reviewed-by: Mark Pearson Link: https://lore.kernel.org/r/20241227231840.21334-1-vishnuocv@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- Documentation/admin-guide/laptops/thinkpad-acpi.rst | 10 +++++++--- drivers/platform/x86/thinkpad_acpi.c | 4 +++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst index 7f674a6cfa8a..4ab0fef7d440 100644 --- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst +++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst @@ -445,8 +445,10 @@ event code Key Notes 0x1008 0x07 FN+F8 IBM: toggle screen expand Lenovo: configure UltraNav, or toggle screen expand. - On newer platforms (2024+) - replaced by 0x131f (see below) + On 2024 platforms replaced by + 0x131f (see below) and on newer + platforms (2025 +) keycode is + replaced by 0x1401 (see below). 0x1009 0x08 FN+F9 - @@ -506,9 +508,11 @@ event code Key Notes 0x1019 0x18 unknown -0x131f ... FN+F8 Platform Mode change. +0x131f ... FN+F8 Platform Mode change (2024 systems). Implemented in driver. +0x1401 ... FN+F8 Platform Mode change (2025 + systems). + Implemented in driver. ... ... ... 
0x1020 0x1F unknown diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 6371a9f765c1..2cfb2ac3f465 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -184,7 +184,8 @@ enum tpacpi_hkey_event_t { */ TP_HKEY_EV_AMT_TOGGLE = 0x131a, /* Toggle AMT on/off */ TP_HKEY_EV_DOUBLETAP_TOGGLE = 0x131c, /* Toggle trackpoint doubletap on/off */ - TP_HKEY_EV_PROFILE_TOGGLE = 0x131f, /* Toggle platform profile */ + TP_HKEY_EV_PROFILE_TOGGLE = 0x131f, /* Toggle platform profile in 2024 systems */ + TP_HKEY_EV_PROFILE_TOGGLE2 = 0x1401, /* Toggle platform profile in 2025 + systems */ /* Reasons for waking up from S3/S4 */ TP_HKEY_EV_WKUP_S3_UNDOCK = 0x2304, /* undock requested, S3 */ @@ -11200,6 +11201,7 @@ static bool tpacpi_driver_event(const unsigned int hkey_event) tp_features.trackpoint_doubletap = !tp_features.trackpoint_doubletap; return true; case TP_HKEY_EV_PROFILE_TOGGLE: + case TP_HKEY_EV_PROFILE_TOGGLE2: platform_profile_cycle(); return true; } From fc033cf25e612e840e545f8d5ad2edd6ba613ed5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Dec 2024 13:15:45 -0800 Subject: [PATCH 304/337] Linux 6.13-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5c9b1d2d59b4..48e89108aa58 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* From 6a451e2c5c03e27aa3ec36be424fccaa286c3ccd Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Mon, 30 Dec 2024 14:49:10 +0800 Subject: [PATCH 305/337] ALSA: hda/tas2781: Ignore SUBSYS_ID not found for tas2563 projects Driver will return error if no SUBSYS_ID found in BIOS(acpi). It will cause error in tas2563 projects, which have no SUBSYS_ID. Fixes: 4e7035a75da9 ("ALSA: hda/tas2781: Add speaker id check for ASUS projects") Signed-off-by: Baojun Xu Link: https://lore.kernel.org/20241223225442.1358491-1-stuart.a.hayhurst@gmail.com Link: https://patch.msgid.link/20241230064910.1583-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 0af015806aba..0e42b87dadb8 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -142,6 +142,9 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid) } sub = acpi_get_subsystem_id(ACPI_HANDLE(physdev)); if (IS_ERR(sub)) { + /* No subsys id in older tas2563 projects. */ + if (!strncmp(hid, "INT8866", sizeof("INT8866"))) + goto end_2563; dev_err(p->dev, "Failed to get SUBSYS ID.\n"); ret = PTR_ERR(sub); goto err; @@ -164,6 +167,7 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid) p->speaker_id = NULL; } +end_2563: acpi_dev_free_resource_list(&resources); strscpy(p->dev_name, hid, sizeof(p->dev_name)); put_device(physdev); From ac9fae799eda81e24bbf2e0d5cb9e5c33fc9bdcb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 29 Dec 2024 09:39:13 +0100 Subject: [PATCH 306/337] ALSA: compress_offload: Drop unneeded no_free_ptr() The error path for memdup_user() no longer needs the tricky wrap with no_free_ptr() and we can safely return the error pointer directly. 
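memdup_user() either returns a fully owned copy or an ERR_PTR() with nothing left to free, so the plain form is sufficient; schematically (a sketch, assuming no __free() cleanup is attached to the local variable):

	task = memdup_user((void __user *)arg, sizeof(*task));
	if (IS_ERR(task))
		return PTR_ERR(task);	/* nothing was allocated on this path */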
Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412290846.cncnpGaw-lkp@intel.com/ Link: https://patch.msgid.link/20241229083917.14912-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index edf5aadf38e5..4ed6cec5fd5c 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1077,7 +1077,7 @@ static int snd_compr_task_create(struct snd_compr_stream *stream, unsigned long return -EPERM; task = memdup_user((void __user *)arg, sizeof(*task)); if (IS_ERR(task)) - return PTR_ERR(no_free_ptr(task)); + return PTR_ERR(task); retval = snd_compr_task_new(stream, task); if (retval >= 0) if (copy_to_user((void __user *)arg, task, sizeof(*task))) @@ -1138,7 +1138,7 @@ static int snd_compr_task_start_ioctl(struct snd_compr_stream *stream, unsigned return -EPERM; task = memdup_user((void __user *)arg, sizeof(*task)); if (IS_ERR(task)) - return PTR_ERR(no_free_ptr(task)); + return PTR_ERR(task); retval = snd_compr_task_start(stream, task); if (retval >= 0) if (copy_to_user((void __user *)arg, task, sizeof(*task))) @@ -1229,7 +1229,7 @@ static int snd_compr_task_status_ioctl(struct snd_compr_stream *stream, unsigned return -EPERM; status = memdup_user((void __user *)arg, sizeof(*status)); if (IS_ERR(status)) - return PTR_ERR(no_free_ptr(status)); + return PTR_ERR(status); retval = snd_compr_task_status(stream, status); if (retval >= 0) if (copy_to_user((void __user *)arg, status, sizeof(*status))) From 7439b395211874e20c24b2fe0e4903864357a3f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Dec 2024 18:52:32 +0000 Subject: [PATCH 307/337] ALSA: compress_offload: fix remaining descriptor races in sound/core/compress_offload.c 3d3f43fab4cf ("ALSA: compress_offload: improve file descriptors installation for dma-buf") fixed some of descriptor races in snd_compr_task_new(), but there's a couple more left. 
We need to grab the references to dmabuf before moving them into descriptor table - trying to do that by descriptor afterwards might end up getting a different object, with a dangling reference left in task->{input,output} Fixes: 3d3f43fab4cf ("ALSA: compress_offload: improve file descriptors installation for dma-buf") Signed-off-by: Al Viro Link: https://patch.msgid.link/20241229185232.GA1977892@ZenIV Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 4ed6cec5fd5c..840bb9cfe789 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1053,13 +1053,13 @@ static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_ put_unused_fd(fd_i); goto cleanup; } + /* keep dmabuf reference until freed with task free ioctl */ + get_dma_buf(task->input); + get_dma_buf(task->output); fd_install(fd_i, task->input->file); fd_install(fd_o, task->output->file); utask->input_fd = fd_i; utask->output_fd = fd_o; - /* keep dmabuf reference until freed with task free ioctl */ - dma_buf_get(utask->input_fd); - dma_buf_get(utask->output_fd); list_add_tail(&task->list, &stream->runtime->tasks); stream->runtime->total_tasks++; return 0; From 0179488ca992d79908b8e26b9213f1554fc5bacc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:05:35 +0100 Subject: [PATCH 308/337] ALSA: seq: oss: Fix races at processing SysEx messages OSS sequencer handles the SysEx messages split in 6 bytes packets, and ALSA sequencer OSS layer tries to combine those. It stores the data in the internal buffer and this access is racy as of now, which may lead to the out-of-bounds access. As a temporary band-aid fix, introduce a mutex for serializing the process of the SysEx message packets. Reported-by: Kun Hu Closes: https://lore.kernel.org/2B7E93E4-B13A-4AE4-8E87-306A8EE9BBB7@m.fudan.edu.cn Cc: Link: https://patch.msgid.link/20241230110543.32454-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/oss/seq_oss_synth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/seq/oss/seq_oss_synth.c b/sound/core/seq/oss/seq_oss_synth.c index e3394919daa0..51ee4c00a843 100644 --- a/sound/core/seq/oss/seq_oss_synth.c +++ b/sound/core/seq/oss/seq_oss_synth.c @@ -66,6 +66,7 @@ static struct seq_oss_synth midi_synth_dev = { }; static DEFINE_SPINLOCK(register_lock); +static DEFINE_MUTEX(sysex_mutex); /* * prototypes @@ -497,6 +498,7 @@ snd_seq_oss_synth_sysex(struct seq_oss_devinfo *dp, int dev, unsigned char *buf, if (!info) return -ENXIO; + guard(mutex)(&sysex_mutex); sysex = info->sysex; if (sysex == NULL) { sysex = kzalloc(sizeof(*sysex), GFP_KERNEL); From abbff41b6932cde359589fd51f4024b7c85f366b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:40:22 +0100 Subject: [PATCH 309/337] Revert "ALSA: ump: Don't enumeration invalid groups for legacy rawmidi" This reverts commit c2d188e137e77294323132a760a4608321a36a70. Although it's fine to filter the invalid UMP groups at the first probe time, this will become a problem when UMP groups are updated and (re-)activated. Then there is no way to re-add the substreams properly for the legacy rawmidi, and the new active groups will be still invisible. So let's revert the change. This will move back to showing the full 16 groups, but it's better than forever lost. 
Link: https://patch.msgid.link/20241230114023.3787-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/ump.c b/sound/core/ump.c index fe4d39ae1159..9198bff4768c 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -1244,7 +1244,7 @@ static int fill_legacy_mapping(struct snd_ump_endpoint *ump) num = 0; for (i = 0; i < SNDRV_UMP_MAX_GROUPS; i++) - if ((group_maps & (1U << i)) && ump->groups[i].valid) + if (group_maps & (1U << i)) ump->legacy_mapping[num++] = i; return num; From b255ef45fcc2141c1bf98456796abb956d843a27 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Fri, 27 Dec 2024 15:30:07 +0300 Subject: [PATCH 310/337] eth: bcmsysport: fix call balance of priv->clk handling routines Check the return value of clk_prepare_enable to ensure that priv->clk has been successfully enabled. If priv->clk was not enabled during bcm_sysport_probe, bcm_sysport_resume, or bcm_sysport_open, it must not be disabled in any subsequent execution paths. Fixes: 31bc72d97656 ("net: systemport: fetch and use clock resources") Signed-off-by: Vitalii Mordan Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241227123007.2333397-1-mordan@ispras.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 42672c63f108..bc4e1f3b3752 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1933,7 +1933,11 @@ static int bcm_sysport_open(struct net_device *dev) unsigned int i; int ret; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } /* Reset UniMAC */ umac_reset(priv); @@ -2591,7 +2595,11 @@ static int bcm_sysport_probe(struct platform_device *pdev) goto err_deregister_notifier; } - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + dev_err(&pdev->dev, "could not enable priv clock\n"); + goto err_deregister_netdev; + } priv->rev = topctrl_readl(priv, REV_CNTL) & REV_MASK; dev_info(&pdev->dev, @@ -2605,6 +2613,8 @@ static int bcm_sysport_probe(struct platform_device *pdev) return 0; +err_deregister_netdev: + unregister_netdev(dev); err_deregister_notifier: unregister_netdevice_notifier(&priv->netdev_notifier); err_deregister_fixed_link: @@ -2774,7 +2784,12 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) if (!netif_running(dev)) return 0; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } + if (priv->wolopts) clk_disable_unprepare(priv->wol_clk); From fb3a9a1165cea104b5ab3753e88218e4497b01c1 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Fri, 20 Dec 2024 19:28:06 -0800 Subject: [PATCH 311/337] gve: trigger RX NAPI instead of TX NAPI in gve_xsk_wakeup Commit ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") moved XSK TX processing to be part of the RX NAPI. However, that commit did not include triggering the RX NAPI in gve_xsk_wakeup. This is necessary because the TX NAPI only processes TX completions, meaning that a TX wakeup would not actually trigger XSK descriptor processing. 
Also, the branch on XDP_WAKEUP_TX was supposed to have been removed, as the NAPI should be scheduled whether the wakeup is for RX or TX. Fixes: ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Link: https://patch.msgid.link/20241221032807.302244-1-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 09fb7f16f73e..8a8f6ab12a98 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1714,7 +1714,7 @@ done: static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) { struct gve_priv *priv = netdev_priv(dev); - int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + struct napi_struct *napi; if (!gve_get_napi_enabled(priv)) return -ENETDOWN; @@ -1722,19 +1722,12 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; - if (flags & XDP_WAKEUP_TX) { - struct gve_tx_ring *tx = &priv->tx[tx_queue_id]; - struct napi_struct *napi = - &priv->ntfy_blocks[tx->ntfy_id].napi; - - if (!napi_if_scheduled_mark_missed(napi)) { - /* Call local_bh_enable to trigger SoftIRQ processing */ - local_bh_disable(); - napi_schedule(napi); - local_bh_enable(); - } - - tx->xdp_xsk_wakeup++; + napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi; + if (!napi_if_scheduled_mark_missed(napi)) { + /* Call local_bh_enable to trigger SoftIRQ processing */ + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); } return 0; From ad5c318086e2e23b577eca33559c5ebf89bc7eb9 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 17:14:48 +0900 Subject: [PATCH 312/337] net: mv643xx_eth: fix an OF node reference leak Current implementation of mv643xx_eth_shared_of_add_port() calls of_parse_phandle(), but does not release the refcount on error. Call of_node_put() in the error path and in mv643xx_eth_shared_of_remove(). This bug was found by an experimental verification tool that I am developing. 
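The underlying rule is that every successful of_parse_phandle() takes a reference that must eventually be dropped with of_node_put(), including on error paths; a generic sketch of the balanced pattern (register_port() is a hypothetical consumer, not the driver's exact code):

	struct device_node *phy = of_parse_phandle(np, "phy-handle", 0);
	int ret;

	ret = register_port(phy);	/* hypothetical consumer that keeps phy */
	if (ret)
		of_node_put(phy);	/* error: drop the ref taken above;
					 * on success it is dropped at teardown */
	return ret;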
Fixes: 76723bca2802 ("net: mv643xx_eth: add DT parsing support") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241221081448.3313163-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mv643xx_eth.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index a06048719e84..67a6ff07c83d 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2704,9 +2704,15 @@ static struct platform_device *port_platdev[3]; static void mv643xx_eth_shared_of_remove(void) { + struct mv643xx_eth_platform_data *pd; int n; for (n = 0; n < 3; n++) { + if (!port_platdev[n]) + continue; + pd = dev_get_platdata(&port_platdev[n]->dev); + if (pd) + of_node_put(pd->phy_node); platform_device_del(port_platdev[n]); port_platdev[n] = NULL; } @@ -2769,8 +2775,10 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, } ppdev = platform_device_alloc(MV643XX_ETH_NAME, dev_num); - if (!ppdev) - return -ENOMEM; + if (!ppdev) { + ret = -ENOMEM; + goto put_err; + } ppdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); ppdev->dev.of_node = pnp; @@ -2792,6 +2800,8 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, port_err: platform_device_put(ppdev); +put_err: + of_node_put(ppd.phy_node); return ret; } From cbb26f7d8451fe56ccac802c6db48d16240feebd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 21 Dec 2024 09:51:46 +0100 Subject: [PATCH 313/337] mptcp: fix TCP options overflow. Syzbot reported the following splat: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 UID: 0 PID: 5836 Comm: sshd Not tainted 6.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:_compound_head include/linux/page-flags.h:242 [inline] RIP: 0010:put_page+0x23/0x260 include/linux/mm.h:1552 Code: 90 90 90 90 90 90 90 55 41 57 41 56 53 49 89 fe 48 bd 00 00 00 00 00 fc ff df e8 f8 5e 12 f8 49 8d 5e 08 48 89 d8 48 c1 e8 03 <80> 3c 28 00 74 08 48 89 df e8 8f c7 78 f8 48 8b 1b 48 89 de 48 83 RSP: 0000:ffffc90003916c90 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000008 RCX: ffff888030458000 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff898ca81d R09: 1ffff110054414ac R10: dffffc0000000000 R11: ffffed10054414ad R12: 0000000000000007 R13: ffff88802a20a542 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f34f496e800(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f9d6ec9ec28 CR3: 000000004d260000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_page_unref include/linux/skbuff_ref.h:43 [inline] __skb_frag_unref include/linux/skbuff_ref.h:56 [inline] skb_release_data+0x483/0x8a0 net/core/skbuff.c:1119 skb_release_all net/core/skbuff.c:1190 [inline] __kfree_skb+0x55/0x70 net/core/skbuff.c:1204 tcp_clean_rtx_queue net/ipv4/tcp_input.c:3436 [inline] tcp_ack+0x2442/0x6bc0 net/ipv4/tcp_input.c:4032 tcp_rcv_state_process+0x8eb/0x44e0 net/ipv4/tcp_input.c:6805 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x2dc0/0x37f0 
net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5672 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5785 process_backlog+0x662/0x15b0 net/core/dev.c:6117 __napi_poll+0xcb/0x490 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:7074 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0xf7/0x220 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0x57/0xc0 arch/x86/kernel/apic/apic.c:1049 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0033:0x7f34f4519ad5 Code: 85 d2 74 0d 0f 10 02 48 8d 54 24 20 0f 11 44 24 20 64 8b 04 25 18 00 00 00 85 c0 75 27 41 b8 08 00 00 00 b8 0f 01 00 00 0f 05 <48> 3d 00 f0 ff ff 76 75 48 8b 15 24 73 0d 00 f7 d8 64 89 02 48 83 RSP: 002b:00007ffec5b32ce0 EFLAGS: 00000246 RAX: 0000000000000001 RBX: 00000000000668a0 RCX: 00007f34f4519ad5 RDX: 00007ffec5b32d00 RSI: 0000000000000004 RDI: 0000564f4bc6cae0 RBP: 0000564f4bc6b5a0 R08: 0000000000000008 R09: 0000000000000000 R10: 00007ffec5b32de8 R11: 0000000000000246 R12: 0000564f48ea8aa4 R13: 0000000000000001 R14: 0000564f48ea93e8 R15: 00007ffec5b32d68 Eric noted a probable shinfo->nr_frags corruption, which indeed occurs. The root cause is a buggy MPTCP option len computation in some circumstances: the ADD_ADDR option should be mutually exclusive with DSS since the blamed commit. Still, mptcp_established_options_add_addr() tries to set the relevant info in mptcp_out_options, if the remaining space is large enough even when DSS is present. Since the ADD_ADDR infos and the DSS share the same union fields, adding first corrupts the latter. In the worst-case scenario, such corruption increases the DSS binary layout, exceeding the computed length and possibly overwriting the skb shared info. Address the issue by enforcing mutual exclusion in mptcp_established_options_add_addr(), too. Cc: stable@vger.kernel.org Reported-by: syzbot+38a095a81f30d82884c1@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/538 Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/025d9df8cde3c9a557befc47e9bc08fbbe3476e5.1734771049.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1603b3702e22..a62bc874bf1e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -667,8 +667,15 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * &echo, &drop_other_suboptions)) return false; + /* + * Later on, mptcp_write_options() will enforce mutually exclusion with + * DSS, bail out if such option is set and we can't drop it. 
+ */ if (drop_other_suboptions) remaining += opt_size; + else if (opts->suboptions & OPTION_MPTCP_DSS) + return false; + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; From 03c8d0af2e409e15c16130b185e12b5efba0a6b9 Mon Sep 17 00:00:00 2001 From: Pascal Hambourg Date: Mon, 23 Dec 2024 17:44:01 +0100 Subject: [PATCH 314/337] sky2: Add device ID 11ab:4373 for Marvell 88E8075 A Marvell 88E8075 ethernet controller has this device ID instead of 11ab:4370 and works fine with the sky2 driver. Signed-off-by: Pascal Hambourg Cc: stable@vger.kernel.org Link: https://patch.msgid.link/10165a62-99fb-4be6-8c64-84afd6234085@plouf.fr.eu.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/sky2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 3914cd9210d4..988fa28cfb5f 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -130,6 +130,7 @@ static const struct pci_device_id sky2_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436C) }, /* 88E8072 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436D) }, /* 88E8055 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4370) }, /* 88E8075 */ + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4373) }, /* 88E8075 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4380) }, /* 88E8057 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4381) }, /* 88E8059 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4382) }, /* 88E8079 */ From 4f619d518db9cd1a933c3a095a5f95d0c1584ae8 Mon Sep 17 00:00:00 2001 From: Jinjian Song Date: Tue, 24 Dec 2024 12:15:52 +0800 Subject: [PATCH 315/337] net: wwan: t7xx: Fix FSM command timeout issue When driver processes the internal state change command, it use an asynchronous thread to process the command operation. If the main thread detects that the task has timed out, the asynchronous thread will panic when executing the completion notification because the main thread completion object has been released. BUG: unable to handle page fault for address: fffffffffffffff8 PGD 1f283a067 P4D 1f283a067 PUD 1f283c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:complete_all+0x3e/0xa0 [...] Call Trace: ? __die_body+0x68/0xb0 ? page_fault_oops+0x379/0x3e0 ? exc_page_fault+0x69/0xa0 ? asm_exc_page_fault+0x22/0x30 ? complete_all+0x3e/0xa0 fsm_main_thread+0xa3/0x9c0 [mtk_t7xx (HASH:1400 5)] ? __pfx_autoremove_wake_function+0x10/0x10 kthread+0xd8/0x110 ? __pfx_fsm_main_thread+0x10/0x10 [mtk_t7xx (HASH:1400 5)] ? __pfx_kthread+0x10/0x10 ret_from_fork+0x38/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 [...] 
CR2: fffffffffffffff8 ---[ end trace 0000000000000000 ]--- Use the reference counter to ensure safe release as Sergey suggests: https://lore.kernel.org/all/da90f64c-260a-4329-87bf-1f9ff20a5951@gmail.com/ Fixes: 13e920d93e37 ("net: wwan: t7xx: Add core components") Signed-off-by: Jinjian Song Acked-by: Sergey Ryazanov Link: https://patch.msgid.link/20241224041552.8711-1-jinjian.song@fibocom.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_state_monitor.c | 26 ++++++++++++++-------- drivers/net/wwan/t7xx/t7xx_state_monitor.h | 5 +++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.c b/drivers/net/wwan/t7xx/t7xx_state_monitor.c index 3931c7a13f5a..cbdbb91e8381 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.c +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.c @@ -104,14 +104,21 @@ void t7xx_fsm_broadcast_state(struct t7xx_fsm_ctl *ctl, enum md_state state) fsm_state_notify(ctl->md, state); } +static void fsm_release_command(struct kref *ref) +{ + struct t7xx_fsm_command *cmd = container_of(ref, typeof(*cmd), refcnt); + + kfree(cmd); +} + static void fsm_finish_command(struct t7xx_fsm_ctl *ctl, struct t7xx_fsm_command *cmd, int result) { if (cmd->flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - *cmd->ret = result; - complete_all(cmd->done); + cmd->result = result; + complete_all(&cmd->done); } - kfree(cmd); + kref_put(&cmd->refcnt, fsm_release_command); } static void fsm_del_kf_event(struct t7xx_fsm_event *event) @@ -475,7 +482,6 @@ static int fsm_main_thread(void *data) int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id, unsigned int flag) { - DECLARE_COMPLETION_ONSTACK(done); struct t7xx_fsm_command *cmd; unsigned long flags; int ret; @@ -487,11 +493,13 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id INIT_LIST_HEAD(&cmd->entry); cmd->cmd_id = cmd_id; cmd->flag = flag; + kref_init(&cmd->refcnt); if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - cmd->done = &done; - cmd->ret = &ret; + init_completion(&cmd->done); + kref_get(&cmd->refcnt); } + kref_get(&cmd->refcnt); spin_lock_irqsave(&ctl->command_lock, flags); list_add_tail(&cmd->entry, &ctl->command_queue); spin_unlock_irqrestore(&ctl->command_lock, flags); @@ -501,11 +509,11 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { unsigned long wait_ret; - wait_ret = wait_for_completion_timeout(&done, + wait_ret = wait_for_completion_timeout(&cmd->done, msecs_to_jiffies(FSM_CMD_TIMEOUT_MS)); - if (!wait_ret) - return -ETIMEDOUT; + ret = wait_ret ? 
cmd->result : -ETIMEDOUT; + kref_put(&cmd->refcnt, fsm_release_command); return ret; } diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.h b/drivers/net/wwan/t7xx/t7xx_state_monitor.h index 7b0a9baf488c..6e0601bb752e 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.h +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.h @@ -110,8 +110,9 @@ struct t7xx_fsm_command { struct list_head entry; enum t7xx_fsm_cmd_state cmd_id; unsigned int flag; - struct completion *done; - int *ret; + struct completion done; + int result; + struct kref refcnt; }; struct t7xx_fsm_notifier { From afc6717628f959941d7b33728570568b4af1c4b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 31 Dec 2024 00:06:46 -0500 Subject: [PATCH 316/337] tracing: Have process_string() also allow arrays In order to catch a common bug where a TRACE_EVENT() TP_fast_assign() assigns an address of an allocated string to the ring buffer and then references it in TP_printk(), which can be executed hours later when the string is free, the function test_event_printk() runs on all events as they are registered to make sure there's no unwanted dereferencing. It calls process_string() to handle cases in TP_printk() format that has "%s". It returns whether or not the string is safe. But it can have some false positives. For instance, xe_bo_move() has: TP_printk("move_lacks_source:%s, migrate object %p [size %zu] from %s to %s device_id:%s", __entry->move_lacks_source ? "yes" : "no", __entry->bo, __entry->size, xe_mem_type_to_name[__entry->old_placement], xe_mem_type_to_name[__entry->new_placement], __get_str(device_id)) Where the "%s" references into xe_mem_type_to_name[]. This is an array of pointers that should be safe for the event to access. Instead of flagging this as a bad reference, if a reference points to an array, where the record field is the index, consider it safe. Link: https://lore.kernel.org/all/9dee19b6185d325d0e6fa5f7cbba81d007d99166.camel@sapience.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241231000646.324fb5f7@gandalf.local.home Fixes: 65a25d9f7ac02 ("tracing: Add "%s" check in test_event_printk()") Reported-by: Genes Lists Tested-by: Gene C Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1545cc8b49d0..770e7ed91716 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -364,6 +364,18 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca s = r + 1; } while (s < e); + /* + * Check for arrays. If the argument has: foo[REC->val] + * then it is very likely that foo is an array of strings + * that are safe to use. + */ + r = strstr(s, "["); + if (r && r < e) { + r = strstr(r, "REC->"); + if (r && r < e) + return true; + } + /* * If there's any strings in the argument consider this arg OK as it * could be: REC->field ? "foo" : "bar" and we don't want to get into From 7b509910b3ad6d7aacead24c8744de10daf8715d Mon Sep 17 00:00:00 2001 From: Daniel Schaefer Date: Tue, 31 Dec 2024 12:59:58 +0800 Subject: [PATCH 317/337] ALSA hda/realtek: Add quirk for Framework F111:000C Similar to commit eb91c456f371 ("ALSA: hda/realtek: Add Framework Laptop 13 (Intel Core Ultra) to quirks") and previous quirks for Framework systems with Realtek codecs. 000C is a new platform that will also have an ALC285 codec and needs the same quirk. 
Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux@frame.work Cc: Dustin L. Howett Signed-off-by: Daniel Schaefer Cc: Link: https://patch.msgid.link/20241231045958.14545-1-dhs@frame.work Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 61ba5dc35b8b..b74b566f675e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11009,6 +11009,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. From fb514b31395946022f13a08e06a435f53cf9e8b3 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 31 Dec 2024 09:34:16 +0800 Subject: [PATCH 318/337] RDMA/rtrs: Ensure 'ib_sge list' is accessible Move the declaration of the 'ib_sge list' variable outside the 'always_invalidate' block to ensure it remains accessible for use throughout the function. Previously, 'ib_sge list' was declared within the 'always_invalidate' block, limiting its accessibility, then caused a 'BUG: kernel NULL pointer dereference'[1]. ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15a/0x2d0 ? search_module_extables+0x19/0x60 ? search_bpf_extables+0x5f/0x80 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? memcpy_orig+0xd5/0x140 rxe_mr_copy+0x1c3/0x200 [rdma_rxe] ? rxe_pool_get_index+0x4b/0x80 [rdma_rxe] copy_data+0xa5/0x230 [rdma_rxe] rxe_requester+0xd9b/0xf70 [rdma_rxe] ? finish_task_switch.isra.0+0x99/0x2e0 rxe_sender+0x13/0x40 [rdma_rxe] do_task+0x68/0x1e0 [rdma_rxe] process_one_work+0x177/0x330 worker_thread+0x252/0x390 ? __pfx_worker_thread+0x10/0x10 This change ensures the variable is available for subsequent operations that require it. [1] https://lore.kernel.org/linux-rdma/6a1f3e8f-deb0-49f9-bc69-a9b03ecfcda7@fujitsu.com/ Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241231013416.1290920-1-lizhijian@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index e83d95647852..ef4abdea3c2d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -349,6 +349,7 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, struct rtrs_srv_mr *srv_mr; bool need_inval = false; enum ib_send_flags flags; + struct ib_sge list; u32 imm; int err; @@ -401,7 +402,6 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, imm = rtrs_to_io_rsp_imm(id->msg_id, errno, need_inval); imm_wr.wr.next = NULL; if (always_invalidate) { - struct ib_sge list; struct rtrs_msg_rkey_rsp *msg; srv_mr = &srv_path->mrs[id->msg_id]; From e6178bf78d0378c2d397a6aafaf4882d0af643fa Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 31 Dec 2024 08:20:08 +0530 Subject: [PATCH 319/337] RDMA/bnxt_re: Fix error recovery sequence Fixed to return ENXIO from __send_message_basic_sanity() to indicate that device is in error state. 
In the case of ERR_DEVICE_DETACHED state, the driver should not post the commands to the firmware as it will time out eventually. Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() as it is a no-op. Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected") Signed-off-by: Kalesh AP Signed-off-by: Kashyap Desai Link: https://patch.msgid.link/20241231025008.2267162-1-kalesh-anakkur.purayil@broadcom.com Reviewed-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 8 +------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 5 +++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b7af0d5ff3b6..c143f273b759 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) { - int mask = IB_QP_STATE; - struct ib_qp_attr qp_attr; struct bnxt_re_qp *qp; - qp_attr.qp_state = IB_QPS_ERR; mutex_lock(&rdev->qp_lock); list_for_each_entry(qp, &rdev->qp_list, list) { /* Modify the state of all QPs except QP1/Shadow QP */ @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) if (qp->qplib_qp.state != CMDQ_MODIFY_QP_NEW_STATE_RESET && qp->qplib_qp.state != - CMDQ_MODIFY_QP_NEW_STATE_ERR) { + CMDQ_MODIFY_QP_NEW_STATE_ERR) bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, 1, IB_EVENT_QP_FATAL); - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, - NULL); - } } } mutex_unlock(&rdev->qp_lock); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 5e90ea232de8..17e62f22683b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -424,7 +424,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return bnxt_qplib_map_rc(opcode); + return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __send_message_basic_sanity(rcfw, msg, opcode); if (rc) - return rc; + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; rc = __send_message(rcfw, msg, opcode); if (rc) From 8765429279e7d3d68d39ace5f84af2815174bb1e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 31 Dec 2024 15:53:58 +0100 Subject: [PATCH 320/337] ALSA: seq: Check UMP support for midi_version change When the kernel is built without UMP support but a user-space app requires the midi_version > 0, the kernel should return an error. Otherwise user-space assumes as if it were possible to deal, eventually hitting serious errors later. 
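For illustration, a minimal (hypothetical) user-space sketch of the negotiation that should now fail cleanly on a kernel built without CONFIG_SND_SEQ_UMP; the ioctls and fields are the regular sound/asequencer.h uapi ones, the program itself is not part of this change:

```
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sound/asequencer.h>

int main(void)
{
	struct snd_seq_client_info info = { 0 };
	int pver = SNDRV_PROTOCOL_VERSION(1, 0, 3);
	int fd = open("/dev/snd/seq", O_RDWR);

	if (fd < 0)
		return 1;

	/* opt in to the protocol level that carries midi_version */
	ioctl(fd, SNDRV_SEQ_IOCTL_USER_PVERSION, &pver);
	ioctl(fd, SNDRV_SEQ_IOCTL_CLIENT_ID, &info.client);
	ioctl(fd, SNDRV_SEQ_IOCTL_GET_CLIENT_INFO, &info);

	info.midi_version = SNDRV_SEQ_CLIENT_UMP_MIDI_2_0;
	if (ioctl(fd, SNDRV_SEQ_IOCTL_SET_CLIENT_INFO, &info) < 0)
		perror("UMP rejected");	/* expected without CONFIG_SND_SEQ_UMP */

	return 0;
}
```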
Fixes: 46397622a3fa ("ALSA: seq: Add UMP support") Cc: Link: https://patch.msgid.link/20241231145358.21946-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 3930e2f9082f..77b6ac9b5c11 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1275,10 +1275,16 @@ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, if (client->type != client_info->type) return -EINVAL; - /* check validity of midi_version field */ - if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3) && - client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) - return -EINVAL; + if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) { + /* check validity of midi_version field */ + if (client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) + return -EINVAL; + + /* check if UMP is supported in kernel */ + if (!IS_ENABLED(CONFIG_SND_SEQ_UMP) && + client_info->midi_version > 0) + return -EINVAL; + } /* fill the info fields */ if (client_info->name[0]) From f0ed39830e6064d62f9c5393505677a26569bb56 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 20 Dec 2024 09:19:18 -0800 Subject: [PATCH 321/337] xe/oa: Fix query mode of operation for OAR/OAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a set of squashed commits to facilitate smooth applying to stable. Each commit message is retained for reference. 1) Allow a GGTT mapped batch to be submitted to user exec queue For a OA use case, one of the HW registers needs to be modified by submitting an MI_LOAD_REGISTER_IMM command to the users exec queue, so that the register is modified in the user's hardware context. In order to do this a batch that is mapped in GGTT, needs to be submitted to the user exec queue. Since all user submissions use q->vm and hence PPGTT, add some plumbing to enable submission of batches mapped in GGTT. v2: ggtt is zero-initialized, so no need to set it false (Matt Brost) 2) xe/oa: Use MI_LOAD_REGISTER_IMMEDIATE to enable OAR/OAC To enable OAR/OAC, a bit in RING_CONTEXT_CONTROL needs to be set. Setting this bit cause the context image size to change and if not done correct, can cause undesired hangs. Current code uses a separate exec_queue to modify this bit and is error-prone. As per HW recommendation, submit MI_LOAD_REGISTER_IMM to the target hardware context to modify the relevant bit. In v2 version, an attempt to submit everything to the user-queue was made, but it failed the unprivileged-single-ctx-counters test. It appears that the OACTXCONTROL must be modified from a remote context. In v3 version, all context specific register configurations were moved to use LOAD_REGISTER_IMMEDIATE and that seems to work well. This is a cleaner way, since we can now submit all configuration to user exec_queue and the fence handling is simplified. 
v2: (Matt) - set job->ggtt to true if create job is successful - unlock vm on job error (Ashutosh) - don't wait on job submission - use kernel exec queue where possible v3: (Ashutosh) - Fix checkpatch issues - Remove extra spaces/new-lines - Add Fixes: and Cc: tags - Reset context control bit when OA stream is closed - Submit all config via MI_LOAD_REGISTER_IMMEDIATE (Umesh) - Update commit message for v3 experiment - Squash patches for easier port to stable v4: (Ashutosh) - No need to pass q to xe_oa_submit_bb - Do not support exec queues with width > 1 - Fix disabling of CTX_CTRL_OAC_CONTEXT_ENABLE v5: (Ashutosh) - Drop reg_lri related comments - Use XE_OA_SUBMIT_NO_DEPS in xe_oa_load_with_lri Fixes: 8135f1c09dd2 ("drm/xe/oa: Don't reset OAC_CONTEXT_ENABLE on OA stream close") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost # commit 1 Reviewed-by: Ashutosh Dixit Cc: stable@vger.kernel.org Reviewed-by: Jonathan Cavitt Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20241220171919.571528-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit 55039832f98c7e05f1cf9e0d8c12b2490abd0f16) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_oa.c | 134 ++++++++---------------- drivers/gpu/drm/xe/xe_ring_ops.c | 5 +- drivers/gpu/drm/xe/xe_sched_job_types.h | 2 + 3 files changed, 51 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 8dd55798ab31..5cc0f6f9bc11 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -74,12 +74,6 @@ struct xe_oa_config { struct rcu_head rcu; }; -struct flex { - struct xe_reg reg; - u32 offset; - u32 value; -}; - struct xe_oa_open_param { struct xe_file *xef; u32 oa_unit_id; @@ -596,19 +590,38 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait) return ret; } +static void xe_oa_lock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + down_read(&q->vm->lock); + xe_vm_lock(q->vm, false); + } +} + +static void xe_oa_unlock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + xe_vm_unlock(q->vm); + up_read(&q->vm->lock); + } +} + static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa_submit_deps deps, struct xe_bb *bb) { + struct xe_exec_queue *q = stream->exec_q ?: stream->k_exec_q; struct xe_sched_job *job; struct dma_fence *fence; int err = 0; - /* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */ - job = xe_bb_create_job(stream->k_exec_q, bb); + xe_oa_lock_vma(q); + + job = xe_bb_create_job(q, bb); if (IS_ERR(job)) { err = PTR_ERR(job); goto exit; } + job->ggtt = true; if (deps == XE_OA_SUBMIT_ADD_DEPS) { for (int i = 0; i < stream->num_syncs && !err; i++) @@ -623,10 +636,13 @@ static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa fence = dma_fence_get(&job->drm.s_fence->finished); xe_sched_job_push(job); + xe_oa_unlock_vma(q); + return fence; err_put_job: xe_sched_job_put(job); exit: + xe_oa_unlock_vma(q); return ERR_PTR(err); } @@ -675,63 +691,19 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream) dma_fence_put(stream->last_fence); } -static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, - struct xe_bb *bb, const struct flex *flex, u32 count) -{ - u32 offset = xe_bo_ggtt_addr(lrc->bo); - - do { - bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); - bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); - bb->cs[bb->len++] = 0; - bb->cs[bb->len++] = flex->value; - - } while (flex++, --count); -} 
- -static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lrc, - const struct flex *flex, u32 count) +static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri, u32 count) { struct dma_fence *fence; struct xe_bb *bb; int err; - bb = xe_bb_new(stream->gt, 4 * count, false); + bb = xe_bb_new(stream->gt, 2 * count + 1, false); if (IS_ERR(bb)) { err = PTR_ERR(bb); goto exit; } - xe_oa_store_flex(stream, lrc, bb, flex, count); - - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); - if (IS_ERR(fence)) { - err = PTR_ERR(fence); - goto free_bb; - } - xe_bb_free(bb, fence); - dma_fence_put(fence); - - return 0; -free_bb: - xe_bb_free(bb, NULL); -exit: - return err; -} - -static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri) -{ - struct dma_fence *fence; - struct xe_bb *bb; - int err; - - bb = xe_bb_new(stream->gt, 3, false); - if (IS_ERR(bb)) { - err = PTR_ERR(bb); - goto exit; - } - - write_cs_mi_lri(bb, reg_lri, 1); + write_cs_mi_lri(bb, reg_lri, count); fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); if (IS_ERR(fence)) { @@ -751,71 +723,55 @@ exit: static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, + { + OAR_OACONTROL, + oacontrol, + }, { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE), + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) }, }; - struct xe_oa_reg reg_lri = { OAR_OACONTROL, oacontrol }; - int err; - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, ®_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, + { + OAC_OACONTROL, + oacontrol + }, { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE) | + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) | _MASKED_FIELD(CTX_CTRL_RUN_ALONE, enable ? 
CTX_CTRL_RUN_ALONE : 0), }, }; - struct xe_oa_reg reg_lri = { OAC_OACONTROL, oacontrol }; - int err; /* Set ccs select to enable programming of OAC_OACONTROL */ xe_mmio_write32(&stream->gt->mmio, __oa_regs(stream)->oa_ctrl, __oa_ccs_select(stream)); - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, ®_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oa_context(struct xe_oa_stream *stream, bool enable) @@ -2066,8 +2022,8 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f if (XE_IOCTL_DBG(oa->xe, !param.exec_q)) return -ENOENT; - if (param.exec_q->width > 1) - drm_dbg(&oa->xe->drm, "exec_q->width > 1, programming only exec_q->lrc[0]\n"); + if (XE_IOCTL_DBG(oa->xe, param.exec_q->width > 1)) + return -EOPNOTSUPP; } /* diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index 0be4f489d3e1..9f327f27c072 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -221,7 +221,10 @@ static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw, static u32 get_ppgtt_flag(struct xe_sched_job *job) { - return job->q->vm ? BIT(8) : 0; + if (job->q->vm && !job->ggtt) + return BIT(8); + + return 0; } static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i) diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index f13f333f00be..d942b20a9f29 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -56,6 +56,8 @@ struct xe_sched_job { u32 migrate_flush_flags; /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ bool ring_ops_flush_tlb; + /** @ggtt: mapped in ggtt. */ + bool ggtt; /** @ptrs: per instance pointers. */ struct xe_job_ptrs ptrs[]; }; From 0bc21e701a6ffacfdde7f04f87d664d82e8a13bf Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Thu, 2 Jan 2025 06:30:03 -0800 Subject: [PATCH 322/337] MAINTAINERS: Remove Olof from SoC maintainers I haven't been an active participant for a couple of years now, and after discussions at Linux Plumbers in 2024, Arnd is getting fresh help from a few more participants. It's time to remove myself, and spare myself from patches and pull requests in my inbox. Signed-off-by: Olof Johansson Cc: Arnd Bergmann Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a..c575de4903db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1797,7 +1797,6 @@ F: include/uapi/linux/if_arcnet.h ARM AND ARM64 SoC SUB-ARCHITECTURES (COMMON PARTS) M: Arnd Bergmann -M: Olof Johansson L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: soc@lists.linux.dev S: Maintained From d65474033740ded0a4fe9a097fce72328655b41d Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Tue, 31 Dec 2024 11:37:31 +0000 Subject: [PATCH 323/337] fgraph: Add READ_ONCE() when accessing fgraph_array[] In __ftrace_return_to_handler(), a loop iterates over the fgraph_array[] elements, which are fgraph_ops. The loop checks if an element is a fgraph_stub to prevent using a fgraph_stub afterward. However, if the compiler reloads fgraph_array[] after this check, it might race with an update to fgraph_array[] that introduces a fgraph_stub. 
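Spelled out, the window looks roughly like this (an illustration of what the compiler is allowed to emit, not actual generated code):

```
/*
 * Without READ_ONCE() the compiler may legally re-load the array slot
 * after the stub check:
 *
 *	if (fgraph_array[i] == &fgraph_stub)	// load #1: not a stub yet
 *		continue;
 *	... use fgraph_array[i] ...		// load #2: now the stub
 *
 * so the gops that gets used is not the gops that was checked.
 * READ_ONCE(fgraph_array[i]) forces a single load.
 */
```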
This could result in the stub being processed, but the stub contains a null "func_hash" field, leading to a NULL pointer dereference. To ensure that the gops compared against the fgraph_stub matches the gops processed later, add a READ_ONCE(). A similar patch appears in commit 63a8dfb ("function_graph: Add READ_ONCE() when accessing fgraph_array[]"). Cc: stable@vger.kernel.org Fixes: 37238abe3cb47 ("ftrace/function_graph: Pass fgraph_ops to function graph callbacks") Link: https://lore.kernel.org/20241231113731.277668-1-zilin@seu.edu.cn Signed-off-by: Zilin Guan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index ddedcb50917f..30e3ddc8a8a8 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -833,7 +833,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs #endif { for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) { - struct fgraph_ops *gops = fgraph_array[i]; + struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]); if (gops == &fgraph_stub) continue; From 789a8cff8d2dbe4b5c617c3004b5eb63fa7a3b35 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Thu, 2 Jan 2025 04:08:20 +0900 Subject: [PATCH 324/337] ftrace: Fix function profiler's filtering functionality Commit c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering"), function profiler (enabled via function_profile_enabled) has been showing statistics for all functions, ignoring set_ftrace_filter settings. While tracers are instantiated, the function profiler is not. Therefore, it should use the global set_ftrace_filter for consistency. This patch modifies the function profiler to use the global filter, fixing the filtering functionality. 
Before (filtering not working): ``` root@localhost:~# echo 'vfs*' > /sys/kernel/tracing/set_ftrace_filter root@localhost:~# echo 1 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# sleep 1 root@localhost:~# echo 0 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# head /sys/kernel/tracing/trace_stat/* Function Hit Time Avg s^2 -------- --- ---- --- --- schedule 314 22290594 us 70989.15 us 40372231 us x64_sys_call 1527 8762510 us 5738.382 us 3414354 us schedule_hrtimeout_range 176 8665356 us 49234.98 us 405618876 us __x64_sys_ppoll 324 5656635 us 17458.75 us 19203976 us do_sys_poll 324 5653747 us 17449.83 us 19214945 us schedule_timeout 67 5531396 us 82558.15 us 2136740827 us __x64_sys_pselect6 12 3029540 us 252461.7 us 63296940171 us do_pselect.constprop.0 12 3029532 us 252461.0 us 63296952931 us ``` After (filtering working): ``` root@localhost:~# echo 'vfs*' > /sys/kernel/tracing/set_ftrace_filter root@localhost:~# echo 1 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# sleep 1 root@localhost:~# echo 0 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# head /sys/kernel/tracing/trace_stat/* Function Hit Time Avg s^2 -------- --- ---- --- --- vfs_write 462 68476.43 us 148.217 us 25874.48 us vfs_read 641 9611.356 us 14.994 us 28868.07 us vfs_fstat 890 878.094 us 0.986 us 1.667 us vfs_fstatat 227 757.176 us 3.335 us 18.928 us vfs_statx 226 610.610 us 2.701 us 17.749 us vfs_getattr_nosec 1187 460.919 us 0.388 us 0.326 us vfs_statx_path 297 343.287 us 1.155 us 11.116 us vfs_rename 6 291.575 us 48.595 us 9889.236 us ``` Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250101190820.72534-1-enjuk@amazon.com Fixes: c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering") Signed-off-by: Kohei Enju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b17efb1a87d..2e113f8b13a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -902,16 +902,13 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, } static struct fgraph_ops fprofiler_ops = { - .ops = { - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(fprofiler_ops.ops) - }, .entryfunc = &profile_graph_entry, .retfunc = &profile_graph_return, }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&fprofiler_ops.ops); return register_ftrace_graph(&fprofiler_ops); } @@ -922,12 +919,11 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&ftrace_profile_ops); return register_ftrace_function(&ftrace_profile_ops); } From a8620de72e5676993ec3a3b975f7c10908f5f60f Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Mon, 30 Dec 2024 17:37:09 +0800 Subject: [PATCH 325/337] net: sfc: Correct key_len for efx_tc_ct_zone_ht_params In efx_tc_ct_zone_ht_params, the key_len was previously set to offsetof(struct efx_tc_ct_zone, linkage). This calculation is incorrect because it includes any padding between the zone field and the linkage field due to structure alignment, which can vary between systems. This patch updates key_len to use sizeof_field(struct efx_tc_ct_zone, zone) , ensuring that the hash table correctly uses the zone as the key. 
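For illustration, with an approximated field layout (not the literal driver struct), the difference between the two key lengths is:

```
struct efx_tc_ct_zone {
	u16 zone;                  /* the intended 2-byte key              */
	/* implicit padding up to the alignment of the next member         */
	struct rhash_head linkage; /* offsetof(..., linkage) lands here    */
	/* ... */
};

/* old: .key_len = offsetof(struct efx_tc_ct_zone, linkage)
 *      -> zone plus the padding bytes are hashed and compared
 * new: .key_len = sizeof_field(struct efx_tc_ct_zone, zone)
 *      -> only the zone id participates in the lookup
 */
```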
This fix prevents potential hash lookup errors and improves connection tracking reliability. Fixes: c3bb5c6acd4e ("sfc: functions to register for conntrack zone offload") Signed-off-by: Liang Jie Acked-by: Edward Cree Link: https://patch.msgid.link/20241230093709.3226854-1-buaajxlj@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc_conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/tc_conntrack.c b/drivers/net/ethernet/sfc/tc_conntrack.c index d90206f27161..c0603f54cec3 100644 --- a/drivers/net/ethernet/sfc/tc_conntrack.c +++ b/drivers/net/ethernet/sfc/tc_conntrack.c @@ -16,7 +16,7 @@ static int efx_tc_flow_block(enum tc_setup_type type, void *type_data, void *cb_priv); static const struct rhashtable_params efx_tc_ct_zone_ht_params = { - .key_len = offsetof(struct efx_tc_ct_zone, linkage), + .key_len = sizeof_field(struct efx_tc_ct_zone, zone), .key_offset = 0, .head_offset = offsetof(struct efx_tc_ct_zone, linkage), }; From 68e068cabd2c6c533ef934c2e5151609cf6ecc6d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jan 2025 11:47:40 -0500 Subject: [PATCH 326/337] net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets The blamed commit disabled hardware offoad of IPv6 packets with extension headers on devices that advertise NETIF_F_IPV6_CSUM, based on the definition of that feature in skbuff.h: * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). The change causes skb_warn_bad_offload to fire for BIG TCP packets. [ 496.310233] WARNING: CPU: 13 PID: 23472 at net/core/dev.c:3129 skb_warn_bad_offload+0xc4/0xe0 [ 496.310297] ? skb_warn_bad_offload+0xc4/0xe0 [ 496.310300] skb_checksum_help+0x129/0x1f0 [ 496.310303] skb_csum_hwoffload_help+0x150/0x1b0 [ 496.310306] validate_xmit_skb+0x159/0x270 [ 496.310309] validate_xmit_skb_list+0x41/0x70 [ 496.310312] sch_direct_xmit+0x5c/0x250 [ 496.310317] __qdisc_run+0x388/0x620 BIG TCP introduced an IPV6_TLV_JUMBO IPv6 extension header to communicate packet length, as this is an IPv6 jumbogram. But, the feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices. For this specific case of extension headers that are not transmitted, return to the situation before the blamed commit and support hardware offload. ipv6_has_hopopt_jumbo() tests not only whether this header is present, but also that it is the only extension header before a terminal (L4) header. 
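For reference, the extension header in question is the RFC 2675 jumbo hop-by-hop option that the stack inserts itself for BIG TCP; a sketch of its 8-byte layout (field names approximate):

```
/* Recognised by ipv6_has_hopopt_jumbo(); added locally for BIG TCP and
 * removed again before the packet can reach a physical device, so the
 * device never has to checksum across it.
 */
struct hop_jumbo_hdr {
	u8	nexthdr;		/* IPPROTO_TCP or IPPROTO_UDP   */
	u8	hdrlen;			/* 0 => 8 bytes of options      */
	u8	tlv_type;		/* IPV6_TLV_JUMBO (0xc2)        */
	u8	tlv_len;		/* 4                            */
	__be32	jumbo_payload_len;	/* the > 64 KiB payload length  */
};
```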
Fixes: 04c20a9356f2 ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: syzbot Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iK1hdC3Nt8KPhOtTF8vCPc1AHDCtse_BTNki1pWxAByTQ@mail.gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250101164909.1331680-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a64..faa23042df38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3642,8 +3642,10 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): From 5b0af621c3f6ef9261cf6067812f2fd9943acb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Dec 2024 16:05:27 +0000 Subject: [PATCH 327/337] net: restrict SO_REUSEPORT to inet sockets After blamed commit, crypto sockets could accidentally be destroyed from RCU call back, as spotted by zyzbot [1]. Trying to acquire a mutex in RCU callback is not allowed. Restrict SO_REUSEPORT socket option to inet sockets. v1 of this patch supported TCP, UDP and SCTP sockets, but fcnal-test.sh test needed RAW and ICMP support. [1] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:562 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24, name: ksoftirqd/1 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by ksoftirqd/1/24: #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2561 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_core+0xa37/0x17a0 kernel/rcu/tree.c:2823 Preemption disabled at: [] softirq_handle_begin kernel/softirq.c:402 [inline] [] handle_softirqs+0x128/0x9b0 kernel/softirq.c:537 CPU: 1 UID: 0 PID: 24 Comm: ksoftirqd/1 Not tainted 6.13.0-rc3-syzkaller-00174-ga024e377efed #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 __might_resched+0x5d4/0x780 kernel/sched/core.c:8758 __mutex_lock_common kernel/locking/mutex.c:562 [inline] __mutex_lock+0x131/0xee0 kernel/locking/mutex.c:735 crypto_put_default_null_skcipher+0x18/0x70 crypto/crypto_null.c:179 aead_release+0x3d/0x50 crypto/algif_aead.c:489 alg_do_release crypto/af_alg.c:118 [inline] alg_sock_destruct+0x86/0xc0 crypto/af_alg.c:502 __sk_destruct+0x58/0x5f0 net/core/sock.c:2260 rcu_do_batch kernel/rcu/tree.c:2567 [inline] rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 run_ksoftirqd+0xca/0x130 kernel/softirq.c:950 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 8c7138b33e5c ("net: Unpublish sk from sk_reuseport_cb before call_rcu") Reported-by: syzbot+b3e02953598f447d4d2a@syzkaller.appspotmail.com Closes: 
https://lore.kernel.org/netdev/6772f2f4.050a0220.2f3838.04cb.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241231160527.3994168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd00..be84885f9290 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1295,7 +1295,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); From a7af435df0e04cfb4a4004136d597c42639a2ae7 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Sun, 29 Dec 2024 17:46:58 +0100 Subject: [PATCH 328/337] net: wwan: iosm: Properly check for valid exec stage in ipc_mmio_init() ipc_mmio_init() used the post-decrement operator in its loop continuing condition of "retries" counter being "> 0", which meant that when this condition caused loop exit "retries" counter reached -1. But the later valid exec stage failure check only tests for "retries" counter being exactly zero, so it didn't trigger in this case (but would wrongly trigger if the code reaches a valid exec stage in the very last loop iteration). Fix this by using the pre-decrement operator instead, so the loop counter is exactly zero on valid exec stage failure. Fixes: dc0514f5d828 ("net: iosm: mmio scratchpad") Signed-off-by: Maciej S. Szmigiero Link: https://patch.msgid.link/8b19125a825f9dcdd81c667c1e5c48ba28d505a6.1735490770.git.mail@maciej.szmigiero.name Signed-off-by: Jakub Kicinski --- drivers/net/wwan/iosm/iosm_ipc_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_mmio.c b/drivers/net/wwan/iosm/iosm_ipc_mmio.c index 63eb08c43c05..6764c13530b9 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mmio.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mmio.c @@ -104,7 +104,7 @@ struct iosm_mmio *ipc_mmio_init(void __iomem *mmio, struct device *dev) break; msleep(20); - } while (retries-- > 0); + } while (--retries > 0); if (!retries) { dev_err(ipc_mmio->dev, "invalid exec stage %X", stage); From 77ee7a6d16b6ec07b5c3ae2b6b60a24c1afbed09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:03 +0000 Subject: [PATCH 329/337] af_packet: fix vlan_get_tci() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_tci() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8da482 len:32 put:14 head:ffff88807a1d5800 data:ffff88807a1d5810 tail:0x14 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 5880 Comm: syz-executor172 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 9e 6c 26 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 3a 5a 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc90003baf5b8 EFLAGS: 00010286 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 8565c1eec37aa000 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802616fb50 R08: ffffffff817f0a4c R09: 1ffff92000775e50 R10: dffffc0000000000 R11: fffff52000775e51 R12: 0000000000000140 R13: ffff88807a1d5800 R14: ffff88807a1d5810 R15: 0000000000000014 FS: 00007fa03261f6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd65753000 CR3: 0000000031720000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_tci+0x272/0x550 net/packet/af_packet.c:565 packet_recvmsg+0x13c9/0x1ef0 net/packet/af_packet.c:3616 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1066 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2814 ___sys_recvmsg net/socket.c:2856 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2951 __sys_recvmmsg net/socket.c:3025 [inline] __do_sys_recvmmsg net/socket.c:3048 [inline] __se_sys_recvmmsg net/socket.c:3041 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3041 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+8400677f3fd43f37d3bc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c6.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b66..e2e34a49e98d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,10 +538,8 @@ static void *packet_current_frame(struct packet_sock *po, return packet_lookup_frame(po, rb, rb->head, status); } -static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) +static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; struct vlan_hdr vhdr, *vh; unsigned int header_len; @@ -562,12 +560,8 @@ static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) else return 0; - skb_push(skb, skb->data - skb_mac_header(skb)); - vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } + vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, + sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 
+0000 Subject: [PATCH 330/337] af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- net/packet/af_packet.c | 16 ++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. 
* @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2e34a49e98d..2d73769d67f4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -568,21 +568,13 @@ static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) return ntohs(vh->h_vlan_TCI); } -static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; - if (unlikely(eth_type_vlan(proto))) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; - - skb_push(skb, skb->data - skb_mac_header(skb)); - proto = __vlan_get_protocol(skb, proto, NULL); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } - } + if (unlikely(eth_type_vlan(proto))) + proto = __vlan_get_protocol_offset(skb, proto, + skb_mac_offset(skb), NULL); return proto; } From 260466b576bca0081a7d4acecc8e93687aa22d0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:28:49 +0000 Subject: [PATCH 331/337] ila: serialize calls to nf_register_net_hooks() syzbot found a race in ila_add_mapping() [1] commit 031ae72825ce ("ila: call nf_unregister_net_hooks() sooner") attempted to fix a similar issue. Looking at the syzbot repro, we have concurrent ILA_CMD_ADD commands. Add a mutex to make sure at most one thread is calling nf_register_net_hooks(). 
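The window being closed looks roughly like this (illustration):

```
/*
 *   task A: ila_add_mapping()          task B: ila_add_mapping()
 *     sees !xlat.hooks_registered        sees !xlat.hooks_registered
 *     nf_register_net_hooks()            nf_register_net_hooks()
 *     hooks_registered = true            hooks_registered = true
 *
 * With the mutex (plus READ_ONCE()/WRITE_ONCE() on the flag) only one
 * task performs the registration; the other waits and then observes
 * the flag already set.
 */
```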
[1] BUG: KASAN: slab-use-after-free in rht_key_hashfn include/linux/rhashtable.h:159 [inline] BUG: KASAN: slab-use-after-free in __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 Read of size 4 at addr ffff888028f40008 by task dhcpcd/5501 CPU: 1 UID: 0 PID: 5501 Comm: dhcpcd Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 rht_key_hashfn include/linux/rhashtable.h:159 [inline] __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] ila_lookup_wildcards net/ipv6/ila/ila_xlat.c:127 [inline] ila_xlat_addr net/ipv6/ila/ila_xlat.c:652 [inline] ila_nf_input+0x1ee/0x620 net/ipv6/ila/ila_xlat.c:185 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xbb/0x200 net/netfilter/core.c:626 nf_hook.constprop.0+0x42e/0x750 include/linux/netfilter.h:269 NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0xa4/0x680 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x12e/0x1e0 net/core/dev.c:5672 __netif_receive_skb+0x1d/0x160 net/core/dev.c:5785 process_backlog+0x443/0x15f0 net/core/dev.c:6117 __napi_poll.constprop.0+0xb7/0x550 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0xa94/0x1010 net/core/dev.c:7074 handle_softirqs+0x213/0x8f0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0x109/0x170 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 7f00feaf1076 ("ila: Add generic ILA translation facility") Reported-by: syzbot+47e761d22ecf745f72b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c9ae.050a0220.2f3838.04c7.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Tom Herbert Link: https://patch.msgid.link/20241230162849.2795486-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila_xlat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 7646e401c630..1d41b2ab4884 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -195,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -202,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. 
*/ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); From 449e6912a2522af672e99992e1201a454910864e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:30 +0100 Subject: [PATCH 332/337] mptcp: fix recvbuffer adjust on sleeping rcvmsg If the recvmsg() blocks after receiving some data - i.e. due to SO_RCVLOWAT - the MPTCP code will attempt multiple times to adjust the receive buffer size, wrongly accounting every time the cumulative of received data - instead of accounting only for the delta. Address the issue moving mptcp_rcv_space_adjust just after the data reception and passing it only the just received bytes. This also removes an unneeded difference between the TCP and MPTCP RX code path implementation. Fixes: 581302298524 ("mptcp: error out earlier on disconnect") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-1-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08a72242428c..27afdb7e2071 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1939,6 +1939,8 @@ do_error: goto out; } +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); + static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1992,6 +1994,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } + mptcp_rcv_space_adjust(msk, copied); return copied; } @@ -2268,7 +2271,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2276,8 +2278,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - mptcp_rcv_space_adjust(msk, copied); - out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 551844f26da2a9f76c0a698baaffa631d1178645 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:31 +0100 Subject: [PATCH 333/337] mptcp: don't always assume copied data in mptcp_cleanup_rbuf() Under some corner cases the MPTCP protocol can end-up invoking mptcp_cleanup_rbuf() when no data has been copied, but such helper assumes the opposite condition. Explicitly drop such assumption and performs the costly call only when strictly needed - before releasing the msk socket lock. 
Fixes: fd8976790a6c ("mptcp: be careful on MPTCP-level ack.") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-2-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27afdb7e2071..5307fff9d995 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,13 +528,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } -static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); + tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } @@ -551,7 +551,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } -static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; @@ -559,14 +559,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) int space = __mptcp_space(sk); bool cleanup, rx_empty; - cleanup = (space > 0) && (space >= (old_space << 1)); - rx_empty = !__mptcp_rmem(sk); + cleanup = (space > 0) && (space >= (old_space << 1)) && copied; + rx_empty = !__mptcp_rmem(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) - mptcp_subflow_cleanup_rbuf(ssk); + mptcp_subflow_cleanup_rbuf(ssk, copied); } } @@ -2220,9 +2220,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - /* be sure to advertise window change */ - mptcp_cleanup_rbuf(msk); - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; @@ -2271,6 +2268,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); + mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2278,6 +2276,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + mptcp_cleanup_rbuf(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 56b824eb49d6258aa0bad09a406ceac3f643cdae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:32 +0100 Subject: [PATCH 334/337] mptcp: prevent excessive coalescing on receive Currently the skb size after coalescing is only limited by the skb layout (the skb must not carry frag_list). A single coalesced skb covering several MSS can potentially fill completely the receive buffer. In such a case, the snd win will zero until the receive buffer will be empty again, affecting tput badly. 
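A quick worked example of the cap introduced here (buffer size illustrative):

```
int rcvbuf        = 4 << 20;      /* sk->sk_rcvbuf: 4 MiB            */
int max_coalesced = rcvbuf >> 3;  /* 512 KiB per coalesced skb       */

/* A full receive queue now holds at least ~8 skbs, so the advertised
 * window can start reopening as soon as one of them is consumed,
 * instead of staying at zero until a single skb covering the whole
 * buffer is read.
 */
```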
Fixes: 8268ed4c9d19 ("mptcp: introduce and use mptcp_try_coalesce()") Cc: stable@vger.kernel.org # please delay 2 weeks after 6.13-final release Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-3-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5307fff9d995..1b2e7cbb577f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -136,6 +136,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, int delta; if (MPTCP_SKB_CB(from)->offset || + ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; From 9facce84f4062f782ebde18daa7006a23d40b607 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 23 Dec 2024 20:45:49 +0530 Subject: [PATCH 335/337] net: ti: icssg-prueth: Fix firmware load sequence. Timesync-related operations are run on the PRU0 cores for both ICSSG SLICE0 and SLICE1. Currently, whenever any ICSSG interface comes up we load the respective firmwares to the PRU cores, and whenever an interface goes down, we stop the respective cores. Due to this, when SLICE0 goes down while SLICE1 is still active, the PRU0 firmwares are unloaded and the PRU0 core is stopped. This results in a clock jump for the SLICE1 interface, as the timesync-related operations are no longer running. As there are interdependencies between the SLICE0 and SLICE1 firmwares, fix this by running both the PRU0 and PRU1 firmwares as long as at least one ICSSG interface is up. Add a new flag in the prueth struct to check if all firmwares are running and remove the old flag (fw_running). Use emacs_initialized as a reference count to load the firmwares for the first and last interface up/down. Move the init_emac_mode and fw_offload_mode APIs out of icssg_config() into the icssg_common_start() API, as they need to be called only once per firmware boot. Change prueth_emac_restart() to return an error code and add error prints inside the callers of this function in case of any failures. Move prueth_emac_stop() from the common code to the sr1 driver. The sr1 and sr2 drivers have different logic for stopping the firmwares. While the sr1 driver depends on the emac structure to stop the corresponding pru cores for that slice, for sr2 all the pru cores of both slices are stopped and there is no dependency on emac. So the prueth_emac_stop() function is no longer common and can be moved to the sr1 driver. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: MD Danish Anwar Signed-off-by: Meghana Malladi Signed-off-by: David S.
Miller --- drivers/net/ethernet/ti/icssg/icssg_common.c | 25 -- drivers/net/ethernet/ti/icssg/icssg_config.c | 41 ++- drivers/net/ethernet/ti/icssg/icssg_config.h | 1 + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 281 ++++++++++++------ drivers/net/ethernet/ti/icssg/icssg_prueth.h | 5 +- .../net/ethernet/ti/icssg/icssg_prueth_sr1.c | 24 +- 6 files changed, 246 insertions(+), 131 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index fdebeb2f84e0..74f0f200a89d 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -855,31 +855,6 @@ irqreturn_t prueth_rx_irq(int irq, void *dev_id) } EXPORT_SYMBOL_GPL(prueth_rx_irq); -void prueth_emac_stop(struct prueth_emac *emac) -{ - struct prueth *prueth = emac->prueth; - int slice; - - switch (emac->port_id) { - case PRUETH_PORT_MII0: - slice = ICSS_SLICE0; - break; - case PRUETH_PORT_MII1: - slice = ICSS_SLICE1; - break; - default: - netdev_err(emac->ndev, "invalid port\n"); - return; - } - - emac->fw_running = 0; - if (!emac->is_sr1) - rproc_shutdown(prueth->txpru[slice]); - rproc_shutdown(prueth->rtu[slice]); - rproc_shutdown(prueth->pru[slice]); -} -EXPORT_SYMBOL_GPL(prueth_emac_stop); - void prueth_cleanup_tx_ts(struct prueth_emac *emac) { int i; diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.c b/drivers/net/ethernet/ti/icssg/icssg_config.c index 5d2491c2943a..ddfd1c02a885 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.c +++ b/drivers/net/ethernet/ti/icssg/icssg_config.c @@ -397,7 +397,7 @@ static int prueth_emac_buffer_setup(struct prueth_emac *emac) return 0; } -static void icssg_init_emac_mode(struct prueth *prueth) +void icssg_init_emac_mode(struct prueth *prueth) { /* When the device is configured as a bridge and it is being brought * back to the emac mode, the host mac address has to be set as 0. 
@@ -406,9 +406,6 @@ static void icssg_init_emac_mode(struct prueth *prueth) int i; u8 mac[ETH_ALEN] = { 0 }; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -423,15 +420,13 @@ static void icssg_init_emac_mode(struct prueth *prueth) /* Clear host MAC address */ icssg_class_set_host_mac_addr(prueth->miig_rt, mac); } +EXPORT_SYMBOL_GPL(icssg_init_emac_mode); -static void icssg_init_fw_offload_mode(struct prueth *prueth) +void icssg_init_fw_offload_mode(struct prueth *prueth) { u32 addr = prueth->shram.pa + EMAC_ICSSG_SWITCH_DEFAULT_VLAN_TABLE_OFFSET; int i; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -448,6 +443,7 @@ static void icssg_init_fw_offload_mode(struct prueth *prueth) icssg_class_set_host_mac_addr(prueth->miig_rt, prueth->hw_bridge_dev->dev_addr); icssg_set_pvid(prueth, prueth->default_vlan, PRUETH_PORT_HOST); } +EXPORT_SYMBOL_GPL(icssg_init_fw_offload_mode); int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) { @@ -455,11 +451,6 @@ int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) struct icssg_flow_cfg __iomem *flow_cfg; int ret; - if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) - icssg_init_fw_offload_mode(prueth); - else - icssg_init_emac_mode(prueth); - memset_io(config, 0, TAS_GATE_MASK_LIST0); icssg_miig_queues_init(prueth, slice); @@ -786,3 +777,27 @@ void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port) writel(pvid, prueth->shram.va + EMAC_ICSSG_SWITCH_PORT0_DEFAULT_VLAN_OFFSET); } EXPORT_SYMBOL_GPL(icssg_set_pvid); + +int emac_fdb_flow_id_updated(struct prueth_emac *emac) +{ + struct mgmt_cmd_rsp fdb_cmd_rsp = { 0 }; + int slice = prueth_emac_slice(emac); + struct mgmt_cmd fdb_cmd = { 0 }; + int ret; + + fdb_cmd.header = ICSSG_FW_MGMT_CMD_HEADER; + fdb_cmd.type = ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW; + fdb_cmd.seqnum = ++(emac->prueth->icssg_hwcmdseq); + fdb_cmd.param = 0; + + fdb_cmd.param |= (slice << 4); + fdb_cmd.cmd_args[0] = 0; + + ret = icssg_send_fdb_msg(emac, &fdb_cmd, &fdb_cmd_rsp); + if (ret) + return ret; + + WARN_ON(fdb_cmd.seqnum != fdb_cmd_rsp.seqnum); + return fdb_cmd_rsp.status == 1 ? 
0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(emac_fdb_flow_id_updated); diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.h b/drivers/net/ethernet/ti/icssg/icssg_config.h index 92c2deaa3068..c884e9fa099e 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.h +++ b/drivers/net/ethernet/ti/icssg/icssg_config.h @@ -55,6 +55,7 @@ struct icssg_rxq_ctx { #define ICSSG_FW_MGMT_FDB_CMD_TYPE 0x03 #define ICSSG_FW_MGMT_CMD_TYPE 0x04 #define ICSSG_FW_MGMT_PKT 0x80000000 +#define ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW 0x05 struct icssg_r30_cmd { u32 cmd[4]; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index c568c84a032b..d76fe6d05e10 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -164,11 +164,26 @@ static struct icssg_firmwares icssg_emac_firmwares[] = { } }; -static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) +static int prueth_start(struct rproc *rproc, const char *fw_name) +{ + int ret; + + ret = rproc_set_firmware(rproc, fw_name); + if (ret) + return ret; + return rproc_boot(rproc); +} + +static void prueth_shutdown(struct rproc *rproc) +{ + rproc_shutdown(rproc); +} + +static int prueth_emac_start(struct prueth *prueth) { struct icssg_firmwares *firmwares; struct device *dev = prueth->dev; - int slice, ret; + int ret, slice; if (prueth->is_switch_mode) firmwares = icssg_switch_firmwares; @@ -177,49 +192,126 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) else firmwares = icssg_emac_firmwares; - slice = prueth_emac_slice(emac); - if (slice < 0) { - netdev_err(emac->ndev, "invalid port\n"); - return -EINVAL; + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + ret = prueth_start(prueth->pru[slice], firmwares[slice].pru); + if (ret) { + dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); + goto unwind_slices; + } + + ret = prueth_start(prueth->rtu[slice], firmwares[slice].rtu); + if (ret) { + dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } + + ret = prueth_start(prueth->txpru[slice], firmwares[slice].txpru); + if (ret) { + dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } } - ret = icssg_config(prueth, emac, slice); - if (ret) - return ret; - - ret = rproc_set_firmware(prueth->pru[slice], firmwares[slice].pru); - ret = rproc_boot(prueth->pru[slice]); - if (ret) { - dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); - return -EINVAL; - } - - ret = rproc_set_firmware(prueth->rtu[slice], firmwares[slice].rtu); - ret = rproc_boot(prueth->rtu[slice]); - if (ret) { - dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); - goto halt_pru; - } - - ret = rproc_set_firmware(prueth->txpru[slice], firmwares[slice].txpru); - ret = rproc_boot(prueth->txpru[slice]); - if (ret) { - dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); - goto halt_rtu; - } - - emac->fw_running = 1; return 0; -halt_rtu: - rproc_shutdown(prueth->rtu[slice]); - -halt_pru: - rproc_shutdown(prueth->pru[slice]); +unwind_slices: + while (--slice >= 0) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); + } return ret; } +static void prueth_emac_stop(struct prueth *prueth) +{ + int slice; + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + prueth_shutdown(prueth->txpru[slice]); + 
prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); + } +} + +static int prueth_emac_common_start(struct prueth *prueth) +{ + struct prueth_emac *emac; + int ret = 0; + int slice; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + /* clear SMEM and MSMC settings for all slices */ + memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); + memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); + + icssg_class_default(prueth->miig_rt, ICSS_SLICE0, 0, false); + icssg_class_default(prueth->miig_rt, ICSS_SLICE1, 0, false); + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) + icssg_init_fw_offload_mode(prueth); + else + icssg_init_emac_mode(prueth); + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + emac = prueth->emac[slice]; + if (!emac) + continue; + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; + } + + ret = prueth_emac_start(prueth); + if (ret) + goto disable_class; + + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + ret = icss_iep_init(emac->iep, &prueth_iep_clockops, + emac, IEP_DEFAULT_CYCLE_TIME_NS); + if (ret) { + dev_err(prueth->dev, "Failed to initialize IEP module\n"); + goto stop_pruss; + } + + return 0; + +stop_pruss: + prueth_emac_stop(prueth); + +disable_class: + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + return ret; +} + +static int prueth_emac_common_stop(struct prueth *prueth) +{ + struct prueth_emac *emac; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + prueth_emac_stop(prueth); + + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + icss_iep_exit(emac->iep); + + return 0; +} + /* called back by PHY layer if there is change in link state of hw port*/ static void emac_adjust_link(struct net_device *ndev) { @@ -374,9 +466,6 @@ static void prueth_iep_settime(void *clockops_data, u64 ns) u32 cycletime; int timeout; - if (!emac->fw_running) - return; - sc_descp = emac->prueth->shram.va + TIMESYNC_FW_WC_SETCLOCK_DESC_OFFSET; cycletime = IEP_DEFAULT_CYCLE_TIME_NS; @@ -543,23 +632,17 @@ static int emac_ndo_open(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); int ret, i, num_data_chn = emac->tx_ch_num; + struct icssg_flow_cfg __iomem *flow_cfg; struct prueth *prueth = emac->prueth; int slice = prueth_emac_slice(emac); struct device *dev = prueth->dev; int max_rx_flows; int rx_flow; - /* clear SMEM and MSMC settings for all slices */ - if (!prueth->emacs_initialized) { - memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); - memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); - } - /* set h/w MAC as user might have re-configured */ ether_addr_copy(emac->mac_addr, ndev->dev_addr); icssg_class_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); - icssg_class_default(prueth->miig_rt, slice, 0, false); icssg_ft1_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); /* Notify the stack of the actual queue counts. 
*/ @@ -597,18 +680,23 @@ static int emac_ndo_open(struct net_device *ndev) goto cleanup_napi; } - /* reset and start PRU firmware */ - ret = prueth_emac_start(prueth, emac); - if (ret) - goto free_rx_irq; + if (!prueth->emacs_initialized) { + ret = prueth_emac_common_start(prueth); + if (ret) + goto free_rx_irq; + } + + flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; + writew(emac->rx_flow_id_base, &flow_cfg->rx_base_flow); + ret = emac_fdb_flow_id_updated(emac); + + if (ret) { + netdev_err(ndev, "Failed to update Rx Flow ID %d", ret); + goto stop; + } icssg_mii_update_mtu(prueth->mii_rt, slice, ndev->max_mtu); - if (!prueth->emacs_initialized) { - ret = icss_iep_init(emac->iep, &prueth_iep_clockops, - emac, IEP_DEFAULT_CYCLE_TIME_NS); - } - ret = request_threaded_irq(emac->tx_ts_irq, NULL, prueth_tx_ts_irq, IRQF_ONESHOT, dev_name(dev), emac); if (ret) @@ -653,7 +741,8 @@ reset_rx_chn: free_tx_ts_irq: free_irq(emac->tx_ts_irq, emac); stop: - prueth_emac_stop(emac); + if (!prueth->emacs_initialized) + prueth_emac_common_stop(prueth); free_rx_irq: free_irq(emac->rx_chns.irq[rx_flow], emac); cleanup_napi: @@ -689,8 +778,6 @@ static int emac_ndo_stop(struct net_device *ndev) if (ndev->phydev) phy_stop(ndev->phydev); - icssg_class_disable(prueth->miig_rt, prueth_emac_slice(emac)); - if (emac->prueth->is_hsr_offload_mode) __dev_mc_unsync(ndev, icssg_prueth_hsr_del_mcast); else @@ -728,11 +815,9 @@ static int emac_ndo_stop(struct net_device *ndev) /* Destroying the queued work in ndo_stop() */ cancel_delayed_work_sync(&emac->stats_work); - if (prueth->emacs_initialized == 1) - icss_iep_exit(emac->iep); - /* stop PRUs */ - prueth_emac_stop(emac); + if (prueth->emacs_initialized == 1) + prueth_emac_common_stop(prueth); free_irq(emac->tx_ts_irq, emac); @@ -1053,10 +1138,11 @@ static void prueth_offload_fwd_mark_update(struct prueth *prueth) } } -static void prueth_emac_restart(struct prueth *prueth) +static int prueth_emac_restart(struct prueth *prueth) { struct prueth_emac *emac0 = prueth->emac[PRUETH_MAC0]; struct prueth_emac *emac1 = prueth->emac[PRUETH_MAC1]; + int ret; /* Detach the net_device for both PRUeth ports*/ if (netif_running(emac0->ndev)) @@ -1065,36 +1151,46 @@ static void prueth_emac_restart(struct prueth *prueth) netif_device_detach(emac1->ndev); /* Disable both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + if (ret) + return ret; /* Stop both pru cores for both PRUeth ports*/ - prueth_emac_stop(emac0); - prueth->emacs_initialized--; - prueth_emac_stop(emac1); - prueth->emacs_initialized--; + ret = prueth_emac_common_stop(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to stop the firmwares"); + return ret; + } /* Start both pru cores for both PRUeth ports */ - prueth_emac_start(prueth, emac0); - prueth->emacs_initialized++; - prueth_emac_start(prueth, emac1); - prueth->emacs_initialized++; + ret = prueth_emac_common_start(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to start the firmwares"); + return ret; + } /* Enable forwarding for both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); /* Attache net_device for both PRUeth 
ports */ netif_device_attach(emac0->ndev); netif_device_attach(emac1->ndev); + + return ret; } static void icssg_change_mode(struct prueth *prueth) { struct prueth_emac *emac; - int mac; + int mac, ret; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { emac = prueth->emac[mac]; @@ -1173,13 +1269,18 @@ static void prueth_netdevice_port_unlink(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); struct prueth *prueth = emac->prueth; + int ret; prueth->br_members &= ~BIT(emac->port_id); if (prueth->is_switch_mode) { prueth->is_switch_mode = false; emac->port_vlan = 0; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } } prueth_offload_fwd_mark_update(prueth); @@ -1228,6 +1329,7 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) struct prueth *prueth = emac->prueth; struct prueth_emac *emac0; struct prueth_emac *emac1; + int ret; emac0 = prueth->emac[PRUETH_MAC0]; emac1 = prueth->emac[PRUETH_MAC1]; @@ -1238,7 +1340,11 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) emac0->port_vlan = 0; emac1->port_vlan = 0; prueth->hsr_dev = NULL; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } netdev_dbg(ndev, "Disabling HSR Offload mode\n"); } } @@ -1413,13 +1519,10 @@ static int prueth_probe(struct platform_device *pdev) prueth->pa_stats = NULL; } - if (eth0_node) { + if (eth0_node || eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE0, false); if (ret) goto put_cores; - } - - if (eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE1, false); if (ret) goto put_cores; @@ -1618,14 +1721,12 @@ put_pruss: pruss_put(prueth->pruss); put_cores: - if (eth1_node) { - prueth_put_cores(prueth, ICSS_SLICE1); - of_node_put(eth1_node); - } - - if (eth0_node) { + if (eth0_node || eth1_node) { prueth_put_cores(prueth, ICSS_SLICE0); of_node_put(eth0_node); + + prueth_put_cores(prueth, ICSS_SLICE1); + of_node_put(eth1_node); } return ret; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index f5c1d473e9f9..5473315ea204 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -140,7 +140,6 @@ struct prueth_rx_chn { /* data for each emac port */ struct prueth_emac { bool is_sr1; - bool fw_running; struct prueth *prueth; struct net_device *ndev; u8 mac_addr[6]; @@ -361,6 +360,8 @@ int icssg_set_port_state(struct prueth_emac *emac, enum icssg_port_state_cmd state); void icssg_config_set_speed(struct prueth_emac *emac); void icssg_config_half_duplex(struct prueth_emac *emac); +void icssg_init_emac_mode(struct prueth *prueth); +void icssg_init_fw_offload_mode(struct prueth *prueth); /* Buffer queue helpers */ int icssg_queue_pop(struct prueth *prueth, u8 queue); @@ -377,6 +378,7 @@ void icssg_vtbl_modify(struct prueth_emac *emac, u8 vid, u8 port_mask, u8 untag_mask, bool add); u16 icssg_get_pvid(struct prueth_emac *emac); void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port); +int emac_fdb_flow_id_updated(struct prueth_emac *emac); #define prueth_napi_to_tx_chn(pnapi) \ container_of(pnapi, struct prueth_tx_chn, napi_tx) @@ -407,7 +409,6 
@@ void emac_rx_timestamp(struct prueth_emac *emac, struct sk_buff *skb, u32 *psdata); enum netdev_tx icssg_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev); irqreturn_t prueth_rx_irq(int irq, void *dev_id); -void prueth_emac_stop(struct prueth_emac *emac); void prueth_cleanup_tx_ts(struct prueth_emac *emac); int icssg_napi_rx_poll(struct napi_struct *napi_rx, int budget); int prueth_prepare_rx_chan(struct prueth_emac *emac, diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c index 5024f0647a0d..3dc86397c367 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c @@ -440,7 +440,6 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) goto halt_pru; } - emac->fw_running = 1; return 0; halt_pru: @@ -449,6 +448,29 @@ halt_pru: return ret; } +static void prueth_emac_stop(struct prueth_emac *emac) +{ + struct prueth *prueth = emac->prueth; + int slice; + + switch (emac->port_id) { + case PRUETH_PORT_MII0: + slice = ICSS_SLICE0; + break; + case PRUETH_PORT_MII1: + slice = ICSS_SLICE1; + break; + default: + netdev_err(emac->ndev, "invalid port\n"); + return; + } + + if (!emac->is_sr1) + rproc_shutdown(prueth->txpru[slice]); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); +} + /** * emac_ndo_open - EMAC device open * @ndev: network adapter device From 9b115361248dc6cce182a2dc030c1c70b0a9639e Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Mon, 23 Dec 2024 20:45:50 +0530 Subject: [PATCH 336/337] net: ti: icssg-prueth: Fix clearing of IEP_CMP_CFG registers during iep_init When ICSSG interfaces are brought down and brought up again, the pru cores are shut down and booted again, flushing out all the memories and starting again in a clean state. Hence it is expected that the IEP_CMP_CFG register needs to be flushed during iep_init() to ensure that the existing residual configuration doesn't cause any unusual behavior. If the register is not cleared, the existing IEP_CMP_CFG set for CMP1 will result in a SYNC0_OUT signal based on the SYNC_OUT register values. After bringing the interface up, calling PPS enable doesn't work, as the driver believes PPS is already enabled (iep->pps_enabled is not cleared during interface bring-down) and will just return true even though there is no signal. Fix this by disabling pps and perout. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Meghana Malladi Reviewed-by: Roger Quadros Signed-off-by: David S.
Miller --- drivers/net/ethernet/ti/icssg/icss_iep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 5d6d1cf78e93..768578c0d958 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -215,6 +215,9 @@ static void icss_iep_enable_shadow_mode(struct icss_iep *iep) for (cmp = IEP_MIN_CMP; cmp < IEP_MAX_CMP; cmp++) { regmap_update_bits(iep->map, ICSS_IEP_CMP_STAT_REG, IEP_CMP_STATUS(cmp), IEP_CMP_STATUS(cmp)); + + regmap_update_bits(iep->map, ICSS_IEP_CMP_CFG_REG, + IEP_CMP_CFG_CMP_EN(cmp), 0); } /* enable reset counter on CMP0 event */ @@ -780,6 +783,11 @@ int icss_iep_exit(struct icss_iep *iep) } icss_iep_disable(iep); + if (iep->pps_enabled) + icss_iep_pps_enable(iep, false); + else if (iep->perout_enabled) + icss_iep_perout_enable(iep, NULL, false); + return 0; } EXPORT_SYMBOL_GPL(icss_iep_exit); From 45d339fefaa3dcd237038769e0d34584fb867390 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 19 Dec 2024 14:23:36 +0200 Subject: [PATCH 337/337] RDMA/mlx5: Enable multiplane mode only when it is supported The driver queries vport_cxt.num_plane and enables multiplane when it is greater than 0, but some old FWs (versions from x.40.1000 to x.42.1000) report vport_cxt.num_plane = 1 unexpectedly. Fix it by querying num_plane only when the HCA_CAP2.multiplane bit is set. Fixes: 2a5db20fa532 ("RDMA/mlx5: Add support to multi-plane device and port") Link: https://patch.msgid.link/r/1ef901acdf564716fcf550453cf5e94f343777ec.1734610916.git.leon@kernel.org Cc: stable@vger.kernel.org Reported-by: Francesco Poli Closes: https://lore.kernel.org/all/nvs4i2v7o6vn6zhmtq4sgazy2hu5kiulukxcntdelggmznnl7h@so3oul6uwgbl/ Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Reviewed-by: Michal Swiatkowski Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2314797afc9..f5b59d02f4d3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2839,7 +2839,7 @@ static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) int err; *num_plane = 0; - if (!MLX5_CAP_GEN(mdev, ib_virt)) + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) return 0; err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..48d47181c7cd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2119,7 +2119,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0x1]; u8 sf_eq_usage[0x1]; - u8 reserved_at_d3[0xd]; + u8 reserved_at_d3[0x5]; + u8 multiplane[0x1]; + u8 reserved_at_d9[0x7]; u8 cross_vhca_object_to_object_supported[0x20];