From 645dd5c6be9ff9f620fa237ff2bf2d349f8f1944 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:38 +0000 Subject: [PATCH 01/38] cxl: support Type2 when initializing cxl_dev_state In preparation for type2 drivers add function and macro for differentiating CXL memory expanders (type 3) from CXL device accelerators (type 2) helping drivers built from public headers to embed struct cxl_dev_state inside a private struct. Update type3 driver for using this same initialization. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-2-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 9a775c07bb04384f7c03a35dd04818ed818c1f71) Signed-off-by: Koba Ko --- drivers/cxl/core/mbox.c | 12 +++++------- drivers/cxl/core/memdev.c | 24 ++++++++++++++++++++++++ drivers/cxl/cxlmem.h | 34 +++++++++++++++++++++++++++++++++- drivers/cxl/pci.c | 14 +++++++------- tools/testing/cxl/test/mem.c | 3 +-- 5 files changed, 70 insertions(+), 17 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 12386d9127054..aa3724f51ce93 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1521,23 +1521,21 @@ int cxl_mailbox_init(struct cxl_mailbox *cxl_mbox, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec) { struct cxl_memdev_state *mds; int rc; - mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); + mds = devm_cxl_dev_state_create(dev, CXL_DEVTYPE_CLASSMEM, serial, + dvsec, struct cxl_memdev_state, cxlds, + true); if (!mds) { dev_err(dev, "No memory available\n"); return ERR_PTR(-ENOMEM); } mutex_init(&mds->event.log_lock); - mds->cxlds.dev = dev; - mds->cxlds.reg_map.host = dev; - mds->cxlds.cxl_mbox.host = dev; - mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; - mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); if (rc == -EOPNOTSUPP) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 273c22118d3d8..99e422594885a 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -656,6 +656,30 @@ static void detach_memdev(struct work_struct *work) static struct lock_class_key cxl_memdev_key; +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox) +{ + struct cxl_dev_state *cxlds = devm_kzalloc(dev, size, GFP_KERNEL); + + if (!cxlds) + return NULL; + + cxlds->dev = dev; + cxlds->type = type; + cxlds->serial = serial; + cxlds->cxl_dvsec = dvsec; + cxlds->reg_map.host = dev; + cxlds->reg_map.resource = CXL_RESOURCE_NONE; + + if (has_mbox) + cxlds->cxl_mbox.host = dev; + + return cxlds; +} +EXPORT_SYMBOL_NS_GPL(_devm_cxl_dev_state_create, "CXL"); + static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, const struct file_operations *fops, const struct cxl_memdev_attach *attach) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e21d744d639bd..71367cb5178ca 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -523,6 +523,37 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. + * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. + */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) + enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, @@ -858,7 +889,8 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev); +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec); void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index fbb300a018302..a42f273ff72b4 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -865,25 +865,25 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) int rc, pmu_count; unsigned int i; bool irq_avail; + u16 dvsec; rc = pcim_enable_device(pdev); if (rc) return rc; pci_set_master(pdev); - mds = cxl_memdev_state_create(&pdev->dev); + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + pci_warn(pdev, "Device DVSEC not present, skip CXL.mem init\n"); + + mds = cxl_memdev_state_create(&pdev->dev, pci_get_dsn(pdev), dvsec); if (IS_ERR(mds)) return PTR_ERR(mds); cxlds = &mds->cxlds; pci_set_drvdata(pdev, cxlds); cxlds->rcd = is_cxl_restricted(pdev); - cxlds->serial = pci_get_dsn(pdev); - cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); - if (!cxlds->cxl_dvsec) - dev_warn(&pdev->dev, - "Device DVSEC not present, skip CXL.mem init\n"); rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); if (rc) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index cb87e8c0e63c0..79f42f4474d47 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1716,7 +1716,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - mds = cxl_memdev_state_create(dev); + mds = cxl_memdev_state_create(dev, pdev->id + 1, 0); if (IS_ERR(mds)) return PTR_ERR(mds); @@ -1732,7 +1732,6 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf; INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work); - cxlds->serial = pdev->id + 1; if (is_rcd(pdev)) cxlds->rcd = true; From 2bc7fd2fd5583d7cc34646b9383e3352ca4b4ce0 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:39 +0000 Subject: [PATCH 02/38] cxl: export internal structs for external Type2 drivers In preparation for type2 support, move structs and functions a type2 driver will need to access to into a new shared header file. Differentiate between public and private data to be preserved by type2 drivers. Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Tested-by: Alison Schofield Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260306164741.3796372-3-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 005869886d1d370afb6c10cd40709d956960e9c2) Signed-off-by: Koba Ko --- drivers/cxl/cxl.h | 97 +------------------ drivers/cxl/cxlmem.h | 114 ---------------------- include/cxl/cxl.h | 226 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 210 deletions(-) create mode 100644 include/cxl/cxl.h diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b0..1d94217729f76 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -12,6 +12,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops *cxl_security_ops; @@ -201,97 +202,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -/* - * Using struct_group() allows for per register-block-type helper routines, - * without requiring block-type agnostic code to include the prefix. - */ -struct cxl_regs { - /* - * Common set of CXL Component register block base pointers - * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure - * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure - */ - struct_group_tagged(cxl_component_regs, component, - void __iomem *hdm_decoder; - void __iomem *ras; - ); - /* - * Common set of CXL Device register block base pointers - * @status: CXL 2.0 8.2.8.3 Device Status Registers - * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers - * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers - */ - struct_group_tagged(cxl_device_regs, device_regs, - void __iomem *status, *mbox, *memdev; - ); - - struct_group_tagged(cxl_pmu_regs, pmu_regs, - void __iomem *pmu; - ); - - /* - * RCH downstream port specific RAS register - * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB - */ - struct_group_tagged(cxl_rch_regs, rch_regs, - void __iomem *dport_aer; - ); - - /* - * RCD upstream port specific PCIe cap register - * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB - */ - struct_group_tagged(cxl_rcd_regs, rcd_regs, - void __iomem *rcd_pcie_cap; - ); -}; - -struct cxl_reg_map { - bool valid; - int id; - unsigned long offset; - unsigned long size; -}; - -struct cxl_component_reg_map { - struct cxl_reg_map hdm_decoder; - struct cxl_reg_map ras; -}; - -struct cxl_device_reg_map { - struct cxl_reg_map status; - struct cxl_reg_map mbox; - struct cxl_reg_map memdev; -}; - -struct cxl_pmu_reg_map { - struct cxl_reg_map pmu; -}; - -/** - * struct cxl_register_map - DVSEC harvested register block mapping parameters - * @host: device for devm operations and logging - * @base: virtual base of the register-block-BAR + @block_offset - * @resource: physical resource base of the register block - * @max_size: maximum mapping size to perform register search - * @reg_type: see enum cxl_regloc_type - * @component_map: cxl_reg_map for component registers - * @device_map: cxl_reg_maps for device registers - * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units - */ -struct cxl_register_map { - struct device *host; - void __iomem *base; - resource_size_t resource; - resource_size_t max_size; - u8 reg_type; - union { - struct cxl_component_reg_map component_map; - struct cxl_device_reg_map device_map; - struct cxl_pmu_reg_map pmu_map; - }; -}; - void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, @@ -497,11 +407,6 @@ struct cxl_region_params { resource_size_t cache_size; }; -enum cxl_partition_mode { - CXL_PARTMODE_RAM, - CXL_PARTMODE_PMEM, -}; - /* * Indicate whether this region has been assembled by autodetection or * userspace assembly. Prevent endpoint decoders outside of automatic diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 71367cb5178ca..281546de426e4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -113,8 +113,6 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); -#define CXL_NR_PARTITIONS_MAX 2 - struct cxl_dpa_info { u64 size; struct cxl_dpa_part_info { @@ -373,87 +371,6 @@ struct cxl_security_state { struct kernfs_node *sanitize_node; }; -/* - * enum cxl_devtype - delineate type-2 from a generic type-3 device - * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or - * HDM-DB, no requirement that this device implements a - * mailbox, or other memory-device-standard manageability - * flows. - * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with - * HDM-H and class-mandatory memory device registers - */ -enum cxl_devtype { - CXL_DEVTYPE_DEVMEM, - CXL_DEVTYPE_CLASSMEM, -}; - -/** - * struct cxl_dpa_perf - DPA performance property entry - * @dpa_range: range for DPA address - * @coord: QoS performance data (i.e. latency, bandwidth) - * @cdat_coord: raw QoS performance data from CDAT - * @qos_class: QoS Class cookies - */ -struct cxl_dpa_perf { - struct range dpa_range; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; - int qos_class; -}; - -/** - * struct cxl_dpa_partition - DPA partition descriptor - * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) - * @perf: performance attributes of the partition from CDAT - * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... - */ -struct cxl_dpa_partition { - struct resource res; - struct cxl_dpa_perf perf; - enum cxl_partition_mode mode; -}; - -/** - * struct cxl_dev_state - The driver device state - * - * cxl_dev_state represents the CXL driver/device state. It provides an - * interface to mailbox commands as well as some cached data about the device. - * Currently only memory devices are represented. - * - * @dev: The device associated with this CXL state - * @cxlmd: The device representing the CXL.mem capabilities of @dev - * @reg_map: component and ras register mapping parameters - * @regs: Class device "Device" registers - * @cxl_dvsec: Offset to the PCIe device DVSEC - * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) - * @media_ready: Indicate whether the device media is usable - * @dpa_res: Overall DPA resource tree for the device - * @part: DPA partition array - * @nr_partitions: Number of DPA partitions - * @serial: PCIe Device Serial Number - * @type: Generic Memory Class device or Vendor Specific Memory device - * @cxl_mbox: CXL mailbox context - * @cxlfs: CXL features context - */ -struct cxl_dev_state { - struct device *dev; - struct cxl_memdev *cxlmd; - struct cxl_register_map reg_map; - struct cxl_device_regs regs; - int cxl_dvsec; - bool rcd; - bool media_ready; - struct resource dpa_res; - struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; - unsigned int nr_partitions; - u64 serial; - enum cxl_devtype type; - struct cxl_mailbox cxl_mbox; -#ifdef CONFIG_CXL_FEATURES - struct cxl_features_state *cxlfs; -#endif -}; - static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) { /* @@ -523,37 +440,6 @@ to_cxl_memdev_state(struct cxl_dev_state *cxlds) return container_of(cxlds, struct cxl_memdev_state, cxlds); } -struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, - enum cxl_devtype type, - u64 serial, u16 dvsec, - size_t size, bool has_mbox); - -/** - * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a - * driver specific struct. - * - * @parent: device behind the request - * @type: CXL device type - * @serial: device identification - * @dvsec: dvsec capability offset - * @drv_struct: driver struct embedding a cxl_dev_state struct - * @member: name of the struct cxl_dev_state member in drv_struct - * @mbox: true if mailbox supported - * - * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state - * struct initialized. - * - * Introduced for Type2 driver support. - */ -#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ - ({ \ - static_assert(__same_type(struct cxl_dev_state, \ - ((drv_struct *)NULL)->member)); \ - static_assert(offsetof(drv_struct, member) == 0); \ - (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ - sizeof(drv_struct), mbox); \ - }) - enum cxl_opcode { CXL_MBOX_OP_INVALID = 0x0000, CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h new file mode 100644 index 0000000000000..fa72691546205 --- /dev/null +++ b/include/cxl/cxl.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ +/* Copyright(c) 2026 Advanced Micro Devices, Inc. */ + +#ifndef __CXL_CXL_H__ +#define __CXL_CXL_H__ + +#include +#include +#include + +/** + * enum cxl_devtype - delineate type-2 from a generic type-3 device + * @CXL_DEVTYPE_DEVMEM: Vendor specific CXL Type-2 device implementing HDM-D or + * HDM-DB, no requirement that this device implements a + * mailbox, or other memory-device-standard manageability + * flows. + * @CXL_DEVTYPE_CLASSMEM: Common class definition of a CXL Type-3 device with + * HDM-H and class-mandatory memory device registers + */ +enum cxl_devtype { + CXL_DEVTYPE_DEVMEM, + CXL_DEVTYPE_CLASSMEM, +}; + +struct device; + +/* + * Using struct_group() allows for per register-block-type helper routines, + * without requiring block-type agnostic code to include the prefix. + */ +struct cxl_regs { + /* + * Common set of CXL Component register block base pointers + * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure + * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure + */ + struct_group_tagged(cxl_component_regs, component, + void __iomem *hdm_decoder; + void __iomem *ras; + ); + /* + * Common set of CXL Device register block base pointers + * @status: CXL 2.0 8.2.8.3 Device Status Registers + * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers + * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers + */ + struct_group_tagged(cxl_device_regs, device_regs, + void __iomem *status, *mbox, *memdev; + ); + + struct_group_tagged(cxl_pmu_regs, pmu_regs, + void __iomem *pmu; + ); + + /* + * RCH downstream port specific RAS register + * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB + */ + struct_group_tagged(cxl_rch_regs, rch_regs, + void __iomem *dport_aer; + ); + + /* + * RCD upstream port specific PCIe cap register + * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB + */ + struct_group_tagged(cxl_rcd_regs, rcd_regs, + void __iomem *rcd_pcie_cap; + ); +}; + +struct cxl_reg_map { + bool valid; + int id; + unsigned long offset; + unsigned long size; +}; + +struct cxl_component_reg_map { + struct cxl_reg_map hdm_decoder; + struct cxl_reg_map ras; +}; + +struct cxl_device_reg_map { + struct cxl_reg_map status; + struct cxl_reg_map mbox; + struct cxl_reg_map memdev; +}; + +struct cxl_pmu_reg_map { + struct cxl_reg_map pmu; +}; + +/** + * struct cxl_register_map - DVSEC harvested register block mapping parameters + * @host: device for devm operations and logging + * @base: virtual base of the register-block-BAR + @block_offset + * @resource: physical resource base of the register block + * @max_size: maximum mapping size to perform register search + * @reg_type: see enum cxl_regloc_type + * @component_map: cxl_reg_map for component registers + * @device_map: cxl_reg_maps for device registers + * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + */ +struct cxl_register_map { + struct device *host; + void __iomem *base; + resource_size_t resource; + resource_size_t max_size; + u8 reg_type; + union { + struct cxl_component_reg_map component_map; + struct cxl_device_reg_map device_map; + struct cxl_pmu_reg_map pmu_map; + }; +}; + +/** + * struct cxl_dpa_perf - DPA performance property entry + * @dpa_range: range for DPA address + * @coord: QoS performance data (i.e. latency, bandwidth) + * @cdat_coord: raw QoS performance data from CDAT + * @qos_class: QoS Class cookies + */ +struct cxl_dpa_perf { + struct range dpa_range; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; + int qos_class; +}; + +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + +/** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... + */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + +#define CXL_NR_PARTITIONS_MAX 2 + +/** + * struct cxl_dev_state - The driver device state + * + * cxl_dev_state represents the CXL driver/device state. It provides an + * interface to mailbox commands as well as some cached data about the device. + * Currently only memory devices are represented. + * + * @dev: The device associated with this CXL state + * @cxlmd: The device representing the CXL.mem capabilities of @dev + * @reg_map: component and ras register mapping parameters + * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC + * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) + * @media_ready: Indicate whether the device media is usable + * @dpa_res: Overall DPA resource tree for the device + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions + * @serial: PCIe Device Serial Number + * @type: Generic Memory Class device or Vendor Specific Memory device + * @cxl_mbox: CXL mailbox context + * @cxlfs: CXL features context + */ +struct cxl_dev_state { + /* public for Type2 drivers */ + struct device *dev; + struct cxl_memdev *cxlmd; + + /* private for Type2 drivers */ + struct cxl_register_map reg_map; + struct cxl_device_regs regs; + int cxl_dvsec; + bool rcd; + bool media_ready; + struct resource dpa_res; + struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; + u64 serial; + enum cxl_devtype type; + struct cxl_mailbox cxl_mbox; +#ifdef CONFIG_CXL_FEATURES + struct cxl_features_state *cxlfs; +#endif +}; + +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. + * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. + */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) +#endif /* __CXL_CXL_H__ */ From 8b61bd238bbbcd87029f43768c78f193f6e4cf04 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Fri, 6 Mar 2026 16:47:40 +0000 Subject: [PATCH 03/38] cxl: Move pci generic code from cxl_pci to core/cxl_pci Inside cxl/core/pci.c there are helpers for CXL PCIe initialization meanwhile cxl/pci_drv.c implements the functionality for a Type3 device initialization. In preparation for type2 support, move helper functions from cxl/pci.c to cxl/core/pci.c in order to be exported and used by type2 drivers. [ dj: Clarified subject. ] Signed-off-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Signed-off-by: Gregory Price Link: https://patch.msgid.link/20260306164741.3796372-4-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit 58f28930c7fb0e24cdf2972a9c3b7c91aeef4539) Signed-off-by: Koba Ko --- drivers/cxl/core/core.h | 2 ++ drivers/cxl/core/pci.c | 62 ++++++++++++++++++++++++++++++++++++ drivers/cxl/core/regs.c | 1 - drivers/cxl/cxl.h | 2 -- drivers/cxl/cxlpci.h | 13 ++++++++ drivers/cxl/pci.c | 70 ----------------------------------------- 6 files changed, 77 insertions(+), 73 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5b0570df0fd9c..5539e941782f6 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -224,4 +224,6 @@ int cxl_set_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, u16 *return_code); #endif +resource_size_t cxl_rcd_component_reg_phys(struct device *dev, + struct cxl_dport *dport); #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index f96ce884a2130..c32cc62c501de 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -696,6 +696,68 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, "CXL"); +static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, + struct cxl_register_map *map, + struct cxl_dport *dport) +{ + resource_size_t component_reg_phys; + + *map = (struct cxl_register_map) { + .host = &pdev->dev, + .resource = CXL_RESOURCE_NONE, + }; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); + if (component_reg_phys == CXL_RESOURCE_NONE) + return -ENXIO; + + map->resource = component_reg_phys; + map->reg_type = CXL_REGLOC_RBI_COMPONENT; + map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + return 0; +} + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + int rc; + + rc = cxl_find_regblock(pdev, type, map); + + /* + * If the Register Locator DVSEC does not exist, check if it + * is an RCH and try to extract the Component Registers from + * an RCRB. + */ + if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { + struct cxl_dport *dport; + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + rc = cxl_rcrb_get_comp_regs(pdev, map, dport); + if (rc) + return rc; + + rc = cxl_dport_map_rcd_linkcap(pdev, dport); + if (rc) + return rc; + + } else if (rc) { + return rc; + } + + return cxl_setup_regs(map); +} +EXPORT_SYMBOL_NS_GPL(cxl_pci_setup_regs, "CXL"); + int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) { int speed, bw; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index a010b32143422..93710cf4f0a69 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -641,4 +641,3 @@ resource_size_t cxl_rcd_component_reg_phys(struct device *dev, return CXL_RESOURCE_NONE; return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM); } -EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 1d94217729f76..8194447f75d30 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -222,8 +222,6 @@ int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; -resource_size_t cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport); int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_RESOURCE_NONE ((resource_size_t) -1) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 0cf64218aa16e..b826eb53cf7ba 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -74,6 +74,17 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) return lnksta2 & PCI_EXP_LNKSTA2_FLIT; } +/* + * Assume that the caller has already validated that @pdev has CXL + * capabilities, any RCiEP with CXL capabilities is treated as a + * Restricted CXL Device (RCD) and finds upstream port and endpoint + * registers in a Root Complex Register Block (RCRB). + */ +static inline bool is_cxl_restricted(struct pci_dev *pdev) +{ + return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; +} + struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); @@ -101,4 +112,6 @@ static inline void devm_cxl_port_ras_setup(struct cxl_port *port) } #endif +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index a42f273ff72b4..adc7c4bcb03aa 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -465,76 +465,6 @@ static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail) return 0; } -/* - * Assume that any RCIEP that emits the CXL memory expander class code - * is an RCD - */ -static bool is_cxl_restricted(struct pci_dev *pdev) -{ - return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; -} - -static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, - struct cxl_register_map *map, - struct cxl_dport *dport) -{ - resource_size_t component_reg_phys; - - *map = (struct cxl_register_map) { - .host = &pdev->dev, - .resource = CXL_RESOURCE_NONE, - }; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); - if (component_reg_phys == CXL_RESOURCE_NONE) - return -ENXIO; - - map->resource = component_reg_phys; - map->reg_type = CXL_REGLOC_RBI_COMPONENT; - map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; - - return 0; -} - -static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map) -{ - int rc; - - rc = cxl_find_regblock(pdev, type, map); - - /* - * If the Register Locator DVSEC does not exist, check if it - * is an RCH and try to extract the Component Registers from - * an RCRB. - */ - if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { - struct cxl_dport *dport; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - rc = cxl_rcrb_get_comp_regs(pdev, map, dport); - if (rc) - return rc; - - rc = cxl_dport_map_rcd_linkcap(pdev, dport); - if (rc) - return rc; - - } else if (rc) { - return rc; - } - - return cxl_setup_regs(map); -} - static void free_event_buf(void *buf) { kvfree(buf); From d3059e8eab0071b36fa426dabfe4781d9eddc599 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 6 Mar 2026 16:47:41 +0000 Subject: [PATCH 04/38] cxl/pci: Remove redundant cxl_pci_find_port() call Remove the redundant port lookup from cxl_rcrb_get_comp_regs() and use the dport parameter directly. The caller has already validated the port is non-NULL before invoking this function, and dport is given as a param. This is simpler than getting dport in the callee and return the pointer to the caller what would require more changes. Signed-off-by: Gregory Price Reviewed-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/20260306164741.3796372-5-alejandro.lucero-palau@amd.com Signed-off-by: Dave Jiang (cherry picked from commit d537d953c47866bafc89feb66d8ef34baf17659a) Signed-off-by: Koba Ko --- drivers/cxl/core/pci.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c32cc62c501de..d1f487b3d809a 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -707,11 +707,6 @@ static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, .resource = CXL_RESOURCE_NONE, }; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); if (component_reg_phys == CXL_RESOURCE_NONE) return -ENXIO; From 513a2a51e2194169157e4873019cdc82fa1234d5 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:21 +0100 Subject: [PATCH 05/38] NVIDIA: VR: SAUCE: sfc: add cxl support Add CXL initialization based on new CXL API for accel drivers and make it dependent on kernel CXL configuration. Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Acked-by: Edward Cree Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (cherry picked from https://lore.kernel.org/r/20260423180528.17166-2-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko --- drivers/net/ethernet/sfc/Kconfig | 9 +++++ drivers/net/ethernet/sfc/Makefile | 1 + drivers/net/ethernet/sfc/efx.c | 14 +++++++- drivers/net/ethernet/sfc/efx_cxl.c | 52 +++++++++++++++++++++++++++ drivers/net/ethernet/sfc/efx_cxl.h | 29 +++++++++++++++ drivers/net/ethernet/sfc/net_driver.h | 10 ++++++ 6 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/sfc/efx_cxl.c create mode 100644 drivers/net/ethernet/sfc/efx_cxl.h diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig index c4c43434f3143..979f2801e2a8e 100644 --- a/drivers/net/ethernet/sfc/Kconfig +++ b/drivers/net/ethernet/sfc/Kconfig @@ -66,6 +66,15 @@ config SFC_MCDI_LOGGING Driver-Interface) commands and responses, allowing debugging of driver/firmware interaction. The tracing is actually enabled by a sysfs file 'mcdi_logging' under the PCI device. +config SFC_CXL + bool "Solarflare SFC9100-family CXL support" + depends on SFC && CXL_BUS >= SFC + default SFC + help + This enables SFC CXL support if the kernel is configuring CXL for + using CTPIO with CXL.mem. The SFC device with CXL support and + with a CXL-aware firmware can be used for minimizing latencies + when sending through CTPIO. source "drivers/net/ethernet/sfc/falcon/Kconfig" source "drivers/net/ethernet/sfc/siena/Kconfig" diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile index d99039ec468d6..bb0f1891cde65 100644 --- a/drivers/net/ethernet/sfc/Makefile +++ b/drivers/net/ethernet/sfc/Makefile @@ -13,6 +13,7 @@ sfc-$(CONFIG_SFC_SRIOV) += sriov.o ef10_sriov.o ef100_sriov.o ef100_rep.o \ mae.o tc.o tc_bindings.o tc_counters.o \ tc_encap_actions.o tc_conntrack.o +sfc-$(CONFIG_SFC_CXL) += efx_cxl.o obj-$(CONFIG_SFC) += sfc.o obj-$(CONFIG_SFC_FALCON) += falcon/ diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 8f136a11d3968..90ccbe3103860 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -34,6 +34,7 @@ #include "selftest.h" #include "sriov.h" #include "efx_devlink.h" +#include "efx_cxl.h" #include "mcdi_port_common.h" #include "mcdi_pcol.h" @@ -981,12 +982,14 @@ static void efx_pci_remove(struct pci_dev *pci_dev) efx_pci_remove_main(efx); efx_fini_io(efx); + + probe_data = container_of(efx, struct efx_probe_data, efx); + pci_dbg(efx->pci_dev, "shutdown successful\n"); efx_fini_devlink_and_unlock(efx); efx_fini_struct(efx); free_netdev(efx->net_dev); - probe_data = container_of(efx, struct efx_probe_data, efx); kfree(probe_data); }; @@ -1190,6 +1193,15 @@ static int efx_pci_probe(struct pci_dev *pci_dev, if (rc) goto fail2; + /* A successful cxl initialization implies a CXL region created to be + * used for PIO buffers. If there is no CXL support, or initialization + * fails, cxl_pio_initialised will be false and legacy PIO buffers + * defined at specific PCI BAR regions will be used. + */ + rc = efx_cxl_init(probe_data); + if (rc) + pci_err(pci_dev, "CXL initialization failed with error %d\n", rc); + rc = efx_pci_probe_post_io(efx); if (rc) { /* On failure, retry once immediately. diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c new file mode 100644 index 0000000000000..b7e8d85a43d37 --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * + * Driver for AMD network controllers and boards + * Copyright (C) 2025, Advanced Micro Devices, Inc. + */ + +#include + +#include "net_driver.h" +#include "efx_cxl.h" + +#define EFX_CTPIO_BUFFER_SIZE SZ_256M + +int efx_cxl_init(struct efx_probe_data *probe_data) +{ + struct efx_nic *efx = &probe_data->efx; + struct pci_dev *pci_dev = efx->pci_dev; + struct efx_cxl *cxl; + u16 dvsec; + + probe_data->cxl_pio_initialised = false; + + /* Is the device configured with and using CXL? */ + if (!pcie_is_cxl(pci_dev)) + return 0; + + dvsec = pci_find_dvsec_capability(pci_dev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) { + pci_info(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability not found\n"); + return 0; + } + + pci_dbg(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability found\n"); + + /* Create a cxl_dev_state embedded in the cxl struct using cxl core api + * specifying no mbox available. + */ + cxl = devm_cxl_dev_state_create(&pci_dev->dev, CXL_DEVTYPE_DEVMEM, + pci_get_dsn(pci_dev), dvsec, + struct efx_cxl, cxlds, false); + + if (!cxl) + return -ENOMEM; + + probe_data->cxl = cxl; + + return 0; +} + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h new file mode 100644 index 0000000000000..04e46278464df --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_cxl.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for AMD network controllers and boards + * Copyright (C) 2025, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +#ifndef EFX_CXL_H +#define EFX_CXL_H + +#ifdef CONFIG_SFC_CXL + +#include + +struct efx_probe_data; + +struct efx_cxl { + struct cxl_dev_state cxlds; + struct cxl_memdev *cxlmd; +}; + +int efx_cxl_init(struct efx_probe_data *probe_data); +#else +static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; } +#endif +#endif diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index b98c259f672db..3964b2c56609c 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -1197,14 +1197,24 @@ struct efx_nic { atomic_t n_rx_noskb_drops; }; +#ifdef CONFIG_SFC_CXL +struct efx_cxl; +#endif + /** * struct efx_probe_data - State after hardware probe * @pci_dev: The PCI device * @efx: Efx NIC details + * @cxl: details of related cxl objects + * @cxl_pio_initialised: cxl initialization outcome. */ struct efx_probe_data { struct pci_dev *pci_dev; struct efx_nic efx; +#ifdef CONFIG_SFC_CXL + struct efx_cxl *cxl; + bool cxl_pio_initialised; +#endif }; static inline struct efx_nic *efx_netdev_priv(struct net_device *dev) From ba9972ccf9653af1be548b4312b19354ee14cbc8 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Wed, 13 May 2026 23:39:25 +0800 Subject: [PATCH 06/38] NVIDIA: VR: SAUCE: cxl/sfc: Map cxl regs Export cxl core functions for a Type2 driver being able to discover and map the device registers. Use it in sfc driver cxl initialization. Signed-off-by: Alejandro Lucero Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (backported from https://lore.kernel.org/r/20260423180528.17166-3-alejandro.lucero-palau@amd.com) [kobak: Kept cxl_pci_setup_regs() in the core/pci provider added by the full Type2 prerequisite series and dropped the duplicate provider hunk from drivers/cxl/pci.c.] Signed-off-by: Koba Ko --- drivers/cxl/core/pci.c | 1 + drivers/cxl/core/port.c | 1 + drivers/cxl/core/regs.c | 1 + drivers/cxl/cxlpci.h | 10 ---------- drivers/cxl/pci.c | 1 + drivers/net/ethernet/sfc/efx_cxl.c | 26 ++++++++++++++++++++++++++ include/cxl/pci.h | 22 ++++++++++++++++++++++ 7 files changed, 52 insertions(+), 10 deletions(-) create mode 100644 include/cxl/pci.h diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index d1f487b3d809a..2bcd683aa286d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c5aacd7054f1d..dbe30e7c383be 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 93710cf4f0a69..20c2d9fbcfe7d 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index b826eb53cf7ba..224636588f623 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -13,16 +13,6 @@ */ #define CXL_PCI_DEFAULT_MAX_VECTORS 16 -/* Register Block Identifier (RBI) */ -enum cxl_regloc_type { - CXL_REGLOC_RBI_EMPTY = 0, - CXL_REGLOC_RBI_COMPONENT, - CXL_REGLOC_RBI_VIRT, - CXL_REGLOC_RBI_MEMDEV, - CXL_REGLOC_RBI_PMU, - CXL_REGLOC_RBI_TYPES -}; - /* * Table Access DOE, CDAT Read Entry Response * diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index adc7c4bcb03aa..5cad118e71015 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "cxlmem.h" #include "cxlpci.h" diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index b7e8d85a43d37..f88c2b3fbb2c3 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -7,6 +7,8 @@ #include +#include +#include #include "net_driver.h" #include "efx_cxl.h" @@ -18,6 +20,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data) struct pci_dev *pci_dev = efx->pci_dev; struct efx_cxl *cxl; u16 dvsec; + int rc; probe_data->cxl_pio_initialised = false; @@ -44,6 +47,29 @@ int efx_cxl_init(struct efx_probe_data *probe_data) if (!cxl) return -ENOMEM; + rc = cxl_pci_setup_regs(pci_dev, CXL_REGLOC_RBI_COMPONENT, + &cxl->cxlds.reg_map); + if (rc) { + pci_err(pci_dev, "No component registers\n"); + return rc; + } + + if (!cxl->cxlds.reg_map.component_map.hdm_decoder.valid) { + pci_err(pci_dev, "Expected HDM component register not found\n"); + return -ENODEV; + } + + if (!cxl->cxlds.reg_map.component_map.ras.valid) { + pci_err(pci_dev, "Expected RAS component register not found\n"); + return -ENODEV; + } + + /* Set media ready explicitly as there are neither mailbox for checking + * this state nor the CXL register involved, both not mandatory for + * type2. + */ + cxl->cxlds.media_ready = true; + probe_data->cxl = cxl; return 0; diff --git a/include/cxl/pci.h b/include/cxl/pci.h new file mode 100644 index 0000000000000..3e0000015871a --- /dev/null +++ b/include/cxl/pci.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ + +#ifndef __CXL_CXL_PCI_H__ +#define __CXL_CXL_PCI_H__ + +/* Register Block Identifier (RBI) */ +enum cxl_regloc_type { + CXL_REGLOC_RBI_EMPTY = 0, + CXL_REGLOC_RBI_COMPONENT, + CXL_REGLOC_RBI_VIRT, + CXL_REGLOC_RBI_MEMDEV, + CXL_REGLOC_RBI_PMU, + CXL_REGLOC_RBI_TYPES +}; + +struct cxl_register_map; +struct pci_dev; + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +#endif From cb8c41246ed1d64a5955cc2b842f0b532ced7eb3 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:23 +0100 Subject: [PATCH 07/38] NVIDIA: VR: SAUCE: cxl/sfc: Initialize dpa without a mailbox Type3 relies on mailbox CXL_MBOX_OP_IDENTIFY command for initializing memdev state params which end up being used for DPA initialization. Allow a Type2 driver to initialize DPA simply by giving the size of its volatile hardware partition. Move related functions to memdev. Add sfc driver as the client. Signed-off-by: Alejandro Lucero Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Reviewed-by: Jonathan Cameron (cherry picked from https://lore.kernel.org/r/20260423180528.17166-4-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko --- drivers/cxl/core/core.h | 2 + drivers/cxl/core/mbox.c | 51 +---------------------- drivers/cxl/core/memdev.c | 66 ++++++++++++++++++++++++++++++ drivers/net/ethernet/sfc/efx_cxl.c | 5 +++ include/cxl/cxl.h | 2 + 5 files changed, 76 insertions(+), 50 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5539e941782f6..77c684744771b 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -97,6 +97,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, struct dentry *cxl_debugfs_create_dir(const char *dir); int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, enum cxl_partition_mode mode); +struct cxl_memdev_state; +int cxl_mem_get_partition_info(struct cxl_memdev_state *mds); int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index aa3724f51ce93..bcf77e604dc3d 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1151,7 +1151,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL"); * * See CXL @8.2.9.5.2.1 Get Partition Info */ -static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) +int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; struct cxl_mbox_get_partition_info pi; @@ -1307,55 +1307,6 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) return -EBUSY; } -static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) -{ - int i = info->nr_partitions; - - if (size == 0) - return; - - info->part[i].range = (struct range) { - .start = start, - .end = start + size - 1, - }; - info->part[i].mode = mode; - info->nr_partitions++; -} - -int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) -{ - struct cxl_dev_state *cxlds = &mds->cxlds; - struct device *dev = cxlds->dev; - int rc; - - if (!cxlds->media_ready) { - info->size = 0; - return 0; - } - - info->size = mds->total_bytes; - - if (mds->partition_align_bytes == 0) { - add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); - add_part(info, mds->volatile_only_bytes, - mds->persistent_only_bytes, CXL_PARTMODE_PMEM); - return 0; - } - - rc = cxl_mem_get_partition_info(mds); - if (rc) { - dev_err(dev, "Failed to query partition information\n"); - return rc; - } - - add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); - add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, - CXL_PARTMODE_PMEM); - - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); - int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 99e422594885a..166ec6b7d0416 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -582,6 +582,72 @@ bool is_cxl_memdev(const struct device *dev) } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); +static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) +{ + int i = info->nr_partitions; + + if (size == 0) + return; + + info->part[i].range = (struct range) { + .start = start, + .end = start + size - 1, + }; + info->part[i].mode = mode; + info->nr_partitions++; +} + +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) +{ + struct cxl_dev_state *cxlds = &mds->cxlds; + struct device *dev = cxlds->dev; + int rc; + + if (!cxlds->media_ready) { + info->size = 0; + return 0; + } + + info->size = mds->total_bytes; + + if (mds->partition_align_bytes == 0) { + add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->volatile_only_bytes, + mds->persistent_only_bytes, CXL_PARTMODE_PMEM); + return 0; + } + + rc = cxl_mem_get_partition_info(mds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } + + add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, + CXL_PARTMODE_PMEM); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); + +/** + * cxl_set_capacity: initialize dpa by a driver without a mailbox. + * + * @cxlds: pointer to cxl_dev_state + * @capacity: device volatile memory size + */ +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity) +{ + struct cxl_dpa_info range_info = { + .size = capacity, + }; + + add_part(&range_info, 0, capacity, CXL_PARTMODE_RAM); + return cxl_dpa_setup(cxlds, &range_info); +} +EXPORT_SYMBOL_NS_GPL(cxl_set_capacity, "CXL"); + /** * set_exclusive_cxl_commands() - atomically disable user cxl commands * @mds: The device state to operate on diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index f88c2b3fbb2c3..4d55c08cf2a16 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -70,6 +70,11 @@ int efx_cxl_init(struct efx_probe_data *probe_data) */ cxl->cxlds.media_ready = true; + if (cxl_set_capacity(&cxl->cxlds, EFX_CTPIO_BUFFER_SIZE)) { + pci_err(pci_dev, "dpa capacity setup failed\n"); + return -ENODEV; + } + probe_data->cxl = cxl; return 0; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index fa72691546205..1346771edc4e4 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -223,4 +223,6 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ sizeof(drv_struct), mbox); \ }) + +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); #endif /* __CXL_CXL_H__ */ From 79654508fdaf511b70639197a2b1edba81b7f0b0 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:24 +0100 Subject: [PATCH 08/38] NVIDIA: VR: SAUCE: cxl: Prepare memdev creation for type2 Current cxl core is relying on a CXL_DEVTYPE_CLASSMEM type device when creating a memdev leading to problems when obtaining cxl_memdev_state references from a CXL_DEVTYPE_DEVMEM type. Modify check for obtaining cxl_memdev_state adding CXL_DEVTYPE_DEVMEM support. Make devm_cxl_add_memdev accessible from an accel driver. Signed-off-by: Alejandro Lucero Reviewed-by: Ben Cheatham Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Dan Williams (cherry picked from https://lore.kernel.org/r/20260423180528.17166-5-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko --- drivers/cxl/core/memdev.c | 15 +++++++++++-- drivers/cxl/cxlmem.h | 6 ------ drivers/cxl/mem.c | 45 +++++++++++++++++++++++++++++---------- include/cxl/cxl.h | 6 ++++++ 4 files changed, 53 insertions(+), 19 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 166ec6b7d0416..759b43364ed7a 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "trace.h" #include "core.h" @@ -576,9 +577,16 @@ static const struct device_type cxl_memdev_type = { .groups = cxl_memdev_attribute_groups, }; +static const struct device_type cxl_accel_memdev_type = { + .name = "cxl_accel_memdev", + .release = cxl_memdev_release, + .devnode = cxl_memdev_devnode, +}; + bool is_cxl_memdev(const struct device *dev) { - return dev->type == &cxl_memdev_type; + return (dev->type == &cxl_memdev_type || + dev->type == &cxl_accel_memdev_type); } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); @@ -773,7 +781,10 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, dev->parent = cxlds->dev; dev->bus = &cxl_bus_type; dev->devt = MKDEV(cxl_mem_major, cxlmd->id); - dev->type = &cxl_memdev_type; + if (cxlds->type == CXL_DEVTYPE_DEVMEM) + dev->type = &cxl_accel_memdev_type; + else + dev->type = &cxl_memdev_type; device_set_pm_not_required(dev); INIT_WORK(&cxlmd->detach_work, detach_memdev); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 281546de426e4..c98db6f18aa29 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,10 +34,6 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) -struct cxl_memdev_attach { - int (*probe)(struct cxl_memdev *cxlmd); -}; - /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -103,8 +99,6 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, - const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index fcffe24dcb42f..ff858318091f1 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -65,6 +65,26 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_debugfs_poison_clear, "%llx\n"); +static void cxl_memdev_poison_enable(struct cxl_memdev_state *mds, + struct cxl_memdev *cxlmd, + struct dentry *dentry) +{ + /* + * Avoid poison debugfs for DEVMEM aka accelerators as they rely on + * cxl_memdev_state. + */ + if (!mds) + return; + + if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) + debugfs_create_file("inject_poison", 0200, dentry, cxlmd, + &cxl_poison_inject_fops); + + if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) + debugfs_create_file("clear_poison", 0200, dentry, cxlmd, + &cxl_poison_clear_fops); +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -92,12 +112,7 @@ static int cxl_mem_probe(struct device *dev) dentry = cxl_debugfs_create_dir(dev_name(dev)); debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); - if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) - debugfs_create_file("inject_poison", 0200, dentry, cxlmd, - &cxl_poison_inject_fops); - if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) - debugfs_create_file("clear_poison", 0200, dentry, cxlmd, - &cxl_poison_clear_fops); + cxl_memdev_poison_enable(mds, cxlmd, dentry); rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); if (rc) @@ -206,16 +221,24 @@ static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); -static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +static bool cxl_poison_attr_visible(struct kobject *kobj, struct attribute *a) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); - if (a == &dev_attr_trigger_poison_list.attr) - if (!test_bit(CXL_POISON_ENABLED_LIST, - mds->poison.enabled_cmds)) - return 0; + if (!mds || + !test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) + return false; + + return true; +} + +static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_trigger_poison_list.attr && + !cxl_poison_attr_visible(kobj, a)) + return 0; return a->mode; } diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 1346771edc4e4..10a9b8fa2f6b7 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -149,6 +149,10 @@ struct cxl_dpa_partition { #define CXL_NR_PARTITIONS_MAX 2 +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_dev_state - The driver device state * @@ -225,4 +229,6 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, }) int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); #endif /* __CXL_CXL_H__ */ From 9c9123639c357f467514b51f7e85f0d980a00d09 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:25 +0100 Subject: [PATCH 09/38] NVIDIA: VR: SAUCE: sfc: create type2 cxl memdev Use cxl API for creating a cxl memory device using the type2 cxl_dev_state struct. Signed-off-by: Alejandro Lucero Reviewed-by: Martin Habets Reviewed-by: Fan Ni Acked-by: Edward Cree Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang (cherry picked from https://lore.kernel.org/r/20260423180528.17166-6-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko --- drivers/net/ethernet/sfc/efx_cxl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 4d55c08cf2a16..7d8a6c2133c88 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -75,6 +75,12 @@ int efx_cxl_init(struct efx_probe_data *probe_data) return -ENODEV; } + cxl->cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxl->cxlmd)) { + pci_err(pci_dev, "CXL accel memdev creation failed\n"); + return PTR_ERR(cxl->cxlmd); + } + probe_data->cxl = cxl; return 0; From 388dc8c0c90ab59cb3b61e67afb6cee4317c375e Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:26 +0100 Subject: [PATCH 10/38] NVIDIA: VR: SAUCE: cxl: attach region to an accelerator/type2 memdev Support an accelerator driver to safely work with an autodiscovered region from a committed HDM decoder through: 1) an accelerator driver cxl_attach_region struct with attach and detach callbacks. 2) a specific function, cxl_memdev_attach_region() keeping the required locks for finding a region linked to the memdev endpoint, and 3) invoking attach callback while keeping the locking allowing to work (ioremap and other internal stuff) with the related physical range by the accelerator driver, and 4) linking a detach callback to the endpoint device removal where the accelerator driver can stop using the region range. This covers the cases of a potential removal of cxl_acpi module or a accelerator memdev unbinding from cxl_mem driver through sysfs. Signed-off-by: Alejandro Lucero (backported from https://lore.kernel.org/r/20260423180528.17166-7-alejandro.lucero-palau@amd.com) [kobak: Check cxl_memdev_attach_region() errors and propagate failure so SFC probe does not continue after CXL core tears down the attached region. Set probe_data->cxl before attaching so the attach callback can use it, guard attach attempts before a valid endpoint exists, explicitly unwind attach/autoremove side effects if devres action registration fails, preserve DEVMEM target type for autodiscovered regions, and route delete / construct-failure cleanup through endpoint-owned devres actions.] [kobak: Keep no-detach DEVMEM unregister under the endpoint-device guard so attach cannot install endpoint devres actions for a region being freed.] [kobak: Avoid devres-registration failure cleanup under cxl_rwsem.region read lock: keep endpoint->dev locked, drop the region/DPA read guards before unregister_region(), and use devm_remove_action() so failed detach-action registration does not run cxl_endpoint_region_autoremove() under the read lock.] Signed-off-by: Koba Ko --- drivers/cxl/core/region.c | 193 +++++++++++++++++++++++++++-- drivers/cxl/cxl.h | 4 + drivers/net/ethernet/sfc/efx_cxl.c | 37 ++++++ drivers/net/ethernet/sfc/efx_cxl.h | 2 + include/cxl/cxl.h | 17 +++ 5 files changed, 242 insertions(+), 11 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 9ad83813a5e60..f8fdc0ec27316 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2467,6 +2467,41 @@ static void unregister_region(void *_cxlr) put_device(&cxlr->dev); } +static void cxl_endpoint_region_autoremove(void *_cxlr); + +static void cxl_region_release_action(struct cxl_region *cxlr) +{ + struct cxl_port *port = cxlrd_to_port(cxlr->cxlrd); + + if (cxlr->type != CXL_DECODER_DEVMEM) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return; + } + + if (cxlr->params.nr_targets) { + struct cxl_endpoint_decoder *cxled = cxlr->params.targets[0]; + struct cxl_port *endpoint = cxled_to_port(cxled); + + guard(device)(&endpoint->dev); + if (cxlr->detach) { + void (*detach)(void *data) = cxlr->detach; + void *detach_data = cxlr->detach_data; + + cxlr->detach = NULL; + cxlr->detach_data = NULL; + devm_release_action(&endpoint->dev, detach, detach_data); + devm_release_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + } else { + unregister_region(cxlr); + } + return; + } + + unregister_region(cxlr); +} + static struct lock_class_key cxl_region_key; static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) @@ -2619,9 +2654,16 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, if (rc) goto err; - rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); - if (rc) - return ERR_PTR(rc); + /* + * For accelerators/type2, region release linked to endpoint device. + * See handling of cxl_endpoint_region_autoremove() below by + * cxl_memdev_attach_region(). + */ + if (type == CXL_DECODER_HOSTONLYMEM) { + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + } dev_dbg(port->uport_dev, "%s: created %s\n", dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); @@ -2650,7 +2692,8 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_partition_mode mode, int id) + enum cxl_partition_mode mode, int id, + enum cxl_decoder_type type) { int rc; @@ -2672,7 +2715,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, return ERR_PTR(-EBUSY); } - return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM); + return devm_cxl_add_region(cxlrd, id, mode, type); } static ssize_t create_region_store(struct device *dev, const char *buf, @@ -2686,7 +2729,7 @@ static ssize_t create_region_store(struct device *dev, const char *buf, if (rc != 1) return -EINVAL; - cxlr = __create_region(cxlrd, mode, id); + cxlr = __create_region(cxlrd, mode, id, CXL_DECODER_HOSTONLYMEM); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); @@ -2743,14 +2786,13 @@ static ssize_t delete_region_store(struct device *dev, const char *buf, size_t len) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); - struct cxl_port *port = to_cxl_port(dev->parent); struct cxl_region *cxlr; cxlr = cxl_find_region_by_name(cxlrd, buf); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); put_device(&cxlr->dev); return len; @@ -3897,7 +3939,6 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, { struct cxl_endpoint_decoder *cxled = ctx->cxled; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct cxl_dev_state *cxlds = cxlmd->cxlds; int rc, part = READ_ONCE(cxled->part); struct cxl_region *cxlr; @@ -3912,7 +3953,8 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, - atomic_read(&cxlrd->region_id)); + atomic_read(&cxlrd->region_id), + cxled->cxld.target_type); } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); if (IS_ERR(cxlr)) { @@ -3925,7 +3967,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = __construct_region(cxlr, ctx); if (rc) { - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); return ERR_PTR(rc); } @@ -4208,6 +4250,135 @@ static int cxl_region_can_probe(struct cxl_region *cxlr) return 0; } +static int first_mapped_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (cxled->cxld.region) + return 1; + + return 0; +} + +/* + * As this is running in endpoint port remove context it does not race cxl_root + * destruction since port topologies are always removed depth first. + */ +static void cxl_endpoint_region_autoremove(void *_cxlr) +{ + unregister_region(_cxlr); +} + +/** + * cxl_memdev_attach_region - bind region to accelerator memdev + * + * @cxlmd: a pointer to cxl_memdev to use + * @attach: a pointer to region attach struct with callbacks for + * safely working with a region range by the caller + * + * Returns 0 or error. + */ +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, + struct cxl_attach_region *attach) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + int rc; + + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + if (!endpoint) + return -ENXIO; + + { + /* hold endpoint lock to setup autoremove of the region */ + guard(device)(&endpoint->dev); + if (!endpoint->dev.driver) + return -ENXIO; + + { + guard(rwsem_read)(&cxl_rwsem.region); + guard(rwsem_read)(&cxl_rwsem.dpa); + + /* + * TODO auto-instantiate a region, for now assume this will + * find an auto-region. + */ + struct device *dev __free(put_device) = + device_find_child(&endpoint->dev, NULL, + first_mapped_decoder); + + if (!dev) { + dev_dbg(cxlmd->cxlds->dev, + "no region found for memdev %s\n", + dev_name(&cxlmd->dev)); + return -ENXIO; + } + + cxled = to_cxl_endpoint_decoder(dev); + cxlr = cxled->cxld.region; + + if (cxlr->params.state < CXL_CONFIG_COMMIT) { + dev_dbg(cxlmd->cxlds->dev, + "region %s not committed for memdev %s\n", + dev_name(&cxlr->dev), dev_name(&cxlmd->dev)); + return -ENXIO; + } + + if (cxlr->params.nr_targets > 1) { + dev_dbg(cxlmd->cxlds->dev, + "Only attach to local non-interleaved region\n"); + return -ENXIO; + } + + attach->region = (struct range) { + .start = cxlr->params.res->start, + .end = cxlr->params.res->end, + }; + + /* + * With endpoint locked leave the caller to safely work + * with the region range. + */ + rc = attach->attach(attach->data); + if (rc) + return rc; + + /* Only teardown regions that pass validation, ignore the rest */ + rc = devm_add_action(&endpoint->dev, + cxl_endpoint_region_autoremove, cxlr); + if (rc) { + attach->detach(attach->data); + goto err_unregister; + } + + /* Link type2 driver callback for stopping use of the region range. */ + rc = devm_add_action_or_reset(&endpoint->dev, + attach->detach, attach->data); + if (rc) { + devm_remove_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + goto err_unregister; + } + + cxlr->detach = attach->detach; + cxlr->detach_data = attach->data; + + return 0; + } +err_unregister: + unregister_region(cxlr); + return rc; + } +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_attach_region, "CXL"); + static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 8194447f75d30..94d7d7965c512 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -443,6 +443,8 @@ struct cxl_region_params { * @hpa_range: Address range occupied by the region * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type + * @detach: accelerator detach callback for device-memory regions + * @detach_data: accelerator detach callback data * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge * @flags: Region state flags @@ -458,6 +460,8 @@ struct cxl_region { struct range hpa_range; enum cxl_partition_mode mode; enum cxl_decoder_type type; + void (*detach)(void *data); + void *detach_data; struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_pmem_region *cxlr_pmem; unsigned long flags; diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index 7d8a6c2133c88..a16fefebe13a0 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -14,6 +14,36 @@ #define EFX_CTPIO_BUFFER_SIZE SZ_256M +/* Called with cxl endpoint device locked for precluding potential related + * cxl region removal triggered from user space, allowing safely mapping of + * such cxl region by the sfc driver. + */ +static int efx_cxl_map_region(void *data) { + struct efx_probe_data *probe_data = data; + struct efx_nic *efx = &probe_data->efx; + struct pci_dev *pci_dev = efx->pci_dev; + struct efx_cxl *cxl = probe_data->cxl; + struct range *cxl_pio_range = &cxl->attach_region.region; + + cxl->ctpio_cxl = ioremap(cxl_pio_range->start, + cxl_pio_range->end - cxl_pio_range->start + 1); + if (!cxl->ctpio_cxl) { + pci_err(pci_dev, "CXL ioremap region (%pra) failed\n", + cxl_pio_range); + return -ENOMEM; + } + probe_data->cxl_pio_initialised = true; + return 0; +} + +/* Called at driver exit or when user space triggers cxl region removal. */ +static void efx_cxl_unmap_region(void *data) { + struct efx_probe_data *probe_data = data; + + probe_data->cxl_pio_initialised = false; + iounmap(probe_data->cxl->ctpio_cxl); +} + int efx_cxl_init(struct efx_probe_data *probe_data) { struct efx_nic *efx = &probe_data->efx; @@ -81,8 +111,15 @@ int efx_cxl_init(struct efx_probe_data *probe_data) return PTR_ERR(cxl->cxlmd); } + cxl->attach_region.attach = efx_cxl_map_region; + cxl->attach_region.detach = efx_cxl_unmap_region; + cxl->attach_region.data = probe_data; probe_data->cxl = cxl; + rc = cxl_memdev_attach_region(cxl->cxlmd, &cxl->attach_region); + if (rc) + return rc; + return 0; } diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h index 04e46278464df..1c294cd1df56c 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.h +++ b/drivers/net/ethernet/sfc/efx_cxl.h @@ -20,6 +20,8 @@ struct efx_probe_data; struct efx_cxl { struct cxl_dev_state cxlds; struct cxl_memdev *cxlmd; + struct cxl_attach_region attach_region; + void __iomem *ctpio_cxl; }; int efx_cxl_init(struct efx_probe_data *probe_data); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 10a9b8fa2f6b7..22d9435b351fb 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -7,6 +7,7 @@ #include #include +#include #include /** @@ -153,6 +154,20 @@ struct cxl_memdev_attach { int (*probe)(struct cxl_memdev *cxlmd); }; +/** + * struct cxl_attach_region - accelerator region handling + * @attach: invoked at cxl_memdev_attach_region() with endpoint device locked. + * @detach: invoked at endpoint release. + * @data: pointer referencing accelerator data for attach and detach calls. + * @region: initialised with autodiscovered region values linked to memdev. + */ +struct cxl_attach_region { + int (*attach)(void *); + void (*detach)(void *); + void *data; + struct range region; +}; + /** * struct cxl_dev_state - The driver device state * @@ -231,4 +246,6 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); +struct cxl_region; +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); #endif /* __CXL_CXL_H__ */ From d6acd1ddf9f69d8c5e13c29e4b83c8df570ddda8 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Thu, 23 Apr 2026 19:05:27 +0100 Subject: [PATCH 11/38] NVIDIA: VR: SAUCE: cxl: Avoid dax creation for accelerators By definition a type2 cxl device will use the host managed memory for specific functionality, therefore it should not be available to other uses like DAX. Signed-off-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham (cherry picked from https://lore.kernel.org/r/20260423180528.17166-8-alejandro.lucero-palau@amd.com) Signed-off-by: Koba Ko --- drivers/cxl/core/region.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index f8fdc0ec27316..9806eb9e19fac 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -4389,6 +4389,13 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; + /* + * HDM-D[B] (device-memory) regions have accelerator specific usage. + * Skip device-dax registration. + */ + if (cxlr->type == CXL_DECODER_DEVMEM) + return 0; + /* * From this point on any path that changes the region's state away from * CXL_CONFIG_COMMIT is also responsible for releasing the driver. From 0fdb2010fcfc0f17628fe8c08e82604cc6d331e0 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Date: Wed, 13 May 2026 23:39:27 +0800 Subject: [PATCH 12/38] NVIDIA: VR: SAUCE: sfc: support pio mapping based on cxl A PIO buffer is a region of device memory to which the driver can write a packet for TX, with the device handling the transmit doorbell without requiring a DMA for getting the packet data, which helps reducing latency in certain exchanges. With CXL mem protocol this latency can be lowered further. With a device supporting CXL and successfully initialised, use the cxl region to map the memory range and use this mapping for PIO buffers. Add the disabling of those CXL-based PIO buffers if the callback for potential cxl endpoint removal by the CXL core happens. Signed-off-by: Alejandro Lucero (backported from https://lore.kernel.org/r/20260423180528.17166-9-alejandro.lucero-palau@amd.com) [kobak: Added a !EFX_USE_PIO same-module stub for efx_ef10_disable_piobufs() so non-x86 builds that still enable CONFIG_SFC_CXL do not leave efx_cxl.o unresolved.] Signed-off-by: Koba Ko --- drivers/net/ethernet/sfc/ef10.c | 78 ++++++++++++++++++++++++--- drivers/net/ethernet/sfc/efx.h | 1 + drivers/net/ethernet/sfc/efx_cxl.c | 2 + drivers/net/ethernet/sfc/net_driver.h | 2 + drivers/net/ethernet/sfc/nic.h | 3 ++ 5 files changed, 79 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 7e04f115bbaaa..52ad07c121833 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -24,6 +24,7 @@ #include #include #include +#include "efx_cxl.h" /* Hardware control for EF10 architecture including 'Huntington'. */ @@ -106,7 +107,7 @@ static int efx_ef10_get_vf_index(struct efx_nic *efx) static int efx_ef10_init_datapath_caps(struct efx_nic *efx) { - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V4_OUT_LEN); + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V7_OUT_LEN); struct efx_ef10_nic_data *nic_data = efx->nic_data; size_t outlen; int rc; @@ -177,6 +178,12 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx) efx->num_mac_stats); } + if (outlen < MC_CMD_GET_CAPABILITIES_V7_OUT_LEN) + nic_data->datapath_caps3 = 0; + else + nic_data->datapath_caps3 = MCDI_DWORD(outbuf, + GET_CAPABILITIES_V7_OUT_FLAGS3); + return 0; } @@ -771,6 +778,35 @@ static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n) return rc; } +#ifdef CONFIG_SFC_CXL +/* Invoked from cxl core when a cxl region is removed. This is expected at + * driver exit linked to cxl core devm releases which does not require the + * below sync. + * + * However, it is required when user space actions triggger such a cxl region + * removal forcing any cxl piobuf usage to stop. Setting per tx queue piobuf + * to NULL is safe if such a tx queue is not currently in use inside + * efx_hard_start_xmit() implying tx_queue locked. + * + * After this the cxl region physical range can be safely unmap. + */ +void efx_ef10_disable_piobufs(struct efx_nic *efx) +{ + struct efx_tx_queue *tx_queue; + struct efx_channel *channel; + + local_bh_disable(); + efx_for_each_channel(channel, efx) + efx_for_each_channel_tx_queue(tx_queue, channel) { + HARD_TX_LOCK(efx->net_dev, tx_queue->core_txq, + smp_processor_id()); + tx_queue->piobuf = NULL; + HARD_TX_UNLOCK(efx->net_dev, tx_queue->core_txq); + } + local_bh_enable(); +} +#endif + static int efx_ef10_link_piobufs(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx->nic_data; @@ -914,6 +950,12 @@ static void efx_ef10_forget_old_piobufs(struct efx_nic *efx) { } +#ifdef CONFIG_SFC_CXL +void efx_ef10_disable_piobufs(struct efx_nic *efx) +{ +} +#endif + #endif /* EFX_USE_PIO */ static void efx_ef10_remove(struct efx_nic *efx) @@ -1140,6 +1182,9 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) unsigned int channel_vis, pio_write_vi_base, max_vis; struct efx_ef10_nic_data *nic_data = efx->nic_data; unsigned int uc_mem_map_size, wc_mem_map_size; +#ifdef CONFIG_SFC_CXL + struct efx_probe_data *probe_data; +#endif void __iomem *membase; int rc; @@ -1263,8 +1308,25 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) iounmap(efx->membase); efx->membase = membase; - /* Set up the WC mapping if needed */ - if (wc_mem_map_size) { + if (!wc_mem_map_size) + goto skip_pio; + + /* Set up the WC mapping */ + +#ifdef CONFIG_SFC_CXL + probe_data = container_of(efx, struct efx_probe_data, efx); + if ((nic_data->datapath_caps3 & + (1 << MC_CMD_GET_CAPABILITIES_V7_OUT_CXL_CONFIG_ENABLE_LBN)) && + probe_data->cxl_pio_initialised) { + /* Using PIO through CXL mapping */ + nic_data->pio_write_base = probe_data->cxl->ctpio_cxl; + nic_data->pio_write_vi_base = pio_write_vi_base; + + probe_data->cxl_pio_in_use = true; + } else +#endif + { + /* Using legacy PIO BAR mapping */ nic_data->wc_membase = ioremap_wc(efx->membase_phys + uc_mem_map_size, wc_mem_map_size); @@ -1279,12 +1341,14 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) nic_data->wc_membase + (pio_write_vi_base * efx->vi_stride + ER_DZ_TX_PIOBUF - uc_mem_map_size); - - rc = efx_ef10_link_piobufs(efx); - if (rc) - efx_ef10_free_piobufs(efx); } + rc = efx_ef10_link_piobufs(efx); + if (rc) + efx_ef10_free_piobufs(efx); + +skip_pio: + netif_dbg(efx, probe, efx->net_dev, "memory BAR at %pa (virtual %p+%x UC, %p+%x WC)\n", &efx->membase_phys, efx->membase, uc_mem_map_size, diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index 45e1916866256..37fd1cf96582e 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -237,4 +237,5 @@ static inline bool efx_rwsem_assert_write_locked(struct rw_semaphore *sem) int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs, bool flush); +void efx_ef10_disable_piobufs(struct efx_nic *efx); #endif /* EFX_EFX_H */ diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c index a16fefebe13a0..52b2cded76daf 100644 --- a/drivers/net/ethernet/sfc/efx_cxl.c +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -10,6 +10,7 @@ #include #include #include "net_driver.h" +#include "efx.h" #include "efx_cxl.h" #define EFX_CTPIO_BUFFER_SIZE SZ_256M @@ -40,6 +41,7 @@ static int efx_cxl_map_region(void *data) { static void efx_cxl_unmap_region(void *data) { struct efx_probe_data *probe_data = data; + efx_ef10_disable_piobufs(&probe_data->efx); probe_data->cxl_pio_initialised = false; iounmap(probe_data->cxl->ctpio_cxl); } diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 3964b2c56609c..bea4eecdf842d 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -1207,6 +1207,7 @@ struct efx_cxl; * @efx: Efx NIC details * @cxl: details of related cxl objects * @cxl_pio_initialised: cxl initialization outcome. + * @cxl_pio_in_use: PIO using CXL mapping */ struct efx_probe_data { struct pci_dev *pci_dev; @@ -1214,6 +1215,7 @@ struct efx_probe_data { #ifdef CONFIG_SFC_CXL struct efx_cxl *cxl; bool cxl_pio_initialised; + bool cxl_pio_in_use; #endif }; diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index ec3b2df43b68d..7480f9995dfb8 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -152,6 +152,8 @@ enum { * %MC_CMD_GET_CAPABILITIES response) * @datapath_caps2: Further Capabilities of datapath firmware (FLAGS2 field of * %MC_CMD_GET_CAPABILITIES response) + * @datapath_caps3: Further Capabilities of datapath firmware (FLAGS3 field of + * %MC_CMD_GET_CAPABILITIES response) * @rx_dpcpu_fw_id: Firmware ID of the RxDPCPU * @tx_dpcpu_fw_id: Firmware ID of the TxDPCPU * @must_probe_vswitching: Flag: vswitching has yet to be setup after MC reboot @@ -187,6 +189,7 @@ struct efx_ef10_nic_data { bool must_check_datapath_caps; u32 datapath_caps; u32 datapath_caps2; + u32 datapath_caps3; unsigned int rx_dpcpu_fw_id; unsigned int tx_dpcpu_fw_id; bool must_probe_vswitching; From 1d651dbee5bc7e1e26b243967c25a1db79f74237 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 10 Feb 2026 06:44:53 +0000 Subject: [PATCH 13/38] NVIDIA: VR: SAUCE: dax/hmem: Request cxl_acpi and cxl_pci before walking Soft Reserved ranges Ensure cxl_acpi has published CXL Window resources before HMEM walks Soft Reserved ranges. Replace MODULE_SOFTDEP("pre: cxl_acpi") with an explicit, synchronous request_module("cxl_acpi"). MODULE_SOFTDEP() only guarantees eventual loading, it does not enforce that the dependency has finished init before the current module runs. This can cause HMEM to start before cxl_acpi has populated the resource tree, breaking detection of overlaps between Soft Reserved and CXL Windows. Also, request cxl_pci before HMEM walks Soft Reserved ranges. Unlike cxl_acpi, cxl_pci attach is asynchronous and creates dependent devices that trigger further module loads. Asynchronous probe flushing (wait_for_device_probe()) is added later in the series in a deferred context before HMEM makes ownership decisions for Soft Reserved ranges. Add an additional explicit Kconfig ordering so that CXL_ACPI and CXL_PCI must be initialized before DEV_DAX_HMEM. This prevents HMEM from consuming Soft Reserved ranges before CXL drivers have had a chance to claim them. Signed-off-by: Dan Williams Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-2-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-2-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/Kconfig | 2 ++ drivers/dax/hmem/hmem.c | 17 ++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index d656e4c0eb846..3683bb3f2311b 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -48,6 +48,8 @@ config DEV_DAX_CXL tristate "CXL DAX: direct access to CXL RAM regions" depends on CXL_BUS && CXL_REGION && DEV_DAX default CXL_REGION && DEV_DAX + depends on CXL_ACPI >= DEV_DAX_HMEM + depends on CXL_PCI >= DEV_DAX_HMEM help CXL RAM regions are either mapped by platform-firmware and published in the initial system-memory map as "System RAM", mapped diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 1cf7c2a0ee1cb..008172fc3607e 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -139,6 +139,16 @@ static __init int dax_hmem_init(void) { int rc; + /* + * Ensure that cxl_acpi and cxl_pci have a chance to kick off + * CXL topology discovery at least once before scanning the + * iomem resource tree for IORES_DESC_CXL resources. + */ + if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) { + request_module("cxl_acpi"); + request_module("cxl_pci"); + } + rc = platform_driver_register(&dax_hmem_platform_driver); if (rc) return rc; @@ -159,13 +169,6 @@ static __exit void dax_hmem_exit(void) module_init(dax_hmem_init); module_exit(dax_hmem_exit); -/* Allow for CXL to define its own dax regions */ -#if IS_ENABLED(CONFIG_CXL_REGION) -#if IS_MODULE(CONFIG_CXL_ACPI) -MODULE_SOFTDEP("pre: cxl_acpi"); -#endif -#endif - MODULE_ALIAS("platform:hmem*"); MODULE_ALIAS("platform:hmem_platform*"); MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory"); From 77ac7f9272ba0f4eb6e293435d9efbc1af45d6ab Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 10 Feb 2026 06:44:54 +0000 Subject: [PATCH 14/38] NVIDIA: VR: SAUCE: dax/hmem: Gate Soft Reserved deferral on DEV_DAX_CXL Replace IS_ENABLED(CONFIG_CXL_REGION) with IS_ENABLED(CONFIG_DEV_DAX_CXL) so that HMEM only defers Soft Reserved ranges when CXL DAX support is enabled. This makes the coordination between HMEM and the CXL stack more precise and prevents deferral in unrelated CXL configurations. Signed-off-by: Dan Williams Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-3-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-3-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/hmem/hmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 008172fc3607e..1e34243584905 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -66,7 +66,7 @@ static int hmem_register_device(struct device *host, int target_nid, long id; int rc; - if (IS_ENABLED(CONFIG_CXL_REGION) && + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { dev_dbg(host, "deferring range to CXL: %pr\n", res); From d8ce89e1116ed0f9dbf3c8c8d1f45c7c91b09e2c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:55 +0000 Subject: [PATCH 15/38] NVIDIA: VR: SAUCE: cxl/region: Skip decoder reset on detach for autodiscovered regions __cxl_decoder_detach() currently resets decoder programming whenever a region is detached if cxl_config_state is beyond CXL_CONFIG_ACTIVE. For autodiscovered regions, this can incorrectly tear down decoder state that may be relied upon by other consumers or by subsequent ownership decisions. Skip cxl_region_decode_reset() during detach when CXL_REGION_F_AUTO is set. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-4-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-4-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/cxl/core/region.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 9806eb9e19fac..37dfb3965fbb3 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2185,7 +2185,9 @@ __cxl_decoder_detach(struct cxl_region *cxlr, cxled->part = -1; if (p->state > CXL_CONFIG_ACTIVE) { - cxl_region_decode_reset(cxlr, p->interleave_ways); + if (!test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; } From 79356d9fd78d540b0e1c6b5f64e69428fc81fe90 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 10 Feb 2026 06:44:56 +0000 Subject: [PATCH 16/38] NVIDIA: VR: SAUCE: dax/cxl, hmem: Initialize hmem early and defer dax_cxl binding Move hmem/ earlier in the dax Makefile so that hmem_init() runs before dax_cxl. In addition, defer registration of the dax_cxl driver to a workqueue instead of using module_cxl_driver(). This ensures that dax_hmem has an opportunity to initialize and register its deferred callback and make ownership decisions before dax_cxl begins probing and claiming Soft Reserved ranges. Mark the dax_cxl driver as PROBE_PREFER_ASYNCHRONOUS so its probe runs out of line from other synchronous probing avoiding ordering dependencies while coordinating ownership decisions with dax_hmem. Signed-off-by: Dan Williams Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-5-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-5-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/Makefile | 3 +-- drivers/dax/cxl.c | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 5ed5c39857c8b..70e996bf15261 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-y += hmem/ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o @@ -10,5 +11,3 @@ dax-y += bus.o device_dax-y := device.o dax_pmem-y := pmem.o dax_cxl-y := cxl.o - -obj-y += hmem/ diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index 13cd94d32ff7a..a2136adfa186e 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -38,10 +38,35 @@ static struct cxl_driver cxl_dax_region_driver = { .id = CXL_DEVICE_DAX_REGION, .drv = { .suppress_bind_attrs = true, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; -module_cxl_driver(cxl_dax_region_driver); +static void cxl_dax_region_driver_register(struct work_struct *work) +{ + cxl_driver_register(&cxl_dax_region_driver); +} + +static DECLARE_WORK(cxl_dax_region_driver_work, cxl_dax_region_driver_register); + +static int __init cxl_dax_region_init(void) +{ + /* + * Need to resolve a race with dax_hmem wanting to drive regions + * instead of CXL + */ + queue_work(system_long_wq, &cxl_dax_region_driver_work); + return 0; +} +module_init(cxl_dax_region_init); + +static void __exit cxl_dax_region_exit(void) +{ + flush_work(&cxl_dax_region_driver_work); + cxl_driver_unregister(&cxl_dax_region_driver); +} +module_exit(cxl_dax_region_exit); + MODULE_ALIAS_CXL(CXL_DEVICE_DAX_REGION); MODULE_DESCRIPTION("CXL DAX: direct access to CXL regions"); MODULE_LICENSE("GPL"); From 3a52fc67323ffaddb09955902d582f345b8fd561 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:57 +0000 Subject: [PATCH 17/38] NVIDIA: VR: SAUCE: dax: Track all dax_region allocations under a global resource tree Introduce a global "DAX Regions" resource root and register each dax_region->res under it via request_resource(). Release the resource on dax_region teardown. By enforcing a single global namespace for dax_region allocations, this ensures only one of dax_hmem or dax_cxl can successfully register a dax_region for a given range. Co-developed-by: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-6-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-6-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/bus.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index c94c09622516e..ba288662082d7 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -10,6 +10,7 @@ #include "dax-private.h" #include "bus.h" +static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions"); static DEFINE_MUTEX(dax_bus_lock); /* @@ -625,6 +626,8 @@ static void dax_region_unregister(void *region) { struct dax_region *dax_region = region; + scoped_guard(rwsem_write, &dax_region_rwsem) + release_resource(&dax_region->res); sysfs_remove_groups(&dax_region->dev->kobj, dax_region_attribute_groups); dax_region_put(dax_region); @@ -635,6 +638,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, unsigned long flags) { struct dax_region *dax_region; + int rc; /* * The DAX core assumes that it can store its private data in @@ -667,14 +671,27 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, .flags = IORESOURCE_MEM | flags, }; - if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - kfree(dax_region); - return NULL; + scoped_guard(rwsem_write, &dax_region_rwsem) + rc = request_resource(&dax_regions, &dax_region->res); + if (rc) { + dev_dbg(parent, "dax_region resource conflict for %pR\n", + &dax_region->res); + goto err_res; } + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) + goto err_sysfs; + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) return NULL; return dax_region; + +err_sysfs: + scoped_guard(rwsem_write, &dax_region_rwsem) + release_resource(&dax_region->res); +err_res: + kfree(dax_region); + return NULL; } EXPORT_SYMBOL_GPL(alloc_dax_region); From 3a6c6f56dc2c70ed379b9798ceeaf8af1a637b70 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:58 +0000 Subject: [PATCH 18/38] NVIDIA: VR: SAUCE: cxl/region: Add helper to check Soft Reserved containment by CXL regions Add a helper to determine whether a given Soft Reserved memory range is fully contained within the committed CXL region. This helper provides a primitive for policy decisions in subsequent patches such as co-ordination with dax_hmem to determine whether CXL has fully claimed ownership of Soft Reserved memory ranges. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-7-Smita.KoralahalliChannabasappa@amd.com (backported from https://lore.kernel.org/r/20260210064501.157591-7-Smita.KoralahalliChannabasappa@amd.com) [kobak: Added the Soft Reserved declaration to the existing Type2 include/cxl/cxl.h header instead of recreating that header.] Signed-off-by: Koba Ko --- drivers/cxl/core/region.c | 30 ++++++++++++++++++++++++++++++ include/cxl/cxl.h | 9 +++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 37dfb3965fbb3..78ec23b21f901 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -4227,6 +4228,35 @@ static int cxl_region_setup_poison(struct cxl_region *cxlr) return devm_add_action_or_reset(dev, remove_debugfs, dentry); } +static int region_contains_soft_reserve(struct device *dev, void *data) +{ + struct resource *res = data; + struct cxl_region *cxlr; + struct cxl_region_params *p; + + if (!is_cxl_region(dev)) + return 0; + + cxlr = to_cxl_region(dev); + p = &cxlr->params; + + if (p->state != CXL_CONFIG_COMMIT) + return 0; + + if (!p->res) + return 0; + + return resource_contains(p->res, res) ? 1 : 0; +} + +bool cxl_region_contains_soft_reserve(struct resource *res) +{ + guard(rwsem_read)(&cxl_rwsem.region); + return bus_for_each_dev(&cxl_bus_type, NULL, res, + region_contains_soft_reserve) != 0; +} +EXPORT_SYMBOL_GPL(cxl_region_contains_soft_reserve); + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 22d9435b351fb..3fbd9eac137ea 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -248,4 +248,13 @@ struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); struct cxl_region; int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); + +#ifdef CONFIG_CXL_REGION +bool cxl_region_contains_soft_reserve(struct resource *res); +#else +static inline bool cxl_region_contains_soft_reserve(struct resource *res) +{ + return false; +} +#endif #endif /* __CXL_CXL_H__ */ From 2e433461671f3540db2fe1c03a9f2d10fe3b9f71 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:44:59 +0000 Subject: [PATCH 19/38] NVIDIA: VR: SAUCE: dax: Add deferred-work helpers for dax_hmem and dax_cxl coordination Add helpers to register, queue and flush the deferred work. These helpers allow dax_hmem to execute ownership resolution outside the probe context before dax_cxl binds. Signed-off-by: Smita Koralahalli Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-8-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-8-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/bus.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/dax/bus.h | 7 ++++++ 2 files changed, 65 insertions(+) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index ba288662082d7..523953959a16a 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -25,6 +25,64 @@ DECLARE_RWSEM(dax_region_rwsem); */ DECLARE_RWSEM(dax_dev_rwsem); +static DEFINE_MUTEX(dax_hmem_lock); +static dax_hmem_deferred_fn hmem_deferred_fn; +static void *dax_hmem_data; + +static void hmem_deferred_work(struct work_struct *work) +{ + dax_hmem_deferred_fn fn; + void *data; + + scoped_guard(mutex, &dax_hmem_lock) { + fn = hmem_deferred_fn; + data = dax_hmem_data; + } + + if (fn) + fn(data); +} + +static DECLARE_WORK(dax_hmem_work, hmem_deferred_work); + +int dax_hmem_register_work(dax_hmem_deferred_fn fn, void *data) +{ + guard(mutex)(&dax_hmem_lock); + + if (hmem_deferred_fn) + return -EINVAL; + + hmem_deferred_fn = fn; + dax_hmem_data = data; + return 0; +} +EXPORT_SYMBOL_GPL(dax_hmem_register_work); + +int dax_hmem_unregister_work(dax_hmem_deferred_fn fn, void *data) +{ + guard(mutex)(&dax_hmem_lock); + + if (hmem_deferred_fn != fn || dax_hmem_data != data) + return -EINVAL; + + hmem_deferred_fn = NULL; + dax_hmem_data = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(dax_hmem_unregister_work); + +void dax_hmem_queue_work(void) +{ + queue_work(system_long_wq, &dax_hmem_work); +} +EXPORT_SYMBOL_GPL(dax_hmem_queue_work); + +void dax_hmem_flush_work(void) +{ + flush_work(&dax_hmem_work); +} +EXPORT_SYMBOL_GPL(dax_hmem_flush_work); + #define DAX_NAME_LEN 30 struct dax_id { struct list_head list; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index cbbf64443098c..b58a88e8089c0 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -41,6 +41,13 @@ struct dax_device_driver { void (*remove)(struct dev_dax *dev); }; +typedef void (*dax_hmem_deferred_fn)(void *data); + +int dax_hmem_register_work(dax_hmem_deferred_fn fn, void *data); +int dax_hmem_unregister_work(dax_hmem_deferred_fn fn, void *data); +void dax_hmem_queue_work(void); +void dax_hmem_flush_work(void); + int __dax_driver_register(struct dax_device_driver *dax_drv, struct module *module, const char *mod_name); #define dax_driver_register(driver) \ From c0db8c8c8fda321633d2ba78a991b61d079b2df5 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:45:00 +0000 Subject: [PATCH 20/38] NVIDIA: VR: SAUCE: dax/hmem, cxl: Defer and resolve ownership of Soft Reserved memory ranges The current probe time ownership check for Soft Reserved memory based solely on CXL window intersection is insufficient. dax_hmem probing is not always guaranteed to run after CXL enumeration and region assembly, which can lead to incorrect ownership decisions before the CXL stack has finished publishing windows and assembling committed regions. Introduce deferred ownership handling for Soft Reserved ranges that intersect CXL windows. When such a range is encountered during dax_hmem probe, schedule deferred work and wait for the CXL stack to complete enumeration and region assembly before deciding ownership. Evaluate ownership of Soft Reserved ranges based on CXL region containment. - If all Soft Reserved ranges are fully contained within committed CXL regions, DROP handling Soft Reserved ranges from dax_hmem and allow dax_cxl to bind. - If any Soft Reserved range is not fully claimed by committed CXL region, REGISTER the Soft Reserved ranges with dax_hmem. Use dax_cxl_mode to coordinate ownership decisions for Soft Reserved ranges. Once, ownership resolution is complete, flush the deferred work from dax_cxl before allowing dax_cxl to bind. This enforces a strict ownership. Either CXL fully claims the Soft reserved ranges or it relinquishes it entirely. Co-developed-by: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Smita Koralahalli Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-9-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-9-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/bus.c | 3 ++ drivers/dax/bus.h | 19 ++++++++++ drivers/dax/cxl.c | 1 + drivers/dax/hmem/hmem.c | 78 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 99 insertions(+), 2 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 523953959a16a..94c9d947a8a39 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -25,6 +25,9 @@ DECLARE_RWSEM(dax_region_rwsem); */ DECLARE_RWSEM(dax_dev_rwsem); +enum dax_cxl_mode dax_cxl_mode = DAX_CXL_MODE_DEFER; +EXPORT_SYMBOL_NS_GPL(dax_cxl_mode, "CXL"); + static DEFINE_MUTEX(dax_hmem_lock); static dax_hmem_deferred_fn hmem_deferred_fn; static void *dax_hmem_data; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index b58a88e8089c0..82616ff52fd14 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -41,6 +41,25 @@ struct dax_device_driver { void (*remove)(struct dev_dax *dev); }; +/* + * enum dax_cxl_mode - State machine to determine ownership for CXL + * tagged Soft Reserved memory ranges. + * @DAX_CXL_MODE_DEFER: Ownership resolution pending. Set while waiting + * for CXL enumeration and region assembly to complete. + * @DAX_CXL_MODE_REGISTER: CXL regions do not fully cover Soft Reserved + * ranges. Fall back to registering those ranges via dax_hmem. + * @DAX_CXL_MODE_DROP: All Soft Reserved ranges intersecting CXL windows + * are fully contained within committed CXL regions. Drop HMEM handling + * and allow dax_cxl to bind. + */ +enum dax_cxl_mode { + DAX_CXL_MODE_DEFER, + DAX_CXL_MODE_REGISTER, + DAX_CXL_MODE_DROP, +}; + +extern enum dax_cxl_mode dax_cxl_mode; + typedef void (*dax_hmem_deferred_fn)(void *data); int dax_hmem_register_work(dax_hmem_deferred_fn fn, void *data); diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index a2136adfa186e..3ab39b77843d5 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -44,6 +44,7 @@ static struct cxl_driver cxl_dax_region_driver = { static void cxl_dax_region_driver_register(struct work_struct *work) { + dax_hmem_flush_work(); cxl_driver_register(&cxl_dax_region_driver); } diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 1e34243584905..85854e25254b2 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "../bus.h" static bool region_idle; @@ -69,8 +70,18 @@ static int hmem_register_device(struct device *host, int target_nid, if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { - dev_dbg(host, "deferring range to CXL: %pr\n", res); - return 0; + switch (dax_cxl_mode) { + case DAX_CXL_MODE_DEFER: + dev_dbg(host, "deferring range to CXL: %pr\n", res); + dax_hmem_queue_work(); + return 0; + case DAX_CXL_MODE_REGISTER: + dev_dbg(host, "registering CXL range: %pr\n", res); + break; + case DAX_CXL_MODE_DROP: + dev_dbg(host, "dropping CXL range: %pr\n", res); + return 0; + } } rc = region_intersects_soft_reserve(res->start, resource_size(res)); @@ -123,8 +134,70 @@ static int hmem_register_device(struct device *host, int target_nid, return rc; } +static int hmem_register_cxl_device(struct device *host, int target_nid, + const struct resource *res) +{ + if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) + return hmem_register_device(host, target_nid, res); + + return 0; +} + +static int soft_reserve_has_cxl_match(struct device *host, int target_nid, + const struct resource *res) +{ + if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) { + if (!cxl_region_contains_soft_reserve((struct resource *)res)) + return 1; + } + + return 0; +} + +static void process_defer_work(void *data) +{ + struct platform_device *pdev = data; + int rc; + + /* relies on cxl_acpi and cxl_pci having had a chance to load */ + wait_for_device_probe(); + + rc = walk_hmem_resources(&pdev->dev, soft_reserve_has_cxl_match); + + if (!rc) { + dax_cxl_mode = DAX_CXL_MODE_DROP; + dev_dbg(&pdev->dev, "All Soft Reserved ranges claimed by CXL\n"); + } else { + dax_cxl_mode = DAX_CXL_MODE_REGISTER; + dev_warn(&pdev->dev, + "Soft Reserved not fully contained in CXL; using HMEM\n"); + } + + walk_hmem_resources(&pdev->dev, hmem_register_cxl_device); +} + +static void kill_defer_work(void *data) +{ + struct platform_device *pdev = data; + + dax_hmem_flush_work(); + dax_hmem_unregister_work(process_defer_work, pdev); +} + static int dax_hmem_platform_probe(struct platform_device *pdev) { + int rc; + + rc = dax_hmem_register_work(process_defer_work, pdev); + if (rc) + return rc; + + rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, pdev); + if (rc) + return rc; + return walk_hmem_resources(&pdev->dev, hmem_register_device); } @@ -174,3 +247,4 @@ MODULE_ALIAS("platform:hmem_platform*"); MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); +MODULE_IMPORT_NS("CXL"); From 8cb4feab45afa9d9c1b4df4b92882394cafd11bc Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 10 Feb 2026 06:45:01 +0000 Subject: [PATCH 21/38] NVIDIA: VR: SAUCE: dax/hmem: Reintroduce Soft Reserved ranges back into the iomem tree Reworked from a patch by Alison Schofield Reintroduce Soft Reserved range into the iomem_resource tree for HMEM to consume. This restores visibility in /proc/iomem for ranges actively in use, while avoiding the early-boot conflicts that occurred when Soft Reserved was published into iomem before CXL window and region discovery. Link: https://lore.kernel.org/linux-cxl/29312c0765224ae76862d59a17748c8188fb95f1.1692638817.git.alison.schofield@intel.com/ Co-developed-by: Alison Schofield Signed-off-by: Alison Schofield Co-developed-by: Zhijian Li Signed-off-by: Zhijian Li Signed-off-by: Smita Koralahalli Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Tested-by: Tomasz Wolski Link: https://lore.kernel.org/r/20260210064501.157591-10-Smita.KoralahalliChannabasappa@amd.com (cherry picked from https://lore.kernel.org/r/20260210064501.157591-10-Smita.KoralahalliChannabasappa@amd.com) Signed-off-by: Koba Ko --- drivers/dax/hmem/hmem.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 85854e25254b2..c07bf5fe833dc 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -59,6 +59,34 @@ static void release_hmem(void *pdev) platform_device_unregister(pdev); } +static void remove_soft_reserved(void *r) +{ + remove_resource(r); + kfree(r); +} + +static int add_soft_reserve_into_iomem(struct device *host, + const struct resource *res) +{ + int rc; + + struct resource *soft __free(kfree) = + kmalloc(sizeof(*res), GFP_KERNEL); + if (!soft) + return -ENOMEM; + + *soft = DEFINE_RES_NAMED_DESC(res->start, (res->end - res->start + 1), + "Soft Reserved", IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + + rc = insert_resource(&iomem_resource, soft); + if (rc) + return rc; + + return devm_add_action_or_reset(host, remove_soft_reserved, + no_free_ptr(soft)); +} + static int hmem_register_device(struct device *host, int target_nid, const struct resource *res) { @@ -88,7 +116,9 @@ static int hmem_register_device(struct device *host, int target_nid, if (rc != REGION_INTERSECTS) return 0; - /* TODO: Add Soft-Reserved memory back to iomem */ + rc = add_soft_reserve_into_iomem(host, res); + if (rc) + return rc; id = memregion_alloc(GFP_KERNEL); if (id < 0) { From 7a6aebb181779801a3604eb219c653cb1fea0033 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 28 Oct 2025 10:47:53 +0100 Subject: [PATCH 22/38] NVIDIA: VR: SAUCE: cxl/region: Support multi-level interleaving with smaller granularities for lower levels The CXL specification supports multi-level interleaving "as long as all the levels use different, but consecutive, HPA bits to select the target and no Interleave Set has more than 8 devices" (from 3.2). Currently the kernel expects that a decoder's "interleave granularity is a multiple of @parent_port granularity". That is, the granularity of a lower level is bigger than those of the parent and uses the outer HPA bits as selector. It works e.g. for the following 8-way config: * cross-link (cross-hostbridge config in CFMWS): * 4-way * 256 granularity * Selector: HPA[8:9] * sub-link (CXL Host bridge config of the HDM): * 2-way * 1024 granularity * Selector: HPA[10] Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way config could look like this: * cross-link (cross-hostbridge config in CFMWS): * 4-way * 512 granularity * Selector: HPA[9:10] * sub-link (CXL Host bridge config of the HDM): * 2-way * 256 granularity * Selector: HPA[8] The enumeration of decoders for this configuration fails then with following error: cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200] cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff cxl_port endpoint12: failed to attach decoder12.0 to region0: -6 Note that this happens only if firmware is setting up the decoders (CXL_REGION_F_AUTO). For userspace region assembly the granularities are chosen to increase from root down to the lower levels. That is, outer HPA bits are always used for lower interleaving levels. Rework the implementation to also support multi-level interleaving with smaller granularities for lower levels. Determine the interleave set of autodetected decoders. Check that it is a subset of the root interleave. The HPA selector bits are extracted for all decoders of the set and checked that there is no overlap and bits are consecutive. All decoders can be programmed now to use any bit range within the region's target selector. Signed-off-by: Robert Richter (backported from https://lore.kernel.org/all/20251028094754.72816-1-rrichter@amd.com/) [kobak: resolved conflicts with cxlr->cxlrd and spa_maps_hpa()] Signed-off-by: Koba Ko --- drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 93 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 78ec23b21f901..455a2c090589b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1376,57 +1376,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) return 0; } +static inline u64 get_selector(u64 ways, u64 gran) +{ + if (!is_power_of_2(ways)) + ways /= 3; + + if (!is_power_of_2(ways) || !is_power_of_2(gran)) + return 0; + + return (ways - 1) * gran; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = cxlr->cxlrd; - int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_ep *ep = cxl_ep_load(port, cxlmd); struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; - struct cxl_switch_decoder *cxlsd; + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev); struct cxl_port *iter = port; - u16 eig, peig; - u8 eiw, peiw; + int ig, iw = cxl_rr->nr_targets, rc, pos = cxled->pos; + int distance, parent_distance; + u64 selector, cxlr_sel; + u16 eig; + u8 eiw; /* * While root level decoders support x3, x6, x12, switch level * decoders only support powers of 2 up to x16. */ - if (!is_power_of_2(cxl_rr->nr_targets)) { + if (!is_power_of_2(iw)) { dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - cxl_rr->nr_targets); + dev_name(port->uport_dev), dev_name(&port->dev), iw); return -EINVAL; } - cxlsd = to_cxl_switch_decoder(&cxld->dev); - if (cxl_rr->nr_targets_set) { - int i, distance = 1; - struct cxl_region_ref *cxl_rr_iter; + if (iw > 8 || iw > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s:%s: ways: %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), iw, cxlsd->nr_targets); + return -ENXIO; + } - /* - * The "distance" between peer downstream ports represents which - * endpoint positions in the region interleave a given port can - * host. - * - * For example, at the root of a hierarchy the distance is - * always 1 as every index targets a different host-bridge. At - * each subsequent switch level those ports map every Nth region - * position where N is the width of the switch == distance. - */ - do { - cxl_rr_iter = cxl_rr_load(iter, cxlr); - distance *= cxl_rr_iter->nr_targets; - iter = to_cxl_port(iter->dev.parent); - } while (!is_cxl_root(iter)); - distance *= cxlrd->cxlsd.cxld.interleave_ways; + /* + * Calculate the effective granularity and ways to determine + * HPA bits used as target selectors of the interleave set. + * Use this to check if the root decoder and all subsequent + * HDM decoders only use bits from that range as selectors. + * + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. + */ + + /* Start with the root decoders selector and distance. */ + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways, + cxlrd->cxlsd.cxld.interleave_granularity); + distance = cxlrd->cxlsd.cxld.interleave_ways; + if (!is_power_of_2(distance)) + distance /= 3; + + for (iter = parent_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr); + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder; + u64 cxld_sel; + + if (cxld_iter->interleave_ways == 1) + continue; + + cxld_sel = get_selector(cxld_iter->interleave_ways, + cxld_iter->interleave_granularity); + + if (cxld_sel & selector) { + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxld_sel, selector); + return -ENXIO; + } - for (i = 0; i < cxl_rr->nr_targets_set; i++) + selector |= cxld_sel; + distance *= cxl_rr_iter->nr_targets; + } + + parent_distance = distance; + distance *= iw; + + /* The combined selector bits must fit the region selector. */ + cxlr_sel = get_selector(p->interleave_ways, + p->interleave_granularity); + + if ((cxlr_sel & selector) != selector) { + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxlr_sel, selector); + return -ENXIO; + } + + /* Calculate remaining selector bits available for use. */ + selector = cxlr_sel & ~selector; + + if (cxl_rr->nr_targets_set) { + for (int i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, distance); @@ -1437,87 +1499,40 @@ static int cxl_port_setup_targets(struct cxl_port *port, goto add_target; } - if (is_cxl_root(parent_port)) { + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + ig = cxld->interleave_granularity; + else /* + * Set the interleave granularity with each interleave + * level to a multiple of it's parent port interleave + * ways. Beginning with the granularity of the root + * decoder set to the region granularity (starting + * with the inner selector bits of the HPA), the + * granularity is increased with each level. Calculate + * this using the parent distance and region + * granularity. + * * Root decoder IG is always set to value in CFMWS which * may be different than this region's IG. We can use the * region's IG here since interleave_granularity_store() * does not allow interleaved host-bridges with * root IG != region IG. */ - parent_ig = p->interleave_granularity; - parent_iw = cxlrd->cxlsd.cxld.interleave_ways; - /* - * For purposes of address bit routing, use power-of-2 math for - * switch ports. - */ - if (!is_power_of_2(parent_iw)) - parent_iw /= 3; - } else { - struct cxl_region_ref *parent_rr; - struct cxl_decoder *parent_cxld; - - parent_rr = cxl_rr_load(parent_port, cxlr); - parent_cxld = parent_rr->decoder; - parent_ig = parent_cxld->interleave_granularity; - parent_iw = parent_cxld->interleave_ways; - } - - rc = granularity_to_eig(parent_ig, &peig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_ig); - return rc; - } - - rc = ways_to_eiw(parent_iw, &peiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_iw); - return rc; - } + ig = p->interleave_granularity * parent_distance; - iw = cxl_rr->nr_targets; rc = ways_to_eiw(iw, &eiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), iw); - return rc; - } - - /* - * Interleave granularity is a multiple of @parent_port granularity. - * Multiplier is the parent port interleave ways. - */ - rc = granularity_to_eig(parent_ig * parent_iw, &eig); - if (rc) { - dev_dbg(&cxlr->dev, - "%s: invalid granularity calculation (%d * %d)\n", - dev_name(&parent_port->dev), parent_ig, parent_iw); - return rc; - } - - rc = eig_to_granularity(eig, &ig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - 256 << eig); - return rc; - } + if (!rc) + rc = granularity_to_eig(ig, &eig); - if (iw > 8 || iw > cxlsd->nr_targets) { - dev_dbg(&cxlr->dev, - "%s:%s:%s: ways: %d overflows targets: %d\n", + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n", dev_name(port->uport_dev), dev_name(&port->dev), - dev_name(&cxld->dev), iw, cxlsd->nr_targets); + iw, ig, selector); return -ENXIO; } if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || - (iw > 1 && cxld->interleave_granularity != ig) || !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, From a2eba9dce2b6acc3d4074985d14b09398b5b106a Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Thu, 14 May 2026 01:04:53 +0800 Subject: [PATCH 23/38] NVIDIA: VR: SAUCE: [Config] CXL config annotations for Type-2 device and RAS support BugLink: https://bugs.launchpad.net/bugs/2143032 Source: https://github.com/NVIDIA/NV-Kernels/commit/f80636d27f65bb38e54c3601e7efbdde16ed0b40 Add Ubuntu kernel config annotations for CXL-related configs introduced or changed by the CXL Type-2, RAS, and autodiscovered-region support backports. CONFIG_CXL_BUS, CONFIG_CXL_PCI, CONFIG_CXL_MEM, and CONFIG_CXL_PORT are built in for Type-2 device support. CONFIG_CXL_RAS and the EINJ symbols cover CXL RAS/error-injection support. CONFIG_SFC_CXL remains disabled for NVIDIA platforms. Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit f80636d27f65bb38e54c3601e7efbdde16ed0b40 nv-kernels/24.04_linux-nvidia-6.17-next) [kobak: Backported annotation overrides from debian.nvidia-6.17 to debian.nvidia-bos; PCIEAER_CXL is overridden as removed instead of editing debian.master.] Signed-off-by: Koba Ko --- debian.nvidia-bos/config/annotations | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/debian.nvidia-bos/config/annotations b/debian.nvidia-bos/config/annotations index 392f5a93126da..c6338b139f0fd 100644 --- a/debian.nvidia-bos/config/annotations +++ b/debian.nvidia-bos/config/annotations @@ -6,6 +6,15 @@ include "../../debian.master/config/annotations" +CONFIG_ACPI_APEI_EINJ policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ACPI_APEI_EINJ note<'Required for CONFIG_ACPI_APEI_EINJ_CXL'> + +CONFIG_ACPI_APEI_EINJ_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ACPI_APEI_EINJ_CXL note<'CXL protocol error injection support via APEI EINJ'> + +CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION note<'Override debian.master amd64-only; arm64 selects this via arch/arm64/Kconfig since 4d873c5dc3ed'> + CONFIG_ARM64_ERRATUM_1902691 policy<{'arm64': 'y'}> CONFIG_ARM64_ERRATUM_1902691 note<'Required for Grace enablement'> @@ -36,6 +45,9 @@ CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE note<'Required for Grace enable CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE policy<{'arm64': 'y'}> CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE note<'Required for Grace enablement'> +CONFIG_CACHEMAINT_FOR_HOTPLUG policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_CACHEMAINT_FOR_HOTPLUG note<'Optional HiSilicon HHA cache maintenance driver; depends on GENERIC_CPU_CACHE_MAINTENANCE; not needed for NVIDIA platforms'> + CONFIG_ARM_FFA_TRANSPORT policy<{'arm64': 'y'}> CONFIG_ARM_FFA_TRANSPORT note<'LP: #2111511'> @@ -111,9 +123,24 @@ CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE note<'LP: #2028576: Perf governo CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL note<'LP: #2028576: Perf governor required for NVIDIA workloads'> +CONFIG_CXL_BUS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_BUS note<'Enable CXL bus support built-in; required for CXL Type-2 device and RAS support'> + +CONFIG_CXL_MEM policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_MEM note<'Auto-selected by CXL_PCI; required for CXL memory expansion and Type-2 device support'> + CONFIG_CXL_MEM_RAW_COMMANDS policy<{'amd64': 'n', 'arm64': 'y'}> CONFIG_CXL_MEM_RAW_COMMANDS note<'Enable CXL raw commands for memory devices'> +CONFIG_CXL_PCI policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_PCI note<'Enable CXL PCI management built-in; auto-selects CXL_MEM; required for CXL Type-2 device support'> + +CONFIG_CXL_PORT policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_PORT note<'Required for CXL port enumeration; defaults to CXL_BUS value'> + +CONFIG_CXL_RAS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_RAS note<'New def_bool replacing PCIEAER_CXL; auto-enabled with ACPI_APEI_GHES+PCIEAER+CXL_BUS; CXL RAS error handling support'> + CONFIG_DRM_NOUVEAU policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_DRM_NOUVEAU note<'Disable nouveau for NVIDIA kernels'> @@ -135,6 +162,12 @@ CONFIG_EFI_CAPSULE_LOADER note<'LP: #2067111'> CONFIG_ETM4X_IMPDEF_FEATURE policy<{'arm64': 'n'}> CONFIG_ETM4X_IMPDEF_FEATURE note<'Required for Grace enablement'> +CONFIG_FWCTL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_FWCTL note<'Selected by CXL_BUS when CXL_FEATURES is enabled; required for CXL feature mailbox access'> + +CONFIG_GENERIC_CPU_CACHE_MAINTENANCE policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_GENERIC_CPU_CACHE_MAINTENANCE note<'Selected by arm64 via arch/arm64/Kconfig since 4d873c5dc3ed; not selected by x86'> + CONFIG_GPIO_AAEON policy<{'amd64': '-'}> CONFIG_GPIO_AAEON note<'Disable all Ubuntu ODM drivers'> @@ -180,6 +213,9 @@ CONFIG_NOUVEAU_PLATFORM_DRIVER note<'Disable nouveau for NVIDIA CONFIG_NR_CPUS policy<{'amd64': '8192', 'arm64': '512'}> CONFIG_NR_CPUS note<'LP: #1864198'> +CONFIG_PCIEAER_CXL policy<{'amd64': '-', 'arm64': '-'}> +CONFIG_PCIEAER_CXL note<'Removed by commit d18f1b7beadf (PCI/AER: Replace PCIEAER_CXL symbol with CXL_RAS)'> + CONFIG_NVGRACE_EGM policy<{'arm64': 'm'}> CONFIG_NVGRACE_EGM note<'LP: #2119656'> @@ -207,6 +243,9 @@ CONFIG_SAMPLE_CORESIGHT_SYSCFG note<'Required for Grace enablem CONFIG_SENSORS_AAEON policy<{'amd64': '-'}> CONFIG_SENSORS_AAEON note<'Disable all Ubuntu ODM drivers'> +CONFIG_SFC_CXL policy<{'amd64': 'n', 'arm64': 'n'}> +CONFIG_SFC_CXL note<'Solarflare SFC9100-family CXL Type-2 device support; not needed for NVIDIA platforms'> + CONFIG_SPI_TEGRA210_QUAD policy<{'arm64': 'y'}> CONFIG_SPI_TEGRA210_QUAD note<'Ensures the TPM is available before the IMA driver initializes'> From e3aadf637883225f4d4978060cb30f9b5c44c83c Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Thu, 14 May 2026 01:05:24 +0800 Subject: [PATCH 24/38] NVIDIA: VR: SAUCE: [Config] Enable CXL DAX and KMEM built-in for CXL memory access BugLink: https://bugs.launchpad.net/bugs/2143032 Source: https://github.com/NVIDIA/NV-Kernels/commit/c5c11cf704ac7c320de7b53e26189c9d10d6b18f Override debian.master policy for DEV_DAX, DEV_DAX_CXL, and DEV_DAX_KMEM so CXL memory regions are available as raw DAX devices and as hotplugged System-RAM without relying on module load ordering. Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit c5c11cf704ac7c320de7b53e26189c9d10d6b18f nv-kernels/24.04_linux-nvidia-6.17-next) [kobak: Backported annotation overrides from debian.nvidia-6.17 to debian.nvidia-bos.] Signed-off-by: Koba Ko --- debian.nvidia-bos/config/annotations | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/debian.nvidia-bos/config/annotations b/debian.nvidia-bos/config/annotations index c6338b139f0fd..0ad2528908f19 100644 --- a/debian.nvidia-bos/config/annotations +++ b/debian.nvidia-bos/config/annotations @@ -264,6 +264,15 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'Disable all Ubuntu ODM dri CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'Required for Grace enablement'> +CONFIG_DEV_DAX policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX note<'Override debian.master m->y; required built-in for DEV_DAX_CXL=y'> + +CONFIG_DEV_DAX_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX_CXL note<'Override debian.master m->y; CXL RAM region DAX access; depends on CXL_BUS+CXL_REGION+DEV_DAX'> + +CONFIG_DEV_DAX_KMEM policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX_KMEM note<'Override debian.master m->y; map CXL DAX devices as System-RAM'> + CONFIG_VFIO_CONTAINER policy<{'amd64': 'y', 'arm64': 'n'}> CONFIG_VFIO_CONTAINER note<'LP: #2095028'> From 5268ad904265b50dc5fed7602486d53dd8ea28d2 Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Thu, 14 May 2026 01:05:40 +0800 Subject: [PATCH 25/38] NVIDIA: VR: SAUCE: [Config] Add PCI_CXL annotation for CXL state save/restore BugLink: https://bugs.launchpad.net/bugs/2143032 Source: https://github.com/NVIDIA/NV-Kernels/commit/a5544cbc1ab9ec25eff39cbcb67eb67133a261a7 Add Ubuntu kernel config annotation for CONFIG_PCI_CXL introduced by the CXL DVSEC and HDM state save/restore series. CONFIG_PCI_CXL is a hidden bool auto-enabled when CXL_BUS=y. It gates compilation of drivers/pci/cxl.o, which saves and restores CXL DVSEC control/range registers and HDM decoder state across PCI resets and link transitions. Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit a5544cbc1ab9ec25eff39cbcb67eb67133a261a7 nv-kernels/24.04_linux-nvidia-6.17-next) [kobak: Backported annotation override from debian.nvidia-6.17 to debian.nvidia-bos.] Signed-off-by: Koba Ko --- debian.nvidia-bos/config/annotations | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debian.nvidia-bos/config/annotations b/debian.nvidia-bos/config/annotations index 0ad2528908f19..854c543bb2a1c 100644 --- a/debian.nvidia-bos/config/annotations +++ b/debian.nvidia-bos/config/annotations @@ -273,6 +273,9 @@ CONFIG_DEV_DAX_CXL note<'Override debian.master m-> CONFIG_DEV_DAX_KMEM policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_DEV_DAX_KMEM note<'Override debian.master m->y; map CXL DAX devices as System-RAM'> +CONFIG_PCI_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_PCI_CXL note<'Hidden bool; auto-enabled by CXL_BUS; PCI core CXL DVSEC and HDM state save/restore support'> + CONFIG_VFIO_CONTAINER policy<{'amd64': 'y', 'arm64': 'n'}> CONFIG_VFIO_CONTAINER note<'LP: #2095028'> From e90b1ee9e1bf9f4ea0e51a10a9ed830e843ceb17 Mon Sep 17 00:00:00 2001 From: Koba Ko Date: Tue, 19 May 2026 02:00:36 +0800 Subject: [PATCH 26/38] NVIDIA: SAUCE: Revert "NVIDIA: VR: SAUCE: cxl: add support for cxl reset" This reverts commit 4f089d910e1cfc4e1983533ab4aa8f6a1b4b296c. Drop the older monolithic CXL reset method before applying the refreshed save/restore and reset series. Signed-off-by: Koba Ko --- drivers/pci/pci.c | 146 ---------------------------------- include/linux/pci.h | 2 +- include/uapi/linux/pci_regs.h | 14 ---- 3 files changed, 1 insertion(+), 161 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a964f12cc7ffd..8479c2e1f74f1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4982,151 +4982,6 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) return rc; } -static int cxl_reset_prepare(struct pci_dev *dev, u16 dvsec) -{ - u32 timeout_us = 100, timeout_tot_us = 10000; - u16 reg, cap; - int rc; - - if (!pci_wait_for_pending_transaction(dev)) - pci_err(dev, "timed out waiting for pending transaction; performing cxl reset anyway\n"); - - /* Check if the device is cache capable. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, &cap); - if (rc) - return rc; - - if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE)) - return 0; - - /* Disable cache. WB and invalidate cache if capability is advertised */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, ®); - if (rc) - return rc; - reg |= PCI_DVSEC_CXL_DISABLE_CACHING; - /* - * DEVCTL2 bits are written only once. So check WB+I capability while - * keeping disable caching set. - */ - if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) - reg |= PCI_DVSEC_CXL_INIT_CACHE_WBI; - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - - /* - * From Section 9.6: "Software may leverage the cache size reported in - * the DVSEC CXL Capability2 register to compute a suitable timeout - * value". - * Given there is no conversion factor for cache size -> timeout, - * setting timer for default 10ms. - */ - do { - if (timeout_tot_us == 0) - return -ETIMEDOUT; - usleep_range(timeout_us, timeout_us + 1); - timeout_tot_us -= timeout_us; - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, - ®); - if (rc) - return rc; - } while (!(reg & PCI_DVSEC_CXL_CACHE_INV)); - - return 0; -} - -static int cxl_reset_init(struct pci_dev *dev, u16 dvsec) -{ - /* - * Timeout values ref CXL Spec v3.2 Ch 8 Control and Status Registers, - * under section 8.1.3.1 DVSEC CXL Capability. - */ - u32 reset_timeouts_ms[] = { 10, 100, 1000, 10000, 100000 }; - u16 reg; - u32 timeout_ms; - int rc, ind; - - /* Check if CXL Reset MEM CLR is supported. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, ®); - if (rc) - return rc; - - if (reg & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, - ®); - if (rc) - return rc; - - reg |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - } - - /* Read timeout value. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, ®); - if (rc) - return rc; - ind = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, reg); - timeout_ms = reset_timeouts_ms[ind]; - - /* Write reset config. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, ®); - if (rc) - return rc; - - reg |= PCI_DVSEC_CXL_INIT_CXL_RST; - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - - /* Wait till timeout and then check reset status is complete. */ - msleep(timeout_ms); - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_STATUS2, ®); - if (rc) - return rc; - if (reg & PCI_DVSEC_CXL_RST_ERR || - ~reg & PCI_DVSEC_CXL_RST_DONE) - return -ETIMEDOUT; - - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, ®); - if (rc) - return rc; - reg &= (~PCI_DVSEC_CXL_DISABLE_CACHING); - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - - return 0; -} - -/** - * cxl_reset - initiate a cxl reset - * @dev: device to reset - * @probe: if true, return 0 if device can be reset this way - * - * Initiate a cxl reset on @dev. - */ -static int cxl_reset(struct pci_dev *dev, bool probe) -{ - u16 dvsec, reg; - int rc; - - dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, - PCI_DVSEC_CXL_DEVICE); - if (!dvsec) - return -ENOTTY; - - /* Check if CXL Reset is supported. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, ®); - if (rc) - return -ENOTTY; - - if ((reg & PCI_DVSEC_CXL_RST_CAPABLE) == 0) - return -ENOTTY; - - if (probe) - return 0; - - rc = cxl_reset_prepare(dev, dvsec); - if (rc) - return rc; - - return cxl_reset_init(dev, dvsec); -} - void pci_dev_lock(struct pci_dev *dev) { /* block PM suspend, driver probe, etc. */ @@ -5214,7 +5069,6 @@ const struct pci_reset_fn_method pci_reset_fn_methods[] = { { pci_dev_acpi_reset, .name = "acpi" }, { pcie_reset_flr, .name = "flr" }, { pci_af_flr, .name = "af_flr" }, - { cxl_reset, .name = "cxl_reset" }, { pci_pm_reset, .name = "pm" }, { pci_reset_bus_function, .name = "bus" }, { cxl_reset_bus_function, .name = "cxl_bus" }, diff --git a/include/linux/pci.h b/include/linux/pci.h index 634239799686b..61066bcbad09c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -51,7 +51,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 9 +#define PCI_NUM_RESET_METHODS 8 #define PCI_RESET_PROBE true #define PCI_RESET_DO_RESET false diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 543275ff9ed62..718fb630f5bb7 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1349,24 +1349,10 @@ /* CXL r4.0, 8.1.3: PCIe DVSEC for CXL Device */ #define PCI_DVSEC_CXL_DEVICE 0 #define PCI_DVSEC_CXL_CAP 0xA -#define PCI_DVSEC_CXL_CACHE_CAPABLE _BITUL(0) #define PCI_DVSEC_CXL_MEM_CAPABLE _BITUL(2) #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) -#define PCI_DVSEC_CXL_CACHE_WBI_CAPABLE _BITUL(6) -#define PCI_DVSEC_CXL_RST_CAPABLE _BITUL(7) -#define PCI_DVSEC_CXL_RST_TIMEOUT __GENMASK(10, 8) -#define PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE _BITUL(11) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) -#define PCI_DVSEC_CXL_CTRL2 0x10 -#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) -#define PCI_DVSEC_CXL_INIT_CACHE_WBI _BITUL(1) -#define PCI_DVSEC_CXL_INIT_CXL_RST _BITUL(2) -#define PCI_DVSEC_CXL_RST_MEM_CLR_EN _BITUL(3) -#define PCI_DVSEC_CXL_STATUS2 0x12 -#define PCI_DVSEC_CXL_CACHE_INV _BITUL(0) -#define PCI_DVSEC_CXL_RST_DONE _BITUL(1) -#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) From 07705ee1b952c9516cafef947ea8ce380504c611 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:15 +0000 Subject: [PATCH 27/38] NVIDIA: VR: SAUCE: PCI: Add CXL DVSEC control, lock, and range register definitions BugLink: https://bugs.launchpad.net/bugs/2143032 PCI: Add CXL DVSEC control, lock, and range register definitions Add register offset and field definitions for CXL DVSEC registers needed by CXL state save/restore across resets: - CTRL2 (offset 0x10) and LOCK (offset 0x14) registers - CONFIG_LOCK bit in the LOCK register - RWL (read-write-when-locked) field masks for CTRL and range base registers. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit 07ad5f1ef09ec2c6dd872bd61e533cc2da8ed32b nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- include/uapi/linux/pci_regs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 718fb630f5bb7..148328bd4cb37 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1353,6 +1353,10 @@ #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) +#define PCI_DVSEC_CXL_CTRL_RWL 0x5FED +#define PCI_DVSEC_CXL_CTRL2 0x10 +#define PCI_DVSEC_CXL_LOCK 0x14 +#define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) @@ -1360,8 +1364,10 @@ #define PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT __GENMASK(15, 13) #define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) #define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_BASE_HI_RWL 0xFFFFFFFF #define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_BASE_LOW __GENMASK(31, 28) +#define PCI_DVSEC_CXL_RANGE_BASE_LO_RWL 0xF0000000 #define CXL_DVSEC_RANGE_MAX 2 From 82adedeffcd7f934fd92ba00f1fa3ef61ca0d2fe Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:16 +0000 Subject: [PATCH 28/38] NVIDIA: VR: SAUCE: cxl: Move HDM decoder and register map definitions to include/cxl/cxl.h BugLink: https://bugs.launchpad.net/bugs/2143032 Move CXL HDM decoder register defines, register map structs (cxl_reg_map, cxl_component_reg_map, cxl_device_reg_map, cxl_pmu_reg_map, cxl_register_map), cxl_hdm_decoder_count(), enum cxl_regloc_type, and cxl_find_regblock()/cxl_setup_regs() declarations from internal CXL headers to include/cxl/pci.h. This makes them accessible to code outside the CXL subsystem, in particular the PCI core CXL state save/restore support added in a subsequent patch. No functional change. Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit b5e166cae47a8356338c607c99d98007b83d3324 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Also move CXL_CM_CAP_CAP_ID_RAS, CXL_CM_CAP_CAP_ID_HDM, and CXL_CM_CAP_CAP_HDM_VERSION into public include/cxl/cxl.h to keep the public CXL header layout consistent.] Signed-off-by: Koba Ko --- drivers/cxl/cxl.h | 61 ----------------------------------------------- include/cxl/cxl.h | 58 ++++++++++++++++++++++++++++++++++++++++++++ include/cxl/pci.h | 5 +++- 3 files changed, 62 insertions(+), 62 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 94d7d7965c512..c7bffa399581e 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -24,63 +24,6 @@ extern const struct nvdimm_security_ops *cxl_security_ops; * (port-driver, region-driver, nvdimm object-drivers... etc). */ -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -#define CXL_CM_CAP_CAP_ID_RAS 0x2 -#define CXL_CM_CAP_CAP_ID_HDM 0x5 -#define CXL_CM_CAP_CAP_HDM_VERSION 1 - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) -#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - -/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ -#define CXL_DECODER_MIN_GRANULARITY 256 -#define CXL_DECODER_MAX_ENCODED_IG 6 - -static inline int cxl_hdm_decoder_count(u32 cap_hdr) -{ - int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); - - return val ? val * 2 : 1; -} - /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ static inline int eig_to_granularity(u16 eig, unsigned int *granularity) { @@ -214,13 +157,9 @@ int cxl_map_device_regs(const struct cxl_register_map *map, int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); #define CXL_INSTANCES_COUNT -1 -enum cxl_regloc_type; int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type); int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); -int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map); -int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 3fbd9eac137ea..1c496c1e846c2 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -5,6 +5,7 @@ #ifndef __CXL_CXL_H__ #define __CXL_CXL_H__ +#include #include #include #include @@ -71,6 +72,63 @@ struct cxl_regs { ); }; +#define CXL_CM_CAP_CAP_ID_RAS 0x2 +#define CXL_CM_CAP_CAP_ID_HDM 0x5 +#define CXL_CM_CAP_CAP_HDM_VERSION 1 + +/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K + +/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) + +/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE BIT(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ +#define CXL_DECODER_MIN_GRANULARITY 256 +#define CXL_DECODER_MAX_ENCODED_IG 6 + +static inline int cxl_hdm_decoder_count(u32 cap_hdr) +{ + int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); + + return val ? val * 2 : 1; +} + struct cxl_reg_map { bool valid; int id; diff --git a/include/cxl/pci.h b/include/cxl/pci.h index 3e0000015871a..edbf980c283f1 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -14,9 +14,12 @@ enum cxl_regloc_type { CXL_REGLOC_RBI_TYPES }; -struct cxl_register_map; struct pci_dev; +struct cxl_register_map; int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +int cxl_setup_regs(struct cxl_register_map *map); #endif From 9a09ee09f074fba78aed46abf4ebb49c8d08e8d8 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:17 +0000 Subject: [PATCH 29/38] NVIDIA: VR: SAUCE: PCI: Add virtual extended cap save buffer for CXL state BugLink: https://bugs.launchpad.net/bugs/2143032 Add pci_add_virtual_ext_cap_save_buffer() to allocate save buffers using virtual cap IDs (above PCI_EXT_CAP_ID_MAX) that don't require a real capability in config space. The existing pci_add_ext_cap_save_buffer() cannot be used for CXL DVSEC state because it calls pci_find_saved_ext_cap() which searches for a matching capability in PCI config space. The CXL state saved here is a synthetic snapshot (DVSEC+HDM) and should not be tied to a real extended-cap instance. A virtual extended-cap save buffer API (cap IDs above PCI_EXT_CAP_ID_MAX) allows PCI to track this state without a backing config space capability. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit b3a768fe26282e3cbf250b4c4657cf11fea6ada4 nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- drivers/pci/pci.c | 20 ++++++++++++++++++++ drivers/pci/pci.h | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8479c2e1f74f1..dc8181f138648 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3446,6 +3446,26 @@ int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size) return _pci_add_cap_save_buffer(dev, cap, true, size); } +int pci_add_virtual_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, + unsigned int size) +{ + struct pci_cap_saved_state *save_state; + + if (cap <= PCI_EXT_CAP_ID_MAX) + return -EINVAL; + + save_state = kzalloc(sizeof(*save_state) + size, GFP_KERNEL); + if (!save_state) + return -ENOMEM; + + save_state->cap.cap_nr = cap; + save_state->cap.cap_extended = true; + save_state->cap.size = size; + pci_add_saved_cap(dev, save_state); + + return 0; +} + /** * pci_allocate_cap_save_buffers - allocate buffers for saving capabilities * @dev: the PCI device diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 13fa71f965900..cae087dec6fb1 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -245,15 +245,33 @@ struct pci_cap_saved_state { struct pci_cap_saved_data cap; }; +/* + * Virtual extended cap ID for CXL DVSEC state in the cap save chain. + */ +#define PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL 0xFFFF +static_assert(PCI_EXT_CAP_ID_MAX < PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size); +int pci_add_virtual_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, + unsigned int size); struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap); +#ifdef CONFIG_PCI_CXL +void pci_allocate_cxl_save_buffer(struct pci_dev *dev); +void pci_save_cxl_state(struct pci_dev *dev); +void pci_restore_cxl_state(struct pci_dev *dev); +#else +static inline void pci_allocate_cxl_save_buffer(struct pci_dev *dev) { } +static inline void pci_save_cxl_state(struct pci_dev *dev) { } +static inline void pci_restore_cxl_state(struct pci_dev *dev) { } +#endif + #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ From 969356c44c973d5768611fe5c446ca9e21657a66 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:18 +0000 Subject: [PATCH 30/38] NVIDIA: VR: SAUCE: PCI: Add cxl DVSEC state save/restore across resets BugLink: https://bugs.launchpad.net/bugs/2143032 Save and restore CXL DVSEC control registers (CTRL, CTRL2), range base registers, and lock state across PCI resets. When the DVSEC CONFIG_LOCK bit is set, certain DVSEC fields become read-only and hardware may have updated them. Blindly restoring saved values would be silently ignored or conflict with hardware state. Instead, a read-merge-write approach is used: current hardware values are read for the RWL (read-write-when-locked) fields and merged with saved state, so only writable bits are restored while locked bits retain their hardware values. Hooked into pci_save_state()/pci_restore_state() so all PCI reset paths automatically preserve CXL DVSEC configuration. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) [jan: Resolve minor conflict in drivers/pci/Makefile due to code line shifts ] Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit 0bb0dc0223282c6c1bd7b7cdb345848d12d41e4d nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- drivers/pci/Kconfig | 4 + drivers/pci/Makefile | 1 + drivers/pci/cxl.c | 177 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.c | 3 + 4 files changed, 185 insertions(+) create mode 100644 drivers/pci/cxl.c diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index e3f848ffb52a7..6b96650b3f311 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -119,6 +119,10 @@ config XEN_PCIDEV_FRONTEND The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. +config PCI_CXL + bool + default y if CXL_BUS + config PCI_ATS bool diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 41ebc3b9a5182..b6b5c9dbaaac7 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o obj-$(CONFIG_CARDBUS) += setup-cardbus.o +obj-$(CONFIG_PCI_CXL) += cxl.o # Endpoint library must be initialized before its users obj-$(CONFIG_PCI_ENDPOINT) += endpoint/ diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c new file mode 100644 index 0000000000000..abcf70de91715 --- /dev/null +++ b/drivers/pci/cxl.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CXL PCI state save/restore support. + * + * Saves and restores CXL DVSEC registers across PCI resets and link + * disable/enable transitions. Hooked into pci_save_state() / + * pci_restore_state() via the PCI capability save chain. + */ +#include +#include +#include "pci.h" + +struct cxl_pci_state { + u16 dvsec; + u16 dvsec_ctrl; + u16 dvsec_ctrl2; + u32 range_base_hi[CXL_DVSEC_RANGE_MAX]; + u32 range_base_lo[CXL_DVSEC_RANGE_MAX]; + u16 dvsec_lock; + bool dvsec_valid; +}; + +static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state) +{ + int rc_ctrl, rc_ctrl2; + u16 dvsec; + int i; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + state->dvsec = dvsec; + rc_ctrl = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL, + &state->dvsec_ctrl); + rc_ctrl2 = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + &state->dvsec_ctrl2); + if (rc_ctrl || rc_ctrl2) { + pci_warn(pdev, + "CXL: DVSEC read failed (ctrl rc=%d, ctrl2 rc=%d)\n", + rc_ctrl, rc_ctrl2); + return; + } + + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_read_config_dword(pdev, + dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + &state->range_base_hi[i]); + pci_read_config_dword(pdev, + dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + &state->range_base_lo[i]); + } + + pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_LOCK, + &state->dvsec_lock); + + state->dvsec_valid = true; +} + +static u32 cxl_merge_rwl(u32 saved, u32 current_hw, u32 rwl_mask) +{ + return (current_hw & rwl_mask) | (saved & ~rwl_mask); +} + +static void cxl_restore_dvsec(struct pci_dev *pdev, + const struct cxl_pci_state *state) +{ + u16 lock_reg = 0; + int i; + + if (!state->dvsec_valid) + return; + + pci_read_config_word(pdev, state->dvsec + PCI_DVSEC_CXL_LOCK, + &lock_reg); + + if (lock_reg & PCI_DVSEC_CXL_LOCK_CONFIG) { + u16 hw_ctrl; + u32 hw_range_hi, hw_range_lo; + + pci_read_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + &hw_ctrl); + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + cxl_merge_rwl(state->dvsec_ctrl, hw_ctrl, + PCI_DVSEC_CXL_CTRL_RWL)); + + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL2, + state->dvsec_ctrl2); + + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_read_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + &hw_range_hi); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + cxl_merge_rwl(state->range_base_hi[i], + hw_range_hi, + PCI_DVSEC_CXL_RANGE_BASE_HI_RWL)); + + pci_read_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + &hw_range_lo); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + cxl_merge_rwl(state->range_base_lo[i], + hw_range_lo, + PCI_DVSEC_CXL_RANGE_BASE_LO_RWL)); + } + } else { + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + state->dvsec_ctrl); + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL2, + state->dvsec_ctrl2); + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + state->range_base_hi[i]); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + state->range_base_lo[i]); + } + + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_LOCK, + state->dvsec_lock); + } +} + +void pci_allocate_cxl_save_buffer(struct pci_dev *dev) +{ + if (!pcie_is_cxl(dev)) + return; + + if (pci_add_virtual_ext_cap_save_buffer(dev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL, + sizeof(struct cxl_pci_state))) + pci_err(dev, "unable to allocate CXL save buffer\n"); +} + +void pci_save_cxl_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + struct cxl_pci_state *state; + + save_state = pci_find_saved_ext_cap(pdev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + if (!save_state) + return; + + state = (struct cxl_pci_state *)save_state->cap.data; + state->dvsec_valid = false; + + cxl_save_dvsec(pdev, state); +} + +void pci_restore_cxl_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + struct cxl_pci_state *state; + + save_state = pci_find_saved_ext_cap(pdev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + if (!save_state) + return; + + state = (struct cxl_pci_state *)save_state->cap.data; + if (!state->dvsec_valid) + return; + + cxl_restore_dvsec(pdev, state); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index dc8181f138648..497720c64d6d9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1759,6 +1759,7 @@ int pci_save_state(struct pci_dev *dev) pci_save_aer_state(dev); pci_save_ptm_state(dev); pci_save_tph_state(dev); + pci_save_cxl_state(dev); return pci_save_vc_state(dev); } EXPORT_SYMBOL(pci_save_state); @@ -1841,6 +1842,7 @@ void pci_restore_state(struct pci_dev *dev) pci_restore_aer_state(dev); pci_restore_config_space(dev); + pci_restore_cxl_state(dev); pci_restore_pcix_state(dev); pci_restore_msi_state(dev); @@ -3489,6 +3491,7 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); pci_allocate_vc_save_buffers(dev); + pci_allocate_cxl_save_buffer(dev); } void pci_free_cap_save_buffers(struct pci_dev *dev) From 42ade8ec191c1fc689abcd90af67805117c284d7 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 08:00:19 +0000 Subject: [PATCH 31/38] NVIDIA: VR: SAUCE: PCI: Add HDM decoder state save/restore BugLink: https://bugs.launchpad.net/bugs/2143032 Save and restore CXL HDM decoder registers (global control, per-decoder base/size/target-list, and commit state) across PCI resets. On restore, decoders that were committed are reprogrammed and recommitted with a 10ms timeout. Locked decoders that are already committed are skipped, since their state is protected by hardware and reprogramming them would fail. The Register Locator DVSEC is parsed directly via PCI config space reads rather than calling cxl_find_regblock()/cxl_setup_regs(), since this code lives in the PCI core and must not depend on CXL module symbols. MSE is temporarily enabled during save/restore to allow MMIO access to the HDM decoder register block. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306080026.116789-1-smadhavan@nvidia.com/) [jan: Include in drivers/pci/cxl.c due to conflict resolution in "4acbc27592b8 NVIDIA: VR: SAUCE: cxl: Move HDM decoder and register map definitions to include/cxl/cxl.h"] Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit 578f9164046dd36b52bd0872095199a13f4164d1 nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- drivers/pci/cxl.c | 308 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 305 insertions(+), 3 deletions(-) diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c index abcf70de91715..eee28a3755a23 100644 --- a/drivers/pci/cxl.c +++ b/drivers/pci/cxl.c @@ -2,15 +2,32 @@ /* * CXL PCI state save/restore support. * - * Saves and restores CXL DVSEC registers across PCI resets and link - * disable/enable transitions. Hooked into pci_save_state() / + * Saves and restores CXL DVSEC and HDM decoder registers across PCI resets + * and link disable/enable transitions. Hooked into pci_save_state() / * pci_restore_state() via the PCI capability save chain. */ #include +#include +#include +#include +#include #include #include "pci.h" +#define CXL_HDM_MAX_DECODERS 32 + +struct cxl_hdm_decoder_snapshot { + u32 base_lo; + u32 base_hi; + u32 size_lo; + u32 size_hi; + u32 ctrl; + u32 tl_lo; + u32 tl_hi; +}; + struct cxl_pci_state { + /* DVSEC saved state */ u16 dvsec; u16 dvsec_ctrl; u16 dvsec_ctrl2; @@ -18,6 +35,15 @@ struct cxl_pci_state { u32 range_base_lo[CXL_DVSEC_RANGE_MAX]; u16 dvsec_lock; bool dvsec_valid; + + /* HDM decoder saved state */ + int hdm_bar; + unsigned long hdm_bar_offset; + unsigned long hdm_map_size; + u32 hdm_global_ctrl; + int hdm_count; + struct cxl_hdm_decoder_snapshot decoders[CXL_HDM_MAX_DECODERS]; + bool hdm_valid; }; static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state) @@ -132,6 +158,279 @@ static void cxl_restore_dvsec(struct pci_dev *pdev, } } +struct pci_cmd_saved { + struct pci_dev *pdev; + u16 cmd; +}; + +DEFINE_FREE(restore_pci_cmd, struct pci_cmd_saved, + if (!(_T.cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(_T.pdev, PCI_COMMAND, _T.cmd)) + +/** + * cxl_find_component_regblock - Find the Component Register Block via + * the Register Locator DVSEC + * @pdev: PCI device to scan + * @bir: output BAR index + * @offset: output offset within the BAR + * + * Parses the Register Locator DVSEC (ID 8) directly via PCI config space + * reads. No dependency on CXL module symbols. + * + * Return: 0 on success, -ENODEV if not found. + */ +static int cxl_find_component_regblock(struct pci_dev *pdev, + int *bir, u64 *offset) +{ + u32 regloc_size, regblocks; + u16 regloc; + int i; + + regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_REG_LOCATOR); + if (!regloc) + return -ENODEV; + + pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); + regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size); + regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8; + + for (i = 0; i < regblocks; i++) { + u32 reg_lo, reg_hi; + unsigned int off; + + off = regloc + PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 + i * 8; + pci_read_config_dword(pdev, off, ®_lo); + pci_read_config_dword(pdev, off + 4, ®_hi); + + if (FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo) != + CXL_REGLOC_RBI_COMPONENT) + continue; + + *bir = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo); + *offset = ((u64)reg_hi << 32) | + (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW); + return 0; + } + + return -ENODEV; +} + +/* + * Discover and map HDM decoder registers. + * Caller must pci_iounmap() the returned pointer. + */ +static void __iomem *cxl_hdm_map(struct pci_dev *pdev, int *bar_out, + unsigned long *offset_out, + unsigned long *size_out) +{ + int bir; + u64 reg_offset; + void __iomem *comp_base, *cm_base; + u32 cap_hdr; + int cap, cap_count; + unsigned long hdm_offset = 0, hdm_size = 0; + void __iomem *hdm; + + if (cxl_find_component_regblock(pdev, &bir, ®_offset)) + return NULL; + + comp_base = pci_iomap_range(pdev, bir, reg_offset, + CXL_CM_OFFSET + SZ_4K); + if (!comp_base) + return NULL; + + cm_base = comp_base + CXL_CM_OFFSET; + cap_hdr = readl(cm_base); + + if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_hdr) != CM_CAP_HDR_CAP_ID) { + pci_iounmap(pdev, comp_base); + return NULL; + } + + cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr); + + for (cap = 1; cap <= cap_count; cap++) { + u16 cap_id; + u32 cap_off; + u32 hdr; + + if (cap * sizeof(u32) >= SZ_4K) + break; + + hdr = readl(cm_base + cap * 4); + cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr); + cap_off = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr); + + if (cap_id != CXL_CM_CAP_CAP_ID_HDM) + continue; + + if (cap_off + sizeof(u32) > SZ_4K) + break; + + hdr = readl(cm_base + cap_off); + hdm_offset = CXL_CM_OFFSET + cap_off; + hdm_size = 0x20 * cxl_hdm_decoder_count(hdr) + 0x10; + break; + } + + pci_iounmap(pdev, comp_base); + + if (!hdm_size) + return NULL; + + hdm = pci_iomap_range(pdev, bir, reg_offset + hdm_offset, hdm_size); + if (!hdm) + return NULL; + + *bar_out = bir; + *offset_out = reg_offset + hdm_offset; + *size_out = hdm_size; + return hdm; +} + +static void cxl_save_hdm(struct pci_dev *pdev, void __iomem *hdm, + struct cxl_pci_state *state, int count) +{ + int i; + + state->hdm_count = min_t(int, count, CXL_HDM_MAX_DECODERS); + state->hdm_global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + for (i = 0; i < state->hdm_count; i++) { + struct cxl_hdm_decoder_snapshot *d = &state->decoders[i]; + + d->base_lo = readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)); + d->base_hi = readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)); + d->size_lo = readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)); + d->size_hi = readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)); + d->ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + d->tl_lo = readl(hdm + CXL_HDM_DECODER0_TL_LOW(i)); + d->tl_hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(i)); + } +} + +static void cxl_restore_hdm(struct pci_dev *pdev, void __iomem *hdm, + const struct cxl_pci_state *state) +{ + int i; + + writel(state->hdm_global_ctrl, hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + for (i = 0; i < state->hdm_count; i++) { + const struct cxl_hdm_decoder_snapshot *d = &state->decoders[i]; + unsigned long timeout; + u32 ctrl; + + if (!(d->ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if ((ctrl & CXL_HDM_DECODER0_CTRL_LOCK) && + (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + } + + writel(d->base_lo, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)); + writel(d->base_hi, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)); + writel(d->size_lo, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)); + writel(d->size_hi, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)); + writel(d->tl_lo, hdm + CXL_HDM_DECODER0_TL_LOW(i)); + writel(d->tl_hi, hdm + CXL_HDM_DECODER0_TL_HIGH(i)); + + wmb(); + + ctrl = d->ctrl & ~(CXL_HDM_DECODER0_CTRL_COMMITTED | + CXL_HDM_DECODER0_CTRL_COMMIT_ERROR); + ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + + timeout = jiffies + msecs_to_jiffies(10); + for (;;) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + break; + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMIT_ERROR) { + pci_warn(pdev, + "HDM decoder %d commit error on restore\n", + i); + break; + } + if (time_after(jiffies, timeout)) { + pci_warn(pdev, + "HDM decoder %d commit timeout on restore\n", + i); + break; + } + cpu_relax(); + } + } +} + +static void cxl_save_hdm_decoders(struct pci_dev *pdev, + struct cxl_pci_state *state) +{ + int hdm_bar; + unsigned long hdm_bar_offset, hdm_map_size; + void __iomem *hdm; + u16 cmd; + u32 cap; + struct pci_cmd_saved saved __free(restore_pci_cmd) = { + .pdev = pdev, .cmd = PCI_COMMAND_MEMORY, + }; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + saved.cmd = cmd; + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + hdm = cxl_hdm_map(pdev, &hdm_bar, &hdm_bar_offset, &hdm_map_size); + if (!hdm) + return; + + cap = readl(hdm + CXL_HDM_DECODER_CAP_OFFSET); + cxl_save_hdm(pdev, hdm, state, cxl_hdm_decoder_count(cap)); + state->hdm_bar = hdm_bar; + state->hdm_bar_offset = hdm_bar_offset; + state->hdm_map_size = hdm_map_size; + state->hdm_valid = true; + pci_iounmap(pdev, hdm); +} + +static void cxl_restore_hdm_decoders(struct pci_dev *pdev, + const struct cxl_pci_state *state) +{ + void __iomem *hdm; + u16 cmd; + struct pci_cmd_saved saved __free(restore_pci_cmd) = { + .pdev = pdev, .cmd = PCI_COMMAND_MEMORY, + }; + + if (!state->hdm_valid) + return; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + saved.cmd = cmd; + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + hdm = pci_iomap_range(pdev, state->hdm_bar, state->hdm_bar_offset, + state->hdm_map_size); + if (!hdm) { + pci_warn(pdev, "CXL: failed to map HDM for restore\n"); + return; + } + + cxl_restore_hdm(pdev, hdm, state); + pci_iounmap(pdev, hdm); +} + void pci_allocate_cxl_save_buffer(struct pci_dev *dev) { if (!pcie_is_cxl(dev)) @@ -155,8 +454,10 @@ void pci_save_cxl_state(struct pci_dev *pdev) state = (struct cxl_pci_state *)save_state->cap.data; state->dvsec_valid = false; + state->hdm_valid = false; cxl_save_dvsec(pdev, state); + cxl_save_hdm_decoders(pdev, state); } void pci_restore_cxl_state(struct pci_dev *pdev) @@ -170,8 +471,9 @@ void pci_restore_cxl_state(struct pci_dev *pdev) return; state = (struct cxl_pci_state *)save_state->cap.data; - if (!state->dvsec_valid) + if (!state->dvsec_valid && !state->hdm_valid) return; cxl_restore_dvsec(pdev, state); + cxl_restore_hdm_decoders(pdev, state); } From 52975ec822b9417a62195332a74fb03b497008e9 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:16 +0000 Subject: [PATCH 32/38] NVIDIA: VR: SAUCE: PCI: Add CXL DVSEC reset and capability register definitions BugLink: https://bugs.launchpad.net/bugs/2143032 Add CXL DVSEC register definitions needed for CXL device reset per CXL r3.2 section 8.1.3.1: - Capability bits: RST_CAPABLE, CACHE_CAPABLE, CACHE_WBI_CAPABLE, RST_TIMEOUT, RST_MEM_CLR_CAPABLE - Control2 register: DISABLE_CACHING, INIT_CACHE_WBI, INIT_CXL_RST, RST_MEM_CLR_EN - Status2 register: CACHE_INV, RST_DONE, RST_ERR - Non-CXL Function Map DVSEC register offset Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) [jan: Resolve conflicts where PCI_DVSEC_CXL_CACHE_CAPABLE is already added by "72bd823fb4f1 NVIDIA: VR: SAUCE: PCI: Allow ATS to be always on for CXL.cache capable devices"] Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit 8f77fbe3816c18bf2a7c8aa61bea89c00860790d nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Preserve PCI_DVSEC_CXL_CACHE_CAPABLE because drivers/pci/ats.c still uses it for CXL.cache ATS dependency from commit 37654885ec90c (6.17 source commit 72bd823fb4f1).] Signed-off-by: Koba Ko --- include/uapi/linux/pci_regs.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 148328bd4cb37..c4a6c0a916dc7 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1349,12 +1349,25 @@ /* CXL r4.0, 8.1.3: PCIe DVSEC for CXL Device */ #define PCI_DVSEC_CXL_DEVICE 0 #define PCI_DVSEC_CXL_CAP 0xA +#define PCI_DVSEC_CXL_CACHE_CAPABLE _BITUL(0) #define PCI_DVSEC_CXL_MEM_CAPABLE _BITUL(2) #define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) +#define PCI_DVSEC_CXL_CACHE_WBI_CAPABLE _BITUL(6) +#define PCI_DVSEC_CXL_RST_CAPABLE _BITUL(7) +#define PCI_DVSEC_CXL_RST_TIMEOUT __GENMASK(10, 8) +#define PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE _BITUL(11) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) #define PCI_DVSEC_CXL_CTRL_RWL 0x5FED #define PCI_DVSEC_CXL_CTRL2 0x10 +#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) +#define PCI_DVSEC_CXL_INIT_CACHE_WBI _BITUL(1) +#define PCI_DVSEC_CXL_INIT_CXL_RST _BITUL(2) +#define PCI_DVSEC_CXL_RST_MEM_CLR_EN _BITUL(3) +#define PCI_DVSEC_CXL_STATUS2 0x12 +#define PCI_DVSEC_CXL_CACHE_INV _BITUL(0) +#define PCI_DVSEC_CXL_RST_DONE _BITUL(1) +#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) #define PCI_DVSEC_CXL_LOCK 0x14 #define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) @@ -1373,6 +1386,7 @@ /* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */ #define PCI_DVSEC_CXL_FUNCTION_MAP 2 +#define PCI_DVSEC_CXL_FUNCTION_MAP_REG 0x0C /* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */ #define PCI_DVSEC_CXL_PORT 3 From d1c9d1b5146065524babe23598c7254e6edf68db Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:17 +0000 Subject: [PATCH 33/38] NVIDIA: VR: SAUCE: PCI: Export pci_dev_save_and_disable() and pci_dev_restore() BugLink: https://bugs.launchpad.net/bugs/2143032 Export pci_dev_save_and_disable() and pci_dev_restore() so that subsystems performing non-standard reset sequences (e.g. CXL) can reuse the PCI core standard pre/post reset lifecycle: driver reset_prepare/reset_done callbacks, PCI config space save/restore, and device disable/re-enable. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit a14a427ac483aa939aba9173673636bb71a07784 nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- drivers/pci/pci.c | 21 +++++++++++++++++++-- include/linux/pci.h | 3 +++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 497720c64d6d9..2ef8d7274b300 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5033,7 +5033,15 @@ void pci_dev_unlock(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_dev_unlock); -static void pci_dev_save_and_disable(struct pci_dev *dev) +/** + * pci_dev_save_and_disable - Save device state and disable it + * @dev: PCI device to save and disable + * + * Save the PCI configuration state, invoke the driver's reset_prepare + * callback (if any), and disable the device by clearing the Command register. + * The device lock must be held by the caller. + */ +void pci_dev_save_and_disable(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? dev->driver->err_handler : NULL; @@ -5066,8 +5074,16 @@ static void pci_dev_save_and_disable(struct pci_dev *dev) */ pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); } +EXPORT_SYMBOL_GPL(pci_dev_save_and_disable); -static void pci_dev_restore(struct pci_dev *dev) +/** + * pci_dev_restore - Restore device state after reset + * @dev: PCI device to restore + * + * Restore the saved PCI configuration state and invoke the driver's + * reset_done callback (if any). The device lock must be held by the caller. + */ +void pci_dev_restore(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? dev->driver->err_handler : NULL; @@ -5084,6 +5100,7 @@ static void pci_dev_restore(struct pci_dev *dev) else if (dev->driver) pci_warn(dev, "reset done"); } +EXPORT_SYMBOL_GPL(pci_dev_restore); /* dev->reset_methods[] is a 0-terminated list of indices into this array */ const struct pci_reset_fn_method pci_reset_fn_methods[] = { diff --git a/include/linux/pci.h b/include/linux/pci.h index 61066bcbad09c..a0724e3501b25 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2008,6 +2008,9 @@ int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) +void pci_dev_save_and_disable(struct pci_dev *dev); +void pci_dev_restore(struct pci_dev *dev); + /* * PCI domain support. Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share From baa527d0e4a330b3eaba6d6e51566174e6e557d7 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:18 +0000 Subject: [PATCH 34/38] NVIDIA: VR: SAUCE: cxl: Add memory offlining and cache flush helpers BugLink: https://bugs.launchpad.net/bugs/2143032 Add infrastructure for quiescing the CXL data path before reset: - Memory offlining: check if CXL-backed memory is online and offline it via offline_and_remove_memory() before reset, per CXL spec requirement to quiesce all CXL.mem transactions before issuing CXL Reset. - CPU cache flush: invalidate cache lines before reset as a safety measure after memory offline. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit 98bfbf9c3f88013ffbff4b08a1da0043606d0269 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Use a real System RAM walker callback so resource walks never invoke a NULL function pointer.] Signed-off-by: Koba Ko --- drivers/cxl/core/pci.c | 120 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 2bcd683aa286d..84e3f037ea71d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include #include @@ -927,3 +929,121 @@ int cxl_port_get_possible_dports(struct cxl_port *port) return ctx.count; } + +/* + * CXL Reset support - core-provided reset logic for CXL devices. + * + * These functions implement the CXL reset sequence. + */ + +/* + * If CXL memory backed by this decoder is online as System RAM, offline + * and remove it per CXL spec requirements before issuing CXL Reset. + * Returns 0 if memory was not online or was successfully offlined. + */ +static int cxl_is_system_ram(struct resource *res, void *arg) +{ + return 1; +} + +static int __maybe_unused cxl_offline_memory(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + struct cxl_region_params *p; + int rc; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + guard(rwsem_read)(&cxl_rwsem.region); + + cxlr = cxled->cxld.region; + if (!cxlr) + return 0; + + p = &cxlr->params; + if (!p->res) + return 0; + + if (walk_iomem_res_desc(IORES_DESC_NONE, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + p->res->start, p->res->end, NULL, + cxl_is_system_ram) <= 0) + return 0; + + dev_info(dev, "Offlining CXL memory [%pr] for reset\n", p->res); + +#ifdef CONFIG_MEMORY_HOTREMOVE + rc = offline_and_remove_memory(p->res->start, resource_size(p->res)); + if (rc) { + dev_err(dev, + "Failed to offline CXL memory [%pr]: %d\n", + p->res, rc); + return rc; + } +#else + dev_err(dev, "Memory hotremove not supported, cannot offline CXL memory\n"); + rc = -EOPNOTSUPP; + return rc; +#endif + + return 0; +} + +static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint; + struct device *dev; + + if (!cxlmd || !cxlmd->cxlds) + return -ENODEV; + + dev = cxlmd->cxlds->dev; + endpoint = cxlmd->endpoint; + if (!endpoint) + return 0; + + return device_for_each_child(&endpoint->dev, NULL, + cxl_offline_memory); +} + +static int __maybe_unused cxl_decoder_flush_cache(struct device *dev, void *data) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + struct resource *res; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + guard(rwsem_read)(&cxl_rwsem.region); + + cxlr = cxled->cxld.region; + if (!cxlr || !cxlr->params.res) + return 0; + + res = cxlr->params.res; + cpu_cache_invalidate_memregion(res->start, resource_size(res)); + return 0; +} + +static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd) +{ + struct cxl_port *endpoint; + + if (!cxlmd) + return 0; + + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) + return 0; + + if (!cpu_cache_has_invalidate_memregion()) + return 0; + + device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache); + return 0; +} From 3a245b9b22c3fa08249f08386018b7c025ae19f2 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:19 +0000 Subject: [PATCH 35/38] NVIDIA: VR: SAUCE: cxl: Add multi-function sibling coordination for CXL reset BugLink: https://bugs.launchpad.net/bugs/2143032 Add sibling PCI function save/disable/restore coordination for CXL reset. Before reset, all CXL.cachemem sibling functions are locked, saved, and disabled; after reset they are restored. The Non-CXL Function Map DVSEC and per-function DVSEC capability register are consulted to skip non-CXL and CXL.io-only functions. A global mutex serializes concurrent resets to prevent deadlocks between sibling functions. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit 9a08c0246be53f047ed4128455f708b7a4350261 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Propagate sibling collection allocation failures after pci_walk_bus() so reset aborts instead of proceeding with a partial sibling list.] Signed-off-by: Koba Ko --- drivers/cxl/core/pci.c | 156 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 84e3f037ea71d..5949c5d15787b 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -16,6 +16,9 @@ #include "core.h" #include "trace.h" +/* Initial sibling array capacity: covers max non-ARI functions per slot */ +#define CXL_RESET_SIBLINGS_INIT 8 + /** * DOC: cxl core pci * @@ -1047,3 +1050,156 @@ static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd) device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache); return 0; } + +/* + * Serialize all CXL reset operations globally. + */ +static DEFINE_MUTEX(cxl_reset_mutex); + +struct cxl_reset_context { + struct pci_dev *target; + struct pci_dev **pci_functions; + int pci_func_count; + int pci_func_cap; +}; + +/* + * Check if a sibling function is non-CXL using the Non-CXL Function Map + * DVSEC. Returns true if fn is listed as non-CXL, false otherwise (including + * on any read failure). + */ +static bool cxl_is_non_cxl_function(struct pci_dev *pdev, + u16 func_map_dvsec, int fn) +{ + int reg, bit; + u32 map; + + if (pci_ari_enabled(pdev->bus)) { + reg = fn / 32; + bit = fn % 32; + } else { + reg = 0; + bit = fn; + } + + if (pci_read_config_dword(pdev, + func_map_dvsec + PCI_DVSEC_CXL_FUNCTION_MAP_REG + (reg * 4), + &map)) + return false; + + return map & BIT(bit); +} + +struct cxl_reset_walk_ctx { + struct cxl_reset_context *ctx; + u16 func_map_dvsec; + int error; + bool ari; +}; + +static int cxl_reset_collect_sibling(struct pci_dev *func, void *data) +{ + struct cxl_reset_walk_ctx *wctx = data; + struct cxl_reset_context *ctx = wctx->ctx; + struct pci_dev *pdev = ctx->target; + u16 dvsec, cap; + int fn; + + if (func == pdev) + return 0; + + if (!wctx->ari && + PCI_SLOT(func->devfn) != PCI_SLOT(pdev->devfn)) + return 0; + + fn = wctx->ari ? func->devfn : PCI_FUNC(func->devfn); + if (wctx->func_map_dvsec && + cxl_is_non_cxl_function(pdev, wctx->func_map_dvsec, fn)) + return 0; + + /* Only coordinate with siblings that have CXL.cachemem */ + dvsec = pci_find_dvsec_capability(func, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return 0; + if (pci_read_config_word(func, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return 0; + if (!(cap & (PCI_DVSEC_CXL_CACHE_CAPABLE | + PCI_DVSEC_CXL_MEM_CAPABLE))) + return 0; + + /* Grow sibling array; double capacity for ARI devices when running out of space */ + if (ctx->pci_func_count >= ctx->pci_func_cap) { + struct pci_dev **new; + int new_cap = ctx->pci_func_cap ? ctx->pci_func_cap * 2 + : CXL_RESET_SIBLINGS_INIT; + + new = krealloc(ctx->pci_functions, + new_cap * sizeof(*new), GFP_KERNEL); + if (!new) { + wctx->error = -ENOMEM; + return 1; + } + ctx->pci_functions = new; + ctx->pci_func_cap = new_cap; + } + + pci_dev_get(func); + ctx->pci_functions[ctx->pci_func_count++] = func; + return 0; +} + +static void __maybe_unused cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) + pci_dev_put(ctx->pci_functions[i]); + kfree(ctx->pci_functions); + ctx->pci_functions = NULL; + ctx->pci_func_count = 0; + ctx->pci_func_cap = 0; +} + +static int __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) +{ + struct pci_dev *pdev = ctx->target; + struct cxl_reset_walk_ctx wctx; + int i; + + ctx->pci_func_count = 0; + ctx->pci_functions = NULL; + ctx->pci_func_cap = 0; + + wctx.ctx = ctx; + wctx.ari = pci_ari_enabled(pdev->bus); + wctx.error = 0; + wctx.func_map_dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_FUNCTION_MAP); + + /* Collect CXL.cachemem siblings under pci_bus_sem */ + pci_walk_bus(pdev->bus, cxl_reset_collect_sibling, &wctx); + if (wctx.error) { + cxl_pci_functions_reset_release(ctx); + return wctx.error; + } + + /* Lock and save/disable siblings outside pci_bus_sem */ + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_lock(ctx->pci_functions[i]); + pci_dev_save_and_disable(ctx->pci_functions[i]); + } + + return 0; +} + +static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) +{ + int i; + + for (i = 0; i < ctx->pci_func_count; i++) { + pci_dev_restore(ctx->pci_functions[i]); + pci_dev_unlock(ctx->pci_functions[i]); + } + cxl_pci_functions_reset_release(ctx); +} From 6e9e72b4b2a2847c5e97470614c7f7df1676e117 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:20 +0000 Subject: [PATCH 36/38] NVIDIA: VR: SAUCE: cxl: Add CXL DVSEC reset sequence and flow orchestration BugLink: https://bugs.launchpad.net/bugs/2143032 cxl_dev_reset() implements the hardware reset sequence: optionally enable memory clear, initiate reset via CTRL2, wait for completion, and re-enable caching. cxl_do_reset() orchestrates the full reset flow: 1. CXL pre-reset: mem offlining and cache flush (when memdev present) 2. PCI save/disable: pci_dev_save_and_disable() automatically saves CXL DVSEC and HDM decoder state via PCI core hooks 3. Sibling coordination: save/disable CXL.cachemem sibling functions 4. Execute CXL DVSEC reset 5. Sibling restore: always runs to re-enable sibling functions 6. PCI restore: pci_dev_restore() automatically restores CXL state The CXL-specific DVSEC and HDM save/restore is handled by the PCI core's CXL save/restore infrastructure (drivers/pci/cxl.c). Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (backported from commit 92fb80732a4ea34b76cbe51b15e95ff04f49cb89 nv-kernels/24.04_linux-nvidia-6.17-next) [koba: Treat error-valued cxlmd->endpoint as no endpoint to avoid dereferencing ERR_PTR before endpoint attach.] [koba: Check sibling collection failure before starting the CXL reset so allocation failure restores the target and aborts.] [koba: Limit the memdev device lock to endpoint-dependent memory preparation and cache flush, restore memory quiesce before PCI disable, and track sibling reset preparation so reset_done cleanup only runs after successful sibling prepare.] [koba: Guard reset_done() against NULL/ERR_PTR memdev endpoints before decoder reset detection.] Signed-off-by: Koba Ko --- drivers/cxl/core/pci.c | 196 ++++++++++++++++++++++++++++++++++++++++- drivers/cxl/pci.c | 8 +- 2 files changed, 198 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 5949c5d15787b..c97ee3d257a87 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1005,7 +1005,7 @@ static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd) dev = cxlmd->cxlds->dev; endpoint = cxlmd->endpoint; - if (!endpoint) + if (!endpoint || IS_ERR(endpoint)) return 0; return device_for_each_child(&endpoint->dev, NULL, @@ -1149,7 +1149,7 @@ static int cxl_reset_collect_sibling(struct pci_dev *func, void *data) return 0; } -static void __maybe_unused cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) +static void cxl_pci_functions_reset_release(struct cxl_reset_context *ctx) { int i; @@ -1161,7 +1161,7 @@ static void __maybe_unused cxl_pci_functions_reset_release(struct cxl_reset_cont ctx->pci_func_cap = 0; } -static int __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) +static int cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx) { struct pci_dev *pdev = ctx->target; struct cxl_reset_walk_ctx wctx; @@ -1193,7 +1193,7 @@ static int __maybe_unused cxl_pci_functions_reset_prepare(struct cxl_reset_conte return 0; } -static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) +static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) { int i; @@ -1203,3 +1203,191 @@ static void __maybe_unused cxl_pci_functions_reset_done(struct cxl_reset_context } cxl_pci_functions_reset_release(ctx); } + +/* + * CXL device reset execution + */ +static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) +{ + static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 }; + u16 cap, ctrl2, status2; + u32 timeout_ms; + int rc, idx; + + if (!pci_wait_for_pending_transaction(pdev)) + pci_err(pdev, "timed out waiting for pending transactions\n"); + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap); + if (rc) + return rc; + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + /* + * Disable caching and initiate cache writeback+invalidation if the + * device supports it. Poll for completion. + * Per CXL r3.2 section 9.6, software may use the cache size from + * DVSEC CXL Capability2 to compute a suitable timeout; we use a + * default of 10ms. + */ + if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) { + u32 wbi_poll_us = 100; + s32 wbi_remaining_us = 10000; + + ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_INIT_CACHE_WBI; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + + do { + usleep_range(wbi_poll_us, wbi_poll_us + 1); + wbi_remaining_us -= wbi_poll_us; + rc = pci_read_config_word(pdev, + dvsec + PCI_DVSEC_CXL_STATUS2, + &status2); + if (rc) + return rc; + } while (!(status2 & PCI_DVSEC_CXL_CACHE_INV) && + wbi_remaining_us > 0); + + if (!(status2 & PCI_DVSEC_CXL_CACHE_INV)) { + pci_err(pdev, "CXL cache WB+I timed out\n"); + return -ETIMEDOUT; + } + } else if (cap & PCI_DVSEC_CXL_CACHE_CAPABLE) { + ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + } + + if (cap & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + &ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + ctrl2); + if (rc) + return rc; + } + + idx = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, cap); + if (idx >= ARRAY_SIZE(reset_timeout_ms)) + idx = ARRAY_SIZE(reset_timeout_ms) - 1; + timeout_ms = reset_timeout_ms[idx]; + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + ctrl2 |= PCI_DVSEC_CXL_INIT_CXL_RST; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); + if (rc) + return rc; + + msleep(timeout_ms); + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_STATUS2, + &status2); + if (rc) + return rc; + + if (status2 & PCI_DVSEC_CXL_RST_ERR) { + pci_err(pdev, "CXL reset error\n"); + return -EIO; + } + + if (!(status2 & PCI_DVSEC_CXL_RST_DONE)) { + pci_err(pdev, "CXL reset timeout\n"); + return -ETIMEDOUT; + } + + rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2); + if (rc) + return rc; + + ctrl2 &= ~PCI_DVSEC_CXL_DISABLE_CACHING; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); + if (rc) + return rc; + + return 0; +} + +static int match_memdev_by_parent(struct device *dev, const void *parent) +{ + return is_cxl_memdev(dev) && dev->parent == parent; +} + +static int __cxl_do_reset(struct pci_dev *pdev, struct cxl_memdev *cxlmd, + int dvsec) +{ + struct cxl_reset_context ctx = { .target = pdev }; + bool siblings_prepared = false; + int rc; + + mutex_lock(&cxl_reset_mutex); + pci_dev_lock(pdev); + + if (cxlmd) { + guard(device)(&cxlmd->dev); + + rc = cxl_reset_prepare_memdev(cxlmd); + if (rc) + goto out_unlock; + + cxl_reset_flush_cpu_caches(cxlmd); + } + + pci_dev_save_and_disable(pdev); + + rc = cxl_pci_functions_reset_prepare(&ctx); + if (!rc) { + siblings_prepared = true; + rc = cxl_dev_reset(pdev, dvsec); + } + + if (siblings_prepared) + cxl_pci_functions_reset_done(&ctx); + + pci_dev_restore(pdev); + +out_unlock: + pci_dev_unlock(pdev); + mutex_unlock(&cxl_reset_mutex); + + return rc; +} + +static int cxl_do_reset(struct pci_dev *pdev) +{ + int dvsec; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return -ENODEV; + + struct device *memdev __free(put_device) = + bus_find_device(&cxl_bus_type, NULL, &pdev->dev, + match_memdev_by_parent); + if (!memdev) + return __cxl_do_reset(pdev, NULL, dvsec); + + struct cxl_memdev *cxlmd = to_cxl_memdev(memdev); + + return __cxl_do_reset(pdev, cxlmd, dvsec); +} diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 5cad118e71015..4bfcbd260381e 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -965,6 +965,7 @@ static void cxl_reset_done(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct cxl_port *endpoint; struct device *dev = &pdev->dev; /* @@ -974,8 +975,11 @@ static void cxl_reset_done(struct pci_dev *pdev) * that no longer exists. */ guard(device)(&cxlmd->dev); - if (cxlmd->endpoint && - cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) + return; + + if (cxl_endpoint_decoder_reset_detected(endpoint)) { dev_crit(dev, "SBR happened without memory regions removal.\n"); dev_crit(dev, "System may be unstable if regions hosted system memory.\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); From c6db6ce8e793a00aa43cff3d5799471e013cd401 Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:21 +0000 Subject: [PATCH 37/38] NVIDIA: VR: SAUCE: cxl: Add cxl_reset sysfs interface for PCI devices BugLink: https://bugs.launchpad.net/bugs/2143032 Add a "cxl_reset" sysfs attribute to PCI devices that support CXL Reset (CXL r3.2 section 8.1.3.1). The attribute is visible only on devices with both CXL.cache and CXL.mem capabilities and the CXL Reset Capable bit set in the DVSEC. Writing "1" to the attribute triggers the full CXL reset flow via cxl_do_reset(). The interface is decoupled from memdev creation: when a CXL memdev exists, memory offlining and cache flush are performed; otherwise reset proceeds without the memory management. The sysfs attribute is managed entirely by the CXL module using sysfs_create_group() / sysfs_remove_group() rather than the PCI core's static attribute groups. This avoids cross-module symbol dependencies between the PCI core (always built-in) and CXL_BUS (potentially modular). At module init, existing PCI devices are scanned and a PCI bus notifier handles hot-plug/unplug. kernfs_drain() makes sure that any in-flight store() completes before sysfs_remove_group() returns, preventing use-after-free during module unload. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit 6e96f7e341a4eb1b9631e40b43d120b2b9e2c6e2 nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- drivers/cxl/core/core.h | 2 + drivers/cxl/core/pci.c | 113 ++++++++++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 3 ++ 3 files changed, 118 insertions(+) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 77c684744771b..3861409ea69e3 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -138,6 +138,8 @@ extern struct cxl_rwsem cxl_rwsem; int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); enum cxl_poison_trace_type { CXL_POISON_TRACE_LIST, diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c97ee3d257a87..6944f0f74790b 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1391,3 +1391,116 @@ static int cxl_do_reset(struct pci_dev *pdev) return __cxl_do_reset(pdev, cxlmd, dvsec); } + +/* + * CXL reset sysfs attribute management. + * + * The cxl_reset attribute is added to PCI devices that advertise CXL Reset + * capability. Managed entirely by the CXL module via subsys_interface on + * pci_bus_type, avoiding cross-module symbol dependencies between the PCI + * core (built-in) and CXL (potentially modular). + * + * subsys_interface handles existing devices at register time and hot-plug + * add/remove automatically. On unregister, remove_dev runs for all tracked + * devices under bus core serialization. + */ + +static bool pci_cxl_reset_capable(struct pci_dev *pdev) +{ + int dvsec; + u16 cap; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return false; + + if (pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap)) + return false; + + if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE) || + !(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) + return false; + + return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); +} + +static ssize_t cxl_reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int rc; + + if (!sysfs_streq(buf, "1")) + return -EINVAL; + + rc = cxl_do_reset(pdev); + return rc ? rc : count; +} +static DEVICE_ATTR_WO(cxl_reset); + +static umode_t cxl_reset_attr_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return a->mode; +} + +static struct attribute *cxl_reset_attrs[] = { + &dev_attr_cxl_reset.attr, + NULL, +}; + +static const struct attribute_group cxl_reset_attr_group = { + .attrs = cxl_reset_attrs, + .is_visible = cxl_reset_attr_is_visible, +}; + +static int cxl_reset_add_dev(struct device *dev, + struct subsys_interface *sif) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return 0; + + return sysfs_create_group(&dev->kobj, &cxl_reset_attr_group); +} + +static void cxl_reset_remove_dev(struct device *dev, + struct subsys_interface *sif) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pci_cxl_reset_capable(pdev)) + return; + + sysfs_remove_group(&dev->kobj, &cxl_reset_attr_group); +} + +static struct subsys_interface cxl_reset_interface = { + .name = "cxl_reset", + .subsys = &pci_bus_type, + .add_dev = cxl_reset_add_dev, + .remove_dev = cxl_reset_remove_dev, +}; + +void cxl_reset_sysfs_init(void) +{ + int rc; + + rc = subsys_interface_register(&cxl_reset_interface); + if (rc) + pr_warn("CXL: failed to register cxl_reset interface (%d)\n", + rc); +} + +void cxl_reset_sysfs_exit(void) +{ + subsys_interface_unregister(&cxl_reset_interface); +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index dbe30e7c383be..f4f7dad473beb 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2531,6 +2531,8 @@ static __init int cxl_core_init(void) if (rc) goto err_ras; + cxl_reset_sysfs_init(); + return 0; err_ras: @@ -2546,6 +2548,7 @@ static __init int cxl_core_init(void) static void cxl_core_exit(void) { + cxl_reset_sysfs_exit(); cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); From 9bf30cf9cb99d6bf49d062fb74226edc1420b21e Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Fri, 6 Mar 2026 09:23:22 +0000 Subject: [PATCH 38/38] NVIDIA: VR: SAUCE: Documentation: ABI: Add CXL PCI cxl_reset sysfs attribute BugLink: https://bugs.launchpad.net/bugs/2143032 Document the cxl_reset sysfs attribute added to PCI devices that support CXL Reset. Signed-off-by: Srirangan Madhavan (cherry picked from https://lore.kernel.org/linux-cxl/20260306092322.148765-1-smadhavan@nvidia.com/) Signed-off-by: Jiandi An Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Signed-off-by: Brad Figg (cherry picked from commit 33b53e15c379ec657cc0a4905df161eb9162a3cf nv-kernels/24.04_linux-nvidia-6.17-next) Signed-off-by: Koba Ko --- Documentation/ABI/testing/sysfs-bus-pci | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index b767db2c52cb7..d67c733626b83 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -174,6 +174,28 @@ Description: similiar to writing 1 to their individual "reset" file, so use with caution. +What: /sys/bus/pci/devices/.../cxl_reset +Date: February 2026 +Contact: linux-cxl@vger.kernel.org +Description: + This attribute is only visible when the device advertises + CXL Reset Capable in the CXL DVSEC Capability register + (CXL r3.2, section 8.1.3). + + Writing 1 to this file triggers a CXL device reset which + affects CXL.cache and CXL.mem state on all CXL functions + (i.e. those not listed in the Non-CXL Function Map DVSEC, + section 8.1.4), not just CXL.io/PCIe state. This is + separate from the standard PCI reset interface because CXL + Reset has different scope. + + The reset will fail with -EBUSY if any CXL regions using this + device have drivers bound. Active regions are torn down as + part of the reset sequence. + + This attribute is registered by the CXL core when a CXL device + is discovered, independent of which driver binds the PCI device. + What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings