diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index b767db2c52cb7..d67c733626b83 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -174,6 +174,28 @@ Description:
 		similiar to writing 1 to their individual "reset" file,
 		so use with caution.
 
+What:		/sys/bus/pci/devices/.../cxl_reset
+Date:		February 2026
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		This attribute is only visible when the device advertises
+		CXL Reset Capable, Cache Capable and Mem Capable in the CXL
+		DVSEC Capability register (CXL r3.2, section 8.1.3).
+
+		Writing 1 to this file triggers a CXL device reset which
+		affects CXL.cache and CXL.mem state on all CXL functions
+		(i.e. those not listed in the Non-CXL Function Map DVSEC,
+		section 8.1.4), not just CXL.io/PCIe state. This is
+		separate from the standard PCI reset interface because CXL
+		Reset has different scope.
+
+		CXL memory of the device that is online as System RAM is
+		offlined and removed before the reset is issued. The write
+		fails, and no reset is issued, if that memory cannot be offlined.
+
+		This attribute is registered by the CXL core when a CXL device
+		is discovered, independent of which driver binds the PCI device.
+ What: /sys/bus/pci/devices/.../vpd Date: February 2008 Contact: Ben Hutchings diff --git a/debian.nvidia-bos/config/annotations b/debian.nvidia-bos/config/annotations index 392f5a93126da..854c543bb2a1c 100644 --- a/debian.nvidia-bos/config/annotations +++ b/debian.nvidia-bos/config/annotations @@ -6,6 +6,15 @@ include "../../debian.master/config/annotations" +CONFIG_ACPI_APEI_EINJ policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ACPI_APEI_EINJ note<'Required for CONFIG_ACPI_APEI_EINJ_CXL'> + +CONFIG_ACPI_APEI_EINJ_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ACPI_APEI_EINJ_CXL note<'CXL protocol error injection support via APEI EINJ'> + +CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION note<'Override debian.master amd64-only; arm64 selects this via arch/arm64/Kconfig since 4d873c5dc3ed'> + CONFIG_ARM64_ERRATUM_1902691 policy<{'arm64': 'y'}> CONFIG_ARM64_ERRATUM_1902691 note<'Required for Grace enablement'> @@ -36,6 +45,9 @@ CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE note<'Required for Grace enable CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE policy<{'arm64': 'y'}> CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE note<'Required for Grace enablement'> +CONFIG_CACHEMAINT_FOR_HOTPLUG policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_CACHEMAINT_FOR_HOTPLUG note<'Optional HiSilicon HHA cache maintenance driver; depends on GENERIC_CPU_CACHE_MAINTENANCE; not needed for NVIDIA platforms'> + CONFIG_ARM_FFA_TRANSPORT policy<{'arm64': 'y'}> CONFIG_ARM_FFA_TRANSPORT note<'LP: #2111511'> @@ -111,9 +123,24 @@ CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE note<'LP: #2028576: Perf governo CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL note<'LP: #2028576: Perf governor required for NVIDIA workloads'> +CONFIG_CXL_BUS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_BUS note<'Enable CXL bus support built-in; required for CXL Type-2 device 
and RAS support'> + +CONFIG_CXL_MEM policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_MEM note<'Auto-selected by CXL_PCI; required for CXL memory expansion and Type-2 device support'> + CONFIG_CXL_MEM_RAW_COMMANDS policy<{'amd64': 'n', 'arm64': 'y'}> CONFIG_CXL_MEM_RAW_COMMANDS note<'Enable CXL raw commands for memory devices'> +CONFIG_CXL_PCI policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_PCI note<'Enable CXL PCI management built-in; auto-selects CXL_MEM; required for CXL Type-2 device support'> + +CONFIG_CXL_PORT policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_PORT note<'Required for CXL port enumeration; defaults to CXL_BUS value'> + +CONFIG_CXL_RAS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_CXL_RAS note<'New def_bool replacing PCIEAER_CXL; auto-enabled with ACPI_APEI_GHES+PCIEAER+CXL_BUS; CXL RAS error handling support'> + CONFIG_DRM_NOUVEAU policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_DRM_NOUVEAU note<'Disable nouveau for NVIDIA kernels'> @@ -135,6 +162,12 @@ CONFIG_EFI_CAPSULE_LOADER note<'LP: #2067111'> CONFIG_ETM4X_IMPDEF_FEATURE policy<{'arm64': 'n'}> CONFIG_ETM4X_IMPDEF_FEATURE note<'Required for Grace enablement'> +CONFIG_FWCTL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_FWCTL note<'Selected by CXL_BUS when CXL_FEATURES is enabled; required for CXL feature mailbox access'> + +CONFIG_GENERIC_CPU_CACHE_MAINTENANCE policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_GENERIC_CPU_CACHE_MAINTENANCE note<'Selected by arm64 via arch/arm64/Kconfig since 4d873c5dc3ed; not selected by x86'> + CONFIG_GPIO_AAEON policy<{'amd64': '-'}> CONFIG_GPIO_AAEON note<'Disable all Ubuntu ODM drivers'> @@ -180,6 +213,9 @@ CONFIG_NOUVEAU_PLATFORM_DRIVER note<'Disable nouveau for NVIDIA CONFIG_NR_CPUS policy<{'amd64': '8192', 'arm64': '512'}> CONFIG_NR_CPUS note<'LP: #1864198'> +CONFIG_PCIEAER_CXL policy<{'amd64': '-', 'arm64': '-'}> +CONFIG_PCIEAER_CXL note<'Removed by commit d18f1b7beadf (PCI/AER: Replace PCIEAER_CXL symbol with CXL_RAS)'> + CONFIG_NVGRACE_EGM policy<{'arm64': 
'm'}> CONFIG_NVGRACE_EGM note<'LP: #2119656'> @@ -207,6 +243,9 @@ CONFIG_SAMPLE_CORESIGHT_SYSCFG note<'Required for Grace enablem CONFIG_SENSORS_AAEON policy<{'amd64': '-'}> CONFIG_SENSORS_AAEON note<'Disable all Ubuntu ODM drivers'> +CONFIG_SFC_CXL policy<{'amd64': 'n', 'arm64': 'n'}> +CONFIG_SFC_CXL note<'Solarflare SFC9100-family CXL Type-2 device support; not needed for NVIDIA platforms'> + CONFIG_SPI_TEGRA210_QUAD policy<{'arm64': 'y'}> CONFIG_SPI_TEGRA210_QUAD note<'Ensures the TPM is available before the IMA driver initializes'> @@ -225,6 +264,18 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'Disable all Ubuntu ODM dri CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'Required for Grace enablement'> +CONFIG_DEV_DAX policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX note<'Override debian.master m->y; required built-in for DEV_DAX_CXL=y'> + +CONFIG_DEV_DAX_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX_CXL note<'Override debian.master m->y; CXL RAM region DAX access; depends on CXL_BUS+CXL_REGION+DEV_DAX'> + +CONFIG_DEV_DAX_KMEM policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_DEV_DAX_KMEM note<'Override debian.master m->y; map CXL DAX devices as System-RAM'> + +CONFIG_PCI_CXL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_PCI_CXL note<'Hidden bool; auto-enabled by CXL_BUS; PCI core CXL DVSEC and HDM state save/restore support'> + CONFIG_VFIO_CONTAINER policy<{'amd64': 'y', 'arm64': 'n'}> CONFIG_VFIO_CONTAINER note<'LP: #2095028'> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5b0570df0fd9c..3861409ea69e3 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -97,6 +97,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, struct dentry *cxl_debugfs_create_dir(const char *dir); int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, enum cxl_partition_mode mode); +struct cxl_memdev_state; +int cxl_mem_get_partition_info(struct cxl_memdev_state *mds); int cxl_dpa_alloc(struct 
cxl_endpoint_decoder *cxled, u64 size); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); @@ -136,6 +138,8 @@ extern struct cxl_rwsem cxl_rwsem; int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); +void cxl_reset_sysfs_init(void); +void cxl_reset_sysfs_exit(void); enum cxl_poison_trace_type { CXL_POISON_TRACE_LIST, @@ -224,4 +228,6 @@ int cxl_set_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, u16 *return_code); #endif +resource_size_t cxl_rcd_component_reg_phys(struct device *dev, + struct cxl_dport *dport); #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 12386d9127054..bcf77e604dc3d 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1151,7 +1151,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL"); * * See CXL @8.2.9.5.2.1 Get Partition Info */ -static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) +int cxl_mem_get_partition_info(struct cxl_memdev_state *mds) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; struct cxl_mbox_get_partition_info pi; @@ -1307,55 +1307,6 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) return -EBUSY; } -static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) -{ - int i = info->nr_partitions; - - if (size == 0) - return; - - info->part[i].range = (struct range) { - .start = start, - .end = start + size - 1, - }; - info->part[i].mode = mode; - info->nr_partitions++; -} - -int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) -{ - struct cxl_dev_state *cxlds = &mds->cxlds; - struct device *dev = cxlds->dev; - int rc; - - if (!cxlds->media_ready) { - info->size = 0; - return 0; - } - - info->size = mds->total_bytes; - - if (mds->partition_align_bytes == 0) { - add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); - add_part(info, 
mds->volatile_only_bytes, - mds->persistent_only_bytes, CXL_PARTMODE_PMEM); - return 0; - } - - rc = cxl_mem_get_partition_info(mds); - if (rc) { - dev_err(dev, "Failed to query partition information\n"); - return rc; - } - - add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); - add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, - CXL_PARTMODE_PMEM); - - return 0; -} -EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); - int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; @@ -1521,23 +1472,21 @@ int cxl_mailbox_init(struct cxl_mailbox *cxl_mbox, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec) { struct cxl_memdev_state *mds; int rc; - mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); + mds = devm_cxl_dev_state_create(dev, CXL_DEVTYPE_CLASSMEM, serial, + dvsec, struct cxl_memdev_state, cxlds, + true); if (!mds) { dev_err(dev, "No memory available\n"); return ERR_PTR(-ENOMEM); } mutex_init(&mds->event.log_lock); - mds->cxlds.dev = dev; - mds->cxlds.reg_map.host = dev; - mds->cxlds.cxl_mbox.host = dev; - mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; - mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); if (rc == -EOPNOTSUPP) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 273c22118d3d8..759b43364ed7a 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "trace.h" #include "core.h" @@ -576,12 +577,85 @@ static const struct device_type cxl_memdev_type = { .groups = cxl_memdev_attribute_groups, }; +static const struct device_type cxl_accel_memdev_type = { + .name = "cxl_accel_memdev", + .release = cxl_memdev_release, + 
.devnode = cxl_memdev_devnode, +}; + bool is_cxl_memdev(const struct device *dev) { - return dev->type == &cxl_memdev_type; + return (dev->type == &cxl_memdev_type || + dev->type == &cxl_accel_memdev_type); } EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); +static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) +{ + int i = info->nr_partitions; + + if (size == 0) + return; + + info->part[i].range = (struct range) { + .start = start, + .end = start + size - 1, + }; + info->part[i].mode = mode; + info->nr_partitions++; +} + +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) +{ + struct cxl_dev_state *cxlds = &mds->cxlds; + struct device *dev = cxlds->dev; + int rc; + + if (!cxlds->media_ready) { + info->size = 0; + return 0; + } + + info->size = mds->total_bytes; + + if (mds->partition_align_bytes == 0) { + add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->volatile_only_bytes, + mds->persistent_only_bytes, CXL_PARTMODE_PMEM); + return 0; + } + + rc = cxl_mem_get_partition_info(mds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } + + add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, + CXL_PARTMODE_PMEM); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); + +/** + * cxl_set_capacity: initialize dpa by a driver without a mailbox. 
+ * + * @cxlds: pointer to cxl_dev_state + * @capacity: device volatile memory size + */ +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity) +{ + struct cxl_dpa_info range_info = { + .size = capacity, + }; + + add_part(&range_info, 0, capacity, CXL_PARTMODE_RAM); + return cxl_dpa_setup(cxlds, &range_info); +} +EXPORT_SYMBOL_NS_GPL(cxl_set_capacity, "CXL"); + /** * set_exclusive_cxl_commands() - atomically disable user cxl commands * @mds: The device state to operate on @@ -656,6 +730,30 @@ static void detach_memdev(struct work_struct *work) static struct lock_class_key cxl_memdev_key; +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox) +{ + struct cxl_dev_state *cxlds = devm_kzalloc(dev, size, GFP_KERNEL); + + if (!cxlds) + return NULL; + + cxlds->dev = dev; + cxlds->type = type; + cxlds->serial = serial; + cxlds->cxl_dvsec = dvsec; + cxlds->reg_map.host = dev; + cxlds->reg_map.resource = CXL_RESOURCE_NONE; + + if (has_mbox) + cxlds->cxl_mbox.host = dev; + + return cxlds; +} +EXPORT_SYMBOL_NS_GPL(_devm_cxl_dev_state_create, "CXL"); + static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, const struct file_operations *fops, const struct cxl_memdev_attach *attach) @@ -683,7 +781,10 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, dev->parent = cxlds->dev; dev->bus = &cxl_bus_type; dev->devt = MKDEV(cxl_mem_major, cxlmd->id); - dev->type = &cxl_memdev_type; + if (cxlds->type == CXL_DEVTYPE_DEVMEM) + dev->type = &cxl_accel_memdev_type; + else + dev->type = &cxl_memdev_type; device_set_pm_not_required(dev); INIT_WORK(&cxlmd->detach_work, detach_memdev); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index f96ce884a2130..6944f0f74790b 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -4,8 +4,11 @@ #include #include #include +#include +#include #include #include +#include #include #include 
#include @@ -13,6 +16,9 @@ #include "core.h" #include "trace.h" +/* Initial sibling array capacity: covers max non-ARI functions per slot */ +#define CXL_RESET_SIBLINGS_INIT 8 + /** * DOC: cxl core pci * @@ -696,6 +702,63 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, "CXL"); +static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, + struct cxl_register_map *map, + struct cxl_dport *dport) +{ + resource_size_t component_reg_phys; + + *map = (struct cxl_register_map) { + .host = &pdev->dev, + .resource = CXL_RESOURCE_NONE, + }; + + component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); + if (component_reg_phys == CXL_RESOURCE_NONE) + return -ENXIO; + + map->resource = component_reg_phys; + map->reg_type = CXL_REGLOC_RBI_COMPONENT; + map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + return 0; +} + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + int rc; + + rc = cxl_find_regblock(pdev, type, map); + + /* + * If the Register Locator DVSEC does not exist, check if it + * is an RCH and try to extract the Component Registers from + * an RCRB. + */ + if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { + struct cxl_dport *dport; + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return -EPROBE_DEFER; + + rc = cxl_rcrb_get_comp_regs(pdev, map, dport); + if (rc) + return rc; + + rc = cxl_dport_map_rcd_linkcap(pdev, dport); + if (rc) + return rc; + + } else if (rc) { + return rc; + } + + return cxl_setup_regs(map); +} +EXPORT_SYMBOL_NS_GPL(cxl_pci_setup_regs, "CXL"); + int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) { int speed, bw; @@ -869,3 +932,575 @@ int cxl_port_get_possible_dports(struct cxl_port *port) return ctx.count; } + +/* + * CXL Reset support - core-provided reset logic for CXL devices. 
+ *
+ * These functions implement the CXL reset sequence.
+ */
+
+/*
+ * walk_iomem_res_desc() callback. Returning non-zero stops the walk on the
+ * first range it is called for, so a positive walk result means "at least
+ * one busy System RAM range overlaps the queried span".
+ */
+static int cxl_is_system_ram(struct resource *res, void *arg)
+{
+	return 1;
+}
+
+/*
+ * If CXL memory backed by this decoder is online as System RAM, offline
+ * and remove it per CXL spec requirements before issuing CXL Reset.
+ * Returns 0 if memory was not online or was successfully offlined.
+ *
+ * device_for_each_child() callback: a non-zero return aborts the walk and
+ * is handed back to the caller.
+ */
+static int __maybe_unused cxl_offline_memory(struct device *dev, void *data)
+{
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_region *cxlr;
+	struct cxl_region_params *p;
+	int rc;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	guard(rwsem_read)(&cxl_rwsem.region);
+
+	cxlr = cxled->cxld.region;
+	if (!cxlr)
+		return 0;
+
+	p = &cxlr->params;
+	if (!p->res)
+		return 0;
+
+	if (walk_iomem_res_desc(IORES_DESC_NONE,
+				IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+				p->res->start, p->res->end, NULL,
+				cxl_is_system_ram) <= 0)
+		return 0;
+
+	dev_info(dev, "Offlining CXL memory [%pr] for reset\n", p->res);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	rc = offline_and_remove_memory(p->res->start, resource_size(p->res));
+	if (rc)
+		dev_err(dev,
+			"Failed to offline CXL memory [%pr]: %d\n",
+			p->res, rc);
+#else
+	dev_err(dev, "Memory hotremove not supported, cannot offline CXL memory\n");
+	rc = -EOPNOTSUPP;
+#endif
+
+	return rc;
+}
+
+/*
+ * Offline the System RAM behind every endpoint decoder of @cxlmd.
+ * Returns -ENODEV without device state, 0 when no endpoint port is attached,
+ * otherwise the first non-zero result of cxl_offline_memory().
+ */
+static int __maybe_unused cxl_reset_prepare_memdev(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *endpoint;
+
+	if (!cxlmd || !cxlmd->cxlds)
+		return -ENODEV;
+
+	endpoint = cxlmd->endpoint;
+	if (IS_ERR_OR_NULL(endpoint))
+		return 0;
+
+	return device_for_each_child(&endpoint->dev, NULL,
+				     cxl_offline_memory);
+}
+
+/*
+ * device_for_each_child() callback: invalidate CPU caches for the region
+ * resource mapped by an endpoint decoder. Always returns 0 so that the walk
+ * visits every decoder.
+ */
+static int __maybe_unused cxl_decoder_flush_cache(struct device *dev, void *data)
+{
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_region *cxlr;
+	struct resource *res;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	guard(rwsem_read)(&cxl_rwsem.region);
+
+	cxlr = cxled->cxld.region;
+	if (!cxlr || !cxlr->params.res)
+		return 0;
+
+	res = cxlr->params.res;
+	cpu_cache_invalidate_memregion(res->start, resource_size(res));
+	return 0;
+}
+
+/*
+ * Best-effort CPU cache invalidation for all regions of @cxlmd; a no-op when
+ * there is no endpoint port or the platform cannot invalidate by range.
+ */
+static int __maybe_unused cxl_reset_flush_cpu_caches(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *endpoint;
+
+	if (!cxlmd)
+		return 0;
+
+	endpoint = cxlmd->endpoint;
+	if (IS_ERR_OR_NULL(endpoint))
+		return 0;
+
+	if (!cpu_cache_has_invalidate_memregion())
+		return 0;
+
+	device_for_each_child(&endpoint->dev, NULL, cxl_decoder_flush_cache);
+	return 0;
+}
+
+/*
+ * Serialize all CXL reset operations globally.
+ */
+static DEFINE_MUTEX(cxl_reset_mutex);
+
+struct cxl_reset_context {
+	struct pci_dev *target;
+	struct pci_dev **pci_functions;
+	int pci_func_count;
+	int pci_func_cap;
+};
+
+/*
+ * Check if a sibling function is non-CXL using the Non-CXL Function Map
+ * DVSEC. Returns true if fn is listed as non-CXL, false otherwise (including
+ * on any read failure).
+ */
+static bool cxl_is_non_cxl_function(struct pci_dev *pdev,
+				    u16 func_map_dvsec, int fn)
+{
+	int reg, bit;
+	u32 map;
+
+	if (pci_ari_enabled(pdev->bus)) {
+		reg = fn / 32;
+		bit = fn % 32;
+	} else {
+		reg = 0;
+		bit = fn;
+	}
+
+	if (pci_read_config_dword(pdev,
+			func_map_dvsec + PCI_DVSEC_CXL_FUNCTION_MAP_REG + (reg * 4),
+			&map))
+		return false;
+
+	return map & BIT(bit);
+}
+
+struct cxl_reset_walk_ctx {
+	struct cxl_reset_context *ctx;
+	u16 func_map_dvsec;
+	int error;
+	bool ari;
+};
+
+/*
+ * pci_walk_bus() callback: take a reference on every CXL.cache/CXL.mem
+ * capable sibling function of the reset target and record it in the context.
+ * Returns non-zero (stopping the walk) only on allocation failure, which is
+ * reported through wctx->error.
+ */
+static int cxl_reset_collect_sibling(struct pci_dev *func, void *data)
+{
+	struct cxl_reset_walk_ctx *wctx = data;
+	struct cxl_reset_context *ctx = wctx->ctx;
+	struct pci_dev *pdev = ctx->target;
+	u16 dvsec, cap;
+	int fn;
+
+	if (func == pdev)
+		return 0;
+
+	/*
+	 * pci_walk_bus() also descends into buses below bridges on this bus;
+	 * devices there are never functions of the target device.
+	 */
+	if (func->bus != pdev->bus)
+		return 0;
+
+	if (!wctx->ari &&
+	    PCI_SLOT(func->devfn) != PCI_SLOT(pdev->devfn))
+		return 0;
+
+	fn = wctx->ari ? func->devfn : PCI_FUNC(func->devfn);
+	if (wctx->func_map_dvsec &&
+	    cxl_is_non_cxl_function(pdev, wctx->func_map_dvsec, fn))
+		return 0;
+
+	/* Only coordinate with siblings that have CXL.cachemem */
+	dvsec = pci_find_dvsec_capability(func, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return 0;
+	if (pci_read_config_word(func, dvsec + PCI_DVSEC_CXL_CAP, &cap))
+		return 0;
+	if (!(cap & (PCI_DVSEC_CXL_CACHE_CAPABLE |
+		     PCI_DVSEC_CXL_MEM_CAPABLE)))
+		return 0;
+
+	/* Grow sibling array; double capacity for ARI devices when running out of space */
+	if (ctx->pci_func_count >= ctx->pci_func_cap) {
+		struct pci_dev **new;
+		int new_cap = ctx->pci_func_cap ? ctx->pci_func_cap * 2
+						: CXL_RESET_SIBLINGS_INIT;
+
+		/* krealloc_array() checks the size multiplication for overflow */
+		new = krealloc_array(ctx->pci_functions, new_cap,
+				     sizeof(*new), GFP_KERNEL);
+		if (!new) {
+			wctx->error = -ENOMEM;
+			return 1;
+		}
+		ctx->pci_functions = new;
+		ctx->pci_func_cap = new_cap;
+	}
+
+	pci_dev_get(func);
+	ctx->pci_functions[ctx->pci_func_count++] = func;
+	return 0;
+}
+
+/* Drop the references taken by cxl_reset_collect_sibling() and free the array. */
+static void cxl_pci_functions_reset_release(struct cxl_reset_context *ctx)
+{
+	int i;
+
+	for (i = 0; i < ctx->pci_func_count; i++)
+		pci_dev_put(ctx->pci_functions[i]);
+	kfree(ctx->pci_functions);
+	ctx->pci_functions = NULL;
+	ctx->pci_func_count = 0;
+	ctx->pci_func_cap = 0;
+}
+
+/*
+ * Collect the CXL sibling functions of ctx->target, then lock each one and
+ * save/disable it. Returns 0 on success or a negative errno; on failure
+ * nothing is left locked or referenced.
+ */
+static int cxl_pci_functions_reset_prepare(struct cxl_reset_context *ctx)
+{
+	struct pci_dev *pdev = ctx->target;
+	struct cxl_reset_walk_ctx wctx;
+	int i;
+
+	ctx->pci_func_count = 0;
+	ctx->pci_functions = NULL;
+	ctx->pci_func_cap = 0;
+
+	wctx.ctx = ctx;
+	wctx.ari = pci_ari_enabled(pdev->bus);
+	wctx.error = 0;
+	wctx.func_map_dvsec = pci_find_dvsec_capability(pdev,
+			PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_FUNCTION_MAP);
+
+	/* Collect CXL.cachemem siblings under pci_bus_sem */
+	pci_walk_bus(pdev->bus, cxl_reset_collect_sibling, &wctx);
+	if (wctx.error) {
+		cxl_pci_functions_reset_release(ctx);
+		return wctx.error;
+	}
+
+	/* Lock and save/disable siblings outside pci_bus_sem */
+	for (i = 0; i < ctx->pci_func_count; i++) {
+		pci_dev_lock(ctx->pci_functions[i]);
+		pci_dev_save_and_disable(ctx->pci_functions[i]);
+	}
+
+	return 0;
+}
+
+/* Undo cxl_pci_functions_reset_prepare(): restore, unlock and release siblings. */
+static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx)
+{
+	int i;
+
+	for (i = 0; i < ctx->pci_func_count; i++) {
+		pci_dev_restore(ctx->pci_functions[i]);
+		pci_dev_unlock(ctx->pci_functions[i]);
+	}
+	cxl_pci_functions_reset_release(ctx);
+}
+
+/*
+ * CXL device reset execution
+ *
+ * Returns 0 on success or a negative errno. The PCI config accessors return
+ * positive PCIBIOS_* codes on failure; those are translated with
+ * pcibios_err_to_errno() so callers never see a positive value.
+ *
+ * NOTE(review): the failure paths return with Disable Caching still set in
+ * Control2 - confirm this is the intended state for a device whose
+ * reset did not complete.
+ */
+static int cxl_dev_reset(struct pci_dev *pdev, int dvsec)
+{
+	static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 };
+	u16 cap, ctrl2, status2;
+	u32 timeout_ms;
+	int rc, idx;
+
+	if (!pci_wait_for_pending_transaction(pdev))
+		pci_err(pdev, "timed out waiting for pending transactions\n");
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	/*
+	 * Disable caching and initiate cache writeback+invalidation if the
+	 * device supports it. Poll for completion.
+	 * Per CXL r3.2 section 9.6, software may use the cache size from
+	 * DVSEC CXL Capability2 to compute a suitable timeout; we use a
+	 * default of 10ms.
+	 */
+	if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) {
+		u32 wbi_poll_us = 100;
+		s32 wbi_remaining_us = 10000;
+
+		ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return pcibios_err_to_errno(rc);
+
+		ctrl2 |= PCI_DVSEC_CXL_INIT_CACHE_WBI;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return pcibios_err_to_errno(rc);
+
+		do {
+			usleep_range(wbi_poll_us, wbi_poll_us + 1);
+			wbi_remaining_us -= wbi_poll_us;
+			rc = pci_read_config_word(pdev,
+						  dvsec + PCI_DVSEC_CXL_STATUS2,
+						  &status2);
+			if (rc)
+				return pcibios_err_to_errno(rc);
+		} while (!(status2 & PCI_DVSEC_CXL_CACHE_INV) &&
+			 wbi_remaining_us > 0);
+
+		if (!(status2 & PCI_DVSEC_CXL_CACHE_INV)) {
+			pci_err(pdev, "CXL cache WB+I timed out\n");
+			return -ETIMEDOUT;
+		}
+	} else if (cap & PCI_DVSEC_CXL_CACHE_CAPABLE) {
+		ctrl2 |= PCI_DVSEC_CXL_DISABLE_CACHING;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return pcibios_err_to_errno(rc);
+	}
+
+	if (cap & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) {
+		rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					  &ctrl2);
+		if (rc)
+			return pcibios_err_to_errno(rc);
+
+		ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN;
+		rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					   ctrl2);
+		if (rc)
+			return pcibios_err_to_errno(rc);
+	}
+
+	/* Clamp the encoded timeout field to the last table entry */
+	idx = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, cap);
+	if (idx >= ARRAY_SIZE(reset_timeout_ms))
+		idx = ARRAY_SIZE(reset_timeout_ms) - 1;
+	timeout_ms = reset_timeout_ms[idx];
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	ctrl2 |= PCI_DVSEC_CXL_INIT_CXL_RST;
+	rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	msleep(timeout_ms);
+
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_STATUS2,
+				  &status2);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	if (status2 & PCI_DVSEC_CXL_RST_ERR) {
+		pci_err(pdev, "CXL reset error\n");
+		return -EIO;
+	}
+
+	if (!(status2 & PCI_DVSEC_CXL_RST_DONE)) {
+		pci_err(pdev, "CXL reset timeout\n");
+		return -ETIMEDOUT;
+	}
+
+	/* Reset completed: re-enable caching */
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, &ctrl2);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	ctrl2 &= ~PCI_DVSEC_CXL_DISABLE_CACHING;
+	rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2);
+
+	return pcibios_err_to_errno(rc);
+}
+
+/* bus_find_device() match: a CXL memdev whose parent is the given device. */
+static int match_memdev_by_parent(struct device *dev, const void *parent)
+{
+	return is_cxl_memdev(dev) && dev->parent == parent;
+}
+
+/*
+ * Run the full reset sequence for @pdev under the global reset mutex and the
+ * PCI device lock: offline memory and flush CPU caches (when a memdev is
+ * attached), save/disable the target and its CXL siblings, issue the reset,
+ * then restore everything. @cxlmd may be NULL.
+ * Returns 0 or a negative errno.
+ */
+static int __cxl_do_reset(struct pci_dev *pdev, struct cxl_memdev *cxlmd,
+			  int dvsec)
+{
+	struct cxl_reset_context ctx = { .target = pdev };
+	int rc = 0;
+
+	mutex_lock(&cxl_reset_mutex);
+	pci_dev_lock(pdev);
+
+	if (cxlmd) {
+		/* The memdev lock is only held for this scope */
+		guard(device)(&cxlmd->dev);
+
+		rc = cxl_reset_prepare_memdev(cxlmd);
+		if (!rc)
+			cxl_reset_flush_cpu_caches(cxlmd);
+	}
+
+	if (!rc) {
+		pci_dev_save_and_disable(pdev);
+
+		rc = cxl_pci_functions_reset_prepare(&ctx);
+		if (!rc) {
+			rc = cxl_dev_reset(pdev, dvsec);
+			cxl_pci_functions_reset_done(&ctx);
+		}
+
+		pci_dev_restore(pdev);
+	}
+
+	pci_dev_unlock(pdev);
+	mutex_unlock(&cxl_reset_mutex);
+
+	return rc;
+}
+
+/*
+ * Entry point for the sysfs trigger: locate the CXL Device DVSEC and the
+ * memdev (if any) registered below @pdev, then run the reset.
+ * Returns -ENODEV when the DVSEC is absent.
+ */
+static int cxl_do_reset(struct pci_dev *pdev)
+{
+	int dvsec;
+
+	dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return -ENODEV;
+
+	/* Reference from bus_find_device() is dropped on return */
+	struct device *memdev __free(put_device) =
+		bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
+				match_memdev_by_parent);
+
+	return __cxl_do_reset(pdev, memdev ? to_cxl_memdev(memdev) : NULL,
+			      dvsec);
+}
+
+/*
+ * CXL reset sysfs attribute management.
+ *
+ * The cxl_reset attribute is added to PCI devices that advertise CXL Reset
+ * capability. Managed entirely by the CXL module via subsys_interface on
+ * pci_bus_type, avoiding cross-module symbol dependencies between the PCI
+ * core (built-in) and CXL (potentially modular).
+ *
+ * subsys_interface handles existing devices at register time and hot-plug
+ * add/remove automatically. On unregister, remove_dev runs for all tracked
+ * devices under bus core serialization.
+ */
+
+/*
+ * True when the CXL Device DVSEC of @pdev advertises CXL Reset Capable and
+ * both Cache Capable and Mem Capable. Any config read failure counts as
+ * "not capable".
+ */
+static bool pci_cxl_reset_capable(struct pci_dev *pdev)
+{
+	int dvsec;
+	u16 cap;
+
+	dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return false;
+
+	if (pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CAP, &cap))
+		return false;
+
+	if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE) ||
+	    !(cap & PCI_DVSEC_CXL_MEM_CAPABLE))
+		return false;
+
+	return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE);
+}
+
+/*
+ * sysfs write handler: only the value "1" triggers a reset.
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t cxl_reset_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int rc;
+
+	if (!sysfs_streq(buf, "1"))
+		return -EINVAL;
+
+	rc = cxl_do_reset(pdev);
+
+	/*
+	 * The reset path can hand back a positive PCIBIOS_* code from the
+	 * config accessors; returned as-is it would be taken for a byte
+	 * count. pcibios_err_to_errno() leaves negative errnos untouched.
+	 */
+	return rc ? pcibios_err_to_errno(rc) : count;
+}
+static DEVICE_ATTR_WO(cxl_reset);
+
+/* Hide the attribute on devices that are not CXL Reset capable. */
+static umode_t cxl_reset_attr_is_visible(struct kobject *kobj,
+					 struct attribute *a, int n)
+{
+	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+
+	if (!pci_cxl_reset_capable(pdev))
+		return 0;
+
+	return a->mode;
+}
+
+static struct attribute *cxl_reset_attrs[] = {
+	&dev_attr_cxl_reset.attr,
+	NULL,
+};
+
+static const struct attribute_group cxl_reset_attr_group = {
+	.attrs = cxl_reset_attrs,
+	.is_visible = cxl_reset_attr_is_visible,
+};
+
+/* subsys_interface add hook: create the group on capable PCI devices only. */
+static int cxl_reset_add_dev(struct device *dev,
+			     struct subsys_interface *sif)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!pci_cxl_reset_capable(pdev))
+		return 0;
+
+	return sysfs_create_group(&dev->kobj, &cxl_reset_attr_group);
+}
+
+/*
+ * subsys_interface remove hook. The capability is probed again so that the
+ * group is only removed from devices for which the add hook created it.
+ */
+static void cxl_reset_remove_dev(struct device *dev,
+				 struct subsys_interface *sif)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!pci_cxl_reset_capable(pdev))
+		return;
+
+	sysfs_remove_group(&dev->kobj, &cxl_reset_attr_group);
+}
+
+static struct subsys_interface cxl_reset_interface = {
+	.name = "cxl_reset",
.subsys = &pci_bus_type, + .add_dev = cxl_reset_add_dev, + .remove_dev = cxl_reset_remove_dev, +}; + +void cxl_reset_sysfs_init(void) +{ + int rc; + + rc = subsys_interface_register(&cxl_reset_interface); + if (rc) + pr_warn("CXL: failed to register cxl_reset interface (%d)\n", + rc); +} + +void cxl_reset_sysfs_exit(void) +{ + subsys_interface_unregister(&cxl_reset_interface); +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c5aacd7054f1d..f4f7dad473beb 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -2530,6 +2531,8 @@ static __init int cxl_core_init(void) if (rc) goto err_ras; + cxl_reset_sysfs_init(); + return 0; err_ras: @@ -2545,6 +2548,7 @@ static __init int cxl_core_init(void) static void cxl_core_exit(void) { + cxl_reset_sysfs_exit(); cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 9ad83813a5e60..455a2c090589b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -1375,57 +1376,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) return 0; } +static inline u64 get_selector(u64 ways, u64 gran) +{ + if (!is_power_of_2(ways)) + ways /= 3; + + if (!is_power_of_2(ways) || !is_power_of_2(gran)) + return 0; + + return (ways - 1) * gran; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_root_decoder *cxlrd = cxlr->cxlrd; - int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_ep *ep = cxl_ep_load(port, cxlmd); struct cxl_region_params *p = 
&cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; - struct cxl_switch_decoder *cxlsd; + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev); struct cxl_port *iter = port; - u16 eig, peig; - u8 eiw, peiw; + int ig, iw = cxl_rr->nr_targets, rc, pos = cxled->pos; + int distance, parent_distance; + u64 selector, cxlr_sel; + u16 eig; + u8 eiw; /* * While root level decoders support x3, x6, x12, switch level * decoders only support powers of 2 up to x16. */ - if (!is_power_of_2(cxl_rr->nr_targets)) { + if (!is_power_of_2(iw)) { dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - cxl_rr->nr_targets); + dev_name(port->uport_dev), dev_name(&port->dev), iw); return -EINVAL; } - cxlsd = to_cxl_switch_decoder(&cxld->dev); - if (cxl_rr->nr_targets_set) { - int i, distance = 1; - struct cxl_region_ref *cxl_rr_iter; + if (iw > 8 || iw > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s:%s: ways: %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), iw, cxlsd->nr_targets); + return -ENXIO; + } - /* - * The "distance" between peer downstream ports represents which - * endpoint positions in the region interleave a given port can - * host. - * - * For example, at the root of a hierarchy the distance is - * always 1 as every index targets a different host-bridge. At - * each subsequent switch level those ports map every Nth region - * position where N is the width of the switch == distance. - */ - do { - cxl_rr_iter = cxl_rr_load(iter, cxlr); - distance *= cxl_rr_iter->nr_targets; - iter = to_cxl_port(iter->dev.parent); - } while (!is_cxl_root(iter)); - distance *= cxlrd->cxlsd.cxld.interleave_ways; + /* + * Calculate the effective granularity and ways to determine + * HPA bits used as target selectors of the interleave set. + * Use this to check if the root decoder and all subsequent + * HDM decoders only use bits from that range as selectors. 
+ * + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. + */ + + /* Start with the root decoder's selector and distance. */ + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways, + cxlrd->cxlsd.cxld.interleave_granularity); + distance = cxlrd->cxlsd.cxld.interleave_ways; + if (!is_power_of_2(distance)) + distance /= 3; + + for (iter = parent_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr); + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder; + u64 cxld_sel; + + if (cxld_iter->interleave_ways == 1) + continue; + + cxld_sel = get_selector(cxld_iter->interleave_ways, + cxld_iter->interleave_granularity); + + if (cxld_sel & selector) { + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxld_sel, selector); + return -ENXIO; + } - for (i = 0; i < cxl_rr->nr_targets_set; i++) + selector |= cxld_sel; + distance *= cxl_rr_iter->nr_targets; + } + + parent_distance = distance; + distance *= iw; + + /* The combined selector bits must fit the region selector. */ + cxlr_sel = get_selector(p->interleave_ways, + p->interleave_granularity); + + if ((cxlr_sel & selector) != selector) { + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n", + dev_name(iter->uport_dev), + dev_name(&iter->dev), cxlr_sel, selector); + return -ENXIO; + } + + /* Calculate remaining selector bits available for use. 
*/ + selector = cxlr_sel & ~selector; + + if (cxl_rr->nr_targets_set) { + for (int i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, distance); @@ -1436,87 +1499,40 @@ static int cxl_port_setup_targets(struct cxl_port *port, goto add_target; } - if (is_cxl_root(parent_port)) { + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + ig = cxld->interleave_granularity; + else /* + * Set the interleave granularity with each interleave + * level to a multiple of its parent port interleave + * ways. Beginning with the granularity of the root + * decoder set to the region granularity (starting + * with the inner selector bits of the HPA), the + * granularity is increased with each level. Calculate + * this using the parent distance and region + * granularity. + * * Root decoder IG is always set to value in CFMWS which * may be different than this region's IG. We can use the * region's IG here since interleave_granularity_store() * does not allow interleaved host-bridges with * root IG != region IG. */ - parent_ig = p->interleave_granularity; - parent_iw = cxlrd->cxlsd.cxld.interleave_ways; - /* - * For purposes of address bit routing, use power-of-2 math for - * switch ports. 
- */ - if (!is_power_of_2(parent_iw)) - parent_iw /= 3; - } else { - struct cxl_region_ref *parent_rr; - struct cxl_decoder *parent_cxld; - - parent_rr = cxl_rr_load(parent_port, cxlr); - parent_cxld = parent_rr->decoder; - parent_ig = parent_cxld->interleave_granularity; - parent_iw = parent_cxld->interleave_ways; - } + ig = p->interleave_granularity * parent_distance; - rc = granularity_to_eig(parent_ig, &peig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_ig); - return rc; - } - - rc = ways_to_eiw(parent_iw, &peiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", - dev_name(parent_port->uport_dev), - dev_name(&parent_port->dev), parent_iw); - return rc; - } - - iw = cxl_rr->nr_targets; rc = ways_to_eiw(iw, &eiw); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), iw); - return rc; - } - - /* - * Interleave granularity is a multiple of @parent_port granularity. - * Multiplier is the parent port interleave ways. 
- */ - rc = granularity_to_eig(parent_ig * parent_iw, &eig); - if (rc) { - dev_dbg(&cxlr->dev, - "%s: invalid granularity calculation (%d * %d)\n", - dev_name(&parent_port->dev), parent_ig, parent_iw); - return rc; - } - - rc = eig_to_granularity(eig, &ig); - if (rc) { - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", - dev_name(port->uport_dev), dev_name(&port->dev), - 256 << eig); - return rc; - } + if (!rc) + rc = granularity_to_eig(ig, &eig); - if (iw > 8 || iw > cxlsd->nr_targets) { - dev_dbg(&cxlr->dev, - "%s:%s:%s: ways: %d overflows targets: %d\n", + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n", dev_name(port->uport_dev), dev_name(&port->dev), - dev_name(&cxld->dev), iw, cxlsd->nr_targets); + iw, ig, selector); return -ENXIO; } if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || - (iw > 1 && cxld->interleave_granularity != ig) || !spa_maps_hpa(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, @@ -2185,7 +2201,9 @@ __cxl_decoder_detach(struct cxl_region *cxlr, cxled->part = -1; if (p->state > CXL_CONFIG_ACTIVE) { - cxl_region_decode_reset(cxlr, p->interleave_ways); + if (!test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; } @@ -2467,6 +2485,41 @@ static void unregister_region(void *_cxlr) put_device(&cxlr->dev); } +static void cxl_endpoint_region_autoremove(void *_cxlr); + +static void cxl_region_release_action(struct cxl_region *cxlr) +{ + struct cxl_port *port = cxlrd_to_port(cxlr->cxlrd); + + if (cxlr->type != CXL_DECODER_DEVMEM) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return; + } + + if (cxlr->params.nr_targets) { + struct cxl_endpoint_decoder *cxled = cxlr->params.targets[0]; + struct cxl_port *endpoint = cxled_to_port(cxled); + + guard(device)(&endpoint->dev); + if (cxlr->detach) { + void 
(*detach)(void *data) = cxlr->detach; + void *detach_data = cxlr->detach_data; + + cxlr->detach = NULL; + cxlr->detach_data = NULL; + devm_release_action(&endpoint->dev, detach, detach_data); + devm_release_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + } else { + unregister_region(cxlr); + } + return; + } + + unregister_region(cxlr); +} + static struct lock_class_key cxl_region_key; static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) @@ -2619,9 +2672,16 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, if (rc) goto err; - rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); - if (rc) - return ERR_PTR(rc); + /* + * For accelerators/type2, region release linked to endpoint device. + * See handling of cxl_endpoint_region_autoremove() below by + * cxl_memdev_attach_region(). + */ + if (type == CXL_DECODER_HOSTONLYMEM) { + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + } dev_dbg(port->uport_dev, "%s: created %s\n", dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); @@ -2650,7 +2710,8 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_partition_mode mode, int id) + enum cxl_partition_mode mode, int id, + enum cxl_decoder_type type) { int rc; @@ -2672,7 +2733,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, return ERR_PTR(-EBUSY); } - return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM); + return devm_cxl_add_region(cxlrd, id, mode, type); } static ssize_t create_region_store(struct device *dev, const char *buf, @@ -2686,7 +2747,7 @@ static ssize_t create_region_store(struct device *dev, const char *buf, if (rc != 1) return -EINVAL; - cxlr = __create_region(cxlrd, mode, id); + cxlr = __create_region(cxlrd, mode, id, CXL_DECODER_HOSTONLYMEM); if (IS_ERR(cxlr)) return 
PTR_ERR(cxlr); @@ -2743,14 +2804,13 @@ static ssize_t delete_region_store(struct device *dev, const char *buf, size_t len) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); - struct cxl_port *port = to_cxl_port(dev->parent); struct cxl_region *cxlr; cxlr = cxl_find_region_by_name(cxlrd, buf); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); put_device(&cxlr->dev); return len; @@ -3897,7 +3957,6 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, { struct cxl_endpoint_decoder *cxled = ctx->cxled; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct cxl_dev_state *cxlds = cxlmd->cxlds; int rc, part = READ_ONCE(cxled->part); struct cxl_region *cxlr; @@ -3912,7 +3971,8 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, - atomic_read(&cxlrd->region_id)); + atomic_read(&cxlrd->region_id), + cxled->cxld.target_type); } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); if (IS_ERR(cxlr)) { @@ -3925,7 +3985,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = __construct_region(cxlr, ctx); if (rc) { - devm_release_action(port->uport_dev, unregister_region, cxlr); + cxl_region_release_action(cxlr); return ERR_PTR(rc); } @@ -4183,6 +4243,35 @@ static int cxl_region_setup_poison(struct cxl_region *cxlr) return devm_add_action_or_reset(dev, remove_debugfs, dentry); } +static int region_contains_soft_reserve(struct device *dev, void *data) +{ + struct resource *res = data; + struct cxl_region *cxlr; + struct cxl_region_params *p; + + if (!is_cxl_region(dev)) + return 0; + + cxlr = to_cxl_region(dev); + p = &cxlr->params; + + if (p->state != CXL_CONFIG_COMMIT) + return 0; + + if (!p->res) + return 0; + + return resource_contains(p->res, res) ? 
1 : 0; +} + +bool cxl_region_contains_soft_reserve(struct resource *res) +{ + guard(rwsem_read)(&cxl_rwsem.region); + return bus_for_each_dev(&cxl_bus_type, NULL, res, + region_contains_soft_reserve) != 0; +} +EXPORT_SYMBOL_GPL(cxl_region_contains_soft_reserve); + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; @@ -4208,6 +4297,135 @@ static int cxl_region_can_probe(struct cxl_region *cxlr) return 0; } +static int first_mapped_decoder(struct device *dev, const void *data) +{ + struct cxl_endpoint_decoder *cxled; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (cxled->cxld.region) + return 1; + + return 0; +} + +/* + * As this is running in endpoint port remove context it does not race cxl_root + * destruction since port topologies are always removed depth first. + */ +static void cxl_endpoint_region_autoremove(void *_cxlr) +{ + unregister_region(_cxlr); +} + +/** + * cxl_memdev_attach_region - bind region to accelerator memdev + * + * @cxlmd: a pointer to cxl_memdev to use + * @attach: a pointer to region attach struct with callbacks for + * safely working with a region range by the caller + * + * Returns 0 or error. + */ +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, + struct cxl_attach_region *attach) +{ + struct cxl_port *endpoint = cxlmd->endpoint; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + int rc; + + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + if (!endpoint) + return -ENXIO; + + { + /* hold endpoint lock to setup autoremove of the region */ + guard(device)(&endpoint->dev); + if (!endpoint->dev.driver) + return -ENXIO; + + { + guard(rwsem_read)(&cxl_rwsem.region); + guard(rwsem_read)(&cxl_rwsem.dpa); + + /* + * TODO auto-instantiate a region, for now assume this will + * find an auto-region. 
+ */ + struct device *dev __free(put_device) = + device_find_child(&endpoint->dev, NULL, + first_mapped_decoder); + + if (!dev) { + dev_dbg(cxlmd->cxlds->dev, + "no region found for memdev %s\n", + dev_name(&cxlmd->dev)); + return -ENXIO; + } + + cxled = to_cxl_endpoint_decoder(dev); + cxlr = cxled->cxld.region; + + if (cxlr->params.state < CXL_CONFIG_COMMIT) { + dev_dbg(cxlmd->cxlds->dev, + "region %s not committed for memdev %s\n", + dev_name(&cxlr->dev), dev_name(&cxlmd->dev)); + return -ENXIO; + } + + if (cxlr->params.nr_targets > 1) { + dev_dbg(cxlmd->cxlds->dev, + "Only attach to local non-interleaved region\n"); + return -ENXIO; + } + + attach->region = (struct range) { + .start = cxlr->params.res->start, + .end = cxlr->params.res->end, + }; + + /* + * With endpoint locked leave the caller to safely work + * with the region range. + */ + rc = attach->attach(attach->data); + if (rc) + return rc; + + /* Only teardown regions that pass validation, ignore the rest */ + rc = devm_add_action(&endpoint->dev, + cxl_endpoint_region_autoremove, cxlr); + if (rc) { + attach->detach(attach->data); + goto err_unregister; + } + + /* Link type2 driver callback for stopping use of the region range. */ + rc = devm_add_action_or_reset(&endpoint->dev, + attach->detach, attach->data); + if (rc) { + devm_remove_action(&endpoint->dev, + cxl_endpoint_region_autoremove, + cxlr); + goto err_unregister; + } + + cxlr->detach = attach->detach; + cxlr->detach_data = attach->data; + + return 0; + } +err_unregister: + unregister_region(cxlr); + return rc; + } +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_attach_region, "CXL"); + static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); @@ -4218,6 +4436,13 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; + /* + * HDM-D[B] (device-memory) regions have accelerator specific usage. + * Skip device-dax registration. 
+ */ + if (cxlr->type == CXL_DECODER_DEVMEM) + return 0; + /* * From this point on any path that changes the region's state away from * CXL_CONFIG_COMMIT is also responsible for releasing the driver. diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index a010b32143422..20c2d9fbcfe7d 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -641,4 +642,3 @@ resource_size_t cxl_rcd_component_reg_phys(struct device *dev, return CXL_RESOURCE_NONE; return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM); } -EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9b947286eb9b0..c7bffa399581e 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -12,6 +12,7 @@ #include #include #include +#include extern const struct nvdimm_security_ops *cxl_security_ops; @@ -23,63 +24,6 @@ extern const struct nvdimm_security_ops *cxl_security_ops; * (port-driver, region-driver, nvdimm object-drivers... etc). 
*/ -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -#define CXL_CM_CAP_CAP_ID_RAS 0x2 -#define CXL_CM_CAP_CAP_ID_HDM 0x5 -#define CXL_CM_CAP_CAP_HDM_VERSION 1 - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) 
-#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - -/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ -#define CXL_DECODER_MIN_GRANULARITY 256 -#define CXL_DECODER_MAX_ENCODED_IG 6 - -static inline int cxl_hdm_decoder_count(u32 cap_hdr) -{ - int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); - - return val ? val * 2 : 1; -} - /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ static inline int eig_to_granularity(u16 eig, unsigned int *granularity) { @@ -201,97 +145,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -/* - * Using struct_group() allows for per register-block-type helper routines, - * without requiring block-type agnostic code to include the prefix. - */ -struct cxl_regs { - /* - * Common set of CXL Component register block base pointers - * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure - * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure - */ - struct_group_tagged(cxl_component_regs, component, - void __iomem *hdm_decoder; - void __iomem *ras; - ); - /* - * Common set of CXL Device register block base pointers - * @status: CXL 2.0 8.2.8.3 Device Status Registers - * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers - * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers - */ - struct_group_tagged(cxl_device_regs, device_regs, - void __iomem *status, *mbox, *memdev; - ); - - struct_group_tagged(cxl_pmu_regs, pmu_regs, - void __iomem *pmu; - ); - - /* - * RCH downstream port specific RAS register - * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB - */ - struct_group_tagged(cxl_rch_regs, rch_regs, - void __iomem *dport_aer; - ); - - /* - * RCD upstream port specific PCIe cap register - * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB - */ - struct_group_tagged(cxl_rcd_regs, rcd_regs, - void __iomem 
*rcd_pcie_cap; - ); -}; - -struct cxl_reg_map { - bool valid; - int id; - unsigned long offset; - unsigned long size; -}; - -struct cxl_component_reg_map { - struct cxl_reg_map hdm_decoder; - struct cxl_reg_map ras; -}; - -struct cxl_device_reg_map { - struct cxl_reg_map status; - struct cxl_reg_map mbox; - struct cxl_reg_map memdev; -}; - -struct cxl_pmu_reg_map { - struct cxl_reg_map pmu; -}; - -/** - * struct cxl_register_map - DVSEC harvested register block mapping parameters - * @host: device for devm operations and logging - * @base: virtual base of the register-block-BAR + @block_offset - * @resource: physical resource base of the register block - * @max_size: maximum mapping size to perform register search - * @reg_type: see enum cxl_regloc_type - * @component_map: cxl_reg_map for component registers - * @device_map: cxl_reg_maps for device registers - * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units - */ -struct cxl_register_map { - struct device *host; - void __iomem *base; - resource_size_t resource; - resource_size_t max_size; - u8 reg_type; - union { - struct cxl_component_reg_map component_map; - struct cxl_device_reg_map device_map; - struct cxl_pmu_reg_map pmu_map; - }; -}; - void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, @@ -304,16 +157,10 @@ int cxl_map_device_regs(const struct cxl_register_map *map, int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs); #define CXL_INSTANCES_COUNT -1 -enum cxl_regloc_type; int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type); int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map, unsigned int index); -int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map); -int cxl_setup_regs(struct cxl_register_map *map); struct cxl_dport; 
-resource_size_t cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport); int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_RESOURCE_NONE ((resource_size_t) -1) @@ -497,11 +344,6 @@ struct cxl_region_params { resource_size_t cache_size; }; -enum cxl_partition_mode { - CXL_PARTMODE_RAM, - CXL_PARTMODE_PMEM, -}; - /* * Indicate whether this region has been assembled by autodetection or * userspace assembly. Prevent endpoint decoders outside of automatic @@ -540,6 +382,8 @@ enum cxl_partition_mode { * @hpa_range: Address range occupied by the region * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type + * @detach: accelerator detach callback for device-memory regions + * @detach_data: accelerator detach callback data * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge * @flags: Region state flags @@ -555,6 +399,8 @@ struct cxl_region { struct range hpa_range; enum cxl_partition_mode mode; enum cxl_decoder_type type; + void (*detach)(void *data); + void *detach_data; struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_pmem_region *cxlr_pmem; unsigned long flags; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e21d744d639bd..c98db6f18aa29 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,10 +34,6 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) -struct cxl_memdev_attach { - int (*probe)(struct cxl_memdev *cxlmd); -}; - /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -103,8 +99,6 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, const struct cxl_memdev_attach *attach); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, - const struct cxl_memdev_attach *attach); int 
devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; @@ -113,8 +107,6 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); -#define CXL_NR_PARTITIONS_MAX 2 - struct cxl_dpa_info { u64 size; struct cxl_dpa_part_info { @@ -373,87 +365,6 @@ struct cxl_security_state { struct kernfs_node *sanitize_node; }; -/* - * enum cxl_devtype - delineate type-2 from a generic type-3 device - * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or - * HDM-DB, no requirement that this device implements a - * mailbox, or other memory-device-standard manageability - * flows. - * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with - * HDM-H and class-mandatory memory device registers - */ -enum cxl_devtype { - CXL_DEVTYPE_DEVMEM, - CXL_DEVTYPE_CLASSMEM, -}; - -/** - * struct cxl_dpa_perf - DPA performance property entry - * @dpa_range: range for DPA address - * @coord: QoS performance data (i.e. latency, bandwidth) - * @cdat_coord: raw QoS performance data from CDAT - * @qos_class: QoS Class cookies - */ -struct cxl_dpa_perf { - struct range dpa_range; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; - int qos_class; -}; - -/** - * struct cxl_dpa_partition - DPA partition descriptor - * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) - * @perf: performance attributes of the partition from CDAT - * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... - */ -struct cxl_dpa_partition { - struct resource res; - struct cxl_dpa_perf perf; - enum cxl_partition_mode mode; -}; - -/** - * struct cxl_dev_state - The driver device state - * - * cxl_dev_state represents the CXL driver/device state. It provides an - * interface to mailbox commands as well as some cached data about the device. 
- * Currently only memory devices are represented. - * - * @dev: The device associated with this CXL state - * @cxlmd: The device representing the CXL.mem capabilities of @dev - * @reg_map: component and ras register mapping parameters - * @regs: Class device "Device" registers - * @cxl_dvsec: Offset to the PCIe device DVSEC - * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) - * @media_ready: Indicate whether the device media is usable - * @dpa_res: Overall DPA resource tree for the device - * @part: DPA partition array - * @nr_partitions: Number of DPA partitions - * @serial: PCIe Device Serial Number - * @type: Generic Memory Class device or Vendor Specific Memory device - * @cxl_mbox: CXL mailbox context - * @cxlfs: CXL features context - */ -struct cxl_dev_state { - struct device *dev; - struct cxl_memdev *cxlmd; - struct cxl_register_map reg_map; - struct cxl_device_regs regs; - int cxl_dvsec; - bool rcd; - bool media_ready; - struct resource dpa_res; - struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; - unsigned int nr_partitions; - u64 serial; - enum cxl_devtype type; - struct cxl_mailbox cxl_mbox; -#ifdef CONFIG_CXL_FEATURES - struct cxl_features_state *cxlfs; -#endif -}; - static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) { /* @@ -858,7 +769,8 @@ int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info); -struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev); +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev, u64 serial, + u16 dvsec); void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 0cf64218aa16e..224636588f623 100644 --- 
a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -13,16 +13,6 @@ */ #define CXL_PCI_DEFAULT_MAX_VECTORS 16 -/* Register Block Identifier (RBI) */ -enum cxl_regloc_type { - CXL_REGLOC_RBI_EMPTY = 0, - CXL_REGLOC_RBI_COMPONENT, - CXL_REGLOC_RBI_VIRT, - CXL_REGLOC_RBI_MEMDEV, - CXL_REGLOC_RBI_PMU, - CXL_REGLOC_RBI_TYPES -}; - /* * Table Access DOE, CDAT Read Entry Response * @@ -74,6 +64,17 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) return lnksta2 & PCI_EXP_LNKSTA2_FLIT; } +/* + * Assume that the caller has already validated that @pdev has CXL + * capabilities, any RCiEP with CXL capabilities is treated as a + * Restricted CXL Device (RCD) and finds upstream port and endpoint + * registers in a Root Complex Register Block (RCRB). + */ +static inline bool is_cxl_restricted(struct pci_dev *pdev) +{ + return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; +} + struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); @@ -101,4 +102,6 @@ static inline void devm_cxl_port_ras_setup(struct cxl_port *port) } #endif +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index fcffe24dcb42f..ff858318091f1 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -65,6 +65,26 @@ static int cxl_debugfs_poison_clear(void *data, u64 dpa) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_debugfs_poison_clear, "%llx\n"); +static void cxl_memdev_poison_enable(struct cxl_memdev_state *mds, + struct cxl_memdev *cxlmd, + struct dentry *dentry) +{ + /* + * Skip poison debugfs for DEVMEM (accelerator) devices: the poison + * files rely on cxl_memdev_state, which may be absent there. 
+ */ + if (!mds) + return; + + if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) + debugfs_create_file("inject_poison", 0200, dentry, cxlmd, + &cxl_poison_inject_fops); + + if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) + debugfs_create_file("clear_poison", 0200, dentry, cxlmd, + &cxl_poison_clear_fops); +} + static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); @@ -92,12 +112,7 @@ static int cxl_mem_probe(struct device *dev) dentry = cxl_debugfs_create_dir(dev_name(dev)); debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); - if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds)) - debugfs_create_file("inject_poison", 0200, dentry, cxlmd, - &cxl_poison_inject_fops); - if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds)) - debugfs_create_file("clear_poison", 0200, dentry, cxlmd, - &cxl_poison_clear_fops); + cxl_memdev_poison_enable(mds, cxlmd, dentry); rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); if (rc) @@ -206,16 +221,24 @@ static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); -static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +static bool cxl_poison_attr_visible(struct kobject *kobj, struct attribute *a) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); - if (a == &dev_attr_trigger_poison_list.attr) - if (!test_bit(CXL_POISON_ENABLED_LIST, - mds->poison.enabled_cmds)) - return 0; + if (!mds || + !test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) + return false; + + return true; +} + +static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + if (a == &dev_attr_trigger_poison_list.attr && + !cxl_poison_attr_visible(kobj, a)) + return 0; return a->mode; } diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 
fbb300a018302..4bfcbd260381e 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "cxlmem.h" #include "cxlpci.h" @@ -465,76 +466,6 @@ static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail) return 0; } -/* - * Assume that any RCIEP that emits the CXL memory expander class code - * is an RCD - */ -static bool is_cxl_restricted(struct pci_dev *pdev) -{ - return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END; -} - -static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev, - struct cxl_register_map *map, - struct cxl_dport *dport) -{ - resource_size_t component_reg_phys; - - *map = (struct cxl_register_map) { - .host = &pdev->dev, - .resource = CXL_RESOURCE_NONE, - }; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport); - if (component_reg_phys == CXL_RESOURCE_NONE) - return -ENXIO; - - map->resource = component_reg_phys; - map->reg_type = CXL_REGLOC_RBI_COMPONENT; - map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; - - return 0; -} - -static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map) -{ - int rc; - - rc = cxl_find_regblock(pdev, type, map); - - /* - * If the Register Locator DVSEC does not exist, check if it - * is an RCH and try to extract the Component Registers from - * an RCRB. 
- */ - if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) { - struct cxl_dport *dport; - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return -EPROBE_DEFER; - - rc = cxl_rcrb_get_comp_regs(pdev, map, dport); - if (rc) - return rc; - - rc = cxl_dport_map_rcd_linkcap(pdev, dport); - if (rc) - return rc; - - } else if (rc) { - return rc; - } - - return cxl_setup_regs(map); -} - static void free_event_buf(void *buf) { kvfree(buf); @@ -865,25 +796,25 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) int rc, pmu_count; unsigned int i; bool irq_avail; + u16 dvsec; rc = pcim_enable_device(pdev); if (rc) return rc; pci_set_master(pdev); - mds = cxl_memdev_state_create(&pdev->dev); + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + pci_warn(pdev, "Device DVSEC not present, skip CXL.mem init\n"); + + mds = cxl_memdev_state_create(&pdev->dev, pci_get_dsn(pdev), dvsec); if (IS_ERR(mds)) return PTR_ERR(mds); cxlds = &mds->cxlds; pci_set_drvdata(pdev, cxlds); cxlds->rcd = is_cxl_restricted(pdev); - cxlds->serial = pci_get_dsn(pdev); - cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); - if (!cxlds->cxl_dvsec) - dev_warn(&pdev->dev, - "Device DVSEC not present, skip CXL.mem init\n"); rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); if (rc) @@ -1034,6 +965,7 @@ static void cxl_reset_done(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct cxl_port *endpoint; struct device *dev = &pdev->dev; /* @@ -1043,8 +975,11 @@ static void cxl_reset_done(struct pci_dev *pdev) * that no longer exists. 
*/ guard(device)(&cxlmd->dev); - if (cxlmd->endpoint && - cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { + endpoint = cxlmd->endpoint; + if (!endpoint || IS_ERR(endpoint)) + return; + + if (cxl_endpoint_decoder_reset_detected(endpoint)) { dev_crit(dev, "SBR happened without memory regions removal.\n"); dev_crit(dev, "System may be unstable if regions hosted system memory.\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index d656e4c0eb846..3683bb3f2311b 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -48,6 +48,8 @@ config DEV_DAX_CXL tristate "CXL DAX: direct access to CXL RAM regions" depends on CXL_BUS && CXL_REGION && DEV_DAX default CXL_REGION && DEV_DAX + depends on CXL_ACPI >= DEV_DAX_HMEM + depends on CXL_PCI >= DEV_DAX_HMEM help CXL RAM regions are either mapped by platform-firmware and published in the initial system-memory map as "System RAM", mapped diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 5ed5c39857c8b..70e996bf15261 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +obj-y += hmem/ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o @@ -10,5 +11,3 @@ dax-y += bus.o device_dax-y := device.o dax_pmem-y := pmem.o dax_cxl-y := cxl.o - -obj-y += hmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index c94c09622516e..94c9d947a8a39 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -10,6 +10,7 @@ #include "dax-private.h" #include "bus.h" +static struct resource dax_regions = DEFINE_RES_MEM_NAMED(0, -1, "DAX Regions"); static DEFINE_MUTEX(dax_bus_lock); /* @@ -24,6 +25,67 @@ DECLARE_RWSEM(dax_region_rwsem); */ DECLARE_RWSEM(dax_dev_rwsem); +enum dax_cxl_mode dax_cxl_mode = DAX_CXL_MODE_DEFER; +EXPORT_SYMBOL_NS_GPL(dax_cxl_mode, "CXL"); + +static DEFINE_MUTEX(dax_hmem_lock); +static dax_hmem_deferred_fn hmem_deferred_fn; +static void 
*dax_hmem_data; + +static void hmem_deferred_work(struct work_struct *work) +{ + dax_hmem_deferred_fn fn; + void *data; + + scoped_guard(mutex, &dax_hmem_lock) { + fn = hmem_deferred_fn; + data = dax_hmem_data; + } + + if (fn) + fn(data); +} + +static DECLARE_WORK(dax_hmem_work, hmem_deferred_work); + +int dax_hmem_register_work(dax_hmem_deferred_fn fn, void *data) +{ + guard(mutex)(&dax_hmem_lock); + + if (hmem_deferred_fn) + return -EINVAL; + + hmem_deferred_fn = fn; + dax_hmem_data = data; + return 0; +} +EXPORT_SYMBOL_GPL(dax_hmem_register_work); + +int dax_hmem_unregister_work(dax_hmem_deferred_fn fn, void *data) +{ + guard(mutex)(&dax_hmem_lock); + + if (hmem_deferred_fn != fn || dax_hmem_data != data) + return -EINVAL; + + hmem_deferred_fn = NULL; + dax_hmem_data = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(dax_hmem_unregister_work); + +void dax_hmem_queue_work(void) +{ + queue_work(system_long_wq, &dax_hmem_work); +} +EXPORT_SYMBOL_GPL(dax_hmem_queue_work); + +void dax_hmem_flush_work(void) +{ + flush_work(&dax_hmem_work); +} +EXPORT_SYMBOL_GPL(dax_hmem_flush_work); + #define DAX_NAME_LEN 30 struct dax_id { struct list_head list; @@ -625,6 +687,8 @@ static void dax_region_unregister(void *region) { struct dax_region *dax_region = region; + scoped_guard(rwsem_write, &dax_region_rwsem) + release_resource(&dax_region->res); sysfs_remove_groups(&dax_region->dev->kobj, dax_region_attribute_groups); dax_region_put(dax_region); @@ -635,6 +699,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, unsigned long flags) { struct dax_region *dax_region; + int rc; /* * The DAX core assumes that it can store its private data in @@ -667,14 +732,27 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, .flags = IORESOURCE_MEM | flags, }; - if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - kfree(dax_region); - return NULL; + scoped_guard(rwsem_write, &dax_region_rwsem) + rc = request_resource(&dax_regions, 
&dax_region->res); + if (rc) { + dev_dbg(parent, "dax_region resource conflict for %pR\n", + &dax_region->res); + goto err_res; } + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) + goto err_sysfs; + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) return NULL; return dax_region; + +err_sysfs: + scoped_guard(rwsem_write, &dax_region_rwsem) + release_resource(&dax_region->res); +err_res: + kfree(dax_region); + return NULL; } EXPORT_SYMBOL_GPL(alloc_dax_region); diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index cbbf64443098c..82616ff52fd14 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -41,6 +41,32 @@ struct dax_device_driver { void (*remove)(struct dev_dax *dev); }; +/* + * enum dax_cxl_mode - State machine to determine ownership for CXL + * tagged Soft Reserved memory ranges. + * @DAX_CXL_MODE_DEFER: Ownership resolution pending. Set while waiting + * for CXL enumeration and region assembly to complete. + * @DAX_CXL_MODE_REGISTER: CXL regions do not fully cover Soft Reserved + * ranges. Fall back to registering those ranges via dax_hmem. + * @DAX_CXL_MODE_DROP: All Soft Reserved ranges intersecting CXL windows + * are fully contained within committed CXL regions. Drop HMEM handling + * and allow dax_cxl to bind. 
+ */ +enum dax_cxl_mode { + DAX_CXL_MODE_DEFER, + DAX_CXL_MODE_REGISTER, + DAX_CXL_MODE_DROP, +}; + +extern enum dax_cxl_mode dax_cxl_mode; + +typedef void (*dax_hmem_deferred_fn)(void *data); + +int dax_hmem_register_work(dax_hmem_deferred_fn fn, void *data); +int dax_hmem_unregister_work(dax_hmem_deferred_fn fn, void *data); +void dax_hmem_queue_work(void); +void dax_hmem_flush_work(void); + int __dax_driver_register(struct dax_device_driver *dax_drv, struct module *module, const char *mod_name); #define dax_driver_register(driver) \ diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c index 13cd94d32ff7a..3ab39b77843d5 100644 --- a/drivers/dax/cxl.c +++ b/drivers/dax/cxl.c @@ -38,10 +38,36 @@ static struct cxl_driver cxl_dax_region_driver = { .id = CXL_DEVICE_DAX_REGION, .drv = { .suppress_bind_attrs = true, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; -module_cxl_driver(cxl_dax_region_driver); +static void cxl_dax_region_driver_register(struct work_struct *work) +{ + dax_hmem_flush_work(); + cxl_driver_register(&cxl_dax_region_driver); +} + +static DECLARE_WORK(cxl_dax_region_driver_work, cxl_dax_region_driver_register); + +static int __init cxl_dax_region_init(void) +{ + /* + * Need to resolve a race with dax_hmem wanting to drive regions + * instead of CXL + */ + queue_work(system_long_wq, &cxl_dax_region_driver_work); + return 0; +} +module_init(cxl_dax_region_init); + +static void __exit cxl_dax_region_exit(void) +{ + flush_work(&cxl_dax_region_driver_work); + cxl_driver_unregister(&cxl_dax_region_driver); +} +module_exit(cxl_dax_region_exit); + MODULE_ALIAS_CXL(CXL_DEVICE_DAX_REGION); MODULE_DESCRIPTION("CXL DAX: direct access to CXL regions"); MODULE_LICENSE("GPL"); diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index 1cf7c2a0ee1cb..c07bf5fe833dc 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "../bus.h" static bool region_idle; @@ -58,6 +59,34 @@ static 
void release_hmem(void *pdev) platform_device_unregister(pdev); } +static void remove_soft_reserved(void *r) +{ + remove_resource(r); + kfree(r); +} + +static int add_soft_reserve_into_iomem(struct device *host, + const struct resource *res) +{ + int rc; + + struct resource *soft __free(kfree) = + kmalloc(sizeof(*res), GFP_KERNEL); + if (!soft) + return -ENOMEM; + + *soft = DEFINE_RES_NAMED_DESC(res->start, (res->end - res->start + 1), + "Soft Reserved", IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + + rc = insert_resource(&iomem_resource, soft); + if (rc) + return rc; + + return devm_add_action_or_reset(host, remove_soft_reserved, + no_free_ptr(soft)); +} + static int hmem_register_device(struct device *host, int target_nid, const struct resource *res) { @@ -66,18 +95,30 @@ static int hmem_register_device(struct device *host, int target_nid, long id; int rc; - if (IS_ENABLED(CONFIG_CXL_REGION) && + if (IS_ENABLED(CONFIG_DEV_DAX_CXL) && region_intersects(res->start, resource_size(res), IORESOURCE_MEM, IORES_DESC_CXL) != REGION_DISJOINT) { - dev_dbg(host, "deferring range to CXL: %pr\n", res); - return 0; + switch (dax_cxl_mode) { + case DAX_CXL_MODE_DEFER: + dev_dbg(host, "deferring range to CXL: %pr\n", res); + dax_hmem_queue_work(); + return 0; + case DAX_CXL_MODE_REGISTER: + dev_dbg(host, "registering CXL range: %pr\n", res); + break; + case DAX_CXL_MODE_DROP: + dev_dbg(host, "dropping CXL range: %pr\n", res); + return 0; + } } rc = region_intersects_soft_reserve(res->start, resource_size(res)); if (rc != REGION_INTERSECTS) return 0; - /* TODO: Add Soft-Reserved memory back to iomem */ + rc = add_soft_reserve_into_iomem(host, res); + if (rc) + return rc; id = memregion_alloc(GFP_KERNEL); if (id < 0) { @@ -123,8 +164,70 @@ static int hmem_register_device(struct device *host, int target_nid, return rc; } +static int hmem_register_cxl_device(struct device *host, int target_nid, + const struct resource *res) +{ + if (region_intersects(res->start, resource_size(res), 
IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) + return hmem_register_device(host, target_nid, res); + + return 0; +} + +static int soft_reserve_has_cxl_match(struct device *host, int target_nid, + const struct resource *res) +{ + if (region_intersects(res->start, resource_size(res), IORESOURCE_MEM, + IORES_DESC_CXL) != REGION_DISJOINT) { + if (!cxl_region_contains_soft_reserve((struct resource *)res)) + return 1; + } + + return 0; +} + +static void process_defer_work(void *data) +{ + struct platform_device *pdev = data; + int rc; + + /* relies on cxl_acpi and cxl_pci having had a chance to load */ + wait_for_device_probe(); + + rc = walk_hmem_resources(&pdev->dev, soft_reserve_has_cxl_match); + + if (!rc) { + dax_cxl_mode = DAX_CXL_MODE_DROP; + dev_dbg(&pdev->dev, "All Soft Reserved ranges claimed by CXL\n"); + } else { + dax_cxl_mode = DAX_CXL_MODE_REGISTER; + dev_warn(&pdev->dev, + "Soft Reserved not fully contained in CXL; using HMEM\n"); + } + + walk_hmem_resources(&pdev->dev, hmem_register_cxl_device); +} + +static void kill_defer_work(void *data) +{ + struct platform_device *pdev = data; + + dax_hmem_flush_work(); + dax_hmem_unregister_work(process_defer_work, pdev); +} + static int dax_hmem_platform_probe(struct platform_device *pdev) { + int rc; + + rc = dax_hmem_register_work(process_defer_work, pdev); + if (rc) + return rc; + + rc = devm_add_action_or_reset(&pdev->dev, kill_defer_work, pdev); + if (rc) + return rc; + return walk_hmem_resources(&pdev->dev, hmem_register_device); } @@ -139,6 +242,16 @@ static __init int dax_hmem_init(void) { int rc; + /* + * Ensure that cxl_acpi and cxl_pci have a chance to kick off + * CXL topology discovery at least once before scanning the + * iomem resource tree for IORES_DESC_CXL resources. 
+ */ + if (IS_ENABLED(CONFIG_DEV_DAX_CXL)) { + request_module("cxl_acpi"); + request_module("cxl_pci"); + } + rc = platform_driver_register(&dax_hmem_platform_driver); if (rc) return rc; @@ -159,15 +272,9 @@ static __exit void dax_hmem_exit(void) module_init(dax_hmem_init); module_exit(dax_hmem_exit); -/* Allow for CXL to define its own dax regions */ -#if IS_ENABLED(CONFIG_CXL_REGION) -#if IS_MODULE(CONFIG_CXL_ACPI) -MODULE_SOFTDEP("pre: cxl_acpi"); -#endif -#endif - MODULE_ALIAS("platform:hmem*"); MODULE_ALIAS("platform:hmem_platform*"); MODULE_DESCRIPTION("HMEM DAX: direct access to 'specific purpose' memory"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig index c4c43434f3143..979f2801e2a8e 100644 --- a/drivers/net/ethernet/sfc/Kconfig +++ b/drivers/net/ethernet/sfc/Kconfig @@ -66,6 +66,15 @@ config SFC_MCDI_LOGGING Driver-Interface) commands and responses, allowing debugging of driver/firmware interaction. The tracing is actually enabled by a sysfs file 'mcdi_logging' under the PCI device. +config SFC_CXL + bool "Solarflare SFC9100-family CXL support" + depends on SFC && CXL_BUS >= SFC + default SFC + help + This enables SFC CXL support if the kernel is configuring CXL for + using CTPIO with CXL.mem. The SFC device with CXL support and + with a CXL-aware firmware can be used for minimizing latencies + when sending through CTPIO. 
source "drivers/net/ethernet/sfc/falcon/Kconfig" source "drivers/net/ethernet/sfc/siena/Kconfig" diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile index d99039ec468d6..bb0f1891cde65 100644 --- a/drivers/net/ethernet/sfc/Makefile +++ b/drivers/net/ethernet/sfc/Makefile @@ -13,6 +13,7 @@ sfc-$(CONFIG_SFC_SRIOV) += sriov.o ef10_sriov.o ef100_sriov.o ef100_rep.o \ mae.o tc.o tc_bindings.o tc_counters.o \ tc_encap_actions.o tc_conntrack.o +sfc-$(CONFIG_SFC_CXL) += efx_cxl.o obj-$(CONFIG_SFC) += sfc.o obj-$(CONFIG_SFC_FALCON) += falcon/ diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 7e04f115bbaaa..52ad07c121833 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -24,6 +24,7 @@ #include #include #include +#include "efx_cxl.h" /* Hardware control for EF10 architecture including 'Huntington'. */ @@ -106,7 +107,7 @@ static int efx_ef10_get_vf_index(struct efx_nic *efx) static int efx_ef10_init_datapath_caps(struct efx_nic *efx) { - MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V4_OUT_LEN); + MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V7_OUT_LEN); struct efx_ef10_nic_data *nic_data = efx->nic_data; size_t outlen; int rc; @@ -177,6 +178,12 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx) efx->num_mac_stats); } + if (outlen < MC_CMD_GET_CAPABILITIES_V7_OUT_LEN) + nic_data->datapath_caps3 = 0; + else + nic_data->datapath_caps3 = MCDI_DWORD(outbuf, + GET_CAPABILITIES_V7_OUT_FLAGS3); + return 0; } @@ -771,6 +778,35 @@ static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n) return rc; } +#ifdef CONFIG_SFC_CXL +/* Invoked from cxl core when a cxl region is removed. This is expected at + * driver exit linked to cxl core devm releases which does not require the + * below sync. + * + * However, it is required when user space actions trigger such a cxl region + * removal forcing any cxl piobuf usage to stop.
Setting per tx queue piobuf + * to NULL is safe if such a tx queue is not currently in use inside + * efx_hard_start_xmit() implying tx_queue locked. + * + * After this the cxl region physical range can be safely unmapped. + */ +void efx_ef10_disable_piobufs(struct efx_nic *efx) +{ + struct efx_tx_queue *tx_queue; + struct efx_channel *channel; + + local_bh_disable(); + efx_for_each_channel(channel, efx) + efx_for_each_channel_tx_queue(tx_queue, channel) { + HARD_TX_LOCK(efx->net_dev, tx_queue->core_txq, + smp_processor_id()); + tx_queue->piobuf = NULL; + HARD_TX_UNLOCK(efx->net_dev, tx_queue->core_txq); + } + local_bh_enable(); +} +#endif + static int efx_ef10_link_piobufs(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx->nic_data; @@ -914,6 +950,12 @@ static void efx_ef10_forget_old_piobufs(struct efx_nic *efx) { } +#ifdef CONFIG_SFC_CXL +void efx_ef10_disable_piobufs(struct efx_nic *efx) +{ +} +#endif + #endif /* EFX_USE_PIO */ static void efx_ef10_remove(struct efx_nic *efx) @@ -1140,6 +1182,9 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) unsigned int channel_vis, pio_write_vi_base, max_vis; struct efx_ef10_nic_data *nic_data = efx->nic_data; unsigned int uc_mem_map_size, wc_mem_map_size; +#ifdef CONFIG_SFC_CXL + struct efx_probe_data *probe_data; +#endif void __iomem *membase; int rc; @@ -1263,8 +1308,25 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) iounmap(efx->membase); efx->membase = membase; - /* Set up the WC mapping if needed */ - if (wc_mem_map_size) { + if (!wc_mem_map_size) + goto skip_pio; + + /* Set up the WC mapping */ + +#ifdef CONFIG_SFC_CXL + probe_data = container_of(efx, struct efx_probe_data, efx); + if ((nic_data->datapath_caps3 & + (1 << MC_CMD_GET_CAPABILITIES_V7_OUT_CXL_CONFIG_ENABLE_LBN)) && + probe_data->cxl_pio_initialised) { + /* Using PIO through CXL mapping */ + nic_data->pio_write_base = probe_data->cxl->ctpio_cxl; + nic_data->pio_write_vi_base = pio_write_vi_base; + +
probe_data->cxl_pio_in_use = true; + } else +#endif + { + /* Using legacy PIO BAR mapping */ nic_data->wc_membase = ioremap_wc(efx->membase_phys + uc_mem_map_size, wc_mem_map_size); @@ -1279,12 +1341,14 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx) nic_data->wc_membase + (pio_write_vi_base * efx->vi_stride + ER_DZ_TX_PIOBUF - uc_mem_map_size); - - rc = efx_ef10_link_piobufs(efx); - if (rc) - efx_ef10_free_piobufs(efx); } + rc = efx_ef10_link_piobufs(efx); + if (rc) + efx_ef10_free_piobufs(efx); + +skip_pio: + netif_dbg(efx, probe, efx->net_dev, "memory BAR at %pa (virtual %p+%x UC, %p+%x WC)\n", &efx->membase_phys, efx->membase, uc_mem_map_size, diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 8f136a11d3968..90ccbe3103860 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -34,6 +34,7 @@ #include "selftest.h" #include "sriov.h" #include "efx_devlink.h" +#include "efx_cxl.h" #include "mcdi_port_common.h" #include "mcdi_pcol.h" @@ -981,12 +982,14 @@ static void efx_pci_remove(struct pci_dev *pci_dev) efx_pci_remove_main(efx); efx_fini_io(efx); + + probe_data = container_of(efx, struct efx_probe_data, efx); + pci_dbg(efx->pci_dev, "shutdown successful\n"); efx_fini_devlink_and_unlock(efx); efx_fini_struct(efx); free_netdev(efx->net_dev); - probe_data = container_of(efx, struct efx_probe_data, efx); kfree(probe_data); }; @@ -1190,6 +1193,15 @@ static int efx_pci_probe(struct pci_dev *pci_dev, if (rc) goto fail2; + /* A successful cxl initialization implies a CXL region created to be + * used for PIO buffers. If there is no CXL support, or initialization + * fails, cxl_pio_initialised will be false and legacy PIO buffers + * defined at specific PCI BAR regions will be used. + */ + rc = efx_cxl_init(probe_data); + if (rc) + pci_err(pci_dev, "CXL initialization failed with error %d\n", rc); + rc = efx_pci_probe_post_io(efx); if (rc) { /* On failure, retry once immediately. 
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index 45e1916866256..37fd1cf96582e 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -237,4 +237,5 @@ static inline bool efx_rwsem_assert_write_locked(struct rw_semaphore *sem) int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs, bool flush); +void efx_ef10_disable_piobufs(struct efx_nic *efx); #endif /* EFX_EFX_H */ diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c new file mode 100644 index 0000000000000..52b2cded76daf --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_cxl.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/**************************************************************************** + * + * Driver for AMD network controllers and boards + * Copyright (C) 2025, Advanced Micro Devices, Inc. + */ + +#include + +#include +#include +#include "net_driver.h" +#include "efx.h" +#include "efx_cxl.h" + +#define EFX_CTPIO_BUFFER_SIZE SZ_256M + +/* Called with cxl endpoint device locked for precluding potential related + * cxl region removal triggered from user space, allowing safely mapping of + * such cxl region by the sfc driver. + */ +static int efx_cxl_map_region(void *data) { + struct efx_probe_data *probe_data = data; + struct efx_nic *efx = &probe_data->efx; + struct pci_dev *pci_dev = efx->pci_dev; + struct efx_cxl *cxl = probe_data->cxl; + struct range *cxl_pio_range = &cxl->attach_region.region; + + cxl->ctpio_cxl = ioremap(cxl_pio_range->start, + cxl_pio_range->end - cxl_pio_range->start + 1); + if (!cxl->ctpio_cxl) { + pci_err(pci_dev, "CXL ioremap region (%pra) failed\n", + cxl_pio_range); + return -ENOMEM; + } + probe_data->cxl_pio_initialised = true; + return 0; +} + +/* Called at driver exit or when user space triggers cxl region removal. 
*/ +static void efx_cxl_unmap_region(void *data) { + struct efx_probe_data *probe_data = data; + + efx_ef10_disable_piobufs(&probe_data->efx); + probe_data->cxl_pio_initialised = false; + iounmap(probe_data->cxl->ctpio_cxl); +} + +int efx_cxl_init(struct efx_probe_data *probe_data) +{ + struct efx_nic *efx = &probe_data->efx; + struct pci_dev *pci_dev = efx->pci_dev; + struct efx_cxl *cxl; + u16 dvsec; + int rc; + + probe_data->cxl_pio_initialised = false; + + /* Is the device configured with and using CXL? */ + if (!pcie_is_cxl(pci_dev)) + return 0; + + dvsec = pci_find_dvsec_capability(pci_dev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) { + pci_info(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability not found\n"); + return 0; + } + + pci_dbg(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability found\n"); + + /* Create a cxl_dev_state embedded in the cxl struct using cxl core api + * specifying no mbox available. + */ + cxl = devm_cxl_dev_state_create(&pci_dev->dev, CXL_DEVTYPE_DEVMEM, + pci_get_dsn(pci_dev), dvsec, + struct efx_cxl, cxlds, false); + + if (!cxl) + return -ENOMEM; + + rc = cxl_pci_setup_regs(pci_dev, CXL_REGLOC_RBI_COMPONENT, + &cxl->cxlds.reg_map); + if (rc) { + pci_err(pci_dev, "No component registers\n"); + return rc; + } + + if (!cxl->cxlds.reg_map.component_map.hdm_decoder.valid) { + pci_err(pci_dev, "Expected HDM component register not found\n"); + return -ENODEV; + } + + if (!cxl->cxlds.reg_map.component_map.ras.valid) { + pci_err(pci_dev, "Expected RAS component register not found\n"); + return -ENODEV; + } + + /* Set media ready explicitly as there are neither mailbox for checking + * this state nor the CXL register involved, both not mandatory for + * type2. 
+ */ + cxl->cxlds.media_ready = true; + + if (cxl_set_capacity(&cxl->cxlds, EFX_CTPIO_BUFFER_SIZE)) { + pci_err(pci_dev, "dpa capacity setup failed\n"); + return -ENODEV; + } + + cxl->cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxl->cxlmd)) { + pci_err(pci_dev, "CXL accel memdev creation failed\n"); + return PTR_ERR(cxl->cxlmd); + } + + cxl->attach_region.attach = efx_cxl_map_region; + cxl->attach_region.detach = efx_cxl_unmap_region; + cxl->attach_region.data = probe_data; + probe_data->cxl = cxl; + + rc = cxl_memdev_attach_region(cxl->cxlmd, &cxl->attach_region); + if (rc) + return rc; + + return 0; +} + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h new file mode 100644 index 0000000000000..1c294cd1df56c --- /dev/null +++ b/drivers/net/ethernet/sfc/efx_cxl.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/**************************************************************************** + * Driver for AMD network controllers and boards + * Copyright (C) 2025, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. 
+ */ + +#ifndef EFX_CXL_H +#define EFX_CXL_H + +#ifdef CONFIG_SFC_CXL + +#include + +struct efx_probe_data; + +struct efx_cxl { + struct cxl_dev_state cxlds; + struct cxl_memdev *cxlmd; + struct cxl_attach_region attach_region; + void __iomem *ctpio_cxl; +}; + +int efx_cxl_init(struct efx_probe_data *probe_data); +#else +static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; } +#endif +#endif diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index b98c259f672db..bea4eecdf842d 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -1197,14 +1197,26 @@ struct efx_nic { atomic_t n_rx_noskb_drops; }; +#ifdef CONFIG_SFC_CXL +struct efx_cxl; +#endif + /** * struct efx_probe_data - State after hardware probe * @pci_dev: The PCI device * @efx: Efx NIC details + * @cxl: details of related cxl objects + * @cxl_pio_initialised: cxl initialization outcome. + * @cxl_pio_in_use: PIO using CXL mapping */ struct efx_probe_data { struct pci_dev *pci_dev; struct efx_nic efx; +#ifdef CONFIG_SFC_CXL + struct efx_cxl *cxl; + bool cxl_pio_initialised; + bool cxl_pio_in_use; +#endif }; static inline struct efx_nic *efx_netdev_priv(struct net_device *dev) diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index ec3b2df43b68d..7480f9995dfb8 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -152,6 +152,8 @@ enum { * %MC_CMD_GET_CAPABILITIES response) * @datapath_caps2: Further Capabilities of datapath firmware (FLAGS2 field of * %MC_CMD_GET_CAPABILITIES response) + * @datapath_caps3: Further Capabilities of datapath firmware (FLAGS3 field of + * %MC_CMD_GET_CAPABILITIES response) * @rx_dpcpu_fw_id: Firmware ID of the RxDPCPU * @tx_dpcpu_fw_id: Firmware ID of the TxDPCPU * @must_probe_vswitching: Flag: vswitching has yet to be setup after MC reboot @@ -187,6 +189,7 @@ struct efx_ef10_nic_data { bool must_check_datapath_caps; 
u32 datapath_caps; u32 datapath_caps2; + u32 datapath_caps3; unsigned int rx_dpcpu_fw_id; unsigned int tx_dpcpu_fw_id; bool must_probe_vswitching; diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index e3f848ffb52a7..6b96650b3f311 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -119,6 +119,10 @@ config XEN_PCIDEV_FRONTEND The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. +config PCI_CXL + bool + default y if CXL_BUS + config PCI_ATS bool diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 41ebc3b9a5182..b6b5c9dbaaac7 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o obj-$(CONFIG_CARDBUS) += setup-cardbus.o +obj-$(CONFIG_PCI_CXL) += cxl.o # Endpoint library must be initialized before its users obj-$(CONFIG_PCI_ENDPOINT) += endpoint/ diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c new file mode 100644 index 0000000000000..eee28a3755a23 --- /dev/null +++ b/drivers/pci/cxl.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CXL PCI state save/restore support. + * + * Saves and restores CXL DVSEC and HDM decoder registers across PCI resets + * and link disable/enable transitions. Hooked into pci_save_state() / + * pci_restore_state() via the PCI capability save chain. 
+ */ +#include +#include +#include +#include +#include +#include +#include "pci.h" + +#define CXL_HDM_MAX_DECODERS 32 + +struct cxl_hdm_decoder_snapshot { + u32 base_lo; + u32 base_hi; + u32 size_lo; + u32 size_hi; + u32 ctrl; + u32 tl_lo; + u32 tl_hi; +}; + +struct cxl_pci_state { + /* DVSEC saved state */ + u16 dvsec; + u16 dvsec_ctrl; + u16 dvsec_ctrl2; + u32 range_base_hi[CXL_DVSEC_RANGE_MAX]; + u32 range_base_lo[CXL_DVSEC_RANGE_MAX]; + u16 dvsec_lock; + bool dvsec_valid; + + /* HDM decoder saved state */ + int hdm_bar; + unsigned long hdm_bar_offset; + unsigned long hdm_map_size; + u32 hdm_global_ctrl; + int hdm_count; + struct cxl_hdm_decoder_snapshot decoders[CXL_HDM_MAX_DECODERS]; + bool hdm_valid; +}; + +static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state) +{ + int rc_ctrl, rc_ctrl2; + u16 dvsec; + int i; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + state->dvsec = dvsec; + rc_ctrl = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL, + &state->dvsec_ctrl); + rc_ctrl2 = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, + &state->dvsec_ctrl2); + if (rc_ctrl || rc_ctrl2) { + pci_warn(pdev, + "CXL: DVSEC read failed (ctrl rc=%d, ctrl2 rc=%d)\n", + rc_ctrl, rc_ctrl2); + return; + } + + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_read_config_dword(pdev, + dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + &state->range_base_hi[i]); + pci_read_config_dword(pdev, + dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + &state->range_base_lo[i]); + } + + pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_LOCK, + &state->dvsec_lock); + + state->dvsec_valid = true; +} + +static u32 cxl_merge_rwl(u32 saved, u32 current_hw, u32 rwl_mask) +{ + return (current_hw & rwl_mask) | (saved & ~rwl_mask); +} + +static void cxl_restore_dvsec(struct pci_dev *pdev, + const struct cxl_pci_state *state) +{ + u16 lock_reg = 0; + int i; + + if (!state->dvsec_valid) + return; + + 
pci_read_config_word(pdev, state->dvsec + PCI_DVSEC_CXL_LOCK, + &lock_reg); + + if (lock_reg & PCI_DVSEC_CXL_LOCK_CONFIG) { + u16 hw_ctrl; + u32 hw_range_hi, hw_range_lo; + + pci_read_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + &hw_ctrl); + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + cxl_merge_rwl(state->dvsec_ctrl, hw_ctrl, + PCI_DVSEC_CXL_CTRL_RWL)); + + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL2, + state->dvsec_ctrl2); + + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_read_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + &hw_range_hi); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + cxl_merge_rwl(state->range_base_hi[i], + hw_range_hi, + PCI_DVSEC_CXL_RANGE_BASE_HI_RWL)); + + pci_read_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + &hw_range_lo); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + cxl_merge_rwl(state->range_base_lo[i], + hw_range_lo, + PCI_DVSEC_CXL_RANGE_BASE_LO_RWL)); + } + } else { + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL, + state->dvsec_ctrl); + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_CTRL2, + state->dvsec_ctrl2); + for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) { + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), + state->range_base_hi[i]); + pci_write_config_dword(pdev, + state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), + state->range_base_lo[i]); + } + + pci_write_config_word(pdev, + state->dvsec + PCI_DVSEC_CXL_LOCK, + state->dvsec_lock); + } +} + +struct pci_cmd_saved { + struct pci_dev *pdev; + u16 cmd; +}; + +DEFINE_FREE(restore_pci_cmd, struct pci_cmd_saved, + if (!(_T.cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(_T.pdev, PCI_COMMAND, _T.cmd)) + +/** + * cxl_find_component_regblock - Find the Component Register Block via + * the Register Locator DVSEC + * @pdev: PCI device to scan + * 
@bir: output BAR index
+ * @offset: output offset within the BAR
+ *
+ * Parses the Register Locator DVSEC (ID 8) directly via PCI config space
+ * reads. No dependency on CXL module symbols. @bir and @offset are written
+ * only when a Component Register Block entry is found.
+ *
+ * Return: 0 on success, -ENODEV if the DVSEC is absent, cannot be read, is
+ * too short to hold a register block entry, or lists no component block.
+ */
+static int cxl_find_component_regblock(struct pci_dev *pdev,
+				       int *bir, u64 *offset)
+{
+	u32 regloc_size, regblocks, i;
+	u16 regloc;
+
+	regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					   PCI_DVSEC_CXL_REG_LOCATOR);
+	if (!regloc)
+		return -ENODEV;
+
+	if (pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1,
+				  &regloc_size))
+		return -ENODEV;
+
+	regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size);
+
+	/*
+	 * regloc_size is unsigned: a length shorter than the offset of the
+	 * first entry would wrap the subtraction below into a huge count.
+	 */
+	if (regloc_size < PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1)
+		return -ENODEV;
+
+	/* Each register block entry is two dwords (8 bytes). */
+	regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8;
+
+	for (i = 0; i < regblocks; i++) {
+		u32 reg_lo, reg_hi;
+		unsigned int off;
+
+		off = regloc + PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 + i * 8;
+
+		/* Skip entries that cannot be read rather than parse junk. */
+		if (pci_read_config_dword(pdev, off, &reg_lo) ||
+		    pci_read_config_dword(pdev, off + 4, &reg_hi))
+			continue;
+
+		if (FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo) !=
+		    CXL_REGLOC_RBI_COMPONENT)
+			continue;
+
+		*bir = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo);
+		*offset = ((u64)reg_hi << 32) |
+			  (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW);
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+/*
+ * Discover and map HDM decoder registers.
+ * Caller must pci_iounmap() the returned pointer.
+ */
+static void __iomem *cxl_hdm_map(struct pci_dev *pdev, int *bar_out,
+				 unsigned long *offset_out,
+				 unsigned long *size_out)
+{
+	int bir;
+	u64 reg_offset;
+	void __iomem *comp_base, *cm_base;
+	u32 cap_hdr;
+	int cap, cap_count;
+	unsigned long hdm_offset = 0, hdm_size = 0;
+	void __iomem *hdm;
+
+	if (cxl_find_component_regblock(pdev, &bir, &reg_offset))
+		return NULL;
+
+	/*
+	 * Temporary mapping of the first 4K of the CXL.cache/CXL.mem
+	 * register area, used only to walk the capability array.
+	 */
+	comp_base = pci_iomap_range(pdev, bir, reg_offset,
+				    CXL_CM_OFFSET + SZ_4K);
+	if (!comp_base)
+		return NULL;
+
+	cm_base = comp_base + CXL_CM_OFFSET;
+	cap_hdr = readl(cm_base);
+
+	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_hdr) != CM_CAP_HDR_CAP_ID) {
+		pci_iounmap(pdev, comp_base);
+		return NULL;
+	}
+
+	cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
+
+	/* Entry 0 is the array header itself; capabilities start at 1. */
+	for (cap = 1; cap <= cap_count; cap++) {
+		u16 cap_id;
+		u32 cap_off;
+		u32 hdr;
+
+		/* Never read past the 4K window mapped above. */
+		if (cap * sizeof(u32) >= SZ_4K)
+			break;
+
+		hdr = readl(cm_base + cap * 4);
+		cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr);
+		cap_off = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr);
+
+		if (cap_id != CXL_CM_CAP_CAP_ID_HDM)
+			continue;
+
+		if (cap_off + sizeof(u32) > SZ_4K)
+			break;
+
+		hdr = readl(cm_base + cap_off);
+		hdm_offset = CXL_CM_OFFSET + cap_off;
+		/* 0x10 bytes of global registers, then 0x20 per decoder. */
+		hdm_size = 0x20 * cxl_hdm_decoder_count(hdr) + 0x10;
+		break;
+	}
+
+	pci_iounmap(pdev, comp_base);
+
+	/* hdm_size stays 0 when no HDM decoder capability was found. */
+	if (!hdm_size)
+		return NULL;
+
+	hdm = pci_iomap_range(pdev, bir, reg_offset + hdm_offset, hdm_size);
+	if (!hdm)
+		return NULL;
+
+	/* Outputs are written only on success. */
+	*bar_out = bir;
+	*offset_out = reg_offset + hdm_offset;
+	*size_out = hdm_size;
+	return hdm;
+}
+
+static void cxl_save_hdm(struct pci_dev *pdev, void __iomem *hdm,
+			 struct cxl_pci_state *state, int count)
+{
+	int i;
+
+	state->hdm_count = min_t(int, count, CXL_HDM_MAX_DECODERS);
+	state->hdm_global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+
+	for (i = 0; i < state->hdm_count; i++) {
+		struct cxl_hdm_decoder_snapshot *d = &state->decoders[i];
+
+		d->base_lo = readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i));
+		d->base_hi = readl(hdm +
CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)); + d->size_lo = readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)); + d->size_hi = readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)); + d->ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + d->tl_lo = readl(hdm + CXL_HDM_DECODER0_TL_LOW(i)); + d->tl_hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(i)); + } +} + +static void cxl_restore_hdm(struct pci_dev *pdev, void __iomem *hdm, + const struct cxl_pci_state *state) +{ + int i; + + writel(state->hdm_global_ctrl, hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + for (i = 0; i < state->hdm_count; i++) { + const struct cxl_hdm_decoder_snapshot *d = &state->decoders[i]; + unsigned long timeout; + u32 ctrl; + + if (!(d->ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if ((ctrl & CXL_HDM_DECODER0_CTRL_LOCK) && + (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + } + + writel(d->base_lo, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)); + writel(d->base_hi, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)); + writel(d->size_lo, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)); + writel(d->size_hi, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)); + writel(d->tl_lo, hdm + CXL_HDM_DECODER0_TL_LOW(i)); + writel(d->tl_hi, hdm + CXL_HDM_DECODER0_TL_HIGH(i)); + + wmb(); + + ctrl = d->ctrl & ~(CXL_HDM_DECODER0_CTRL_COMMITTED | + CXL_HDM_DECODER0_CTRL_COMMIT_ERROR); + ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + + timeout = jiffies + msecs_to_jiffies(10); + for (;;) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + break; + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMIT_ERROR) { + pci_warn(pdev, + "HDM decoder %d commit error on restore\n", + i); + break; + } + if (time_after(jiffies, timeout)) { + pci_warn(pdev, + "HDM 
decoder %d commit timeout on restore\n", + i); + break; + } + cpu_relax(); + } + } +} + +static void cxl_save_hdm_decoders(struct pci_dev *pdev, + struct cxl_pci_state *state) +{ + int hdm_bar; + unsigned long hdm_bar_offset, hdm_map_size; + void __iomem *hdm; + u16 cmd; + u32 cap; + struct pci_cmd_saved saved __free(restore_pci_cmd) = { + .pdev = pdev, .cmd = PCI_COMMAND_MEMORY, + }; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + saved.cmd = cmd; + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + hdm = cxl_hdm_map(pdev, &hdm_bar, &hdm_bar_offset, &hdm_map_size); + if (!hdm) + return; + + cap = readl(hdm + CXL_HDM_DECODER_CAP_OFFSET); + cxl_save_hdm(pdev, hdm, state, cxl_hdm_decoder_count(cap)); + state->hdm_bar = hdm_bar; + state->hdm_bar_offset = hdm_bar_offset; + state->hdm_map_size = hdm_map_size; + state->hdm_valid = true; + pci_iounmap(pdev, hdm); +} + +static void cxl_restore_hdm_decoders(struct pci_dev *pdev, + const struct cxl_pci_state *state) +{ + void __iomem *hdm; + u16 cmd; + struct pci_cmd_saved saved __free(restore_pci_cmd) = { + .pdev = pdev, .cmd = PCI_COMMAND_MEMORY, + }; + + if (!state->hdm_valid) + return; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + saved.cmd = cmd; + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + hdm = pci_iomap_range(pdev, state->hdm_bar, state->hdm_bar_offset, + state->hdm_map_size); + if (!hdm) { + pci_warn(pdev, "CXL: failed to map HDM for restore\n"); + return; + } + + cxl_restore_hdm(pdev, hdm, state); + pci_iounmap(pdev, hdm); +} + +void pci_allocate_cxl_save_buffer(struct pci_dev *dev) +{ + if (!pcie_is_cxl(dev)) + return; + + if (pci_add_virtual_ext_cap_save_buffer(dev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL, + sizeof(struct cxl_pci_state))) + pci_err(dev, "unable to allocate CXL save buffer\n"); +} + +void pci_save_cxl_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state 
*save_state; + struct cxl_pci_state *state; + + save_state = pci_find_saved_ext_cap(pdev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + if (!save_state) + return; + + state = (struct cxl_pci_state *)save_state->cap.data; + state->dvsec_valid = false; + state->hdm_valid = false; + + cxl_save_dvsec(pdev, state); + cxl_save_hdm_decoders(pdev, state); +} + +void pci_restore_cxl_state(struct pci_dev *pdev) +{ + struct pci_cap_saved_state *save_state; + struct cxl_pci_state *state; + + save_state = pci_find_saved_ext_cap(pdev, + PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + if (!save_state) + return; + + state = (struct cxl_pci_state *)save_state->cap.data; + if (!state->dvsec_valid && !state->hdm_valid) + return; + + cxl_restore_dvsec(pdev, state); + cxl_restore_hdm_decoders(pdev, state); +} diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a964f12cc7ffd..2ef8d7274b300 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1759,6 +1759,7 @@ int pci_save_state(struct pci_dev *dev) pci_save_aer_state(dev); pci_save_ptm_state(dev); pci_save_tph_state(dev); + pci_save_cxl_state(dev); return pci_save_vc_state(dev); } EXPORT_SYMBOL(pci_save_state); @@ -1841,6 +1842,7 @@ void pci_restore_state(struct pci_dev *dev) pci_restore_aer_state(dev); pci_restore_config_space(dev); + pci_restore_cxl_state(dev); pci_restore_pcix_state(dev); pci_restore_msi_state(dev); @@ -3446,6 +3448,26 @@ int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size) return _pci_add_cap_save_buffer(dev, cap, true, size); } +int pci_add_virtual_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, + unsigned int size) +{ + struct pci_cap_saved_state *save_state; + + if (cap <= PCI_EXT_CAP_ID_MAX) + return -EINVAL; + + save_state = kzalloc(sizeof(*save_state) + size, GFP_KERNEL); + if (!save_state) + return -ENOMEM; + + save_state->cap.cap_nr = cap; + save_state->cap.cap_extended = true; + save_state->cap.size = size; + pci_add_saved_cap(dev, save_state); + + return 0; +} + /** * 
pci_allocate_cap_save_buffers - allocate buffers for saving capabilities * @dev: the PCI device @@ -3469,6 +3491,7 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); pci_allocate_vc_save_buffers(dev); + pci_allocate_cxl_save_buffer(dev); } void pci_free_cap_save_buffers(struct pci_dev *dev) @@ -4982,151 +5005,6 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) return rc; } -static int cxl_reset_prepare(struct pci_dev *dev, u16 dvsec) -{ - u32 timeout_us = 100, timeout_tot_us = 10000; - u16 reg, cap; - int rc; - - if (!pci_wait_for_pending_transaction(dev)) - pci_err(dev, "timed out waiting for pending transaction; performing cxl reset anyway\n"); - - /* Check if the device is cache capable. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, &cap); - if (rc) - return rc; - - if (!(cap & PCI_DVSEC_CXL_CACHE_CAPABLE)) - return 0; - - /* Disable cache. WB and invalidate cache if capability is advertised */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, ®); - if (rc) - return rc; - reg |= PCI_DVSEC_CXL_DISABLE_CACHING; - /* - * DEVCTL2 bits are written only once. So check WB+I capability while - * keeping disable caching set. - */ - if (cap & PCI_DVSEC_CXL_CACHE_WBI_CAPABLE) - reg |= PCI_DVSEC_CXL_INIT_CACHE_WBI; - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - - /* - * From Section 9.6: "Software may leverage the cache size reported in - * the DVSEC CXL Capability2 register to compute a suitable timeout - * value". - * Given there is no conversion factor for cache size -> timeout, - * setting timer for default 10ms. 
- */ - do { - if (timeout_tot_us == 0) - return -ETIMEDOUT; - usleep_range(timeout_us, timeout_us + 1); - timeout_tot_us -= timeout_us; - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, - ®); - if (rc) - return rc; - } while (!(reg & PCI_DVSEC_CXL_CACHE_INV)); - - return 0; -} - -static int cxl_reset_init(struct pci_dev *dev, u16 dvsec) -{ - /* - * Timeout values ref CXL Spec v3.2 Ch 8 Control and Status Registers, - * under section 8.1.3.1 DVSEC CXL Capability. - */ - u32 reset_timeouts_ms[] = { 10, 100, 1000, 10000, 100000 }; - u16 reg; - u32 timeout_ms; - int rc, ind; - - /* Check if CXL Reset MEM CLR is supported. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, ®); - if (rc) - return rc; - - if (reg & PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, - ®); - if (rc) - return rc; - - reg |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - } - - /* Read timeout value. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, ®); - if (rc) - return rc; - ind = FIELD_GET(PCI_DVSEC_CXL_RST_TIMEOUT, reg); - timeout_ms = reset_timeouts_ms[ind]; - - /* Write reset config. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, ®); - if (rc) - return rc; - - reg |= PCI_DVSEC_CXL_INIT_CXL_RST; - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - - /* Wait till timeout and then check reset status is complete. 
*/ - msleep(timeout_ms); - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_STATUS2, ®); - if (rc) - return rc; - if (reg & PCI_DVSEC_CXL_RST_ERR || - ~reg & PCI_DVSEC_CXL_RST_DONE) - return -ETIMEDOUT; - - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, ®); - if (rc) - return rc; - reg &= (~PCI_DVSEC_CXL_DISABLE_CACHING); - pci_write_config_word(dev, dvsec + PCI_DVSEC_CXL_CTRL2, reg); - - return 0; -} - -/** - * cxl_reset - initiate a cxl reset - * @dev: device to reset - * @probe: if true, return 0 if device can be reset this way - * - * Initiate a cxl reset on @dev. - */ -static int cxl_reset(struct pci_dev *dev, bool probe) -{ - u16 dvsec, reg; - int rc; - - dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, - PCI_DVSEC_CXL_DEVICE); - if (!dvsec) - return -ENOTTY; - - /* Check if CXL Reset is supported. */ - rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_CAP, ®); - if (rc) - return -ENOTTY; - - if ((reg & PCI_DVSEC_CXL_RST_CAPABLE) == 0) - return -ENOTTY; - - if (probe) - return 0; - - rc = cxl_reset_prepare(dev, dvsec); - if (rc) - return rc; - - return cxl_reset_init(dev, dvsec); -} - void pci_dev_lock(struct pci_dev *dev) { /* block PM suspend, driver probe, etc. */ @@ -5155,7 +5033,15 @@ void pci_dev_unlock(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_dev_unlock); -static void pci_dev_save_and_disable(struct pci_dev *dev) +/** + * pci_dev_save_and_disable - Save device state and disable it + * @dev: PCI device to save and disable + * + * Save the PCI configuration state, invoke the driver's reset_prepare + * callback (if any), and disable the device by clearing the Command register. + * The device lock must be held by the caller. + */ +void pci_dev_save_and_disable(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? 
dev->driver->err_handler : NULL; @@ -5188,8 +5074,16 @@ static void pci_dev_save_and_disable(struct pci_dev *dev) */ pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); } +EXPORT_SYMBOL_GPL(pci_dev_save_and_disable); -static void pci_dev_restore(struct pci_dev *dev) +/** + * pci_dev_restore - Restore device state after reset + * @dev: PCI device to restore + * + * Restore the saved PCI configuration state and invoke the driver's + * reset_done callback (if any). The device lock must be held by the caller. + */ +void pci_dev_restore(struct pci_dev *dev) { const struct pci_error_handlers *err_handler = dev->driver ? dev->driver->err_handler : NULL; @@ -5206,6 +5100,7 @@ static void pci_dev_restore(struct pci_dev *dev) else if (dev->driver) pci_warn(dev, "reset done"); } +EXPORT_SYMBOL_GPL(pci_dev_restore); /* dev->reset_methods[] is a 0-terminated list of indices into this array */ const struct pci_reset_fn_method pci_reset_fn_methods[] = { @@ -5214,7 +5109,6 @@ const struct pci_reset_fn_method pci_reset_fn_methods[] = { { pci_dev_acpi_reset, .name = "acpi" }, { pcie_reset_flr, .name = "flr" }, { pci_af_flr, .name = "af_flr" }, - { cxl_reset, .name = "cxl_reset" }, { pci_pm_reset, .name = "pm" }, { pci_reset_bus_function, .name = "bus" }, { cxl_reset_bus_function, .name = "cxl_bus" }, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 13fa71f965900..cae087dec6fb1 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -245,15 +245,33 @@ struct pci_cap_saved_state { struct pci_cap_saved_data cap; }; +/* + * Virtual extended cap ID for CXL DVSEC state in the cap save chain. 
+ */ +#define PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL 0xFFFF +static_assert(PCI_EXT_CAP_ID_MAX < PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL); + void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size); +int pci_add_virtual_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, + unsigned int size); struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap); +#ifdef CONFIG_PCI_CXL +void pci_allocate_cxl_save_buffer(struct pci_dev *dev); +void pci_save_cxl_state(struct pci_dev *dev); +void pci_restore_cxl_state(struct pci_dev *dev); +#else +static inline void pci_allocate_cxl_save_buffer(struct pci_dev *dev) { } +static inline void pci_save_cxl_state(struct pci_dev *dev) { } +static inline void pci_restore_cxl_state(struct pci_dev *dev) { } +#endif + #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h new file mode 100644 index 0000000000000..1c496c1e846c2 --- /dev/null +++ b/include/cxl/cxl.h @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ +/* Copyright(c) 2026 Advanced Micro Devices, Inc. */ + +#ifndef __CXL_CXL_H__ +#define __CXL_CXL_H__ + +#include +#include +#include +#include +#include + +/** + * enum cxl_devtype - delineate type-2 from a generic type-3 device + * @CXL_DEVTYPE_DEVMEM: Vendor specific CXL Type-2 device implementing HDM-D or + * HDM-DB, no requirement that this device implements a + * mailbox, or other memory-device-standard manageability + * flows. 
+ * @CXL_DEVTYPE_CLASSMEM: Common class definition of a CXL Type-3 device with + * HDM-H and class-mandatory memory device registers + */ +enum cxl_devtype { + CXL_DEVTYPE_DEVMEM, + CXL_DEVTYPE_CLASSMEM, +}; + +struct device; + +/* + * Using struct_group() allows for per register-block-type helper routines, + * without requiring block-type agnostic code to include the prefix. + */ +struct cxl_regs { + /* + * Common set of CXL Component register block base pointers + * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure + * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure + */ + struct_group_tagged(cxl_component_regs, component, + void __iomem *hdm_decoder; + void __iomem *ras; + ); + /* + * Common set of CXL Device register block base pointers + * @status: CXL 2.0 8.2.8.3 Device Status Registers + * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers + * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers + */ + struct_group_tagged(cxl_device_regs, device_regs, + void __iomem *status, *mbox, *memdev; + ); + + struct_group_tagged(cxl_pmu_regs, pmu_regs, + void __iomem *pmu; + ); + + /* + * RCH downstream port specific RAS register + * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB + */ + struct_group_tagged(cxl_rch_regs, rch_regs, + void __iomem *dport_aer; + ); + + /* + * RCD upstream port specific PCIe cap register + * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB + */ + struct_group_tagged(cxl_rcd_regs, rcd_regs, + void __iomem *rcd_pcie_cap; + ); +}; + +#define CXL_CM_CAP_CAP_ID_RAS 0x2 +#define CXL_CM_CAP_CAP_ID_HDM 0x5 +#define CXL_CM_CAP_CAP_HDM_VERSION 1 + +/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K + +/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 
+#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) + +/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE BIT(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ +#define CXL_DECODER_MIN_GRANULARITY 256 +#define CXL_DECODER_MAX_ENCODED_IG 6 + +static inline int cxl_hdm_decoder_count(u32 cap_hdr) +{ + int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); + + return val ? 
val * 2 : 1; +} + +struct cxl_reg_map { + bool valid; + int id; + unsigned long offset; + unsigned long size; +}; + +struct cxl_component_reg_map { + struct cxl_reg_map hdm_decoder; + struct cxl_reg_map ras; +}; + +struct cxl_device_reg_map { + struct cxl_reg_map status; + struct cxl_reg_map mbox; + struct cxl_reg_map memdev; +}; + +struct cxl_pmu_reg_map { + struct cxl_reg_map pmu; +}; + +/** + * struct cxl_register_map - DVSEC harvested register block mapping parameters + * @host: device for devm operations and logging + * @base: virtual base of the register-block-BAR + @block_offset + * @resource: physical resource base of the register block + * @max_size: maximum mapping size to perform register search + * @reg_type: see enum cxl_regloc_type + * @component_map: cxl_reg_map for component registers + * @device_map: cxl_reg_maps for device registers + * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + */ +struct cxl_register_map { + struct device *host; + void __iomem *base; + resource_size_t resource; + resource_size_t max_size; + u8 reg_type; + union { + struct cxl_component_reg_map component_map; + struct cxl_device_reg_map device_map; + struct cxl_pmu_reg_map pmu_map; + }; +}; + +/** + * struct cxl_dpa_perf - DPA performance property entry + * @dpa_range: range for DPA address + * @coord: QoS performance data (i.e. latency, bandwidth) + * @cdat_coord: raw QoS performance data from CDAT + * @qos_class: QoS Class cookies + */ +struct cxl_dpa_perf { + struct range dpa_range; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; + int qos_class; +}; + +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + +/** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. 
ram, pmem, dynamic... + */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + +#define CXL_NR_PARTITIONS_MAX 2 + +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + +/** + * struct cxl_attach_region - accelerator region handling + * @attach: invoked at cxl_memdev_attach_region() with endpoint device locked. + * @detach: invoked at endpoint release. + * @data: pointer referencing accelerator data for attach and detach calls. + * @region: initialised with autodiscovered region values linked to memdev. + */ +struct cxl_attach_region { + int (*attach)(void *); + void (*detach)(void *); + void *data; + struct range region; +}; + +/** + * struct cxl_dev_state - The driver device state + * + * cxl_dev_state represents the CXL driver/device state. It provides an + * interface to mailbox commands as well as some cached data about the device. + * Currently only memory devices are represented. + * + * @dev: The device associated with this CXL state + * @cxlmd: The device representing the CXL.mem capabilities of @dev + * @reg_map: component and ras register mapping parameters + * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC + * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) + * @media_ready: Indicate whether the device media is usable + * @dpa_res: Overall DPA resource tree for the device + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions + * @serial: PCIe Device Serial Number + * @type: Generic Memory Class device or Vendor Specific Memory device + * @cxl_mbox: CXL mailbox context + * @cxlfs: CXL features context + */ +struct cxl_dev_state { + /* public for Type2 drivers */ + struct device *dev; + struct cxl_memdev *cxlmd; + + /* private for Type2 drivers */ + struct cxl_register_map reg_map; + struct cxl_device_regs regs; + int cxl_dvsec; + bool rcd; + bool media_ready; + struct resource dpa_res; + 
struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; + u64 serial; + enum cxl_devtype type; + struct cxl_mailbox cxl_mbox; +#ifdef CONFIG_CXL_FEATURES + struct cxl_features_state *cxlfs; +#endif +}; + +struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev, + enum cxl_devtype type, + u64 serial, u16 dvsec, + size_t size, bool has_mbox); + +/** + * cxl_dev_state_create - safely create and cast a cxl dev state embedded in a + * driver specific struct. + * + * @parent: device behind the request + * @type: CXL device type + * @serial: device identification + * @dvsec: dvsec capability offset + * @drv_struct: driver struct embedding a cxl_dev_state struct + * @member: name of the struct cxl_dev_state member in drv_struct + * @mbox: true if mailbox supported + * + * Returns a pointer to the drv_struct allocated and embedding a cxl_dev_state + * struct initialized. + * + * Introduced for Type2 driver support. + */ +#define devm_cxl_dev_state_create(parent, type, serial, dvsec, drv_struct, member, mbox) \ + ({ \ + static_assert(__same_type(struct cxl_dev_state, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_devm_cxl_dev_state_create(parent, type, serial, dvsec, \ + sizeof(drv_struct), mbox); \ + }) + +int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); +struct cxl_region; +int cxl_memdev_attach_region(struct cxl_memdev *cxlmd, struct cxl_attach_region *attach); + +#ifdef CONFIG_CXL_REGION +bool cxl_region_contains_soft_reserve(struct resource *res); +#else +static inline bool cxl_region_contains_soft_reserve(struct resource *res) +{ + return false; +} +#endif +#endif /* __CXL_CXL_H__ */ diff --git a/include/cxl/pci.h b/include/cxl/pci.h new file mode 100644 index 0000000000000..edbf980c283f1 --- /dev/null +++ b/include/cxl/pci.h @@ -0,0 +1,25 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ + +#ifndef __CXL_CXL_PCI_H__ +#define __CXL_CXL_PCI_H__ + +/* Register Block Identifier (RBI) */ +enum cxl_regloc_type { + CXL_REGLOC_RBI_EMPTY = 0, + CXL_REGLOC_RBI_COMPONENT, + CXL_REGLOC_RBI_VIRT, + CXL_REGLOC_RBI_MEMDEV, + CXL_REGLOC_RBI_PMU, + CXL_REGLOC_RBI_TYPES +}; + +struct pci_dev; +struct cxl_register_map; + +int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +int cxl_setup_regs(struct cxl_register_map *map); +#endif diff --git a/include/linux/pci.h b/include/linux/pci.h index 634239799686b..a0724e3501b25 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -51,7 +51,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 9 +#define PCI_NUM_RESET_METHODS 8 #define PCI_RESET_PROBE true #define PCI_RESET_DO_RESET false @@ -2008,6 +2008,9 @@ int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) +void pci_dev_save_and_disable(struct pci_dev *dev); +void pci_dev_restore(struct pci_dev *dev); + /* * PCI domain support. 
Sometimes called PCI segment (eg by ACPI), * a PCI domain is defined to be a set of PCI buses which share diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 543275ff9ed62..c4a6c0a916dc7 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1358,15 +1358,18 @@ #define PCI_DVSEC_CXL_RST_MEM_CLR_CAPABLE _BITUL(11) #define PCI_DVSEC_CXL_CTRL 0xC #define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) +#define PCI_DVSEC_CXL_CTRL_RWL 0x5FED #define PCI_DVSEC_CXL_CTRL2 0x10 -#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) +#define PCI_DVSEC_CXL_DISABLE_CACHING _BITUL(0) #define PCI_DVSEC_CXL_INIT_CACHE_WBI _BITUL(1) #define PCI_DVSEC_CXL_INIT_CXL_RST _BITUL(2) #define PCI_DVSEC_CXL_RST_MEM_CLR_EN _BITUL(3) #define PCI_DVSEC_CXL_STATUS2 0x12 #define PCI_DVSEC_CXL_CACHE_INV _BITUL(0) #define PCI_DVSEC_CXL_RST_DONE _BITUL(1) -#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) +#define PCI_DVSEC_CXL_RST_ERR _BITUL(2) +#define PCI_DVSEC_CXL_LOCK 0x14 +#define PCI_DVSEC_CXL_LOCK_CONFIG _BITUL(0) #define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) @@ -1374,13 +1377,16 @@ #define PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT __GENMASK(15, 13) #define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) #define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_BASE_HI_RWL 0xFFFFFFFF #define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_BASE_LOW __GENMASK(31, 28) +#define PCI_DVSEC_CXL_RANGE_BASE_LO_RWL 0xF0000000 #define CXL_DVSEC_RANGE_MAX 2 /* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */ #define PCI_DVSEC_CXL_FUNCTION_MAP 2 +#define PCI_DVSEC_CXL_FUNCTION_MAP_REG 0x0C /* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */ #define PCI_DVSEC_CXL_PORT 3 diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index cb87e8c0e63c0..79f42f4474d47 100644 --- 
a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1716,7 +1716,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - mds = cxl_memdev_state_create(dev); + mds = cxl_memdev_state_create(dev, pdev->id + 1, 0); if (IS_ERR(mds)) return PTR_ERR(mds); @@ -1732,7 +1732,6 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf; INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work); - cxlds->serial = pdev->id + 1; if (is_rcd(pdev)) cxlds->rcd = true;