From aa667c6408d20a84c7637420bc3b7aa0abab59a2 Mon Sep 17 00:00:00 2001 From: James Puthukattukaran Date: Mon, 9 Jul 2018 11:31:25 -0400 Subject: [PATCH 1/4] PCI: Workaround IDT switch ACS Source Validation erratum Some IDT switches incorrectly flag an ACS Source Validation error on completions for config read requests even though PCIe r4.0, sec 6.12.1.1, says that completions are never affected by ACS Source Validation. Here's the text of IDT 89H32H8G3-YC, erratum #36: Item #36 - Downstream port applies ACS Source Validation to Completions Section 6.12.1.1 of the PCI Express Base Specification 3.1 states that completions are never affected by ACS Source Validation. However, completions received by a downstream port of the PCIe switch from a device that has not yet captured a PCIe bus number are incorrectly dropped by ACS Source Validation by the switch downstream port. Workaround: Issue a CfgWr1 to the downstream device before issuing the first CfgRd1 to the device. This allows the downstream device to capture its bus number; ACS Source Validation no longer stops completions from being forwarded by the downstream port. It has been observed that Microsoft Windows implements this workaround already; however, some versions of Linux and other operating systems may not. When doing the first config read to probe for a device, if the device is behind an IDT switch with this erratum: 1. Disable ACS Source Validation if enabled 2. Wait for device to become ready to accept config accesses (by using the Config Request Retry Status mechanism) 3. Do a config write to the endpoint 4. Enable ACS Source Validation (if it was enabled to begin with) The workaround suggested by IDT is basically only step 3, but we don't know when the device is ready to accept config requests. That means we need to do config reads until we receive a non-Config Request Retry Status, which means we need to disable ACS SV temporarily. Signed-off-by: James Puthukattukaran [bhelgaas: changelog, clean up whitespace, fold in unused variable fix from Anders Roxell ] Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- drivers/pci/pci.h | 4 ++++ drivers/pci/probe.c | 22 ++++++++++++++++-- drivers/pci/quirks.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index c358e7a07f3f..70808c168fb9 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -225,6 +225,10 @@ enum pci_bar_type { int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); +bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, + int crs_timeout); +int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); + int pci_setup_device(struct pci_dev *dev); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int reg); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ac876e32de4b..7c0c8ab94bcf 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2156,8 +2156,8 @@ static bool pci_bus_wait_crs(struct pci_bus *bus, int devfn, u32 *l, return true; } -bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l, - int timeout) +bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l, + int timeout) { if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l)) return false; @@ -2172,6 +2172,24 @@ bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l, return true; } + +bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l, + int timeout) +{ +#ifdef CONFIG_PCI_QUIRKS + struct pci_dev *bridge = bus->self; + + /* + * Certain IDT switches have an issue where they improperly trigger + * ACS Source Validation errors on completions for config reads. + */ + if (bridge && bridge->vendor == PCI_VENDOR_ID_IDT && + bridge->device == 0x80b5) + return pci_idt_bus_quirk(bus, devfn, l, timeout); +#endif + + return pci_bus_generic_read_dev_vendor_id(bus, devfn, l, timeout); +} EXPORT_SYMBOL(pci_bus_read_dev_vendor_id); /* diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f439de848658..cab2d5f922a9 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4753,3 +4753,58 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda); DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_MULTIMEDIA_HD_AUDIO, 8, quirk_gpu_hda); + +/* + * Some IDT switches incorrectly flag an ACS Source Validation error on + * completions for config read requests even though PCIe r4.0, sec + * 6.12.1.1, says that completions are never affected by ACS Source + * Validation. Here's the text of IDT 89H32H8G3-YC, erratum #36: + * + * Item #36 - Downstream port applies ACS Source Validation to Completions + * Section 6.12.1.1 of the PCI Express Base Specification 3.1 states that + * completions are never affected by ACS Source Validation. However, + * completions received by a downstream port of the PCIe switch from a + * device that has not yet captured a PCIe bus number are incorrectly + * dropped by ACS Source Validation by the switch downstream port. + * + * The workaround suggested by IDT is to issue a config write to the + * downstream device before issuing the first config read. This allows the + * downstream device to capture its bus and device numbers (see PCIe r4.0, + * sec 2.2.9), thus avoiding the ACS error on the completion. + * + * However, we don't know when the device is ready to accept the config + * write, so we do config reads until we receive a non-Config Request Retry + * Status, then do the config write. + * + * To avoid hitting the erratum when doing the config reads, we disable ACS + * SV around this process. + */ +int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *l, int timeout) +{ + int pos; + u16 ctrl = 0; + bool found; + struct pci_dev *bridge = bus->self; + + pos = pci_find_ext_capability(bridge, PCI_EXT_CAP_ID_ACS); + + /* Disable ACS SV before initial config reads */ + if (pos) { + pci_read_config_word(bridge, pos + PCI_ACS_CTRL, &ctrl); + if (ctrl & PCI_ACS_SV) + pci_write_config_word(bridge, pos + PCI_ACS_CTRL, + ctrl & ~PCI_ACS_SV); + } + + found = pci_bus_generic_read_dev_vendor_id(bus, devfn, l, timeout); + + /* Write Vendor ID (read-only) so the endpoint latches its bus/dev */ + if (found) + pci_bus_write_config_word(bus, devfn, PCI_VENDOR_ID, 0); + + /* Re-enable ACS_SV if it was previously enabled */ + if (ctrl & PCI_ACS_SV) + pci_write_config_word(bridge, pos + PCI_ACS_CTRL, ctrl); + + return found; +} From 2d1ce5ec2117d16047334a1aa4b62e0cfb5a0605 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Mon, 6 Aug 2018 18:25:35 -0500 Subject: [PATCH 2/4] PCI: Check for PCIe Link downtraining When both ends of a PCIe Link are capable of a higher bandwidth than is currently in use, the Link is said to be "downtrained". A downtrained Link may indicate hardware or configuration problems in the system, but it's hard to identify such Links from userspace. Refactor pcie_print_link_status() so it continues to always print PCIe bandwidth information, as several NIC drivers desire. Add a new internal __pcie_print_link_status() to emit a message only when a device's bandwidth is constrained by the fabric and call it from the PCI core for all devices, which identifies all downtrained Links. It also emits messages for a few cases that are technically not downtrained, such as a x4 device in an open-ended x1 slot. Signed-off-by: Alexandru Gagniuc [bhelgaas: changelog, move __pcie_print_link_status() declaration to drivers/pci/, rename pcie_check_upstream_link() to pcie_report_downtraining()] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 27 ++++++++++++++++++++------- drivers/pci/pci.h | 1 + drivers/pci/probe.c | 21 +++++++++++++++++++++ 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 97acba712e4e..a84d341504a5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5264,14 +5264,16 @@ u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, } /** - * pcie_print_link_status - Report the PCI device's link speed and width + * __pcie_print_link_status - Report the PCI device's link speed and width * @dev: PCI device to query + * @verbose: Print info even when enough bandwidth is available * - * Report the available bandwidth at the device. If this is less than the - * device is capable of, report the device's maximum possible bandwidth and - * the upstream link that limits its performance to less than that. + * If the available bandwidth at the device is less than the device is + * capable of, report the device's maximum possible bandwidth and the + * upstream link that limits its performance. If @verbose, always print + * the available bandwidth, even if the device isn't constrained. */ -void pcie_print_link_status(struct pci_dev *dev) +void __pcie_print_link_status(struct pci_dev *dev, bool verbose) { enum pcie_link_width width, width_cap; enum pci_bus_speed speed, speed_cap; @@ -5281,11 +5283,11 @@ void pcie_print_link_status(struct pci_dev *dev) bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap); bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width); - if (bw_avail >= bw_cap) + if (bw_avail >= bw_cap && verbose) pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth (%s x%d link)\n", bw_cap / 1000, bw_cap % 1000, PCIE_SPEED2STR(speed_cap), width_cap); - else + else if (bw_avail < bw_cap) pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)\n", bw_avail / 1000, bw_avail % 1000, PCIE_SPEED2STR(speed), width, @@ -5293,6 +5295,17 @@ void pcie_print_link_status(struct pci_dev *dev) bw_cap / 1000, bw_cap % 1000, PCIE_SPEED2STR(speed_cap), width_cap); } + +/** + * pcie_print_link_status - Report the PCI device's link speed and width + * @dev: PCI device to query + * + * Report the available bandwidth at the device. + */ +void pcie_print_link_status(struct pci_dev *dev) +{ + __pcie_print_link_status(dev, true); +} EXPORT_SYMBOL(pcie_print_link_status); /** diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 70808c168fb9..ce880dab5bc8 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -263,6 +263,7 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev); enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev); u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); +void __pcie_print_link_status(struct pci_dev *dev, bool verbose); /* Single Root I/O Virtualization */ struct pci_sriov { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 7c0c8ab94bcf..71412db3cbeb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2223,6 +2223,25 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) return dev; } +static void pcie_report_downtraining(struct pci_dev *dev) +{ + if (!pci_is_pcie(dev)) + return; + + /* Look from the device up to avoid downstream ports with no devices */ + if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ENDPOINT) && + (pci_pcie_type(dev) != PCI_EXP_TYPE_LEG_END) && + (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM)) + return; + + /* Multi-function PCIe devices share the same link/status */ + if (PCI_FUNC(dev->devfn) != 0 || dev->is_virtfn) + return; + + /* Print link status only if the device is constrained by the fabric */ + __pcie_print_link_status(dev, false); +} + static void pci_init_capabilities(struct pci_dev *dev) { /* Enhanced Allocation */ @@ -2258,6 +2277,8 @@ static void pci_init_capabilities(struct pci_dev *dev) /* Advanced Error Reporting */ pci_aer_init(dev); + pcie_report_downtraining(dev); + if (pci_probe_reset_function(dev) == 0) dev->reset_fn = 1; } From 3dbe97efe8bf450b183d6dee2305cbc032e6b8a4 Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Mon, 13 Aug 2018 12:19:39 -0600 Subject: [PATCH 3/4] PCI: Skip MPS logic for Virtual Functions (VFs) PCIe r4.0, sec 9.3.5.4, "Device Control Register", shows both Max_Payload_Size (MPS) and Max_Read_request_Size (MRRS) to be 'RsvdP' for VFs. Just prior to the table it states: "PF and VF functionality is defined in Section 7.5.3.4 except where noted in Table 9-16. For VF fields marked 'RsvdP', the PF setting applies to the VF." All of which implies that with respect to Max_Payload_Size Supported (MPSS), MPS, and MRRS values, we should not be paying any attention to the VF's fields, but rather only to the PF's. Only looking at the PF's fields also logically makes sense as it's the sole physical interface to the PCIe bus. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200527 Fixes: 27d868b5e6cf ("PCI: Set MPS to match upstream bridge") Signed-off-by: Myron Stowe Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # 4.3+ Cc: Keith Busch Cc: Sinan Kaya Cc: Dongdong Liu Cc: Jon Mason --- drivers/pci/probe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 71412db3cbeb..c2533ed45fff 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1730,6 +1730,10 @@ static void pci_configure_mps(struct pci_dev *dev) if (!pci_is_pcie(dev) || !bridge || !pci_is_pcie(bridge)) return; + /* MPS and MRRS fields are of type 'RsvdP' for VFs, short-circuit out */ + if (dev->is_virtfn) + return; + mps = pcie_get_mps(dev); p_mps = pcie_get_mps(bridge); From 9f0e89359775ee21fe1ea732e34edb52aef5addf Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Mon, 13 Aug 2018 12:19:46 -0600 Subject: [PATCH 4/4] PCI: Match Root Port's MPS to endpoint's MPSS as necessary In commit 27d868b5e6cf ("PCI: Set MPS to match upstream bridge"), we made sure every device's MPS setting matches its upstream bridge, making it more likely that a hot-added device will work in a system with an optimized MPS configuration. Recently I've started encountering systems where the endpoint device's MPSS capability is less than its Root Port's current MPS value, thus the endpoint is not capable of matching its upstream bridge's MPS setting (see: bugzilla via "Link:" below). This leaves the system vulnerable - the upstream Root Port could respond with larger TLPs than the device can handle, and the device will consider them to be 'Malformed'. One could use the "pci=pcie_bus_safe" kernel parameter to work around the issue, but that forces a user to supply a kernel parameter to get the system to function reliably and may end up limiting MPS settings of other unrelated, sub-topologies which could benefit from maintaining their larger values. Augment Keith's approach to include tuning down a Root Port's MPS setting when its hot-added endpoint device is not capable of matching it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200527 Signed-off-by: Myron Stowe Signed-off-by: Bjorn Helgaas Acked-by: Jon Mason Cc: Keith Busch Cc: Sinan Kaya Cc: Dongdong Liu --- drivers/pci/probe.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c2533ed45fff..bca6d2741969 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1725,7 +1725,7 @@ int pci_setup_device(struct pci_dev *dev) static void pci_configure_mps(struct pci_dev *dev) { struct pci_dev *bridge = pci_upstream_bridge(dev); - int mps, p_mps, rc; + int mps, mpss, p_mps, rc; if (!pci_is_pcie(dev) || !bridge || !pci_is_pcie(bridge)) return; @@ -1753,6 +1753,14 @@ static void pci_configure_mps(struct pci_dev *dev) if (pcie_bus_config != PCIE_BUS_DEFAULT) return; + mpss = 128 << dev->pcie_mpss; + if (mpss < p_mps && pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) { + pcie_set_mps(bridge, mpss); + pci_info(dev, "Upstream bridge's Max Payload Size set to %d (was %d, max %d)\n", + mpss, p_mps, 128 << bridge->pcie_mpss); + p_mps = pcie_get_mps(bridge); + } + rc = pcie_set_mps(dev, p_mps); if (rc) { pci_warn(dev, "can't set Max Payload Size to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n", @@ -1761,7 +1769,7 @@ static void pci_configure_mps(struct pci_dev *dev) } pci_info(dev, "Max Payload Size set to %d (was %d, max %d)\n", - p_mps, mps, 128 << dev->pcie_mpss); + p_mps, mps, mpss); } static struct hpp_type0 pci_default_type0 = {