Discussion:
[PATCH 1/1] pci: fix dmar fault for kdump kernel
Li, ZhenHua
2014-10-14 09:34:42 UTC
Permalink
I tested on the latest stable version 3.17, it works well.
On a HP system with Intel vt-d supported and many PCI devices on it,
when kernel crashed and the kdump kernel boots with intel_iommu=on,
there may be some unexpected DMA requests on this adapter, which will
dmar: DRHD: handling fault status reg 102
dmar: DMAR:[DMA Read] Request device [41:00.0] fault addr fff81000
DMAR:[fault reason 01] Present bit in root entry is clear
This bug may happen on *any* PCI device.
static struct context_entry * device_to_context_entry(
struct intel_iommu *iommu, u8 bus, u8 devfn)
{
......
set_root_present(root);
......
}
device driver
intel_alloc_coherent
__intel_map_single
domain_context_mapping
domain_context_mapping_one
device_to_context_entry
This means, the present bit in root entry will not be set until the device
driver is loaded.
But in the kdump kernel, hardware devices are not aware that control has
transferred to the second kernel, and those drivers must initialize again.
Consequently there may be unexpected DMA requests from devices activity
initiated in the first kernel leading to the DMA Remapping errors in the
second kernel.
To fix this DMAR fault, we need to reset the bus that this device on. Reset
the device itself does not work.
https://lkml.org/lkml/2014/9/30/55
As in discussion, this bug may happen on *any* device, so we need to reset all
pci devices.
https://lkml.org/lkml/2013/5/14/9
Add support for legacy PCI devices.
Use pci_try_reset_bus instead of do_downstream_device_reset in original version
Randy Wright corrects some misunderstanding in this description.
---
drivers/pci/pci.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2c9ac70..8cb146c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -23,6 +23,7 @@
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/pci_hotplug.h>
+#include <linux/crash_dump.h>
#include <asm-generic/pci-bridge.h>
#include <asm/setup.h>
#include "pci.h"
@@ -4423,6 +4424,89 @@ void __weak pci_fixup_cardbus(struct pci_bus *bus)
}
EXPORT_SYMBOL(pci_fixup_cardbus);
+/*
+ * Return true if dev is PCI root port or downstream port whose child is PCI
+ * endpoint except VGA device.
+ */
+static int __pci_dev_need_reset(struct pci_dev *dev)
+{
+ struct pci_bus *subordinate;
+ struct pci_dev *child;
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+ return 0;
+
+ if (pci_is_pcie(dev)) {
+ if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
+ (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
+ return 0;
+ }
+
+ subordinate = dev->subordinate;
+ list_for_each_entry(child, &subordinate->devices, bus_list) {
+ /* Don't reset switch, bridge, VGA device */
+ if ((child->hdr_type == PCI_HEADER_TYPE_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_DISPLAY))
+ return 0;
+
+ if (pci_is_pcie(child)) {
+ if ((pci_pcie_type(child) == PCI_EXP_TYPE_UPSTREAM) ||
+ (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+struct pci_dev_reset_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+int __init pci_reset_endpoints(void)
+{
+ struct pci_dev *dev = NULL;
+ struct pci_dev_reset_entry *pdev_entry, *tmp;
+ struct pci_bus *subordinate = NULL;
+ int has_it;
+
+ LIST_HEAD(pdev_list);
+
+ if (likely(!is_kdump_kernel()))
+ return 0;
+
+ for_each_pci_dev(dev) {
+ subordinate = dev->subordinate;
+ if (!subordinate || list_empty(&subordinate->devices))
+ continue;
+
+ has_it = 0;
+ list_for_each_entry(pdev_entry, &pdev_list, list) {
+ if (dev == pdev_entry->dev) {
+ has_it = 1;
+ break;
+ }
+ }
+ if (has_it)
+ continue;
+
+ if (__pci_dev_need_reset(dev)) {
+ pdev_entry = kmalloc(sizeof(*pdev_entry), GFP_KERNEL);
+ pdev_entry->dev = dev;
+ list_add(&pdev_entry->list, &pdev_list);
+ }
+ }
+
+ list_for_each_entry_safe(pdev_entry, tmp, &pdev_list, list) {
+ pci_try_reset_bus(pdev_entry->dev->subordinate);
+ kfree(pdev_entry);
+ }
+
+ return 0;
+}
+fs_initcall_sync(pci_reset_endpoints);
+
static int __init pci_setup(char *str)
{
while (str) {
Takao Indoh
2014-10-15 08:14:13 UTC
Permalink
Post by Li, ZhenHua
I tested on the latest stable version 3.17, it works well.
On a HP system with Intel vt-d supported and many PCI devices on it,
when kernel crashed and the kdump kernel boots with intel_iommu=on,
there may be some unexpected DMA requests on this adapter, which will
dmar: DRHD: handling fault status reg 102
dmar: DMAR:[DMA Read] Request device [41:00.0] fault addr fff81000
DMAR:[fault reason 01] Present bit in root entry is clear
This bug may happen on *any* PCI device.
static struct context_entry * device_to_context_entry(
struct intel_iommu *iommu, u8 bus, u8 devfn)
{
......
set_root_present(root);
......
}
device driver
intel_alloc_coherent
__intel_map_single
domain_context_mapping
domain_context_mapping_one
device_to_context_entry
This means, the present bit in root entry will not be set until the device
driver is loaded.
But in the kdump kernel, hardware devices are not aware that control has
transferred to the second kernel, and those drivers must initialize again.
Consequently there may be unexpected DMA requests from devices activity
initiated in the first kernel leading to the DMA Remapping errors in the
second kernel.
To fix this DMAR fault, we need to reset the bus that this device on. Reset
the device itself does not work.
https://lkml.org/lkml/2014/9/30/55
As in discussion, this bug may happen on *any* device, so we need to reset all
pci devices.
https://lkml.org/lkml/2013/5/14/9
As far as I can remember, the original patch was nacked by
the following reasons:

1) On sparc, the IOMMU is initialized before PCI devices are enumerated,
so there would still be a window where ongoing DMA could cause an
IOMMU error.

2) Basically Bjorn is thinking device reset should be done in the
1st kernel before jumping into 2nd kernel.

And Bill Sumner proposed another idea.
http://comments.gmane.org/gmane.linux.kernel.iommu/4828
I don't know the current status of this patch, but I think Jerry Hoemann
is working on this.

Thanks,
Takao Indoh
Post by Li, ZhenHua
Add support for legacy PCI devices.
Use pci_try_reset_bus instead of do_downstream_device_reset in original version
Randy Wright corrects some misunderstanding in this description.
---
drivers/pci/pci.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2c9ac70..8cb146c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -23,6 +23,7 @@
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/pci_hotplug.h>
+#include <linux/crash_dump.h>
#include <asm-generic/pci-bridge.h>
#include <asm/setup.h>
#include "pci.h"
@@ -4423,6 +4424,89 @@ void __weak pci_fixup_cardbus(struct pci_bus *bus)
}
EXPORT_SYMBOL(pci_fixup_cardbus);
+/*
+ * Return true if dev is PCI root port or downstream port whose child is PCI
+ * endpoint except VGA device.
+ */
+static int __pci_dev_need_reset(struct pci_dev *dev)
+{
+ struct pci_bus *subordinate;
+ struct pci_dev *child;
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+ return 0;
+
+ if (pci_is_pcie(dev)) {
+ if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
+ (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
+ return 0;
+ }
+
+ subordinate = dev->subordinate;
+ list_for_each_entry(child, &subordinate->devices, bus_list) {
+ /* Don't reset switch, bridge, VGA device */
+ if ((child->hdr_type == PCI_HEADER_TYPE_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_DISPLAY))
+ return 0;
+
+ if (pci_is_pcie(child)) {
+ if ((pci_pcie_type(child) == PCI_EXP_TYPE_UPSTREAM) ||
+ (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+struct pci_dev_reset_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+int __init pci_reset_endpoints(void)
+{
+ struct pci_dev *dev = NULL;
+ struct pci_dev_reset_entry *pdev_entry, *tmp;
+ struct pci_bus *subordinate = NULL;
+ int has_it;
+
+ LIST_HEAD(pdev_list);
+
+ if (likely(!is_kdump_kernel()))
+ return 0;
+
+ for_each_pci_dev(dev) {
+ subordinate = dev->subordinate;
+ if (!subordinate || list_empty(&subordinate->devices))
+ continue;
+
+ has_it = 0;
+ list_for_each_entry(pdev_entry, &pdev_list, list) {
+ if (dev == pdev_entry->dev) {
+ has_it = 1;
+ break;
+ }
+ }
+ if (has_it)
+ continue;
+
+ if (__pci_dev_need_reset(dev)) {
+ pdev_entry = kmalloc(sizeof(*pdev_entry), GFP_KERNEL);
+ pdev_entry->dev = dev;
+ list_add(&pdev_entry->list, &pdev_list);
+ }
+ }
+
+ list_for_each_entry_safe(pdev_entry, tmp, &pdev_list, list) {
+ pci_try_reset_bus(pdev_entry->dev->subordinate);
+ kfree(pdev_entry);
+ }
+
+ return 0;
+}
+fs_initcall_sync(pci_reset_endpoints);
+
static int __init pci_setup(char *str)
{
while (str) {
Li, ZhenHua
2014-10-15 08:31:28 UTC
Permalink
I still think resetting the devices is necessary. Reading your original
mails , I think reason 2 you mentioned, resetting the devices in the
first kernel, may be a good solution.

Well, I am now working on Bill's patch that you mentioned. As it is not
accepted yet, I will create a new version for latest stable kernel 3.17.

Thanks
Zhenhua
Post by Takao Indoh
Post by Li, ZhenHua
I tested on the latest stable version 3.17, it works well.
On a HP system with Intel vt-d supported and many PCI devices on it,
when kernel crashed and the kdump kernel boots with intel_iommu=on,
there may be some unexpected DMA requests on this adapter, which will
dmar: DRHD: handling fault status reg 102
dmar: DMAR:[DMA Read] Request device [41:00.0] fault addr fff81000
DMAR:[fault reason 01] Present bit in root entry is clear
This bug may happen on *any* PCI device.
static struct context_entry * device_to_context_entry(
struct intel_iommu *iommu, u8 bus, u8 devfn)
{
......
set_root_present(root);
......
}
device driver
intel_alloc_coherent
__intel_map_single
domain_context_mapping
domain_context_mapping_one
device_to_context_entry
This means, the present bit in root entry will not be set until the device
driver is loaded.
But in the kdump kernel, hardware devices are not aware that control has
transferred to the second kernel, and those drivers must initialize again.
Consequently there may be unexpected DMA requests from devices activity
initiated in the first kernel leading to the DMA Remapping errors in the
second kernel.
To fix this DMAR fault, we need to reset the bus that this device on. Reset
the device itself does not work.
https://lkml.org/lkml/2014/9/30/55
As in discussion, this bug may happen on *any* device, so we need to reset all
pci devices.
https://lkml.org/lkml/2013/5/14/9
As far as I can remember, the original patch was nacked by
1) On sparc, the IOMMU is initialized before PCI devices are enumerated,
so there would still be a window where ongoing DMA could cause an
IOMMU error.
2) Basically Bjorn is thinking device reset should be done in the
1st kernel before jumping into 2nd kernel.
And Bill Sumner proposed another idea.
http://comments.gmane.org/gmane.linux.kernel.iommu/4828
I don't know the current status of this patch, but I think Jerry Hoemann
is working on this.
Thanks,
Takao Indoh
Post by Li, ZhenHua
Add support for legacy PCI devices.
Use pci_try_reset_bus instead of do_downstream_device_reset in original version
Randy Wright corrects some misunderstanding in this description.
---
drivers/pci/pci.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2c9ac70..8cb146c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -23,6 +23,7 @@
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/pci_hotplug.h>
+#include <linux/crash_dump.h>
#include <asm-generic/pci-bridge.h>
#include <asm/setup.h>
#include "pci.h"
@@ -4423,6 +4424,89 @@ void __weak pci_fixup_cardbus(struct pci_bus *bus)
}
EXPORT_SYMBOL(pci_fixup_cardbus);
+/*
+ * Return true if dev is PCI root port or downstream port whose child is PCI
+ * endpoint except VGA device.
+ */
+static int __pci_dev_need_reset(struct pci_dev *dev)
+{
+ struct pci_bus *subordinate;
+ struct pci_dev *child;
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+ return 0;
+
+ if (pci_is_pcie(dev)) {
+ if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
+ (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
+ return 0;
+ }
+
+ subordinate = dev->subordinate;
+ list_for_each_entry(child, &subordinate->devices, bus_list) {
+ /* Don't reset switch, bridge, VGA device */
+ if ((child->hdr_type == PCI_HEADER_TYPE_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_DISPLAY))
+ return 0;
+
+ if (pci_is_pcie(child)) {
+ if ((pci_pcie_type(child) == PCI_EXP_TYPE_UPSTREAM) ||
+ (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+struct pci_dev_reset_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+int __init pci_reset_endpoints(void)
+{
+ struct pci_dev *dev = NULL;
+ struct pci_dev_reset_entry *pdev_entry, *tmp;
+ struct pci_bus *subordinate = NULL;
+ int has_it;
+
+ LIST_HEAD(pdev_list);
+
+ if (likely(!is_kdump_kernel()))
+ return 0;
+
+ for_each_pci_dev(dev) {
+ subordinate = dev->subordinate;
+ if (!subordinate || list_empty(&subordinate->devices))
+ continue;
+
+ has_it = 0;
+ list_for_each_entry(pdev_entry, &pdev_list, list) {
+ if (dev == pdev_entry->dev) {
+ has_it = 1;
+ break;
+ }
+ }
+ if (has_it)
+ continue;
+
+ if (__pci_dev_need_reset(dev)) {
+ pdev_entry = kmalloc(sizeof(*pdev_entry), GFP_KERNEL);
+ pdev_entry->dev = dev;
+ list_add(&pdev_entry->list, &pdev_list);
+ }
+ }
+
+ list_for_each_entry_safe(pdev_entry, tmp, &pdev_list, list) {
+ pci_try_reset_bus(pdev_entry->dev->subordinate);
+ kfree(pdev_entry);
+ }
+
+ return 0;
+}
+fs_initcall_sync(pci_reset_endpoints);
+
static int __init pci_setup(char *str)
{
while (str) {
Li, ZhenHua
2014-10-20 02:19:12 UTC
Permalink
Hi Takao Indoh,

According to this discussion
https://lkml.org/lkml/2014/10/17/107

It seems that we can not do the resetting on the first kernel. It can
only be called during kdump kernel boots.

Thanks
Zhenhua
Post by Takao Indoh
Post by Li, ZhenHua
I tested on the latest stable version 3.17, it works well.
On a HP system with Intel vt-d supported and many PCI devices on it,
when kernel crashed and the kdump kernel boots with intel_iommu=on,
there may be some unexpected DMA requests on this adapter, which will
dmar: DRHD: handling fault status reg 102
dmar: DMAR:[DMA Read] Request device [41:00.0] fault addr fff81000
DMAR:[fault reason 01] Present bit in root entry is clear
This bug may happen on *any* PCI device.
static struct context_entry * device_to_context_entry(
struct intel_iommu *iommu, u8 bus, u8 devfn)
{
......
set_root_present(root);
......
}
device driver
intel_alloc_coherent
__intel_map_single
domain_context_mapping
domain_context_mapping_one
device_to_context_entry
This means, the present bit in root entry will not be set until the device
driver is loaded.
But in the kdump kernel, hardware devices are not aware that control has
transferred to the second kernel, and those drivers must initialize again.
Consequently there may be unexpected DMA requests from devices activity
initiated in the first kernel leading to the DMA Remapping errors in the
second kernel.
To fix this DMAR fault, we need to reset the bus that this device on. Reset
the device itself does not work.
https://lkml.org/lkml/2014/9/30/55
As in discussion, this bug may happen on *any* device, so we need to reset all
pci devices.
https://lkml.org/lkml/2013/5/14/9
As far as I can remember, the original patch was nacked by
1) On sparc, the IOMMU is initialized before PCI devices are enumerated,
so there would still be a window where ongoing DMA could cause an
IOMMU error.
2) Basically Bjorn is thinking device reset should be done in the
1st kernel before jumping into 2nd kernel.
And Bill Sumner proposed another idea.
http://comments.gmane.org/gmane.linux.kernel.iommu/4828
I don't know the current status of this patch, but I think Jerry Hoemann
is working on this.
Thanks,
Takao Indoh
Post by Li, ZhenHua
Add support for legacy PCI devices.
Use pci_try_reset_bus instead of do_downstream_device_reset in original version
Randy Wright corrects some misunderstanding in this description.
---
drivers/pci/pci.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 2c9ac70..8cb146c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -23,6 +23,7 @@
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/pci_hotplug.h>
+#include <linux/crash_dump.h>
#include <asm-generic/pci-bridge.h>
#include <asm/setup.h>
#include "pci.h"
@@ -4423,6 +4424,89 @@ void __weak pci_fixup_cardbus(struct pci_bus *bus)
}
EXPORT_SYMBOL(pci_fixup_cardbus);
+/*
+ * Return true if dev is PCI root port or downstream port whose child is PCI
+ * endpoint except VGA device.
+ */
+static int __pci_dev_need_reset(struct pci_dev *dev)
+{
+ struct pci_bus *subordinate;
+ struct pci_dev *child;
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+ return 0;
+
+ if (pci_is_pcie(dev)) {
+ if ((pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) &&
+ (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM))
+ return 0;
+ }
+
+ subordinate = dev->subordinate;
+ list_for_each_entry(child, &subordinate->devices, bus_list) {
+ /* Don't reset switch, bridge, VGA device */
+ if ((child->hdr_type == PCI_HEADER_TYPE_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_BRIDGE) ||
+ ((child->class >> 16) == PCI_BASE_CLASS_DISPLAY))
+ return 0;
+
+ if (pci_is_pcie(child)) {
+ if ((pci_pcie_type(child) == PCI_EXP_TYPE_UPSTREAM) ||
+ (pci_pcie_type(child) == PCI_EXP_TYPE_PCI_BRIDGE))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+struct pci_dev_reset_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+int __init pci_reset_endpoints(void)
+{
+ struct pci_dev *dev = NULL;
+ struct pci_dev_reset_entry *pdev_entry, *tmp;
+ struct pci_bus *subordinate = NULL;
+ int has_it;
+
+ LIST_HEAD(pdev_list);
+
+ if (likely(!is_kdump_kernel()))
+ return 0;
+
+ for_each_pci_dev(dev) {
+ subordinate = dev->subordinate;
+ if (!subordinate || list_empty(&subordinate->devices))
+ continue;
+
+ has_it = 0;
+ list_for_each_entry(pdev_entry, &pdev_list, list) {
+ if (dev == pdev_entry->dev) {
+ has_it = 1;
+ break;
+ }
+ }
+ if (has_it)
+ continue;
+
+ if (__pci_dev_need_reset(dev)) {
+ pdev_entry = kmalloc(sizeof(*pdev_entry), GFP_KERNEL);
+ pdev_entry->dev = dev;
+ list_add(&pdev_entry->list, &pdev_list);
+ }
+ }
+
+ list_for_each_entry_safe(pdev_entry, tmp, &pdev_list, list) {
+ pci_try_reset_bus(pdev_entry->dev->subordinate);
+ kfree(pdev_entry);
+ }
+
+ return 0;
+}
+fs_initcall_sync(pci_reset_endpoints);
+
static int __init pci_setup(char *str)
{
while (str) {
Loading...