diff mbox

[RESEND,BUGFIX,1/3] PCI/AER: fix pci_ops return NULL when hotplug a pci bus which was doing aer error inject

Message ID 5038A210.4030005@huawei.com
State Superseded
Headers show

Commit Message

Yijing Wang Aug. 25, 2012, 9:59 a.m. UTC
When we inject AER errors into a target PCI device with the aer_inject module, the pci_ops of
the PCI bus that the target device is on are replaced with pci_ops_aer. So if the target PCI device
is a bridge, once we hotplug the PCI bus (child bus) that the target device bridges to, the child
bus's pci_ops are set to pci_ops_aer too. Every access to a device on the child bus then
results in a system panic, because the pci_ops lookup in pci_read_aer returns NULL.
This patch fixes this by falling back to the pci_ops of an upstream bus when no entry is found for the bus itself.

CallTrace:
bash[5908]: NaT consumption 17179869216 [1]
Modules linked in: aer_inject cpufreq_conservative cpufreq_userspace cpufreq_pow
ersave acpi_cpufreq binfmt_misc fuse nls_iso8859_1 loop ipmi_si(+) ipmi_devintf
ipmi_msghandler dm_mod ppdev iTCO_wdt iTCO_vendor_support sg igb parport_pc i2c_
i801 mptctl i2c_core serio_raw hid_generic lpc_ich mfd_core parport button conta
iner usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif ext3 mbca
che jbd fan processor ide_pci_generic ide_core ata_piix libata mptsas mptscsih m
ptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon

Pid: 5908, CPU 9, comm:                 bash
psr : 00001010085a2010 ifs : 800000000000048e ip  : [<a000000220b815b0>]    Not
tainted (3.5.0-rc6yijing-repo)
ip is at pci_read_aer+0x330/0x460 [aer_inject]
unat: 0000000000000000 pfs : 000000000000048e rsc : 0000000000000003
rnat: 0000000000000000 bsps: 0000000000000000 pr  : 65519aa6a6969aa5
ldrs: 0000000000000000 ccv : ffffffff00000001 fpsr: 0009804c8a70033f
csd : 0000000000000000 ssd : 0000000000000000
b0  : a000000220b815b0 b6  : a000000220b81280 b7  : a0000001006d56a0
f6  : 1003e0000000000000005 f7  : 1003e0000000000000028
f8  : 1003e00000000000000c8 f9  : 1003e0000000000000005
f10 : 1003e627ec1e2f4c0d8a7 f11 : 1003e0000000000000011
r1  : a0000001014e63c0 r2  : 0000000000000738 r3  : 000000000000fffe
r8  : 0000000000000736 r9  : 0000000000000042 r10 : e000001f08f4c898
r11 : 0000000000000000 r12 : e000000f3dfcfdc0 r13 : e000000f3dfc0000
r14 : 0000000000000738 r15 : 0000000000004000 r16 : a000000220b827c8
r17 : a000000220b827b8 r18 : ffffffffffffff00 r19 : e000000f073b0110
r20 : 0000000000000042 r21 : e000000f073b0114 r22 : 0000000000000000
r23 : e000000f073b0118 r24 : a0000001009e0e49 r25 : 0000000000000001
r26 : 0000000000007041 r27 : e000000f3dfcfde0 r28 : 0000000000000000
r29 : e000000f3dfcfc08 r30 : a000000220b827c8 r31 : e000001f074d6000

Call Trace:
 [<a000000100016500>] show_stack+0x80/0xa0
                                sp=e000000f3dfcf800 bsp=e000000f3dfc1758
 [<a000000100016b60>] show_regs+0x640/0x920
                                sp=e000000f3dfcf9d0 bsp=e000000f3dfc1700
 [<a000000100040770>] die+0x190/0x2c0
                                sp=e000000f3dfcf9e0 bsp=e000000f3dfc16c0
 [<a0000001000408f0>] die_if_kernel+0x50/0x80
                                sp=e000000f3dfcf9e0 bsp=e000000f3dfc1690
 [<a000000100903a90>] ia64_fault+0xf0/0x15e0
                                sp=e000000f3dfcf9e0 bsp=e000000f3dfc1640
 [<a00000010000c0a0>] ia64_native_leave_kernel+0x0/0x270
                                sp=e000000f3dfcfbf0 bsp=e000000f3dfc1640
 [<a000000220b815b0>] pci_read_aer+0x330/0x460 [aer_inject]
                                sp=e000000f3dfcfdc0 bsp=e000000f3dfc15c8
 [<a0000001004ace00>] pci_bus_read_config_dword+0xe0/0x140
                                sp=e000000f3dfcfdc0 bsp=e000000f3dfc1580
 [<a0000001004b0c10>] pci_bus_read_dev_vendor_id+0x50/0x200
                                sp=e000000f3dfcfdd0 bsp=e000000f3dfc1530
 [<a0000001008d3d10>] pci_scan_single_device+0x90/0x200
                                sp=e000000f3dfcfdd0 bsp=e000000f3dfc14f8
 [<a0000001004b24b0>] pci_scan_slot+0xb0/0x320
                                sp=e000000f3dfcfde0 bsp=e000000f3dfc14a8
 [<a0000001008d9e90>] pci_scan_child_bus+0x90/0x2e0
                                sp=e000000f3dfcfde0 bsp=e000000f3dfc1468
 [<a0000001008d9580>] pci_scan_bridge+0x540/0xdc0
                                sp=e000000f3dfcfde0 bsp=e000000f3dfc13d0
 [<a0000001008da0b0>] pci_scan_child_bus+0x2b0/0x2e0
                                sp=e000000f3dfcfe00 bsp=e000000f3dfc1390
 [<a0000001008d5bd0>] pci_rescan_bus+0x50/0x220
                                sp=e000000f3dfcfe00 bsp=e000000f3dfc1358
 [<a0000001004c2ab0>] bus_rescan_store+0xf0/0x160
                                sp=e000000f3dfcfe10 bsp=e000000f3dfc1328
 [<a0000001006110b0>] bus_attr_store+0x70/0xa0
                                sp=e000000f3dfcfe20 bsp=e000000f3dfc12f0
 [<a000000100343b00>] sysfs_write_file+0x240/0x340
                                sp=e000000f3dfcfe20 bsp=e000000f3dfc1298
 [<a00000010025e230>] vfs_write+0x1b0/0x3a0
                                sp=e000000f3dfcfe20 bsp=e000000f3dfc1250
 [<a00000010025e5e0>] sys_write+0x80/0x100
                                sp=e000000f3dfcfe20 bsp=e000000f3dfc11d0
 [<a00000010000bf20>] ia64_ret_from_syscall+0x0/0x20
                                sp=e000000f3dfcfe30 bsp=e000000f3dfc11d0
 [<a000000000040720>] __kernel_syscall_via_break+0x0/0x20
                                sp=e000000f3dfd0000 bsp=e000000f3dfc11d0
Disabling lock debugging due to kernel taint

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Jiang Liu <liuj97@gmail.com>
---
 drivers/pci/pcie/aer/aer_inject.c |   21 +++++++++++++++++++++
 1 files changed, 21 insertions(+), 0 deletions(-)

Comments

Huang, Ying Aug. 27, 2012, 1:23 a.m. UTC | #1
On Sat, 2012-08-25 at 17:59 +0800, Yijing Wang wrote:
> When we inject aer errors to the target pci device by aer_inject module, the pci_ops of pci
> bus which the target device is on will be assign to pci_ops_aer.So if the target pci device
> is a bridge, once we hotplug the pci bus(child bus) which the target device bridges to, child
> bus's pci_ops will be assigned to pci_ops_aer too.Now every access to the child bus's device
> will result to system panic, because it return NULL pci_ops in pci_read_aer.
> This patch fix this.
> 
> CallTrace:
> bash[5908]: NaT consumption 17179869216 [1]
> Modules linked in: aer_inject cpufreq_conservative cpufreq_userspace cpufreq_pow
> ersave acpi_cpufreq binfmt_misc fuse nls_iso8859_1 loop ipmi_si(+) ipmi_devintf
> ipmi_msghandler dm_mod ppdev iTCO_wdt iTCO_vendor_support sg igb parport_pc i2c_
> i801 mptctl i2c_core serio_raw hid_generic lpc_ich mfd_core parport button conta
> iner usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif ext3 mbca
> che jbd fan processor ide_pci_generic ide_core ata_piix libata mptsas mptscsih m
> ptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
> 
> Pid: 5908, CPU 9, comm:                 bash
> psr : 00001010085a2010 ifs : 800000000000048e ip  : [<a000000220b815b0>]    Not
> tainted (3.5.0-rc6yijing-repo)
> ip is at pci_read_aer+0x330/0x460 [aer_inject]
> unat: 0000000000000000 pfs : 000000000000048e rsc : 0000000000000003
> rnat: 0000000000000000 bsps: 0000000000000000 pr  : 65519aa6a6969aa5
> ldrs: 0000000000000000 ccv : ffffffff00000001 fpsr: 0009804c8a70033f
> csd : 0000000000000000 ssd : 0000000000000000
> b0  : a000000220b815b0 b6  : a000000220b81280 b7  : a0000001006d56a0
> f6  : 1003e0000000000000005 f7  : 1003e0000000000000028
> f8  : 1003e00000000000000c8 f9  : 1003e0000000000000005
> f10 : 1003e627ec1e2f4c0d8a7 f11 : 1003e0000000000000011
> r1  : a0000001014e63c0 r2  : 0000000000000738 r3  : 000000000000fffe
> r8  : 0000000000000736 r9  : 0000000000000042 r10 : e000001f08f4c898
> r11 : 0000000000000000 r12 : e000000f3dfcfdc0 r13 : e000000f3dfc0000
> r14 : 0000000000000738 r15 : 0000000000004000 r16 : a000000220b827c8
> r17 : a000000220b827b8 r18 : ffffffffffffff00 r19 : e000000f073b0110
> r20 : 0000000000000042 r21 : e000000f073b0114 r22 : 0000000000000000
> r23 : e000000f073b0118 r24 : a0000001009e0e49 r25 : 0000000000000001
> r26 : 0000000000007041 r27 : e000000f3dfcfde0 r28 : 0000000000000000
> r29 : e000000f3dfcfc08 r30 : a000000220b827c8 r31 : e000001f074d6000
> 
> Call Trace:
>  [<a000000100016500>] show_stack+0x80/0xa0
>                                 sp=e000000f3dfcf800 bsp=e000000f3dfc1758
>  [<a000000100016b60>] show_regs+0x640/0x920
>                                 sp=e000000f3dfcf9d0 bsp=e000000f3dfc1700
>  [<a000000100040770>] die+0x190/0x2c0
>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc16c0
>  [<a0000001000408f0>] die_if_kernel+0x50/0x80
>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc1690
>  [<a000000100903a90>] ia64_fault+0xf0/0x15e0
>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc1640
>  [<a00000010000c0a0>] ia64_native_leave_kernel+0x0/0x270
>                                 sp=e000000f3dfcfbf0 bsp=e000000f3dfc1640
>  [<a000000220b815b0>] pci_read_aer+0x330/0x460 [aer_inject]
>                                 sp=e000000f3dfcfdc0 bsp=e000000f3dfc15c8
>  [<a0000001004ace00>] pci_bus_read_config_dword+0xe0/0x140
>                                 sp=e000000f3dfcfdc0 bsp=e000000f3dfc1580
>  [<a0000001004b0c10>] pci_bus_read_dev_vendor_id+0x50/0x200
>                                 sp=e000000f3dfcfdd0 bsp=e000000f3dfc1530
>  [<a0000001008d3d10>] pci_scan_single_device+0x90/0x200
>                                 sp=e000000f3dfcfdd0 bsp=e000000f3dfc14f8
>  [<a0000001004b24b0>] pci_scan_slot+0xb0/0x320
>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc14a8
>  [<a0000001008d9e90>] pci_scan_child_bus+0x90/0x2e0
>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc1468
>  [<a0000001008d9580>] pci_scan_bridge+0x540/0xdc0
>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc13d0
>  [<a0000001008da0b0>] pci_scan_child_bus+0x2b0/0x2e0
>                                 sp=e000000f3dfcfe00 bsp=e000000f3dfc1390
>  [<a0000001008d5bd0>] pci_rescan_bus+0x50/0x220
>                                 sp=e000000f3dfcfe00 bsp=e000000f3dfc1358
>  [<a0000001004c2ab0>] bus_rescan_store+0xf0/0x160
>                                 sp=e000000f3dfcfe10 bsp=e000000f3dfc1328
>  [<a0000001006110b0>] bus_attr_store+0x70/0xa0
>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc12f0
>  [<a000000100343b00>] sysfs_write_file+0x240/0x340
>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc1298
>  [<a00000010025e230>] vfs_write+0x1b0/0x3a0
>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc1250
>  [<a00000010025e5e0>] sys_write+0x80/0x100
>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc11d0
>  [<a00000010000bf20>] ia64_ret_from_syscall+0x0/0x20
>                                 sp=e000000f3dfcfe30 bsp=e000000f3dfc11d0
>  [<a000000000040720>] __kernel_syscall_via_break+0x0/0x20
>                                 sp=e000000f3dfd0000 bsp=e000000f3dfc11d0
> Disabling lock debugging due to kernel taint
> 
> Signed-off-by: Yijing Wang <wangyijing@huawei.com>
> Signed-off-by: Jiang Liu <liuj97@gmail.com>
> ---
>  drivers/pci/pcie/aer/aer_inject.c |   21 +++++++++++++++++++++
>  1 files changed, 21 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
> index 5222986..fc28785 100644
> --- a/drivers/pci/pcie/aer/aer_inject.c
> +++ b/drivers/pci/pcie/aer/aer_inject.c
> @@ -109,6 +109,19 @@ static struct aer_error *__find_aer_error_by_dev(struct pci_dev *dev)
>  	return __find_aer_error((u16)domain, dev->bus->number, dev->devfn);
>  }
> 
> +static bool pci_is_upstream_bus(struct pci_bus *bus, struct pci_bus *up_bus)
> +{
> +	struct pci_bus *pbus = bus->parent;
> +
> +	while (pbus) {
> +		if (pbus == up_bus)
> +			return true;
> +		pbus = pbus->parent;
> +	}
> +
> +	return false;
> +}
> +
>  /* inject_lock must be held before calling */
>  static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>  {
> @@ -118,6 +131,13 @@ static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>  		if (bus_ops->bus == bus)
>  			return bus_ops->ops;
>  	}
> +
> +	/* here can't find bus_ops, fall back to get bus_ops of upstream bus */
> +	list_for_each_entry(bus_ops, &pci_bus_ops_list, list) {
> +		if (pci_is_upstream_bus(bus, bus_ops->bus))
> +			return bus_ops->ops;
> +	}
> +
>  	return NULL;
>  }
> 
> @@ -506,6 +526,7 @@ static struct miscdevice aer_inject_device = {
>  	.fops = &aer_inject_fops,
>  };
> 
> +
>  static int __init aer_inject_init(void)
>  {
>  	return misc_register(&aer_inject_device);

After

# rmmod aer_inject

What will happen?

Best Regards,
Huang Ying


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chen Gong Aug. 27, 2012, 8:49 a.m. UTC | #2
On Sat, Aug 25, 2012 at 05:59:44PM +0800, Yijing Wang wrote:
> Date:	Sat, 25 Aug 2012 17:59:44 +0800
> From: Yijing Wang <wangyijing@huawei.com>
> To: Bjorn Helgaas <bhelgaas@google.com>, Rusty Russell
>  <rusty@rustcorp.com.au>, Mauro Carvalho Chehab <mchehab@redhat.com>
> CC: PCI <linux-pci@vger.kernel.org>, Jiang Liu <liuj97@gmail.com>, Huang
>  Ying <ying.huang@intel.com>, Hanjun Guo <guohanjun@huawei.com>,
>  linux-kernel@vger.kernel.org
> Subject: [RESEND BUGFIX PATCH 1/3] PCI/AER: fix pci_ops return NULL when
>  hotplug a pci bus which was doing aer error inject
> User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20120713
>  Thunderbird/14.0
> 
> When we inject aer errors to the target pci device by aer_inject module, the pci_ops of pci
> bus which the target device is on will be assign to pci_ops_aer.So if the target pci device
> is a bridge, once we hotplug the pci bus(child bus) which the target device bridges to, child
> bus's pci_ops will be assigned to pci_ops_aer too.Now every access to the child bus's device
> will result to system panic, because it return NULL pci_ops in pci_read_aer.
> This patch fix this.
> 
> CallTrace:
> bash[5908]: NaT consumption 17179869216 [1]
> Modules linked in: aer_inject cpufreq_conservative cpufreq_userspace cpufreq_pow
> ersave acpi_cpufreq binfmt_misc fuse nls_iso8859_1 loop ipmi_si(+) ipmi_devintf
> ipmi_msghandler dm_mod ppdev iTCO_wdt iTCO_vendor_support sg igb parport_pc i2c_
> i801 mptctl i2c_core serio_raw hid_generic lpc_ich mfd_core parport button conta
> iner usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif ext3 mbca
> che jbd fan processor ide_pci_generic ide_core ata_piix libata mptsas mptscsih m
> ptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
> 
[...]
> 
> Signed-off-by: Yijing Wang <wangyijing@huawei.com>
> Signed-off-by: Jiang Liu <liuj97@gmail.com>
> ---
>  drivers/pci/pcie/aer/aer_inject.c |   21 +++++++++++++++++++++
>  1 files changed, 21 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
> index 5222986..fc28785 100644
> --- a/drivers/pci/pcie/aer/aer_inject.c
> +++ b/drivers/pci/pcie/aer/aer_inject.c
> @@ -109,6 +109,19 @@ static struct aer_error *__find_aer_error_by_dev(struct pci_dev *dev)
>  	return __find_aer_error((u16)domain, dev->bus->number, dev->devfn);
>  }
> 
> +static bool pci_is_upstream_bus(struct pci_bus *bus, struct pci_bus *up_bus)
> +{
> +	struct pci_bus *pbus = bus->parent;
> +
> +	while (pbus) {
> +		if (pbus == up_bus)
> +			return true;
> +		pbus = pbus->parent;
> +	}
> +
> +	return false;
> +}
> +
>  /* inject_lock must be held before calling */
>  static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>  {
> @@ -118,6 +131,13 @@ static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>  		if (bus_ops->bus == bus)
>  			return bus_ops->ops;
>  	}
> +
> +	/* here can't find bus_ops, fall back to get bus_ops of upstream bus */
> +	list_for_each_entry(bus_ops, &pci_bus_ops_list, list) {
> +		if (pci_is_upstream_bus(bus, bus_ops->bus))
> +			return bus_ops->ops;
> +	}
> +
>  	return NULL;
>  }
> 
At least, when returning NULL, a proper check and protection are needed.
Jiang Liu Aug. 27, 2012, 3:05 p.m. UTC | #3
Is it ok to ignore such a case? After all, aer_inject is just a test tool:)
It's not worth changing the core logic for such a corner case.
--Gerry

On 08/27/2012 09:23 AM, Huang Ying wrote:
> On Sat, 2012-08-25 at 17:59 +0800, Yijing Wang wrote:
>> When we inject aer errors to the target pci device by aer_inject module, the pci_ops of pci
>> bus which the target device is on will be assign to pci_ops_aer.So if the target pci device
>> is a bridge, once we hotplug the pci bus(child bus) which the target device bridges to, child
>> bus's pci_ops will be assigned to pci_ops_aer too.Now every access to the child bus's device
>> will result to system panic, because it return NULL pci_ops in pci_read_aer.
>> This patch fix this.
>>
>> CallTrace:
>> bash[5908]: NaT consumption 17179869216 [1]
>> Modules linked in: aer_inject cpufreq_conservative cpufreq_userspace cpufreq_pow
>> ersave acpi_cpufreq binfmt_misc fuse nls_iso8859_1 loop ipmi_si(+) ipmi_devintf
>> ipmi_msghandler dm_mod ppdev iTCO_wdt iTCO_vendor_support sg igb parport_pc i2c_
>> i801 mptctl i2c_core serio_raw hid_generic lpc_ich mfd_core parport button conta
>> iner usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif ext3 mbca
>> che jbd fan processor ide_pci_generic ide_core ata_piix libata mptsas mptscsih m
>> ptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
>>
>> Pid: 5908, CPU 9, comm:                 bash
>> psr : 00001010085a2010 ifs : 800000000000048e ip  : [<a000000220b815b0>]    Not
>> tainted (3.5.0-rc6yijing-repo)
>> ip is at pci_read_aer+0x330/0x460 [aer_inject]
>> unat: 0000000000000000 pfs : 000000000000048e rsc : 0000000000000003
>> rnat: 0000000000000000 bsps: 0000000000000000 pr  : 65519aa6a6969aa5
>> ldrs: 0000000000000000 ccv : ffffffff00000001 fpsr: 0009804c8a70033f
>> csd : 0000000000000000 ssd : 0000000000000000
>> b0  : a000000220b815b0 b6  : a000000220b81280 b7  : a0000001006d56a0
>> f6  : 1003e0000000000000005 f7  : 1003e0000000000000028
>> f8  : 1003e00000000000000c8 f9  : 1003e0000000000000005
>> f10 : 1003e627ec1e2f4c0d8a7 f11 : 1003e0000000000000011
>> r1  : a0000001014e63c0 r2  : 0000000000000738 r3  : 000000000000fffe
>> r8  : 0000000000000736 r9  : 0000000000000042 r10 : e000001f08f4c898
>> r11 : 0000000000000000 r12 : e000000f3dfcfdc0 r13 : e000000f3dfc0000
>> r14 : 0000000000000738 r15 : 0000000000004000 r16 : a000000220b827c8
>> r17 : a000000220b827b8 r18 : ffffffffffffff00 r19 : e000000f073b0110
>> r20 : 0000000000000042 r21 : e000000f073b0114 r22 : 0000000000000000
>> r23 : e000000f073b0118 r24 : a0000001009e0e49 r25 : 0000000000000001
>> r26 : 0000000000007041 r27 : e000000f3dfcfde0 r28 : 0000000000000000
>> r29 : e000000f3dfcfc08 r30 : a000000220b827c8 r31 : e000001f074d6000
>>
>> Call Trace:
>>  [<a000000100016500>] show_stack+0x80/0xa0
>>                                 sp=e000000f3dfcf800 bsp=e000000f3dfc1758
>>  [<a000000100016b60>] show_regs+0x640/0x920
>>                                 sp=e000000f3dfcf9d0 bsp=e000000f3dfc1700
>>  [<a000000100040770>] die+0x190/0x2c0
>>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc16c0
>>  [<a0000001000408f0>] die_if_kernel+0x50/0x80
>>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc1690
>>  [<a000000100903a90>] ia64_fault+0xf0/0x15e0
>>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc1640
>>  [<a00000010000c0a0>] ia64_native_leave_kernel+0x0/0x270
>>                                 sp=e000000f3dfcfbf0 bsp=e000000f3dfc1640
>>  [<a000000220b815b0>] pci_read_aer+0x330/0x460 [aer_inject]
>>                                 sp=e000000f3dfcfdc0 bsp=e000000f3dfc15c8
>>  [<a0000001004ace00>] pci_bus_read_config_dword+0xe0/0x140
>>                                 sp=e000000f3dfcfdc0 bsp=e000000f3dfc1580
>>  [<a0000001004b0c10>] pci_bus_read_dev_vendor_id+0x50/0x200
>>                                 sp=e000000f3dfcfdd0 bsp=e000000f3dfc1530
>>  [<a0000001008d3d10>] pci_scan_single_device+0x90/0x200
>>                                 sp=e000000f3dfcfdd0 bsp=e000000f3dfc14f8
>>  [<a0000001004b24b0>] pci_scan_slot+0xb0/0x320
>>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc14a8
>>  [<a0000001008d9e90>] pci_scan_child_bus+0x90/0x2e0
>>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc1468
>>  [<a0000001008d9580>] pci_scan_bridge+0x540/0xdc0
>>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc13d0
>>  [<a0000001008da0b0>] pci_scan_child_bus+0x2b0/0x2e0
>>                                 sp=e000000f3dfcfe00 bsp=e000000f3dfc1390
>>  [<a0000001008d5bd0>] pci_rescan_bus+0x50/0x220
>>                                 sp=e000000f3dfcfe00 bsp=e000000f3dfc1358
>>  [<a0000001004c2ab0>] bus_rescan_store+0xf0/0x160
>>                                 sp=e000000f3dfcfe10 bsp=e000000f3dfc1328
>>  [<a0000001006110b0>] bus_attr_store+0x70/0xa0
>>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc12f0
>>  [<a000000100343b00>] sysfs_write_file+0x240/0x340
>>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc1298
>>  [<a00000010025e230>] vfs_write+0x1b0/0x3a0
>>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc1250
>>  [<a00000010025e5e0>] sys_write+0x80/0x100
>>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc11d0
>>  [<a00000010000bf20>] ia64_ret_from_syscall+0x0/0x20
>>                                 sp=e000000f3dfcfe30 bsp=e000000f3dfc11d0
>>  [<a000000000040720>] __kernel_syscall_via_break+0x0/0x20
>>                                 sp=e000000f3dfd0000 bsp=e000000f3dfc11d0
>> Disabling lock debugging due to kernel taint
>>
>> Signed-off-by: Yijing Wang <wangyijing@huawei.com>
>> Signed-off-by: Jiang Liu <liuj97@gmail.com>
>> ---
>>  drivers/pci/pcie/aer/aer_inject.c |   21 +++++++++++++++++++++
>>  1 files changed, 21 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
>> index 5222986..fc28785 100644
>> --- a/drivers/pci/pcie/aer/aer_inject.c
>> +++ b/drivers/pci/pcie/aer/aer_inject.c
>> @@ -109,6 +109,19 @@ static struct aer_error *__find_aer_error_by_dev(struct pci_dev *dev)
>>  	return __find_aer_error((u16)domain, dev->bus->number, dev->devfn);
>>  }
>>
>> +static bool pci_is_upstream_bus(struct pci_bus *bus, struct pci_bus *up_bus)
>> +{
>> +	struct pci_bus *pbus = bus->parent;
>> +
>> +	while (pbus) {
>> +		if (pbus == up_bus)
>> +			return true;
>> +		pbus = pbus->parent;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>>  /* inject_lock must be held before calling */
>>  static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>>  {
>> @@ -118,6 +131,13 @@ static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>>  		if (bus_ops->bus == bus)
>>  			return bus_ops->ops;
>>  	}
>> +
>> +	/* here can't find bus_ops, fall back to get bus_ops of upstream bus */
>> +	list_for_each_entry(bus_ops, &pci_bus_ops_list, list) {
>> +		if (pci_is_upstream_bus(bus, bus_ops->bus))
>> +			return bus_ops->ops;
>> +	}
>> +
>>  	return NULL;
>>  }
>>
>> @@ -506,6 +526,7 @@ static struct miscdevice aer_inject_device = {
>>  	.fops = &aer_inject_fops,
>>  };
>>
>> +
>>  static int __init aer_inject_init(void)
>>  {
>>  	return misc_register(&aer_inject_device);
> 
> After
> 
> # rmmod aer_inject
> 
> What will happen?
> 
> Best Regards,
> Huang Ying
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Huang, Ying Aug. 28, 2012, 12:38 a.m. UTC | #4
On Mon, 2012-08-27 at 23:05 +0800, Jiang Liu wrote:
> Is it ok to ignore such a case? After all, aer_inject is just a test tool:)
> It's not worth to change the core logic for such a corner case.
> --Gerry

Why ignore it?  At least you can prevent aer_inject from being unloaded if
something special has happened.

Best Regards,
Huang Ying

> On 08/27/2012 09:23 AM, Huang Ying wrote:
> > On Sat, 2012-08-25 at 17:59 +0800, Yijing Wang wrote:
> >> When we inject aer errors to the target pci device by aer_inject module, the pci_ops of pci
> >> bus which the target device is on will be assign to pci_ops_aer.So if the target pci device
> >> is a bridge, once we hotplug the pci bus(child bus) which the target device bridges to, child
> >> bus's pci_ops will be assigned to pci_ops_aer too.Now every access to the child bus's device
> >> will result to system panic, because it return NULL pci_ops in pci_read_aer.
> >> This patch fix this.
> >>
> >> CallTrace:
> >> bash[5908]: NaT consumption 17179869216 [1]
> >> Modules linked in: aer_inject cpufreq_conservative cpufreq_userspace cpufreq_pow
> >> ersave acpi_cpufreq binfmt_misc fuse nls_iso8859_1 loop ipmi_si(+) ipmi_devintf
> >> ipmi_msghandler dm_mod ppdev iTCO_wdt iTCO_vendor_support sg igb parport_pc i2c_
> >> i801 mptctl i2c_core serio_raw hid_generic lpc_ich mfd_core parport button conta
> >> iner usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif ext3 mbca
> >> che jbd fan processor ide_pci_generic ide_core ata_piix libata mptsas mptscsih m
> >> ptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
> >>
> >> Pid: 5908, CPU 9, comm:                 bash
> >> psr : 00001010085a2010 ifs : 800000000000048e ip  : [<a000000220b815b0>]    Not
> >> tainted (3.5.0-rc6yijing-repo)
> >> ip is at pci_read_aer+0x330/0x460 [aer_inject]
> >> unat: 0000000000000000 pfs : 000000000000048e rsc : 0000000000000003
> >> rnat: 0000000000000000 bsps: 0000000000000000 pr  : 65519aa6a6969aa5
> >> ldrs: 0000000000000000 ccv : ffffffff00000001 fpsr: 0009804c8a70033f
> >> csd : 0000000000000000 ssd : 0000000000000000
> >> b0  : a000000220b815b0 b6  : a000000220b81280 b7  : a0000001006d56a0
> >> f6  : 1003e0000000000000005 f7  : 1003e0000000000000028
> >> f8  : 1003e00000000000000c8 f9  : 1003e0000000000000005
> >> f10 : 1003e627ec1e2f4c0d8a7 f11 : 1003e0000000000000011
> >> r1  : a0000001014e63c0 r2  : 0000000000000738 r3  : 000000000000fffe
> >> r8  : 0000000000000736 r9  : 0000000000000042 r10 : e000001f08f4c898
> >> r11 : 0000000000000000 r12 : e000000f3dfcfdc0 r13 : e000000f3dfc0000
> >> r14 : 0000000000000738 r15 : 0000000000004000 r16 : a000000220b827c8
> >> r17 : a000000220b827b8 r18 : ffffffffffffff00 r19 : e000000f073b0110
> >> r20 : 0000000000000042 r21 : e000000f073b0114 r22 : 0000000000000000
> >> r23 : e000000f073b0118 r24 : a0000001009e0e49 r25 : 0000000000000001
> >> r26 : 0000000000007041 r27 : e000000f3dfcfde0 r28 : 0000000000000000
> >> r29 : e000000f3dfcfc08 r30 : a000000220b827c8 r31 : e000001f074d6000
> >>
> >> Call Trace:
> >>  [<a000000100016500>] show_stack+0x80/0xa0
> >>                                 sp=e000000f3dfcf800 bsp=e000000f3dfc1758
> >>  [<a000000100016b60>] show_regs+0x640/0x920
> >>                                 sp=e000000f3dfcf9d0 bsp=e000000f3dfc1700
> >>  [<a000000100040770>] die+0x190/0x2c0
> >>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc16c0
> >>  [<a0000001000408f0>] die_if_kernel+0x50/0x80
> >>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc1690
> >>  [<a000000100903a90>] ia64_fault+0xf0/0x15e0
> >>                                 sp=e000000f3dfcf9e0 bsp=e000000f3dfc1640
> >>  [<a00000010000c0a0>] ia64_native_leave_kernel+0x0/0x270
> >>                                 sp=e000000f3dfcfbf0 bsp=e000000f3dfc1640
> >>  [<a000000220b815b0>] pci_read_aer+0x330/0x460 [aer_inject]
> >>                                 sp=e000000f3dfcfdc0 bsp=e000000f3dfc15c8
> >>  [<a0000001004ace00>] pci_bus_read_config_dword+0xe0/0x140
> >>                                 sp=e000000f3dfcfdc0 bsp=e000000f3dfc1580
> >>  [<a0000001004b0c10>] pci_bus_read_dev_vendor_id+0x50/0x200
> >>                                 sp=e000000f3dfcfdd0 bsp=e000000f3dfc1530
> >>  [<a0000001008d3d10>] pci_scan_single_device+0x90/0x200
> >>                                 sp=e000000f3dfcfdd0 bsp=e000000f3dfc14f8
> >>  [<a0000001004b24b0>] pci_scan_slot+0xb0/0x320
> >>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc14a8
> >>  [<a0000001008d9e90>] pci_scan_child_bus+0x90/0x2e0
> >>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc1468
> >>  [<a0000001008d9580>] pci_scan_bridge+0x540/0xdc0
> >>                                 sp=e000000f3dfcfde0 bsp=e000000f3dfc13d0
> >>  [<a0000001008da0b0>] pci_scan_child_bus+0x2b0/0x2e0
> >>                                 sp=e000000f3dfcfe00 bsp=e000000f3dfc1390
> >>  [<a0000001008d5bd0>] pci_rescan_bus+0x50/0x220
> >>                                 sp=e000000f3dfcfe00 bsp=e000000f3dfc1358
> >>  [<a0000001004c2ab0>] bus_rescan_store+0xf0/0x160
> >>                                 sp=e000000f3dfcfe10 bsp=e000000f3dfc1328
> >>  [<a0000001006110b0>] bus_attr_store+0x70/0xa0
> >>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc12f0
> >>  [<a000000100343b00>] sysfs_write_file+0x240/0x340
> >>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc1298
> >>  [<a00000010025e230>] vfs_write+0x1b0/0x3a0
> >>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc1250
> >>  [<a00000010025e5e0>] sys_write+0x80/0x100
> >>                                 sp=e000000f3dfcfe20 bsp=e000000f3dfc11d0
> >>  [<a00000010000bf20>] ia64_ret_from_syscall+0x0/0x20
> >>                                 sp=e000000f3dfcfe30 bsp=e000000f3dfc11d0
> >>  [<a000000000040720>] __kernel_syscall_via_break+0x0/0x20
> >>                                 sp=e000000f3dfd0000 bsp=e000000f3dfc11d0
> >> Disabling lock debugging due to kernel taint
> >>
> >> Signed-off-by: Yijing Wang <wangyijing@huawei.com>
> >> Signed-off-by: Jiang Liu <liuj97@gmail.com>
> >> ---
> >>  drivers/pci/pcie/aer/aer_inject.c |   21 +++++++++++++++++++++
> >>  1 files changed, 21 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
> >> index 5222986..fc28785 100644
> >> --- a/drivers/pci/pcie/aer/aer_inject.c
> >> +++ b/drivers/pci/pcie/aer/aer_inject.c
> >> @@ -109,6 +109,19 @@ static struct aer_error *__find_aer_error_by_dev(struct pci_dev *dev)
> >>  	return __find_aer_error((u16)domain, dev->bus->number, dev->devfn);
> >>  }
> >>
> >> +static bool pci_is_upstream_bus(struct pci_bus *bus, struct pci_bus *up_bus)
> >> +{
> >> +	struct pci_bus *pbus = bus->parent;
> >> +
> >> +	while (pbus) {
> >> +		if (pbus == up_bus)
> >> +			return true;
> >> +		pbus = pbus->parent;
> >> +	}
> >> +
> >> +	return false;
> >> +}
> >> +
> >>  /* inject_lock must be held before calling */
> >>  static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
> >>  {
> >> @@ -118,6 +131,13 @@ static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
> >>  		if (bus_ops->bus == bus)
> >>  			return bus_ops->ops;
> >>  	}
> >> +
> >> +	/* here can't find bus_ops, fall back to get bus_ops of upstream bus */
> >> +	list_for_each_entry(bus_ops, &pci_bus_ops_list, list) {
> >> +		if (pci_is_upstream_bus(bus, bus_ops->bus))
> >> +			return bus_ops->ops;
> >> +	}
> >> +
> >>  	return NULL;
> >>  }
> >>
> >> @@ -506,6 +526,7 @@ static struct miscdevice aer_inject_device = {
> >>  	.fops = &aer_inject_fops,
> >>  };
> >>
> >> +
> >>  static int __init aer_inject_init(void)
> >>  {
> >>  	return misc_register(&aer_inject_device);
> > 
> > After
> > 
> > # rmmod aer_inject
> > 
> > What will happen?
> > 
> > Best Regards,
> > Huang Ying
> > 
> > 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yijing Wang Aug. 28, 2012, 12:47 a.m. UTC | #5
>> bash[5908]: NaT consumption 17179869216 [1]
>> Modules linked in: aer_inject cpufreq_conservative cpufreq_userspace cpufreq_pow
>> ersave acpi_cpufreq binfmt_misc fuse nls_iso8859_1 loop ipmi_si(+) ipmi_devintf
>> ipmi_msghandler dm_mod ppdev iTCO_wdt iTCO_vendor_support sg igb parport_pc i2c_
>> i801 mptctl i2c_core serio_raw hid_generic lpc_ich mfd_core parport button conta
>> iner usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif ext3 mbca
>> che jbd fan processor ide_pci_generic ide_core ata_piix libata mptsas mptscsih m
>> ptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
>>
> [...]
>>
>> Signed-off-by: Yijing Wang <wangyijing@huawei.com>
>> Signed-off-by: Jiang Liu <liuj97@gmail.com>
>> ---
>>  drivers/pci/pcie/aer/aer_inject.c |   21 +++++++++++++++++++++
>>  1 files changed, 21 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
>> index 5222986..fc28785 100644
>> --- a/drivers/pci/pcie/aer/aer_inject.c
>> +++ b/drivers/pci/pcie/aer/aer_inject.c
>> @@ -109,6 +109,19 @@ static struct aer_error *__find_aer_error_by_dev(struct pci_dev *dev)
>>  	return __find_aer_error((u16)domain, dev->bus->number, dev->devfn);
>>  }
>>
>> +static bool pci_is_upstream_bus(struct pci_bus *bus, struct pci_bus *up_bus)
>> +{
>> +	struct pci_bus *pbus = bus->parent;
>> +
>> +	while (pbus) {
>> +		if (pbus == up_bus)
>> +			return true;
>> +		pbus = pbus->parent;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>>  /* inject_lock must be held before calling */
>>  static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>>  {
>> @@ -118,6 +131,13 @@ static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
>>  		if (bus_ops->bus == bus)
>>  			return bus_ops->ops;
>>  	}
>> +
>> +	/* here can't find bus_ops, fall back to get bus_ops of upstream bus */
>> +	list_for_each_entry(bus_ops, &pci_bus_ops_list, list) {
>> +		if (pci_is_upstream_bus(bus, bus_ops->bus))
>> +			return bus_ops->ops;
>> +	}
>> +
>>  	return NULL;
>>  }
>>
> At least, when returning NULL, a proper check and protection is needed.
> 
Hi Chen Gong,
    Thanks for your comments. It is really dangerous to return NULL here, since the pci_read_aer/pci_write_aer functions
have no protection code to check for it. I think maybe we can treat this situation as a read/write access error and set *val = 0?
Another option is to panic the system, because this is a really unexpected situation.
Yijing Wang Aug. 28, 2012, 12:53 a.m. UTC | #6
On 2012/8/28 8:38, Huang Ying wrote:
> On Mon, 2012-08-27 at 23:05 +0800, Jiang Liu wrote:
>> Is it OK to ignore such a case? After all, aer_inject is just a test tool :)
>> It's not worth changing the core logic for such a corner case.
>> --Gerry
> 
> Why ignore?  At least you can prevent aer_inject from being unloaded if
> something special has happened.
> 

Hi Huang Ying,
   Thanks for your comments. That was my oversight. I will add some protection code for the "# rmmod aer_inject" case (there is a race condition window around bus_ops),
and I will correct it in the next version of the patch.

----------
Thanks!
Yijing


>> On 08/27/2012 09:23 AM, Huang Ying wrote:
>>> On Sat, 2012-08-25 at 17:59 +0800, Yijing Wang wrote:
>>>> When we inject AER errors into a target PCI device with the aer_inject module, the pci_ops of the
>>>> PCI bus that the target device is on are replaced with pci_ops_aer. So if the target PCI device
>>>> is a bridge, once we hotplug the PCI bus (child bus) that the target device bridges to, the child
>>>> bus's pci_ops are set to pci_ops_aer too. From then on, every access to a device on the child bus
>>>> results in a system panic, because the pci_ops lookup in pci_read_aer returns NULL.
>>>> This patch fixes this.



--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/pcie/aer/aer_inject.c b/drivers/pci/pcie/aer/aer_inject.c
index 5222986..fc28785 100644
--- a/drivers/pci/pcie/aer/aer_inject.c
+++ b/drivers/pci/pcie/aer/aer_inject.c
@@ -109,6 +109,19 @@  static struct aer_error *__find_aer_error_by_dev(struct pci_dev *dev)
 	return __find_aer_error((u16)domain, dev->bus->number, dev->devfn);
 }

+static bool pci_is_upstream_bus(struct pci_bus *bus, struct pci_bus *up_bus)
+{
+	struct pci_bus *pbus = bus->parent;
+
+	while (pbus) {
+		if (pbus == up_bus)
+			return true;
+		pbus = pbus->parent;
+	}
+
+	return false;
+}
+
 /* inject_lock must be held before calling */
 static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
 {
@@ -118,6 +131,13 @@  static struct pci_ops *__find_pci_bus_ops(struct pci_bus *bus)
 		if (bus_ops->bus == bus)
 			return bus_ops->ops;
 	}
+
+	/* here can't find bus_ops, fall back to get bus_ops of upstream bus */
+	list_for_each_entry(bus_ops, &pci_bus_ops_list, list) {
+		if (pci_is_upstream_bus(bus, bus_ops->bus))
+			return bus_ops->ops;
+	}
+
 	return NULL;
 }

@@ -506,6 +526,7 @@  static struct miscdevice aer_inject_device = {
 	.fops = &aer_inject_fops,
 };

+
 static int __init aer_inject_init(void)
 {
 	return misc_register(&aer_inject_device);