Patchwork 3.4.x regression: rtl8169: frequent resets

login
register
mail settings
Submitter Stefan Lippers-Hollmann
Date June 28, 2012, 11:31 p.m.
Message ID <201206290131.49150.s.L-H@gmx.de>
Download mbox | patch
Permalink /patch/167965/
State RFC
Delegated to: David Miller
Headers show

Comments

Stefan Lippers-Hollmann - June 28, 2012, 11:31 p.m.
Hi

On Thursday 28 June 2012, Francois Romieu wrote:
> Nix <nix@esperi.org.uk> :
> > I recently upgraded from 3.3.x to 3.4.4, and am now experiencing
> > networking problems with my desktop box's r8169 card. The symptoms are
> > that all traffic ceases for five to ten seconds, then the card appears
> > to reset and everything is back to normal -- until it happens again. It
> > can happen quite a lot:
> 
> Can you try and revert 036dafa28da1e2565a8529de2ae663c37b7a0060 ?
> 
> I would welcome a complete dmesg including the XID line from the
> r8169 driver.

I received the same oops from a 3.4.4 user with these onboard network 
cards:

r8169 0000:04:00.0: eth0: RTL8168d/8111d at 0xffffc90000c72000, 00:24:1d:72:7c:75, XID 081000c0 IRQ 44
r8169 0000:05:00.0: eth1: RTL8168d/8111d at 0xffffc90000c70000, 00:24:1d:72:7c:77, XID 081000c0 IRQ 45

Reverting 036dafa28da1e2565a8529de2ae663c37b7a0060 (Nix, trivial
backport to 3.4.4 attached) did improve the situation: no oops in 21
hours of uptime so far (while it usually shows up within about an hour).
Unfortunately his oops report was cut short, so I've asked him to try
reproducing it with an unpatched kernel again, to collect a full dmesg
(the test is still going on, past the one hour mark, but the oops
hasn't triggered yet). I'll report back as soon as I get confirmation
and a full dmesg.

Regards
	Stefan Lippers-Hollmann
Stefan Lippers-Hollmann - June 29, 2012, 11:50 a.m.
Hi

On Friday 29 June 2012, Stefan Lippers-Hollmann wrote:
> On Thursday 28 June 2012, Francois Romieu wrote:
> > Nix <nix@esperi.org.uk> :
> > > I recently upgraded from 3.3.x to 3.4.4, and am now experiencing
> > > networking problems with my desktop box's r8169 card. The symptoms are
> > > that all traffic ceases for five to ten seconds, then the card appears
> > > to reset and everything is back to normal -- until it happens again. It
> > > can happen quite a lot:
> > 
> > Can you try and revert 036dafa28da1e2565a8529de2ae663c37b7a0060 ?
> > 
> > I would welcome a complete dmesg including the XID line from the
> > r8169 driver.

Full gzipped messages/kern.log attached (unfortunately he rebooted too
quickly for a regular dmesg).

[    0.573645] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
[    0.573930] r8169 0000:04:00.0: eth0: RTL8168d/8111d at 0xffffc90000c72000, 00:24:1d:72:7c:75, XID 081000c0 IRQ 44
[    0.573933] r8169 0000:04:00.0: eth0: jumbo features [frames: 9200 bytes, tx checksumming: ko]
[    0.573953] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
[    0.574093] ehci_hcd 0000:00:1a.7: irq 18, io mem 0xfbffe000
[    0.574213] r8169 0000:05:00.0: eth1: RTL8168d/8111d at 0xffffc90000c6e000, 00:24:1d:72:7c:77, XID 081000c0 IRQ 45
[    0.574217] r8169 0000:05:00.0: eth1: jumbo features [frames: 9200 bytes, tx checksumming: ko]
[…]
[   20.872579] r8169 0000:04:00.0: eth0: link down
[   20.872594] r8169 0000:04:00.0: eth0: link down
[   20.873162] ADDRCONF(NETDEV_UP): eth0: link is not ready
[   20.945479] NET: Registered protocol family 17
[   22.516769] r8169 0000:04:00.0: eth0: link up
[   22.517670] ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[   25.996741] ip_tables: (C) 2000-2006 Netfilter Core Team
[   26.091554] nf_conntrack version 0.5.0 (16384 buckets, 65536 max)
[…]
[14454.544994] ------------[ cut here ]------------
[14454.545004] WARNING: at /tmp/buildd/linux-aptosid-3.4/debian/build/source_amd64_none/net/sched/sch_generic.c:256 dev_watchdog+0xe9/0x15c()
[14454.545008] Hardware name: EX58-UD5
[14454.545010] NETDEV WATCHDOG: eth0 (r8169): transmit queue 0 timed out
[14454.545013] Modules linked in: rfcomm bnep cpufreq_powersave cpufreq_stats cpufreq_conservative binfmt_misc xt_tcpudp nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack iptable_filter ip_tables x_tables af_packet hfsplus fuse nls_utf8 nls_cp437 vfat fat jfs it87 dm_crypt dm_mod kvm_intel kvm adt7475 hwmon_vid nouveau snd_hda_codec_realtek coretemp video ttm drm_kms_helper drm snd_hda_intel power_supply snd_hda_codec snd_hwdep snd_pcm snd_page_alloc i2c_i801 i2c_algo_bit iTCO_wdt i7core_edac snd_seq iTCO_vendor_support microcode snd_seq_device edac_core i2c_core mxm_wmi btusb snd_timer snd bluetooth evdev pcspkr rfkill acpi_cpufreq soundcore mperf button processor wmi ext4 crc16 jbd2 mbcache sr_mod cdrom ata_generic usbhid pata_acpi hid sd_mod crc_t10dif crc32c_intel pata_jmicron uhci_hcd ahci libahci libata scsi_mod r8169 mii ehci_hcd usbcore usb_common [last unloaded: scsi_wait_scan]
[14454.545100] Pid: 4245, comm: iceape-bin Not tainted 3.4-4.slh.1-aptosid-amd64 #1
[14454.545103] Call Trace:
[14454.545105]  <IRQ>  [<ffffffff810332f6>] ? warn_slowpath_common+0x76/0x8a
[14454.545116]  [<ffffffff810333a2>] ? warn_slowpath_fmt+0x45/0x4a
[14454.545121]  [<ffffffff8127546a>] ? netif_tx_lock+0x67/0x7a
[14454.545127]  [<ffffffff812755b3>] ? dev_watchdog+0xe9/0x15c
[14454.545133]  [<ffffffff81020f2d>] ? __default_send_IPI_dest_field.constprop.0+0x38/0x4d
[14454.545139]  [<ffffffff8103c332>] ? run_timer_softirq+0x153/0x1e3
[14454.545145]  [<ffffffff8100f389>] ? paravirt_read_tsc+0x5/0x8
[14454.545150]  [<ffffffff81037f6b>] ? __do_softirq+0x92/0x126
[14454.545154]  [<ffffffff810202e2>] ? lapic_next_event+0xd/0x11
[14454.545160]  [<ffffffff813231dc>] ? call_softirq+0x1c/0x30
[14454.545164]  [<ffffffff8100ae23>] ? do_softirq+0x3a/0x77
[14454.545168]  [<ffffffff8103824b>] ? irq_exit+0x49/0xb1
[14454.545172]  [<ffffffff81020672>] ? smp_apic_timer_interrupt+0x74/0x82
[14454.545176]  [<ffffffff8132288a>] ? apic_timer_interrupt+0x6a/0x70
[14454.545179]  <EOI>  [<ffffffff81321df9>] ? system_call_fastpath+0x16/0x1b
[14454.545185] ---[ end trace a37b096a01814f14 ]---
[14454.549925] r8169 0000:04:00.0: eth0: link up
[14472.536356] r8169 0000:04:00.0: eth0: link up

Regards
	Stefan Lippers-Hollmann

Patch

revert r8169: add byte queue limit support.

--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5000,7 +5000,6 @@  static void rtl8169_tx_clear(struct rtl8
 {
 	rtl8169_tx_clear_range(tp, tp->dirty_tx, NUM_TX_DESC);
 	tp->cur_tx = tp->dirty_tx = 0;
-	netdev_reset_queue(tp->dev);
 }
 
 static void rtl_reset_work(struct rtl8169_private *tp)
@@ -5155,8 +5154,6 @@  static netdev_tx_t rtl8169_start_xmit(st
 
 	txd->opts2 = cpu_to_le32(opts[1]);
 
-	netdev_sent_queue(dev, skb->len);
-
 	skb_tx_timestamp(skb);
 
 	wmb();
@@ -5253,16 +5250,9 @@  static void rtl8169_pcierr_interrupt(str
 	rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
 }
 
-struct rtl_txc {
-	int packets;
-	int bytes;
-};
-
 static void rtl_tx(struct net_device *dev, struct rtl8169_private *tp)
 {
-	struct rtl8169_stats *tx_stats = &tp->tx_stats;
 	unsigned int dirty_tx, tx_left;
-	struct rtl_txc txc = { 0, 0 };
 
 	dirty_tx = tp->dirty_tx;
 	smp_rmb();
@@ -5281,24 +5271,17 @@  static void rtl_tx(struct net_device *de
 		rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
 				     tp->TxDescArray + entry);
 		if (status & LastFrag) {
-			struct sk_buff *skb = tx_skb->skb;
-
-			txc.packets++;
-			txc.bytes += skb->len;
-			dev_kfree_skb(skb);
+			u64_stats_update_begin(&tp->tx_stats.syncp);
+			tp->tx_stats.packets++;
+			tp->tx_stats.bytes += tx_skb->skb->len;
+			u64_stats_update_end(&tp->tx_stats.syncp);
+			dev_kfree_skb(tx_skb->skb);
 			tx_skb->skb = NULL;
 		}
 		dirty_tx++;
 		tx_left--;
 	}
 
-	u64_stats_update_begin(&tx_stats->syncp);
-	tx_stats->packets += txc.packets;
-	tx_stats->bytes += txc.bytes;
-	u64_stats_update_end(&tx_stats->syncp);
-
-	netdev_completed_queue(dev, txc.packets, txc.bytes);
-
 	if (tp->dirty_tx != dirty_tx) {
 		tp->dirty_tx = dirty_tx;
 		/* Sync with rtl8169_start_xmit: