diff mbox

sky2 panic in 2.6.32.1 under load (new oops)

Message ID 4B3AFFE8.3060409@majjas.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Michael Breuer Dec. 30, 2009, 7:23 a.m. UTC
Ok - I called dump_txring from sky2_net_intr:
                 sky2_hw_intr(hw);

And got this:
Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31

On 12/29/2009 02:49 PM, Stephen Hemminger wrote:
> This patch will dump more info in case of hardware error.
>
> --- a/drivers/net/sky2.c	2009-12-29 11:30:06.441137685 -0800
> +++ b/drivers/net/sky2.c	2009-12-29 11:46:37.389262922 -0800
> @@ -2629,12 +2625,71 @@ static void sky2_hw_error(struct sky2_hw
>   	}
>   }
>
> +static void dump_txring(struct sky2_hw *hw, unsigned port)
> +{
> +	struct sky2_port *sky2 = netdev_priv(hw->dev[port]);
> +	int sop = 1;
> +	unsigned idx;
> +
> +	printk(KERN_INFO PFX "Tx ring pending=%u...%u report=%d done=%d\n",
> +		   sky2->tx_cons, sky2->tx_prod,
> +		   sky2_read16(hw, port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX),
> +		   sky2_read16(hw, Q_ADDR(txqaddr[port], Q_DONE)));
> +
> +	/* Dump contents of tx ring */
> +	for (idx = sky2->tx_next;
> +	     idx != sky2->tx_prod&&  idx<  sky2->tx_ring_size;
> +	     idx = RING_NEXT(idx, sky2->tx_ring_size)) {
> +		const struct sky2_tx_le *le = sky2->tx_le + idx;
> +		u32 a = le32_to_cpu(le->addr);
> +
> +		if (sop)
> +			printk(KERN_DEBUG "%u:", idx);
> +		sop = 0;
> +
> +		switch(le->opcode&  ~HW_OWNER) {
> +		case OP_ADDR64:
> +			printk(" %#x:", a);
> +			break;
> +		case OP_LRGLEN:
> +			printk(" mtu=%d", a);
> +			break;
> +		case OP_VLAN:
> +			printk(" vlan=%d", be16_to_cpu(le->length));
> +			break;
> +		case OP_TCPLISW:
> +			printk(" csum=%#x", a);
> +			break;
> +		case OP_LARGESEND:
> +			printk(" tso=%#x(%d)", a, le16_to_cpu(le->length));
> +			break;
> +		case OP_PACKET:
> +			printk(" %#x(%d)", a, le16_to_cpu(le->length));
> +			break;
> +		case OP_BUFFER:
> +			printk(" frag=%#x(%d)", a, le16_to_cpu(le->length));
> +			break;
> +		default:
> +			printk(" op=%#x,%#x(%d)", le->opcode, a,
> +			       le16_to_cpu(le->length));
> +		}
> +
> +		if (le->ctrl&  EOP) {
> +			printk("\n");
> +			sop = 1;
> +		}
> +	}
> +}
> +
>   static void sky2_hw_intr(struct sky2_hw *hw)
>   {
>   	struct pci_dev *pdev = hw->pdev;
>   	u32 status = sky2_read32(hw, B0_HWE_ISRC);
>   	u32 hwmsk = sky2_read32(hw, B0_HWE_IMSK);
>
> +	printk(KERN_DEBUG PFX "hardware interrupt status %#x\n", status);
> +	dump_txring(hw, 0);
> +
>   	status&= hwmsk;
>
>   	if (status&  Y2_IS_TIST_OV)
>
>    

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

stephen hemminger Dec. 30, 2009, 7:58 a.m. UTC | #1
On Wed, 30 Dec 2009 02:23:20 -0500
Michael Breuer <mbreuer@majjas.com> wrote:

> Ok - I called dump_txring from sky2_net_intr:
> --- a/drivers/net/sky2.c
> +++ b/drivers/net/sky2.c
> @@ -2725,8 +2791,10 @@ static void sky2_watchdog(unsigned long arg)
>   /* Hardware/software error handling */
>   static void sky2_err_intr(struct sky2_hw *hw, u32 status)
>   {
> -       if (net_ratelimit())
> +       if (net_ratelimit()) {
>                  dev_warn(&hw->pdev->dev, "error interrupt 
> status=%#x\n", status);
> +               dump_txring(hw, 0);
> +       }
> 
>          if (status & Y2_IS_HW_ERR)
>                  sky2_hw_intr(hw);
> 
> And got this:
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
> 

I notice that you have NOUVEAU Nvidia drivers loaded? The one difference in HW
between your board and mine is that I have ATI video card.
stephen hemminger Dec. 30, 2009, 7:59 a.m. UTC | #2
On Wed, 30 Dec 2009 02:23:20 -0500
Michael Breuer <mbreuer@majjas.com> wrote:

> And got this:
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31

The interesting bits are only output at DEBUG level.  So either change kernel
log level or syslog.
Michael Breuer Dec. 30, 2009, 3:40 p.m. UTC | #3
Ok - reran:

Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 report=35 done=35
Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 report=89 done=91
Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 report=39 done=41
Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 report=119 
done=119
Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)

continues on like this - but this is the start.
On 12/30/2009 02:59 AM, Stephen Hemminger wrote:
> On Wed, 30 Dec 2009 02:23:20 -0500
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>    
>> And got this:
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
>>      
> The interesting bits are only output at DEBUG level.  So either change kernel
> log level or syslog.
>
>    

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Dec. 30, 2009, 5:49 p.m. UTC | #4
On 12/30/2009 2:58 AM, Stephen Hemminger wrote:
> On Wed, 30 Dec 2009 02:23:20 -0500
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>    
>> Ok - I called dump_txring from sky2_net_intr:
>> --- a/drivers/net/sky2.c
>> +++ b/drivers/net/sky2.c
>> @@ -2725,8 +2791,10 @@ static void sky2_watchdog(unsigned long arg)
>>    /* Hardware/software error handling */
>>    static void sky2_err_intr(struct sky2_hw *hw, u32 status)
>>    {
>> -       if (net_ratelimit())
>> +       if (net_ratelimit()) {
>>                   dev_warn(&hw->pdev->dev, "error interrupt
>> status=%#x\n", status);
>> +               dump_txring(hw, 0);
>> +       }
>>
>>           if (status&  Y2_IS_HW_ERR)
>>                   sky2_hw_intr(hw);
>>
>> And got this:
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 done=29
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 done=31
>>
>>      
> I notice that you have NOUVEAU Nvidia drivers loaded? The one difference in HW
> between your board and mine is that I have ATI video card.
>
>    
Seems the problem is linked to auditd and X11 (but not nouveau).

Today, I ran a bunch of scenarios. I first determined that the problem 
only manifest in runlevel 5. Next, this occurred with or without KMS and 
with or without nouveau. This happened whether or not I was logged in 
(local or remote), and regardless of window manager (xdm, gdm, kdm). I 
then checked to see what else was different between runlevel 3 and 5 - 
only thing was auditd. I disabled auditd and reran - no errors.

Now for the odd stuff:

The errors only manifest if the high throughput data transfer is 
initiated when the system is in runlevel 5 and auditd was started by 
init when transitioning from runlevel 3 to 5. For example, the following 
scenarios do not cause the errors to manifest:

runlevel3; start auditd runlevel 5; start transfer
runlevel3; chkconfig auditd off; runlevel5; start auditd; start transfer
runlevel3; start transfer (note: errors do not occur if I transition to 
runlevel 5 after the high bandwidth transfer has started)
runlevel3; startx; start transfer

The only way I get the problem to manifest is transition to runlevel 5 
with chkconfig auditd on (level 5 only) and then initate the windows 
backup.

I'm guessing that there is some sort of race condition happening between 
X (xdm/gdm/kdm/greeter?) and auditd that is somehow corrupting 
something. I'd hazard a more or less obvious guess that whatever's being 
corrupted differs when there is already a high throughput transfer under 
way.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger Dec. 30, 2009, 6:10 p.m. UTC | #5
On Wed, 30 Dec 2009 10:40:56 -0500
Michael Breuer <mbreuer@majjas.com> wrote:

> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 report=35 done=35
> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)

You must be using 9K MTU.  Do you see the problem with smaller
MTU?

> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 report=89 done=91
> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 report=39 done=41
> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x40000008
> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 report=119 
> done=119
> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
Michael Breuer Dec. 30, 2009, 6:37 p.m. UTC | #6
On 12/30/2009 1:10 PM, Stephen Hemminger wrote:
> On Wed, 30 Dec 2009 10:40:56 -0500
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>    
>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 report=35 done=35
>> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
>> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
>> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
>> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
>> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
>>      
> You must be using 9K MTU.  Do you see the problem with smaller
> MTU?
>    
Yes - decided to try 9k in the past day or so. Can redo with smaller 
again & debug if it would be helpful.
>    
>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 report=89 done=91
>> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
>> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
>> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 report=39 done=41
>> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
>> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 report=119
>> done=119
>> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
>> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
>> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
>> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
>>      
>    

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Dec. 30, 2009, 7:15 p.m. UTC | #7
And now looking at audit.c it seems reasonable that there is a race 
condition when auditd is started at roughly the same time as X. I'm 
guessing that the kaudit thread is fired up; the tty connected; and at 
the same time X grabs the tty. Somewhere in there an skb gets hosed and 
is then reused by whatever comes along - in my case sky2 as that's where 
the subsequent demand is. If the demand happens first, the contaminated 
skb (dk in what way yet) is probably waiting to manifest as some other 
bug that's been frustrating people.
On 12/30/2009 12:49 PM, Michael Breuer wrote:
> On 12/30/2009 2:58 AM, Stephen Hemminger wrote:
>> On Wed, 30 Dec 2009 02:23:20 -0500
>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>
>>> Ok - I called dump_txring from sky2_net_intr:
>>> --- a/drivers/net/sky2.c
>>> +++ b/drivers/net/sky2.c
>>> @@ -2725,8 +2791,10 @@ static void sky2_watchdog(unsigned long arg)
>>>    /* Hardware/software error handling */
>>>    static void sky2_err_intr(struct sky2_hw *hw, u32 status)
>>>    {
>>> -       if (net_ratelimit())
>>> +       if (net_ratelimit()) {
>>>                   dev_warn(&hw->pdev->dev, "error interrupt
>>> status=%#x\n", status);
>>> +               dump_txring(hw, 0);
>>> +       }
>>>
>>>           if (status&  Y2_IS_HW_ERR)
>>>                   sky2_hw_intr(hw);
>>>
>>> And got this:
>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>> status=0x40000008
>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>> status=0x40000008
>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 
>>> done=29
>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 
>>> done=29
>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>> status=0x8
>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>> status=0x8
>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 
>>> done=31
>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 
>>> done=31
>>>
>> I notice that you have NOUVEAU Nvidia drivers loaded? The one 
>> difference in HW
>> between your board and mine is that I have ATI video card.
>>
> Seems the problem is linked to auditd and X11 (but not nouveau).
>
> Today, I ran a bunch of scenarios. I first determined that the problem 
> only manifest in runlevel 5. Next, this occurred with or without KMS 
> and with or without nouveau. This happened whether or not I was logged 
> in (local or remote), and regardless of window manager (xdm, gdm, 
> kdm). I then checked to see what else was different between runlevel 3 
> and 5 - only thing was auditd. I disabled auditd and reran - no errors.
>
> Now for the odd stuff:
>
> The errors only manifest if the high throughput data transfer is 
> initiated when the system is in runlevel 5 and auditd was started by 
> init when transitioning from runlevel 3 to 5. For example, the 
> following scenarios do not cause the errors to manifest:
>
> runlevel3; start auditd runlevel 5; start transfer
> runlevel3; chkconfig auditd off; runlevel5; start auditd; start transfer
> runlevel3; start transfer (note: errors do not occur if I transition 
> to runlevel 5 after the high bandwidth transfer has started)
> runlevel3; startx; start transfer
>
> The only way I get the problem to manifest is transition to runlevel 5 
> with chkconfig auditd on (level 5 only) and then initate the windows 
> backup.
>
> I'm guessing that there is some sort of race condition happening 
> between X (xdm/gdm/kdm/greeter?) and auditd that is somehow corrupting 
> something. I'd hazard a more or less obvious guess that whatever's 
> being corrupted differs when there is already a high throughput 
> transfer under way.
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Dec. 30, 2009, 8:44 p.m. UTC | #8
A couple more observations:

1) enabling auditd for runlevel 3 mitigates the issue
2) starting a remote x session (XDMCP) while under load and while auditd 
is already running also triggers the sky2 interrupt status messages - so 
maybe not tty1 - but some sort of X & auditd interaction. Even in this 
case, the frequency of the error messages is much less than when auditd 
is started in runlevel 5 for the first time.

On 12/30/2009 2:15 PM, Michael Breuer wrote:
> And now looking at audit.c it seems reasonable that there is a race 
> condition when auditd is started at roughly the same time as X. I'm 
> guessing that the kaudit thread is fired up; the tty connected; and at 
> the same time X grabs the tty. Somewhere in there an skb gets hosed 
> and is then reused by whatever comes along - in my case sky2 as that's 
> where the subsequent demand is. If the demand happens first, the 
> contaminated skb (dk in what way yet) is probably waiting to manifest 
> as some other bug that's been frustrating people.
> On 12/30/2009 12:49 PM, Michael Breuer wrote:
>> On 12/30/2009 2:58 AM, Stephen Hemminger wrote:
>>> On Wed, 30 Dec 2009 02:23:20 -0500
>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>
>>>> Ok - I called dump_txring from sky2_net_intr:
>>>> --- a/drivers/net/sky2.c
>>>> +++ b/drivers/net/sky2.c
>>>> @@ -2725,8 +2791,10 @@ static void sky2_watchdog(unsigned long arg)
>>>>    /* Hardware/software error handling */
>>>>    static void sky2_err_intr(struct sky2_hw *hw, u32 status)
>>>>    {
>>>> -       if (net_ratelimit())
>>>> +       if (net_ratelimit()) {
>>>>                   dev_warn(&hw->pdev->dev, "error interrupt
>>>> status=%#x\n", status);
>>>> +               dump_txring(hw, 0);
>>>> +       }
>>>>
>>>>           if (status&  Y2_IS_HW_ERR)
>>>>                   sky2_hw_intr(hw);
>>>>
>>>> And got this:
>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>>> status=0x40000008
>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>>> status=0x40000008
>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 
>>>> done=29
>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 report=29 
>>>> done=29
>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>> status=0x8
>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>> status=0x8
>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 
>>>> done=31
>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 report=30 
>>>> done=31
>>>>
>>> I notice that you have NOUVEAU Nvidia drivers loaded? The one 
>>> difference in HW
>>> between your board and mine is that I have ATI video card.
>>>
>> Seems the problem is linked to auditd and X11 (but not nouveau).
>>
>> Today, I ran a bunch of scenarios. I first determined that the 
>> problem only manifest in runlevel 5. Next, this occurred with or 
>> without KMS and with or without nouveau. This happened whether or not 
>> I was logged in (local or remote), and regardless of window manager 
>> (xdm, gdm, kdm). I then checked to see what else was different 
>> between runlevel 3 and 5 - only thing was auditd. I disabled auditd 
>> and reran - no errors.
>>
>> Now for the odd stuff:
>>
>> The errors only manifest if the high throughput data transfer is 
>> initiated when the system is in runlevel 5 and auditd was started by 
>> init when transitioning from runlevel 3 to 5. For example, the 
>> following scenarios do not cause the errors to manifest:
>>
>> runlevel3; start auditd runlevel 5; start transfer
>> runlevel3; chkconfig auditd off; runlevel5; start auditd; start transfer
>> runlevel3; start transfer (note: errors do not occur if I transition 
>> to runlevel 5 after the high bandwidth transfer has started)
>> runlevel3; startx; start transfer
>>
>> The only way I get the problem to manifest is transition to runlevel 
>> 5 with chkconfig auditd on (level 5 only) and then initate the 
>> windows backup.
>>
>> I'm guessing that there is some sort of race condition happening 
>> between X (xdm/gdm/kdm/greeter?) and auditd that is somehow 
>> corrupting something. I'd hazard a more or less obvious guess that 
>> whatever's being corrupted differs when there is already a high 
>> throughput transfer under way.
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe 
>> linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Dec. 30, 2009, 9:15 p.m. UTC | #9
On 12/30/2009 3:44 PM, Michael Breuer wrote:
> A couple more observations:
>
> 1) enabling auditd for runlevel 3 mitigates the issue
> 2) starting a remote x session (XDMCP) while under load and while 
> auditd is already running also triggers the sky2 interrupt status 
> messages - so maybe not tty1 - but some sort of X & auditd 
> interaction. Even in this case, the frequency of the error messages is 
> much less than when auditd is started in runlevel 5 for the first time.
And just for some certainty - running with audit=0, the errors do not 
manifest.
>
> On 12/30/2009 2:15 PM, Michael Breuer wrote:
>> And now looking at audit.c it seems reasonable that there is a race 
>> condition when auditd is started at roughly the same time as X. I'm 
>> guessing that the kaudit thread is fired up; the tty connected; and 
>> at the same time X grabs the tty. Somewhere in there an skb gets 
>> hosed and is then reused by whatever comes along - in my case sky2 as 
>> that's where the subsequent demand is. If the demand happens first, 
>> the contaminated skb (dk in what way yet) is probably waiting to 
>> manifest as some other bug that's been frustrating people.
>> On 12/30/2009 12:49 PM, Michael Breuer wrote:
>>> On 12/30/2009 2:58 AM, Stephen Hemminger wrote:
>>>> On Wed, 30 Dec 2009 02:23:20 -0500
>>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>>
>>>>> Ok - I called dump_txring from sky2_net_intr:
>>>>> --- a/drivers/net/sky2.c
>>>>> +++ b/drivers/net/sky2.c
>>>>> @@ -2725,8 +2791,10 @@ static void sky2_watchdog(unsigned long arg)
>>>>>    /* Hardware/software error handling */
>>>>>    static void sky2_err_intr(struct sky2_hw *hw, u32 status)
>>>>>    {
>>>>> -       if (net_ratelimit())
>>>>> +       if (net_ratelimit()) {
>>>>>                   dev_warn(&hw->pdev->dev, "error interrupt
>>>>> status=%#x\n", status);
>>>>> +               dump_txring(hw, 0);
>>>>> +       }
>>>>>
>>>>>           if (status&  Y2_IS_HW_ERR)
>>>>>                   sky2_hw_intr(hw);
>>>>>
>>>>> And got this:
>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>> status=0x40000008
>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>> status=0x40000008
>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 
>>>>> report=29 done=29
>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 
>>>>> report=29 done=29
>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>>> status=0x8
>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>>> status=0x8
>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 
>>>>> report=30 done=31
>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 
>>>>> report=30 done=31
>>>>>
>>>> I notice that you have NOUVEAU Nvidia drivers loaded? The one 
>>>> difference in HW
>>>> between your board and mine is that I have ATI video card.
>>>>
>>> Seems the problem is linked to auditd and X11 (but not nouveau).
>>>
>>> Today, I ran a bunch of scenarios. I first determined that the 
>>> problem only manifest in runlevel 5. Next, this occurred with or 
>>> without KMS and with or without nouveau. This happened whether or 
>>> not I was logged in (local or remote), and regardless of window 
>>> manager (xdm, gdm, kdm). I then checked to see what else was 
>>> different between runlevel 3 and 5 - only thing was auditd. I 
>>> disabled auditd and reran - no errors.
>>>
>>> Now for the odd stuff:
>>>
>>> The errors only manifest if the high throughput data transfer is 
>>> initiated when the system is in runlevel 5 and auditd was started by 
>>> init when transitioning from runlevel 3 to 5. For example, the 
>>> following scenarios do not cause the errors to manifest:
>>>
>>> runlevel3; start auditd runlevel 5; start transfer
>>> runlevel3; chkconfig auditd off; runlevel5; start auditd; start 
>>> transfer
>>> runlevel3; start transfer (note: errors do not occur if I transition 
>>> to runlevel 5 after the high bandwidth transfer has started)
>>> runlevel3; startx; start transfer
>>>
>>> The only way I get the problem to manifest is transition to runlevel 
>>> 5 with chkconfig auditd on (level 5 only) and then initate the 
>>> windows backup.
>>>
>>> I'm guessing that there is some sort of race condition happening 
>>> between X (xdm/gdm/kdm/greeter?) and auditd that is somehow 
>>> corrupting something. I'd hazard a more or less obvious guess that 
>>> whatever's being corrupted differs when there is already a high 
>>> throughput transfer under way.
>>>
>>> -- 
>>> To unsubscribe from this list: send the line "unsubscribe 
>>> linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe 
>> linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Dec. 30, 2009, 9:21 p.m. UTC | #10
On 12/30/2009 4:15 PM, Michael Breuer wrote:
> On 12/30/2009 3:44 PM, Michael Breuer wrote:
>> A couple more observations:
>>
>> 1) enabling auditd for runlevel 3 mitigates the issue
>> 2) starting a remote x session (XDMCP) while under load and while 
>> auditd is already running also triggers the sky2 interrupt status 
>> messages - so maybe not tty1 - but some sort of X & auditd 
>> interaction. Even in this case, the frequency of the error messages 
>> is much less than when auditd is started in runlevel 5 for the first 
>> time.
> And just for some certainty - running with audit=0, the errors do not 
> manifest.
Disregard - took way longer, but they're back:
Dec 30 16:16:42 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:16:42 mail kernel: sky2 Tx ring pending=72...74 report=73 done=73
Dec 30 16:16:42 mail kernel: 72: 0x0: 0xb18a00be(8490)
Dec 30 16:16:42 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:16:42 mail kernel: sky2 Tx ring pending=86...88 report=87 done=88
Dec 30 16:16:42 mail kernel: 86: 0x0: 0xb1a140be(9014)
Dec 30 16:16:58 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:16:58 mail kernel: sky2 Tx ring pending=50...52 report=51 done=52
Dec 30 16:16:58 mail kernel: 50: 0x0: 0xac27c0be(8490)
Dec 30 16:17:15 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Dec 30 16:17:15 mail kernel: sky2 Tx ring pending=16...18 report=16 done=17
Dec 30 16:17:15 mail kernel: 16: 0x0: 0xa67ec0be(8490)
Dec 30 16:17:15 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:17:15 mail kernel: sky2 Tx ring pending=122...124 report=123 
done=124
Dec 30 16:17:15 mail kernel: 122: 0x0: 0xa68980be(8490)
Dec 30 16:17:16 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:17:16 mail kernel: sky2 Tx ring pending=66...68 report=67 done=67
Dec 30 16:17:16 mail kernel: 66: 0x0: 0xa67200be(8490)
Dec 30 16:17:16 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:17:16 mail kernel: sky2 Tx ring pending=74...76 report=75 done=76
Dec 30 16:17:16 mail kernel: 74: 0x0: 0xa659c0be(8490)
Dec 30 16:17:21 mail dhcpd: DHCPINFORM from 10.0.0.11 via eth0
Dec 30 16:17:21 mail dhcpd: DHCPACK to 10.0.0.11 (00:1a:92:8d:30:81) via 
eth0
Dec 30 16:18:01 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:18:01 mail kernel: sky2 Tx ring pending=0...2 report=1 done=1
Dec 30 16:18:01 mail kernel: 0: 0x0: 0x926580be(8490)
Dec 30 16:18:04 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:18:04 mail kernel: sky2 Tx ring pending=116...118 report=117 
done=117
Dec 30 16:18:04 mail kernel: 116: 0x0: 0x91f380be(8490)
Dec 30 16:18:04 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Dec 30 16:18:04 mail kernel: sky2 Tx ring pending=102...104 report=102 
done=103
Dec 30 16:18:04 mail kernel: 102: 0x0: 0x91ed80be(8490)
Dec 30 16:18:05 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:18:05 mail kernel: sky2 Tx ring pending=64...68 report=65 done=68
Dec 30 16:18:05 mail kernel: 64: 0x0: 0x91d340be(8490)
Dec 30 16:18:05 mail kernel: 66: 0x0: 0x920040ca(105)
Dec 30 16:18:06 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:18:06 mail kernel: sky2 Tx ring pending=82...86 report=83 done=83
Dec 30 16:18:06 mail kernel: 82: 0x0: 0x91bf80be(9014)
Dec 30 16:18:06 mail kernel: 84: 0x0: 0x91bec0be(7518)
Dec 30 16:18:06 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:18:06 mail kernel: sky2 Tx ring pending=83...86 report=85 done=86
Dec 30 16:18:06 mail kernel: 82: 0x0: 0x91bf80be(9014)
Dec 30 16:18:06 mail kernel: 84: 0x0: 0x91bec0be(7518)
Dec 30 16:18:06 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x40000008
Dec 30 16:18:06 mail kernel: sky2 Tx ring pending=85...88 report=87 done=87
Dec 30 16:18:06 mail kernel: 84: 0x0: 0x91bec0be(7518)
Dec 30 16:18:06 mail kernel: 86: 0x0: 0x91b080be(8398)
Dec 30 16:18:08 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Dec 30 16:18:08 mail kernel: sky2 Tx ring pending=28...30 report=28 done=29
Dec 30 16:18:08 mail kernel: 28: 0x0: 0x919500be(8506)
... continues ...
>>
>> On 12/30/2009 2:15 PM, Michael Breuer wrote:
>>> And now looking at audit.c it seems reasonable that there is a race 
>>> condition when auditd is started at roughly the same time as X. I'm 
>>> guessing that the kaudit thread is fired up; the tty connected; and 
>>> at the same time X grabs the tty. Somewhere in there an skb gets 
>>> hosed and is then reused by whatever comes along - in my case sky2 
>>> as that's where the subsequent demand is. If the demand happens 
>>> first, the contaminated skb (dk in what way yet) is probably waiting 
>>> to manifest as some other bug that's been frustrating people.
>>> On 12/30/2009 12:49 PM, Michael Breuer wrote:
>>>> On 12/30/2009 2:58 AM, Stephen Hemminger wrote:
>>>>> On Wed, 30 Dec 2009 02:23:20 -0500
>>>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>>>
>>>>>> Ok - I called dump_txring from sky2_net_intr:
>>>>>> --- a/drivers/net/sky2.c
>>>>>> +++ b/drivers/net/sky2.c
>>>>>> @@ -2725,8 +2791,10 @@ static void sky2_watchdog(unsigned long arg)
>>>>>>    /* Hardware/software error handling */
>>>>>>    static void sky2_err_intr(struct sky2_hw *hw, u32 status)
>>>>>>    {
>>>>>> -       if (net_ratelimit())
>>>>>> +       if (net_ratelimit()) {
>>>>>>                   dev_warn(&hw->pdev->dev, "error interrupt
>>>>>> status=%#x\n", status);
>>>>>> +               dump_txring(hw, 0);
>>>>>> +       }
>>>>>>
>>>>>>           if (status&  Y2_IS_HW_ERR)
>>>>>>                   sky2_hw_intr(hw);
>>>>>>
>>>>>> And got this:
>>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>>> status=0x40000008
>>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>>> status=0x40000008
>>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 
>>>>>> report=29 done=29
>>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=28...30 
>>>>>> report=29 done=29
>>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>>>> status=0x8
>>>>>> Dec 30 02:17:23 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>>>> status=0x8
>>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 
>>>>>> report=30 done=31
>>>>>> Dec 30 02:17:23 mail kernel: sky2 Tx ring pending=30...32 
>>>>>> report=30 done=31
>>>>>>
>>>>> I notice that you have NOUVEAU Nvidia drivers loaded? The one 
>>>>> difference in HW
>>>>> between your board and mine is that I have ATI video card.
>>>>>
>>>> Seems the problem is linked to auditd and X11 (but not nouveau).
>>>>
>>>> Today, I ran a bunch of scenarios. I first determined that the 
>>>> problem only manifest in runlevel 5. Next, this occurred with or 
>>>> without KMS and with or without nouveau. This happened whether or 
>>>> not I was logged in (local or remote), and regardless of window 
>>>> manager (xdm, gdm, kdm). I then checked to see what else was 
>>>> different between runlevel 3 and 5 - only thing was auditd. I 
>>>> disabled auditd and reran - no errors.
>>>>
>>>> Now for the odd stuff:
>>>>
>>>> The errors only manifest if the high throughput data transfer is 
>>>> initiated when the system is in runlevel 5 and auditd was started 
>>>> by init when transitioning from runlevel 3 to 5. For example, the 
>>>> following scenarios do not cause the errors to manifest:
>>>>
>>>> runlevel3; start auditd runlevel 5; start transfer
>>>> runlevel3; chkconfig auditd off; runlevel5; start auditd; start 
>>>> transfer
>>>> runlevel3; start transfer (note: errors do not occur if I 
>>>> transition to runlevel 5 after the high bandwidth transfer has 
>>>> started)
>>>> runlevel3; startx; start transfer
>>>>
>>>> The only way I get the problem to manifest is transition to 
>>>> runlevel 5 with chkconfig auditd on (level 5 only) and then initate 
>>>> the windows backup.
>>>>
>>>> I'm guessing that there is some sort of race condition happening 
>>>> between X (xdm/gdm/kdm/greeter?) and auditd that is somehow 
>>>> corrupting something. I'd hazard a more or less obvious guess that 
>>>> whatever's being corrupted differs when there is already a high 
>>>> throughput transfer under way.
>>>>
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe 
>>>> linux-kernel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at  http://www.tux.org/lkml/
>>>
>>> -- 
>>> To unsubscribe from this list: send the line "unsubscribe 
>>> linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe 
>> linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Dec. 31, 2009, 6:09 p.m. UTC | #11
Did some more digging today... Haven't nailed it, but there's something 
going on with X and tty...

Among other things, when I telinit 3 && telinit 5 the tty keeps 
switching between 7 and 8 (nothing else running on either tty). It would 
appear that somehow the tty deallocation isn't complete when X restarts. 
Also, X grabbing a tty seems to be a requisite step in recreating the 
sky2 issue.

On 12/30/2009 1:10 PM, Stephen Hemminger wrote:
> On Wed, 30 Dec 2009 10:40:56 -0500
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>    
>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 report=35 done=35
>> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
>> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
>> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
>> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
>> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
>>      
> You must be using 9K MTU.  Do you see the problem with smaller
> MTU?
>
>    
>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 report=89 done=91
>> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
>> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
>> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 report=39 done=41
>> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
>> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x40000008
>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 report=119
>> done=119
>> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
>> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
>> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
>> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
>>      
>    

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger Dec. 31, 2009, 6:24 p.m. UTC | #12
I have similar motherboards and ACPI is having issues with parsing
the tables, probably not related but see:

  http://bugzilla.kernel.org/show_bug.cgi?id=14954
Michael Breuer Jan. 1, 2010, 5:42 p.m. UTC | #13
More results... Seems that this is related to the tty (or maybe vt or 
maybe console) driver. I had disabled mingetty on tty1 as a workaround 
to a Fedora/KDE issue. When I reenabled it, I was then able to recreate 
this problem (sky2) in runlevel 3.

Retesting, it seems that the following conditions must be met in order 
for the interrupt errors to happen:

1) mingetty running on tty1 before the start of the test run
2) High sustained incoming load - 25MB/sec or more (as reported by nethogs)
3) I've only been able to recreate this using samba & a Windows 7 backup 
- but there could be other triggers

As it happens, much of my other data all came down to mingetty starting 
or not starting on tty1 at specific points of test preparation. At this 
point I can recreate 100% of the time if I ensure that there is a 
mingetty running on tty1, regardless of runlevel; and I can not recreate 
100% of the time when there is no mingetty running on tty1 at the start 
of the test. The results do not change if I start or stop mingetty on 
tty1 during the test. It's possible that X running on tty1 may also 
trigger the issue, but I haven't tested that. Note that running mingetty 
on the other tty's does not affect the outcome of the test. The test is 
also unaffected if I chvt to or away from tty1 before the test. I do not 
have to be logged in at the console (on any vt) to trigger the issue.

In a nutshell, to recreate: make sure there is a mingetty running on 
tty1, blast traffic into the system on eth0 (sky2).




On 12/31/2009 1:09 PM, Michael Breuer wrote:
> Did some more digging today... Haven't nailed it, but there's 
> something going on with X and tty...
>
> Among other things, when I telinit 3 && telinit 5 the tty keeps 
> switching between 7 and 8 (nothing else running on either tty). It 
> would appear that somehow the tty deallocation isn't complete when X 
> restarts. Also, X grabbing a tty seems to be a requisite step in 
> recreating the sky2 issue.
>
> On 12/30/2009 1:10 PM, Stephen Hemminger wrote:
>> On Wed, 30 Dec 2009 10:40:56 -0500
>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>
>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>> status=0x40000008
>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 report=35 
>>> done=35
>>> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
>>> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
>>> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
>>> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
>>> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
>> You must be using 9K MTU.  Do you see the problem with smaller
>> MTU?
>>
>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>> status=0x40000008
>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 report=89 
>>> done=91
>>> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
>>> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
>>> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt 
>>> status=0x8
>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 report=39 
>>> done=41
>>> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
>>> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt
>>> status=0x40000008
>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 report=119
>>> done=119
>>> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
>>> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
>>> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
>>> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Jan. 1, 2010, 7:26 p.m. UTC | #14
Running with TTY_DEBUG_HANGUP:

On boot - lots of stuff opening and closing tty1. Once up and stable 
(runlevel 3), I did initctl stop tty1, and then initctl start tty1. This 
is what was logged:

Jan  1 14:03:21 mail root: close mingetty1
Jan  1 14:03:21 mail kernel: tty_release_dev of tty1 (tty count=6)...
Jan  1 14:03:21 mail kernel: tty_release_dev of tty1 (tty count=5)...
Jan  1 14:03:21 mail kernel: tty1 vhangup...
Jan  1 14:03:21 mail init: tty1 main process (6031) killed by TERM signal
Jan  1 14:03:36 mail root: start mingetty1
Jan  1 14:03:36 mail kernel: error attempted to write to tty [0x(null)] 
= NULL
Jan  1 14:03:36 mail kernel: opening tty1...
Jan  1 14:03:36 mail kernel: tty1 vhangup...
Jan  1 14:03:36 mail kernel: tty_release_dev of tty1 (tty count=5)...

Start/stop of mingetty on other ttys does not get the NULL write error. 
I am always getting the NULL error when starting mingetty on tty1.

For example:
Jan  1 14:04:11 mail root: restart mingetty2
Jan  1 14:04:11 mail init: tty2 main process (6029) killed by TERM signal
Jan  1 14:04:11 mail kernel: tty_release_dev of tty7 (tty count=2)...
Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
Jan  1 14:04:11 mail kernel: freeing tty structure...
Jan  1 14:04:11 mail kernel: opening tty2...
Jan  1 14:04:11 mail kernel: tty2 vhangup...
Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
Jan  1 14:04:11 mail kernel: freeing tty structure...

While starting/stopping tty2 and X was coming up, I also got this:

Jan  1 14:04:11 mail root: restart mingetty2
Jan  1 14:04:11 mail init: tty2 main process (6029) killed by TERM signal
Jan  1 14:04:11 mail kernel: tty_release_dev of tty7 (tty count=2)...
Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
Jan  1 14:04:11 mail kernel: freeing tty structure...
Jan  1 14:04:11 mail kernel: opening tty2...
Jan  1 14:04:11 mail kernel: tty2 vhangup...
Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
Jan  1 14:04:11 mail kernel: freeing tty structure...
Jan  1 14:04:17 mail gnome-session[6832]: WARNING: Could not parse 
desktop file 
/home/mbreuer/.config/autostart/xfce4-tips-autostart.desktop: Key file 
does not have key 'Name'
Jan  1 14:04:17 mail gnome-session[6832]: WARNING: could not read 
/home/mbreuer/.config/autostart/xfce4-tips-autostart.desktop
Jan  1 14:04:19 mail pulseaudio[7002]: pid.c: Stale PID file, overwriting.
Jan  1 14:04:20 mail kernel: opening tty2...
Jan  1 14:04:20 mail kernel: opening pts2...
Jan  1 14:04:20 mail kernel: opening pts2...
Jan  1 14:04:20 mail kernel: tty_release_dev of pts2 (tty count=3)...
Jan  1 14:04:20 mail kernel: opening pts2...
Jan  1 14:04:20 mail kernel: tty_release_dev of pts2 (tty count=3)...
Jan  1 14:04:20 mail kernel: opening pts3...
Jan  1 14:04:20 mail kernel: opening pts3...
Jan  1 14:04:20 mail kernel: tty_release_dev of pts3 (tty count=3)...
Jan  1 14:04:20 mail kernel: opening pts3...
Jan  1 14:04:20 mail kernel: tty_release_dev of pts3 (tty count=3)...
Jan  1 14:04:20 mail kernel: opening pts4...
Jan  1 14:04:20 mail kernel: opening pts4...
Jan  1 14:04:20 mail kernel: tty_release_dev of pts4 (tty count=3)...
Jan  1 14:04:20 mail kernel: opening pts4...
Jan  1 14:04:21 mail kernel: tty_release_dev of pts4 (tty count=3)...
Jan  1 14:04:21 mail kernel: DMA-API: debugging out of memory - disabling
Jan  1 14:04:23 mail pulseaudio[7417]: pid.c: Daemon already running.
Jan  1 14:04:24 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Jan  1 14:04:24 mail kernel: sky2 Tx ring pending=45...52 report=45 done=47
Jan  1 14:04:24 mail kernel: 44: 0x0: 0xf26a40be(9014)
Jan  1 14:04:24 mail kernel: 46: 0x0: 0xf26a00be(9014)
Jan  1 14:04:24 mail kernel: 48: 0x0: 0xf269c0be(9014)
Jan  1 14:04:24 mail kernel: 50: 0x0: 0xf26980be(9014)
Jan  1 14:04:24 mail kernel: sky2 0000:06:00.0: error interrupt status=0x8
Jan  1 14:04:24 mail kernel: sky2 Tx ring pending=47...52 report=47 done=49
Jan  1 14:04:24 mail kernel: 46: 0x0: 0xf26a00be(9014)
Jan  1 14:04:24 mail kernel: 48: 0x0: 0xf269c0be(9014)
Jan  1 14:04:24 mail kernel: 50: 0x0: 0xf26980be(9014)

Note the sky2 errors (this time not under load).

It looks like there is something bad going on with the tty driver.


On 01/01/2010 12:42 PM, Michael Breuer wrote:
> More results... Seems that this is related to the tty (or maybe vt or 
> maybe console) driver. I had disabled mingetty on tty1 as a workaround 
> to a Fedora/KDE issue. When I reenabled it, I was then able to 
> recreate this problem (sky2) in runlevel 3.
>
> Retesting, it seems that the following conditions must be met in order 
> for the interrupt errors to happen:
>
> 1) mingetty running on tty1 before the start of the test run
> 2) High sustained incoming load - 25MB/sec or more (as reported by 
> nethogs)
> 3) I've only been able to recreate this using samba & a Windows 7 
> backup - but there could be other triggers
>
> As it happens, much of my other data all came down to mingetty 
> starting or not starting on tty1 at specific points of test 
> preparation. At this point I can recreate 100% of the time if I ensure 
> that there is a mingetty running on tty1, regardless of runlevel; and 
> I can not recreate 100% of the time when there is no mingetty running 
> on tty1 at the start of the test. The results do not change if I start 
> or stop mingetty on tty1 during the test. It's possible that X running 
> on tty1 may also trigger the issue, but I haven't tested that. Note 
> that running mingetty on the other tty's does not affect the outcome 
> of the test. The test is also unaffected if I chvt to or away from 
> tty1 before the test. I do not have to be logged in at the console (on 
> any vt) to trigger the issue.
>
> In a nutshell, to recreate: make sure there is a mingetty running on 
> tty1, blast traffic into the system on eth0 (sky2).
>
>
>
>
> On 12/31/2009 1:09 PM, Michael Breuer wrote:
>> Did some more digging today... Haven't nailed it, but there's 
>> something going on with X and tty...
>>
>> Among other things, when I telinit 3 && telinit 5 the tty keeps 
>> switching between 7 and 8 (nothing else running on either tty). It 
>> would appear that somehow the tty deallocation isn't complete when X 
>> restarts. Also, X grabbing a tty seems to be a requisite step in 
>> recreating the sky2 issue.
>>
>> On 12/30/2009 1:10 PM, Stephen Hemminger wrote:
>>> On Wed, 30 Dec 2009 10:40:56 -0500
>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>
>>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>>> status=0x40000008
>>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 report=35 
>>>> done=35
>>>> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
>>>> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
>>>> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
>>>> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
>>>> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
>>> You must be using 9K MTU.  Do you see the problem with smaller
>>> MTU?
>>>
>>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>>> status=0x40000008
>>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 report=89 
>>>> done=91
>>>> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
>>>> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
>>>> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
>>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>> status=0x8
>>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 report=39 
>>>> done=41
>>>> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
>>>> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
>>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt
>>>> status=0x40000008
>>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 report=119
>>>> done=119
>>>> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
>>>> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
>>>> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
>>>> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
>>
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Jan. 1, 2010, 8:34 p.m. UTC | #15
In single user mode:

Looks like the first attempt to run mingetty on tty1 triggers a vhangup 
of more than tty1. Also, the initial console (shell, etc) were open on 
tty1 when I started mingetty on tty1. Looks like the subsequent vhangup 
is what broke things. Also, the vhangup issued from mingetty seems to 
have resulted in tty2 activity as well, although I don't see why it 
should have.

Scenario:

Started in single user mode.
did initctl start tty2 and then switched to tty2 and logged in:
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty2 vhangup...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=1)...
Jan  1 15:11:42 mail kernel: freeing tty structure...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty2 vhangup...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...

Then (and note that time should have advanced, but didn't): initctl 
start tty1 (previously, the initial shell from single user mode was up):

Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty1 vhangup...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=3)...
Jan  1 15:11:42 mail kernel: tty1 vhangup...
Jan  1 15:11:42 mail kernel: error attempted to write to tty [0x(null)] 
= NULL
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=4)...

While I was doing this, I was watching dmesg, not /var/log/message. 
Didn't notice time was stuck...
Then, I did telinit 5:
Jan  1 15:11:42 mail kernel: sky2 eth1: enabling interface
Jan  1 15:11:42 mail kernel: ADDRCONF(NETDEV_UP): eth1: link is not ready
Jan  1 15:11:42 mail kernel: sky2 eth1: Link is up at 100 Mbps, full 
duplex, flow control rx
Jan  1 15:11:42 mail kernel: ADDRCONF(NETDEV_CHANGE): eth1: link becomes 
ready
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=4)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: IPv6 over IPv4 tunneling driver
Jan  1 15:11:42 mail kernel: sit0: Disabled Privacy Extensions
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: SELinux: initialized (dev cgroup, type 
cgroup), uses genfs_contexts
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: audit rule for LSM 'dhclient_t' is invalid
Jan  1 15:11:42 mail kernel: audit rule for LSM 'mcstransd_t' is invalid
Jan  1 15:11:42 mail kernel: audit rule for LSM 'samba_t' is invalid
Jan  1 15:11:42 mail kernel: audit rule for LSM 'dhclient_t' is invalid
Jan  1 15:11:42 mail kernel: audit rule for LSM 'mcstransd_t' is invalid
Jan  1 15:11:42 mail kernel: audit rule for LSM 'samba_t' is invalid
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty2...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=4)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail cpuspeed: Enabling ondemand cpu frequency scaling 
governor
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:42 mail kdump: kexec: loaded kdump kernel
Jan  1 15:11:42 mail kdump: started up
Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
Jan  1 15:11:42 mail kernel: opening tty1...
Jan  1 15:11:43 mail kernel: tty_release_dev of tty1 (tty count=2)...

This is when time increments again ... time jumped ahead and was OK by 
the time ntp started. Looks like system time was OK, but syslog was 
backlogged.

On 01/01/2010 02:26 PM, Michael Breuer wrote:
> Running with TTY_DEBUG_HANGUP:
>
> On boot - lots of stuff opening and closing tty1. Once up and stable 
> (runlevel 3), I did initctl stop tty1, and then initctl start tty1. 
> This is what was logged:
>
> Jan  1 14:03:21 mail root: close mingetty1
> Jan  1 14:03:21 mail kernel: tty_release_dev of tty1 (tty count=6)...
> Jan  1 14:03:21 mail kernel: tty_release_dev of tty1 (tty count=5)...
> Jan  1 14:03:21 mail kernel: tty1 vhangup...
> Jan  1 14:03:21 mail init: tty1 main process (6031) killed by TERM signal
> Jan  1 14:03:36 mail root: start mingetty1
> Jan  1 14:03:36 mail kernel: error attempted to write to tty 
> [0x(null)] = NULL
> Jan  1 14:03:36 mail kernel: opening tty1...
> Jan  1 14:03:36 mail kernel: tty1 vhangup...
> Jan  1 14:03:36 mail kernel: tty_release_dev of tty1 (tty count=5)...
>
> Start/stop of mingetty on other ttys does not get the NULL write 
> error. I am always getting the NULL error when starting mingetty on tty1.
>
> For example:
> Jan  1 14:04:11 mail root: restart mingetty2
> Jan  1 14:04:11 mail init: tty2 main process (6029) killed by TERM signal
> Jan  1 14:04:11 mail kernel: tty_release_dev of tty7 (tty count=2)...
> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
> Jan  1 14:04:11 mail kernel: freeing tty structure...
> Jan  1 14:04:11 mail kernel: opening tty2...
> Jan  1 14:04:11 mail kernel: tty2 vhangup...
> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
> Jan  1 14:04:11 mail kernel: freeing tty structure...
>
> While starting/stopping tty2 and X was coming up, I also got this:
>
> Jan  1 14:04:11 mail root: restart mingetty2
> Jan  1 14:04:11 mail init: tty2 main process (6029) killed by TERM signal
> Jan  1 14:04:11 mail kernel: tty_release_dev of tty7 (tty count=2)...
> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
> Jan  1 14:04:11 mail kernel: freeing tty structure...
> Jan  1 14:04:11 mail kernel: opening tty2...
> Jan  1 14:04:11 mail kernel: tty2 vhangup...
> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
> Jan  1 14:04:11 mail kernel: freeing tty structure...
> Jan  1 14:04:17 mail gnome-session[6832]: WARNING: Could not parse 
> desktop file 
> /home/mbreuer/.config/autostart/xfce4-tips-autostart.desktop: Key file 
> does not have key 'Name'
> Jan  1 14:04:17 mail gnome-session[6832]: WARNING: could not read 
> /home/mbreuer/.config/autostart/xfce4-tips-autostart.desktop
> Jan  1 14:04:19 mail pulseaudio[7002]: pid.c: Stale PID file, 
> overwriting.
> Jan  1 14:04:20 mail kernel: opening tty2...
> Jan  1 14:04:20 mail kernel: opening pts2...
> Jan  1 14:04:20 mail kernel: opening pts2...
> Jan  1 14:04:20 mail kernel: tty_release_dev of pts2 (tty count=3)...
> Jan  1 14:04:20 mail kernel: opening pts2...
> Jan  1 14:04:20 mail kernel: tty_release_dev of pts2 (tty count=3)...
> Jan  1 14:04:20 mail kernel: opening pts3...
> Jan  1 14:04:20 mail kernel: opening pts3...
> Jan  1 14:04:20 mail kernel: tty_release_dev of pts3 (tty count=3)...
> Jan  1 14:04:20 mail kernel: opening pts3...
> Jan  1 14:04:20 mail kernel: tty_release_dev of pts3 (tty count=3)...
> Jan  1 14:04:20 mail kernel: opening pts4...
> Jan  1 14:04:20 mail kernel: opening pts4...
> Jan  1 14:04:20 mail kernel: tty_release_dev of pts4 (tty count=3)...
> Jan  1 14:04:20 mail kernel: opening pts4...
> Jan  1 14:04:21 mail kernel: tty_release_dev of pts4 (tty count=3)...
> Jan  1 14:04:21 mail kernel: DMA-API: debugging out of memory - disabling
> Jan  1 14:04:23 mail pulseaudio[7417]: pid.c: Daemon already running.
> Jan  1 14:04:24 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x8
> Jan  1 14:04:24 mail kernel: sky2 Tx ring pending=45...52 report=45 
> done=47
> Jan  1 14:04:24 mail kernel: 44: 0x0: 0xf26a40be(9014)
> Jan  1 14:04:24 mail kernel: 46: 0x0: 0xf26a00be(9014)
> Jan  1 14:04:24 mail kernel: 48: 0x0: 0xf269c0be(9014)
> Jan  1 14:04:24 mail kernel: 50: 0x0: 0xf26980be(9014)
> Jan  1 14:04:24 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x8
> Jan  1 14:04:24 mail kernel: sky2 Tx ring pending=47...52 report=47 
> done=49
> Jan  1 14:04:24 mail kernel: 46: 0x0: 0xf26a00be(9014)
> Jan  1 14:04:24 mail kernel: 48: 0x0: 0xf269c0be(9014)
> Jan  1 14:04:24 mail kernel: 50: 0x0: 0xf26980be(9014)
>
> Note the sky2 errors (this time not under load).
>
> It looks like there is something bad going on with the tty driver.
>
>
> On 01/01/2010 12:42 PM, Michael Breuer wrote:
>> More results... Seems that this is related to the tty (or maybe vt or 
>> maybe console) driver. I had disabled mingetty on tty1 as a 
>> workaround to a Fedora/KDE issue. When I reenabled it, I was then 
>> able to recreate this problem (sky2) in runlevel 3.
>>
>> Retesting, it seems that the following conditions must be met in 
>> order for the interrupt errors to happen:
>>
>> 1) mingetty running on tty1 before the start of the test run
>> 2) High sustained incoming load - 25MB/sec or more (as reported by 
>> nethogs)
>> 3) I've only been able to recreate this using samba & a Windows 7 
>> backup - but there could be other triggers
>>
>> As it happens, much of my other data all came down to mingetty 
>> starting or not starting on tty1 at specific points of test 
>> preparation. At this point I can recreate 100% of the time if I 
>> ensure that there is a mingetty running on tty1, regardless of 
>> runlevel; and I can not recreate 100% of the time when there is no 
>> mingetty running on tty1 at the start of the test. The results do not 
>> change if I start or stop mingetty on tty1 during the test. It's 
>> possible that X running on tty1 may also trigger the issue, but I 
>> haven't tested that. Note that running mingetty on the other tty's 
>> does not affect the outcome of the test. The test is also unaffected 
>> if I chvt to or away from tty1 before the test. I do not have to be 
>> logged in at the console (on any vt) to trigger the issue.
>>
>> In a nutshell, to recreate: make sure there is a mingetty running on 
>> tty1, blast traffic into the system on eth0 (sky2).
>>
>>
>>
>>
>> On 12/31/2009 1:09 PM, Michael Breuer wrote:
>>> Did some more digging today... Haven't nailed it, but there's 
>>> something going on with X and tty...
>>>
>>> Among other things, when I telinit 3 && telinit 5 the tty keeps 
>>> switching between 7 and 8 (nothing else running on either tty). It 
>>> would appear that somehow the tty deallocation isn't complete when X 
>>> restarts. Also, X grabbing a tty seems to be a requisite step in 
>>> recreating the sky2 issue.
>>>
>>> On 12/30/2009 1:10 PM, Stephen Hemminger wrote:
>>>> On Wed, 30 Dec 2009 10:40:56 -0500
>>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>>
>>>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>> status=0x40000008
>>>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 
>>>>> report=35 done=35
>>>>> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
>>>>> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
>>>>> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
>>>>> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
>>>>> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
>>>> You must be using 9K MTU.  Do you see the problem with smaller
>>>> MTU?
>>>>
>>>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>> status=0x40000008
>>>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 
>>>>> report=89 done=91
>>>>> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
>>>>> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
>>>>> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
>>>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>>> status=0x8
>>>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 
>>>>> report=39 done=41
>>>>> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
>>>>> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
>>>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>> status=0x40000008
>>>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 
>>>>> report=119
>>>>> done=119
>>>>> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
>>>>> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
>>>>> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
>>>>> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
>>>
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe 
>> linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe 
> linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Jan. 2, 2010, 9:42 p.m. UTC | #16
Made some more progress - hopefully someone can point me in the right 
direction....
Put extra debugging printk's into tty_io.c and learned that when init 
starts the mingetty's, tty_release_dev is called for ptm0, and the tty 
structure is freed (according to the debug log message) - but up to this 
point, it had never been allocated. It looks like this is happening as 
the result of vhangup for pts0 being invoked while in the middle of 
closing (tty_release_dev) several tty1's. I'm guessing that's where 
something is being stepped on, but haven't tracked it down quite yet. 
Regardless, it's after this that I can generate sky2 network errors, and 
not before. This does not seem to happen if I remove /etc/events.d/tty1 
before boot.


On 1/1/2010 3:34 PM, Michael Breuer wrote:
> In single user mode:
>
> Looks like the first attempt to run mingetty on tty1 triggers a 
> vhangup of more than tty1. Also, the initial console (shell, etc) were 
> open on tty1 when I started mingetty on tty1. Looks like the 
> subsequent vhangup is what broke things. Also, the vhangup issued from 
> mingetty seems to have resulted in tty2 activity as well, although I 
> don't see why it should have.
>
> Scenario:
>
> Started in single user mode.
> did initctl start tty2 and then switched to tty2 and logged in:
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty2 vhangup...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=1)...
> Jan  1 15:11:42 mail kernel: freeing tty structure...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty2 vhangup...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
>
> Then (and note that time should have advanced, but didn't): initctl 
> start tty1 (previously, the initial shell from single user mode was up):
>
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty1 vhangup...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=3)...
> Jan  1 15:11:42 mail kernel: tty1 vhangup...
> Jan  1 15:11:42 mail kernel: error attempted to write to tty 
> [0x(null)] = NULL
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=3)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=4)...
>
> While I was doing this, I was watching dmesg, not /var/log/message. 
> Didn't notice time was stuck...
> Then, I did telinit 5:
> Jan  1 15:11:42 mail kernel: sky2 eth1: enabling interface
> Jan  1 15:11:42 mail kernel: ADDRCONF(NETDEV_UP): eth1: link is not ready
> Jan  1 15:11:42 mail kernel: sky2 eth1: Link is up at 100 Mbps, full 
> duplex, flow control rx
> Jan  1 15:11:42 mail kernel: ADDRCONF(NETDEV_CHANGE): eth1: link 
> becomes ready
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=4)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: IPv6 over IPv4 tunneling driver
> Jan  1 15:11:42 mail kernel: sit0: Disabled Privacy Extensions
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: SELinux: initialized (dev cgroup, type 
> cgroup), uses genfs_contexts
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: audit rule for LSM 'dhclient_t' is invalid
> Jan  1 15:11:42 mail kernel: audit rule for LSM 'mcstransd_t' is invalid
> Jan  1 15:11:42 mail kernel: audit rule for LSM 'samba_t' is invalid
> Jan  1 15:11:42 mail kernel: audit rule for LSM 'dhclient_t' is invalid
> Jan  1 15:11:42 mail kernel: audit rule for LSM 'mcstransd_t' is invalid
> Jan  1 15:11:42 mail kernel: audit rule for LSM 'samba_t' is invalid
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty2...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty2 (tty count=4)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail cpuspeed: Enabling ondemand cpu frequency scaling 
> governor
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:42 mail kdump: kexec: loaded kdump kernel
> Jan  1 15:11:42 mail kdump: started up
> Jan  1 15:11:42 mail kernel: tty_release_dev of tty1 (tty count=2)...
> Jan  1 15:11:42 mail kernel: opening tty1...
> Jan  1 15:11:43 mail kernel: tty_release_dev of tty1 (tty count=2)...
>
> This is when time increments again ... time jumped ahead and was OK by 
> the time ntp started. Looks like system time was OK, but syslog was 
> backlogged.
>
> On 01/01/2010 02:26 PM, Michael Breuer wrote:
>> Running with TTY_DEBUG_HANGUP:
>>
>> On boot - lots of stuff opening and closing tty1. Once up and stable 
>> (runlevel 3), I did initctl stop tty1, and then initctl start tty1. 
>> This is what was logged:
>>
>> Jan  1 14:03:21 mail root: close mingetty1
>> Jan  1 14:03:21 mail kernel: tty_release_dev of tty1 (tty count=6)...
>> Jan  1 14:03:21 mail kernel: tty_release_dev of tty1 (tty count=5)...
>> Jan  1 14:03:21 mail kernel: tty1 vhangup...
>> Jan  1 14:03:21 mail init: tty1 main process (6031) killed by TERM 
>> signal
>> Jan  1 14:03:36 mail root: start mingetty1
>> Jan  1 14:03:36 mail kernel: error attempted to write to tty 
>> [0x(null)] = NULL
>> Jan  1 14:03:36 mail kernel: opening tty1...
>> Jan  1 14:03:36 mail kernel: tty1 vhangup...
>> Jan  1 14:03:36 mail kernel: tty_release_dev of tty1 (tty count=5)...
>>
>> Start/stop of mingetty on other ttys does not get the NULL write 
>> error. I am always getting the NULL error when starting mingetty on 
>> tty1.
>>
>> For example:
>> Jan  1 14:04:11 mail root: restart mingetty2
>> Jan  1 14:04:11 mail init: tty2 main process (6029) killed by TERM 
>> signal
>> Jan  1 14:04:11 mail kernel: tty_release_dev of tty7 (tty count=2)...
>> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
>> Jan  1 14:04:11 mail kernel: freeing tty structure...
>> Jan  1 14:04:11 mail kernel: opening tty2...
>> Jan  1 14:04:11 mail kernel: tty2 vhangup...
>> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
>> Jan  1 14:04:11 mail kernel: freeing tty structure...
>>
>> While starting/stopping tty2 and X was coming up, I also got this:
>>
>> Jan  1 14:04:11 mail root: restart mingetty2
>> Jan  1 14:04:11 mail init: tty2 main process (6029) killed by TERM 
>> signal
>> Jan  1 14:04:11 mail kernel: tty_release_dev of tty7 (tty count=2)...
>> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
>> Jan  1 14:04:11 mail kernel: freeing tty structure...
>> Jan  1 14:04:11 mail kernel: opening tty2...
>> Jan  1 14:04:11 mail kernel: tty2 vhangup...
>> Jan  1 14:04:11 mail kernel: tty_release_dev of tty2 (tty count=1)...
>> Jan  1 14:04:11 mail kernel: freeing tty structure...
>> Jan  1 14:04:17 mail gnome-session[6832]: WARNING: Could not parse 
>> desktop file 
>> /home/mbreuer/.config/autostart/xfce4-tips-autostart.desktop: Key 
>> file does not have key 'Name'
>> Jan  1 14:04:17 mail gnome-session[6832]: WARNING: could not read 
>> /home/mbreuer/.config/autostart/xfce4-tips-autostart.desktop
>> Jan  1 14:04:19 mail pulseaudio[7002]: pid.c: Stale PID file, 
>> overwriting.
>> Jan  1 14:04:20 mail kernel: opening tty2...
>> Jan  1 14:04:20 mail kernel: opening pts2...
>> Jan  1 14:04:20 mail kernel: opening pts2...
>> Jan  1 14:04:20 mail kernel: tty_release_dev of pts2 (tty count=3)...
>> Jan  1 14:04:20 mail kernel: opening pts2...
>> Jan  1 14:04:20 mail kernel: tty_release_dev of pts2 (tty count=3)...
>> Jan  1 14:04:20 mail kernel: opening pts3...
>> Jan  1 14:04:20 mail kernel: opening pts3...
>> Jan  1 14:04:20 mail kernel: tty_release_dev of pts3 (tty count=3)...
>> Jan  1 14:04:20 mail kernel: opening pts3...
>> Jan  1 14:04:20 mail kernel: tty_release_dev of pts3 (tty count=3)...
>> Jan  1 14:04:20 mail kernel: opening pts4...
>> Jan  1 14:04:20 mail kernel: opening pts4...
>> Jan  1 14:04:20 mail kernel: tty_release_dev of pts4 (tty count=3)...
>> Jan  1 14:04:20 mail kernel: opening pts4...
>> Jan  1 14:04:21 mail kernel: tty_release_dev of pts4 (tty count=3)...
>> Jan  1 14:04:21 mail kernel: DMA-API: debugging out of memory - 
>> disabling
>> Jan  1 14:04:23 mail pulseaudio[7417]: pid.c: Daemon already running.
>> Jan  1 14:04:24 mail kernel: sky2 0000:06:00.0: error interrupt 
>> status=0x8
>> Jan  1 14:04:24 mail kernel: sky2 Tx ring pending=45...52 report=45 
>> done=47
>> Jan  1 14:04:24 mail kernel: 44: 0x0: 0xf26a40be(9014)
>> Jan  1 14:04:24 mail kernel: 46: 0x0: 0xf26a00be(9014)
>> Jan  1 14:04:24 mail kernel: 48: 0x0: 0xf269c0be(9014)
>> Jan  1 14:04:24 mail kernel: 50: 0x0: 0xf26980be(9014)
>> Jan  1 14:04:24 mail kernel: sky2 0000:06:00.0: error interrupt 
>> status=0x8
>> Jan  1 14:04:24 mail kernel: sky2 Tx ring pending=47...52 report=47 
>> done=49
>> Jan  1 14:04:24 mail kernel: 46: 0x0: 0xf26a00be(9014)
>> Jan  1 14:04:24 mail kernel: 48: 0x0: 0xf269c0be(9014)
>> Jan  1 14:04:24 mail kernel: 50: 0x0: 0xf26980be(9014)
>>
>> Note the sky2 errors (this time not under load).
>>
>> It looks like there is something bad going on with the tty driver.
>>
>>
>> On 01/01/2010 12:42 PM, Michael Breuer wrote:
>>> More results... Seems that this is related to the tty (or maybe vt 
>>> or maybe console) driver. I had disabled mingetty on tty1 as a 
>>> workaround to a Fedora/KDE issue. When I reenabled it, I was then 
>>> able to recreate this problem (sky2) in runlevel 3.
>>>
>>> Retesting, it seems that the following conditions must be met in 
>>> order for the interrupt errors to happen:
>>>
>>> 1) mingetty running on tty1 before the start of the test run
>>> 2) High sustained incoming load - 25MB/sec or more (as reported by 
>>> nethogs)
>>> 3) I've only been able to recreate this using samba & a Windows 7 
>>> backup - but there could be other triggers
>>>
>>> As it happens, much of my other data all came down to mingetty 
>>> starting or not starting on tty1 at specific points of test 
>>> preparation. At this point I can recreate 100% of the time if I 
>>> ensure that there is a mingetty running on tty1, regardless of 
>>> runlevel; and I can not recreate 100% of the time when there is no 
>>> mingetty running on tty1 at the start of the test. The results do 
>>> not change if I start or stop mingetty on tty1 during the test. It's 
>>> possible that X running on tty1 may also trigger the issue, but I 
>>> haven't tested that. Note that running mingetty on the other tty's 
>>> does not affect the outcome of the test. The test is also unaffected 
>>> if I chvt to or away from tty1 before the test. I do not have to be 
>>> logged in at the console (on any vt) to trigger the issue.
>>>
>>> In a nutshell, to recreate: make sure there is a mingetty running on 
>>> tty1, blast traffic into the system on eth0 (sky2).
>>>
>>>
>>>
>>>
>>> On 12/31/2009 1:09 PM, Michael Breuer wrote:
>>>> Did some more digging today... Haven't nailed it, but there's 
>>>> something going on with X and tty...
>>>>
>>>> Among other things, when I telinit 3 && telinit 5 the tty keeps 
>>>> switching between 7 and 8 (nothing else running on either tty). It 
>>>> would appear that somehow the tty deallocation isn't complete when 
>>>> X restarts. Also, X grabbing a tty seems to be a requisite step in 
>>>> recreating the sky2 issue.
>>>>
>>>> On 12/30/2009 1:10 PM, Stephen Hemminger wrote:
>>>>> On Wed, 30 Dec 2009 10:40:56 -0500
>>>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>>>
>>>>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>>> status=0x40000008
>>>>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=31...40 
>>>>>> report=35 done=35
>>>>>> Dec 30 10:37:14 mail kernel: 30: 0x0: 0xeb50c0ca(9014)
>>>>>> Dec 30 10:37:14 mail kernel: 32: 0x0: 0xeb5080ca(9014)
>>>>>> Dec 30 10:37:14 mail kernel: 34: 0x0: 0xeb5040ca(9014)
>>>>>> Dec 30 10:37:14 mail kernel: 36: 0x0: 0xeb5000ca(9014)
>>>>>> Dec 30 10:37:14 mail kernel: 38: 0x0: 0xeb4fc0ca(9014)
>>>>> You must be using 9K MTU.  Do you see the problem with smaller
>>>>> MTU?
>>>>>
>>>>>> Dec 30 10:37:14 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>>> status=0x40000008
>>>>>> Dec 30 10:37:14 mail kernel: sky2 Tx ring pending=87...92 
>>>>>> report=89 done=91
>>>>>> Dec 30 10:37:14 mail kernel: 86: 0x0: 0xeafd80ca(9014)
>>>>>> Dec 30 10:37:14 mail kernel: 88: 0x0: 0xeafd40ca(9014)
>>>>>> Dec 30 10:37:14 mail kernel: 90: 0x0: 0xeafd00ca(9014)
>>>>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt 
>>>>>> status=0x8
>>>>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=39...42 
>>>>>> report=39 done=41
>>>>>> Dec 30 10:37:15 mail kernel: 38: 0x0: 0xe5fec0ca(9014)
>>>>>> Dec 30 10:37:15 mail kernel: 40: 0x0: 0xe5fe80ca(9014)
>>>>>> Dec 30 10:37:15 mail kernel: sky2 0000:06:00.0: error interrupt
>>>>>> status=0x40000008
>>>>>> Dec 30 10:37:15 mail kernel: sky2 Tx ring pending=117...124 
>>>>>> report=119
>>>>>> done=119
>>>>>> Dec 30 10:37:15 mail kernel: 116: 0x0: 0xe5d6c0ca(9014)
>>>>>> Dec 30 10:37:15 mail kernel: 118: 0x0: 0xe5d680ca(9014)
>>>>>> Dec 30 10:37:15 mail kernel: 120: 0x0: 0xe5d660ca(5938)
>>>>>> Dec 30 10:37:15 mail kernel: 122: 0x0: 0xe5d640ca(4213)
>>>>
>>>
>>> -- 
>>> To unsubscribe from this list: send the line "unsubscribe 
>>> linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe 
>> linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -2725,8 +2791,10 @@  static void sky2_watchdog(unsigned long arg)
  /* Hardware/software error handling */
  static void sky2_err_intr(struct sky2_hw *hw, u32 status)
  {
-       if (net_ratelimit())
+       if (net_ratelimit()) {
                 dev_warn(&hw->pdev->dev, "error interrupt 
status=%#x\n", status);
+               dump_txring(hw, 0);
+       }

         if (status & Y2_IS_HW_ERR)