diff mbox

sky2: receive dma mapping error handling

Message ID 20100201102018.7b597992@nehalam
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Stephen Hemminger Feb. 1, 2010, 6:20 p.m. UTC
This fixes the fact that re->flags is always zero without causing
other confusion.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michael Breuer Feb. 1, 2010, 6:44 p.m. UTC | #1
On 2/1/2010 1:20 PM, Stephen Hemminger wrote:
> This fixes the fact that re->flags is always zero without causing
> other confusion.
>
> --- a/drivers/net/sky2.c	2010-02-01 10:07:42.676296236 -0800
> +++ b/drivers/net/sky2.c	2010-02-01 10:18:12.575044064 -0800
> @@ -1025,11 +1025,8 @@ static void sky2_prefetch_init(struct sk
>   static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2, u16 *slot)
>   {
>   	struct sky2_tx_le *le = sky2->tx_le + *slot;
> -	struct tx_ring_info *re = sky2->tx_ring + *slot;
>
>   	*slot = RING_NEXT(*slot, sky2->tx_ring_size);
> -	re->flags = 0;
> -	re->skb = NULL;
>   	le->ctrl = 0;
>   	return le;
>   }
> @@ -1622,8 +1619,7 @@ static unsigned tx_le_req(const struct s
>   	return count;
>   }
>
> -static void sky2_tx_unmap(struct pci_dev *pdev,
> -			  const struct tx_ring_info *re)
> +static void sky2_tx_unmap(struct pci_dev *pdev, struct tx_ring_info *re)
>   {
>   	if (re->flags&  TX_MAP_SINGLE)
>   		pci_unmap_single(pdev, pci_unmap_addr(re, mapaddr),
> @@ -1633,6 +1629,7 @@ static void sky2_tx_unmap(struct pci_dev
>   		pci_unmap_page(pdev, pci_unmap_addr(re, mapaddr),
>   			       pci_unmap_len(re, maplen),
>   			       PCI_DMA_TODEVICE);
> +	re->flags = 0;
>   }
>
>   /*
> @@ -1839,6 +1836,7 @@ static void sky2_tx_complete(struct sky2
>   			dev->stats.tx_packets++;
>   			dev->stats.tx_bytes += skb->len;
>
> +			re->skb = NULL;
>   			dev_kfree_skb_any(skb);
>
>   			sky2->tx_next = RING_NEXT(idx, sky2->tx_ring_size);
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    
Running with this patch now - dma debug num_free_entries stable - so 
far, so good.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Feb. 1, 2010, 8:13 p.m. UTC | #2
On Mon, Feb 01, 2010 at 10:20:18AM -0800, Stephen Hemminger wrote:
> This fixes the fact that re->flags is always zero without causing
> other confusion.

Actually, there is a slight confusion: after tx_init() slots #0 are
skipped and waiting for tx_complete. Of course, it's no big deal, but
there's no problem with fixing it too, so that is the main difference
between these two patches. (Moving re->flags and re->skb initializations
is an optimization, but I tried to change only the buggy parts.)

Jarek P.

> 
> --- a/drivers/net/sky2.c	2010-02-01 10:07:42.676296236 -0800
> +++ b/drivers/net/sky2.c	2010-02-01 10:18:12.575044064 -0800
> @@ -1025,11 +1025,8 @@ static void sky2_prefetch_init(struct sk
>  static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2, u16 *slot)
>  {
>  	struct sky2_tx_le *le = sky2->tx_le + *slot;
> -	struct tx_ring_info *re = sky2->tx_ring + *slot;
>  
>  	*slot = RING_NEXT(*slot, sky2->tx_ring_size);
> -	re->flags = 0;
> -	re->skb = NULL;
>  	le->ctrl = 0;
>  	return le;
>  }
> @@ -1622,8 +1619,7 @@ static unsigned tx_le_req(const struct s
>  	return count;
>  }
>  
> -static void sky2_tx_unmap(struct pci_dev *pdev,
> -			  const struct tx_ring_info *re)
> +static void sky2_tx_unmap(struct pci_dev *pdev, struct tx_ring_info *re)
>  {
>  	if (re->flags & TX_MAP_SINGLE)
>  		pci_unmap_single(pdev, pci_unmap_addr(re, mapaddr),
> @@ -1633,6 +1629,7 @@ static void sky2_tx_unmap(struct pci_dev
>  		pci_unmap_page(pdev, pci_unmap_addr(re, mapaddr),
>  			       pci_unmap_len(re, maplen),
>  			       PCI_DMA_TODEVICE);
> +	re->flags = 0;
>  }
>  
>  /*
> @@ -1839,6 +1836,7 @@ static void sky2_tx_complete(struct sky2
>  			dev->stats.tx_packets++;
>  			dev->stats.tx_bytes += skb->len;
>  
> +			re->skb = NULL;
>  			dev_kfree_skb_any(skb);
>  
>  			sky2->tx_next = RING_NEXT(idx, sky2->tx_ring_size);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Feb. 1, 2010, 8:41 p.m. UTC | #3
On Mon, Feb 01, 2010 at 09:13:23PM +0100, Jarek Poplawski wrote:
> On Mon, Feb 01, 2010 at 10:20:18AM -0800, Stephen Hemminger wrote:
> > This fixes the fact that re->flags is always zero without causing
> > other confusion.
> 
> Actually, there is a slight confusion: after tx_init() slots #0 are
> skipped and waiting for tx_complete. Of course, no big deal, but no
> problem with fixing it too, so there is the main difference between

Hmm... On the other hand, it could be fixed more simply by moving
sky2->tx_cons. I'll send v3.

Thanks,
Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Feb. 3, 2010, 4:07 a.m. UTC | #4
On 2/1/2010 1:20 PM, Stephen Hemminger wrote:
> This fixes the fact that re->flags is always zero without causing
> other confusion.
>
> --- a/drivers/net/sky2.c	2010-02-01 10:07:42.676296236 -0800
> +++ b/drivers/net/sky2.c	2010-02-01 10:18:12.575044064 -0800
> @@ -1025,11 +1025,8 @@ static void sky2_prefetch_init(struct sk
>   static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2, u16 *slot)
>   {
>   	struct sky2_tx_le *le = sky2->tx_le + *slot;
> -	struct tx_ring_info *re = sky2->tx_ring + *slot;
>
>   	*slot = RING_NEXT(*slot, sky2->tx_ring_size);
> -	re->flags = 0;
> -	re->skb = NULL;
>   	le->ctrl = 0;
>   	return le;
>   }
> @@ -1622,8 +1619,7 @@ static unsigned tx_le_req(const struct s
>   	return count;
>   }
>
> -static void sky2_tx_unmap(struct pci_dev *pdev,
> -			  const struct tx_ring_info *re)
> +static void sky2_tx_unmap(struct pci_dev *pdev, struct tx_ring_info *re)
>   {
>   	if (re->flags&  TX_MAP_SINGLE)
>   		pci_unmap_single(pdev, pci_unmap_addr(re, mapaddr),
> @@ -1633,6 +1629,7 @@ static void sky2_tx_unmap(struct pci_dev
>   		pci_unmap_page(pdev, pci_unmap_addr(re, mapaddr),
>   			       pci_unmap_len(re, maplen),
>   			       PCI_DMA_TODEVICE);
> +	re->flags = 0;
>   }
>
>   /*
> @@ -1839,6 +1836,7 @@ static void sky2_tx_complete(struct sky2
>   			dev->stats.tx_packets++;
>   			dev->stats.tx_bytes += skb->len;
>
> +			re->skb = NULL;
>   			dev_kfree_skb_any(skb);
>
>   			sky2->tx_next = RING_NEXT(idx, sky2->tx_ring_size);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>    
Just a brief update - this has been up and stable for about 32 hours - 
I've been periodically generating load on the system. No kernel errors 
of any sort so far. Actually, in retrospect, I believe the dma issue was 
triggering other bad things - including an rcu lockup (patched in tip - 
sched.c).

Just as an FYI - (and this should probably be in a new thread) I am 
seeing a large number (>9,000,000) of dropped rx packets, however at 
this time I see no errors resulting from that (on this or client 
machines). As the # of dropped packets hasn't incremented at any time I 
was observing things, I can't say what this is about. Probably nothing, 
but I'll see if I can track down what is going on. I did see some of 
this earlier on while troubleshooting the sky2 issues that now seem 
resolved.  Quick crosschecking of other machines does not show high error 
or retransmission rates. I'm also not seeing any evidence of other 
errors (no errors reported by ifconfig, or ethtool, or printk (debug is 
enabled)).

I'm wondering whether these dropped packets are due mostly to hitting 
GMR_FS_RX_OK in sky2_receive. I'm also guessing that the high numbers of 
this that I'm seeing is an artifact of being able to pump more traffic 
through with the above patch. Given the description of the status code 
in sky2.h (receive ok) I'm wondering whether a) this should be reported 
as dropped, b) whether resubmit is necessary, c) whether it's possible 
that eth1 events coinciding with eth0 events are the cause and d) 
whether or not there's another issue entirely.





--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Feb. 3, 2010, 4:47 p.m. UTC | #5
On 02/02/2010 11:07 PM, Michael Breuer wrote:
> Just a brief update - this has been up and stable for about 32 hours - 
> I've been periodically generating load on the system. No kernel errors 
> of any sort so far. Actually, in retrospect, I believe the dma issue 
> was triggering other bad things - including an rcu lockup (patched in 
> tip - sched.c).
>
> Just as an FYI - (and this should probably be in a new thread) I am 
> seeing an large number (>9,000,000) of dropped rx packets, however at 
> this time I see no errors resulting from that (on this or client 
> machines). As the # of dropped packets hasn't incremented at any time 
> I was observing things, I can't say what this is about. Probably 
> nothing, but I'll see if I can track down what is going on. I did see 
> some of this earlier on while troubleshooting the sky2 issues that now 
> seem resolved.  Quick crosschecking of other machines do not show high 
> error or retransmission rates. I'm also not seeing any evidence of 
> other errors (no errors reported by ifconfig, or ethtool, or printk 
> (debug is enabled).
>
> I'm wondering whether these dropped packets are due mostly to hitting 
> GMR_FS_RX_OK in sky2_receive. I'm also guessing that the high numbers 
> of this that I'm seeing is an artifact of being able to pump more 
> traffic through with the above patch. Given the description of the 
> status code in sky2.h (receive ok) I'm wondering whether a) this 
> should be reported as dropped, b) whether resubmit is necessary, c) 
> whether it's possible that eth1 events coinciding with eth0 events are 
> the cause and d) whether or not there's another issue entirely.
>
Tracked this down. The status being returned is 0x3c0080 - good flow 
control packets. Nothing is actually being dropped (confirmed by packet 
trace on switch compared with packet trace on server).

I whipped up a trivial patch to not count these as dropped packets and 
will post to netdev.

I'm not really sure what the driver should be doing in this case, but 
resubmit seems to work.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stephen Hemminger Feb. 3, 2010, 4:56 p.m. UTC | #6
On Wed, 03 Feb 2010 11:47:19 -0500
Michael Breuer <mbreuer@majjas.com> wrote:

> Tracked this down. The status being returned is 0x3c0080 - good flow 
> control packets. Nothing is actually being dropped (confirmed by packet 
> trace on switch compared with packet trace on server).
> 
> I whipped up a trivial patch to not count these as dropped packets and 
> will post to netdev.
> 
> I'm not really sure what the driver should be doing in this case, but 
> resubmit seems to work.

Looks like a flow control negotiation issue. You probably turned off
flow control on the Linux side, but the switch is still doing flow
control.
Michael Breuer Feb. 3, 2010, 5:07 p.m. UTC | #7
On 02/03/2010 11:56 AM, Stephen Hemminger wrote:
> On Wed, 03 Feb 2010 11:47:19 -0500
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>    
>> Tracked this down. The status being returned is 0x3c0080 - good flow
>> control packets. Nothing is actually being dropped (confirmed by packet
>> trace on switch compared with packet trace on server).
>>
>> I whipped up a trivial patch to not count these as dropped packets and
>> will post to netdev.
>>
>> I'm not really sure what the driver should be doing in this case, but
>> resubmit seems to work.
>>      
> Looks like a flow control negotiation issue. You probably turned off
> flow control on the Linux side, but the switch is still doing flow
> control.
>
>    
According to the driver:
Feb  3 12:03:02 mail kernel: sky2 eth0: Link is up at 1000 Mbps, full 
duplex, flow control both

So if the rx flow control packet status is due to flow control being 
disabled, then there's a different issue.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Justin Mattock Feb. 3, 2010, 5:16 p.m. UTC | #8
On 02/03/10 08:56, Stephen Hemminger wrote:
> On Wed, 03 Feb 2010 11:47:19 -0500
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>> Tracked this down. The status being returned is 0x3c0080 - good flow
>> control packets. Nothing is actually being dropped (confirmed by packet
>> trace on switch compared with packet trace on server).
>>
>> I whipped up a trivial patch to not count these as dropped packets and
>> will post to netdev.
>>
>> I'm not really sure what the driver should be doing in this case, but
>> resubmit seems to work.
>
> Looks like a flow control negotiation issue. You probably turned off
> flow control on the Linux side, but the switch is still doing flow
> control.
>

I noticed this in a hotel last week. I can try
at home to see if this fires off; if so,
I can test any patches you have.

Justin P. Mattock
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Justin Mattock Feb. 3, 2010, 6:23 p.m. UTC | #9
On 02/03/10 09:07, Michael Breuer wrote:
> On 02/03/2010 11:56 AM, Stephen Hemminger wrote:
>> On Wed, 03 Feb 2010 11:47:19 -0500
>> Michael Breuer<mbreuer@majjas.com> wrote:
>>
>>> Tracked this down. The status being returned is 0x3c0080 - good flow
>>> control packets. Nothing is actually being dropped (confirmed by packet
>>> trace on switch compared with packet trace on server).
>>>
>>> I whipped up a trivial patch to not count these as dropped packets and
>>> will post to netdev.
>>>
>>> I'm not really sure what the driver should be doing in this case, but
>>> resubmit seems to work.
>> Looks like a flow control negotiation issue. You probably turned off
>> flow control on the Linux side, but the switch is still doing flow
>> control.
>>
> According to the driver:
> Feb 3 12:03:02 mail kernel: sky2 eth0: Link is up at 1000 Mbps, full
> duplex, flow control both
>
> So if the rx flow control packet status is due to flow control being
> disabled, then there's a different issue.
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>


hmm.. after an hour or so I'm not seeing anything.
from what I remember I turned the machine on in the
hotel, then left the system there as I went out for a few hours
(so maybe I need to wait).

Anyways I did keep dmesg of when this occurred, basically
the log was spammed with these:

[  863.294057] sky2 eth0: rx error, status 0x580002 length 88
[  865.646645] sky2 eth0: rx error, status 0x600002 length 96
[ 1286.420471] sky2 eth0: rx error, status 0x600002 length 96
[ 1286.499459] sky2 eth0: rx error, status 0x600002 length 96
[ 1746.903826] sky2 eth0: rx error, status 0x600002 length 96
[ 1754.263692] sky2 eth0: rx error, status 0x600002 length 96
[ 1755.309360] sky2 eth0: rx error, status 0x680002 length 104
[ 2213.256294] sky2 eth0: rx error, status 0x600002 length 96
[ 2219.653342] sky2 eth0: rx error, status 0x580002 length 88
[ 2221.673601] sky2 eth0: rx error, status 0x600002 length 96
[ 2679.654655] sky2 eth0: rx error, status 0x680002 length 104
[ 2692.315058] sky2 eth0: rx error, status 0x500002 length 80
[ 2694.349612] sky2 eth0: rx error, status 0x580002 length 88
[ 2703.676717] sky2 eth0: rx error, status 0x700002 length 112
[ 2703.826375] sky2 eth0: rx error, status 0x600002 length 96
[ 3187.504843] sky2 eth0: rx error, status 0x600002 length 96
[ 3189.560744] sky2 eth0: rx error, status 0x600002 length 96
[ 3672.475719] sky2 eth0: rx error, status 0x680002 length 104
[ 3676.696959] sky2 eth0: rx error, status 0x680002 length 104

but while using the system with this, I didn't notice anything
out of the ordinary.
(if this fires off I can try a bisect for you guys, but right now
since I'm not seeing anything, might be a different story);

Justin P. Mattock
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stephen Hemminger Feb. 3, 2010, 6:25 p.m. UTC | #10
On Wed, 03 Feb 2010 10:23:01 -0800
"Justin P. Mattock" <justinmattock@gmail.com> wrote:

> On 02/03/10 09:07, Michael Breuer wrote:
> > On 02/03/2010 11:56 AM, Stephen Hemminger wrote:
> >> On Wed, 03 Feb 2010 11:47:19 -0500
> >> Michael Breuer<mbreuer@majjas.com> wrote:
> >>
> >>> Tracked this down. The status being returned is 0x3c0080 - good flow
> >>> control packets. Nothing is actually being dropped (confirmed by packet
> >>> trace on switch compared with packet trace on server).
> >>>
> >>> I whipped up a trivial patch to not count these as dropped packets and
> >>> will post to netdev.
> >>>
> >>> I'm not really sure what the driver should be doing in this case, but
> >>> resubmit seems to work.
> >> Looks like a flow control negotiation issue. You probably turned off
> >> flow control on the Linux side, but the switch is still doing flow
> >> control.
> >>
> > According to the driver:
> > Feb 3 12:03:02 mail kernel: sky2 eth0: Link is up at 1000 Mbps, full
> > duplex, flow control both
> >
> > So if the rx flow control packet status is due to flow control being
> > disabled, then there's a different issue.
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/
> >
> 
> 
> hmm.. after an hour or so I'm not seeing anything.
> from what I remember I turned the machine on in the
> hotel, then left the system there as I went out for a few hours
> (so maybe I need to wait).
> 
> Anyways I did keep dmesg of when this occurred, basically
> the log was spammed with these:
> 
> [  863.294057] sky2 eth0: rx error, status 0x580002 length 88
> [  865.646645] sky2 eth0: rx error, status 0x600002 length 96
> [ 1286.420471] sky2 eth0: rx error, status 0x600002 length 96
> [ 1286.499459] sky2 eth0: rx error, status 0x600002 length 96
> [ 1746.903826] sky2 eth0: rx error, status 0x600002 length 96
> [ 1754.263692] sky2 eth0: rx error, status 0x600002 length 96
> [ 1755.309360] sky2 eth0: rx error, status 0x680002 length 104
> [ 2213.256294] sky2 eth0: rx error, status 0x600002 length 96
> [ 2219.653342] sky2 eth0: rx error, status 0x580002 length 88
> [ 2221.673601] sky2 eth0: rx error, status 0x600002 length 96
> [ 2679.654655] sky2 eth0: rx error, status 0x680002 length 104
> [ 2692.315058] sky2 eth0: rx error, status 0x500002 length 80
> [ 2694.349612] sky2 eth0: rx error, status 0x580002 length 88
> [ 2703.676717] sky2 eth0: rx error, status 0x700002 length 112
> [ 2703.826375] sky2 eth0: rx error, status 0x600002 length 96
> [ 3187.504843] sky2 eth0: rx error, status 0x600002 length 96
> [ 3189.560744] sky2 eth0: rx error, status 0x600002 length 96
> [ 3672.475719] sky2 eth0: rx error, status 0x680002 length 104
> [ 3676.696959] sky2 eth0: rx error, status 0x680002 length 104
> 
> but while using the system with this, I didn't notice anything
> out of the ordinary.
> (if this fires off I can try a bisect for you guys, but right now
> since I'm not seeing anything, might be a different story);
> 

You were on a crappy hotel switch. Those are all CRC errors.
Justin Mattock Feb. 3, 2010, 6:48 p.m. UTC | #11
On 02/03/10 10:25, Stephen Hemminger wrote:
> On Wed, 03 Feb 2010 10:23:01 -0800
> "Justin P. Mattock"<justinmattock@gmail.com>  wrote:
>
>> On 02/03/10 09:07, Michael Breuer wrote:
>>> On 02/03/2010 11:56 AM, Stephen Hemminger wrote:
>>>> On Wed, 03 Feb 2010 11:47:19 -0500
>>>> Michael Breuer<mbreuer@majjas.com>  wrote:
>>>>
>>>>> Tracked this down. The status being returned is 0x3c0080 - good flow
>>>>> control packets. Nothing is actually being dropped (confirmed by packet
>>>>> trace on switch compared with packet trace on server).
>>>>>
>>>>> I whipped up a trivial patch to not count these as dropped packets and
>>>>> will post to netdev.
>>>>>
>>>>> I'm not really sure what the driver should be doing in this case, but
>>>>> resubmit seems to work.
>>>> Looks like a flow control negotiation issue. You probably turned off
>>>> flow control on the Linux side, but the switch is still doing flow
>>>> control.
>>>>
>>> According to the driver:
>>> Feb 3 12:03:02 mail kernel: sky2 eth0: Link is up at 1000 Mbps, full
>>> duplex, flow control both
>>>
>>> So if the rx flow control packet status is due to flow control being
>>> disabled, then there's a different issue.
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at http://www.tux.org/lkml/
>>>
>>
>>
>> hmm.. after an hour or so I'm not seeing anything.
>> from what I remember I turned the machine on in the
>> hotel, then left the system there as I went out for a few hours
>> (so maybe I need to wait).
>>
>> Anyways I did keep dmesg of when this occurred, basically
>> the log was spammed with these:
>>
>> [  863.294057] sky2 eth0: rx error, status 0x580002 length 88
>> [  865.646645] sky2 eth0: rx error, status 0x600002 length 96
>> [ 1286.420471] sky2 eth0: rx error, status 0x600002 length 96
>> [ 1286.499459] sky2 eth0: rx error, status 0x600002 length 96
>> [ 1746.903826] sky2 eth0: rx error, status 0x600002 length 96
>> [ 1754.263692] sky2 eth0: rx error, status 0x600002 length 96
>> [ 1755.309360] sky2 eth0: rx error, status 0x680002 length 104
>> [ 2213.256294] sky2 eth0: rx error, status 0x600002 length 96
>> [ 2219.653342] sky2 eth0: rx error, status 0x580002 length 88
>> [ 2221.673601] sky2 eth0: rx error, status 0x600002 length 96
>> [ 2679.654655] sky2 eth0: rx error, status 0x680002 length 104
>> [ 2692.315058] sky2 eth0: rx error, status 0x500002 length 80
>> [ 2694.349612] sky2 eth0: rx error, status 0x580002 length 88
>> [ 2703.676717] sky2 eth0: rx error, status 0x700002 length 112
>> [ 2703.826375] sky2 eth0: rx error, status 0x600002 length 96
>> [ 3187.504843] sky2 eth0: rx error, status 0x600002 length 96
>> [ 3189.560744] sky2 eth0: rx error, status 0x600002 length 96
>> [ 3672.475719] sky2 eth0: rx error, status 0x680002 length 104
>> [ 3676.696959] sky2 eth0: rx error, status 0x680002 length 104
>>
>> but while using the system with this, I didn't notice anything
>> out of the ordinary.
>> (if this fires off I can try a bisect for you guys, but right now
>> since I'm not seeing anything, might be a different story);
>>
>
> You were on a crappy hotel switch. Those are all CRC errors.
>
>
>


alright.. makes sense cause I'm not getting
any of this now at home.

Justin P. Mattock
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Nov. 6, 2010, 4:57 p.m. UTC | #12
Basically, if I enable tcp timestamps (now disabled) I get a sky2 hang. 
As with the earlier issue the effects are not seen until after a couple 
days of uptime and seem exacerbated by load.

I can't 100% confirm that the problem is not occurring without tcp 
timestamps, but will leave the system up for a while to try to confirm. 
This didn't occur previously without tcp timestamps enabled, but I also 
pulled git changes between the two events.

I'm now also on 2.6.37-rc1.... I did a quick scan and didn't see any 
obvious commits between 2.6.36-09934 and -rc1 that would have affected this.

From the log:
Nov  2 05:41:54 mail kernel: DRHD: handling fault status reg 2
Nov  2 05:41:54 mail kernel: DMAR:[DMA Read] Request device [06:00.0] 
fault addr ffea3000
Nov  2 05:41:54 mail kernel: DMAR:[fault reason 06] PTE Read access is 
not set
Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: error interrupt 
status=0x80000000
Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: PCI hardware error (0x2010)
Nov  2 05:42:01 mail clamd[9755]: SelfCheck: Database status OK.
Nov  2 05:42:11 mail root: ping of potter failed
Nov  2 05:42:16 mail kernel: ------------[ cut here ]------------
Nov  2 05:42:16 mail kernel: WARNING: at net/sched/sch_generic.c:258 
dev_watchdog+0x251/0x260()
Nov  2 05:42:16 mail kernel: Hardware name: System Product Name
Nov  2 05:42:16 mail kernel: NETDEV WATCHDOG: eth0 (sky2): transmit 
queue 0 timed out
Nov  2 05:42:16 mail kernel: Modules linked in: cpufreq_stats 
ip6table_filter ip6table_mangle ip6_tables ipt_MASQUERADE iptable_nat 
nf_nat iptable_mangle iptable_raw ebtable_nat ebtables bridge stp 
appletalk psnap llc nfsd lockd nfs_acl auth_rpcgss exportfs coretemp 
sunrpc acpi_cpufreq mperf sit tunnel4 ipt_LOG nf_conntrack_netbios_ns 
nf_conntrack_ftp xt_DSCP xt_dscp xt_mark nf_conntrack_ipv6 
nf_defrag_ipv6 xt_state xt_multiport ipv6 kvm_intel kvm 
snd_hda_codec_analog snd_ens1371 gameport snd_rawmidi snd_ac97_codec 
snd_hda_intel snd_hda_codec ac97_bus snd_hwdep snd_seq snd_seq_device 
snd_pcm gspca_spca505 gspca_main snd_timer videodev snd v4l1_compat 
i2c_i801 sky2 v4l2_compat_ioctl32 iTCO_wdt pcspkr asus_atk0110 
i7core_edac edac_core soundcore iTCO_vendor_support snd_page_alloc 
microcode raid456 async_raid6_recov async_pq raid6_pq async_xor xor 
async_memcpy async_tx raid1 ata_generic firewire_ohci pata_acpi 
firewire_core crc_itu_t pata_marvell nouveau ttm drm_kms_helper drm 
i2c_algo_bit i2c_core video output [
Nov  2 05:42:16 mail kernel: last unloaded: ip6_tables]
Nov  2 05:42:16 mail kernel: Pid: 0, comm: swapper Tainted: G        W   
2.6.36-09934-g2aab243 #44
Nov  2 05:42:16 mail kernel: Call Trace:
Nov  2 05:42:16 mail kernel: <IRQ>  [<ffffffff81058a4f>] 
warn_slowpath_common+0x7f/0xc0
Nov  2 05:42:16 mail kernel: [<ffffffff81058b46>] 
warn_slowpath_fmt+0x46/0x50
Nov  2 05:42:16 mail kernel: [<ffffffff814603d1>] dev_watchdog+0x251/0x260
Nov  2 05:42:16 mail kernel: [<ffffffff8108a4a6>] ? 
tick_program_event+0x26/0x30
Nov  2 05:42:16 mail kernel: [<ffffffff8107eed4>] ? 
hrtimer_interrupt+0x134/0x240
Nov  2 05:42:16 mail kernel: [<ffffffff81068ab0>] 
run_timer_softirq+0x160/0x390
Nov  2 05:42:16 mail kernel: [<ffffffff8108a368>] ? 
tick_dev_program_event+0x48/0x110
Nov  2 05:42:16 mail kernel: [<ffffffff81460180>] ? dev_watchdog+0x0/0x260
Nov  2 05:42:16 mail kernel: [<ffffffff8105f981>] __do_softirq+0xb1/0x220
Nov  2 05:42:16 mail kernel: [<ffffffff8100cfdc>] call_softirq+0x1c/0x30
Nov  2 05:42:16 mail kernel: [<ffffffff8100ea15>] do_softirq+0x65/0xa0
Nov  2 05:42:16 mail kernel: [<ffffffff8105f845>] irq_exit+0x85/0x90
Nov  2 05:42:16 mail kernel: [<ffffffff81511d61>] do_IRQ+0x71/0xf0
Nov  2 05:42:16 mail kernel: [<ffffffff8150a7d3>] ret_from_intr+0x0/0x11
Nov  2 05:42:16 mail kernel: <EOI>  [<ffffffff812e4165>] ? 
intel_idle+0xd5/0x170
Nov  2 05:42:16 mail kernel: [<ffffffff812e4148>] ? intel_idle+0xb8/0x170
Nov  2 05:42:16 mail kernel: [<ffffffff81425b51>] 
cpuidle_idle_call+0x91/0x150
Nov  2 05:42:16 mail kernel: [<ffffffff8100aa8b>] cpu_idle+0xbb/0x150
Nov  2 05:42:16 mail kernel: [<ffffffff814f1785>] rest_init+0x75/0x80
Nov  2 05:42:16 mail kernel: [<ffffffff81b4ae9b>] start_kernel+0x3dc/0x3e7
Nov  2 05:42:16 mail kernel: [<ffffffff81b4a346>] 
x86_64_start_reservations+0x131/0x135
Nov  2 05:42:16 mail kernel: [<ffffffff81b4a450>] 
x86_64_start_kernel+0x106/0x115
Nov  2 05:42:16 mail kernel: ---[ end trace d9d3a1889f8925bf ]---
Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: tx timeout
Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: transmit ring 29 
.. 117 report=29 done=29

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger Nov. 8, 2010, 3:13 a.m. UTC | #13
On Sat, 06 Nov 2010 12:57:53 -0400
Michael Breuer <mbreuer@majjas.com> wrote:

> Basically, if I enable tcp timestamps (now disabled) I get a sky2 hang. 
> As with the earlier issue the effects are not seen until after a couple 
> days of uptime and seem exacerbated by load.
> 
> I can't 100% confirm that the problem is not occurring without tcp 
> timestamps, but will leave the system up for a while to try to confirm. 
> This didn't occur previously without tcp timestamps enabled, but I also 
> pulled git changes between the two events.
> 
> I'm now also on 2.6.37-rc1.... I did a quick scan and didn't see any 
> obvious commits between 2.6.36-09934 and -rc1 that would have affected this.
> 
>  From the log:
> Nov  2 05:41:54 mail kernel: DRHD: handling fault status reg 2
> Nov  2 05:41:54 mail kernel: DMAR:[DMA Read] Request device [06:00.0] 
> fault addr ffea3000
> Nov  2 05:41:54 mail kernel: DMAR:[fault reason 06] PTE Read access is 
> not set
> Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: error interrupt 
> status=0x80000000
> Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: PCI hardware error (0x2010)
> Nov  2 05:42:01 mail clamd[9755]: SelfCheck: Database status OK.
> Nov  2 05:42:11 mail root: ping of potter failed
> Nov  2 05:42:16 mail kernel: ------------[ cut here ]------------
> Nov  2 05:42:16 mail kernel: WARNING: at net/sched/sch_generic.c:258 
> dev_watchdog+0x251/0x260()
> Nov  2 05:42:16 mail kernel: Hardware name: System Product Name
> Nov  2 05:42:16 mail kernel: NETDEV WATCHDOG: eth0 (sky2): transmit 
> queue 0 timed out
> Nov  2 05:42:16 mail kernel: Modules linked in: cpufreq_stats 
> ip6table_filter ip6table_mangle ip6_tables ipt_MASQUERADE iptable_nat 
> nf_nat iptable_mangle iptable_raw ebtable_nat ebtables bridge stp 
> appletalk psnap llc nfsd lockd nfs_acl auth_rpcgss exportfs coretemp 
> sunrpc acpi_cpufreq mperf sit tunnel4 ipt_LOG nf_conntrack_netbios_ns 
> nf_conntrack_ftp xt_DSCP xt_dscp xt_mark nf_conntrack_ipv6 
> nf_defrag_ipv6 xt_state xt_multiport ipv6 kvm_intel kvm 
> snd_hda_codec_analog snd_ens1371 gameport snd_rawmidi snd_ac97_codec 
> snd_hda_intel snd_hda_codec ac97_bus snd_hwdep snd_seq snd_seq_device 
> snd_pcm gspca_spca505 gspca_main snd_timer videodev snd v4l1_compat 
> i2c_i801 sky2 v4l2_compat_ioctl32 iTCO_wdt pcspkr asus_atk0110 
> i7core_edac edac_core soundcore iTCO_vendor_support snd_page_alloc 
> microcode raid456 async_raid6_recov async_pq raid6_pq async_xor xor 
> async_memcpy async_tx raid1 ata_generic firewire_ohci pata_acpi 
> firewire_core crc_itu_t pata_marvell nouveau ttm drm_kms_helper drm 
> i2c_algo_bit i2c_core video output [
> Nov  2 05:42:16 mail kernel: last unloaded: ip6_tables]
> Nov  2 05:42:16 mail kernel: Pid: 0, comm: swapper Tainted: G        W   
> 2.6.36-09934-g2aab243 #44
> Nov  2 05:42:16 mail kernel: Call Trace:
> Nov  2 05:42:16 mail kernel: <IRQ>  [<ffffffff81058a4f>] 
> warn_slowpath_common+0x7f/0xc0
> Nov  2 05:42:16 mail kernel: [<ffffffff81058b46>] 
> warn_slowpath_fmt+0x46/0x50
> Nov  2 05:42:16 mail kernel: [<ffffffff814603d1>] dev_watchdog+0x251/0x260
> Nov  2 05:42:16 mail kernel: [<ffffffff8108a4a6>] ? 
> tick_program_event+0x26/0x30
> Nov  2 05:42:16 mail kernel: [<ffffffff8107eed4>] ? 
> hrtimer_interrupt+0x134/0x240
> Nov  2 05:42:16 mail kernel: [<ffffffff81068ab0>] 
> run_timer_softirq+0x160/0x390
> Nov  2 05:42:16 mail kernel: [<ffffffff8108a368>] ? 
> tick_dev_program_event+0x48/0x110
> Nov  2 05:42:16 mail kernel: [<ffffffff81460180>] ? dev_watchdog+0x0/0x260
> Nov  2 05:42:16 mail kernel: [<ffffffff8105f981>] __do_softirq+0xb1/0x220
> Nov  2 05:42:16 mail kernel: [<ffffffff8100cfdc>] call_softirq+0x1c/0x30
> Nov  2 05:42:16 mail kernel: [<ffffffff8100ea15>] do_softirq+0x65/0xa0
> Nov  2 05:42:16 mail kernel: [<ffffffff8105f845>] irq_exit+0x85/0x90
> Nov  2 05:42:16 mail kernel: [<ffffffff81511d61>] do_IRQ+0x71/0xf0
> Nov  2 05:42:16 mail kernel: [<ffffffff8150a7d3>] ret_from_intr+0x0/0x11
> Nov  2 05:42:16 mail kernel: <EOI>  [<ffffffff812e4165>] ? 
> intel_idle+0xd5/0x170
> Nov  2 05:42:16 mail kernel: [<ffffffff812e4148>] ? intel_idle+0xb8/0x170
> Nov  2 05:42:16 mail kernel: [<ffffffff81425b51>] 
> cpuidle_idle_call+0x91/0x150
> Nov  2 05:42:16 mail kernel: [<ffffffff8100aa8b>] cpu_idle+0xbb/0x150
> Nov  2 05:42:16 mail kernel: [<ffffffff814f1785>] rest_init+0x75/0x80
> Nov  2 05:42:16 mail kernel: [<ffffffff81b4ae9b>] start_kernel+0x3dc/0x3e7
> Nov  2 05:42:16 mail kernel: [<ffffffff81b4a346>] 
> x86_64_start_reservations+0x131/0x135
> Nov  2 05:42:16 mail kernel: [<ffffffff81b4a450>] 
> x86_64_start_kernel+0x106/0x115
> Nov  2 05:42:16 mail kernel: ---[ end trace d9d3a1889f8925bf ]---
> Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: tx timeout
> Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: transmit ring 29 
> .. 117 report=29 done=29
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Looks like a hardware issue, never saw it before.
Are you running MTU > 1500?
Does turning off TSO help?

One possibility is that NET_IP_ALIGN changed. Now the ethernet header is
aligned and the IP header is not.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Breuer Nov. 8, 2010, 3:38 a.m. UTC | #14
On 11/7/2010 10:13 PM, Stephen Hemminger wrote:
> On Sat, 06 Nov 2010 12:57:53 -0400
> Michael Breuer<mbreuer@majjas.com>  wrote:
>
>> Basically, if I enable tcp timestamps (now disabled) I get a sky2 hang.
>> As with the earlier issue the effects are not seen until after a couple
>> days of uptime and seem exacerbated by load.
>>
>> I can't 100% confirm that the problem is not occurring without tcp
>> timestamps, but will leave the system up for a while to try to confirm.
>> This didn't occur previously without tcp timestamps enabled, but I also
>> pulled git changes between the two events.
>>
>> I'm now also on 2.6.37-rc1.... I did a quick scan and didn't see any
>> obvious commits between 2.6.36-09934 and -rc1 that would have affected this.
>>
>>   From the log:
>> Nov  2 05:41:54 mail kernel: DRHD: handling fault status reg 2
>> Nov  2 05:41:54 mail kernel: DMAR:[DMA Read] Request device [06:00.0]
>> fault addr ffea3000
>> Nov  2 05:41:54 mail kernel: DMAR:[fault reason 06] PTE Read access is
>> not set
>> Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: error interrupt
>> status=0x80000000
>> Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: PCI hardware error (0x2010)
>> Nov  2 05:42:01 mail clamd[9755]: SelfCheck: Database status OK.
>> Nov  2 05:42:11 mail root: ping of potter failed
>> Nov  2 05:42:16 mail kernel: ------------[ cut here ]------------
>> Nov  2 05:42:16 mail kernel: WARNING: at net/sched/sch_generic.c:258
>> dev_watchdog+0x251/0x260()
>> Nov  2 05:42:16 mail kernel: Hardware name: System Product Name
>> Nov  2 05:42:16 mail kernel: NETDEV WATCHDOG: eth0 (sky2): transmit
>> queue 0 timed out
>> Nov  2 05:42:16 mail kernel: Modules linked in: cpufreq_stats
>> ip6table_filter ip6table_mangle ip6_tables ipt_MASQUERADE iptable_nat
>> nf_nat iptable_mangle iptable_raw ebtable_nat ebtables bridge stp
>> appletalk psnap llc nfsd lockd nfs_acl auth_rpcgss exportfs coretemp
>> sunrpc acpi_cpufreq mperf sit tunnel4 ipt_LOG nf_conntrack_netbios_ns
>> nf_conntrack_ftp xt_DSCP xt_dscp xt_mark nf_conntrack_ipv6
>> nf_defrag_ipv6 xt_state xt_multiport ipv6 kvm_intel kvm
>> snd_hda_codec_analog snd_ens1371 gameport snd_rawmidi snd_ac97_codec
>> snd_hda_intel snd_hda_codec ac97_bus snd_hwdep snd_seq snd_seq_device
>> snd_pcm gspca_spca505 gspca_main snd_timer videodev snd v4l1_compat
>> i2c_i801 sky2 v4l2_compat_ioctl32 iTCO_wdt pcspkr asus_atk0110
>> i7core_edac edac_core soundcore iTCO_vendor_support snd_page_alloc
>> microcode raid456 async_raid6_recov async_pq raid6_pq async_xor xor
>> async_memcpy async_tx raid1 ata_generic firewire_ohci pata_acpi
>> firewire_core crc_itu_t pata_marvell nouveau ttm drm_kms_helper drm
>> i2c_algo_bit i2c_core video output [
>> Nov  2 05:42:16 mail kernel: last unloaded: ip6_tables]
>> Nov  2 05:42:16 mail kernel: Pid: 0, comm: swapper Tainted: G        W
>> 2.6.36-09934-g2aab243 #44
>> Nov  2 05:42:16 mail kernel: Call Trace:
>> Nov  2 05:42:16 mail kernel:<IRQ>   [<ffffffff81058a4f>]
>> warn_slowpath_common+0x7f/0xc0
>> Nov  2 05:42:16 mail kernel: [<ffffffff81058b46>]
>> warn_slowpath_fmt+0x46/0x50
>> Nov  2 05:42:16 mail kernel: [<ffffffff814603d1>] dev_watchdog+0x251/0x260
>> Nov  2 05:42:16 mail kernel: [<ffffffff8108a4a6>] ?
>> tick_program_event+0x26/0x30
>> Nov  2 05:42:16 mail kernel: [<ffffffff8107eed4>] ?
>> hrtimer_interrupt+0x134/0x240
>> Nov  2 05:42:16 mail kernel: [<ffffffff81068ab0>]
>> run_timer_softirq+0x160/0x390
>> Nov  2 05:42:16 mail kernel: [<ffffffff8108a368>] ?
>> tick_dev_program_event+0x48/0x110
>> Nov  2 05:42:16 mail kernel: [<ffffffff81460180>] ? dev_watchdog+0x0/0x260
>> Nov  2 05:42:16 mail kernel: [<ffffffff8105f981>] __do_softirq+0xb1/0x220
>> Nov  2 05:42:16 mail kernel: [<ffffffff8100cfdc>] call_softirq+0x1c/0x30
>> Nov  2 05:42:16 mail kernel: [<ffffffff8100ea15>] do_softirq+0x65/0xa0
>> Nov  2 05:42:16 mail kernel: [<ffffffff8105f845>] irq_exit+0x85/0x90
>> Nov  2 05:42:16 mail kernel: [<ffffffff81511d61>] do_IRQ+0x71/0xf0
>> Nov  2 05:42:16 mail kernel: [<ffffffff8150a7d3>] ret_from_intr+0x0/0x11
>> Nov  2 05:42:16 mail kernel:<EOI>   [<ffffffff812e4165>] ?
>> intel_idle+0xd5/0x170
>> Nov  2 05:42:16 mail kernel: [<ffffffff812e4148>] ? intel_idle+0xb8/0x170
>> Nov  2 05:42:16 mail kernel: [<ffffffff81425b51>]
>> cpuidle_idle_call+0x91/0x150
>> Nov  2 05:42:16 mail kernel: [<ffffffff8100aa8b>] cpu_idle+0xbb/0x150
>> Nov  2 05:42:16 mail kernel: [<ffffffff814f1785>] rest_init+0x75/0x80
>> Nov  2 05:42:16 mail kernel: [<ffffffff81b4ae9b>] start_kernel+0x3dc/0x3e7
>> Nov  2 05:42:16 mail kernel: [<ffffffff81b4a346>]
>> x86_64_start_reservations+0x131/0x135
>> Nov  2 05:42:16 mail kernel: [<ffffffff81b4a450>]
>> x86_64_start_kernel+0x106/0x115
>> Nov  2 05:42:16 mail kernel: ---[ end trace d9d3a1889f8925bf ]---
>> Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: tx timeout
>> Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: transmit ring 29
>> .. 117 report=29 done=29
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Looks like a hardware issue, never saw it before.
> Are you running MTU>  1500?
> Does turning off TSO help?
>
> One possibility is that NET_IP_ALIGN changed. Now the ethernet header is
> aligned and the IP header is not.
>
MTU=1500
TCP timestamps seem to be the culprit - no issues with them disabled. I
hit the problem after running about 18 hours with TCP timestamps
enabled. Has been stable since rebuilding without timestamps... but
another day would be more telling.

Didn't look into the header alignment - but would that be inconsistent 
with tcp timestamps being involved?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger Nov. 8, 2010, 4:46 p.m. UTC | #15
On Sun, 07 Nov 2010 22:38:19 -0500
Michael Breuer <mbreuer@majjas.com> wrote:

> On 11/7/2010 10:13 PM, Stephen Hemminger wrote:
> > On Sat, 06 Nov 2010 12:57:53 -0400
> > Michael Breuer<mbreuer@majjas.com>  wrote:
> >
> >> Basically, if I enable tcp timestamps (now disabled) I get a sky2 hang.
> >> As with the earlier issue the effects are not seen until after a couple
> >> days of uptime and seem exacerbated by load.
> >>
> >> I can't 100% confirm that the problem is not occurring without tcp
> >> timestamps, but will leave the system up for a while to try to confirm.
> >> This didn't occur previously without tcp timestamps enabled, but I also
> >> pulled git changes between the two events.
> >>
> >> I'm now also on 2.6.37-rc1.... I did a quick scan and didn't see any
> >> obvious commits between 2.6.36-09934 and -rc1 that would have affected this.
> >>
> >>   From the log:
> >> Nov  2 05:41:54 mail kernel: DRHD: handling fault status reg 2
> >> Nov  2 05:41:54 mail kernel: DMAR:[DMA Read] Request device [06:00.0]
> >> fault addr ffea3000
> >> Nov  2 05:41:54 mail kernel: DMAR:[fault reason 06] PTE Read access is
> >> not set
> >> Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: error interrupt
> >> status=0x80000000
> >> Nov  2 05:41:54 mail kernel: sky2 0000:06:00.0: PCI hardware error (0x2010)
> >> Nov  2 05:42:01 mail clamd[9755]: SelfCheck: Database status OK.
> >> Nov  2 05:42:11 mail root: ping of potter failed
> >> Nov  2 05:42:16 mail kernel: ------------[ cut here ]------------
> >> Nov  2 05:42:16 mail kernel: WARNING: at net/sched/sch_generic.c:258
> >> dev_watchdog+0x251/0x260()
> >> Nov  2 05:42:16 mail kernel: Hardware name: System Product Name
> >> Nov  2 05:42:16 mail kernel: NETDEV WATCHDOG: eth0 (sky2): transmit
> >> queue 0 timed out
> >> Nov  2 05:42:16 mail kernel: Modules linked in: cpufreq_stats
> >> ip6table_filter ip6table_mangle ip6_tables ipt_MASQUERADE iptable_nat
> >> nf_nat iptable_mangle iptable_raw ebtable_nat ebtables bridge stp
> >> appletalk psnap llc nfsd lockd nfs_acl auth_rpcgss exportfs coretemp
> >> sunrpc acpi_cpufreq mperf sit tunnel4 ipt_LOG nf_conntrack_netbios_ns
> >> nf_conntrack_ftp xt_DSCP xt_dscp xt_mark nf_conntrack_ipv6
> >> nf_defrag_ipv6 xt_state xt_multiport ipv6 kvm_intel kvm
> >> snd_hda_codec_analog snd_ens1371 gameport snd_rawmidi snd_ac97_codec
> >> snd_hda_intel snd_hda_codec ac97_bus snd_hwdep snd_seq snd_seq_device
> >> snd_pcm gspca_spca505 gspca_main snd_timer videodev snd v4l1_compat
> >> i2c_i801 sky2 v4l2_compat_ioctl32 iTCO_wdt pcspkr asus_atk0110
> >> i7core_edac edac_core soundcore iTCO_vendor_support snd_page_alloc
> >> microcode raid456 async_raid6_recov async_pq raid6_pq async_xor xor
> >> async_memcpy async_tx raid1 ata_generic firewire_ohci pata_acpi
> >> firewire_core crc_itu_t pata_marvell nouveau ttm drm_kms_helper drm
> >> i2c_algo_bit i2c_core video output [
> >> Nov  2 05:42:16 mail kernel: last unloaded: ip6_tables]
> >> Nov  2 05:42:16 mail kernel: Pid: 0, comm: swapper Tainted: G        W
> >> 2.6.36-09934-g2aab243 #44
> >> Nov  2 05:42:16 mail kernel: Call Trace:
> >> Nov  2 05:42:16 mail kernel:<IRQ>   [<ffffffff81058a4f>]
> >> warn_slowpath_common+0x7f/0xc0
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81058b46>]
> >> warn_slowpath_fmt+0x46/0x50
> >> Nov  2 05:42:16 mail kernel: [<ffffffff814603d1>] dev_watchdog+0x251/0x260
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8108a4a6>] ?
> >> tick_program_event+0x26/0x30
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8107eed4>] ?
> >> hrtimer_interrupt+0x134/0x240
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81068ab0>]
> >> run_timer_softirq+0x160/0x390
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8108a368>] ?
> >> tick_dev_program_event+0x48/0x110
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81460180>] ? dev_watchdog+0x0/0x260
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8105f981>] __do_softirq+0xb1/0x220
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8100cfdc>] call_softirq+0x1c/0x30
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8100ea15>] do_softirq+0x65/0xa0
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8105f845>] irq_exit+0x85/0x90
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81511d61>] do_IRQ+0x71/0xf0
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8150a7d3>] ret_from_intr+0x0/0x11
> >> Nov  2 05:42:16 mail kernel:<EOI>   [<ffffffff812e4165>] ?
> >> intel_idle+0xd5/0x170
> >> Nov  2 05:42:16 mail kernel: [<ffffffff812e4148>] ? intel_idle+0xb8/0x170
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81425b51>]
> >> cpuidle_idle_call+0x91/0x150
> >> Nov  2 05:42:16 mail kernel: [<ffffffff8100aa8b>] cpu_idle+0xbb/0x150
> >> Nov  2 05:42:16 mail kernel: [<ffffffff814f1785>] rest_init+0x75/0x80
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81b4ae9b>] start_kernel+0x3dc/0x3e7
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81b4a346>]
> >> x86_64_start_reservations+0x131/0x135
> >> Nov  2 05:42:16 mail kernel: [<ffffffff81b4a450>]
> >> x86_64_start_kernel+0x106/0x115
> >> Nov  2 05:42:16 mail kernel: ---[ end trace d9d3a1889f8925bf ]---
> >> Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: tx timeout
> >> Nov  2 05:42:16 mail kernel: sky2 0000:06:00.0: eth0: transmit ring 29
> >> .. 117 report=29 done=29
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe netdev" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Looks like a hardware issue, never saw it before.
> > Are you running MTU>  1500?
> > Does turning off TSO help?
> >
> > One possibility is that NET_IP_ALIGN changed. Now the ethernet header is
> > aligned and the IP header is not.
> >
> MTU=1500
> TCP timestamps seems to be the culprit - no issues with it disabled. I 
> hit the problem after running about 18 hours with TCP timestamps 
> enabled. Has been stable since rebuilding without timestamps... but 
> another day would be more telling.
> 
> Didn't look into the header alignment - but would that be inconsistent 
> with tcp timestamps being involved?

TCP timestamps make the header bigger, and that might be causing
the gather code to see a different alignment, causing the problem.
Seeing the whole contents of the transmit ring in dmesg might
give a clue.

I don't work for Marvell. The limited documentation does not describe
any restrictions on alignment. But that's not a surprise since they
never tell me about errata. 

Since it is a regression, bisect might help.
diff mbox

Patch

--- a/drivers/net/sky2.c	2010-02-01 10:07:42.676296236 -0800
+++ b/drivers/net/sky2.c	2010-02-01 10:18:12.575044064 -0800
@@ -1025,11 +1025,8 @@  static void sky2_prefetch_init(struct sk
 static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2, u16 *slot)
 {
 	struct sky2_tx_le *le = sky2->tx_le + *slot;
-	struct tx_ring_info *re = sky2->tx_ring + *slot;
 
 	*slot = RING_NEXT(*slot, sky2->tx_ring_size);
-	re->flags = 0;
-	re->skb = NULL;
 	le->ctrl = 0;
 	return le;
 }
@@ -1622,8 +1619,7 @@  static unsigned tx_le_req(const struct s
 	return count;
 }
 
-static void sky2_tx_unmap(struct pci_dev *pdev,
-			  const struct tx_ring_info *re)
+static void sky2_tx_unmap(struct pci_dev *pdev, struct tx_ring_info *re)
 {
 	if (re->flags & TX_MAP_SINGLE)
 		pci_unmap_single(pdev, pci_unmap_addr(re, mapaddr),
@@ -1633,6 +1629,7 @@  static void sky2_tx_unmap(struct pci_dev
 		pci_unmap_page(pdev, pci_unmap_addr(re, mapaddr),
 			       pci_unmap_len(re, maplen),
 			       PCI_DMA_TODEVICE);
+	re->flags = 0;
 }
 
 /*
@@ -1839,6 +1836,7 @@  static void sky2_tx_complete(struct sky2
 			dev->stats.tx_packets++;
 			dev->stats.tx_bytes += skb->len;
 
+			re->skb = NULL;
 			dev_kfree_skb_any(skb);
 
 			sky2->tx_next = RING_NEXT(idx, sky2->tx_ring_size);