diff mbox

[v5,2/5] can: kvaser_usb: Consolidate and unify state change handling

Message ID 20150120214537.GB16828@linux
State Awaiting Upstream, archived
Delegated to: David Miller
Headers show

Commit Message

Ahmed S. Darwish Jan. 20, 2015, 9:45 p.m. UTC
From: Ahmed S. Darwish <ahmed.darwish@valeo.com>

Replace most of the can interface's state and error counters
handling with the new can-dev can_change_state() mechanism.

Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
---
 drivers/net/can/usb/kvaser_usb.c | 114 +++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 59 deletions(-)

Comments

Andri Yngvason Jan. 21, 2015, 10:33 a.m. UTC | #1
Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> 
> Replace most of the can interface's state and error counters
> handling with the new can-dev can_change_state() mechanism.
> 
> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> ---
>  drivers/net/can/usb/kvaser_usb.c | 114 +++++++++++++++++++--------------------
>  1 file changed, 55 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c
> index 971c5f9..0386d3f 100644
> --- a/drivers/net/can/usb/kvaser_usb.c
> +++ b/drivers/net/can/usb/kvaser_usb.c
> @@ -620,40 +620,43 @@ static void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv)
>  }
>  
>  static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *priv,
> -                                                const struct kvaser_usb_error_summary *es)
> +                                                const struct kvaser_usb_error_summary *es,
> +                                                struct can_frame *cf)
>  {
>         struct net_device_stats *stats;
> -       enum can_state new_state;
> -
> -       stats = &priv->netdev->stats;
> -       new_state = priv->can.state;
> +       enum can_state cur_state, new_state, tx_state, rx_state;
>  
>         netdev_dbg(priv->netdev, "Error status: 0x%02x\n", es->status);
>  
> -       if (es->status & M16C_STATE_BUS_OFF) {
> -               priv->can.can_stats.bus_off++;
> +       stats = &priv->netdev->stats;
> +       new_state = cur_state = priv->can.state;
> +
> +       if (es->status & M16C_STATE_BUS_OFF)
>                 new_state = CAN_STATE_BUS_OFF;
> -       } else if (es->status & M16C_STATE_BUS_PASSIVE) {
> -               if (priv->can.state != CAN_STATE_ERROR_PASSIVE)
> -                       priv->can.can_stats.error_passive++;
> +       else if (es->status & M16C_STATE_BUS_PASSIVE)
>                 new_state = CAN_STATE_ERROR_PASSIVE;
> -       }
>  
>         if (es->status == M16C_STATE_BUS_ERROR) {
> -               if ((priv->can.state < CAN_STATE_ERROR_WARNING) &&
> -                   ((es->txerr >= 96) || (es->rxerr >= 96))) {
> -                       priv->can.can_stats.error_warning++;
> +               if ((cur_state < CAN_STATE_ERROR_WARNING) &&
> +                   ((es->txerr >= 96) || (es->rxerr >= 96)))
>                         new_state = CAN_STATE_ERROR_WARNING;
> -               } else if (priv->can.state > CAN_STATE_ERROR_ACTIVE) {
> +               else if (cur_state > CAN_STATE_ERROR_ACTIVE)
>                         new_state = CAN_STATE_ERROR_ACTIVE;
> -               }
>         }
>  
>         if (!es->status)
>                 new_state = CAN_STATE_ERROR_ACTIVE;
>  
> +       if (new_state != cur_state) {
> +               tx_state = (es->txerr >= es->rxerr) ? new_state : 0;
> +               rx_state = (es->txerr <= es->rxerr) ? new_state : 0;
> +
> +               can_change_state(priv->netdev, cf, tx_state, rx_state);
> +               new_state = priv->can.state;
> +       }
> +
>         if (priv->can.restart_ms &&
> -           (priv->can.state >= CAN_STATE_BUS_OFF) &&
> +           (cur_state >= CAN_STATE_BUS_OFF) &&
>             (new_state < CAN_STATE_BUS_OFF)) {
>                 priv->can.can_stats.restarts++;
>         }
> @@ -665,18 +668,17 @@ static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *pri
>  
>         priv->bec.txerr = es->txerr;
>         priv->bec.rxerr = es->rxerr;
> -       priv->can.state = new_state;
>  }
>  
>  static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
>                                 const struct kvaser_msg *msg)
>  {
> -       struct can_frame *cf;
> +       struct can_frame *cf, tmp_cf = { .can_id = CAN_ERR_FLAG, .can_dlc = CAN_ERR_DLC };
>         struct sk_buff *skb;
>         struct net_device_stats *stats;
>         struct kvaser_usb_net_priv *priv;
>         struct kvaser_usb_error_summary es = { };
> -       enum can_state old_state;
> +       enum can_state old_state, new_state;
>  
>         switch (msg->id) {
>         case CMD_CAN_ERROR_EVENT:
> @@ -721,60 +723,54 @@ static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
>         }
>  
>         /* Update all of the can interface's state and error counters before
> -        * trying any skb allocation that can actually fail with -ENOMEM.
> +        * trying any memory allocation that can actually fail with -ENOMEM.
> +        *
> +        * We send a temporary stack-allocated error can frame to
> +        * can_change_state() for the very same reason.
> +        *
> +        * TODO: Split can_change_state() responsibility between updating the
> +        * can interface's state and counters, and the setting up of can error
> +        * frame ID and data to userspace. Remove stack allocation afterwards.
>          */
>         old_state = priv->can.state;
> -       kvaser_usb_rx_error_update_can_state(priv, &es);
> +       kvaser_usb_rx_error_update_can_state(priv, &es, &tmp_cf);
> +       new_state = priv->can.state;
>  
>         skb = alloc_can_err_skb(priv->netdev, &cf);
>         if (!skb) {
>                 stats->rx_dropped++;
>                 return;
>         }
> +       memcpy(cf, &tmp_cf, sizeof(*cf));
>  
> -       if (es.status & M16C_STATE_BUS_OFF) {
> -               cf->can_id |= CAN_ERR_BUSOFF;
> -
> -               if (!priv->can.restart_ms)
> -                       kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
> -               netif_carrier_off(priv->netdev);
> -       } else if (es.status & M16C_STATE_BUS_PASSIVE) {
> -               if (old_state != CAN_STATE_ERROR_PASSIVE) {
> -                       cf->can_id |= CAN_ERR_CRTL;
> -
> -                       if (es.txerr || es.rxerr)
> -                               cf->data[1] = (es.txerr > es.rxerr)
> -                                               ? CAN_ERR_CRTL_TX_PASSIVE
> -                                               : CAN_ERR_CRTL_RX_PASSIVE;
> -                       else
> -                               cf->data[1] = CAN_ERR_CRTL_TX_PASSIVE |
> -                                             CAN_ERR_CRTL_RX_PASSIVE;
> +       if (new_state != old_state) {
> +               if (es.status & M16C_STATE_BUS_OFF) {
> +                       if (!priv->can.restart_ms)
> +                               kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
> +                       netif_carrier_off(priv->netdev);
> +               }
> +
> +               if (es.status == M16C_STATE_BUS_ERROR) {
> +                       if ((old_state >= CAN_STATE_ERROR_WARNING) ||
> +                           (es.txerr < 96 && es.rxerr < 96)) {
> +                               if (old_state > CAN_STATE_ERROR_ACTIVE) {
> +                                       cf->can_id |= CAN_ERR_PROT;
> +                                       cf->data[2] = CAN_ERR_PROT_ACTIVE;
> +                               }
> +                       }
>                 }
> -       }
>  
> -       if (es.status == M16C_STATE_BUS_ERROR) {
> -               if ((old_state < CAN_STATE_ERROR_WARNING) &&
> -                   ((es.txerr >= 96) || (es.rxerr >= 96))) {
> -                       cf->can_id |= CAN_ERR_CRTL;
> -                       cf->data[1] = (es.txerr > es.rxerr)
> -                                       ? CAN_ERR_CRTL_TX_WARNING
> -                                       : CAN_ERR_CRTL_RX_WARNING;
> -               } else if (old_state > CAN_STATE_ERROR_ACTIVE) {
> +               if (!es.status) {
>                         cf->can_id |= CAN_ERR_PROT;
>                         cf->data[2] = CAN_ERR_PROT_ACTIVE;
>                 }
> -       }
>  
> -       if (!es.status) {
> -               cf->can_id |= CAN_ERR_PROT;
> -               cf->data[2] = CAN_ERR_PROT_ACTIVE;
> -       }
> -
> -       if (priv->can.restart_ms &&
> -           (old_state >= CAN_STATE_BUS_OFF) &&
> -           (priv->can.state < CAN_STATE_BUS_OFF)) {
> -               cf->can_id |= CAN_ERR_RESTARTED;
> -               netif_carrier_on(priv->netdev);
> +               if (priv->can.restart_ms &&
> +                   (old_state >= CAN_STATE_BUS_OFF) &&
> +                   (new_state < CAN_STATE_BUS_OFF)) {
> +                       cf->can_id |= CAN_ERR_RESTARTED;
> +                       netif_carrier_on(priv->netdev);
> +               }
>         }
>  
>         if (es.error_factor) {
> -- 
> 1.9.1

Looks good.

--
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Kleine-Budde Jan. 21, 2015, 10:44 a.m. UTC | #2
On 01/21/2015 11:33 AM, Andri Yngvason wrote:
> Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
>> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
>>
>> Replace most of the can interface's state and error counters
>> handling with the new can-dev can_change_state() mechanism.
>>
>> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
>> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
>> ---
>>  drivers/net/can/usb/kvaser_usb.c | 114 +++++++++++++++++++--------------------
>>  1 file changed, 55 insertions(+), 59 deletions(-)
>>
>> diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c
>> index 971c5f9..0386d3f 100644
>> --- a/drivers/net/can/usb/kvaser_usb.c
>> +++ b/drivers/net/can/usb/kvaser_usb.c

> Looks good.

Is this an Acked-by?

Marc
Andri Yngvason Jan. 21, 2015, 11 a.m. UTC | #3
Quoting Marc Kleine-Budde (2015-01-21 10:44:54)
> On 01/21/2015 11:33 AM, Andri Yngvason wrote:
> > Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
> >> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> >>
> >> Replace most of the can interface's state and error counters
> >> handling with the new can-dev can_change_state() mechanism.
> >>
> >> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
> >> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> >> ---
> >>  drivers/net/can/usb/kvaser_usb.c | 114 +++++++++++++++++++--------------------
> >>  1 file changed, 55 insertions(+), 59 deletions(-)
> >>
> >> diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c
> >> index 971c5f9..0386d3f 100644
> >> --- a/drivers/net/can/usb/kvaser_usb.c
> >> +++ b/drivers/net/can/usb/kvaser_usb.c
> 
> > Looks good.
> 
> Is this an Acked-by?
> 

ACK.

--
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger Jan. 21, 2015, 11:53 a.m. UTC | #4
On Wed, 21 Jan 2015 10:33:19 +0000, Andri Yngvason
<andri.yngvason@marel.com> wrote:
> Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
>> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
>> 
>> Replace most of the can interface's state and error counters
>> handling with the new can-dev can_change_state() mechanism.
>> 
>> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
>> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
>> ---
>>  drivers/net/can/usb/kvaser_usb.c | 114
>>  +++++++++++++++++++--------------------
>>  1 file changed, 55 insertions(+), 59 deletions(-)
>> 
>> diff --git a/drivers/net/can/usb/kvaser_usb.c
>> b/drivers/net/can/usb/kvaser_usb.c
>> index 971c5f9..0386d3f 100644
>> --- a/drivers/net/can/usb/kvaser_usb.c
>> +++ b/drivers/net/can/usb/kvaser_usb.c

...
> 
> Looks good.

Would be nice to see some "candump" traces as well. Ahmed, could you
please generate such traces doing:

1. Execute in a session:
   # candump -t d -e any,0:0,#FFFFFFFF

2. Execute in another session:
   # cangen -g 10 -D i can0

3. Disconnect the CAN cable

4. After a while reconnect the CAN cable

5. Stop candump and save the trace.

and to test bus-off:

1. Execute in a session:
   # candump -t d -e any,0:0,#FFFFFFFF

2. Execute in another session:
   # cangen -g 10 -D i can0

3. Short-circuit the CAN low and high wires of the CAN cable

4. After a while remove the short-circuit.

5. Stop candump and save the trace.

Thanks,

Wolfgang.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ahmed S. Darwish Jan. 21, 2015, 2:43 p.m. UTC | #5
Hi!

On Wed, Jan 21, 2015 at 12:53:58PM +0100, Wolfgang Grandegger wrote:
> On Wed, 21 Jan 2015 10:33:19 +0000, Andri Yngvason
> <andri.yngvason@marel.com> wrote:
> > Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
> >> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> >> 
> >> Replace most of the can interface's state and error counters
> >> handling with the new can-dev can_change_state() mechanism.
> >> 
> >> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
> >> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> >> ---
> >>  drivers/net/can/usb/kvaser_usb.c | 114
> >>  +++++++++++++++++++--------------------
> >>  1 file changed, 55 insertions(+), 59 deletions(-)
> >> 
> >> diff --git a/drivers/net/can/usb/kvaser_usb.c
> >> b/drivers/net/can/usb/kvaser_usb.c
> >> index 971c5f9..0386d3f 100644
> >> --- a/drivers/net/can/usb/kvaser_usb.c
> >> +++ b/drivers/net/can/usb/kvaser_usb.c
> 
> ...
> > 
> > Looks good.
> 
> Would be nice to see some "candump" traces as well.

Sure. The USBCan-II device trace below is generated after applying
all patches in the series, especially patch #3, which fixes some
some invalid CAN state transitions logic in the original driver.

##########################################################################

candump on a PC, Kvaser USB device on the receiving end:

 ...
 (000.011392)  can0  71D   [8]  5B 06 00 00 00 00 00 00
 (000.009270)  can0  712   [3]  5C 06 00
 (000.010691)  can0  0F3   [7]  5D 06 00 00 00 00 00
 (000.010443)  can0  63E   [8]  5E 06 00 00 00 00 00 00
 (000.010112)  can0  502   [8]  5F 06 00 00 00 00 00 00
 (000.009944)  can0  39A   [8]  60 06 00 00 00 00 00 00
 (000.010186)  can0  721   [8]  61 06 00 00 00 00 00 00
 (000.009628)  can0  5B7   [6]  62 06 00 00 00 00
 (000.009784)  can0  1D7   [4]  63 06 00 00
 (000.010806)  can0  4FE   [8]  64 06 00 00 00 00 00 00
 (000.008897)  can0  75E   [1]  65
 (000.010257)  can0  1EA   [2]  66 06

<-- Unplug the cable -->
 (000.010640)  can0  20000080   [8]  00 00 00 00 00 00 00 01   ERRORFRAME
	bus-error
	error-counter-tx-rx{{0}{1}}

<-- Replug the cable, after 12 seconds -->
 (044.345134)  can0  20000080   [8]  00 00 00 00 00 00 00 02   ERRORFRAME
	bus-error
	error-counter-tx-rx{{0}{2}}

 (000.002730)  can0  75E   [8]  67 06 00 00 00 00 00 00
 (000.002097)  can0  696   [6]  68 06 00 00 00 00
 (000.002328)  can0  45A   [8]  69 06 00 00 00 00 00 00
 (000.002484)  can0  496   [8]  6A 06 00 00 00 00 00 00
 (000.002458)  can0  604   [8]  6B 06 00 00 00 00 00 00
 (000.002252)  can0  27B   [7]  6C 06 00 00 00 00 00
 (000.002420)  can0  48F   [8]  6D 06 00 00 00 00 00 00
 (000.001306)  can0  1B3   [1]  6E
 (000.002518)  can0  145   [8]  6F 06 00 00 00 00 00 00
 (000.002262)  can0  6EA   [7]  70 06 00 00 00 00 00
 (000.001053)  can0  2DC   [1]  71
 (000.001731)  can0  1DD   [4]  72 06 00 00
 (000.002332)  can0  455   [8]  73 06 00 00 00 00 00 00
 ...

If the cable was _swiftly_ plugged and unplugged, no errors appear.
So it seems the errors above are just due to noise.

##########################################################################

Afterwards, candump on a PC, Kvaser USB device on the sending end:

 ...
 (000.008784)  can0  60A   [1]  C1
 (000.011341)  can0  2A8   [8]  C2 0A 00 00 00 00 00 00
 (000.009873)  can0  03D   [7]  C3 0A 00 00 00 00 00
 (000.010394)  can0  55C   [8]  C4 0A 00 00 00 00 00 00
 (000.009979)  can0  45A   [8]  C5 0A 00 00 00 00 00 00
 (000.010125)  can0  6E8   [8]  C6 0A 00 00 00 00 00 00
 (000.010149)  can0  4EE   [8]  C7 0A 00 00 00 00 00 00
 (000.010102)  can0  5D2   [8]  C8 0A 00 00 00 00 00 00
 (000.010000)  can0  61F   [8]  C9 0A 00 00 00 00 00 00
 (000.010271)  can0  5F8   [8]  CA 0A 00 00 00 00 00 00

<-- Unplug the cable -->

 (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{8}{0}}
 (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{16}{0}}
 (000.001748)  can0  20000080   [8]  00 00 00 00 00 00 18 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{24}{0}}
 (000.001751)  can0  20000080   [8]  00 00 00 00 00 00 20 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{32}{0}}
 (000.001874)  can0  20000080   [8]  00 00 00 00 00 00 28 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{40}{0}}
 (000.001625)  can0  20000080   [8]  00 00 00 00 00 00 30 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{48}{0}}
 (000.001875)  can0  20000080   [8]  00 00 00 00 00 00 38 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{56}{0}}
 (000.001751)  can0  20000080   [8]  00 00 00 00 00 00 40 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{64}{0}}
 (000.001761)  can0  20000080   [8]  00 00 00 00 00 00 48 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{72}{0}}
 (000.001743)  can0  20000080   [8]  00 00 00 00 00 00 50 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{80}{0}}
 (000.001910)  can0  20000080   [8]  00 00 00 00 00 00 58 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{88}{0}}
 (000.001753)  can0  20000084   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
	controller-problem{tx-error-warning}
	bus-error
	error-counter-tx-rx{{96}{0}}
 (000.001720)  can0  20000080   [8]  00 00 00 00 00 00 68 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{104}{0}}
 (000.001876)  can0  20000080   [8]  00 00 00 00 00 00 70 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{112}{0}}
 (000.001749)  can0  20000080   [8]  00 00 00 00 00 00 78 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{120}{0}}
 (000.001771)  can0  20000084   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
	controller-problem{tx-error-passive}
	bus-error
	error-counter-tx-rx{{128}{0}}
 (000.001868)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{128}{0}}
 (000.001982)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{128}{0}}

(( Then a continous flood, exactly similar to the above packet, appears.
   Unfortunately this flooding is a firmware problem. ))

<-- Replug the cable, after a good amount of time -->

 (000.520485)  can0  33D   [4]  CB 0A 00 00
 (000.002693)  can0  42E   [8]  CC 0A 00 00 00 00 00 00
 (000.001795)  can0  319   [4]  CD 0A 00 00
 (000.002705)  can0  3B1   [8]  CE 0A 00 00 00 00 00 00
 (000.001295)  can0  4CC   [2]  CF 0A
 (000.002205)  can0  42B   [6]  D0 0A 00 00 00 00
 (000.001620)  can0  5A2   [3]  D1 0A 00
 (000.002636)  can0  691   [8]  D2 0A 00 00 00 00 00 00
 (000.002615)  can0  36A   [8]  D3 0A 00 00 00 00 00 00
 (000.001729)  can0  068   [4]  D4 0A 00 00
 (000.001195)  can0  4C8   [1]  D5
 ...

##########################################################################

Bus-off Testing:

candump on a PC, Kvaser device on the sending end. An i.mx6 ARM
board with flexcan is on the receiving end:

 (000.010319)  can0  5CC   [8]  90 02 00 00 00 00 00 00
 (000.008747)  can0  679   [1]  91
 (000.011442)  can0  011   [8]  92 02 00 00 00 00 00 00
 (000.008991)  can0  631   [2]  93 02
 (000.011097)  can0  532   [7]  94 02 00 00 00 00 00
 (000.009781)  can0  0A9   [5]  95 02 00 00 00
 (000.010792)  can0  1DD   [8]  96 02 00 00 00 00 00 00
 (000.010026)  can0  44E   [8]  97 02 00 00 00 00 00 00
 (000.010181)  can0  76A   [8]  98 02 00 00 00 00 00 00
 (000.008867)  can0  1A5   [1]  99
 (000.011322)  can0  2B4   [8]  9A 02 00 00 00 00 00 00

<-- Unplug the can low and high wires from the board -->

 (000.009688)  can0  20000080   [8]  00 00 00 00 00 00 08 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{8}{0}}
 (000.002246)  can0  20000080   [8]  00 00 00 00 00 00 10 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{16}{0}}
 (000.002124)  can0  20000080   [8]  00 00 00 00 00 00 18 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{24}{0}}
 (000.002115)  can0  20000080   [8]  00 00 00 00 00 00 20 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{32}{0}}
 (000.002132)  can0  20000080   [8]  00 00 00 00 00 00 28 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{40}{0}}
 (000.002266)  can0  20000080   [8]  00 00 00 00 00 00 30 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{48}{0}}
 (000.002187)  can0  20000080   [8]  00 00 00 00 00 00 38 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{56}{0}}
 (000.002046)  can0  20000080   [8]  00 00 00 00 00 00 40 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{64}{0}}
 (000.002076)  can0  20000080   [8]  00 00 00 00 00 00 48 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{72}{0}}
 (000.002406)  can0  20000080   [8]  00 00 00 00 00 00 50 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{80}{0}}
 (000.001969)  can0  20000080   [8]  00 00 00 00 00 00 58 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{88}{0}}
 (000.002388)  can0  20000084   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
	controller-problem{tx-error-warning}
	bus-error
	error-counter-tx-rx{{96}{0}}
 (000.002021)  can0  20000080   [8]  00 00 00 00 00 00 68 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{104}{0}}
 (000.002110)  can0  20000080   [8]  00 00 00 00 00 00 70 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{112}{0}}
 (000.002155)  can0  20000080   [8]  00 00 00 00 00 00 78 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{120}{0}}
 (000.002140)  can0  20000084   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
	controller-problem{tx-error-passive}
	bus-error
	error-counter-tx-rx{{128}{0}}
 (000.002324)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{128}{0}}
 (000.002416)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{128}{0}}
 (000.002237)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{128}{0}}

(( Then a continous flood, exactly similar to the above packet, appears ))

<-- Short-circuit the can high and low wires -->

 (000.002364)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{128}{0}}
 (000.002108)  can0  20000080   [8]  00 00 00 00 00 00 88 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{136}{0}}
 (000.000494)  can0  20000080   [8]  00 00 00 00 00 00 90 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{144}{0}}
 (000.000523)  can0  20000080   [8]  00 00 00 00 00 00 98 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{152}{0}}
 (000.000661)  can0  20000080   [8]  00 00 00 00 00 00 A0 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{160}{0}}
 (000.000464)  can0  20000080   [8]  00 00 00 00 00 00 A8 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{168}{0}}
 (000.000534)  can0  20000080   [8]  00 00 00 00 00 00 B0 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{176}{0}}
 (000.000499)  can0  20000080   [8]  00 00 00 00 00 00 B8 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{184}{0}}
 (000.000626)  can0  20000080   [8]  00 00 00 00 00 00 C0 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{192}{0}}
 (000.000373)  can0  20000080   [8]  00 00 00 00 00 00 C8 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{200}{0}}
 (000.000627)  can0  20000080   [8]  00 00 00 00 00 00 D0 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{208}{0}}
 (000.000507)  can0  20000080   [8]  00 00 00 00 00 00 D8 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{216}{0}}
 (000.000501)  can0  20000080   [8]  00 00 00 00 00 00 E0 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{224}{0}}
 (000.000459)  can0  20000080   [8]  00 00 00 00 00 00 E8 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{232}{0}}
 (000.000606)  can0  20000080   [8]  00 00 00 00 00 00 F0 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{240}{0}}
 (000.000454)  can0  20000080   [8]  00 00 00 00 00 00 F8 00   ERRORFRAME
	bus-error
	error-counter-tx-rx{{248}{0}}
 (000.000664)  can0  200000C0   [8]  00 00 00 00 00 00 00 00   ERRORFRAME
	bus-off
	bus-error

 (( No further bus activity ))


##########################################################################

Regards,
Darwish
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andri Yngvason Jan. 21, 2015, 3 p.m. UTC | #6
Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> Hi!
> 
> On Wed, Jan 21, 2015 at 12:53:58PM +0100, Wolfgang Grandegger wrote:
> > On Wed, 21 Jan 2015 10:33:19 +0000, Andri Yngvason
> > <andri.yngvason@marel.com> wrote:
> > > Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
> > >> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> > >> 
> > >> Replace most of the can interface's state and error counters
> > >> handling with the new can-dev can_change_state() mechanism.
> > >> 
> > >> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
> > >> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> > >> ---
> > >>  drivers/net/can/usb/kvaser_usb.c | 114
> > >>  +++++++++++++++++++--------------------
> > >>  1 file changed, 55 insertions(+), 59 deletions(-)
> > >> 
> > >> diff --git a/drivers/net/can/usb/kvaser_usb.c
> > >> b/drivers/net/can/usb/kvaser_usb.c
> > >> index 971c5f9..0386d3f 100644
> > >> --- a/drivers/net/can/usb/kvaser_usb.c
> > >> +++ b/drivers/net/can/usb/kvaser_usb.c
> > 
> > ...
> > > 
> > > Looks good.
> > 
> > Would be nice to see some "candump" traces as well.
> 
> Sure. The USBCan-II device trace below is generated after applying
> all patches in the series, especially patch #3, which fixes some
> some invalid CAN state transitions logic in the original driver.
[...]
> 
> Afterwards, candump on a PC, Kvaser USB device on the sending end:
> 
>  ...
>  (000.008784)  can0  60A   [1]  C1
>  (000.011341)  can0  2A8   [8]  C2 0A 00 00 00 00 00 00
>  (000.009873)  can0  03D   [7]  C3 0A 00 00 00 00 00
>  (000.010394)  can0  55C   [8]  C4 0A 00 00 00 00 00 00
>  (000.009979)  can0  45A   [8]  C5 0A 00 00 00 00 00 00
>  (000.010125)  can0  6E8   [8]  C6 0A 00 00 00 00 00 00
>  (000.010149)  can0  4EE   [8]  C7 0A 00 00 00 00 00 00
>  (000.010102)  can0  5D2   [8]  C8 0A 00 00 00 00 00 00
>  (000.010000)  can0  61F   [8]  C9 0A 00 00 00 00 00 00
>  (000.010271)  can0  5F8   [8]  CA 0A 00 00 00 00 00 00
> 
> <-- Unplug the cable -->
> 
>  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{8}{0}}
>  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{16}{0}}
[...]
>         error-counter-tx-rx{{80}{0}}
>  (000.001910)  can0  20000080   [8]  00 00 00 00 00 00 58 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{88}{0}}
>  (000.001753)  can0  20000084   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
>         controller-problem{tx-error-warning}
Good.
>         bus-error
>         error-counter-tx-rx{{96}{0}}
>  (000.001720)  can0  20000080   [8]  00 00 00 00 00 00 68 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{104}{0}}
>  (000.001876)  can0  20000080   [8]  00 00 00 00 00 00 70 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{112}{0}}
>  (000.001749)  can0  20000080   [8]  00 00 00 00 00 00 78 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{120}{0}}
>  (000.001771)  can0  20000084   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
>         controller-problem{tx-error-passive}
Also good.
>         bus-error
>         error-counter-tx-rx{{128}{0}}
>  (000.001868)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{128}{0}}
>  (000.001982)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{128}{0}}
> 
> (( Then a continous flood, exactly similar to the above packet, appears.
>    Unfortunately this flooding is a firmware problem. ))
> 
> <-- Replug the cable, after a good amount of time -->
>
Where are the reverse state transitions?
> 
>  (000.520485)  can0  33D   [4]  CB 0A 00 00
>  (000.002693)  can0  42E   [8]  CC 0A 00 00 00 00 00 00
>  (000.001795)  can0  319   [4]  CD 0A 00 00
>  (000.002705)  can0  3B1   [8]  CE 0A 00 00 00 00 00 00
>  (000.001295)  can0  4CC   [2]  CF 0A
>  (000.002205)  can0  42B   [6]  D0 0A 00 00 00 00
>  (000.001620)  can0  5A2   [3]  D1 0A 00
>  (000.002636)  can0  691   [8]  D2 0A 00 00 00 00 00 00
>  (000.002615)  can0  36A   [8]  D3 0A 00 00 00 00 00 00
>  (000.001729)  can0  068   [4]  D4 0A 00 00
>  (000.001195)  can0  4C8   [1]  D5
>  ...
> 
> ##########################################################################
> 
> Bus-off Testing:
> 
> candump on a PC, Kvaser device on the sending end. An i.mx6 ARM
> board with flexcan is on the receiving end:
> 
>  (000.010319)  can0  5CC   [8]  90 02 00 00 00 00 00 00
>  (000.008747)  can0  679   [1]  91
>  (000.011442)  can0  011   [8]  92 02 00 00 00 00 00 00
>  (000.008991)  can0  631   [2]  93 02
>  (000.011097)  can0  532   [7]  94 02 00 00 00 00 00
>  (000.009781)  can0  0A9   [5]  95 02 00 00 00
>  (000.010792)  can0  1DD   [8]  96 02 00 00 00 00 00 00
>  (000.010026)  can0  44E   [8]  97 02 00 00 00 00 00 00
>  (000.010181)  can0  76A   [8]  98 02 00 00 00 00 00 00
>  (000.008867)  can0  1A5   [1]  99
>  (000.011322)  can0  2B4   [8]  9A 02 00 00 00 00 00 00
> 
> <-- Unplug the can low and high wires from the board -->
> 
>  (000.009688)  can0  20000080   [8]  00 00 00 00 00 00 08 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{8}{0}}
>  (000.002246)  can0  20000080   [8]  00 00 00 00 00 00 10 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{16}{0}}
>  (000.002124)  can0  20000080   [8]  00 00 00 00 00 00 18 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{24}{0}}
>  (000.002115)  can0  20000080   [8]  00 00 00 00 00 00 20 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{32}{0}}
>  (000.002132)  can0  20000080   [8]  00 00 00 00 00 00 28 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{40}{0}}
>  (000.002266)  can0  20000080   [8]  00 00 00 00 00 00 30 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{48}{0}}
>  (000.002187)  can0  20000080   [8]  00 00 00 00 00 00 38 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{56}{0}}
>  (000.002046)  can0  20000080   [8]  00 00 00 00 00 00 40 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{64}{0}}
>  (000.002076)  can0  20000080   [8]  00 00 00 00 00 00 48 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{72}{0}}
>  (000.002406)  can0  20000080   [8]  00 00 00 00 00 00 50 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{80}{0}}
>  (000.001969)  can0  20000080   [8]  00 00 00 00 00 00 58 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{88}{0}}
>  (000.002388)  can0  20000084   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
>         controller-problem{tx-error-warning}
>         bus-error
>         error-counter-tx-rx{{96}{0}}
>  (000.002021)  can0  20000080   [8]  00 00 00 00 00 00 68 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{104}{0}}
>  (000.002110)  can0  20000080   [8]  00 00 00 00 00 00 70 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{112}{0}}
>  (000.002155)  can0  20000080   [8]  00 00 00 00 00 00 78 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{120}{0}}
>  (000.002140)  can0  20000084   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
>         controller-problem{tx-error-passive}
>         bus-error
>         error-counter-tx-rx{{128}{0}}
>  (000.002324)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{128}{0}}
>  (000.002416)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{128}{0}}
>  (000.002237)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{128}{0}}
> 
> (( Then a continous flood, exactly similar to the above packet, appears ))
> 
> <-- Short-circuit the can high and low wires -->
> 
>  (000.002364)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{128}{0}}
>  (000.002108)  can0  20000080   [8]  00 00 00 00 00 00 88 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{136}{0}}
>  (000.000494)  can0  20000080   [8]  00 00 00 00 00 00 90 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{144}{0}}
>  (000.000523)  can0  20000080   [8]  00 00 00 00 00 00 98 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{152}{0}}
>  (000.000661)  can0  20000080   [8]  00 00 00 00 00 00 A0 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{160}{0}}
>  (000.000464)  can0  20000080   [8]  00 00 00 00 00 00 A8 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{168}{0}}
>  (000.000534)  can0  20000080   [8]  00 00 00 00 00 00 B0 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{176}{0}}
>  (000.000499)  can0  20000080   [8]  00 00 00 00 00 00 B8 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{184}{0}}
>  (000.000626)  can0  20000080   [8]  00 00 00 00 00 00 C0 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{192}{0}}
>  (000.000373)  can0  20000080   [8]  00 00 00 00 00 00 C8 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{200}{0}}
>  (000.000627)  can0  20000080   [8]  00 00 00 00 00 00 D0 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{208}{0}}
>  (000.000507)  can0  20000080   [8]  00 00 00 00 00 00 D8 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{216}{0}}
>  (000.000501)  can0  20000080   [8]  00 00 00 00 00 00 E0 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{224}{0}}
>  (000.000459)  can0  20000080   [8]  00 00 00 00 00 00 E8 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{232}{0}}
>  (000.000606)  can0  20000080   [8]  00 00 00 00 00 00 F0 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{240}{0}}
>  (000.000454)  can0  20000080   [8]  00 00 00 00 00 00 F8 00   ERRORFRAME
>         bus-error
>         error-counter-tx-rx{{248}{0}}
>  (000.000664)  can0  200000C0   [8]  00 00 00 00 00 00 00 00   ERRORFRAME
>         bus-off
>         bus-error
> 
>  (( No further bus activity ))
> 
Bus-off seems OK. You could have just short circuited them without
disconnecting.

Reverse state transitions are missing from the logs. See comments above.

--
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ahmed S. Darwish Jan. 21, 2015, 3:36 p.m. UTC | #7
On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
> Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> > Hi!

...

> > <-- Unplug the cable -->
> > 
> >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{8}{0}}
> >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{16}{0}}
> [...]
> >         error-counter-tx-rx{{80}{0}}
> >  (000.001910)  can0  20000080   [8]  00 00 00 00 00 00 58 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{88}{0}}
> >  (000.001753)  can0  20000084   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
> >         controller-problem{tx-error-warning}
> Good.
> >         bus-error
> >         error-counter-tx-rx{{96}{0}}

Nice.

> >  (000.001720)  can0  20000080   [8]  00 00 00 00 00 00 68 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{104}{0}}
> >  (000.001876)  can0  20000080   [8]  00 00 00 00 00 00 70 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{112}{0}}
> >  (000.001749)  can0  20000080   [8]  00 00 00 00 00 00 78 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{120}{0}}
> >  (000.001771)  can0  20000084   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
> >         controller-problem{tx-error-passive}
> Also good.
> >         bus-error
> >         error-counter-tx-rx{{128}{0}}

Also nice :-)

> >  (000.001868)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{128}{0}}
> >  (000.001982)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
> >         bus-error
> >         error-counter-tx-rx{{128}{0}}
> > 
> > (( Then a continous flood, exactly similar to the above packet, appears.
> >    Unfortunately this flooding is a firmware problem. ))
> > 
> > <-- Replug the cable, after a good amount of time -->
> >
> Where are the reverse state transitions?
> > 

Hmmm...

[ ... ]
> 
> Reverse state transitions are missing from the logs. See comments above.
> 

When the device is on the _receiving_ end, and I unplug the CAN cable after
introducing some noise to the level of reaching WARNING or PASSIVE, I
receive a BUS_ERROR event with the rxerr count reset back to 0 or 1. In
that case, the driver correctly transitions back the state to ERROR_ACTIVE
and candump produces something similar to:

    (000.000362)  can0  2000008C   [8]  00 40 40 00 00 00 00 01   ERRORFRAME
    controller-problem{}
    protocol-violation{{back-to-error-active}{}}
    bus-error
    error-counter-tx-rx{{0}{1}}

which is, AFAIK, the correct behaviour from the driver side.

Meanwhile, when the device is on the _sending_ end and I re-plug the CAN
cable again. Sometimes I receive events with txerr reset to 0 or 1, and
the driver correctly reverts back to ERROR_ACTIVE in that case. But on
another times like the quoted case above, I don't receive any events
resetting txerr back -- only data packets on the bus.

So, What can the driver do given the above?

Thanks,
Darwish

P.S. just in case, I'll also re-check now if the driver unintentionally
drops any important events resetting the txerr count back after a CAN
cable replug -- preventing the code from returning to ERROR_ACTIVE in
the process.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wolfgang Grandegger Jan. 21, 2015, 4:13 p.m. UTC | #8
On Wed, 21 Jan 2015 10:36:47 -0500, "Ahmed S. Darwish"
<darwish.07@gmail.com> wrote:
> On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
>> Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
>> > Hi!
> 
> ...
> 
>> > <-- Unplug the cable -->
>> > 
>> >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{8}{0}}
>> >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00  

For a bus-errors I would also expcect some more information in the
data[2..3] fields. But these are always zero.

>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{16}{0}}
>> [...]
>> >         error-counter-tx-rx{{80}{0}}
>> >  (000.001910)  can0  20000080   [8]  00 00 00 00 00 00 58 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{88}{0}}
>> >  (000.001753)  can0  20000084   [8]  00 08 00 00 00 00 60 00  
>> >  ERRORFRAME
>> >         controller-problem{tx-error-warning}
>> Good.
>> >         bus-error
>> >         error-counter-tx-rx{{96}{0}}
> 
> Nice.
> 
>> >  (000.001720)  can0  20000080   [8]  00 00 00 00 00 00 68 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{104}{0}}
>> >  (000.001876)  can0  20000080   [8]  00 00 00 00 00 00 70 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{112}{0}}
>> >  (000.001749)  can0  20000080   [8]  00 00 00 00 00 00 78 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{120}{0}}
>> >  (000.001771)  can0  20000084   [8]  00 20 00 00 00 00 80 00  
>> >  ERRORFRAME
>> >         controller-problem{tx-error-passive}
>> Also good.
>> >         bus-error
>> >         error-counter-tx-rx{{128}{0}}
> 
> Also nice :-)
> 
>> >  (000.001868)  can0  20000080   [8]  00 00 00 00 00 00 80 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{128}{0}}
>> >  (000.001982)  can0  20000080   [8]  00 00 00 00 00 00 80 00  
>> >  ERRORFRAME
>> >         bus-error
>> >         error-counter-tx-rx{{128}{0}}
>> > 
>> > (( Then a continous flood, exactly similar to the above packet,
>> > appears.
>> >    Unfortunately this flooding is a firmware problem. ))
>> > 
>> > <-- Replug the cable, after a good amount of time -->
>> >
>> Where are the reverse state transitions?
>> > 
> 
> Hmmm...
> 
> [ ... ]
>> 
>> Reverse state transitions are missing from the logs. See comments
above.
>> 
> 
> When the device is on the _receiving_ end, and I unplug the CAN cable
after
> introducing some noise to the level of reaching WARNING or PASSIVE, I
> receive a BUS_ERROR event with the rxerr count reset back to 0 or 1. In
> that case, the driver correctly transitions back the state to
ERROR_ACTIVE
> and candump produces something similar to:
> 
>     (000.000362)  can0  2000008C   [8]  00 40 40 00 00 00 00 01  
>     ERRORFRAME
>     controller-problem{}
>     protocol-violation{{back-to-error-active}{}}
>     bus-error
>     error-counter-tx-rx{{0}{1}}
> 
> which is, AFAIK, the correct behaviour from the driver side.
> 
> Meanwhile, when the device is on the _sending_ end and I re-plug the CAN
> cable again. Sometimes I receive events with txerr reset to 0 or 1, and
> the driver correctly reverts back to ERROR_ACTIVE in that case. But on
> another times like the quoted case above, I don't receive any events
> resetting txerr back -- only data packets on the bus.

Well, the firmware seems to report *only* bus-errors via
CMD_CAN_ERROR_EVENT
messages, also carrying the new state, but no CMD_CHIP_STATE_EVENT just
for
the state changes.

> So, What can the driver do given the above?

Little if the notification does not come.

Wolfgang.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andri Yngvason Jan. 21, 2015, 4:20 p.m. UTC | #9
Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
> From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> 
> Replace most of the can interface's state and error counters
> handling with the new can-dev can_change_state() mechanism.
> 
> Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
> Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> ---
>  drivers/net/can/usb/kvaser_usb.c | 114 +++++++++++++++++++--------------------
>  1 file changed, 55 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c
> index 971c5f9..0386d3f 100644
> --- a/drivers/net/can/usb/kvaser_usb.c
> +++ b/drivers/net/can/usb/kvaser_usb.c
> @@ -620,40 +620,43 @@ static void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv)
>  }
>  
>  static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *priv,
> -                                                const struct kvaser_usb_error_summary *es)
> +                                                const struct kvaser_usb_error_summary *es,
> +                                                struct can_frame *cf)
>  {
>         struct net_device_stats *stats;
> -       enum can_state new_state;
> -
> -       stats = &priv->netdev->stats;
> -       new_state = priv->can.state;
> +       enum can_state cur_state, new_state, tx_state, rx_state;
>  
>         netdev_dbg(priv->netdev, "Error status: 0x%02x\n", es->status);
>  
> -       if (es->status & M16C_STATE_BUS_OFF) {
> -               priv->can.can_stats.bus_off++;
> +       stats = &priv->netdev->stats;
> +       new_state = cur_state = priv->can.state;
> +
> +       if (es->status & M16C_STATE_BUS_OFF)
>                 new_state = CAN_STATE_BUS_OFF;
> -       } else if (es->status & M16C_STATE_BUS_PASSIVE) {
> -               if (priv->can.state != CAN_STATE_ERROR_PASSIVE)
> -                       priv->can.can_stats.error_passive++;
> +       else if (es->status & M16C_STATE_BUS_PASSIVE)
>                 new_state = CAN_STATE_ERROR_PASSIVE;
> -       }
>  
>         if (es->status == M16C_STATE_BUS_ERROR) {
> -               if ((priv->can.state < CAN_STATE_ERROR_WARNING) &&
> -                   ((es->txerr >= 96) || (es->rxerr >= 96))) {
> -                       priv->can.can_stats.error_warning++;
> +               if ((cur_state < CAN_STATE_ERROR_WARNING) &&
> +                   ((es->txerr >= 96) || (es->rxerr >= 96)))
>                         new_state = CAN_STATE_ERROR_WARNING;
> -               } else if (priv->can.state > CAN_STATE_ERROR_ACTIVE) {
> +               else if (cur_state > CAN_STATE_ERROR_ACTIVE)
>                         new_state = CAN_STATE_ERROR_ACTIVE;
> -               }
>         }
>  
>         if (!es->status)
>                 new_state = CAN_STATE_ERROR_ACTIVE;
>  
> +       if (new_state != cur_state) {
> +               tx_state = (es->txerr >= es->rxerr) ? new_state : 0;
> +               rx_state = (es->txerr <= es->rxerr) ? new_state : 0;
> +
> +               can_change_state(priv->netdev, cf, tx_state, rx_state);
This (below) is redundant. It doesn't harm but at this point can_change_state
has set priv->can.state to new_state.
> +               new_state = priv->can.state;
> +       }
> +
>         if (priv->can.restart_ms &&
> -           (priv->can.state >= CAN_STATE_BUS_OFF) &&
> +           (cur_state >= CAN_STATE_BUS_OFF) &&
>             (new_state < CAN_STATE_BUS_OFF)) {
>                 priv->can.can_stats.restarts++;
>         }
> @@ -665,18 +668,17 @@ static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *pri
>  
>         priv->bec.txerr = es->txerr;
>         priv->bec.rxerr = es->rxerr;
> -       priv->can.state = new_state;
>  }
>  
>  static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
>                                 const struct kvaser_msg *msg)
>  {
> -       struct can_frame *cf;
> +       struct can_frame *cf, tmp_cf = { .can_id = CAN_ERR_FLAG, .can_dlc = CAN_ERR_DLC };
>         struct sk_buff *skb;
>         struct net_device_stats *stats;
>         struct kvaser_usb_net_priv *priv;
>         struct kvaser_usb_error_summary es = { };
> -       enum can_state old_state;
> +       enum can_state old_state, new_state;
>  
>         switch (msg->id) {
>         case CMD_CAN_ERROR_EVENT:
> @@ -721,60 +723,54 @@ static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
>         }
>  
>         /* Update all of the can interface's state and error counters before
> -        * trying any skb allocation that can actually fail with -ENOMEM.
> +        * trying any memory allocation that can actually fail with -ENOMEM.
> +        *
> +        * We send a temporary stack-allocated error can frame to
> +        * can_change_state() for the very same reason.
> +        *
> +        * TODO: Split can_change_state() responsibility between updating the
> +        * can interface's state and counters, and the setting up of can error
> +        * frame ID and data to userspace. Remove stack allocation afterwards.
>          */
>         old_state = priv->can.state;
> -       kvaser_usb_rx_error_update_can_state(priv, &es);
> +       kvaser_usb_rx_error_update_can_state(priv, &es, &tmp_cf);
> +       new_state = priv->can.state;
>  
>         skb = alloc_can_err_skb(priv->netdev, &cf);
>         if (!skb) {
>                 stats->rx_dropped++;
>                 return;
>         }
> +       memcpy(cf, &tmp_cf, sizeof(*cf));
>  
> -       if (es.status & M16C_STATE_BUS_OFF) {
> -               cf->can_id |= CAN_ERR_BUSOFF;
> -
> -               if (!priv->can.restart_ms)
> -                       kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
> -               netif_carrier_off(priv->netdev);
> -       } else if (es.status & M16C_STATE_BUS_PASSIVE) {
> -               if (old_state != CAN_STATE_ERROR_PASSIVE) {
> -                       cf->can_id |= CAN_ERR_CRTL;
> -
> -                       if (es.txerr || es.rxerr)
> -                               cf->data[1] = (es.txerr > es.rxerr)
> -                                               ? CAN_ERR_CRTL_TX_PASSIVE
> -                                               : CAN_ERR_CRTL_RX_PASSIVE;
> -                       else
> -                               cf->data[1] = CAN_ERR_CRTL_TX_PASSIVE |
> -                                             CAN_ERR_CRTL_RX_PASSIVE;
> +       if (new_state != old_state) {
> +               if (es.status & M16C_STATE_BUS_OFF) {
> +                       if (!priv->can.restart_ms)
> +                               kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
> +                       netif_carrier_off(priv->netdev);
> +               }
> +
This block is wrong. The usage of PROT_ACTIVE is based on a misunderstanding.
It's used in some drivers to signify back-to-error-active but its original
meaning is something completely different, AFAIK.
This is handled in can_change_state() using a new CTRL message; namely:
CAN_ERR_CTRL_ACTIVE. The newest version of can-utils is up to date with this.
> +               if (es.status == M16C_STATE_BUS_ERROR) {
> +                       if ((old_state >= CAN_STATE_ERROR_WARNING) ||
> +                           (es.txerr < 96 && es.rxerr < 96)) {
> +                               if (old_state > CAN_STATE_ERROR_ACTIVE) {
> +                                       cf->can_id |= CAN_ERR_PROT;
> +                                       cf->data[2] = CAN_ERR_PROT_ACTIVE;
> +                               }
> +                       }
>                 }
> -       }
>  
> -       if (es.status == M16C_STATE_BUS_ERROR) {
> -               if ((old_state < CAN_STATE_ERROR_WARNING) &&
> -                   ((es.txerr >= 96) || (es.rxerr >= 96))) {
> -                       cf->can_id |= CAN_ERR_CRTL;
> -                       cf->data[1] = (es.txerr > es.rxerr)
> -                                       ? CAN_ERR_CRTL_TX_WARNING
> -                                       : CAN_ERR_CRTL_RX_WARNING;
> -               } else if (old_state > CAN_STATE_ERROR_ACTIVE) {
This is also redundant, and wrong:
> +               if (!es.status) {
>                         cf->can_id |= CAN_ERR_PROT;
>                         cf->data[2] = CAN_ERR_PROT_ACTIVE;
>                 }
> -       }
>  
> -       if (!es.status) {
> -               cf->can_id |= CAN_ERR_PROT;
> -               cf->data[2] = CAN_ERR_PROT_ACTIVE;
> -       }
> -
> -       if (priv->can.restart_ms &&
> -           (old_state >= CAN_STATE_BUS_OFF) &&
> -           (priv->can.state < CAN_STATE_BUS_OFF)) {
> -               cf->can_id |= CAN_ERR_RESTARTED;
> -               netif_carrier_on(priv->netdev);
> +               if (priv->can.restart_ms &&
> +                   (old_state >= CAN_STATE_BUS_OFF) &&
> +                   (new_state < CAN_STATE_BUS_OFF)) {
> +                       cf->can_id |= CAN_ERR_RESTARTED;
> +                       netif_carrier_on(priv->netdev);
> +               }
>         }
>  
>         if (es.error_factor) {
> -- 
> 1.9.1

Looking over the patch again, I've noticed that there are a few things that are
not quite right.

Marc, could you merge the "move bus_off++" patch before you merge this so that I
won't have to incorporate this patch-set into it?

--
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andri Yngvason Jan. 21, 2015, 4:37 p.m. UTC | #10
Quoting Ahmed S. Darwish (2015-01-21 15:36:47)
> On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
> > Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> > > Hi!
> 
> ...
> 
> > > <-- Unplug the cable -->
> > > 
> > >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{8}{0}}
> > >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{16}{0}}
> > [...]
> > >         error-counter-tx-rx{{80}{0}}
> > >  (000.001910)  can0  20000080   [8]  00 00 00 00 00 00 58 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{88}{0}}
> > >  (000.001753)  can0  20000084   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
> > >         controller-problem{tx-error-warning}
> > Good.
> > >         bus-error
> > >         error-counter-tx-rx{{96}{0}}
> 
> Nice.
> 
> > >  (000.001720)  can0  20000080   [8]  00 00 00 00 00 00 68 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{104}{0}}
> > >  (000.001876)  can0  20000080   [8]  00 00 00 00 00 00 70 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{112}{0}}
> > >  (000.001749)  can0  20000080   [8]  00 00 00 00 00 00 78 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{120}{0}}
> > >  (000.001771)  can0  20000084   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
> > >         controller-problem{tx-error-passive}
> > Also good.
> > >         bus-error
> > >         error-counter-tx-rx{{128}{0}}
> 
> Also nice :-)
> 
> > >  (000.001868)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{128}{0}}
> > >  (000.001982)  can0  20000080   [8]  00 00 00 00 00 00 80 00   ERRORFRAME
> > >         bus-error
> > >         error-counter-tx-rx{{128}{0}}
> > > 
> > > (( Then a continous flood, exactly similar to the above packet, appears.
> > >    Unfortunately this flooding is a firmware problem. ))
> > > 
> > > <-- Replug the cable, after a good amount of time -->
> > >
> > Where are the reverse state transitions?
> > > 
> 
> Hmmm...
> 
> [ ... ]
> > 
> > Reverse state transitions are missing from the logs. See comments above.
> > 
> 
> When the device is on the _receiving_ end, and I unplug the CAN cable after
> introducing some noise to the level of reaching WARNING or PASSIVE, I
> receive a BUS_ERROR event with the rxerr count reset back to 0 or 1. In
> that case, the driver correctly transitions back the state to ERROR_ACTIVE
> and candump produces something similar to:
> 
>     (000.000362)  can0  2000008C   [8]  00 40 40 00 00 00 00 01   ERRORFRAME
>     controller-problem{}
>     protocol-violation{{back-to-error-active}{}}
>     bus-error
>     error-counter-tx-rx{{0}{1}}
> 
> which is, AFAIK, the correct behaviour from the driver side.
> 
> Meanwhile, when the device is on the _sending_ end and I re-plug the CAN
> cable again. Sometimes I receive events with txerr reset to 0 or 1, and
> the driver correctly reverts back to ERROR_ACTIVE in that case. But on
> another times like the quoted case above, I don't receive any events
> resetting txerr back -- only data packets on the bus.
> 
> So, What can the driver do given the above?
>
So what you're telling us is that the state does not got back to error-active
unless there is something else transmitting on the bus?
If that's the case, it's almost definitely because state changes aren't
triggering interrupts.

An rx event will give you an interrupt which yields a napi poll which means that
the state will be polled, so in that case you don't need the interrupts.

Look for "Consolidate and unify state change handling" on gmane. There was a lot
of discussion about this kind of issues, especially regarding FlexCAN.

Best regards,
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marc Kleine-Budde Jan. 21, 2015, 10:59 p.m. UTC | #11
On 01/21/2015 05:20 PM, Andri Yngvason wrote:
> Marc, could you merge the "move bus_off++" patch before you merge this so that I
> won't have to incorporate this patch-set into it?

...included in the lastest pull-request to David. Use
tags/linux-can-next-for-3.20-20150121 of the can-next repo as you new base.

regards,
Marc
Andri Yngvason Jan. 22, 2015, 10:14 a.m. UTC | #12
Quoting Marc Kleine-Budde (2015-01-21 22:59:23)
> On 01/21/2015 05:20 PM, Andri Yngvason wrote:
> > Marc, could you merge the "move bus_off++" patch before you merge this so that I
> > won't have to incorporate this patch-set into it?
> 
> ...included in the lastest pull-request to David. Use
> tags/linux-can-next-for-3.20-20150121 of the can-next repo as you new base.
> 

Thanks!

--
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ahmed S. Darwish Jan. 23, 2015, 6:07 a.m. UTC | #13
On Wed, Jan 21, 2015 at 05:13:45PM +0100, Wolfgang Grandegger wrote:
> On Wed, 21 Jan 2015 10:36:47 -0500, "Ahmed S. Darwish"
> <darwish.07@gmail.com> wrote:
> > On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
> >> Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> >> > Hi!
> > 
> > ...
> > 
> >> > <-- Unplug the cable -->
> >> > 
> >> >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{8}{0}}
> >> >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00  
> 
> For a bus-errors I would also expcect some more information in the
> data[2..3] fields. But these are always zero.
> 

M16C error factors made it possible to report things like
CAN_ERR_PROT_FORM/STUFF/BIT0/BIT1/TX in data[2], and
CAN_ERR_PROT_LOC_ACK/CRC_DEL in data[3].

Unfortunately such error factors are only reported in Leaf, but
not in USBCan-II due to the wire format change in the error event:

	struct leaf_msg_error_event {
		u8 tid;
		u8 flags;
		__le16 time[3];
		u8 channel;
		u8 padding;
		u8 tx_errors_count;
		u8 rx_errors_count;
		u8 status;
		u8 error_factor;
	} __packed;

	struct usbcan_msg_error_event {
		u8 tid;
		u8 padding;
		u8 tx_errors_count_ch0;
		u8 rx_errors_count_ch0;
		u8 tx_errors_count_ch1;
		u8 rx_errors_count_ch1;
		u8 status_ch0;
		u8 status_ch1;
		__le16 time;
	} __packed;

I speculate that the wire format was changed due to controller
bugs in the USBCan-II, which was slightly mentioned in their
data sheets here:

	http://www.kvaser.com/canlib-webhelp/page_hardware_specific_can_controllers.html

So it seems there's really no way for filling such bus error
info given the very limited amount of data exported :-(

The issue of incomplete data does not even stop here, kindly
check below notes regarding reverse state transitions:

> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{16}{0}}
> >> [...]
> >> >         error-counter-tx-rx{{80}{0}}
> >> >  (000.001910)  can0  20000080   [8]  00 00 00 00 00 00 58 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{88}{0}}
> >> >  (000.001753)  can0  20000084   [8]  00 08 00 00 00 00 60 00  
> >> >  ERRORFRAME
> >> >         controller-problem{tx-error-warning}
> >> Good.
> >> >         bus-error
> >> >         error-counter-tx-rx{{96}{0}}
> > 
> > Nice.
> > 
> >> >  (000.001720)  can0  20000080   [8]  00 00 00 00 00 00 68 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{104}{0}}
> >> >  (000.001876)  can0  20000080   [8]  00 00 00 00 00 00 70 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{112}{0}}
> >> >  (000.001749)  can0  20000080   [8]  00 00 00 00 00 00 78 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{120}{0}}
> >> >  (000.001771)  can0  20000084   [8]  00 20 00 00 00 00 80 00  
> >> >  ERRORFRAME
> >> >         controller-problem{tx-error-passive}
> >> Also good.
> >> >         bus-error
> >> >         error-counter-tx-rx{{128}{0}}
> > 
> > Also nice :-)
> > 
> >> >  (000.001868)  can0  20000080   [8]  00 00 00 00 00 00 80 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{128}{0}}
> >> >  (000.001982)  can0  20000080   [8]  00 00 00 00 00 00 80 00  
> >> >  ERRORFRAME
> >> >         bus-error
> >> >         error-counter-tx-rx{{128}{0}}
> >> > 
> >> > (( Then a continous flood, exactly similar to the above packet,
> >> > appears.
> >> >    Unfortunately this flooding is a firmware problem. ))
> >> > 
> >> > <-- Replug the cable, after a good amount of time -->
> >> >
> >> Where are the reverse state transitions?
> >> > 
> > 
> > Hmmm...
> > 
> > [ ... ]
> >> 
> >> Reverse state transitions are missing from the logs. See comments
> above.
> >> 
> > 
> > When the device is on the _receiving_ end, and I unplug the CAN cable
> after
> > introducing some noise to the level of reaching WARNING or PASSIVE, I
> > receive a BUS_ERROR event with the rxerr count reset back to 0 or 1. In
> > that case, the driver correctly transitions back the state to
> ERROR_ACTIVE
> > and candump produces something similar to:
> > 
> >     (000.000362)  can0  2000008C   [8]  00 40 40 00 00 00 00 01  
> >     ERRORFRAME
> >     controller-problem{}
> >     protocol-violation{{back-to-error-active}{}}
> >     bus-error
> >     error-counter-tx-rx{{0}{1}}
> > 
> > which is, AFAIK, the correct behaviour from the driver side.
> > 
> > Meanwhile, when the device is on the _sending_ end and I re-plug the CAN
> > cable again. Sometimes I receive events with txerr reset to 0 or 1, and
> > the driver correctly reverts back to ERROR_ACTIVE in that case. But on
> > another times like the quoted case above, I don't receive any events
> > resetting txerr back -- only data packets on the bus.
> 
> Well, the firmware seems to report *only* bus-errors via
> CMD_CAN_ERROR_EVENT messages, also carrying the new state, but no
> CMD_CHIP_STATE_EVENT just for the state changes.
> 

I've dumped _every_ message I receive from the firmware while
disconnecting the CAN bus, waiting a while, and connecting it again.
I really received _nothing_ from the firmware when the CAN bus was
reconnected and the data packets were flowing again. Not even a
single CHIP_STATE_EVENT, even after waiting for a long time.

So it's basically:
...
ERR EVENT, txerr=128, rxerr=0
ERR EVENT, txerr=128, rxerr=0
ERR EVENT, txerr=128, rxerr=0
...

then complete silence, except the data frames. I've even tried with
different versions of the firmware, but the same behaviour persisted.

> > So, What can the driver do given the above?
> 
> Little if the notification does not come.
> 

We can poll the state by sending CMD_GET_CHIP_STATE to the firmware,
and it will hopefully reply with a CHIP_STATE_EVENT response
containing the new txerr and rxerr values that we can use for
reverse state transitions.

But do we _really_ want to go through the path? I feel that it will
open some cans of worms w.r.t. concurrent access to both the netdev
and USB stacks from a single driver.

A possible solution can be setting up a kernel thread that queries
for a CHIP_STATE_EVENT every second?

Your inputs on this is appreciated.

> Wolfgang.
> 

Regards,
Darwish
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andri Yngvason Jan. 23, 2015, 10:32 a.m. UTC | #14
Quoting Ahmed S. Darwish (2015-01-23 06:07:34)
> On Wed, Jan 21, 2015 at 05:13:45PM +0100, Wolfgang Grandegger wrote:
> > On Wed, 21 Jan 2015 10:36:47 -0500, "Ahmed S. Darwish"
> > <darwish.07@gmail.com> wrote:
> > > On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
> > >> Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> > >> > Hi!
> > > 
> > > ...
> > > 
> > >> > <-- Unplug the cable -->
> > >> > 
> > >> >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00  
> > >> >  ERRORFRAME
> > >> >         bus-error
> > >> >         error-counter-tx-rx{{8}{0}}
> > >> >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00  
> > 
> > For a bus-errors I would also expcect some more information in the
> > data[2..3] fields. But these are always zero.
> > 
> 
> M16C error factors made it possible to report things like
> CAN_ERR_PROT_FORM/STUFF/BIT0/BIT1/TX in data[2], and
> CAN_ERR_PROT_LOC_ACK/CRC_DEL in data[3].
> 
> Unfortunately such error factors are only reported in Leaf, but
> not in USBCan-II due to the wire format change in the error event:
> 
>         struct leaf_msg_error_event {
>                 u8 tid;
>                 u8 flags;
>                 __le16 time[3];
>                 u8 channel;
>                 u8 padding;
>                 u8 tx_errors_count;
>                 u8 rx_errors_count;
>                 u8 status;
>                 u8 error_factor;
>         } __packed;
> 
>         struct usbcan_msg_error_event {
>                 u8 tid;
>                 u8 padding;
>                 u8 tx_errors_count_ch0;
>                 u8 rx_errors_count_ch0;
>                 u8 tx_errors_count_ch1;
>                 u8 rx_errors_count_ch1;
>                 u8 status_ch0;
>                 u8 status_ch1;
>                 __le16 time;
>         } __packed;
> 
> I speculate that the wire format was changed due to controller
> bugs in the USBCan-II, which was slightly mentioned in their
> data sheets here:
> 
>         http://www.kvaser.com/canlib-webhelp/page_hardware_specific_can_controllers.html
> 
> So it seems there's really no way for filling such bus error
> info given the very limited amount of data exported :-(
>
We experienced similar problems with FlexCAN.
> 
> The issue of incomplete data does not even stop here, kindly
> check below notes regarding reverse state transitions:
> 
> > >> >  ERRORFRAME
> > >> >         bus-error
> > >> >         error-counter-tx-rx{{16}{0}}
> > >> [...]
[...]
> > >> >  ERRORFRAME
> > >> >         bus-error
> > >> >         error-counter-tx-rx{{128}{0}}
> > >> > 
> > >> > (( Then a continous flood, exactly similar to the above packet,
> > >> > appears.
> > >> >    Unfortunately this flooding is a firmware problem. ))
> > >> > 
> > >> > <-- Replug the cable, after a good amount of time -->
> > >> >
> > >> Where are the reverse state transitions?
> > >> > 
> > > 
> > > Hmmm...
> > > 
> > > [ ... ]
> > >> 
> > >> Reverse state transitions are missing from the logs. See comments
> > above.
> > >> 
> > > 
> > > When the device is on the _receiving_ end, and I unplug the CAN cable
> > after
> > > introducing some noise to the level of reaching WARNING or PASSIVE, I
> > > receive a BUS_ERROR event with the rxerr count reset back to 0 or 1. In
> > > that case, the driver correctly transitions back the state to
> > ERROR_ACTIVE
> > > and candump produces something similar to:
> > > 
> > >     (000.000362)  can0  2000008C   [8]  00 40 40 00 00 00 00 01  
> > >     ERRORFRAME
> > >     controller-problem{}
> > >     protocol-violation{{back-to-error-active}{}}
> > >     bus-error
> > >     error-counter-tx-rx{{0}{1}}
> > > 
> > > which is, AFAIK, the correct behaviour from the driver side.
> > > 
> > > Meanwhile, when the device is on the _sending_ end and I re-plug the CAN
> > > cable again. Sometimes I receive events with txerr reset to 0 or 1, and
> > > the driver correctly reverts back to ERROR_ACTIVE in that case. But on
> > > another times like the quoted case above, I don't receive any events
> > > resetting txerr back -- only data packets on the bus.
> > 
> > Well, the firmware seems to report *only* bus-errors via
> > CMD_CAN_ERROR_EVENT messages, also carrying the new state, but no
> > CMD_CHIP_STATE_EVENT just for the state changes.
> > 
> 
> I've dumped _every_ message I receive from the firmware while
> disconnecting the CAN bus, waiting a while, and connecting it again.
> I really received _nothing_ from the firmware when the CAN bus was
> reconnected and the data packets were flowing again. Not even a
> single CHIP_STATE_EVENT, even after waiting for a long time.
> 
> So it's basically:
> ...
> ERR EVENT, txerr=128, rxerr=0
> ERR EVENT, txerr=128, rxerr=0
> ERR EVENT, txerr=128, rxerr=0
> ...
> 
> then complete silence, except the data frames. I've even tried with
> different versions of the firmware, but the same behaviour persisted.
> 
> > > So, What can the driver do given the above?
> > 
> > Little if the notification does not come.
> > 
> 
> We can poll the state by sending CMD_GET_CHIP_STATE to the firmware,
> and it will hopefully reply with a CHIP_STATE_EVENT response
> containing the new txerr and rxerr values that we can use for
> reverse state transitions.
>
> But do we _really_ want to go through the path? I feel that it will
> open some cans of worms w.r.t. concurrent access to both the netdev
> and USB stacks from a single driver.
>
Honestly, I don't know.
>
> A possible solution can be setting up a kernel thread that queries
> for a CHIP_STATE_EVENT every second?
> 
Have you considered polling in kvaser_usb_tx_acknowledge? You could do something
like:
if(unlikely(dev->can.state != CAN_STATE_ERROR_ACTIVE))
{
    request_state();
}

I don't think that anything beyond that would be worth pursuing.

Best regards,
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ahmed S. Darwish Jan. 25, 2015, 2:43 a.m. UTC | #15
On Fri, Jan 23, 2015 at 10:32:13AM +0000, Andri Yngvason wrote:
> Quoting Ahmed S. Darwish (2015-01-23 06:07:34)
> > On Wed, Jan 21, 2015 at 05:13:45PM +0100, Wolfgang Grandegger wrote:
> > > On Wed, 21 Jan 2015 10:36:47 -0500, "Ahmed S. Darwish"
> > > <darwish.07@gmail.com> wrote:
> > > > On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
> > > >> Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> > > >> > Hi!
> > > > 
> > > > ...
> > > > 
> > > >> > <-- Unplug the cable -->
> > > >> > 
> > > >> >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00  
> > > >> >  ERRORFRAME
> > > >> >         bus-error
> > > >> >         error-counter-tx-rx{{8}{0}}
> > > >> >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00  
> > > 
> > > For a bus-errors I would also expcect some more information in the
> > > data[2..3] fields. But these are always zero.
> > > 
> > 
> > M16C error factors made it possible to report things like
> > CAN_ERR_PROT_FORM/STUFF/BIT0/BIT1/TX in data[2], and
> > CAN_ERR_PROT_LOC_ACK/CRC_DEL in data[3].
> > 
> > Unfortunately such error factors are only reported in Leaf, but
> > not in USBCan-II due to the wire format change in the error event:
> > 
> >         struct leaf_msg_error_event {
> >                 u8 tid;
> >                 u8 flags;
> >                 __le16 time[3];
> >                 u8 channel;
> >                 u8 padding;
> >                 u8 tx_errors_count;
> >                 u8 rx_errors_count;
> >                 u8 status;
> >                 u8 error_factor;
> >         } __packed;
> > 
> >         struct usbcan_msg_error_event {
> >                 u8 tid;
> >                 u8 padding;
> >                 u8 tx_errors_count_ch0;
> >                 u8 rx_errors_count_ch0;
> >                 u8 tx_errors_count_ch1;
> >                 u8 rx_errors_count_ch1;
> >                 u8 status_ch0;
> >                 u8 status_ch1;
> >                 __le16 time;
> >         } __packed;
> > 
> > I speculate that the wire format was changed due to controller
> > bugs in the USBCan-II, which was slightly mentioned in their
> > data sheets here:
> > 
> >         http://www.kvaser.com/canlib-webhelp/page_hardware_specific_can_controllers.html
> > 
> > So it seems there's really no way for filling such bus error
> > info given the very limited amount of data exported :-(
> >
> We experienced similar problems with FlexCAN.

Hmm, I'll have a look there then...

Although my initial instincts imply that the FlexCAN driver has
access to the raw CAN registers, something I'm unable to do here.
But maybe there's some black magic I'm missing :-)

[...]

> > 
> > I've dumped _every_ message I receive from the firmware while
> > disconnecting the CAN bus, waiting a while, and connecting it again.
> > I really received _nothing_ from the firmware when the CAN bus was
> > reconnected and the data packets were flowing again. Not even a
> > single CHIP_STATE_EVENT, even after waiting for a long time.
> > 
> > So it's basically:
> > ...
> > ERR EVENT, txerr=128, rxerr=0
> > ERR EVENT, txerr=128, rxerr=0
> > ERR EVENT, txerr=128, rxerr=0
> > ...
> > 
> > then complete silence, except the data frames. I've even tried with
> > different versions of the firmware, but the same behaviour persisted.
> > 
> > > > So, What can the driver do given the above?
> > > 
> > > Little if the notification does not come.
> > > 
> > 
> > We can poll the state by sending CMD_GET_CHIP_STATE to the firmware,
> > and it will hopefully reply with a CHIP_STATE_EVENT response
> > containing the new txerr and rxerr values that we can use for
> > reverse state transitions.
> >
> > But do we _really_ want to go through the path? I feel that it will
> > open some cans of worms w.r.t. concurrent access to both the netdev
> > and USB stacks from a single driver.
> >
> Honestly, I don't know.
> >
> > A possible solution can be setting up a kernel thread that queries
> > for a CHIP_STATE_EVENT every second?
> > 
> Have you considered polling in kvaser_usb_tx_acknowledge? You could do something
> like:
> if(unlikely(dev->can.state != CAN_STATE_ERROR_ACTIVE))
> {
>     request_state();
> }
> 

OK, I have four important updates on this issue:

a) My initial testing was done on high-speed channel, at a bitrate
   of 50K. After setting the bus to a more reasonable bitrate 500K
   or 1M, I was _consistently_ able to receive CHIP_STATE_EVENTs
   when plugging the CAN connector again after an unplug.

b) The error counters on this device do not get reset on plugging
   after an unplug. I've setup a kernel thread [2] that queries
   the chip state event every second, and the error counters stays
   the same all the time. [1]

c) There's a single case when the erro counters do indeed get
   reversed, and it happens only when introducing some noise in
   the bus after the re-plug. In that case, the new error events
   get raised with new error counters starting from 0/1 again.

d) I've discovered a bug that forbids the CAN state from
   returning to ERROR_ACTIVE in case of the error counters
   numbers getting decreased. But independent from that bug, the
   verbose debugging messages clearly imply that we only get the
   error counters decreased in the case mentioned at `c)' above.

So from [1] and [2], it's now clear that the device do not reset
these counters back in the re-plug case. I'll give a check to
flexcan as advised, but unfortunately I don't really think there's
much I can do about this.

[1]

[  877.207082] CAN_ERROR_: channel=0, txerr=88, rxerr=0
[  877.207090] CAN_ERROR_: channel=0, txerr=136, rxerr=0
[  877.207094] CAN_ERROR_: channel=0, txerr=144, rxerr=0
[  877.207098] CAN_ERROR_: channel=0, txerr=152, rxerr=0
[  877.207100] CAN_ERROR_: channel=0, txerr=160, rxerr=0
[  877.207102] CAN_ERROR_: channel=0, txerr=168, rxerr=0
[  877.208075] CAN_ERROR_: channel=0, txerr=200, rxerr=0 

(( The above error event, staying the same at txerr=200 keeps
   flooding the bus until the CAN cable is re-plugged ))

[  878.225116] CHIP_STATE: channel=0, txerr=200, rxerr=0
[  878.225143] CHIP_STATE: channel=1, txerr=0, rxerr=0
[  879.265167] CHIP_STATE: channel=0, txerr=200, rxerr=0
[  879.267152] CHIP_STATE: channel=1, txerr=0, rxerr=0
[  879.265167] CHIP_STATE: channel=0, txerr=200, rxerr=0
[  879.267152] CHIP_STATE: channel=1, txerr=0, rxerr=0

(( The same counters get repeated every second ))

[2] State was polled using:

static int kvaser_usb_poll_chip_state(void *vpriv) {
      struct kvaser_usb_net_priv *priv = vpriv;

      while (!kthread_should_stop()) {
              kvaser_usb_simple_msg_async(priv, CMD_GET_CHIP_STATE);
              ssleep(1);
      }

      return 0;
}

> I don't think that anything beyond that would be worth pursuing.
> 

I agree, but given the new input, it seems that our problem
extends to the error counters themselves not getting decreased
on re-plug. So, even polling will not solve the issue: we'll
get the same txerr/rxerr values again and again :-(

> Best regards,
> Andri

Regards,
Darwish

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ahmed S. Darwish Jan. 25, 2015, 2:49 a.m. UTC | #16
On Thu, Jan 22, 2015 at 10:14:47AM +0000, Andri Yngvason wrote:
> Quoting Marc Kleine-Budde (2015-01-21 22:59:23)
> > On 01/21/2015 05:20 PM, Andri Yngvason wrote:
> > > Marc, could you merge the "move bus_off++" patch before you merge this so that I
> > > won't have to incorporate this patch-set into it?
> > 
> > ...included in the lastest pull-request to David. Use
> > tags/linux-can-next-for-3.20-20150121 of the can-next repo as you new base.
> > 
> 
> Thanks!
>

I guess I'll re-base my next submission over this tag too.
Nothing in the new 5 patches is substantial enough to be
included in the current kernel release.

Thanks!
Darwish
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ahmed S. Darwish Jan. 25, 2015, 3:21 a.m. UTC | #17
Hi!

On Wed, Jan 21, 2015 at 04:20:25PM +0000, Andri Yngvason wrote:
> Quoting Ahmed S. Darwish (2015-01-20 21:45:37)
> > From: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> > 
> > Replace most of the can interface's state and error counters
> > handling with the new can-dev can_change_state() mechanism.
> > 
> > Suggested-by: Andri Yngvason <andri.yngvason@marel.com>
> > Signed-off-by: Ahmed S. Darwish <ahmed.darwish@valeo.com>
> > ---
> >  drivers/net/can/usb/kvaser_usb.c | 114 +++++++++++++++++++--------------------
> >  1 file changed, 55 insertions(+), 59 deletions(-)
> > 
> > diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c
> > index 971c5f9..0386d3f 100644
> > --- a/drivers/net/can/usb/kvaser_usb.c
> > +++ b/drivers/net/can/usb/kvaser_usb.c
> > @@ -620,40 +620,43 @@ static void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv)
> >  }
> >  
> >  static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *priv,
> > -                                                const struct kvaser_usb_error_summary *es)
> > +                                                const struct kvaser_usb_error_summary *es,
> > +                                                struct can_frame *cf)
> >  {
> >         struct net_device_stats *stats;
> > -       enum can_state new_state;
> > -
> > -       stats = &priv->netdev->stats;
> > -       new_state = priv->can.state;
> > +       enum can_state cur_state, new_state, tx_state, rx_state;
> >  
> >         netdev_dbg(priv->netdev, "Error status: 0x%02x\n", es->status);
> >  
> > -       if (es->status & M16C_STATE_BUS_OFF) {
> > -               priv->can.can_stats.bus_off++;
> > +       stats = &priv->netdev->stats;
> > +       new_state = cur_state = priv->can.state;
> > +
> > +       if (es->status & M16C_STATE_BUS_OFF)
> >                 new_state = CAN_STATE_BUS_OFF;
> > -       } else if (es->status & M16C_STATE_BUS_PASSIVE) {
> > -               if (priv->can.state != CAN_STATE_ERROR_PASSIVE)
> > -                       priv->can.can_stats.error_passive++;
> > +       else if (es->status & M16C_STATE_BUS_PASSIVE)
> >                 new_state = CAN_STATE_ERROR_PASSIVE;
> > -       }
> >  
> >         if (es->status == M16C_STATE_BUS_ERROR) {
> > -               if ((priv->can.state < CAN_STATE_ERROR_WARNING) &&
> > -                   ((es->txerr >= 96) || (es->rxerr >= 96))) {
> > -                       priv->can.can_stats.error_warning++;
> > +               if ((cur_state < CAN_STATE_ERROR_WARNING) &&
> > +                   ((es->txerr >= 96) || (es->rxerr >= 96)))
> >                         new_state = CAN_STATE_ERROR_WARNING;
> > -               } else if (priv->can.state > CAN_STATE_ERROR_ACTIVE) {
> > +               else if (cur_state > CAN_STATE_ERROR_ACTIVE)
> >                         new_state = CAN_STATE_ERROR_ACTIVE;
> > -               }
> >         }
> >  
> >         if (!es->status)
> >                 new_state = CAN_STATE_ERROR_ACTIVE;
> >  
> > +       if (new_state != cur_state) {
> > +               tx_state = (es->txerr >= es->rxerr) ? new_state : 0;
> > +               rx_state = (es->txerr <= es->rxerr) ? new_state : 0;
> > +
> > +               can_change_state(priv->netdev, cf, tx_state, rx_state);
>
> This (below) is redundant. It doesn't harm but at this point can_change_state
> has set priv->can.state to new_state.
>
> > +               new_state = priv->can.state;
> > +       }
> > +

Correct; I will remove it.

> >         if (priv->can.restart_ms &&
> > -           (priv->can.state >= CAN_STATE_BUS_OFF) &&
> > +           (cur_state >= CAN_STATE_BUS_OFF) &&
> >             (new_state < CAN_STATE_BUS_OFF)) {
> >                 priv->can.can_stats.restarts++;
> >         }
> > @@ -665,18 +668,17 @@ static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *pri
> >  
> >         priv->bec.txerr = es->txerr;
> >         priv->bec.rxerr = es->rxerr;
> > -       priv->can.state = new_state;
> >  }
> >  
> >  static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
> >                                 const struct kvaser_msg *msg)
> >  {
> > -       struct can_frame *cf;
> > +       struct can_frame *cf, tmp_cf = { .can_id = CAN_ERR_FLAG, .can_dlc = CAN_ERR_DLC };
> >         struct sk_buff *skb;
> >         struct net_device_stats *stats;
> >         struct kvaser_usb_net_priv *priv;
> >         struct kvaser_usb_error_summary es = { };
> > -       enum can_state old_state;
> > +       enum can_state old_state, new_state;
> >  
> >         switch (msg->id) {
> >         case CMD_CAN_ERROR_EVENT:
> > @@ -721,60 +723,54 @@ static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
> >         }
> >  
> >         /* Update all of the can interface's state and error counters before
> > -        * trying any skb allocation that can actually fail with -ENOMEM.
> > +        * trying any memory allocation that can actually fail with -ENOMEM.
> > +        *
> > +        * We send a temporary stack-allocated error can frame to
> > +        * can_change_state() for the very same reason.
> > +        *
> > +        * TODO: Split can_change_state() responsibility between updating the
> > +        * can interface's state and counters, and the setting up of can error
> > +        * frame ID and data to userspace. Remove stack allocation afterwards.
> >          */
> >         old_state = priv->can.state;
> > -       kvaser_usb_rx_error_update_can_state(priv, &es);
> > +       kvaser_usb_rx_error_update_can_state(priv, &es, &tmp_cf);
> > +       new_state = priv->can.state;
> >  
> >         skb = alloc_can_err_skb(priv->netdev, &cf);
> >         if (!skb) {
> >                 stats->rx_dropped++;
> >                 return;
> >         }
> > +       memcpy(cf, &tmp_cf, sizeof(*cf));
> >  
> > -       if (es.status & M16C_STATE_BUS_OFF) {
> > -               cf->can_id |= CAN_ERR_BUSOFF;
> > -
> > -               if (!priv->can.restart_ms)
> > -                       kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
> > -               netif_carrier_off(priv->netdev);
> > -       } else if (es.status & M16C_STATE_BUS_PASSIVE) {
> > -               if (old_state != CAN_STATE_ERROR_PASSIVE) {
> > -                       cf->can_id |= CAN_ERR_CRTL;
> > -
> > -                       if (es.txerr || es.rxerr)
> > -                               cf->data[1] = (es.txerr > es.rxerr)
> > -                                               ? CAN_ERR_CRTL_TX_PASSIVE
> > -                                               : CAN_ERR_CRTL_RX_PASSIVE;
> > -                       else
> > -                               cf->data[1] = CAN_ERR_CRTL_TX_PASSIVE |
> > -                                             CAN_ERR_CRTL_RX_PASSIVE;
> > +       if (new_state != old_state) {
> > +               if (es.status & M16C_STATE_BUS_OFF) {
> > +                       if (!priv->can.restart_ms)
> > +                               kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
> > +                       netif_carrier_off(priv->netdev);
> > +               }
> > +
>
> This block [below] is wrong. The usage of PROT_ACTIVE is based on a misunderstanding.
> It's used in some drivers to signify back-to-error-active but its original
> meaning is something completely different, AFAIK.
> This is handled in can_change_state() using a new CTRL message; namely:
> CAN_ERR_CTRL_ACTIVE. The newest version of can-utils is up to date with this.
>
> > +               if (es.status == M16C_STATE_BUS_ERROR) {
> > +                       if ((old_state >= CAN_STATE_ERROR_WARNING) ||
> > +                           (es.txerr < 96 && es.rxerr < 96)) {
> > +                               if (old_state > CAN_STATE_ERROR_ACTIVE) {
> > +                                       cf->can_id |= CAN_ERR_PROT;
> > +                                       cf->data[2] = CAN_ERR_PROT_ACTIVE;
> > +                               }
> > +                       }
> >                 }

So I would just make the new_state equals CAN_STATE_ERROR_ACTIVE,
and then can_change_state() will do the right thing? Excellent!!
This means the entire block above can be removed ;-)

[ On another note, the block's if conditions above are faulty and
  fixed in patch #3. This patch (#2) used can_change_state()
  without changing any of that faulty logic to ease any future
  bisection. ]

> > -       }
> >  
> > -       if (es.status == M16C_STATE_BUS_ERROR) {
> > -               if ((old_state < CAN_STATE_ERROR_WARNING) &&
> > -                   ((es.txerr >= 96) || (es.rxerr >= 96))) {
> > -                       cf->can_id |= CAN_ERR_CRTL;
> > -                       cf->data[1] = (es.txerr > es.rxerr)
> > -                                       ? CAN_ERR_CRTL_TX_WARNING
> > -                                       : CAN_ERR_CRTL_RX_WARNING;
> > -               } else if (old_state > CAN_STATE_ERROR_ACTIVE) {
>
> This is also redundant, and wrong:
>
> > +               if (!es.status) {
> >                         cf->can_id |= CAN_ERR_PROT;
> >                         cf->data[2] = CAN_ERR_PROT_ACTIVE;
> >                 }
> > -       }

As in the above; extra code to be removed, yay! ;-)

> >  
> > -       if (!es.status) {
> > -               cf->can_id |= CAN_ERR_PROT;
> > -               cf->data[2] = CAN_ERR_PROT_ACTIVE;
> > -       }
> > -
> > -       if (priv->can.restart_ms &&
> > -           (old_state >= CAN_STATE_BUS_OFF) &&
> > -           (priv->can.state < CAN_STATE_BUS_OFF)) {
> > -               cf->can_id |= CAN_ERR_RESTARTED;
> > -               netif_carrier_on(priv->netdev);
> > +               if (priv->can.restart_ms &&
> > +                   (old_state >= CAN_STATE_BUS_OFF) &&
> > +                   (new_state < CAN_STATE_BUS_OFF)) {
> > +                       cf->can_id |= CAN_ERR_RESTARTED;
> > +                       netif_carrier_on(priv->netdev);
> > +               }
> >         }
> >  
> >         if (es.error_factor) {
> > -- 
> > 1.9.1
> 
> Looking over the patch again, I've noticed that there
> are a few things that are not quite right.
> 

The state-handlig code has become much simpler since your
reveiw and Kleine-Budde's one. Thanks a lot for all the effort.

Regards,
Darwish
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andri Yngvason Jan. 26, 2015, 10:21 a.m. UTC | #18
Quoting Ahmed S. Darwish (2015-01-25 02:43:00)
> On Fri, Jan 23, 2015 at 10:32:13AM +0000, Andri Yngvason wrote:
> > Quoting Ahmed S. Darwish (2015-01-23 06:07:34)
> > > On Wed, Jan 21, 2015 at 05:13:45PM +0100, Wolfgang Grandegger wrote:
> > > > On Wed, 21 Jan 2015 10:36:47 -0500, "Ahmed S. Darwish"
> > > > <darwish.07@gmail.com> wrote:
> > > > > On Wed, Jan 21, 2015 at 03:00:15PM +0000, Andri Yngvason wrote:
> > > > >> Quoting Ahmed S. Darwish (2015-01-21 14:43:23)
> > > > >> > Hi!
> > > > > 
> > > > > ...
> > > > > 
> > > > >> > <-- Unplug the cable -->
> > > > >> > 
> > > > >> >  (000.009106)  can0  20000080   [8]  00 00 00 00 00 00 08 00  
> > > > >> >  ERRORFRAME
> > > > >> >         bus-error
> > > > >> >         error-counter-tx-rx{{8}{0}}
> > > > >> >  (000.001872)  can0  20000080   [8]  00 00 00 00 00 00 10 00  
> > > > 
> > > > For a bus-errors I would also expcect some more information in the
> > > > data[2..3] fields. But these are always zero.
> > > > 
> > > 
> > > M16C error factors made it possible to report things like
> > > CAN_ERR_PROT_FORM/STUFF/BIT0/BIT1/TX in data[2], and
> > > CAN_ERR_PROT_LOC_ACK/CRC_DEL in data[3].
> > > 
> > > Unfortunately such error factors are only reported in Leaf, but
> > > not in USBCan-II due to the wire format change in the error event:
> > > 
> > >         struct leaf_msg_error_event {
> > >                 u8 tid;
> > >                 u8 flags;
> > >                 __le16 time[3];
> > >                 u8 channel;
> > >                 u8 padding;
> > >                 u8 tx_errors_count;
> > >                 u8 rx_errors_count;
> > >                 u8 status;
> > >                 u8 error_factor;
> > >         } __packed;
> > > 
> > >         struct usbcan_msg_error_event {
> > >                 u8 tid;
> > >                 u8 padding;
> > >                 u8 tx_errors_count_ch0;
> > >                 u8 rx_errors_count_ch0;
> > >                 u8 tx_errors_count_ch1;
> > >                 u8 rx_errors_count_ch1;
> > >                 u8 status_ch0;
> > >                 u8 status_ch1;
> > >                 __le16 time;
> > >         } __packed;
> > > 
> > > I speculate that the wire format was changed due to controller
> > > bugs in the USBCan-II, which was slightly mentioned in their
> > > data sheets here:
> > > 
> > >         http://www.kvaser.com/canlib-webhelp/page_hardware_specific_can_controllers.html
> > > 
> > > So it seems there's really no way for filling such bus error
> > > info given the very limited amount of data exported :-(
> > >
> > We experienced similar problems with FlexCAN.
> 
> Hmm, I'll have a look there then...
> 
> Although my initial instincts imply that the FlexCAN driver has
> access to the raw CAN registers, something I'm unable to do here.
> But maybe there's some black magic I'm missing :-)
>
Yes, it's memory mapped.
> 
> [...]
> 
> > > 
> > > I've dumped _every_ message I receive from the firmware while
> > > disconnecting the CAN bus, waiting a while, and connecting it again.
> > > I really received _nothing_ from the firmware when the CAN bus was
> > > reconnected and the data packets were flowing again. Not even a
> > > single CHIP_STATE_EVENT, even after waiting for a long time.
> > > 
> > > So it's basically:
> > > ...
> > > ERR EVENT, txerr=128, rxerr=0
> > > ERR EVENT, txerr=128, rxerr=0
> > > ERR EVENT, txerr=128, rxerr=0
> > > ...
> > > 
> > > then complete silence, except the data frames. I've even tried with
> > > different versions of the firmware, but the same behaviour persisted.
> > > 
> > > > > So, What can the driver do given the above?
> > > > 
> > > > Little if the notification does not come.
> > > > 
> > > 
> > > We can poll the state by sending CMD_GET_CHIP_STATE to the firmware,
> > > and it will hopefully reply with a CHIP_STATE_EVENT response
> > > containing the new txerr and rxerr values that we can use for
> > > reverse state transitions.
> > >
> > > But do we _really_ want to go through the path? I feel that it will
> > > open some cans of worms w.r.t. concurrent access to both the netdev
> > > and USB stacks from a single driver.
> > >
> > Honestly, I don't know.
> > >
> > > A possible solution can be setting up a kernel thread that queries
> > > for a CHIP_STATE_EVENT every second?
> > > 
> > Have you considered polling in kvaser_usb_tx_acknowledge? You could do something
> > like:
> > if(unlikely(dev->can.state != CAN_STATE_ERROR_ACTIVE))
> > {
> >     request_state();
> > }
> > 
> 
> OK, I have four important updates on this issue:
> 
> a) My initial testing was done on high-speed channel, at a bitrate
>    of 50K. After setting the bus to a more reasonable bitrate 500K
>    or 1M, I was _consistently_ able to receive CHIP_STATE_EVENTs
>    when plugging the CAN connector again after an unplug.
> 
> b) The error counters on this device do not get reset on plugging
>    after an unplug. I've setup a kernel thread [2] that queries
>    the chip state event every second, and the error counters stays
>    the same all the time. [1]
> 
> c) There's a single case when the erro counters do indeed get
>    reversed, and it happens only when introducing some noise in
>    the bus after the re-plug. In that case, the new error events
>    get raised with new error counters starting from 0/1 again.
> 
> d) I've discovered a bug that forbids the CAN state from
>    returning to ERROR_ACTIVE in case of the error counters
>    numbers getting decreased. But independent from that bug, the
>    verbose debugging messages clearly imply that we only get the
>    error counters decreased in the case mentioned at `c)' above.
> 
> So from [1] and [2], it's now clear that the device do not reset
> these counters back in the re-plug case. I'll give a check to
> flexcan as advised, but unfortunately I don't really think there's
> much I can do about this.
> 
> [1]
> 
> [  877.207082] CAN_ERROR_: channel=0, txerr=88, rxerr=0
> [  877.207090] CAN_ERROR_: channel=0, txerr=136, rxerr=0
> [  877.207094] CAN_ERROR_: channel=0, txerr=144, rxerr=0
> [  877.207098] CAN_ERROR_: channel=0, txerr=152, rxerr=0
> [  877.207100] CAN_ERROR_: channel=0, txerr=160, rxerr=0
> [  877.207102] CAN_ERROR_: channel=0, txerr=168, rxerr=0
> [  877.208075] CAN_ERROR_: channel=0, txerr=200, rxerr=0 
> 
> (( The above error event, staying the same at txerr=200 keeps
>    flooding the bus until the CAN cable is re-plugged ))
> 
> [  878.225116] CHIP_STATE: channel=0, txerr=200, rxerr=0
> [  878.225143] CHIP_STATE: channel=1, txerr=0, rxerr=0
> [  879.265167] CHIP_STATE: channel=0, txerr=200, rxerr=0
> [  879.267152] CHIP_STATE: channel=1, txerr=0, rxerr=0
> [  879.265167] CHIP_STATE: channel=0, txerr=200, rxerr=0
> [  879.267152] CHIP_STATE: channel=1, txerr=0, rxerr=0
> 
> (( The same counters get repeated every second ))
> 
> [2] State was polled using:
> 
> static int kvaser_usb_poll_chip_state(void *vpriv) {
>       struct kvaser_usb_net_priv *priv = vpriv;
> 
>       while (!kthread_should_stop()) {
>               kvaser_usb_simple_msg_async(priv, CMD_GET_CHIP_STATE);
>               ssleep(1);
>       }
> 
>       return 0;
> }
> 
> > I don't think that anything beyond that would be worth pursuing.
> > 
> 
> I agree, but given the new input, it seems that our problem
> extends to the error counters themselves not getting decreased
> on re-plug. So, even polling will not solve the issue: we'll
> get the same txerr/rxerr values again and again :-(
> 
Well, that's a damn shame. Not much you can do about that.

Best regards,
Andri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c
index 971c5f9..0386d3f 100644
--- a/drivers/net/can/usb/kvaser_usb.c
+++ b/drivers/net/can/usb/kvaser_usb.c
@@ -620,40 +620,43 @@  static void kvaser_usb_unlink_tx_urbs(struct kvaser_usb_net_priv *priv)
 }
 
 static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *priv,
-						 const struct kvaser_usb_error_summary *es)
+						 const struct kvaser_usb_error_summary *es,
+						 struct can_frame *cf)
 {
 	struct net_device_stats *stats;
-	enum can_state new_state;
-
-	stats = &priv->netdev->stats;
-	new_state = priv->can.state;
+	enum can_state cur_state, new_state, tx_state, rx_state;
 
 	netdev_dbg(priv->netdev, "Error status: 0x%02x\n", es->status);
 
-	if (es->status & M16C_STATE_BUS_OFF) {
-		priv->can.can_stats.bus_off++;
+	stats = &priv->netdev->stats;
+	new_state = cur_state = priv->can.state;
+
+	if (es->status & M16C_STATE_BUS_OFF)
 		new_state = CAN_STATE_BUS_OFF;
-	} else if (es->status & M16C_STATE_BUS_PASSIVE) {
-		if (priv->can.state != CAN_STATE_ERROR_PASSIVE)
-			priv->can.can_stats.error_passive++;
+	else if (es->status & M16C_STATE_BUS_PASSIVE)
 		new_state = CAN_STATE_ERROR_PASSIVE;
-	}
 
 	if (es->status == M16C_STATE_BUS_ERROR) {
-		if ((priv->can.state < CAN_STATE_ERROR_WARNING) &&
-		    ((es->txerr >= 96) || (es->rxerr >= 96))) {
-			priv->can.can_stats.error_warning++;
+		if ((cur_state < CAN_STATE_ERROR_WARNING) &&
+		    ((es->txerr >= 96) || (es->rxerr >= 96)))
 			new_state = CAN_STATE_ERROR_WARNING;
-		} else if (priv->can.state > CAN_STATE_ERROR_ACTIVE) {
+		else if (cur_state > CAN_STATE_ERROR_ACTIVE)
 			new_state = CAN_STATE_ERROR_ACTIVE;
-		}
 	}
 
 	if (!es->status)
 		new_state = CAN_STATE_ERROR_ACTIVE;
 
+	if (new_state != cur_state) {
+		tx_state = (es->txerr >= es->rxerr) ? new_state : 0;
+		rx_state = (es->txerr <= es->rxerr) ? new_state : 0;
+
+		can_change_state(priv->netdev, cf, tx_state, rx_state);
+		new_state = priv->can.state;
+	}
+
 	if (priv->can.restart_ms &&
-	    (priv->can.state >= CAN_STATE_BUS_OFF) &&
+	    (cur_state >= CAN_STATE_BUS_OFF) &&
 	    (new_state < CAN_STATE_BUS_OFF)) {
 		priv->can.can_stats.restarts++;
 	}
@@ -665,18 +668,17 @@  static void kvaser_usb_rx_error_update_can_state(struct kvaser_usb_net_priv *pri
 
 	priv->bec.txerr = es->txerr;
 	priv->bec.rxerr = es->rxerr;
-	priv->can.state = new_state;
 }
 
 static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
 				const struct kvaser_msg *msg)
 {
-	struct can_frame *cf;
+	struct can_frame *cf, tmp_cf = { .can_id = CAN_ERR_FLAG, .can_dlc = CAN_ERR_DLC };
 	struct sk_buff *skb;
 	struct net_device_stats *stats;
 	struct kvaser_usb_net_priv *priv;
 	struct kvaser_usb_error_summary es = { };
-	enum can_state old_state;
+	enum can_state old_state, new_state;
 
 	switch (msg->id) {
 	case CMD_CAN_ERROR_EVENT:
@@ -721,60 +723,54 @@  static void kvaser_usb_rx_error(const struct kvaser_usb *dev,
 	}
 
 	/* Update all of the can interface's state and error counters before
-	 * trying any skb allocation that can actually fail with -ENOMEM.
+	 * trying any memory allocation that can actually fail with -ENOMEM.
+	 *
+	 * We send a temporary stack-allocated error can frame to
+	 * can_change_state() for the very same reason.
+	 *
+	 * TODO: Split can_change_state() responsibility between updating the
+	 * can interface's state and counters, and the setting up of can error
+	 * frame ID and data to userspace. Remove stack allocation afterwards.
 	 */
 	old_state = priv->can.state;
-	kvaser_usb_rx_error_update_can_state(priv, &es);
+	kvaser_usb_rx_error_update_can_state(priv, &es, &tmp_cf);
+	new_state = priv->can.state;
 
 	skb = alloc_can_err_skb(priv->netdev, &cf);
 	if (!skb) {
 		stats->rx_dropped++;
 		return;
 	}
+	memcpy(cf, &tmp_cf, sizeof(*cf));
 
-	if (es.status & M16C_STATE_BUS_OFF) {
-		cf->can_id |= CAN_ERR_BUSOFF;
-
-		if (!priv->can.restart_ms)
-			kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
-		netif_carrier_off(priv->netdev);
-	} else if (es.status & M16C_STATE_BUS_PASSIVE) {
-		if (old_state != CAN_STATE_ERROR_PASSIVE) {
-			cf->can_id |= CAN_ERR_CRTL;
-
-			if (es.txerr || es.rxerr)
-				cf->data[1] = (es.txerr > es.rxerr)
-						? CAN_ERR_CRTL_TX_PASSIVE
-						: CAN_ERR_CRTL_RX_PASSIVE;
-			else
-				cf->data[1] = CAN_ERR_CRTL_TX_PASSIVE |
-					      CAN_ERR_CRTL_RX_PASSIVE;
+	if (new_state != old_state) {
+		if (es.status & M16C_STATE_BUS_OFF) {
+			if (!priv->can.restart_ms)
+				kvaser_usb_simple_msg_async(priv, CMD_STOP_CHIP);
+			netif_carrier_off(priv->netdev);
+		}
+
+		if (es.status == M16C_STATE_BUS_ERROR) {
+			if ((old_state >= CAN_STATE_ERROR_WARNING) ||
+			    (es.txerr < 96 && es.rxerr < 96)) {
+				if (old_state > CAN_STATE_ERROR_ACTIVE) {
+					cf->can_id |= CAN_ERR_PROT;
+					cf->data[2] = CAN_ERR_PROT_ACTIVE;
+				}
+			}
 		}
-	}
 
-	if (es.status == M16C_STATE_BUS_ERROR) {
-		if ((old_state < CAN_STATE_ERROR_WARNING) &&
-		    ((es.txerr >= 96) || (es.rxerr >= 96))) {
-			cf->can_id |= CAN_ERR_CRTL;
-			cf->data[1] = (es.txerr > es.rxerr)
-					? CAN_ERR_CRTL_TX_WARNING
-					: CAN_ERR_CRTL_RX_WARNING;
-		} else if (old_state > CAN_STATE_ERROR_ACTIVE) {
+		if (!es.status) {
 			cf->can_id |= CAN_ERR_PROT;
 			cf->data[2] = CAN_ERR_PROT_ACTIVE;
 		}
-	}
 
-	if (!es.status) {
-		cf->can_id |= CAN_ERR_PROT;
-		cf->data[2] = CAN_ERR_PROT_ACTIVE;
-	}
-
-	if (priv->can.restart_ms &&
-	    (old_state >= CAN_STATE_BUS_OFF) &&
-	    (priv->can.state < CAN_STATE_BUS_OFF)) {
-		cf->can_id |= CAN_ERR_RESTARTED;
-		netif_carrier_on(priv->netdev);
+		if (priv->can.restart_ms &&
+		    (old_state >= CAN_STATE_BUS_OFF) &&
+		    (new_state < CAN_STATE_BUS_OFF)) {
+			cf->can_id |= CAN_ERR_RESTARTED;
+			netif_carrier_on(priv->netdev);
+		}
 	}
 
 	if (es.error_factor) {