diff mbox

changing usbnet's API to better deal with cdc-ncm

Message ID 2791550.LhGu6po6Xy@linux-lqwf.site
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Oliver Neukum Sept. 5, 2012, 8:12 p.m. UTC
Hi,

looking at cdc-ncm it seeems to me that cdc-ncm is forced to play
very dirty games because usbnet doesn't have a notion about aggregating
packets for a single transfer.

It seems to me that this can be fixed introducing a method for bundling,
which tells usbnet how packets have been aggregated. To have performance
usbnet strives to always keep at least two transfers in flight.

The code isn't complete and I need to get a device for testing, but to get
your opinion, I ask you to comment on what I have now.

	Regards
		Oliver


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Ming Lei Sept. 6, 2012, 3:23 a.m. UTC | #1
On Thu, Sep 6, 2012 at 4:12 AM, Oliver Neukum <oneukum@suse.de> wrote:
> Hi,
>
> looking at cdc-ncm it seeems to me that cdc-ncm is forced to play
> very dirty games because usbnet doesn't have a notion about aggregating
> packets for a single transfer.

The Ethernet API we are using does not support transmitting multiple Ethernet
frames in a single call, so the aggregation things should be done inside low
level driver, in fact it is just what some wlan(802.11n) drivers have
been doing.

IMO, the current .tx_fixup is intelligent enough to handle aggregation:

      - return a skb_out for sending if low level drivers think it is
ready to send
        the aggregated frames
     -  return NULL if  the low level drivers think they need to wait
for more frames

Of course, the low level drivers need to start a timer to trigger sending
remainder frames in case of timeout and no further frames come from
upper protocol stack.

Looks the introduced .tx_bundle is not necessary since .tx_fixup is OK.

>
> It seems to me that this can be fixed introducing a method for bundling,
> which tells usbnet how packets have been aggregated. To have performance
> usbnet strives to always keep at least two transfers in flight.

IMO, usbnet only cares how to send out the aggregated frame, and
doesn't mind how the packets are aggregated.  Also the way of
aggregation is per low level driver and should be handled by low
level driver.

>
> The code isn't complete and I need to get a device for testing, but to get
> your opinion, I ask you to comment on what I have now.
>
>         Regards
>                 Oliver
>
> diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
> index 4cd582a..56ef743 100644
> --- a/drivers/net/usb/cdc_ncm.c
> +++ b/drivers/net/usb/cdc_ncm.c
> @@ -135,9 +135,6 @@ struct cdc_ncm_ctx {
>         u16 connected;
>  };
>
> -static void cdc_ncm_txpath_bh(unsigned long param);
> -static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx);
> -static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer);
>  static struct usb_driver cdc_ncm_driver;
>
>  static void
> @@ -464,10 +461,6 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
>         if (ctx == NULL)
>                 return -ENODEV;
>
> -       hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> -       ctx->tx_timer.function = &cdc_ncm_tx_timer_cb;
> -       ctx->bh.data = (unsigned long)ctx;
> -       ctx->bh.func = cdc_ncm_txpath_bh;

The timeout timer is removed, so I am wondering how the low level driver or
usbnet handle the wait_for_more?

>         atomic_set(&ctx->stop, 0);
>         spin_lock_init(&ctx->mtx);
>         ctx->netdev = dev->net;
> @@ -650,7 +643,7 @@ static void cdc_ncm_zero_fill(u8 *ptr, u32 first, u32 end, u32 max)
>         memset(ptr + first, 0, end - first);
>  }
>
> -static struct sk_buff *
> +static int
>  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>  {
>         struct sk_buff *skb_out;
> @@ -659,12 +652,7 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>         u32 last_offset;
>         u16 n = 0, index;
>         u8 ready2send = 0;
> -
> -       /* if there is a remaining skb, it gets priority */
> -       if (skb != NULL)
> -               swap(skb, ctx->tx_rem_skb);
> -       else
> -               ready2send = 1;
> +       u8 error = 0;
>
>         /*
>          * +----------------+
> @@ -690,7 +678,7 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>                                 dev_kfree_skb_any(skb);
>                                 ctx->netdev->stats.tx_dropped++;
>                         }
> -                       goto exit_no_skb;
> +                       return -EBUSY;

No send out skb, so you still want usbnet to transmit the NULL frame?
Also the current skb to send is not stored in ctx->tx_rem_skb, it will be lost.

>                 }
>
>                 /* make room for NTH and NDP */
> @@ -719,28 +707,15 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>                 /* compute maximum buffer size */
>                 rem = ctx->tx_max - offset;
>
> -               if (skb == NULL) {
> -                       skb = ctx->tx_rem_skb;
> -                       ctx->tx_rem_skb = NULL;
> -
> -                       /* check for end of skb */
> -                       if (skb == NULL)
> -                               break;
> -               }
> -
>                 if (skb->len > rem) {
>                         if (n == 0) {
>                                 /* won't fit, MTU problem? */
>                                 dev_kfree_skb_any(skb);
>                                 skb = NULL;
>                                 ctx->netdev->stats.tx_dropped++;
> +                               error = 1;
>                         } else {
> -                               /* no room for skb - store for later */
> -                               if (ctx->tx_rem_skb != NULL) {
> -                                       dev_kfree_skb_any(ctx->tx_rem_skb);
> -                                       ctx->netdev->stats.tx_dropped++;
> -                               }
> -                               ctx->tx_rem_skb = skb;
> +

The 'skb' will be lost too, :-)

>                                 skb = NULL;
>                                 ready2send = 1;
>                         }
> @@ -768,13 +743,6 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>                 skb = NULL;
>         }
>
> -       /* free up any dangling skb */
> -       if (skb != NULL) {
> -               dev_kfree_skb_any(skb);
> -               skb = NULL;
> -               ctx->netdev->stats.tx_dropped++;
> -       }
> -
>         ctx->tx_curr_frame_num = n;
>
>         if (n == 0) {
> @@ -791,9 +759,7 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>                 ctx->tx_curr_skb = skb_out;
>                 ctx->tx_curr_offset = offset;
>                 ctx->tx_curr_last_offset = last_offset;
> -               /* set the pending count */
> -               if (n < CDC_NCM_RESTART_TIMER_DATAGRAM_CNT)
> -                       ctx->tx_timer_pending = CDC_NCM_TIMER_PENDING_CNT;
> +
>                 goto exit_no_skb;
>
>         } else {
> @@ -874,71 +840,37 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>         /* return skb */
>         ctx->tx_curr_skb = NULL;
>         ctx->netdev->stats.tx_packets += ctx->tx_curr_frame_num;
> -       return skb_out;
>
> -exit_no_skb:
> -       /* Start timer, if there is a remaining skb */
> -       if (ctx->tx_curr_skb != NULL)
> -               cdc_ncm_tx_timeout_start(ctx);
> -       return NULL;
> -}
> -
> -static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx)
> -{
> -       /* start timer, if not already started */
> -       if (!(hrtimer_active(&ctx->tx_timer) || atomic_read(&ctx->stop)))
> -               hrtimer_start(&ctx->tx_timer,
> -                               ktime_set(0, CDC_NCM_TIMER_INTERVAL),
> -                               HRTIMER_MODE_REL);
> -}
> +       if (error)
> +               return -EBUSY;
> +       if (ready2send)
> +               return -EBUSY;
> +       else
> +               return 0;
>
> -static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *timer)
> -{
> -       struct cdc_ncm_ctx *ctx =
> -                       container_of(timer, struct cdc_ncm_ctx, tx_timer);
> +exit_no_skb:
>
> -       if (!atomic_read(&ctx->stop))
> -               tasklet_schedule(&ctx->bh);
> -       return HRTIMER_NORESTART;
> +       return -EAGAIN;

Return -EAGAIN will make the driver or usbnet lose the wait_for_more...

>  }
>
> -static void cdc_ncm_txpath_bh(unsigned long param)
> +static int cdc_ncm_tx_bundle(struct usbnet *dev, struct sk_buff *skb, gfp_t flags)
>  {
> -       struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)param;
> -
> -       spin_lock_bh(&ctx->mtx);
> -       if (ctx->tx_timer_pending != 0) {
> -               ctx->tx_timer_pending--;
> -               cdc_ncm_tx_timeout_start(ctx);
> -               spin_unlock_bh(&ctx->mtx);
> -       } else if (ctx->netdev != NULL) {
> -               spin_unlock_bh(&ctx->mtx);
> -               netif_tx_lock_bh(ctx->netdev);
> -               usbnet_start_xmit(NULL, ctx->netdev);
> -               netif_tx_unlock_bh(ctx->netdev);
> -       }
> +       int err;
> +       struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
> +
> +       err = cdc_ncm_fill_tx_frame(ctx, skb);
> +       return err;
>  }
>
>  static struct sk_buff *
>  cdc_ncm_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags)
>  {
> -       struct sk_buff *skb_out;
>         struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
>
> -       /*
> -        * The Ethernet API we are using does not support transmitting
> -        * multiple Ethernet frames in a single call. This driver will
> -        * accumulate multiple Ethernet frames and send out a larger
> -        * USB frame when the USB buffer is full or when a single jiffies
> -        * timeout happens.
> -        */
>         if (ctx == NULL)
>                 goto error;
>
> -       spin_lock_bh(&ctx->mtx);
> -       skb_out = cdc_ncm_fill_tx_frame(ctx, skb);
> -       spin_unlock_bh(&ctx->mtx);
> -       return skb_out;
> +       return ctx->tx_curr_skb;

So you basically replace .tx_fixup as .tx_bundle for ncm, don't you?

This also is what I commented above: .tx_fixup is enough for aggregation, :-)

>
>  error:
>         if (skb != NULL)
> @@ -1197,6 +1129,7 @@ static const struct driver_info cdc_ncm_info = {
>         .manage_power = cdc_ncm_manage_power,
>         .status = cdc_ncm_status,
>         .rx_fixup = cdc_ncm_rx_fixup,
> +       .tx_bundle = cdc_ncm_tx_bundle,
>         .tx_fixup = cdc_ncm_tx_fixup,
>  };
>
> @@ -1211,6 +1144,7 @@ static const struct driver_info wwan_info = {
>         .manage_power = cdc_ncm_manage_power,
>         .status = cdc_ncm_status,
>         .rx_fixup = cdc_ncm_rx_fixup,
> +       .tx_bundle = cdc_ncm_tx_bundle,
>         .tx_fixup = cdc_ncm_tx_fixup,
>  };
>
> diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
> index 8531c1c..d9a595e 100644
> --- a/drivers/net/usb/usbnet.c
> +++ b/drivers/net/usb/usbnet.c
> @@ -1024,6 +1024,7 @@ static void tx_complete (struct urb *urb)
>         struct skb_data         *entry = (struct skb_data *) skb->cb;
>         struct usbnet           *dev = entry->dev;
>
> +       atomic_dec(&dev->tx_in_flight);
>         if (urb->status == 0) {
>                 if (!(dev->driver_info->flags & FLAG_MULTI_PACKET))
>                         dev->net->stats.tx_packets++;
> @@ -1089,23 +1090,50 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
>         struct urb              *urb = NULL;
>         struct skb_data         *entry;
>         struct driver_info      *info = dev->driver_info;
> +       struct sk_buff          *skb_old = NULL;
>         unsigned long           flags;
>         int retval;
> +       int transmit_now = 1;
> +       int bundle_again = 0;
>
>         if (skb)
>                 skb_tx_timestamp(skb);
>
> +       /*
> +        * first we allow drivers to bundle packets together
> +        * maintainance of the buffer is the responsibility
> +        * of the lower layer
> +        */
> +rebundle:
> +       if (info->tx_bundle) {
> +               bundle_again = 0;
> +               retval = info->tx_bundle(dev, skb, GFP_ATOMIC);
> +
> +               switch (retval) {
> +               case 0: /* the package has been bundled */
> +                       if (atomic_read(&dev->tx_in_flight) < 2)
> +                               transmit_now = 1;
> +                       else
> +                               transmit_now = 0;
> +                       break;
> +               case -EAGAIN:
> +                       transmit_now = 1;
> +                       bundle_again = 1;

I don't understand why you want to bundle the same SKB again...

> +                       skb_old = skb;
> +                       break;
> +               case -EBUSY:
> +                       transmit_now = 1;
> +                       break;
> +               }
> +       }
>         // some devices want funky USB-level framing, for
>         // win32 driver (usually) and/or hardware quirks
> -       if (info->tx_fixup) {
> +       if (transmit_now && info->tx_fixup) {
>                 skb = info->tx_fixup (dev, skb, GFP_ATOMIC);
>                 if (!skb) {
>                         if (netif_msg_tx_err(dev)) {
>                                 netif_dbg(dev, tx_err, dev->net, "can't tx_fixup skb\n");
>                                 goto drop;
> -                       } else {
> -                               /* cdc_ncm collected packet; waits for more */
> -                               goto not_drop;

This above may trigger oops since the null skb will be transmit later...

>                         }
>                 }
>         }
> @@ -1164,14 +1192,17 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
>         }
>  #endif
>
> +       atomic_inc(&dev->tx_in_flight);
>         switch ((retval = usb_submit_urb (urb, GFP_ATOMIC))) {
>         case -EPIPE:
>                 netif_stop_queue (net);
>                 usbnet_defer_kevent (dev, EVENT_TX_HALT);
> +               atomic_dec(&dev->tx_in_flight);
>                 usb_autopm_put_interface_async(dev->intf);
>                 break;
>         default:
>                 usb_autopm_put_interface_async(dev->intf);
> +               atomic_dec(&dev->tx_in_flight);
>                 netif_dbg(dev, tx_err, dev->net,
>                           "tx: submit urb err %d\n", retval);
>                 break;
> @@ -1187,7 +1218,6 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
>                 netif_dbg(dev, tx_err, dev->net, "drop, code %d\n", retval);
>  drop:
>                 dev->net->stats.tx_dropped++;
> -not_drop:
>                 if (skb)
>                         dev_kfree_skb_any (skb);
>                 usb_free_urb (urb);
> @@ -1197,6 +1227,10 @@ not_drop:
>  #ifdef CONFIG_PM
>  deferred:
>  #endif
> +       if (bundle_again) {
> +               skb = skb_old;
> +               goto rebundle;
> +       }
>         return NETDEV_TX_OK;
>  }
>  EXPORT_SYMBOL_GPL(usbnet_start_xmit);
> @@ -1393,6 +1427,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
>         dev->delay.data = (unsigned long) dev;
>         init_timer (&dev->delay);
>         mutex_init (&dev->phy_mutex);
> +       atomic_set(&dev->tx_in_flight, 0);
>
>         dev->net = net;
>         strcpy (net->name, "usb%d");
> diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
> index f87cf62..bb2f622 100644
> --- a/include/linux/usb/usbnet.h
> +++ b/include/linux/usb/usbnet.h
> @@ -33,6 +33,7 @@ struct usbnet {
>         wait_queue_head_t       *wait;
>         struct mutex            phy_mutex;
>         unsigned char           suspend_count;
> +       atomic_t                tx_in_flight;
>
>         /* i/o info: pipes etc */
>         unsigned                in, out;
> @@ -133,6 +134,12 @@ struct driver_info {
>         /* fixup rx packet (strip framing) */
>         int     (*rx_fixup)(struct usbnet *dev, struct sk_buff *skb);
>
> +       /* bundle individual package for transmission as
> +        * larger package. This cannot sleep
> +        */
> +       int     (*tx_bundle)(struct usbnet *dev,
> +                               struct sk_buff *skb, gfp_t flags);
> +
>         /* fixup tx packet (add framing) */
>         struct sk_buff  *(*tx_fixup)(struct usbnet *dev,
>                                 struct sk_buff *skb, gfp_t flags);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-usb" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Thanks,
Alexey ORISHKO Sept. 6, 2012, 8:13 a.m. UTC | #2
> -----Original Message-----
> From: Oliver Neukum [mailto:oneukum@suse.de]


> looking at cdc-ncm it seeems to me that cdc-ncm is forced to play very
> dirty games because usbnet doesn't have a notion about aggregating
> packets for a single transfer.

Several issues need to be improved:
Tx path:
1. IP packets must be accumulated in one NTB. Currently it's done via data copy.
Preferred way would be a possibility to have a list of skb-s in resulting frame sent down.

2. Timer is needed to accumulate enough packets to get acceptable throughput and reducing amount of HW INTs on device side.
If we get access to Tx queue size and can read skb from it, time can be removed. But Tx queue is above usbnet, so doesn't look possible in current framework.

3. While trying to get best throughput we also need to keep latency in mind, because in some setups it is quite important.
As a simple solution configuration parameters could be used to address this issue.

Rx path:
4. IP packets are cloned to separate skb and length of actual data set to eth packet size, while skb size is still the same as skb containing full NTB frame. This causes problems with TCP stack when throughput is high because of flow control in the stack, if too much data allocated.
Someone suggested a patch for it, which was rejected. Anyway, I don't think copy data to a new skb would be a nice solution, but keeping several clones with size equal to incoming skb in not good either. 

/alexey
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjørn Mork Sept. 6, 2012, 8:17 a.m. UTC | #3
Oliver Neukum <oneukum@suse.de> writes:

> looking at cdc-ncm it seeems to me that cdc-ncm is forced to play
> very dirty games because usbnet doesn't have a notion about aggregating
> packets for a single transfer.
>
> It seems to me that this can be fixed introducing a method for bundling,
> which tells usbnet how packets have been aggregated. To have performance
> usbnet strives to always keep at least two transfers in flight.
>
> The code isn't complete and I need to get a device for testing, but to get
> your opinion, I ask you to comment on what I have now.

I haven't tested this on any device either, but it looks like the right
direction to me.  Note that there are several other existing minidrivers
which could take advantage of this code. A quick look found that both
sierra_net and rndis_host have some unbundling support in their
rx_fixups, but both ignore tx bundling. rndis_host has the following
explanation in tx_fixup:

        /* fill out the RNDIS header.  we won't bother trying to batch
         * packets; Linux minimizes wasted bandwidth through tx queues.
         */

Although that may be true for the host, I am not sure it will be true
for every device?


> @@ -874,71 +840,37 @@ cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
>  	/* return skb */
>  	ctx->tx_curr_skb = NULL;
>  	ctx->netdev->stats.tx_packets += ctx->tx_curr_frame_num;


Maybe all the statistics could be moved back to usbnet now that it will
see all packets again?


> index f87cf62..bb2f622 100644
> --- a/include/linux/usb/usbnet.h
> +++ b/include/linux/usb/usbnet.h
> @@ -33,6 +33,7 @@ struct usbnet {
>  	wait_queue_head_t	*wait;
>  	struct mutex		phy_mutex;
>  	unsigned char		suspend_count;
> +	atomic_t		tx_in_flight;
>  
>  	/* i/o info: pipes etc */
>  	unsigned		in, out;


So you don't keep any timer here?  The idea is that you always will
transmit immediately unless there are two transmit's in flight?  I
believe that is a change which has to be tested against real devices.
They may be optimized for receiving bundles.


Not really related, but I am still worrying how MBIM is going to fit
into the usbnet model where you have a strict relation between one
netdev and one USB interface.  Do you see any way usbnet could be
extended to manage a list of netdevs?

All the netdev related functions doing

	struct usbnet		*dev = netdev_priv(net);

would still work.  But all the USB related functions using dev->net to
get _the_ netdev would fail.  Maybe this will be too difficult to
implement within the usbnet framework at all?


Bjørn
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjørn Mork Sept. 6, 2012, 8:30 a.m. UTC | #4
Ming Lei <tom.leiming@gmail.com> writes:
> On Thu, Sep 6, 2012 at 4:12 AM, Oliver Neukum <oneukum@suse.de> wrote:
>> Hi,
>>
>> looking at cdc-ncm it seeems to me that cdc-ncm is forced to play
>> very dirty games because usbnet doesn't have a notion about aggregating
>> packets for a single transfer.
>
> The Ethernet API we are using does not support transmitting multiple Ethernet
> frames in a single call, so the aggregation things should be done inside low
> level driver, in fact it is just what some wlan(802.11n) drivers have
> been doing.
>
> IMO, the current .tx_fixup is intelligent enough to handle aggregation:
>
>       - return a skb_out for sending if low level drivers think it is
> ready to send
>         the aggregated frames
>      -  return NULL if  the low level drivers think they need to wait
> for more frames
>
> Of course, the low level drivers need to start a timer to trigger sending
> remainder frames in case of timeout and no further frames come from
> upper protocol stack.
>
> Looks the introduced .tx_bundle is not necessary since .tx_fixup is OK.

The minidriver does not have any information about tx in progress.  The
strategy above, which is what cdc_ncm uses today, is fine as long as you
always want to wait as long as you always want to fill as many frames as
possible in each packet.  But what if the queue is empty and the device
is just sitting there doing nothing while you are waiting for more
frames?  Then you are just adding unnecessary latency.

There are also several usbnet minidrivers for protocols with frame
aggregation support.  Most of them do not currently implement tx
aggregation, possibly due to the complexity.  This makes a common
aggregation framework interesting in any case.  Reimplementing similar
loginc in multiple minidrivers is meaningless.

I believe Oliver is adding very useful functionality to usbnet here.
Functionality which at least cdc_ncm and possibly other existing
minidrivers can take advantage of immediately.  There also seems to be a
trend towards more complex aggregating protocols as the devices get
smarter and speeds go up.


BJørn
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Oliver Neukum Sept. 6, 2012, 8:50 a.m. UTC | #5
On Thursday 06 September 2012 10:13:01 Alexey ORISHKO wrote:
> > -----Original Message-----
> > From: Oliver Neukum [mailto:oneukum@suse.de]
> 
> 
> > looking at cdc-ncm it seeems to me that cdc-ncm is forced to play very
> > dirty games because usbnet doesn't have a notion about aggregating
> > packets for a single transfer.
> 
> Several issues need to be improved:
> Tx path:
> 1. IP packets must be accumulated in one NTB. Currently it's done via data copy.
> Preferred way would be a possibility to have a list of skb-s in resulting frame sent down.

I am afraid much HCD hardware is just not capable of doing DMA on a list of skbs.
I guess one copy will be necessary. We might consider not freeing the buffer used
to transfer over the bus.

> 2. Timer is needed to accumulate enough packets to get acceptable throughput and reducing amount of HW INTs on device side.
> If we get access to Tx queue size and can read skb from it, time can be removed. But Tx queue is above usbnet, so doesn't look possible in current framework.

I think we can use the number of transfers in flight to make that decision.
So if the choice is to not transfer data at all or to send transfers filled to the
max, we should send, otherwise we ought to accumulate.
That way we get rid of the timer.

> 3. While trying to get best throughput we also need to keep latency in mind, because in some setups it is quite important.
> As a simple solution configuration parameters could be used to address this issue.

I think we should just send the first (two) packages and use the time
to aggregate later packets.

> Rx path:
> 4. IP packets are cloned to separate skb and length of actual data set to eth packet size, while skb size is still the same as skb containing full NTB frame. This causes problems with TCP stack when throughput is high because of flow control in the stack, if too much data allocated.
> Someone suggested a patch for it, which was rejected. Anyway, I don't think copy data to a new skb would be a nice solution, but keeping several clones with size equal to incoming skb in not good either. 

Perhaps the problem is using an skb for aggregate reception at all.
Possibly enough buffers of fixed size should be allocated on open and reused,
not freed.

	Regards
		Oliver

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Sept. 6, 2012, 9:11 a.m. UTC | #6
On Thu, 2012-09-06 at 10:50 +0200, Oliver Neukum wrote: 
> On Thursday 06 September 2012 10:13:01 Alexey ORISHKO wrote:

> > Rx path:
> > 4. IP packets are cloned to separate skb and length of actual data
> set to eth packet size, while skb size is still the same as skb
> containing full NTB frame. This causes problems with TCP stack when
> throughput is high because of flow control in the stack, if too much
> data allocated.
> > Someone suggested a patch for it, which was rejected. Anyway, I
> don't think copy data to a new skb would be a nice solution, but
> keeping several clones with size equal to incoming skb in not good
> either. 
> 
> Perhaps the problem is using an skb for aggregate reception at all.
> Possibly enough buffers of fixed size should be allocated on open and
> reused,
> not freed.

Really skb_clone() use should be removed from cdc_ncm_rx_fixup()

Unless you expect 10Gbit speed from this driver, skb_clone() is the
worst possible strategy.

Allocating fresh skbs of the right size permits better memory use and
allows TCP coalescing as well.

The use of skb_clone() forces some parts of the stack to perform a full
copy anyway.

It's still unclear to me with we use up to 32KB blocks in USB drivers...


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Sept. 6, 2012, 4:09 p.m. UTC | #7
On Thu, Sep 6, 2012 at 4:30 PM, Bjørn Mork <bjorn@mork.no> wrote:
> Ming Lei <tom.leiming@gmail.com> writes:
>> On Thu, Sep 6, 2012 at 4:12 AM, Oliver Neukum <oneukum@suse.de> wrote:
>>> Hi,
>>>
>>> looking at cdc-ncm it seeems to me that cdc-ncm is forced to play
>>> very dirty games because usbnet doesn't have a notion about aggregating
>>> packets for a single transfer.
>>
>> The Ethernet API we are using does not support transmitting multiple Ethernet
>> frames in a single call, so the aggregation things should be done inside low
>> level driver, in fact it is just what some wlan(802.11n) drivers have
>> been doing.
>>
>> IMO, the current .tx_fixup is intelligent enough to handle aggregation:
>>
>>       - return a skb_out for sending if low level drivers think it is
>> ready to send
>>         the aggregated frames
>>      -  return NULL if  the low level drivers think they need to wait
>> for more frames
>>
>> Of course, the low level drivers need to start a timer to trigger sending
>> remainder frames in case of timeout and no further frames come from
>> upper protocol stack.
>>
>> Looks the introduced .tx_bundle is not necessary since .tx_fixup is OK.
>
> The minidriver does not have any information about tx in progress.  The

Inside .tx_fixup, the low level driver will get the tx progress information.

> strategy above, which is what cdc_ncm uses today, is fine as long as you
> always want to wait as long as you always want to fill as many frames as
> possible in each packet.  But what if the queue is empty and the device

For cdc_ncm, the wait time is controlled by cdc_ncm driver, see
cdc_ncm_tx_timeout_start().

> is just sitting there doing nothing while you are waiting for more
> frames?  Then you are just adding unnecessary latency.

As said above, cdc_ncm can control the latency by itself.

>
> There are also several usbnet minidrivers for protocols with frame
> aggregation support.  Most of them do not currently implement tx
> aggregation, possibly due to the complexity.  This makes a common
> aggregation framework interesting in any case.  Reimplementing similar
> loginc in multiple minidrivers is meaningless.

If we can abstract some common things about aggregation, it should be
meaningful. As far as I know, most of aggregation protocol is very different,
so almost all aggregation work is only done by low level driver, such as
cdc_ncm.

If we want to implement some aggregation framework, maybe below is
one solution, correct me if it is wrong.

usbnet_start_xmit():

	to_send = info->tx_bundle(in_skb, &out_skb)
	if (!to_send) {
		usbnet_tx_timeout_start(usbnet);
		goto not_drop;
	}
	skb = info->tx_fixup(dev, out_skb, GFP_ATOMIC);	
	...

Looks the above is no much advantage than doing bundling in .tx_fixup
just like cdc_ncm.

>
> I believe Oliver is adding very useful functionality to usbnet here.
> Functionality which at least cdc_ncm and possibly other existing
> minidrivers can take advantage of immediately.  There also seems to be a

Sorry, I don't know other minidrivers, and is there some common
things in building aggregation with cdc_ncm?

Considered that the patch doesn't implement wait_for_more, suggest
Oliver to post a new one with basic wait_for_more support for further
discussion.

Thanks,
Oliver Neukum Sept. 6, 2012, 5:56 p.m. UTC | #8
On Friday 07 September 2012 00:09:13 Ming Lei wrote:
> On Thu, Sep 6, 2012 at 4:30 PM, Bjørn Mork <bjorn@mork.no> wrote:
> > Ming Lei <tom.leiming@gmail.com> writes:

> >> Looks the introduced .tx_bundle is not necessary since .tx_fixup is OK.
> >
> > The minidriver does not have any information about tx in progress.  The
> 
> Inside .tx_fixup, the low level driver will get the tx progress information.

That information changes after tx_fixup
 
> > strategy above, which is what cdc_ncm uses today, is fine as long as you
> > always want to wait as long as you always want to fill as many frames as
> > possible in each packet.  But what if the queue is empty and the device
> 
> For cdc_ncm, the wait time is controlled by cdc_ncm driver, see
> cdc_ncm_tx_timeout_start().

Well, that is the mistake. Using a timer is a bad idea.

> If we can abstract some common things about aggregation, it should be
> meaningful. As far as I know, most of aggregation protocol is very different,
> so almost all aggregation work is only done by low level driver, such as
> cdc_ncm.
> 
> If we want to implement some aggregation framework, maybe below is
> one solution, correct me if it is wrong.

It isn't so much wrong as incomplete.

It seems to me we can have

- can queue, buffer not full -> do nothing more
- can queue, buffer full -> transmit
- cannot queue, buffer full -> transmit and then try again to queue

and an error case

- cannot queue, buffer not full

And that's the way I coded it.

	Regards
		Oliver

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Sept. 7, 2012, 3:55 a.m. UTC | #9
On Fri, Sep 7, 2012 at 1:56 AM, Oliver Neukum <oneukum@suse.de> wrote:
> On Friday 07 September 2012 00:09:13 Ming Lei wrote:
>> On Thu, Sep 6, 2012 at 4:30 PM, Bjørn Mork <bjorn@mork.no> wrote:
>> > Ming Lei <tom.leiming@gmail.com> writes:
>
>> >> Looks the introduced .tx_bundle is not necessary since .tx_fixup is OK.
>> >
>> > The minidriver does not have any information about tx in progress.  The
>>
>> Inside .tx_fixup, the low level driver will get the tx progress information.
>
> That information changes after tx_fixup

You mean the low level drive can't see if the later transmission
for the aggregated frame is successful?  If so, there is no any complete
notification to all low level drivers (with aggregation capability
or not) now, isn't there?

If no aggregated frame is ready for transmission, the information
can't be changed after current tx_fixup and before the next .tx_fixup.

>
>> > strategy above, which is what cdc_ncm uses today, is fine as long as you
>> > always want to wait as long as you always want to fill as many frames as
>> > possible in each packet.  But what if the queue is empty and the device
>>
>> For cdc_ncm, the wait time is controlled by cdc_ncm driver, see
>> cdc_ncm_tx_timeout_start().
>
> Well, that is the mistake. Using a timer is a bad idea.

Why is a bad idea?  Without a timer, how can usbnet or
low level driver aggregate the later coming transmit frames to
form a biger aggregated frame?

>
>> If we can abstract some common things about aggregation, it should be
>> meaningful. As far as I know, most of aggregation protocol is very different,
>> so almost all aggregation work is only done by low level driver, such as
>> cdc_ncm.
>>
>> If we want to implement some aggregation framework, maybe below is
>> one solution, correct me if it is wrong.
>
> It isn't so much wrong as incomplete.
>
> It seems to me we can have
>
> - can queue, buffer not full -> do nothing more

As I said above, without a timeout timer, the queued skb
might not be sent out even some long time passed if no
frame comes later.

So could you add the wait for more mechanism to your patch
for further discussion?

> - can queue, buffer full -> transmit
> - cannot queue, buffer full -> transmit and then try again to queue

This might not happen, the buffer full should have been observed
in last xmit path.

>
> and an error case
>
> - cannot queue, buffer not full



Thanks,
Oliver Neukum Sept. 7, 2012, 7:40 a.m. UTC | #10
On Thursday 06 September 2012 10:17:46 Bjørn Mork wrote:
> Not really related, but I am still worrying how MBIM is going to fit
> into the usbnet model where you have a strict relation between one
> netdev and one USB interface.  Do you see any way usbnet could be
> extended to manage a list of netdevs?
> 
> All the netdev related functions doing
> 
>         struct usbnet           *dev = netdev_priv(net);
> 
> would still work.  But all the USB related functions using dev->net to
> get the netdev would fail.  Maybe this will be too difficult to
> implement within the usbnet framework at all?

It depends on how much support we get from upper layers.
You can give two IPs to an ethernet card as well.

It would be best if you could come up with some preliminary code
at least.

	Reagrds
		Oliver

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjørn Mork Sept. 7, 2012, 8:53 a.m. UTC | #11
Oliver Neukum <oneukum@suse.de> writes:
> On Thursday 06 September 2012 10:17:46 Bjørn Mork wrote:
>> Not really related, but I am still worrying how MBIM is going to fit
>> into the usbnet model where you have a strict relation between one
>> netdev and one USB interface.  Do you see any way usbnet could be
>> extended to manage a list of netdevs?
>> 
>> All the netdev related functions doing
>> 
>>         struct usbnet           *dev = netdev_priv(net);
>> 
>> would still work.  But all the USB related functions using dev->net to
>> get the netdev would fail.  Maybe this will be too difficult to
>> implement within the usbnet framework at all?
>
> It depends on how much support we get from upper layers.
> You can give two IPs to an ethernet card as well.

Yes, of course.  But I don't think that fits the MBIM model, where
multiple independent connections (Internet, VoIP, MMS, VPN1, VPN2, etc)
to different APNs will be multiplexed over the same USB endpoints.  And
to make this not fly at all: Some of these may be IPv4, some IPv6 and
some dual stack.  You cannot support that on a single netdev.  Mixing
them all together on the host will not do.  When the driver sees an IPv6
multicast packet (RS, ND, whatever) on the netdev, which MBIM session
should it forward that packet too?

If I understand MBIM correctly, we will see device configurations with
_one_ CDC data interface carrying all data traffic for multiple device
functions.  A MBIM driver must de-multiplex this, and each IP function
should be routed to its own netdev IMHO.

> It would be best if you could come up with some preliminary code
> at least.

Yup.  Finally got a device.  Will start playing.


Bjørn
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Sept. 7, 2012, 12:58 p.m. UTC | #12
On Thu, Sep 6, 2012 at 4:13 PM, Alexey ORISHKO
<alexey.orishko@stericsson.com> wrote:
>> -----Original Message-----
>> From: Oliver Neukum [mailto:oneukum@suse.de]
>
>
>> looking at cdc-ncm it seeems to me that cdc-ncm is forced to play very
>> dirty games because usbnet doesn't have a notion about aggregating
>> packets for a single transfer.
>
> Several issues need to be improved:
> Tx path:
> 1. IP packets must be accumulated in one NTB. Currently it's done via data copy.
> Preferred way would be a possibility to have a list of skb-s in resulting frame sent down.

Looks the copy is inevitable because the buffer length of each skb in the
list is not possible to be max endpoint packet aligned, so one short packet
may terminate the current tx urb transfer.


Thanks,
Alexey ORISHKO Sept. 7, 2012, 1:10 p.m. UTC | #13
> -----Original Message-----
> From: Ming Lei [mailto:tom.leiming@gmail.com]
> Sent: Friday, September 07, 2012 2:58 PM

> > Several issues need to be improved:
> > Tx path:
> > 1. IP packets must be accumulated in one NTB. Currently it's done via
> data copy.
> > Preferred way would be a possibility to have a list of skb-s in
> resulting frame sent down.
> 
> Looks the copy is inevitable because the buffer length of each skb in
> the list is not possible to be max endpoint packet aligned, so one
> short packet may terminate the current tx urb transfer.

Zero or short packet shall not be added if resulting NTB is max negotiated size.
It's yet another problem with usbnet, where we had to add some workarounds
to avoid redundant short/zero packets.

Regards,
Alexey
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams Sept. 7, 2012, 5:02 p.m. UTC | #14
On Fri, 2012-09-07 at 10:53 +0200, Bjørn Mork wrote:
> Oliver Neukum <oneukum@suse.de> writes:
> > On Thursday 06 September 2012 10:17:46 Bjørn Mork wrote:
> >> Not really related, but I am still worrying how MBIM is going to fit
> >> into the usbnet model where you have a strict relation between one
> >> netdev and one USB interface.  Do you see any way usbnet could be
> >> extended to manage a list of netdevs?
> >> 
> >> All the netdev related functions doing
> >> 
> >>         struct usbnet           *dev = netdev_priv(net);
> >> 
> >> would still work.  But all the USB related functions using dev->net to
> >> get the netdev would fail.  Maybe this will be too difficult to
> >> implement within the usbnet framework at all?
> >
> > It depends on how much support we get from upper layers.
> > You can give two IPs to an ethernet card as well.
> 
> Yes, of course.  But I don't think that fits the MBIM model, where
> multiple independent connections (Internet, VoIP, MMS, VPN1, VPN2, etc)
> to different APNs will be multiplexed over the same USB endpoints.  And

Not just multiple APNs each with different IP addressing... but QoS too.
Each WWAN bearer you create may have different QoS settings.  That
bearer may also have *both* IPv4 and IPv6 addresses.  We also may want
to put the interface that bearer is tied to into a different network
namespace or container or something like that, so that the process that
requested the specific QoS doesn't have to share it with other apps,
possibly malicious ones.  Can't do that if they are all on the same
kernel network interface, AFAIK.

Dan

> to make this not fly at all: Some of these may be IPv4, some IPv6 and
> some dual stack.  You cannot support that on a single netdev.  Mixing
> them all together on the host will not do.  When the driver sees an IPv6
> multicast packet (RS, ND, whatever) on the netdev, which MBIM session
> should it forward that packet too?
> 
> If I understand MBIM correctly, we will see device configurations with
> _one_ CDC data interface carrying all data traffic for multiple device
> functions.  A MBIM driver must de-multiplex this, and each IP function
> should be routed to its own netdev IMHO.
> 
> > It would be best if you could come up with some preliminary code
> > at least.
> 
> Yup.  Finally got a device.  Will start playing.
> 
> 
> Bjørn
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 4cd582a..56ef743 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -135,9 +135,6 @@  struct cdc_ncm_ctx {
 	u16 connected;
 };
 
-static void cdc_ncm_txpath_bh(unsigned long param);
-static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx);
-static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer);
 static struct usb_driver cdc_ncm_driver;
 
 static void
@@ -464,10 +461,6 @@  static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (ctx == NULL)
 		return -ENODEV;
 
-	hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	ctx->tx_timer.function = &cdc_ncm_tx_timer_cb;
-	ctx->bh.data = (unsigned long)ctx;
-	ctx->bh.func = cdc_ncm_txpath_bh;
 	atomic_set(&ctx->stop, 0);
 	spin_lock_init(&ctx->mtx);
 	ctx->netdev = dev->net;
@@ -650,7 +643,7 @@  static void cdc_ncm_zero_fill(u8 *ptr, u32 first, u32 end, u32 max)
 	memset(ptr + first, 0, end - first);
 }
 
-static struct sk_buff *
+static int
 cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 {
 	struct sk_buff *skb_out;
@@ -659,12 +652,7 @@  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 	u32 last_offset;
 	u16 n = 0, index;
 	u8 ready2send = 0;
-
-	/* if there is a remaining skb, it gets priority */
-	if (skb != NULL)
-		swap(skb, ctx->tx_rem_skb);
-	else
-		ready2send = 1;
+	u8 error = 0;
 
 	/*
 	 * +----------------+
@@ -690,7 +678,7 @@  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 				dev_kfree_skb_any(skb);
 				ctx->netdev->stats.tx_dropped++;
 			}
-			goto exit_no_skb;
+			return -EBUSY;
 		}
 
 		/* make room for NTH and NDP */
@@ -719,28 +707,15 @@  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 		/* compute maximum buffer size */
 		rem = ctx->tx_max - offset;
 
-		if (skb == NULL) {
-			skb = ctx->tx_rem_skb;
-			ctx->tx_rem_skb = NULL;
-
-			/* check for end of skb */
-			if (skb == NULL)
-				break;
-		}
-
 		if (skb->len > rem) {
 			if (n == 0) {
 				/* won't fit, MTU problem? */
 				dev_kfree_skb_any(skb);
 				skb = NULL;
 				ctx->netdev->stats.tx_dropped++;
+				error = 1;
 			} else {
-				/* no room for skb - store for later */
-				if (ctx->tx_rem_skb != NULL) {
-					dev_kfree_skb_any(ctx->tx_rem_skb);
-					ctx->netdev->stats.tx_dropped++;
-				}
-				ctx->tx_rem_skb = skb;
+
 				skb = NULL;
 				ready2send = 1;
 			}
@@ -768,13 +743,6 @@  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 		skb = NULL;
 	}
 
-	/* free up any dangling skb */
-	if (skb != NULL) {
-		dev_kfree_skb_any(skb);
-		skb = NULL;
-		ctx->netdev->stats.tx_dropped++;
-	}
-
 	ctx->tx_curr_frame_num = n;
 
 	if (n == 0) {
@@ -791,9 +759,7 @@  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 		ctx->tx_curr_skb = skb_out;
 		ctx->tx_curr_offset = offset;
 		ctx->tx_curr_last_offset = last_offset;
-		/* set the pending count */
-		if (n < CDC_NCM_RESTART_TIMER_DATAGRAM_CNT)
-			ctx->tx_timer_pending = CDC_NCM_TIMER_PENDING_CNT;
+
 		goto exit_no_skb;
 
 	} else {
@@ -874,71 +840,37 @@  cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb)
 	/* return skb */
 	ctx->tx_curr_skb = NULL;
 	ctx->netdev->stats.tx_packets += ctx->tx_curr_frame_num;
-	return skb_out;
 
-exit_no_skb:
-	/* Start timer, if there is a remaining skb */
-	if (ctx->tx_curr_skb != NULL)
-		cdc_ncm_tx_timeout_start(ctx);
-	return NULL;
-}
-
-static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx)
-{
-	/* start timer, if not already started */
-	if (!(hrtimer_active(&ctx->tx_timer) || atomic_read(&ctx->stop)))
-		hrtimer_start(&ctx->tx_timer,
-				ktime_set(0, CDC_NCM_TIMER_INTERVAL),
-				HRTIMER_MODE_REL);
-}
+	if (error)
+		return -EBUSY;
+	if (ready2send)
+		return -EBUSY;
+	else
+		return 0;
 
-static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *timer)
-{
-	struct cdc_ncm_ctx *ctx =
-			container_of(timer, struct cdc_ncm_ctx, tx_timer);
+exit_no_skb:
 
-	if (!atomic_read(&ctx->stop))
-		tasklet_schedule(&ctx->bh);
-	return HRTIMER_NORESTART;
+	return -EAGAIN;
 }
 
-static void cdc_ncm_txpath_bh(unsigned long param)
+static int cdc_ncm_tx_bundle(struct usbnet *dev, struct sk_buff *skb, gfp_t flags)
 {
-	struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)param;
-
-	spin_lock_bh(&ctx->mtx);
-	if (ctx->tx_timer_pending != 0) {
-		ctx->tx_timer_pending--;
-		cdc_ncm_tx_timeout_start(ctx);
-		spin_unlock_bh(&ctx->mtx);
-	} else if (ctx->netdev != NULL) {
-		spin_unlock_bh(&ctx->mtx);
-		netif_tx_lock_bh(ctx->netdev);
-		usbnet_start_xmit(NULL, ctx->netdev);
-		netif_tx_unlock_bh(ctx->netdev);
-	}
+	int err;
+	struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
+	
+	err = cdc_ncm_fill_tx_frame(ctx, skb);
+	return err;
 }
 
 static struct sk_buff *
 cdc_ncm_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags)
 {
-	struct sk_buff *skb_out;
 	struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
 
-	/*
-	 * The Ethernet API we are using does not support transmitting
-	 * multiple Ethernet frames in a single call. This driver will
-	 * accumulate multiple Ethernet frames and send out a larger
-	 * USB frame when the USB buffer is full or when a single jiffies
-	 * timeout happens.
-	 */
 	if (ctx == NULL)
 		goto error;
 
-	spin_lock_bh(&ctx->mtx);
-	skb_out = cdc_ncm_fill_tx_frame(ctx, skb);
-	spin_unlock_bh(&ctx->mtx);
-	return skb_out;
+	return ctx->tx_curr_skb;
 
 error:
 	if (skb != NULL)
@@ -1197,6 +1129,7 @@  static const struct driver_info cdc_ncm_info = {
 	.manage_power = cdc_ncm_manage_power,
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
+	.tx_bundle = cdc_ncm_tx_bundle,
 	.tx_fixup = cdc_ncm_tx_fixup,
 };
 
@@ -1211,6 +1144,7 @@  static const struct driver_info wwan_info = {
 	.manage_power = cdc_ncm_manage_power,
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
+	.tx_bundle = cdc_ncm_tx_bundle,
 	.tx_fixup = cdc_ncm_tx_fixup,
 };
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 8531c1c..d9a595e 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1024,6 +1024,7 @@  static void tx_complete (struct urb *urb)
 	struct skb_data		*entry = (struct skb_data *) skb->cb;
 	struct usbnet		*dev = entry->dev;
 
+	atomic_dec(&dev->tx_in_flight);
 	if (urb->status == 0) {
 		if (!(dev->driver_info->flags & FLAG_MULTI_PACKET))
 			dev->net->stats.tx_packets++;
@@ -1089,23 +1090,50 @@  netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 	struct urb		*urb = NULL;
 	struct skb_data		*entry;
 	struct driver_info	*info = dev->driver_info;
+	struct sk_buff		*skb_old = NULL;
 	unsigned long		flags;
 	int retval;
+	int transmit_now = 1;
+	int bundle_again = 0;
 
 	if (skb)
 		skb_tx_timestamp(skb);
 
+	/*
+	 * first we allow drivers to bundle packets together
+	 * maintainance of the buffer is the responsibility
+	 * of the lower layer
+	 */
+rebundle:
+	if (info->tx_bundle) {
+		bundle_again = 0;
+		retval = info->tx_bundle(dev, skb, GFP_ATOMIC);
+
+		switch (retval) {
+		case 0: /* the package has been bundled */
+			if (atomic_read(&dev->tx_in_flight) < 2)
+				transmit_now = 1;
+			else
+				transmit_now = 0;
+			break;
+		case -EAGAIN:
+			transmit_now = 1;
+			bundle_again = 1;
+			skb_old = skb;
+			break;
+		case -EBUSY:
+			transmit_now = 1;
+			break;
+		}
+	}
 	// some devices want funky USB-level framing, for
 	// win32 driver (usually) and/or hardware quirks
-	if (info->tx_fixup) {
+	if (transmit_now && info->tx_fixup) {
 		skb = info->tx_fixup (dev, skb, GFP_ATOMIC);
 		if (!skb) {
 			if (netif_msg_tx_err(dev)) {
 				netif_dbg(dev, tx_err, dev->net, "can't tx_fixup skb\n");
 				goto drop;
-			} else {
-				/* cdc_ncm collected packet; waits for more */
-				goto not_drop;
 			}
 		}
 	}
@@ -1164,14 +1192,17 @@  netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 	}
 #endif
 
+	atomic_inc(&dev->tx_in_flight);
 	switch ((retval = usb_submit_urb (urb, GFP_ATOMIC))) {
 	case -EPIPE:
 		netif_stop_queue (net);
 		usbnet_defer_kevent (dev, EVENT_TX_HALT);
+		atomic_dec(&dev->tx_in_flight);
 		usb_autopm_put_interface_async(dev->intf);
 		break;
 	default:
 		usb_autopm_put_interface_async(dev->intf);
+		atomic_dec(&dev->tx_in_flight);
 		netif_dbg(dev, tx_err, dev->net,
 			  "tx: submit urb err %d\n", retval);
 		break;
@@ -1187,7 +1218,6 @@  netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 		netif_dbg(dev, tx_err, dev->net, "drop, code %d\n", retval);
 drop:
 		dev->net->stats.tx_dropped++;
-not_drop:
 		if (skb)
 			dev_kfree_skb_any (skb);
 		usb_free_urb (urb);
@@ -1197,6 +1227,10 @@  not_drop:
 #ifdef CONFIG_PM
 deferred:
 #endif
+	if (bundle_again) {
+		skb = skb_old;
+		goto rebundle;
+	}
 	return NETDEV_TX_OK;
 }
 EXPORT_SYMBOL_GPL(usbnet_start_xmit);
@@ -1393,6 +1427,7 @@  usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	dev->delay.data = (unsigned long) dev;
 	init_timer (&dev->delay);
 	mutex_init (&dev->phy_mutex);
+	atomic_set(&dev->tx_in_flight, 0); 
 
 	dev->net = net;
 	strcpy (net->name, "usb%d");
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index f87cf62..bb2f622 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -33,6 +33,7 @@  struct usbnet {
 	wait_queue_head_t	*wait;
 	struct mutex		phy_mutex;
 	unsigned char		suspend_count;
+	atomic_t		tx_in_flight;
 
 	/* i/o info: pipes etc */
 	unsigned		in, out;
@@ -133,6 +134,12 @@  struct driver_info {
 	/* fixup rx packet (strip framing) */
 	int	(*rx_fixup)(struct usbnet *dev, struct sk_buff *skb);
 
+	/* bundle individual package for transmission as
+	 * larger package. This cannot sleep
+	 */
+	int	(*tx_bundle)(struct usbnet *dev,
+				struct sk_buff *skb, gfp_t flags);
+
 	/* fixup tx packet (add framing) */
 	struct sk_buff	*(*tx_fixup)(struct usbnet *dev,
 				struct sk_buff *skb, gfp_t flags);