diff mbox

[U-Boot,v3,8/8] ehci: Optimize qTD allocations

Message ID 1426447058.2236846.1344549255096.JavaMail.root@advansee.com
State Superseded
Delegated to: Marek Vasut
Headers show

Commit Message

Benoît Thébaudeau Aug. 9, 2012, 9:54 p.m. UTC
Relax the qTD transfer alignment constraints in order to need less qTDs for
buffers that are aligned to 512 bytes but not to pages.

Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
Cc: Marek Vasut <marex@denx.de>
Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
---
Changes for v2: N/A.
Changes for v3:
 - New patch.

 .../drivers/usb/host/ehci-hcd.c                    |   68 +++++++++++---------
 1 file changed, 38 insertions(+), 30 deletions(-)

Comments

Marek Vasut Aug. 9, 2012, 10:33 p.m. UTC | #1
Dear Benoît Thébaudeau,

> Relax the qTD transfer alignment constraints in order to need less qTDs for
> buffers that are aligned to 512 bytes but not to pages.
> 
> Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> Cc: Marek Vasut <marex@denx.de>
> Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> ---
> Changes for v2: N/A.
> Changes for v3:
>  - New patch.
[...]

I'll need to go through this patch one more time ... can you please just check 
my comments on 1/8? The rest are all right (but this, which I'll review tomorrow 
I hope).

Thanks a lot for your {code, time, contribution, patience with lousy usb 
maintainership}!

Best regards,
Marek Vasut
Marek Vasut Aug. 11, 2012, 11:41 p.m. UTC | #2
Dear Benoît Thébaudeau,

> Relax the qTD transfer alignment constraints in order to need less qTDs for
> buffers that are aligned to 512 bytes but not to pages.
> 
> Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> Cc: Marek Vasut <marex@denx.de>
> Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> ---
> Changes for v2: N/A.
> Changes for v3:
>  - New patch.
> 
>  .../drivers/usb/host/ehci-hcd.c                    |   68
> +++++++++++--------- 1 file changed, 38 insertions(+), 30 deletions(-)
> 
> diff --git u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
> u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c index 84c7d08..37517cb
> 100644
> --- u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
> +++ u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c
> @@ -215,7 +215,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, volatile struct qTD *vtd;
>  	unsigned long ts;
>  	uint32_t *tdp;
> -	uint32_t endpt, token, usbsts;
> +	uint32_t endpt, maxpacket, token, usbsts;
>  	uint32_t c, toggle;
>  	uint32_t cmd;
>  	int timeout;
> @@ -230,6 +230,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, le16_to_cpu(req->value), le16_to_cpu(req->value),
>  		      le16_to_cpu(req->index));
> 
> +#define PKT_ALIGN	512

Make this const int maybe ?

>  	/*
>  	 * The USB transfer is split into qTD transfers. Eeach qTD transfer is
>  	 * described by a transfer descriptor (the qTD). The qTDs form a linked
> @@ -251,43 +252,41 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, if (length > 0 || req == NULL) {
>  		/*
>  		 * Determine the qTD transfer size that will be used for the
> -		 * data payload (not considering the final qTD transfer, which
> -		 * may be shorter).
> +		 * data payload (not considering the first qTD transfer, which
> +		 * may be longer or shorter, and the final one, which may be
> +		 * shorter).
>  		 *
>  		 * In order to keep each packet within a qTD transfer, the qTD
> -		 * transfer size is aligned to EHCI_PAGE_SIZE, which is a
> -		 * multiple of wMaxPacketSize (except in some cases for
> -		 * interrupt transfers, see comment in submit_int_msg()).
> +		 * transfer size is aligned to PKT_ALIGN, which is a multiple of
> +		 * wMaxPacketSize (except in some cases for interrupt transfers,
> +		 * see comment in submit_int_msg()).
>  		 *
> -		 * By default, i.e. if the input buffer is page-aligned,
> +		 * By default, i.e. if the input buffer is aligned to PKT_ALIGN,
>  		 * QT_BUFFER_CNT full pages will be used.
>  		 */
>  		int xfr_sz = QT_BUFFER_CNT;
>  		/*
> -		 * However, if the input buffer is not page-aligned, the qTD
> -		 * transfer size will be one page shorter, and the first qTD
> +		 * However, if the input buffer is not aligned to PKT_ALIGN, the
> +		 * qTD transfer size will be one page shorter, and the first qTD
>  		 * data buffer of each transfer will be page-unaligned.
>  		 */
> -		if ((uint32_t)buffer & (EHCI_PAGE_SIZE - 1))
> +		if ((uint32_t)buffer & (PKT_ALIGN - 1))
>  			xfr_sz--;
>  		/* Convert the qTD transfer size to bytes. */
>  		xfr_sz *= EHCI_PAGE_SIZE;
>  		/*
> -		 * Determine the number of qTDs that will be required for the
> -		 * data payload. This value has to be rounded up since the final
> -		 * qTD transfer may be shorter than the regular qTD transfer
> -		 * size that has just been computed.
> +		 * Approximate by excess the number of qTDs that will be
> +		 * required for the data payload. The exact formula is way more
> +		 * complicated and saves at most 2 qTDs, i.e. a total of 128
> +		 * bytes.
>  		 */
> -		qtd_count += DIV_ROUND_UP(length, xfr_sz);
> -		/* ZLPs also need a qTD. */
> -		if (!qtd_count)
> -			qtd_count++;
> +		qtd_count += 2 + length / xfr_sz;
>  	}
>  /*
> - * Threshold value based on the worst-case total size of the qTDs to
> allocate - * for a mass-storage transfer of 65535 blocks of 512 bytes.
> + * Threshold value based on the worst-case total size of the allocated
> qTDs for + * a mass-storage transfer of 65535 blocks of 512 bytes.
>   */
> -#if CONFIG_SYS_MALLOC_LEN <= 128 * 1024
> +#if CONFIG_SYS_MALLOC_LEN <= 64 + 128 * 1024
>  #warning CONFIG_SYS_MALLOC_LEN may be too small for EHCI
>  #endif
>  	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> @@ -314,9 +313,10 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, qh->qh_link = cpu_to_hc32((uint32_t)qh_list |
> QH_LINK_TYPE_QH);
>  	c = (usb_pipespeed(pipe) != USB_SPEED_HIGH &&
>  	     usb_pipeendpoint(pipe) == 0);
> +	maxpacket = usb_maxpacket(dev, pipe);
>  	endpt = (8 << QH_ENDPT1_RL) |
>  	    (c << QH_ENDPT1_C) |
> -	    (usb_maxpacket(dev, pipe) << QH_ENDPT1_MAXPKTLEN) |
> +	    (maxpacket << QH_ENDPT1_MAXPKTLEN) |

Is this change really needed? (not that I care much).

[...]

Took me a bit to make it through, but I think I get it ... just real nits above.
Benoît Thébaudeau Aug. 12, 2012, 12:11 a.m. UTC | #3
Dear Marek Vasut,

> Dear Benoît Thébaudeau,
> 
> > Relax the qTD transfer alignment constraints in order to need less
> > qTDs for
> > buffers that are aligned to 512 bytes but not to pages.
> > 
> > Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> > Cc: Marek Vasut <marex@denx.de>
> > Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> > Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> > ---
> > Changes for v2: N/A.
> > Changes for v3:
> >  - New patch.
> > 
> >  .../drivers/usb/host/ehci-hcd.c                    |   68
> > +++++++++++--------- 1 file changed, 38 insertions(+), 30
> > deletions(-)
> > 
> > diff --git u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
> > u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c index
> > 84c7d08..37517cb
> > 100644
> > --- u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
> > +++ u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c
> > @@ -215,7 +215,7 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned long
> > pipe, void *buffer, volatile struct qTD *vtd;
> >  	unsigned long ts;
> >  	uint32_t *tdp;
> > -	uint32_t endpt, token, usbsts;
> > +	uint32_t endpt, maxpacket, token, usbsts;
> >  	uint32_t c, toggle;
> >  	uint32_t cmd;
> >  	int timeout;
> > @@ -230,6 +230,7 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned long
> > pipe, void *buffer, le16_to_cpu(req->value),
> > le16_to_cpu(req->value),
> >  		      le16_to_cpu(req->index));
> > 
> > +#define PKT_ALIGN	512
> 
> Make this const int maybe ?

Why? I don't see any need for this.

> >  	/*
> >  	 * The USB transfer is split into qTD transfers. Eeach qTD
> >  	 transfer is
> >  	 * described by a transfer descriptor (the qTD). The qTDs form a
> >  	 linked
> > @@ -251,43 +252,41 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned
> > long pipe, void *buffer, if (length > 0 || req == NULL) {
> >  		/*
> >  		 * Determine the qTD transfer size that will be used for the
> > -		 * data payload (not considering the final qTD transfer, which
> > -		 * may be shorter).
> > +		 * data payload (not considering the first qTD transfer, which
> > +		 * may be longer or shorter, and the final one, which may be
> > +		 * shorter).
> >  		 *
> >  		 * In order to keep each packet within a qTD transfer, the qTD
> > -		 * transfer size is aligned to EHCI_PAGE_SIZE, which is a
> > -		 * multiple of wMaxPacketSize (except in some cases for
> > -		 * interrupt transfers, see comment in submit_int_msg()).
> > +		 * transfer size is aligned to PKT_ALIGN, which is a multiple of
> > +		 * wMaxPacketSize (except in some cases for interrupt transfers,
> > +		 * see comment in submit_int_msg()).
> >  		 *
> > -		 * By default, i.e. if the input buffer is page-aligned,
> > +		 * By default, i.e. if the input buffer is aligned to PKT_ALIGN,
> >  		 * QT_BUFFER_CNT full pages will be used.
> >  		 */
> >  		int xfr_sz = QT_BUFFER_CNT;
> >  		/*
> > -		 * However, if the input buffer is not page-aligned, the qTD
> > -		 * transfer size will be one page shorter, and the first qTD
> > +		 * However, if the input buffer is not aligned to PKT_ALIGN, the
> > +		 * qTD transfer size will be one page shorter, and the first qTD
> >  		 * data buffer of each transfer will be page-unaligned.
> >  		 */
> > -		if ((uint32_t)buffer & (EHCI_PAGE_SIZE - 1))
> > +		if ((uint32_t)buffer & (PKT_ALIGN - 1))
> >  			xfr_sz--;
> >  		/* Convert the qTD transfer size to bytes. */
> >  		xfr_sz *= EHCI_PAGE_SIZE;
> >  		/*
> > -		 * Determine the number of qTDs that will be required for the
> > -		 * data payload. This value has to be rounded up since the final
> > -		 * qTD transfer may be shorter than the regular qTD transfer
> > -		 * size that has just been computed.
> > +		 * Approximate by excess the number of qTDs that will be
> > +		 * required for the data payload. The exact formula is way more
> > +		 * complicated and saves at most 2 qTDs, i.e. a total of 128
> > +		 * bytes.
> >  		 */
> > -		qtd_count += DIV_ROUND_UP(length, xfr_sz);
> > -		/* ZLPs also need a qTD. */
> > -		if (!qtd_count)
> > -			qtd_count++;
> > +		qtd_count += 2 + length / xfr_sz;
> >  	}
> >  /*
> > - * Threshold value based on the worst-case total size of the qTDs
> > to
> > allocate - * for a mass-storage transfer of 65535 blocks of 512
> > bytes.
> > + * Threshold value based on the worst-case total size of the
> > allocated
> > qTDs for + * a mass-storage transfer of 65535 blocks of 512 bytes.
> >   */
> > -#if CONFIG_SYS_MALLOC_LEN <= 128 * 1024
> > +#if CONFIG_SYS_MALLOC_LEN <= 64 + 128 * 1024
> >  #warning CONFIG_SYS_MALLOC_LEN may be too small for EHCI
> >  #endif
> >  	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> > @@ -314,9 +313,10 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned
> > long pipe, void *buffer, qh->qh_link =
> > cpu_to_hc32((uint32_t)qh_list |
> > QH_LINK_TYPE_QH);
> >  	c = (usb_pipespeed(pipe) != USB_SPEED_HIGH &&
> >  	     usb_pipeendpoint(pipe) == 0);
> > +	maxpacket = usb_maxpacket(dev, pipe);
> >  	endpt = (8 << QH_ENDPT1_RL) |
> >  	    (c << QH_ENDPT1_C) |
> > -	    (usb_maxpacket(dev, pipe) << QH_ENDPT1_MAXPKTLEN) |
> > +	    (maxpacket << QH_ENDPT1_MAXPKTLEN) |
> 
> Is this change really needed? (not that I care much).

It's here only to avoid calling the usb_maxpacket() function several times for
nothing since it is also called later in the patch.

> [...]
> 
> Took me a bit to make it through, but I think I get it ... just real
> nits above.

OK. Tell me if you have any question.

I don't think any change is needed, all the more you have already applied this
patch.

Best regards,
Benoît
Marek Vasut Aug. 12, 2012, 12:30 a.m. UTC | #4
Dear Benoît Thébaudeau,

> Dear Marek Vasut,
> 
> > Dear Benoît Thébaudeau,
> > 
> > > Relax the qTD transfer alignment constraints in order to need less
> > > qTDs for
> > > buffers that are aligned to 512 bytes but not to pages.
> > > 
> > > Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> > > Cc: Marek Vasut <marex@denx.de>
> > > Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> > > Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> > > ---
> > > Changes for v2: N/A.
> > > 
> > > Changes for v3:
> > >  - New patch.
> > >  
> > >  .../drivers/usb/host/ehci-hcd.c                    |   68
> > > 
> > > +++++++++++--------- 1 file changed, 38 insertions(+), 30
> > > deletions(-)
> > > 
> > > diff --git u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
> > > u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c index
> > > 84c7d08..37517cb
> > > 100644
> > > --- u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
> > > +++ u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c
> > > @@ -215,7 +215,7 @@ ehci_submit_async(struct usb_device *dev,
> > > unsigned long
> > > pipe, void *buffer, volatile struct qTD *vtd;
> > > 
> > >  	unsigned long ts;
> > >  	uint32_t *tdp;
> > > 
> > > -	uint32_t endpt, token, usbsts;
> > > +	uint32_t endpt, maxpacket, token, usbsts;
> > > 
> > >  	uint32_t c, toggle;
> > >  	uint32_t cmd;
> > >  	int timeout;
> > > 
> > > @@ -230,6 +230,7 @@ ehci_submit_async(struct usb_device *dev,
> > > unsigned long
> > > pipe, void *buffer, le16_to_cpu(req->value),
> > > le16_to_cpu(req->value),
> > > 
> > >  		      le16_to_cpu(req->index));
> > > 
> > > +#define PKT_ALIGN	512
> > 
> > Make this const int maybe ?
> 
> Why? I don't see any need for this.

Typecheck maybe, but it's not so important.

> > >  	/*
> > >  	
> > >  	 * The USB transfer is split into qTD transfers. Eeach qTD
> > >  	 transfer is
> > >  	 * described by a transfer descriptor (the qTD). The qTDs form a
> > >  	 linked
> > > 
> > > @@ -251,43 +252,41 @@ ehci_submit_async(struct usb_device *dev,
> > > unsigned
> > > long pipe, void *buffer, if (length > 0 || req == NULL) {
> > > 
> > >  		/*
> > >  		
> > >  		 * Determine the qTD transfer size that will be used for the
> > > 
> > > -		 * data payload (not considering the final qTD transfer, which
> > > -		 * may be shorter).
> > > +		 * data payload (not considering the first qTD transfer, which
> > > +		 * may be longer or shorter, and the final one, which may be
> > > +		 * shorter).
> > > 
> > >  		 *
> > >  		 * In order to keep each packet within a qTD transfer, the qTD
> > > 
> > > -		 * transfer size is aligned to EHCI_PAGE_SIZE, which is a
> > > -		 * multiple of wMaxPacketSize (except in some cases for
> > > -		 * interrupt transfers, see comment in submit_int_msg()).
> > > +		 * transfer size is aligned to PKT_ALIGN, which is a multiple of
> > > +		 * wMaxPacketSize (except in some cases for interrupt transfers,
> > > +		 * see comment in submit_int_msg()).
> > > 
> > >  		 *
> > > 
> > > -		 * By default, i.e. if the input buffer is page-aligned,
> > > +		 * By default, i.e. if the input buffer is aligned to PKT_ALIGN,
> > > 
> > >  		 * QT_BUFFER_CNT full pages will be used.
> > >  		 */
> > >  		
> > >  		int xfr_sz = QT_BUFFER_CNT;
> > >  		/*
> > > 
> > > -		 * However, if the input buffer is not page-aligned, the qTD
> > > -		 * transfer size will be one page shorter, and the first qTD
> > > +		 * However, if the input buffer is not aligned to PKT_ALIGN, the
> > > +		 * qTD transfer size will be one page shorter, and the first qTD
> > > 
> > >  		 * data buffer of each transfer will be page-unaligned.
> > >  		 */
> > > 
> > > -		if ((uint32_t)buffer & (EHCI_PAGE_SIZE - 1))
> > > +		if ((uint32_t)buffer & (PKT_ALIGN - 1))
> > > 
> > >  			xfr_sz--;
> > >  		
> > >  		/* Convert the qTD transfer size to bytes. */
> > >  		xfr_sz *= EHCI_PAGE_SIZE;
> > >  		/*
> > > 
> > > -		 * Determine the number of qTDs that will be required for the
> > > -		 * data payload. This value has to be rounded up since the final
> > > -		 * qTD transfer may be shorter than the regular qTD transfer
> > > -		 * size that has just been computed.
> > > +		 * Approximate by excess the number of qTDs that will be
> > > +		 * required for the data payload. The exact formula is way more
> > > +		 * complicated and saves at most 2 qTDs, i.e. a total of 128
> > > +		 * bytes.
> > > 
> > >  		 */
> > > 
> > > -		qtd_count += DIV_ROUND_UP(length, xfr_sz);
> > > -		/* ZLPs also need a qTD. */
> > > -		if (!qtd_count)
> > > -			qtd_count++;
> > > +		qtd_count += 2 + length / xfr_sz;
> > > 
> > >  	}
> > >  
> > >  /*
> > > 
> > > - * Threshold value based on the worst-case total size of the qTDs
> > > to
> > > allocate - * for a mass-storage transfer of 65535 blocks of 512
> > > bytes.
> > > + * Threshold value based on the worst-case total size of the
> > > allocated
> > > qTDs for + * a mass-storage transfer of 65535 blocks of 512 bytes.
> > > 
> > >   */
> > > 
> > > -#if CONFIG_SYS_MALLOC_LEN <= 128 * 1024
> > > +#if CONFIG_SYS_MALLOC_LEN <= 64 + 128 * 1024
> > > 
> > >  #warning CONFIG_SYS_MALLOC_LEN may be too small for EHCI
> > >  #endif
> > >  
> > >  	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> > > 
> > > @@ -314,9 +313,10 @@ ehci_submit_async(struct usb_device *dev,
> > > unsigned
> > > long pipe, void *buffer, qh->qh_link =
> > > cpu_to_hc32((uint32_t)qh_list |
> > > QH_LINK_TYPE_QH);
> > > 
> > >  	c = (usb_pipespeed(pipe) != USB_SPEED_HIGH &&
> > >  	
> > >  	     usb_pipeendpoint(pipe) == 0);
> > > 
> > > +	maxpacket = usb_maxpacket(dev, pipe);
> > > 
> > >  	endpt = (8 << QH_ENDPT1_RL) |
> > >  	
> > >  	    (c << QH_ENDPT1_C) |
> > > 
> > > -	    (usb_maxpacket(dev, pipe) << QH_ENDPT1_MAXPKTLEN) |
> > > +	    (maxpacket << QH_ENDPT1_MAXPKTLEN) |
> > 
> > Is this change really needed? (not that I care much).
> 
> It's here only to avoid calling the usb_maxpacket() function several times
> for nothing since it is also called later in the patch.

Ah ok.

> > [...]
> > 
> > Took me a bit to make it through, but I think I get it ... just real
> > nits above.
> 
> OK. Tell me if you have any question.
> 
> I don't think any change is needed, all the more you have already applied
> this patch.

I did? Heh ... must have been a mistake, but all right, I don't see much trouble 
with this one anyway :)

Well then we're done here ... thanks for your patches!

> Best regards,
> Benoît
diff mbox

Patch

diff --git u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c
index 84c7d08..37517cb 100644
--- u-boot-usb-8d5fb14.orig/drivers/usb/host/ehci-hcd.c
+++ u-boot-usb-8d5fb14/drivers/usb/host/ehci-hcd.c
@@ -215,7 +215,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	volatile struct qTD *vtd;
 	unsigned long ts;
 	uint32_t *tdp;
-	uint32_t endpt, token, usbsts;
+	uint32_t endpt, maxpacket, token, usbsts;
 	uint32_t c, toggle;
 	uint32_t cmd;
 	int timeout;
@@ -230,6 +230,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		      le16_to_cpu(req->value), le16_to_cpu(req->value),
 		      le16_to_cpu(req->index));
 
+#define PKT_ALIGN	512
 	/*
 	 * The USB transfer is split into qTD transfers. Eeach qTD transfer is
 	 * described by a transfer descriptor (the qTD). The qTDs form a linked
@@ -251,43 +252,41 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	if (length > 0 || req == NULL) {
 		/*
 		 * Determine the qTD transfer size that will be used for the
-		 * data payload (not considering the final qTD transfer, which
-		 * may be shorter).
+		 * data payload (not considering the first qTD transfer, which
+		 * may be longer or shorter, and the final one, which may be
+		 * shorter).
 		 *
 		 * In order to keep each packet within a qTD transfer, the qTD
-		 * transfer size is aligned to EHCI_PAGE_SIZE, which is a
-		 * multiple of wMaxPacketSize (except in some cases for
-		 * interrupt transfers, see comment in submit_int_msg()).
+		 * transfer size is aligned to PKT_ALIGN, which is a multiple of
+		 * wMaxPacketSize (except in some cases for interrupt transfers,
+		 * see comment in submit_int_msg()).
 		 *
-		 * By default, i.e. if the input buffer is page-aligned,
+		 * By default, i.e. if the input buffer is aligned to PKT_ALIGN,
 		 * QT_BUFFER_CNT full pages will be used.
 		 */
 		int xfr_sz = QT_BUFFER_CNT;
 		/*
-		 * However, if the input buffer is not page-aligned, the qTD
-		 * transfer size will be one page shorter, and the first qTD
+		 * However, if the input buffer is not aligned to PKT_ALIGN, the
+		 * qTD transfer size will be one page shorter, and the first qTD
 		 * data buffer of each transfer will be page-unaligned.
 		 */
-		if ((uint32_t)buffer & (EHCI_PAGE_SIZE - 1))
+		if ((uint32_t)buffer & (PKT_ALIGN - 1))
 			xfr_sz--;
 		/* Convert the qTD transfer size to bytes. */
 		xfr_sz *= EHCI_PAGE_SIZE;
 		/*
-		 * Determine the number of qTDs that will be required for the
-		 * data payload. This value has to be rounded up since the final
-		 * qTD transfer may be shorter than the regular qTD transfer
-		 * size that has just been computed.
+		 * Approximate by excess the number of qTDs that will be
+		 * required for the data payload. The exact formula is way more
+		 * complicated and saves at most 2 qTDs, i.e. a total of 128
+		 * bytes.
 		 */
-		qtd_count += DIV_ROUND_UP(length, xfr_sz);
-		/* ZLPs also need a qTD. */
-		if (!qtd_count)
-			qtd_count++;
+		qtd_count += 2 + length / xfr_sz;
 	}
 /*
- * Threshold value based on the worst-case total size of the qTDs to allocate
- * for a mass-storage transfer of 65535 blocks of 512 bytes.
+ * Threshold value based on the worst-case total size of the allocated qTDs for
+ * a mass-storage transfer of 65535 blocks of 512 bytes.
  */
-#if CONFIG_SYS_MALLOC_LEN <= 128 * 1024
+#if CONFIG_SYS_MALLOC_LEN <= 64 + 128 * 1024
 #warning CONFIG_SYS_MALLOC_LEN may be too small for EHCI
 #endif
 	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
@@ -314,9 +313,10 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	qh->qh_link = cpu_to_hc32((uint32_t)qh_list | QH_LINK_TYPE_QH);
 	c = (usb_pipespeed(pipe) != USB_SPEED_HIGH &&
 	     usb_pipeendpoint(pipe) == 0);
+	maxpacket = usb_maxpacket(dev, pipe);
 	endpt = (8 << QH_ENDPT1_RL) |
 	    (c << QH_ENDPT1_C) |
-	    (usb_maxpacket(dev, pipe) << QH_ENDPT1_MAXPKTLEN) |
+	    (maxpacket << QH_ENDPT1_MAXPKTLEN) |
 	    (0 << QH_ENDPT1_H) |
 	    (QH_ENDPT1_DTC_DT_FROM_QTD << QH_ENDPT1_DTC) |
 	    (usb_pipespeed(pipe) << QH_ENDPT1_EPS) |
@@ -362,6 +362,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	}
 
 	if (length > 0 || req == NULL) {
+		uint32_t qtd_toggle = toggle;
 		uint8_t *buf_ptr = buffer;
 		int left_length = length;
 
@@ -379,9 +380,9 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 			xfr_bytes -= (uint32_t)buf_ptr & (EHCI_PAGE_SIZE - 1);
 			/*
 			 * In order to keep each packet within a qTD transfer,
-			 * align the qTD transfer size to EHCI_PAGE_SIZE.
+			 * align the qTD transfer size to PKT_ALIGN.
 			 */
-			xfr_bytes &= ~(EHCI_PAGE_SIZE - 1);
+			xfr_bytes &= ~(PKT_ALIGN - 1);
 			/*
 			 * This transfer may be shorter than the available qTD
 			 * transfer size that has just been computed.
@@ -401,7 +402,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 					cpu_to_hc32(QT_NEXT_TERMINATE);
 			qtd[qtd_counter].qt_altnext =
 					cpu_to_hc32(QT_NEXT_TERMINATE);
-			token = (toggle << QT_TOKEN_DT) |
+			token = (qtd_toggle << QT_TOKEN_DT) |
 			    (xfr_bytes << QT_TOKEN_TOTALBYTES) |
 			    ((req == NULL) << QT_TOKEN_IOC) |
 			    (0 << QT_TOKEN_CPAGE) |
@@ -418,6 +419,13 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 			/* Update previous qTD! */
 			*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
 			tdp = &qtd[qtd_counter++].qt_next;
+			/*
+			 * Data toggle has to be adjusted since the qTD transfer
+			 * size is not always an even multiple of
+			 * wMaxPacketSize.
+			 */
+			if ((xfr_bytes / maxpacket) & 1)
+				qtd_toggle ^= 1;
 			buf_ptr += xfr_bytes;
 			left_length -= xfr_bytes;
 		} while (left_length > 0);
@@ -944,11 +952,11 @@  submit_int_msg(struct usb_device *dev, unsigned long pipe, void *buffer,
 	 * because bInterval is ignored.
 	 *
 	 * Also, ehci_submit_async() relies on wMaxPacketSize being a power of 2
-	 * if several qTDs are required, while the USB specification does not
-	 * constrain this for interrupt transfers. That means that
-	 * ehci_submit_async() would support interrupt transfers requiring
-	 * several transactions only as long as the transfer size does not
-	 * require more than a single qTD.
+	 * <= PKT_ALIGN if several qTDs are required, while the USB
+	 * specification does not constrain this for interrupt transfers. That
+	 * means that ehci_submit_async() would support interrupt transfers
+	 * requiring several transactions only as long as the transfer size does
+	 * not require more than a single qTD.
 	 */
 	if (length > usb_maxpacket(dev, pipe)) {
 		printf("%s: Interrupt transfers requiring several transactions "