Patchwork [U-Boot,v2,2/5] ehci-hcd: Boost transfer speed

login
register
mail settings
Submitter Benoît Thébaudeau
Date July 20, 2012, 11:26 a.m.
Message ID <1024949320.326453.1342783584348.JavaMail.root@advansee.com>
Download mbox | patch
Permalink /patch/172204/
State Superseded
Headers show

Comments

Benoît Thébaudeau - July 20, 2012, 11:26 a.m.
This patch takes advantage of the hardware EHCI qTD queuing mechanism to avoid
software overhead and to make transfers as fast as possible.

The only drawback is a call to memalign. However, this is fast compared to the
transfer timings, and the heap size to allocate is small, e.g. a little bit more
than 100 kB for a transfer length of 65535 packets of 512 bytes.

Tested on i.MX25 and i.MX35. In my test conditions, the speedup was about 15x
using page-aligned buffers, which is really appreciable when accessing large
files.

Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
Cc: Marek Vasut <marex@denx.de>
Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
---
Changes for v2:
 - Use DIV_ROUND_UP to make code more readable.

 .../drivers/usb/host/ehci-hcd.c                    |   92 ++++++++++++++------
 1 file changed, 63 insertions(+), 29 deletions(-)
Stefan Herbrechtsmeier - July 20, 2012, 11:37 a.m.
Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> This patch takes advantage of the hardware EHCI qTD queuing mechanism to avoid
> software overhead and to make transfers as fast as possible.
>
> The only drawback is a call to memalign. However, this is fast compared to the
> transfer timings, and the heap size to allocate is small, e.g. a little bit more
> than 100 kB for a transfer length of 65535 packets of 512 bytes.
>
> Tested on i.MX25 and i.MX35. In my test conditions, the speedup was about 15x
> using page-aligned buffers, which is really appreciable when accessing large
> files.
>
> Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> Cc: Marek Vasut <marex@denx.de>
> Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> ---
> Changes for v2:
>   - Use DIV_ROUND_UP to make code more readable.
>
>   .../drivers/usb/host/ehci-hcd.c                    |   92 ++++++++++++++------
>   1 file changed, 63 insertions(+), 29 deletions(-)
>
> diff --git u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
> index 5b3b906..cf9ab92 100644
> --- u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
> +++ u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
> @@ -208,7 +208,8 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
>   		   int length, struct devrequest *req)
>   {
>   	ALLOC_ALIGN_BUFFER(struct QH, qh, 1, USB_DMA_MINALIGN);
> -	ALLOC_ALIGN_BUFFER(struct qTD, qtd, 3, USB_DMA_MINALIGN);
> +	struct qTD *qtd;
> +	int qtd_count = 0;
>   	int qtd_counter = 0;
>   
>   	volatile struct qTD *vtd;
> @@ -229,8 +230,23 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
>   		      le16_to_cpu(req->value), le16_to_cpu(req->value),
>   		      le16_to_cpu(req->index));
>   
> +	if (req != NULL)			/* SETUP + ACK */
> +		qtd_count += 1 + 1;
> +	if (length > 0 || req == NULL) {	/* buffer */
> +		if ((uint32_t)buffer & 4095)		/* page-unaligned */
> +			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> +					length, (QT_BUFFER_CNT - 1) * 4096);
> +		else					/* page-aligned */
> +			qtd_count += DIV_ROUND_UP(length, QT_BUFFER_CNT * 4096);
> +	}
> +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> +	if (qtd == NULL) {
> +		printf("unable to allocate TDs\n");
> +		return -1;
> +	}
> +
>   	memset(qh, 0, sizeof(struct QH));
> -	memset(qtd, 0, 3 * sizeof(*qtd));
> +	memset(qtd, 0, qtd_count * sizeof(*qtd));
>   
>   	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
>   
> @@ -291,31 +307,46 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
>   	}
>   
>   	if (length > 0 || req == NULL) {
> -		/*
> -		 * Setup request qTD (3.5 in ehci-r10.pdf)
> -		 *
> -		 *   qt_next ................ 03-00 H
> -		 *   qt_altnext ............. 07-04 H
> -		 *   qt_token ............... 0B-08 H
> -		 *
> -		 *   [ buffer, buffer_hi ] loaded with "buffer".
> -		 */
> -		qtd[qtd_counter].qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
> -		qtd[qtd_counter].qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
> -		token = (toggle << 31) |
> -		    (length << 16) |
> -		    ((req == NULL ? 1 : 0) << 15) |
> -		    (0 << 12) |
> -		    (3 << 10) |
> -		    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> -		qtd[qtd_counter].qt_token = cpu_to_hc32(token);
> -		if (ehci_td_buffer(&qtd[qtd_counter], buffer, length) != 0) {
> -			printf("unable construct DATA td\n");
> -			goto fail;
> -		}
> -		/* Update previous qTD! */
> -		*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
> -		tdp = &qtd[qtd_counter++].qt_next;
> +		uint8_t *buf_ptr = buffer;
> +		int left_length = length;
> +
> +		do {
> +			int xfr_bytes = min(left_length,
> +					    (QT_BUFFER_CNT * 4096 -
> +					     ((uint32_t)buf_ptr & 4095)) &
> +					    ~4095);
Why you align the length to 4096?
> +
> +			/*
> +			 * Setup request qTD (3.5 in ehci-r10.pdf)
> +			 *
> +			 *   qt_next ................ 03-00 H
> +			 *   qt_altnext ............. 07-04 H
> +			 *   qt_token ............... 0B-08 H
> +			 *
> +			 *   [ buffer, buffer_hi ] loaded with "buffer".
> +			 */
> +			qtd[qtd_counter].qt_next =
> +					cpu_to_hc32(QT_NEXT_TERMINATE);
> +			qtd[qtd_counter].qt_altnext =
> +					cpu_to_hc32(QT_NEXT_TERMINATE);
> +			token = (toggle << 31) |
> +			    (xfr_bytes << 16) |
> +			    ((req == NULL ? 1 : 0) << 15) |
> +			    (0 << 12) |
> +			    (3 << 10) |
> +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> +			qtd[qtd_counter].qt_token = cpu_to_hc32(token);
> +			if (ehci_td_buffer(&qtd[qtd_counter], buf_ptr,
> +						xfr_bytes) != 0) {
> +				printf("unable construct DATA td\n");
> +				goto fail;
> +			}
> +			/* Update previous qTD! */
> +			*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
> +			tdp = &qtd[qtd_counter++].qt_next;
> +			buf_ptr += xfr_bytes;
> +			left_length -= xfr_bytes;
> +		} while (left_length > 0);
>   	}
>   
>   	if (req != NULL) {
> @@ -346,7 +377,8 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
>   	flush_dcache_range((uint32_t)qh_list,
>   		ALIGN_END_ADDR(struct QH, qh_list, 1));
>   	flush_dcache_range((uint32_t)qh, ALIGN_END_ADDR(struct QH, qh, 1));
> -	flush_dcache_range((uint32_t)qtd, ALIGN_END_ADDR(struct qTD, qtd, 3));
> +	flush_dcache_range((uint32_t)qtd,
> +			   ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
>   
>   	/* Set async. queue head pointer. */
>   	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
> @@ -377,7 +409,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
>   		invalidate_dcache_range((uint32_t)qh,
>   			ALIGN_END_ADDR(struct QH, qh, 1));
>   		invalidate_dcache_range((uint32_t)qtd,
> -			ALIGN_END_ADDR(struct qTD, qtd, 3));
> +			ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
>   
>   		token = hc32_to_cpu(vtd->qt_token);
>   		if (!(token & 0x80))
> @@ -450,9 +482,11 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
>   		      ehci_readl(&hcor->or_portsc[1]));
>   	}
>   
> +	free(qtd);
>   	return (dev->status != USB_ST_NOT_PROC) ? 0 : -1;
>   
>   fail:
> +	free(qtd);
>   	return -1;
>   }
>
Benoît Thébaudeau - July 20, 2012, 1:17 p.m.
On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> > +			int xfr_bytes = min(left_length,
> > +					    (QT_BUFFER_CNT * 4096 -
> > +					     ((uint32_t)buf_ptr & 4095)) &
> > +					    ~4095);
> Why you align the length to 4096?

It's to guarantee that each transfer length is a multiple of the max packet
length. Otherwise, early short packets are issued, which breaks the transfer and
results in time-out error messages.

Regards,
Benoît
Marek Vasut - July 20, 2012, 1:44 p.m.
Dear Benoît Thébaudeau,

> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
> > Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> > > +			int xfr_bytes = min(left_length,
> > > +					    (QT_BUFFER_CNT * 4096 -
> > > +					     ((uint32_t)buf_ptr & 4095)) &
> > > +					    ~4095);
> > 
> > Why you align the length to 4096?
> 
> It's to guarantee that each transfer length is a multiple of the max packet
> length. Otherwise, early short packets are issued, which breaks the
> transfer and results in time-out error messages.

Early short packets ? What do you mean?

> Regards,
> Benoît

Best regards,
Marek Vasut
Benoît Thébaudeau - July 20, 2012, 1:56 p.m.
Dear Marek Vasut,

On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
> > On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
> > > Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> > > > +			int xfr_bytes = min(left_length,
> > > > +					    (QT_BUFFER_CNT * 4096 -
> > > > +					     ((uint32_t)buf_ptr & 4095)) &
> > > > +					    ~4095);
> > > 
> > > Why you align the length to 4096?
> > 
> > It's to guarantee that each transfer length is a multiple of the
> > max packet
> > length. Otherwise, early short packets are issued, which breaks the
> > transfer and results in time-out error messages.
> 
> Early short packets ? What do you mean?

During a USB transfer, all packets must have a length of max packet length for
the pipe/endpoint, except the final one that can be a short packet. Without the
alignment I make for xfr_bytes, short packets can occur within a transfer,
because the hardware starts a new packet for each new queued qTD it handles.

Best regards,
Benoît
Stefan Herbrechtsmeier - July 20, 2012, 2:51 p.m.
Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
> Dear Marek Vasut,
>
> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
>>>>> +			int xfr_bytes = min(left_length,
>>>>> +					    (QT_BUFFER_CNT * 4096 -
>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
>>>>> +					    ~4095);
>>>> Why you align the length to 4096?
>>> It's to guarantee that each transfer length is a multiple of the
>>> max packet
>>> length. Otherwise, early short packets are issued, which breaks the
>>> transfer and results in time-out error messages.
>> Early short packets ? What do you mean?
> During a USB transfer, all packets must have a length of max packet length for
> the pipe/endpoint, except the final one that can be a short packet. Without the
> alignment I make for xfr_bytes, short packets can occur within a transfer,
> because the hardware starts a new packet for each new queued qTD it handles.
But if I am right, the max packet length is 512 for bulk and 1024 for 
Interrupt transfer.

Regards,
     Stefan
Benoît Thébaudeau - July 20, 2012, 3:03 p.m.
On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
> > Dear Marek Vasut,
> >
> > On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
> >>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
> >>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> >>>>> +			int xfr_bytes = min(left_length,
> >>>>> +					    (QT_BUFFER_CNT * 4096 -
> >>>>> +					     ((uint32_t)buf_ptr & 4095)) &
> >>>>> +					    ~4095);
> >>>> Why you align the length to 4096?
> >>> It's to guarantee that each transfer length is a multiple of the
> >>> max packet
> >>> length. Otherwise, early short packets are issued, which breaks
> >>> the
> >>> transfer and results in time-out error messages.
> >> Early short packets ? What do you mean?
> > During a USB transfer, all packets must have a length of max packet
> > length for
> > the pipe/endpoint, except the final one that can be a short packet.
> > Without the
> > alignment I make for xfr_bytes, short packets can occur within a
> > transfer,
> > because the hardware starts a new packet for each new queued qTD it
> > handles.
> But if I am right, the max packet length is 512 for bulk and 1024 for
> Interrupt transfer.

There are indeed different max packet lengths for different transfer types, but
it does not matter since the chosen alignment guarantees a multiple of all these
possible max packet lengths.

Best regards,
Benoît
Stefan Herbrechtsmeier - July 20, 2012, 3:15 p.m.
Am 20.07.2012 17:03, schrieb Benoît Thébaudeau:
> On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
>> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
>>> Dear Marek Vasut,
>>>
>>> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
>>>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
>>>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
>>>>>>> +			int xfr_bytes = min(left_length,
>>>>>>> +					    (QT_BUFFER_CNT * 4096 -
>>>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
>>>>>>> +					    ~4095);
>>>>>> Why you align the length to 4096?
>>>>> It's to guarantee that each transfer length is a multiple of the
>>>>> max packet
>>>>> length. Otherwise, early short packets are issued, which breaks
>>>>> the
>>>>> transfer and results in time-out error messages.
>>>> Early short packets ? What do you mean?
>>> During a USB transfer, all packets must have a length of max packet
>>> length for
>>> the pipe/endpoint, except the final one that can be a short packet.
>>> Without the
>>> alignment I make for xfr_bytes, short packets can occur within a
>>> transfer,
>>> because the hardware starts a new packet for each new queued qTD it
>>> handles.
>> But if I am right, the max packet length is 512 for bulk and 1024 for
>> Interrupt transfer.
> There are indeed different max packet lengths for different transfer types, but
> it does not matter since the chosen alignment guarantees a multiple of all these
> possible max packet lengths.
But thereby you limit the transfer to 4 qT buffers for unaligned transfers.

Best regards,
     Stefan
Benoît Thébaudeau - July 20, 2012, 3:35 p.m.
On Friday 20 July 2012 17:15:13 Stefan Herbrechtsmeier wrote:
> Am 20.07.2012 17:03, schrieb Benoît Thébaudeau:
> > On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
> >> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
> >>> Dear Marek Vasut,
> >>>
> >>> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
> >>>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
> >>>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> >>>>>>> +			int xfr_bytes = min(left_length,
> >>>>>>> +					    (QT_BUFFER_CNT * 4096 -
> >>>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
> >>>>>>> +					    ~4095);
> >>>>>> Why you align the length to 4096?
> >>>>> It's to guarantee that each transfer length is a multiple of
> >>>>> the
> >>>>> max packet
> >>>>> length. Otherwise, early short packets are issued, which breaks
> >>>>> the
> >>>>> transfer and results in time-out error messages.
> >>>> Early short packets ? What do you mean?
> >>> During a USB transfer, all packets must have a length of max
> >>> packet
> >>> length for
> >>> the pipe/endpoint, except the final one that can be a short
> >>> packet.
> >>> Without the
> >>> alignment I make for xfr_bytes, short packets can occur within a
> >>> transfer,
> >>> because the hardware starts a new packet for each new queued qTD
> >>> it
> >>> handles.
> >> But if I am right, the max packet length is 512 for bulk and 1024
> >> for
> >> Interrupt transfer.
> > There are indeed different max packet lengths for different
> > transfer types, but
> > it does not matter since the chosen alignment guarantees a multiple
> > of all these
> > possible max packet lengths.
> But thereby you limit the transfer to 4 qT buffers for unaligned
> transfers.

Not exactly. The 5 qt_buffers are used for page-unaligned buffers, but that
results in only 4 full pages of unaligned data, requiring 5 aligned pages.

For page-aligned buffers, the 5 qt_buffers result in 5 full pages of aligned
data.

The unaligned case could be a little bit improved to always use as many packets
as possible per qTD, but that would over-complicate things for a very negligible
speed and memory gain.

Best regards,
Benoît
Stefan Herbrechtsmeier - July 23, 2012, 1:35 p.m.
Am 20.07.2012 17:35, schrieb Benoît Thébaudeau:
> On Friday 20 July 2012 17:15:13 Stefan Herbrechtsmeier wrote:
>> Am 20.07.2012 17:03, schrieb Benoît Thébaudeau:
>>> On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
>>>> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
>>>>> Dear Marek Vasut,
>>>>>
>>>>> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
>>>>>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
>>>>>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
>>>>>>>>> +			int xfr_bytes = min(left_length,
>>>>>>>>> +					    (QT_BUFFER_CNT * 4096 -
>>>>>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
>>>>>>>>> +					    ~4095);
>>>>>>>> Why you align the length to 4096?
>>>>>>> It's to guarantee that each transfer length is a multiple of
>>>>>>> the
>>>>>>> max packet
>>>>>>> length. Otherwise, early short packets are issued, which breaks
>>>>>>> the
>>>>>>> transfer and results in time-out error messages.
>>>>>> Early short packets ? What do you mean?
>>>>> During a USB transfer, all packets must have a length of max
>>>>> packet
>>>>> length for
>>>>> the pipe/endpoint, except the final one that can be a short
>>>>> packet.
>>>>> Without the
>>>>> alignment I make for xfr_bytes, short packets can occur within a
>>>>> transfer,
>>>>> because the hardware starts a new packet for each new queued qTD
>>>>> it
>>>>> handles.
>>>> But if I am right, the max packet length is 512 for bulk and 1024
>>>> for
>>>> Interrupt transfer.
>>> There are indeed different max packet lengths for different
>>> transfer types, but
>>> it does not matter since the chosen alignment guarantees a multiple
>>> of all these
>>> possible max packet lengths.
>> But thereby you limit the transfer to 4 qT buffers for unaligned
>> transfers.
> Not exactly. The 5 qt_buffers are used for page-unaligned buffers, but that
> results in only 4 full pages of unaligned data, requiring 5 aligned pages.
Sorry I mean 4 full pages of unaligned data.
>
> For page-aligned buffers, the 5 qt_buffers result in 5 full pages of aligned
> data.
Sure.
>
> The unaligned case could be a little bit improved to always use as many packets
> as possible per qTD, but that would over-complicate things for a very negligible
> speed and memory gain.
In my use case (fragmented file on usb storage)  the gain would be 
nearly 20%. The reason is that the data are block aligned (512) and 
could be aligned to 4096 with the first transfer (5 qt_buffers).

My suggestion would be to truncate the xfr_bytes with the max 
wMaxPacketSize (1024) and for the qtd_count use:

if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
     qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
             length, (QT_BUFFER_CNT - 1) * 4096);
else                /* wMaxPacketSize aligned */
     qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
             length, QT_BUFFER_CNT * 4096);

This allows 50% of unaligned block data (512) to be transferred with min 
qTDs.
Benoît Thébaudeau - July 23, 2012, 5:15 p.m.
On Monday 23 July 2012 15:35:25 Stefan Herbrechtsmeier wrote:
> Am 20.07.2012 17:35, schrieb Benoît Thébaudeau:
> > On Friday 20 July 2012 17:15:13 Stefan Herbrechtsmeier wrote:
> >> Am 20.07.2012 17:03, schrieb Benoît Thébaudeau:
> >>> On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
> >>>> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
> >>>>> Dear Marek Vasut,
> >>>>>
> >>>>> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
> >>>>>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
> >>>>>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> >>>>>>>>> +			int xfr_bytes = min(left_length,
> >>>>>>>>> +					    (QT_BUFFER_CNT * 4096 -
> >>>>>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
> >>>>>>>>> +					    ~4095);
> >>>>>>>> Why you align the length to 4096?
> >>>>>>> It's to guarantee that each transfer length is a multiple of
> >>>>>>> the
> >>>>>>> max packet
> >>>>>>> length. Otherwise, early short packets are issued, which
> >>>>>>> breaks
> >>>>>>> the
> >>>>>>> transfer and results in time-out error messages.
> >>>>>> Early short packets ? What do you mean?
> >>>>> During a USB transfer, all packets must have a length of max
> >>>>> packet
> >>>>> length for
> >>>>> the pipe/endpoint, except the final one that can be a short
> >>>>> packet.
> >>>>> Without the
> >>>>> alignment I make for xfr_bytes, short packets can occur within
> >>>>> a
> >>>>> transfer,
> >>>>> because the hardware starts a new packet for each new queued
> >>>>> qTD
> >>>>> it
> >>>>> handles.
> >>>> But if I am right, the max packet length is 512 for bulk and
> >>>> 1024
> >>>> for
> >>>> Interrupt transfer.
> >>> There are indeed different max packet lengths for different
> >>> transfer types, but
> >>> it does not matter since the chosen alignment guarantees a
> >>> multiple
> >>> of all these
> >>> possible max packet lengths.
> >> But thereby you limit the transfer to 4 qT buffers for unaligned
> >> transfers.
> > Not exactly. The 5 qt_buffers are used for page-unaligned buffers,
> > but that
> > results in only 4 full pages of unaligned data, requiring 5 aligned
> > pages.
> Sorry I mean 4 full pages of unaligned data.
> >
> > For page-aligned buffers, the 5 qt_buffers result in 5 full pages
> > of aligned
> > data.
> Sure.
> >
> > The unaligned case could be a little bit improved to always use as
> > many packets
> > as possible per qTD, but that would over-complicate things for a
> > very negligible
> > speed and memory gain.
> In my use case (fragmented file on usb storage)  the gain would be
> nearly 20%. The reason is that the data are block aligned (512) and
> could be aligned to 4096 with the first transfer (5 qt_buffers).

Can you explain where this gain would come from? In both cases, the data in USB
transfers would be organized in the same way, and it would be accessed in memory
also in the same way (regarding bursts). The only difference would be the fetch
time of a little bit more qTDs, which is extremely fast and insignificant
compared to the transfer time of the payload, which remains unchanged.

Moreover, in your use case, if you are e.g. using FAT, on the one hand, the
buffers in fat.c are never aligned to more than the DMA min alignment, and on
the other hand, if you can align your user buffers to 512 bytes, you can also
align them directly to 4 kB.

> My suggestion would be to truncate the xfr_bytes with the max
> wMaxPacketSize (1024) and for the qtd_count use:
> 
> if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
>      qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>              length, (QT_BUFFER_CNT - 1) * 4096);
> else                /* wMaxPacketSize aligned */
>      qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>              length, QT_BUFFER_CNT * 4096);
> 
> This allows 50% of unaligned block data (512) to be transferred with
> min
> qTDs.

That would also require a realignment-to-page stage. This is specific code for
specific buffer alignment from the upper layers. We could also skip the
realignment to page and always keep the same qTD transfer size except for the
last one, by adding as many packets as possible for the buffer alignment.

But I still don't see a significant reason to complicate code to do that.

BTW, the 15x speed gain that I gave in my patch description was compared to an
older version of the original code that used 20 blocks per transfer in
usb_storage.c. This is now 40 blocks per transfer with a page-aligned buffer, so
the speed gain compared to the current code should be rather about 7x. I should
update that.

Best regards,
Benoît
Stefan Herbrechtsmeier - July 24, 2012, 1:02 p.m.
Am 23.07.2012 19:15, schrieb Benoît Thébaudeau:
> On Monday 23 July 2012 15:35:25 Stefan Herbrechtsmeier wrote:
>> Am 20.07.2012 17:35, schrieb Benoît Thébaudeau:
>>> On Friday 20 July 2012 17:15:13 Stefan Herbrechtsmeier wrote:
>>>> Am 20.07.2012 17:03, schrieb Benoît Thébaudeau:
>>>>> On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
>>>>>> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
>>>>>>> Dear Marek Vasut,
>>>>>>>
>>>>>>> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
>>>>>>>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier wrote:
>>>>>>>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
>>>>>>>>>>> +			int xfr_bytes = min(left_length,
>>>>>>>>>>> +					    (QT_BUFFER_CNT * 4096 -
>>>>>>>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
>>>>>>>>>>> +					    ~4095);
>>>>>>>>>> Why you align the length to 4096?
>>>>>>>>> It's to guarantee that each transfer length is a multiple of
>>>>>>>>> the
>>>>>>>>> max packet
>>>>>>>>> length. Otherwise, early short packets are issued, which
>>>>>>>>> breaks
>>>>>>>>> the
>>>>>>>>> transfer and results in time-out error messages.
>>>>>>>> Early short packets ? What do you mean?
>>>>>>> During a USB transfer, all packets must have a length of max
>>>>>>> packet
>>>>>>> length for
>>>>>>> the pipe/endpoint, except the final one that can be a short
>>>>>>> packet.
>>>>>>> Without the
>>>>>>> alignment I make for xfr_bytes, short packets can occur within
>>>>>>> a
>>>>>>> transfer,
>>>>>>> because the hardware starts a new packet for each new queued
>>>>>>> qTD
>>>>>>> it
>>>>>>> handles.
>>>>>> But if I am right, the max packet length is 512 for bulk and
>>>>>> 1024
>>>>>> for
>>>>>> Interrupt transfer.
>>>>> There are indeed different max packet lengths for different
>>>>> transfer types, but
>>>>> it does not matter since the chosen alignment guarantees a
>>>>> multiple
>>>>> of all these
>>>>> possible max packet lengths.
>>>> But thereby you limit the transfer to 4 qT buffers for unaligned
>>>> transfers.
>>> Not exactly. The 5 qt_buffers are used for page-unaligned buffers,
>>> but that
>>> results in only 4 full pages of unaligned data, requiring 5 aligned
>>> pages.
>> Sorry I mean 4 full pages of unaligned data.
>>> For page-aligned buffers, the 5 qt_buffers result in 5 full pages
>>> of aligned
>>> data.
>> Sure.
>>> The unaligned case could be a little bit improved to always use as
>>> many packets
>>> as possible per qTD, but that would over-complicate things for a
>>> very negligible
>>> speed and memory gain.
>> In my use case (fragmented file on usb storage)  the gain would be
>> nearly 20%. The reason is that the data are block aligned (512) and
>> could be aligned to 4096 with the first transfer (5 qt_buffers).
> Can you explain where this gain would come from? In both cases, the data in USB
> transfers would be organized in the same way, and it would be accessed in memory
> also in the same way (regarding bursts). The only difference would be the fetch
> time of a little bit more qTDs, which is extremely fast and insignificant
> compared to the transfer time of the payload, which remains unchanged.
You are right, the speed different will be minimal, only the memory 
usage will be lower.

> Moreover, in your use case, if you are e.g. using FAT, on the one hand, the
> buffers in fat.c are never aligned to more than the DMA min alignment, and on
> the other hand, if you can align your user buffers to 512 bytes, you can also
> align them directly to 4 kB.
The user buffer is aligned to 4kB, but this doesn't matter as a file 
load from a storage device (ex. fatload) can be segmented in partial USB 
transfers. This can lead to any block aligned buffer for a partial transfer.

>
>> My suggestion would be to truncate the xfr_bytes with the max
>> wMaxPacketSize (1024) and for the qtd_count use:
>>
>> if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
>>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>>               length, (QT_BUFFER_CNT - 1) * 4096);
>> else                /* wMaxPacketSize aligned */
>>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>>               length, QT_BUFFER_CNT * 4096);
>>
>> This allows 50% of unaligned block data (512) to be transferred with
>> min
>> qTDs.
> That would also require a realignment-to-page stage. This is specific code for
> specific buffer alignment from the upper layers. We could also skip the
> realignment to page and always keep the same qTD transfer size except for the
> last one, by adding as many packets as possible for the buffer alignment.
What you mean by realignment-to-page stage?

> But I still don't see a significant reason to complicate code to do that.
I don't understand where you expect to complicate the code.

You limit the size of one transfer (xfr_bytes) to (QT_BUFFER_CNT - 1) * 
4kB for unaligned buffers. But you only need to limit it to a multiple 
of the maximal possible wMaxPacketSize (1kB) to make sure that you 
always send full packages.

I only suggest to replace the causeless 4kB alignment with a reasonable 
1kB alignment and adapte the qtd_count caluclation.

                         int xfr_bytes = min(left_length,
                                             (QT_BUFFER_CNT * 4096 -
                                              ((uint32_t)buf_ptr & 4095)) &
-                                           ~4095);
+                                           ~1023);

> BTW, the 15x speed gain that I gave in my patch description was compared to an
> older version of the original code that used 20 blocks per transfer in
> usb_storage.c. This is now 40 blocks per transfer with a page-aligned buffer, so
> the speed gain compared to the current code should be rather about 7x. I should
> update that.
I'm sure that there is a significant speed gain but you shouldn't miss 
the heap usage as the main CONFIG_SYS_MALLOC_LEN is 128kB.

Maybe you should also add a worst case heap usage and I'm not sure, if 
your calculation are right, as the size of struct qTD is allays 32B and 
thereby I get 50kB or 64kB.

Best regards,
     Stefan
Marek Vasut - July 27, 2012, 2:07 p.m.
Dear Benoît Thébaudeau,

[...]

> @@ -229,8 +230,23 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, le16_to_cpu(req->value), le16_to_cpu(req->value),
>  		      le16_to_cpu(req->index));
> 
> +	if (req != NULL)			/* SETUP + ACK */
> +		qtd_count += 1 + 1;
> +	if (length > 0 || req == NULL) {	/* buffer */
> +		if ((uint32_t)buffer & 4095)		/* page-unaligned */
> +			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> +					length, (QT_BUFFER_CNT - 1) * 4096);
> +		else					/* page-aligned */
> +			qtd_count += DIV_ROUND_UP(length, QT_BUFFER_CNT * 4096);
> +	}

Ok, now I almost see it.

> +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> +	if (qtd == NULL) {
> +		printf("unable to allocate TDs\n");
> +		return -1;
> +	}
> +
>  	memset(qh, 0, sizeof(struct QH));
> -	memset(qtd, 0, 3 * sizeof(*qtd));
> +	memset(qtd, 0, qtd_count * sizeof(*qtd));
> 
>  	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
> 
[...]

I'm cool with the rest, I'll think about the calculation a bit though, since I'm 
not certain about it right away and let you know. Will you be submitting the 
series again or shall I just merge 3/5 and 4/5, apply this one and be done with 
it?

Best regards,
Marek Vasut
Benoît Thébaudeau - July 27, 2012, 2:16 p.m.
Dear Marek,

On Fri, Jul 27, 2012 at 04:07:01 PM, Marek Vasut wrote:
> > +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> > +	if (qtd == NULL) {
> > +		printf("unable to allocate TDs\n");
> > +		return -1;
> > +	}
> > +
> >  	memset(qh, 0, sizeof(struct QH));
> > -	memset(qtd, 0, 3 * sizeof(*qtd));
> > +	memset(qtd, 0, qtd_count * sizeof(*qtd));
> > 
> >  	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe),
> >  	usb_pipeout(pipe));
> > 
> [...]
> 
> I'm cool with the rest, I'll think about the calculation a bit
> though, since I'm
> not certain about it right away and let you know. Will you be
> submitting the
> series again or shall I just merge 3/5 and 4/5, apply this one and be
> done with
> it?

I'll resubmit the whole series. I'll first answer Stefan in this thread when
possible.

Regards,
Benoît
Marek Vasut - July 27, 2012, 2:30 p.m.
Dear Benoît Thébaudeau,

> Dear Marek,
> 
> On Fri, Jul 27, 2012 at 04:07:01 PM, Marek Vasut wrote:
> > > +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> > > +	if (qtd == NULL) {
> > > +		printf("unable to allocate TDs\n");
> > > +		return -1;
> > > +	}
> > > +
> > > 
> > >  	memset(qh, 0, sizeof(struct QH));
> > > 
> > > -	memset(qtd, 0, 3 * sizeof(*qtd));
> > > +	memset(qtd, 0, qtd_count * sizeof(*qtd));
> > > 
> > >  	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe),
> > >  	usb_pipeout(pipe));
> > 
> > [...]
> > 
> > I'm cool with the rest, I'll think about the calculation a bit
> > though, since I'm
> > not certain about it right away and let you know. Will you be
> > submitting the
> > series again or shall I just merge 3/5 and 4/5, apply this one and be
> > done with
> > it?
> 
> I'll resubmit the whole series. I'll first answer Stefan in this thread
> when possible.

Please do, and see how Jim is with the timeout patch please.

> Regards,
> Benoît

Best regards,
Marek Vasut
Benoît Thébaudeau - July 29, 2012, 12:48 a.m.
Dear Stefan,

Sorry for the delay. I'm very busy, and there is much to tell on this topic.

On Tue, Jul 24, 2012 at 03:02:00 PM, Stefan Herbrechtsmeier wrote:
> Am 23.07.2012 19:15, schrieb Benoît Thébaudeau:
> > On Monday 23 July 2012 15:35:25 Stefan Herbrechtsmeier wrote:
> >> Am 20.07.2012 17:35, schrieb Benoît Thébaudeau:
> >>> On Friday 20 July 2012 17:15:13 Stefan Herbrechtsmeier wrote:
> >>>> Am 20.07.2012 17:03, schrieb Benoît Thébaudeau:
> >>>>> On Friday 20 July 2012 16:51:33 Stefan Herbrechtsmeier wrote:
> >>>>>> Am 20.07.2012 15:56, schrieb Benoît Thébaudeau:
> >>>>>>> Dear Marek Vasut,
> >>>>>>>
> >>>>>>> On Friday 20 July 2012 15:44:01 Marek Vasut wrote:
> >>>>>>>>> On Friday 20 July 2012 13:37:37 Stefan Herbrechtsmeier
> >>>>>>>>> wrote:
> >>>>>>>>>> Am 20.07.2012 13:26, schrieb Benoît Thébaudeau:
> >>>>>>>>>>> +			int xfr_bytes = min(left_length,
> >>>>>>>>>>> +					    (QT_BUFFER_CNT * 4096 -
> >>>>>>>>>>> +					     ((uint32_t)buf_ptr & 4095)) &
> >>>>>>>>>>> +					    ~4095);
> >>>>>>>>>> Why you align the length to 4096?
> >>>>>>>>> It's to guarantee that each transfer length is a multiple
> >>>>>>>>> of
> >>>>>>>>> the
> >>>>>>>>> max packet
> >>>>>>>>> length. Otherwise, early short packets are issued, which
> >>>>>>>>> breaks
> >>>>>>>>> the
> >>>>>>>>> transfer and results in time-out error messages.
> >>>>>>>> Early short packets ? What do you mean?
> >>>>>>> During a USB transfer, all packets must have a length of max
> >>>>>>> packet
> >>>>>>> length for
> >>>>>>> the pipe/endpoint, except the final one that can be a short
> >>>>>>> packet.
> >>>>>>> Without the
> >>>>>>> alignment I make for xfr_bytes, short packets can occur
> >>>>>>> within
> >>>>>>> a
> >>>>>>> transfer,
> >>>>>>> because the hardware starts a new packet for each new queued
> >>>>>>> qTD
> >>>>>>> it
> >>>>>>> handles.
> >>>>>> But if I am right, the max packet length is 512 for bulk and
> >>>>>> 1024
> >>>>>> for
> >>>>>> Interrupt transfer.
> >>>>> There are indeed different max packet lengths for different
> >>>>> transfer types, but
> >>>>> it does not matter since the chosen alignment guarantees a
> >>>>> multiple
> >>>>> of all these
> >>>>> possible max packet lengths.
> >>>> But thereby you limit the transfer to 4 qT buffers for unaligned
> >>>> transfers.
> >>> Not exactly. The 5 qt_buffers are used for page-unaligned
> >>> buffers,
> >>> but that
> >>> results in only 4 full pages of unaligned data, requiring 5
> >>> aligned
> >>> pages.
> >> Sorry I mean 4 full pages of unaligned data.
> >>> For page-aligned buffers, the 5 qt_buffers result in 5 full pages
> >>> of aligned
> >>> data.
> >> Sure.
> >>> The unaligned case could be a little bit improved to always use
> >>> as
> >>> many packets
> >>> as possible per qTD, but that would over-complicate things for a
> >>> very negligible
> >>> speed and memory gain.
> >> In my use case (fragmented file on usb storage)  the gain would be
> >> nearly 20%. The reason is that the data are block aligned (512)
> >> and
> >> could be aligned to 4096 with the first transfer (5 qt_buffers).
> > Can you explain where this gain would come from? In both cases, the
> > data in USB
> > transfers would be organized in the same way, and it would be
> > accessed in memory
> > also in the same way (regarding bursts). The only difference would
> > be the fetch
> > time of a little bit more qTDs, which is extremely fast and
> > insignificant
> > compared to the transfer time of the payload, which remains
> > unchanged.
> You are right, the speed different will be minimal, only the memory
> usage will be lower.

If your point is only the memory gain, I agree. With your suggestion, there are
roughly 25% less qTDs used in the "(max wMaxPacketSize)-aligned but not
page-aligned" case since the number of qTDs is about (total transfer size) / 5
instead of (total transfer size) / 4. But this is still small compared to usual
heap sizes (at least on the kind of hardware I use).

> > Moreover, in your use case, if you are e.g. using FAT, on the one
> > hand, the
> > buffers in fat.c are never aligned to more than the DMA min
> > alignment, and on
> > the other hand, if you can align your user buffers to 512 bytes,
> > you can also
> > align them directly to 4 kB.
> The user buffer is aligned to 4kB, but this doesn't matter as a file
> load from a storage device (ex. fatload) can be segmented in partial
> USB
> transfers. This can lead to any block aligned buffer for a partial
> transfer.

What do you mean by "partial USB transfers"? As seen from EHCI users like the
MSC driver (usb_storage.c), USB transfers either succeed or fail, but they
cannot be "segmented".

On its side, the MSC driver will only segment the FAT layer requests if they are
larger than 65535 blocks, so still not what you describe.

As to the FAT stack, it will only read whole clusters while accessing file
payload, and the most usual cluster sizes are by default a multiple of 4 kiB
(see http://support.microsoft.com/kb/140365).

So I don't see "segmentation" anywhere, and for usual cluster sizes, the EHCI
buffer alignment is fully determined by the applicative buffer alignment and the
file position corresponding to the beginning of the applicative buffer. But
there are indeed some unusual use cases (e.g. smaller clusters) for which only
a block-aligned buffer will reach EHCI despite a page-aligned applicative
buffer.

> >> My suggestion would be to truncate the xfr_bytes with the max
> >> wMaxPacketSize (1024) and for the qtd_count use:
> >>
> >> if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
> >>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> >>               length, (QT_BUFFER_CNT - 1) * 4096);
> >> else                /* wMaxPacketSize aligned */
> >>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> >>               length, QT_BUFFER_CNT * 4096);
> >>
> >> This allows 50% of unaligned block data (512) to be transferred
> >> with
> >> min
> >> qTDs.
> > That would also require a realignment-to-page stage. This is
> > specific code for
> > specific buffer alignment from the upper layers. We could also skip
> > the
> > realignment to page and always keep the same qTD transfer size
> > except for the
> > last one, by adding as many packets as possible for the buffer
> > alignment.
> What you mean by realignment-to-page stage?

I mean that the alignment of the transfer to 1024 instead of 4096 can make the
first qTD transfer larger than the following ones, which guarantees that the
following qTD transfers are page-aligned, even if the first one was only aligned
to 1024. For the 1024-aligned case, this results in the change that you suggest,
but it also changes things for the unaligned case, which makes this part of the
code inaccurate. See below.

> > But I still don't see a significant reason to complicate code to do
> > that.
> I don't understand where you expect to complicate the code.
> 
> You limit the size of one transfer (xfr_bytes) to (QT_BUFFER_CNT - 1)
> *
> 4kB for unaligned buffers. But you only need to limit it to a
> multiple
> of the maximal possible wMaxPacketSize (1kB) to make sure that you
> always send full packages.
> 
> I only suggest to replace the causeless 4kB alignment with a
> reasonable
> 1kB alignment and adapte the qtd_count caluclation.
> 
>                          int xfr_bytes = min(left_length,
>                                              (QT_BUFFER_CNT * 4096 -
>                                               ((uint32_t)buf_ptr &
>                                               4095)) &
> -                                           ~4095);
> +                                           ~1023);

I agree for this part of the code. But for the allocation part, your suggestion
is already a little bit more complicated than my original code, while still
incomplete. Besides that, the "((uint32_t)buffer & 4095) +" for the
page-unaligned case in my code was actually useless, which emphasizes the
difference, even if it's a light complication.

For the allocation part, the appropriate code for your suggestion would be:

		if ((uint32_t)buffer & 1023) /* max-wMaxPacketSize-unaligned */
			qtd_count += DIV_ROUND_UP(max(length > 0, length -
				(4096 - ((uint32_t)buf_ptr & 4095) & ~1023)),
					(QT_BUFFER_CNT - 1) * 4096);
		else			     /* max-wMaxPacketSize-aligned */
			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
					length, QT_BUFFER_CNT * 4096);

This code allocates exactly the required number of qTDs, no less, no more. It's
clearly more complicated than the 4096 version.

A test should also be added to make sure that qtd_count is never 0. Otherwise,
ZLPs are broken (this applies to my original code too).

If we want to compromise accuracy for simplicity, we can change that to:

		qtd_count += 2 + length /
			((QT_BUFFER_CNT - !!((uint32_t)buffer & 1023)) * 4096);

This code allocates enough qTDs for all cases, with at worst 2 extra qTDs (i.e.
a total of 128 bytes) that will be left unused. It also handles intrinsically
ZLPs.

Now, let's consider the possible values of wMaxPacketSize:
 - control endpoints:
    * LS: 8,
    * FS: 8, 16, 32 or 64,
    * HS: 64,
 - isochronous endpoints: not supported by ehci-hcd.c,
 - interrupt endpoints:
    * LS: <= 8,
    * FS: <= 64,
    * HS: <= 1024 (1x to 3x for high-bandwidth),
 - bulk endpoints:
    * LS: N/A,
    * FS: 8, 16, 32 or 64,
    * HS: 512.

My code assumes that wMaxPacketSize is a power of 2. This is not always true for
interrupt endpoints. Let's talk about these. Their handling is currently broken
in U-Boot since their transfers are made asynchronous instead of periodic.
Devices shouldn't care too much about that, as long as transfers do not exceed
wMaxPacketSize, in which case my code still works because wMaxPacketSize always
fits in a single qTD. Interrupt transfers larger than wMaxPacketSize do not seem
to be used by U-Boot. If they were used, the current code in U-Boot would have a
timing issue because the asynchronous scheme would break the interval requested
by devices, which could at worst make them fail in some way. So the only
solution would be that such transfers be split by the caller of submit_int_msg,
in which case my code still works. What would you think about failing with an
error message in submit_int_msg if length is larger than wMaxPacketSize? Marek,
what do you think?

For all other cases, wMaxPacketSize is a power of 2, so everything is fine,
except that in those cases wMaxPacketSize is at most 512, which means that with
the suggested limitation applied to submit_int_msg, your suggested 1024 could be
replaced with 512, which is good news since this is also the most common storage
sector size.

We could even use usb_maxpacket(dev, pipe) instead of 512, with some
restrictions. If we don't want to alter the misalignment check in
ehci_td_buffer, max(usb_maxpacket(dev, pipe), ARCH_DMA_MINALIGN) would actually
have to be used. This misalignment check could be limited to the first qTD
transfer of a USB transfer, but that would complicate things, all the more the
corresponding call to flush_dcache_range would have to be modified to fix
alignments.

So we have to make two choices:
 - between 4096, 1024, 512 and max(usb_maxpacket(dev, pipe), ARCH_DMA_MINALIGN),
 - between the accurate and simple allocations.
That makes a total of 8 working possibilities. What do you guys think we should
choose? On my side, I like max(usb_maxpacket(dev, pipe), ARCH_DMA_MINALIGN) with
the simple allocation. It's efficient as to code speed, size and readability, as
well as to RAM usage.

> > BTW, the 15x speed gain that I gave in my patch description was
> > compared to an
> > older version of the original code that used 20 blocks per transfer
> > in
> > usb_storage.c. This is now 40 blocks per transfer with a
> > page-aligned buffer, so
> > the speed gain compared to the current code should be rather about
> > 7x. I should
> > update that.
> I'm sure that there is a significant speed gain but you shouldn't
> miss
> the heap usage as the main CONFIG_SYS_MALLOC_LEN is 128kB.

I have checked all the config files. Among those using EHCI, most have a heap
size >= 1 MiB. The only exceptions are:

---------------------------------------------
| Heap Size |      Board       |  RAM Size  |
|-------------------------------------------|
|   512 kiB | MERGERBOX        |   > 10 MiB |
|           | MPC8315ERDB      |    128 MiB |
|           | MVBLM7           |    512 MiB |
|-------------------------------------------|
|   256 kiB | MPC8349ITX       |    256 MiB |
|           | omap4_panda      |      1 GiB |
|-------------------------------------------|
|   128 kiB | adp-ag102        |    256 MiB |
|           | at91sam9m10g45ek |    128 MiB |
|           | edminiv2         |     64 MiB |
|           | M52277EVB        |     64 MiB |
|           | omap3_beagle     | >= 128 MiB |
---------------------------------------------

As you can see, these small heap sizes are not linked to any hardware
constraint, but only to the lack of need to have larger heaps, so they could be
easily enlarged if needed. But even 128 kiB should be enough for common usage.

> Maybe you should also add a worst case heap usage and I'm not sure,
> if
> your calculation are right, as the size of struct qTD is allays 32B
> and
> thereby I get 50kB or 64kB.

From ehci.h:
struct qTD {
	/* this part defined by EHCI spec */
	uint32_t qt_next;		/* see EHCI 3.5.1 */
#define	QT_NEXT_TERMINATE	1
	uint32_t qt_altnext;		/* see EHCI 3.5.2 */
	uint32_t qt_token;		/* see EHCI 3.5.3 */
	uint32_t qt_buffer[5];		/* see EHCI 3.5.4 */
	uint32_t qt_buffer_hi[5];	/* Appendix B */
	/* pad struct for 32 byte alignment */
	uint32_t unused[3];
};

So sizeof(struct qTD) is 16 * 32 bits = 64 bytes. For the worst alignment case,
the number of qTDs to allocate for 65535 blocks of 512 bytes (worst MSC case
with 512-byte sectors) is DIV_ROUND_UP(65535 * 512, 4 * 4096) = 2048 qTDs, i.e.
128 kiB. For the same transfer size with the best alignment case, it is
DIV_ROUND_UP(65535 * 512, 5 * 4096) = 1639 qTDs, i.e. 104896 B.

But if we consider extreme cases, these figures can get even larger, e.g. with
4-kiB storage sectors.

So we could perhaps issue a #error in ehci-hcd or in usb_storage if
CONFIG_SYS_MALLOC_LEN is not large enough, but I don't think it's a good idea
because:
 - the threshold value would have to depend on runtime block sizes or something,
   which could lead to a big worst worst case that would almost never happen in
   real life, so giving such an unrealistic heap size constraint would be
   cumbersome,
 - reaching the top sizes would mean reading a huge file or something to a large
   buffer (much larger than the qTDs this transfer requires), which would very
   likely be heap-allocated (except for commands like fatload), so
   CONFIG_SYS_MALLOC_LEN would already have to be large for the application,
 - for command line operations like fatload, transfers of uncontrolled lengths
   could simply crash U-Boot if they go too far in memory, which means that
   users of such commands need to know what they are doing anyway, so they have
   to control transfer sizes,
 - there is already a runtime error displayed in case of allocation failure.

Marek, what do you think?

Best regards,
Benoît
Marek Vasut - July 30, 2012, 10:38 p.m.
Dear Benoît Thébaudeau,

[...]

> > > Can you explain where this gain would come from? In both cases, the
> > > data in USB
> > > transfers would be organized in the same way, and it would be
> > > accessed in memory
> > > also in the same way (regarding bursts). The only difference would
> > > be the fetch
> > > time of a little bit more qTDs, which is extremely fast and
> > > insignificant
> > > compared to the transfer time of the payload, which remains
> > > unchanged.
> > 
> > You are right, the speed different will be minimal, only the memory
> > usage will be lower.
> 
> If your point is only the memory gain, I agree. With your suggestion, there
> are roughly 25% less qTDs used in the "(max wMaxPacketSize)-aligned but
> not page-aligned" case since the number of qTDs is about (total transfer
> size) / 5 instead of (total transfer size) / 4. But this is still small
> compared to usual heap sizes (at least on the kind of hardware I use).

Ok, I see the point. I understand it's not really a bug, just an improvement. 
Maybe we can do a subsequent patch on top of these from Benoit and see how it 
fares?

> > > Moreover, in your use case, if you are e.g. using FAT, on the one
> > > hand, the
> > > buffers in fat.c are never aligned to more than the DMA min
> > > alignment, and on
> > > the other hand, if you can align your user buffers to 512 bytes,
> > > you can also
> > > align them directly to 4 kB.
> > 
> > The user buffer is aligned to 4kB, but this doesn't matter as a file
> > load from a storage device (ex. fatload) can be segmented in partial
> > USB
> > transfers. This can lead to any block aligned buffer for a partial
> > transfer.
> 
> What do you mean by "partial USB transfers"? As seen from EHCI users like
> the MSC driver (usb_storage.c), USB transfers either succeed or fail, but
> they cannot be "segmented".

Segmented -- like multiple transfers being issues with small payload? You can 
not put these together at the USB-level, since it's the issuing code that has to 
be fixed.

> On its side, the MSC driver will only segment the FAT layer requests if
> they are larger than 65535 blocks, so still not what you describe.
> 
> As to the FAT stack, it will only read whole clusters while accessing file
> payload, and the most usual cluster sizes are by default a multiple of 4
> kiB (see http://support.microsoft.com/kb/140365).

512b is minimum and it's quite often used.

> So I don't see "segmentation" anywhere, and for usual cluster sizes, the
> EHCI buffer alignment is fully determined by the applicative buffer
> alignment and the file position corresponding to the beginning of the
> applicative buffer. But there are indeed some unusual use cases (e.g.
> smaller clusters) for which only a block-aligned buffer will reach EHCI
> despite a page-aligned applicative buffer.

I don't quite get this one.

> > >> My suggestion would be to truncate the xfr_bytes with the max
> > >> wMaxPacketSize (1024) and for the qtd_count use:
> > >> 
> > >> if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
> > >> 
> > >>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> > >>       
> > >>               length, (QT_BUFFER_CNT - 1) * 4096);
> > >> 
> > >> else                /* wMaxPacketSize aligned */
> > >> 
> > >>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> > >>       
> > >>               length, QT_BUFFER_CNT * 4096);
> > >> 
> > >> This allows 50% of unaligned block data (512) to be transferred
> > >> with
> > >> min
> > >> qTDs.
> > > 
> > > That would also require a realignment-to-page stage. This is
> > > specific code for
> > > specific buffer alignment from the upper layers. We could also skip
> > > the
> > > realignment to page and always keep the same qTD transfer size
> > > except for the
> > > last one, by adding as many packets as possible for the buffer
> > > alignment.
> > 
> > What you mean by realignment-to-page stage?
> 
> I mean that the alignment of the transfer to 1024 instead of 4096 can make
> the first qTD transfer larger than the following ones, which guarantees
> that the following qTD transfers are page-aligned, even if the first one
> was only aligned to 1024. For the 1024-aligned case, this results in the
> change that you suggest, but it also changes things for the unaligned
> case, which makes this part of the code inaccurate. See below.
> 
> > > But I still don't see a significant reason to complicate code to do
> > > that.
> > 
> > I don't understand where you expect to complicate the code.
> > 
> > You limit the size of one transfer (xfr_bytes) to (QT_BUFFER_CNT - 1)
> > *
> > 4kB for unaligned buffers. But you only need to limit it to a
> > multiple
> > of the maximal possible wMaxPacketSize (1kB) to make sure that you
> > always send full packages.
> > 
> > I only suggest to replace the causeless 4kB alignment with a
> > reasonable
> > 1kB alignment and adapte the qtd_count caluclation.
> > 
> >                          int xfr_bytes = min(left_length,
> >                          
> >                                              (QT_BUFFER_CNT * 4096 -
> >                                              
> >                                               ((uint32_t)buf_ptr &
> >                                               4095)) &
> > 
> > -                                           ~4095);
> > +                                           ~1023);
> 
> I agree for this part of the code. But for the allocation part, your
> suggestion is already a little bit more complicated than my original code,
> while still incomplete. Besides that, the "((uint32_t)buffer & 4095) +"
> for the page-unaligned case in my code was actually useless, which
> emphasizes the difference, even if it's a light complication.
> 
> For the allocation part, the appropriate code for your suggestion would be:
> 
> 		if ((uint32_t)buffer & 1023) /* max-wMaxPacketSize-unaligned */
> 			qtd_count +=

DIV_ROUND_UP(
    max(
        length > 0,
        length - (4096 - ((uint32_t)buf_ptr & 4095) & ~1023)
    ),
    (QT_BUFFER_CNT - 1) * 4096
);

Ok, I now think I understand what's going on here. I still have to wonder how 
much would the compiler optimize of this if you "decompressed" your code -- to 
make it more easy to understand.

> 		else			     /* max-wMaxPacketSize-aligned */
> 			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> 					length, QT_BUFFER_CNT * 4096);
> 
> This code allocates exactly the required number of qTDs, no less, no more.
> It's clearly more complicated than the 4096 version.
> 
> A test should also be added to make sure that qtd_count is never 0.
> Otherwise, ZLPs are broken (this applies to my original code too).
> 
> If we want to compromise accuracy for simplicity, we can change that to:
> 
> 		qtd_count += 2 + length /
> 			((QT_BUFFER_CNT - !!((uint32_t)buffer & 1023)) * 4096);
> 
> This code allocates enough qTDs for all cases, with at worst 2 extra qTDs
> (i.e. a total of 128 bytes) that will be left unused. It also handles
> intrinsically ZLPs.
> 
> Now, let's consider the possible values of wMaxPacketSize:
>  - control endpoints:
>     * LS: 8,
>     * FS: 8, 16, 32 or 64,
>     * HS: 64,
>  - isochronous endpoints: not supported by ehci-hcd.c,
>  - interrupt endpoints:
>     * LS: <= 8,
>     * FS: <= 64,
>     * HS: <= 1024 (1x to 3x for high-bandwidth),
>  - bulk endpoints:
>     * LS: N/A,
>     * FS: 8, 16, 32 or 64,
>     * HS: 512.
> 
> My code assumes that wMaxPacketSize is a power of 2. This is not always
> true for interrupt endpoints. Let's talk about these. Their handling is
> currently broken in U-Boot since their transfers are made asynchronous
> instead of periodic. Devices shouldn't care too much about that, as long
> as transfers do not exceed wMaxPacketSize, in which case my code still
> works because wMaxPacketSize always fits in a single qTD. Interrupt
> transfers larger than wMaxPacketSize do not seem to be used by U-Boot. If
> they were used, the current code in U-Boot would have a timing issue
> because the asynchronous scheme would break the interval requested by
> devices, which could at worst make them fail in some way. So the only
> solution would be that such transfers be split by the caller of
> submit_int_msg, in which case my code still works. What would you think
> about failing with an error message in submit_int_msg if length is larger
> than wMaxPacketSize? Marek, what do you think?

Let's do that ... I think the interrupt endpoint is only used for keyboard and 
if someone needs it for something else, the code will be there, just needing 
improvement. Comment and error message are OK.

> For all other cases, wMaxPacketSize is a power of 2, so everything is fine,
> except that in those cases wMaxPacketSize is at most 512, which means that
> with the suggested limitation applied to submit_int_msg, your suggested
> 1024 could be replaced with 512, which is good news since this is also the
> most common storage sector size.
>
> We could even use usb_maxpacket(dev, pipe) instead of 512, with some
> restrictions. If we don't want to alter the misalignment check in
> ehci_td_buffer, max(usb_maxpacket(dev, pipe), ARCH_DMA_MINALIGN) would
> actually have to be used. This misalignment check could be limited to the
> first qTD transfer of a USB transfer, but that would complicate things,
> all the more the corresponding call to flush_dcache_range would have to be
> modified to fix alignments.
> 
> So we have to make two choices:
>  - between 4096, 1024, 512 and max(usb_maxpacket(dev, pipe),
> ARCH_DMA_MINALIGN), - between the accurate and simple allocations.
> That makes a total of 8 working possibilities. What do you guys think we
> should choose? On my side, I like max(usb_maxpacket(dev, pipe),
> ARCH_DMA_MINALIGN)

Won't maxpacket fall below 512 on occasions, which might cause trouble?

> with the simple allocation. It's efficient as to code
> speed, size and readability, as well as to RAM usage.

For now, I'd go for the safest, easiest and dumbest solution and see how it 
fares. Subsequent patch can be submitted to improve that and measurements made.

"We should forget about small efficiencies, say about 97% of the time; premature 
optimization is the root of all evil"
            -- Donald E. Knuth, Structured Programming with go to Statements
[...]

> So we could perhaps issue a #error in ehci-hcd or in usb_storage if
> CONFIG_SYS_MALLOC_LEN is not large enough, but I don't think it's a good
> idea because:
>  - the threshold value would have to depend on runtime block sizes or
> something, which could lead to a big worst worst case that would almost
> never happen in real life, so giving such an unrealistic heap size
> constraint would be cumbersome,

#warning then?

>  - reaching the top sizes would mean reading a huge file or something to a
> large buffer (much larger than the qTDs this transfer requires), which
> would very likely be heap-allocated (except for commands like fatload), so
> CONFIG_SYS_MALLOC_LEN would already have to be large for the application,
> - for command line operations like fatload, transfers of uncontrolled
> lengths could simply crash U-Boot if they go too far in memory

Why, because they overwrite it?

> , which
> means that users of such commands need to know what they are doing anyway,
> so they have to control transfer sizes,
>  - there is already a runtime error displayed in case of allocation
> failure.

Ok

> Marek, what do you think?

Had a good evening with the EHCI r10 spec, hope I answered most of your 
questions.
Benoît Thébaudeau - July 31, 2012, 1:06 a.m.
Dear Marek Vasut,

On Tue, Jul 31, 2012 at 12:38:54 AM, Marek Vasut wrote:
> [...]
> 
> > > > Can you explain where this gain would come from? In both cases,
> > > > the
> > > > data in USB
> > > > transfers would be organized in the same way, and it would be
> > > > accessed in memory
> > > > also in the same way (regarding bursts). The only difference
> > > > would
> > > > be the fetch
> > > > time of a little bit more qTDs, which is extremely fast and
> > > > insignificant
> > > > compared to the transfer time of the payload, which remains
> > > > unchanged.
> > > 
> > > You are right, the speed different will be minimal, only the
> > > memory
> > > usage will be lower.
> > 
> > If your point is only the memory gain, I agree. With your
> > suggestion, there
> > are roughly 25% less qTDs used in the "(max wMaxPacketSize)-aligned
> > but
> > not page-aligned" case since the number of qTDs is about (total
> > transfer
> > size) / 5 instead of (total transfer size) / 4. But this is still
> > small
> > compared to usual heap sizes (at least on the kind of hardware I
> > use).
> 
> Ok, I see the point. I understand it's not really a bug, just an
> improvement.

Exactly.

> Maybe we can do a subsequent patch on top of these from Benoit and
> see how it
> fares?

If you wish. I'll do that.

> > > > Moreover, in your use case, if you are e.g. using FAT, on the
> > > > one
> > > > hand, the
> > > > buffers in fat.c are never aligned to more than the DMA min
> > > > alignment, and on
> > > > the other hand, if you can align your user buffers to 512
> > > > bytes,
> > > > you can also
> > > > align them directly to 4 kB.
> > > 
> > > The user buffer is aligned to 4kB, but this doesn't matter as a
> > > file
> > > load from a storage device (ex. fatload) can be segmented in
> > > partial
> > > USB
> > > transfers. This can lead to any block aligned buffer for a
> > > partial
> > > transfer.
> > 
> > What do you mean by "partial USB transfers"? As seen from EHCI
> > users like
> > the MSC driver (usb_storage.c), USB transfers either succeed or
> > fail, but
> > they cannot be "segmented".
> 
> Segmented -- like multiple transfers being issues with small payload?
> You can
> not put these together at the USB-level, since it's the issuing code
> that has to
> be fixed.
> 
> > On its side, the MSC driver will only segment the FAT layer
> > requests if
> > they are larger than 65535 blocks, so still not what you describe.
> > 
> > As to the FAT stack, it will only read whole clusters while
> > accessing file
> > payload, and the most usual cluster sizes are by default a multiple
> > of 4
> > kiB (see http://support.microsoft.com/kb/140365).
> 
> 512b is minimum and it's quite often used.

OK.

> > So I don't see "segmentation" anywhere, and for usual cluster
> > sizes, the
> > EHCI buffer alignment is fully determined by the applicative buffer
> > alignment and the file position corresponding to the beginning of
> > the
> > applicative buffer. But there are indeed some unusual use cases
> > (e.g.
> > smaller clusters) for which only a block-aligned buffer will reach
> > EHCI
> > despite a page-aligned applicative buffer.
> 
> I don't quite get this one.

I meant that 512 bytes (most usual storage block size) is what we should aim at
to optimize the number of qTDs.

> > > >> My suggestion would be to truncate the xfr_bytes with the max
> > > >> wMaxPacketSize (1024) and for the qtd_count use:
> > > >> 
> > > >> if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
> > > >> 
> > > >>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> > > >>       
> > > >>               length, (QT_BUFFER_CNT - 1) * 4096);
> > > >> 
> > > >> else                /* wMaxPacketSize aligned */
> > > >> 
> > > >>       qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> > > >>       
> > > >>               length, QT_BUFFER_CNT * 4096);
> > > >> 
> > > >> This allows 50% of unaligned block data (512) to be
> > > >> transferred
> > > >> with
> > > >> min
> > > >> qTDs.
> > > > 
> > > > That would also require a realignment-to-page stage. This is
> > > > specific code for
> > > > specific buffer alignment from the upper layers. We could also
> > > > skip
> > > > the
> > > > realignment to page and always keep the same qTD transfer size
> > > > except for the
> > > > last one, by adding as many packets as possible for the buffer
> > > > alignment.
> > > 
> > > What you mean by realignment-to-page stage?
> > 
> > I mean that the alignment of the transfer to 1024 instead of 4096
> > can make
> > the first qTD transfer larger than the following ones, which
> > guarantees
> > that the following qTD transfers are page-aligned, even if the
> > first one
> > was only aligned to 1024. For the 1024-aligned case, this results
> > in the
> > change that you suggest, but it also changes things for the
> > unaligned
> > case, which makes this part of the code inaccurate. See below.
> > 
> > > > But I still don't see a significant reason to complicate code
> > > > to do
> > > > that.
> > > 
> > > I don't understand where you expect to complicate the code.
> > > 
> > > You limit the size of one transfer (xfr_bytes) to (QT_BUFFER_CNT
> > > - 1)
> > > *
> > > 4kB for unaligned buffers. But you only need to limit it to a
> > > multiple
> > > of the maximal possible wMaxPacketSize (1kB) to make sure that
> > > you
> > > always send full packages.
> > > 
> > > I only suggest to replace the causeless 4kB alignment with a
> > > reasonable
> > > 1kB alignment and adapte the qtd_count caluclation.
> > > 
> > >                          int xfr_bytes = min(left_length,
> > >                          
> > >                                              (QT_BUFFER_CNT *
> > >                                              4096 -
> > >                                              
> > >                                               ((uint32_t)buf_ptr
> > >                                               &
> > >                                               4095)) &
> > > 
> > > -                                           ~4095);
> > > +                                           ~1023);
> > 
> > I agree for this part of the code. But for the allocation part,
> > your
> > suggestion is already a little bit more complicated than my
> > original code,
> > while still incomplete. Besides that, the "((uint32_t)buffer &
> > 4095) +"
> > for the page-unaligned case in my code was actually useless, which
> > emphasizes the difference, even if it's a light complication.
> > 
> > For the allocation part, the appropriate code for your suggestion
> > would be:
> > 
> > 		if ((uint32_t)buffer & 1023) /* max-wMaxPacketSize-unaligned */
> > 			qtd_count +=
> 
> DIV_ROUND_UP(
>     max(
>         length > 0,
>         length - (4096 - ((uint32_t)buf_ptr & 4095) & ~1023)
>     ),
>     (QT_BUFFER_CNT - 1) * 4096
> );
> 
> Ok, I now think I understand what's going on here. I still have to
> wonder how
> much would the compiler optimize of this if you "decompressed" your
> code -- to
> make it more easy to understand.

I wouldn't go for this complicated version since it's not really useful compared
to the simpler yet less accurate solution I gave below.

> > 		else			     /* max-wMaxPacketSize-aligned */
> > 			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
> > 					length, QT_BUFFER_CNT * 4096);
> > 
> > This code allocates exactly the required number of qTDs, no less,
> > no more.
> > It's clearly more complicated than the 4096 version.
> > 
> > A test should also be added to make sure that qtd_count is never 0.
> > Otherwise, ZLPs are broken (this applies to my original code too).
> > 
> > If we want to compromise accuracy for simplicity, we can change
> > that to:
> > 
> > 		qtd_count += 2 + length /
> > 			((QT_BUFFER_CNT - !!((uint32_t)buffer & 1023)) * 4096);

It's this solution I'd like to use to optimize the number of qTDs (with 1023 or
something else).

> > This code allocates enough qTDs for all cases, with at worst 2
> > extra qTDs
> > (i.e. a total of 128 bytes) that will be left unused. It also
> > handles
> > intrinsically ZLPs.
> > 
> > Now, let's consider the possible values of wMaxPacketSize:
> >  - control endpoints:
> >     * LS: 8,
> >     * FS: 8, 16, 32 or 64,
> >     * HS: 64,
> >  - isochronous endpoints: not supported by ehci-hcd.c,
> >  - interrupt endpoints:
> >     * LS: <= 8,
> >     * FS: <= 64,
> >     * HS: <= 1024 (1x to 3x for high-bandwidth),
> >  - bulk endpoints:
> >     * LS: N/A,
> >     * FS: 8, 16, 32 or 64,
> >     * HS: 512.
> > 
> > My code assumes that wMaxPacketSize is a power of 2. This is not
> > always
> > true for interrupt endpoints. Let's talk about these. Their
> > handling is
> > currently broken in U-Boot since their transfers are made
> > asynchronous
> > instead of periodic. Devices shouldn't care too much about that, as
> > long
> > as transfers do not exceed wMaxPacketSize, in which case my code
> > still
> > works because wMaxPacketSize always fits in a single qTD. Interrupt
> > transfers larger than wMaxPacketSize do not seem to be used by
> > U-Boot. If
> > they were used, the current code in U-Boot would have a timing
> > issue
> > because the asynchronous scheme would break the interval requested
> > by
> > devices, which could at worst make them fail in some way. So the
> > only
> > solution would be that such transfers be split by the caller of
> > submit_int_msg, in which case my code still works. What would you
> > think
> > about failing with an error message in submit_int_msg if length is
> > larger
> > than wMaxPacketSize? Marek, what do you think?
> 
> Let's do that ... I think the interrupt endpoint is only used for
> keyboard and
> if someone needs it for something else, the code will be there, just
> needing
> improvement. Comment and error message are OK.

OK. I have thought of another solution for this. You'll tell me which one you
prefer.

The ehci_submit_async code currently in U-Boot checks through ehci_td_buffer
that length fits in the single qTD reserved for data payload only after work has
begun, possibly after a SETUP transfer. With my series, this is checked at the
very beginning, before the allocation. We could detect that wMaxPacketSize is
not a power of 2 (e.g. with __builtin_popcount), in which case the allocation
for the data payload would be restricted to 1 qTD like now, and there would be
a check at the very beginning to test if length fits in this qTD. In that way,
there could be several packets per interrupt transfer as long as it fits in a
single qTD, just like now, contrary to the limitation imposed by the error in
submit_int_msg. But I'm not sure it's a good idea to allow this behavior.

> > For all other cases, wMaxPacketSize is a power of 2, so everything
> > is fine,
> > except that in those cases wMaxPacketSize is at most 512, which
> > means that
> > with the suggested limitation applied to submit_int_msg, your
> > suggested
> > 1024 could be replaced with 512, which is good news since this is
> > also the
> > most common storage sector size.
> >
> > We could even use usb_maxpacket(dev, pipe) instead of 512, with
> > some
> > restrictions. If we don't want to alter the misalignment check in
> > ehci_td_buffer, max(usb_maxpacket(dev, pipe), ARCH_DMA_MINALIGN)
> > would
> > actually have to be used. This misalignment check could be limited
> > to the
> > first qTD transfer of a USB transfer, but that would complicate
> > things,
> > all the more the corresponding call to flush_dcache_range would
> > have to be
> > modified to fix alignments.
> > 
> > So we have to make two choices:
> >  - between 4096, 1024, 512 and max(usb_maxpacket(dev, pipe),
> > ARCH_DMA_MINALIGN), - between the accurate and simple allocations.
> > That makes a total of 8 working possibilities. What do you guys
> > think we
> > should choose? On my side, I like max(usb_maxpacket(dev, pipe),
> > ARCH_DMA_MINALIGN)
> 
> Won't maxpacket fall below 512 on occasions,

Sure.

> which might cause
> trouble?

Why?

> > with the simple allocation. It's efficient as to code
> > speed, size and readability, as well as to RAM usage.
> 
> For now, I'd go for the safest, easiest and dumbest solution and see
> how it
> fares. Subsequent patch can be submitted to improve that and
> measurements made.
> 
> "We should forget about small efficiencies, say about 97% of the
> time; premature
> optimization is the root of all evil"
>             -- Donald E. Knuth, Structured Programming with go to
>             Statements
> [...]

OK, so I'll stick to my original series, rework it lightly as we said, add Jim's
patch, and add a further patch for these optimizations.

> > So we could perhaps issue a #error in ehci-hcd or in usb_storage if
> > CONFIG_SYS_MALLOC_LEN is not large enough, but I don't think it's a
> > good
> > idea because:
> >  - the threshold value would have to depend on runtime block sizes
> >  or
> > something, which could lead to a big worst worst case that would
> > almost
> > never happen in real life, so giving such an unrealistic heap size
> > constraint would be cumbersome,
> 
> #warning then?

With which limit if so?

> >  - reaching the top sizes would mean reading a huge file or
> >  something to a
> > large buffer (much larger than the qTDs this transfer requires),
> > which
> > would very likely be heap-allocated (except for commands like
> > fatload), so
> > CONFIG_SYS_MALLOC_LEN would already have to be large for the
> > application,
> > - for command line operations like fatload, transfers of
> > uncontrolled
> > lengths could simply crash U-Boot if they go too far in memory
> 
> Why, because they overwrite it?

Yes. U-Boot expands down its allocation during startup, so it's often located at
the end of the embedded RAM, which means that fatload will very likely use the
beginning of the RAM.

> > , which
> > means that users of such commands need to know what they are doing
> > anyway,
> > so they have to control transfer sizes,
> >  - there is already a runtime error displayed in case of allocation
> > failure.
> 
> Ok

So #warning or not besides this?

> > Marek, what do you think?
> 
> Had a good evening with the EHCI r10 spec, hope I answered most of
> your
> questions.

Yes, thanks.

Best regards,
Benoît
Stefan Herbrechtsmeier - July 31, 2012, 7:52 p.m.
Am 31.07.2012 03:06, schrieb Benoît Thébaudeau:
> Dear Marek Vasut,
>
> On Tue, Jul 31, 2012 at 12:38:54 AM, Marek Vasut wrote:
>> [...]
>>
>>>>> Can you explain where this gain would come from? In both cases,
>>>>> the
>>>>> data in USB
>>>>> transfers would be organized in the same way, and it would be
>>>>> accessed in memory
>>>>> also in the same way (regarding bursts). The only difference
>>>>> would
>>>>> be the fetch
>>>>> time of a little bit more qTDs, which is extremely fast and
>>>>> insignificant
>>>>> compared to the transfer time of the payload, which remains
>>>>> unchanged.
>>>> You are right, the speed different will be minimal, only the
>>>> memory
>>>> usage will be lower.
>>> If your point is only the memory gain, I agree. With your
>>> suggestion, there
>>> are roughly 25% less qTDs used in the "(max wMaxPacketSize)-aligned
>>> but
>>> not page-aligned" case since the number of qTDs is about (total
>>> transfer
>>> size) / 5 instead of (total transfer size) / 4. But this is still
>>> small
>>> compared to usual heap sizes (at least on the kind of hardware I
>>> use).
>> Ok, I see the point. I understand it's not really a bug, just an
>> improvement.
> Exactly.
>
>> Maybe we can do a subsequent patch on top of these from Benoit and
>> see how it
>> fares?
> If you wish. I'll do that.
>
>>>>> Moreover, in your use case, if you are e.g. using FAT, on the
>>>>> one
>>>>> hand, the
>>>>> buffers in fat.c are never aligned to more than the DMA min
>>>>> alignment, and on
>>>>> the other hand, if you can align your user buffers to 512
>>>>> bytes,
>>>>> you can also
>>>>> align them directly to 4 kB.
>>>> The user buffer is aligned to 4kB, but this doesn't matter as a
>>>> file
>>>> load from a storage device (ex. fatload) can be segmented in
>>>> partial
>>>> USB
>>>> transfers. This can lead to any block aligned buffer for a
>>>> partial
>>>> transfer.
>>> What do you mean by "partial USB transfers"? As seen from EHCI
>>> users like
>>> the MSC driver (usb_storage.c), USB transfers either succeed or
>>> fail, but
>>> they cannot be "segmented".
>> Segmented -- like multiple transfers being issues with small payload?
Right.
>> You can
>> not put these together at the USB-level, since it's the issuing code
>> that has to
>> be fixed.
If the segmentation comes from the file system handling we can not avoid 
this.

>>> On its side, the MSC driver will only segment the FAT layer
>>> requests if
>>> they are larger than 65535 blocks, so still not what you describe.
>>>
>>> As to the FAT stack, it will only read whole clusters while
>>> accessing file
>>> payload, and the most usual cluster sizes are by default a multiple
>>> of 4
>>> kiB (see http://support.microsoft.com/kb/140365).
>> 512b is minimum and it's quite often used.
> OK.
In my example I use a FAT partition with 128 MB and 1 KB clusters. The 
file is read in two segments in which the first transfer starts 4 kB 
aligned but stops 1 kB aligned but not 4 kB aligned and leads to 
unaligned second transfer.
>>> So I don't see "segmentation" anywhere, and for usual cluster
>>> sizes, the
>>> EHCI buffer alignment is fully determined by the applicative buffer
>>> alignment and the file position corresponding to the beginning of
>>> the
>>> applicative buffer. But there are indeed some unusual use cases
>>> (e.g.
>>> smaller clusters) for which only a block-aligned buffer will reach
>>> EHCI
>>> despite a page-aligned applicative buffer.
>> I don't quite get this one.
> I meant that 512 bytes (most usual storage block size) is what we should aim at
> to optimize the number of qTDs.
Right.
>
>>>>>> My suggestion would be to truncate the xfr_bytes with the max
>>>>>> wMaxPacketSize (1024) and for the qtd_count use:
>>>>>>
>>>>>> if ((uint32_t)buffer & 1023)    /* wMaxPacketSize unaligned */
>>>>>>
>>>>>>        qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>>>>>>        
>>>>>>                length, (QT_BUFFER_CNT - 1) * 4096);
>>>>>>
>>>>>> else                /* wMaxPacketSize aligned */
>>>>>>
>>>>>>        qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>>>>>>        
>>>>>>                length, QT_BUFFER_CNT * 4096);
>>>>>>
>>>>>> This allows 50% of unaligned block data (512) to be
>>>>>> transferred
>>>>>> with
>>>>>> min
>>>>>> qTDs.
>>>>> That would also require a realignment-to-page stage. This is
>>>>> specific code for
>>>>> specific buffer alignment from the upper layers. We could also
>>>>> skip
>>>>> the
>>>>> realignment to page and always keep the same qTD transfer size
>>>>> except for the
>>>>> last one, by adding as many packets as possible for the buffer
>>>>> alignment.
>>>> What you mean by realignment-to-page stage?
>>> I mean that the alignment of the transfer to 1024 instead of 4096
>>> can make
>>> the first qTD transfer larger than the following ones, which
>>> guarantees
>>> that the following qTD transfers are page-aligned, even if the
>>> first one
>>> was only aligned to 1024. For the 1024-aligned case, this results
>>> in the
>>> change that you suggest, but it also changes things for the
>>> unaligned
>>> case, which makes this part of the code inaccurate. See below.
You are right. It maximise the first transfer. All other transfers are 5 
* 4 KB (aligned) or 4 * 4 KB (unaligned) long.
>>>>> But I still don't see a significant reason to complicate code
>>>>> to do
>>>>> that.
>>>> I don't understand where you expect to complicate the code.
>>>>
>>>> You limit the size of one transfer (xfr_bytes) to (QT_BUFFER_CNT
>>>> - 1)
>>>> *
>>>> 4kB for unaligned buffers. But you only need to limit it to a
>>>> multiple
>>>> of the maximal possible wMaxPacketSize (1kB) to make sure that
>>>> you
>>>> always send full packages.
>>>>
>>>> I only suggest to replace the causeless 4kB alignment with a
>>>> reasonable
>>>> 1kB alignment and adapte the qtd_count caluclation.
>>>>
>>>>                           int xfr_bytes = min(left_length,
>>>>                           
>>>>                                               (QT_BUFFER_CNT *
>>>>                                               4096 -
>>>>                                               
>>>>                                                ((uint32_t)buf_ptr
>>>>                                                &
>>>>                                                4095)) &
>>>>
>>>> -                                           ~4095);
>>>> +                                           ~1023);
>>> I agree for this part of the code. But for the allocation part,
>>> your
>>> suggestion is already a little bit more complicated than my
>>> original code,
>>> while still incomplete. Besides that, the "((uint32_t)buffer &
>>> 4095) +"
>>> for the page-unaligned case in my code was actually useless, which
>>> emphasizes the difference, even if it's a light complication.
>>>
>>> For the allocation part, the appropriate code for your suggestion
>>> would be:
>>>
>>> 		if ((uint32_t)buffer & 1023) /* max-wMaxPacketSize-unaligned */
>>> 			qtd_count +=
>> DIV_ROUND_UP(
>>      max(
>>          length > 0,
>>          length - (4096 - ((uint32_t)buf_ptr & 4095) & ~1023)
>>      ),
>>      (QT_BUFFER_CNT - 1) * 4096
>> );
>>
>> Ok, I now think I understand what's going on here. I still have to
>> wonder how
>> much would the compiler optimize of this if you "decompressed" your
>> code -- to
>> make it more easy to understand.
> I wouldn't go for this complicated version since it's not really useful compared
> to the simpler yet less accurate solution I gave below.
You are right, but your complicate code only saves one qTD.
>
>>> 		else			     /* max-wMaxPacketSize-aligned */
>>> 			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
>>> 					length, QT_BUFFER_CNT * 4096);
>>>
>>> This code allocates exactly the required number of qTDs, no less,
>>> no more.
>>> It's clearly more complicated than the 4096 version.
>>>
>>> A test should also be added to make sure that qtd_count is never 0.
>>> Otherwise, ZLPs are broken (this applies to my original code too).
>>>
>>> If we want to compromise accuracy for simplicity, we can change
>>> that to:
>>>
>>> 		qtd_count += 2 + length /
>>> 			((QT_BUFFER_CNT - !!((uint32_t)buffer & 1023)) * 4096);
> It's this solution I'd like to use to optimize the number of qTDs (with 1023 or
> something else).
This sounds reasonable.

>
>>> This code allocates enough qTDs for all cases, with at worst 2
>>> extra qTDs
>>> (i.e. a total of 128 bytes) that will be left unused. It also
>>> handles
>>> intrinsically ZLPs.
>>>
>>> Now, let's consider the possible values of wMaxPacketSize:
>>>   - control endpoints:
>>>      * LS: 8,
>>>      * FS: 8, 16, 32 or 64,
>>>      * HS: 64,
>>>   - isochronous endpoints: not supported by ehci-hcd.c,
>>>   - interrupt endpoints:
>>>      * LS: <= 8,
>>>      * FS: <= 64,
>>>      * HS: <= 1024 (1x to 3x for high-bandwidth),
>>>   - bulk endpoints:
>>>      * LS: N/A,
>>>      * FS: 8, 16, 32 or 64,
>>>      * HS: 512.
>>>
>>> My code assumes that wMaxPacketSize is a power of 2. This is not
>>> always
>>> true for interrupt endpoints. Let's talk about these. Their
>>> handling is
>>> currently broken in U-Boot since their transfers are made
>>> asynchronous
>>> instead of periodic. Devices shouldn't care too much about that, as
>>> long
>>> as transfers do not exceed wMaxPacketSize, in which case my code
>>> still
>>> works because wMaxPacketSize always fits in a single qTD. Interrupt
>>> transfers larger than wMaxPacketSize do not seem to be used by
>>> U-Boot. If
>>> they were used, the current code in U-Boot would have a timing
>>> issue
>>> because the asynchronous scheme would break the interval requested
>>> by
>>> devices, which could at worst make them fail in some way. So the
>>> only
>>> solution would be that such transfers be split by the caller of
>>> submit_int_msg, in which case my code still works. What would you
>>> think
>>> about failing with an error message in submit_int_msg if length is
>>> larger
>>> than wMaxPacketSize? Marek, what do you think?
>> Let's do that ... I think the interrupt endpoint is only used for
>> keyboard and
>> if someone needs it for something else, the code will be there, just
>> needing
>> improvement. Comment and error message are OK.
> OK. I have thought of another solution for this. You'll tell me which one you
> prefer.
>
> The ehci_submit_async code currently in U-Boot checks through ehci_td_buffer
> that length fits in the single qTD reserved for data payload only after work has
> begun, possibly after a SETUP transfer. With my series, this is checked at the
> very beginning, before the allocation. We could detect that wMaxPacketSize is
> not a power of 2 (e.g. with __builtin_popcount), in which case the allocation
> for the data payload would be restricted to 1 qTD like now, and there would be
> a check at the very beginning to test if length fits in this qTD. In that way,
> there could be several packets per interrupt transfer as long as it fits in a
> single qTD, just like now, contrary to the limitation imposed by the error in
> submit_int_msg. But I'm not sure it's a good idea to allow this behavior.
I think this is not needed, as there is only one user (keyboard) with 
max size of 8 byte.
>
>>> For all other cases, wMaxPacketSize is a power of 2, so everything
>>> is fine,
>>> except that in those cases wMaxPacketSize is at most 512, which
>>> means that
>>> with the suggested limitation applied to submit_int_msg, your
>>> suggested
>>> 1024 could be replaced with 512, which is good news since this is
>>> also the
>>> most common storage sector size.
>>>
>>> We could even use usb_maxpacket(dev, pipe) instead of 512, with
>>> some
>>> restrictions. If we don't want to alter the misalignment check in
>>> ehci_td_buffer, max(usb_maxpacket(dev, pipe), ARCH_DMA_MINALIGN)
>>> would
>>> actually have to be used. This misalignment check could be limited
>>> to the
>>> first qTD transfer of a USB transfer, but that would complicate
>>> things,
>>> all the more the corresponding call to flush_dcache_range would
>>> have to be
>>> modified to fix alignments.
>>>
>>> So we have to make two choices:
>>>   - between 4096, 1024, 512 and max(usb_maxpacket(dev, pipe),
>>> ARCH_DMA_MINALIGN), - between the accurate and simple allocations.
>>> That makes a total of 8 working possibilities. What do you guys
>>> think we
>>> should choose? On my side, I like max(usb_maxpacket(dev, pipe),
>>> ARCH_DMA_MINALIGN)
>> Won't maxpacket fall below 512 on occasions,
> Sure.
I would go with 512 as it also the most common storage sector size.
>> which might cause
>> trouble?
> Why?
>
>>> with the simple allocation. It's efficient as to code
>>> speed, size and readability, as well as to RAM usage.
>> For now, I'd go for the safest, easiest and dumbest solution and see
>> how it
>> fares. Subsequent patch can be submitted to improve that and
>> measurements made.
>>
>> "We should forget about small efficiencies, say about 97% of the
>> time; premature
>> optimization is the root of all evil"
>>              -- Donald E. Knuth, Structured Programming with go to
>>              Statements
>> [...]
> OK, so I'll stick to my original series, rework it lightly as we said, add Jim's
> patch, and add a further patch for these optimizations.
Okay.
>>> So we could perhaps issue a #error in ehci-hcd or in usb_storage if
>>> CONFIG_SYS_MALLOC_LEN is not large enough, but I don't think it's a
>>> good
>>> idea because:
>>>   - the threshold value would have to depend on runtime block sizes
>>>   or
>>> something, which could lead to a big worst worst case that would
>>> almost
>>> never happen in real life, so giving such an unrealistic heap size
>>> constraint would be cumbersome,
>> #warning then?
> With which limit if so?
I would expect more than 128kB as this is a common worst case (512 B 
block size).

>
>>>   - reaching the top sizes would mean reading a huge file or
>>>   something to a
>>> large buffer (much larger than the qTDs this transfer requires),
>>> which
>>> would very likely be heap-allocated (except for commands like
>>> fatload), so
>>> CONFIG_SYS_MALLOC_LEN would already have to be large for the
>>> application,
>>> - for command line operations like fatload, transfers of
>>> uncontrolled
>>> lengths could simply crash U-Boot if they go too far in memory
>> Why, because they overwrite it?
> Yes. U-Boot expands down its allocation during startup, so it's often located at
> the end of the embedded RAM, which means that fatload will very likely use the
> beginning of the RAM.
>
>>> , which
>>> means that users of such commands need to know what they are doing
>>> anyway,
>>> so they have to control transfer sizes,
>>>   - there is already a runtime error displayed in case of allocation
>>> failure.
>> Ok
> So #warning or not besides this?
>
>>> Marek, what do you think?
>> Had a good evening with the EHCI r10 spec, hope I answered most of
>> your
>> questions.
> Yes, thanks.
Regards,
     Stefan
Stefan Herbrechtsmeier - July 31, 2012, 8:01 p.m.
Dear Benoît

Am 29.07.2012 02:48, schrieb Benoît Thébaudeau:
> Sorry for the delay. I'm very busy, and there is much to tell on this topic.
No problem. Hopefully I don't make you to much extra trouble.

>>> BTW, the 15x speed gain that I gave in my patch description was
>>> compared to an
>>> older version of the original code that used 20 blocks per transfer
>>> in
>>> usb_storage.c. This is now 40 blocks per transfer with a
>>> page-aligned buffer, so
>>> the speed gain compared to the current code should be rather about
>>> 7x. I should
>>> update that.
>> I'm sure that there is a significant speed gain but you shouldn't
>> miss
>> the heap usage as the main CONFIG_SYS_MALLOC_LEN is 128kB.
> I have checked all the config files. Among those using EHCI, most have a heap
> size >= 1 MiB. The only exceptions are:
>
> ---------------------------------------------
> | Heap Size |      Board       |  RAM Size  |
> |-------------------------------------------|
> |   512 kiB | MERGERBOX        |   > 10 MiB |
> |           | MPC8315ERDB      |    128 MiB |
> |           | MVBLM7           |    512 MiB |
> |-------------------------------------------|
> |   256 kiB | MPC8349ITX       |    256 MiB |
> |           | omap4_panda      |      1 GiB |
> |-------------------------------------------|
> |   128 kiB | adp-ag102        |    256 MiB |
> |           | at91sam9m10g45ek |    128 MiB |
> |           | edminiv2         |     64 MiB |
> |           | M52277EVB        |     64 MiB |
> |           | omap3_beagle     | >= 128 MiB |
> ---------------------------------------------
>
> As you can see, these small heap sizes are not linked to any hardware
> constraint, but only to the lack of need to have larger heaps, so they could be
> easily enlarged if needed. But even 128 kiB should be enough for common usage.
You are right.
>
>> Maybe you should also add a worst case heap usage and I'm not sure,
>> if
>> your calculation are right, as the size of struct qTD is allays 32B
>> and
>> thereby I get 50kB or 64kB.
> >From ehci.h:
> struct qTD {
> 	/* this part defined by EHCI spec */
> 	uint32_t qt_next;		/* see EHCI 3.5.1 */
> #define	QT_NEXT_TERMINATE	1
> 	uint32_t qt_altnext;		/* see EHCI 3.5.2 */
> 	uint32_t qt_token;		/* see EHCI 3.5.3 */
> 	uint32_t qt_buffer[5];		/* see EHCI 3.5.4 */
> 	uint32_t qt_buffer_hi[5];	/* Appendix B */
> 	/* pad struct for 32 byte alignment */
> 	uint32_t unused[3];
> };
>
> So sizeof(struct qTD) is 16 * 32 bits = 64 bytes. For the worst alignment case,
> the number of qTDs to allocate for 65535 blocks of 512 bytes (worst MSC case
> with 512-byte sectors) is DIV_ROUND_UP(65535 * 512, 4 * 4096) = 2048 qTDs, i.e.
> 128 kiB. For the same transfer size with the best alignment case, it is
> DIV_ROUND_UP(65535 * 512, 5 * 4096) = 1639 qTDs, i.e. 104896 B.
Sorry, you are right. I had missed the arrays.

Regards,
     Stefan
Marek Vasut - Aug. 1, 2012, 2:41 a.m.
Dear Stefan Herbrechtsmeier,

[...]

> >>> What do you mean by "partial USB transfers"? As seen from EHCI
> >>> users like
> >>> the MSC driver (usb_storage.c), USB transfers either succeed or
> >>> fail, but
> >>> they cannot be "segmented".
> >> 
> >> Segmented -- like multiple transfers being issues with small payload?
> 
> Right.
> 
> >> You can
> >> not put these together at the USB-level, since it's the issuing code
> >> that has to
> >> be fixed.
> 
> If the segmentation comes from the file system handling we can not avoid
> this.

If the FS code is shitty or the device is fragmented, noone can help that.

[...]

> >>> My code assumes that wMaxPacketSize is a power of 2. This is not
> >>> always
> >>> true for interrupt endpoints. Let's talk about these. Their
> >>> handling is
> >>> currently broken in U-Boot since their transfers are made
> >>> asynchronous
> >>> instead of periodic. Devices shouldn't care too much about that, as
> >>> long
> >>> as transfers do not exceed wMaxPacketSize, in which case my code
> >>> still
> >>> works because wMaxPacketSize always fits in a single qTD. Interrupt
> >>> transfers larger than wMaxPacketSize do not seem to be used by
> >>> U-Boot. If
> >>> they were used, the current code in U-Boot would have a timing
> >>> issue
> >>> because the asynchronous scheme would break the interval requested
> >>> by
> >>> devices, which could at worst make them fail in some way. So the
> >>> only
> >>> solution would be that such transfers be split by the caller of
> >>> submit_int_msg, in which case my code still works. What would you
> >>> think
> >>> about failing with an error message in submit_int_msg if length is
> >>> larger
> >>> than wMaxPacketSize? Marek, what do you think?
> >> 
> >> Let's do that ... I think the interrupt endpoint is only used for
> >> keyboard and
> >> if someone needs it for something else, the code will be there, just
> >> needing
> >> improvement. Comment and error message are OK.
> > 
> > OK. I have thought of another solution for this. You'll tell me which one
> > you prefer.
> > 
> > The ehci_submit_async code currently in U-Boot checks through
> > ehci_td_buffer that length fits in the single qTD reserved for data
> > payload only after work has begun, possibly after a SETUP transfer. With
> > my series, this is checked at the very beginning, before the allocation.
> > We could detect that wMaxPacketSize is not a power of 2 (e.g. with
> > __builtin_popcount), in which case the allocation for the data payload
> > would be restricted to 1 qTD like now, and there would be a check at the
> > very beginning to test if length fits in this qTD. In that way, there
> > could be several packets per interrupt transfer as long as it fits in a
> > single qTD, just like now, contrary to the limitation imposed by the
> > error in submit_int_msg. But I'm not sure it's a good idea to allow this
> > behavior.
> 
> I think this is not needed, as there is only one user (keyboard) with
> max size of 8 byte.

I agree, but for a different reason. Let's aim for a simple implementation first 
that doesn't change behavior and add this change later _if_ needed.

[...]

> >>> So we could perhaps issue a #error in ehci-hcd or in usb_storage if
> >>> CONFIG_SYS_MALLOC_LEN is not large enough, but I don't think it's a
> >>> good
> >>> 
> >>> idea because:
> >>>   - the threshold value would have to depend on runtime block sizes
> >>>   or
> >>> 
> >>> something, which could lead to a big worst worst case that would
> >>> almost
> >>> never happen in real life, so giving such an unrealistic heap size
> >>> constraint would be cumbersome,
> >> 
> >> #warning then?
> > 
> > With which limit if so?
> 
> I would expect more than 128kB as this is a common worst case (512 B
> block size).

Seems to be reasonable.

[..]
Benoît Thébaudeau - Aug. 3, 2012, 11:02 p.m.
Dear Marek Vasut,

On Tue, Jul 31, 2012 at 03:06:00 AM, Benoît Thébaudeau wrote:
> > > Marek, what do you think?
> > 
> > Had a good evening with the EHCI r10 spec, hope I answered most of
> > your
> > questions.
> 
> Yes, thanks.

Sorry again for the delay. I still have a few urgent issues to address (caused
by U-Boot :( ) in priority. Hopefully, next week will be the good one. I now
have tens of new patches to post :) , not all for you ;) .

Best regards,
Benoît
Marek Vasut - Aug. 4, 2012, 7:45 a.m.
Dear Benoît Thébaudeau,

> Dear Marek Vasut,
> 
> On Tue, Jul 31, 2012 at 03:06:00 AM, Benoît Thébaudeau wrote:
> > > > Marek, what do you think?
> > > 
> > > Had a good evening with the EHCI r10 spec, hope I answered most of
> > > your
> > > questions.
> > 
> > Yes, thanks.
> 
> Sorry again for the delay. I still have a few urgent issues to address
> (caused by U-Boot :( ) in priority. Hopefully, next week will be the good
> one. I now have tens of new patches to post :) , not all for you ;) .

I'm myself really juiced out, so take your time ;-)

> Best regards,
> Benoît

Best regards,
Marek Vasut
Benoît Thébaudeau - Aug. 8, 2012, 11:14 p.m.
Dear Marek Vasut,

On Sat, Aug 4, 2012 at 09:45:33 AM, Marek Vasut wrote:
> > On Tue, Jul 31, 2012 at 03:06:00 AM, Benoît Thébaudeau wrote:
> > > > > Marek, what do you think?
> > > > 
> > > > Had a good evening with the EHCI r10 spec, hope I answered most
> > > > of
> > > > your
> > > > questions.
> > > 
> > > Yes, thanks.
> > 
> > Sorry again for the delay. I still have a few urgent issues to
> > address
> > (caused by U-Boot :( ) in priority. Hopefully, next week will be
> > the good
> > one. I now have tens of new patches to post :) , not all for you ;)
> > .
> 
> I'm myself really juiced out, so take your time ;-)

I was close to posting it tonight. I have only a few comments and tests left to
do. That will be for tomorrow.

How will you handle the merge with http://patchwork.ozlabs.org/patch/175887/ ?
Do you want to apply it before or after my series? If before, do you want me to
factor my series as if this patch had been applied?

Best regards,
Benoît
Marek Vasut - Aug. 8, 2012, 11:14 p.m.
Dear Benoît Thébaudeau,

> Dear Marek Vasut,
> 
> On Sat, Aug 4, 2012 at 09:45:33 AM, Marek Vasut wrote:
> > > On Tue, Jul 31, 2012 at 03:06:00 AM, Benoît Thébaudeau wrote:
> > > > > > Marek, what do you think?
> > > > > 
> > > > > Had a good evening with the EHCI r10 spec, hope I answered most
> > > > > of
> > > > > your
> > > > > questions.
> > > > 
> > > > Yes, thanks.
> > > 
> > > Sorry again for the delay. I still have a few urgent issues to
> > > address
> > > (caused by U-Boot :( ) in priority. Hopefully, next week will be
> > > the good
> > > one. I now have tens of new patches to post :) , not all for you ;)
> > > .
> > 
> > I'm myself really juiced out, so take your time ;-)
> 
> I was close to posting it tonight. I have only a few comments and tests
> left to do. That will be for tomorrow.
> 
> How will you handle the merge with
> http://patchwork.ozlabs.org/patch/175887/ ?
> Do you want to apply it before
> or after my series? If before, do you want me to factor my series as if
> this patch had been applied?

After, your series seems to be ready for application. Can you rebase your series 
on current u-boot-usb/master btw ?

> Best regards,
> Benoît

Best regards,
Marek Vasut

Patch

diff --git u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
index 5b3b906..cf9ab92 100644
--- u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
+++ u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
@@ -208,7 +208,8 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		   int length, struct devrequest *req)
 {
 	ALLOC_ALIGN_BUFFER(struct QH, qh, 1, USB_DMA_MINALIGN);
-	ALLOC_ALIGN_BUFFER(struct qTD, qtd, 3, USB_DMA_MINALIGN);
+	struct qTD *qtd;
+	int qtd_count = 0;
 	int qtd_counter = 0;
 
 	volatile struct qTD *vtd;
@@ -229,8 +230,23 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		      le16_to_cpu(req->value), le16_to_cpu(req->value),
 		      le16_to_cpu(req->index));
 
+	if (req != NULL)			/* SETUP + ACK */
+		qtd_count += 1 + 1;
+	if (length > 0 || req == NULL) {	/* buffer */
+		if ((uint32_t)buffer & 4095)		/* page-unaligned */
+			qtd_count += DIV_ROUND_UP(((uint32_t)buffer & 4095) +
+					length, (QT_BUFFER_CNT - 1) * 4096);
+		else					/* page-aligned */
+			qtd_count += DIV_ROUND_UP(length, QT_BUFFER_CNT * 4096);
+	}
+	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
+	if (qtd == NULL) {
+		printf("unable to allocate TDs\n");
+		return -1;
+	}
+
 	memset(qh, 0, sizeof(struct QH));
-	memset(qtd, 0, 3 * sizeof(*qtd));
+	memset(qtd, 0, qtd_count * sizeof(*qtd));
 
 	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
 
@@ -291,31 +307,46 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	}
 
 	if (length > 0 || req == NULL) {
-		/*
-		 * Setup request qTD (3.5 in ehci-r10.pdf)
-		 *
-		 *   qt_next ................ 03-00 H
-		 *   qt_altnext ............. 07-04 H
-		 *   qt_token ............... 0B-08 H
-		 *
-		 *   [ buffer, buffer_hi ] loaded with "buffer".
-		 */
-		qtd[qtd_counter].qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
-		qtd[qtd_counter].qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
-		token = (toggle << 31) |
-		    (length << 16) |
-		    ((req == NULL ? 1 : 0) << 15) |
-		    (0 << 12) |
-		    (3 << 10) |
-		    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
-		qtd[qtd_counter].qt_token = cpu_to_hc32(token);
-		if (ehci_td_buffer(&qtd[qtd_counter], buffer, length) != 0) {
-			printf("unable construct DATA td\n");
-			goto fail;
-		}
-		/* Update previous qTD! */
-		*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
-		tdp = &qtd[qtd_counter++].qt_next;
+		uint8_t *buf_ptr = buffer;
+		int left_length = length;
+
+		do {
+			int xfr_bytes = min(left_length,
+					    (QT_BUFFER_CNT * 4096 -
+					     ((uint32_t)buf_ptr & 4095)) &
+					    ~4095);
+
+			/*
+			 * Setup request qTD (3.5 in ehci-r10.pdf)
+			 *
+			 *   qt_next ................ 03-00 H
+			 *   qt_altnext ............. 07-04 H
+			 *   qt_token ............... 0B-08 H
+			 *
+			 *   [ buffer, buffer_hi ] loaded with "buffer".
+			 */
+			qtd[qtd_counter].qt_next =
+					cpu_to_hc32(QT_NEXT_TERMINATE);
+			qtd[qtd_counter].qt_altnext =
+					cpu_to_hc32(QT_NEXT_TERMINATE);
+			token = (toggle << 31) |
+			    (xfr_bytes << 16) |
+			    ((req == NULL ? 1 : 0) << 15) |
+			    (0 << 12) |
+			    (3 << 10) |
+			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
+			qtd[qtd_counter].qt_token = cpu_to_hc32(token);
+			if (ehci_td_buffer(&qtd[qtd_counter], buf_ptr,
+						xfr_bytes) != 0) {
+				printf("unable construct DATA td\n");
+				goto fail;
+			}
+			/* Update previous qTD! */
+			*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
+			tdp = &qtd[qtd_counter++].qt_next;
+			buf_ptr += xfr_bytes;
+			left_length -= xfr_bytes;
+		} while (left_length > 0);
 	}
 
 	if (req != NULL) {
@@ -346,7 +377,8 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	flush_dcache_range((uint32_t)qh_list,
 		ALIGN_END_ADDR(struct QH, qh_list, 1));
 	flush_dcache_range((uint32_t)qh, ALIGN_END_ADDR(struct QH, qh, 1));
-	flush_dcache_range((uint32_t)qtd, ALIGN_END_ADDR(struct qTD, qtd, 3));
+	flush_dcache_range((uint32_t)qtd,
+			   ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
 
 	/* Set async. queue head pointer. */
 	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
@@ -377,7 +409,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		invalidate_dcache_range((uint32_t)qh,
 			ALIGN_END_ADDR(struct QH, qh, 1));
 		invalidate_dcache_range((uint32_t)qtd,
-			ALIGN_END_ADDR(struct qTD, qtd, 3));
+			ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
 
 		token = hc32_to_cpu(vtd->qt_token);
 		if (!(token & 0x80))
@@ -450,9 +482,11 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		      ehci_readl(&hcor->or_portsc[1]));
 	}
 
+	free(qtd);
 	return (dev->status != USB_ST_NOT_PROC) ? 0 : -1;
 
 fail:
+	free(qtd);
 	return -1;
 }