diff mbox

[U-Boot,2/5] ehci-hcd: Boost transfer speed

Message ID 1857491712.292135.1342729024425.JavaMail.root@advansee.com
State Superseded
Headers show

Commit Message

Benoît Thébaudeau July 19, 2012, 8:17 p.m. UTC
This patch takes advantage of the hardware EHCI qTD queuing mechanism to avoid
software overhead and to make transfers as fast as possible.

The only drawback is a call to memalign. However, this is fast compared to the
transfer timings, and the heap size to allocate is small, e.g. a little bit more
than 100 kB for a transfer length of 65535 packets of 512 bytes.

Tested on i.MX25 and i.MX35. In my test conditions, the speedup was about 15x
using page-aligned buffers, which is really appreciable when accessing large
files.

Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
Cc: Marek Vasut <marex@denx.de>
Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
---
 .../drivers/usb/host/ehci-hcd.c                    |   94 ++++++++++++++------
 1 file changed, 65 insertions(+), 29 deletions(-)

Comments

Marek Vasut July 27, 2012, 12:54 p.m. UTC | #1
Dear Benoît Thébaudeau,

> This patch takes advantage of the hardware EHCI qTD queuing mechanism to
> avoid software overhead and to make transfers as fast as possible.
> 
> The only drawback is a call to memalign. However, this is fast compared to
> the transfer timings, and the heap size to allocate is small, e.g. a
> little bit more than 100 kB for a transfer length of 65535 packets of 512
> bytes.
> 
> Tested on i.MX25 and i.MX35. In my test conditions, the speedup was about
> 15x using page-aligned buffers, which is really appreciable when accessing
> large files.
> 
> Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> Cc: Marek Vasut <marex@denx.de>
> Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> ---
>  .../drivers/usb/host/ehci-hcd.c                    |   94
> ++++++++++++++------ 1 file changed, 65 insertions(+), 29 deletions(-)
> 
> diff --git u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
> u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c index 5b3b906..b5645fa
> 100644
> --- u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
> +++ u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
> @@ -208,7 +208,8 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, int length, struct devrequest *req)
>  {
>  	ALLOC_ALIGN_BUFFER(struct QH, qh, 1, USB_DMA_MINALIGN);
> -	ALLOC_ALIGN_BUFFER(struct qTD, qtd, 3, USB_DMA_MINALIGN);
> +	struct qTD *qtd;
> +	int qtd_count = 0;
>  	int qtd_counter = 0;
> 
>  	volatile struct qTD *vtd;
> @@ -229,8 +230,25 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, le16_to_cpu(req->value), le16_to_cpu(req->value),
>  		      le16_to_cpu(req->index));
> 
> +	if (req != NULL)			/* SETUP + ACK */
> +		qtd_count += 1 + 1;
> +	if (length > 0 || req == NULL) {	/* buffer */
> +		if ((uint32_t)buffer & 4095)		/* page-unaligned */
> +			qtd_count += (((uint32_t)buffer & 4095) + length +
> +					(QT_BUFFER_CNT - 1) * 4096 - 1) /
> +						((QT_BUFFER_CNT - 1) * 4096);

Ok, maybe you can please elaborate on this crazy calculation in here? Or somehow 
clarify it? Also, won't the macros in include/common.h help in a way? (like 
ROUND() etc).

I don't really graps what you're trying to calculate here, so maybe even a 
comment would help.

> +		else					/* page-aligned */
> +			qtd_count += (length + QT_BUFFER_CNT * 4096 - 1) /
> +					(QT_BUFFER_CNT * 4096);

Same here, also please avoid using those 4096 and such constants ... maybe 
#define them in ehci.h ?

> +	}
> +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));

So your code can alloc more than 5 qTDs ? How does it chain them then? Into more 
QHs ?

> +	if (qtd == NULL) {
> +		printf("unable to allocate TDs\n");
> +		return -1;
> +	}
> +
>  	memset(qh, 0, sizeof(struct QH));
> -	memset(qtd, 0, 3 * sizeof(*qtd));
> +	memset(qtd, 0, qtd_count * sizeof(*qtd));
> 
>  	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
> 
> @@ -291,31 +309,46 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, }
> 
>  	if (length > 0 || req == NULL) {
> -		/*
> -		 * Setup request qTD (3.5 in ehci-r10.pdf)
> -		 *
> -		 *   qt_next ................ 03-00 H
> -		 *   qt_altnext ............. 07-04 H
> -		 *   qt_token ............... 0B-08 H
> -		 *
> -		 *   [ buffer, buffer_hi ] loaded with "buffer".
> -		 */
> -		qtd[qtd_counter].qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
> -		qtd[qtd_counter].qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
> -		token = (toggle << 31) |
> -		    (length << 16) |
> -		    ((req == NULL ? 1 : 0) << 15) |
> -		    (0 << 12) |
> -		    (3 << 10) |
> -		    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> -		qtd[qtd_counter].qt_token = cpu_to_hc32(token);
> -		if (ehci_td_buffer(&qtd[qtd_counter], buffer, length) != 0) {
> -			printf("unable construct DATA td\n");
> -			goto fail;
> -		}
> -		/* Update previous qTD! */
> -		*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
> -		tdp = &qtd[qtd_counter++].qt_next;
> +		uint8_t *buf_ptr = buffer;
> +		int left_length = length;
> +
> +		do {
> +			int xfr_bytes = min(left_length,
> +					    (QT_BUFFER_CNT * 4096 -
> +					     ((uint32_t)buf_ptr & 4095)) &
> +					    ~4095);

Magic formula yet again ... comment would again be welcome please.

> +			/*
> +			 * Setup request qTD (3.5 in ehci-r10.pdf)
> +			 *
> +			 *   qt_next ................ 03-00 H
> +			 *   qt_altnext ............. 07-04 H
> +			 *   qt_token ............... 0B-08 H
> +			 *
> +			 *   [ buffer, buffer_hi ] loaded with "buffer".
> +			 */
> +			qtd[qtd_counter].qt_next =
> +					cpu_to_hc32(QT_NEXT_TERMINATE);
> +			qtd[qtd_counter].qt_altnext =
> +					cpu_to_hc32(QT_NEXT_TERMINATE);
> +			token = (toggle << 31) |
> +			    (xfr_bytes << 16) |
> +			    ((req == NULL ? 1 : 0) << 15) |
> +			    (0 << 12) |
> +			    (3 << 10) |
> +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);

If you could fix all this magic afterwards (not in these patches), that'd be 
great.

> +			qtd[qtd_counter].qt_token = cpu_to_hc32(token);
> +			if (ehci_td_buffer(&qtd[qtd_counter], buf_ptr,
> +						xfr_bytes) != 0) {
> +				printf("unable construct DATA td\n");
> +				goto fail;
> +			}
> +			/* Update previous qTD! */
> +			*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
> +			tdp = &qtd[qtd_counter++].qt_next;
> +			buf_ptr += xfr_bytes;
> +			left_length -= xfr_bytes;
> +		} while (left_length > 0);
>  	}
> 
>  	if (req != NULL) {
> @@ -346,7 +379,8 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, flush_dcache_range((uint32_t)qh_list,
>  		ALIGN_END_ADDR(struct QH, qh_list, 1));
>  	flush_dcache_range((uint32_t)qh, ALIGN_END_ADDR(struct QH, qh, 1));
> -	flush_dcache_range((uint32_t)qtd, ALIGN_END_ADDR(struct qTD, qtd, 3));
> +	flush_dcache_range((uint32_t)qtd,
> +			   ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
> 
>  	/* Set async. queue head pointer. */
>  	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
> @@ -377,7 +411,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, invalidate_dcache_range((uint32_t)qh,
>  			ALIGN_END_ADDR(struct QH, qh, 1));
>  		invalidate_dcache_range((uint32_t)qtd,
> -			ALIGN_END_ADDR(struct qTD, qtd, 3));
> +			ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
> 
>  		token = hc32_to_cpu(vtd->qt_token);
>  		if (!(token & 0x80))
> @@ -450,9 +484,11 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, ehci_readl(&hcor->or_portsc[1]));
>  	}
> 
> +	free(qtd);
>  	return (dev->status != USB_ST_NOT_PROC) ? 0 : -1;
> 
>  fail:
> +	free(qtd);
>  	return -1;
>  }
Benoît Thébaudeau July 27, 2012, 1:59 p.m. UTC | #2
Dear Marek Vasut,

On Fri, Jul 27, 2012 at 02:54:07 PM, Marek Vasut wrote:
> > This patch takes advantage of the hardware EHCI qTD queuing
> > mechanism to
> > avoid software overhead and to make transfers as fast as possible.
> > 
> > The only drawback is a call to memalign. However, this is fast
> > compared to
> > the transfer timings, and the heap size to allocate is small, e.g.
> > a
> > little bit more than 100 kB for a transfer length of 65535 packets
> > of 512
> > bytes.
> > 
> > Tested on i.MX25 and i.MX35. In my test conditions, the speedup was
> > about
> > 15x using page-aligned buffers, which is really appreciable when
> > accessing
> > large files.
> > 
> > Signed-off-by: Benoît Thébaudeau <benoit.thebaudeau@advansee.com>
> > Cc: Marek Vasut <marex@denx.de>
> > Cc: Ilya Yanok <ilya.yanok@cogentembedded.com>
> > Cc: Stefan Herbrechtsmeier <stefan@herbrechtsmeier.net>
> > ---
> >  .../drivers/usb/host/ehci-hcd.c                    |   94
> > ++++++++++++++------ 1 file changed, 65 insertions(+), 29
> > deletions(-)
> > 
> > diff --git u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
> > u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c index
> > 5b3b906..b5645fa
> > 100644
> > --- u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
> > +++ u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
> > @@ -208,7 +208,8 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned long
> > pipe, void *buffer, int length, struct devrequest *req)
> >  {
> >  	ALLOC_ALIGN_BUFFER(struct QH, qh, 1, USB_DMA_MINALIGN);
> > -	ALLOC_ALIGN_BUFFER(struct qTD, qtd, 3, USB_DMA_MINALIGN);
> > +	struct qTD *qtd;
> > +	int qtd_count = 0;
> >  	int qtd_counter = 0;
> > 
> >  	volatile struct qTD *vtd;
> > @@ -229,8 +230,25 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned
> > long pipe, void *buffer, le16_to_cpu(req->value),
> > le16_to_cpu(req->value),
> >  		      le16_to_cpu(req->index));
> > 
> > +	if (req != NULL)			/* SETUP + ACK */
> > +		qtd_count += 1 + 1;
> > +	if (length > 0 || req == NULL) {	/* buffer */
> > +		if ((uint32_t)buffer & 4095)		/* page-unaligned */
> > +			qtd_count += (((uint32_t)buffer & 4095) + length +
> > +					(QT_BUFFER_CNT - 1) * 4096 - 1) /
> > +						((QT_BUFFER_CNT - 1) * 4096);
> 
> Ok, maybe you can please elaborate on this crazy calculation in here?
> Or somehow
> clarify it? Also, won't the macros in include/common.h help in a way?
> (like
> ROUND() etc).

I have already posted a v2 for this specific patch (only 2/5) that does exactly
that. Please review only the latest versions of patches.

> I don't really graps what you're trying to calculate here, so maybe
> even a
> comment would help.

I'll do that.

> > +		else					/* page-aligned */
> > +			qtd_count += (length + QT_BUFFER_CNT * 4096 - 1) /
> > +					(QT_BUFFER_CNT * 4096);
> 
> Same here, also please avoid using those 4096 and such constants ...
> maybe
> #define them in ehci.h ?

Yes. It was already everywhere, so I went on the same way. Do you think using
PAGE_SIZE from <linux/compat.h> would be fine since these 4096 are nothing more
than page sizes?

> > +	}
> > +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> 
> So your code can alloc more than 5 qTDs ? How does it chain them
> then? Into more
> QHs ?

It's done in exactly the same way as for the original 3 qTDs, only with more
qTDs, but still with 5 qt_buffers per qTD.

> > +	if (qtd == NULL) {
> > +		printf("unable to allocate TDs\n");
> > +		return -1;
> > +	}
> > +
> >  	memset(qh, 0, sizeof(struct QH));
> > -	memset(qtd, 0, 3 * sizeof(*qtd));
> > +	memset(qtd, 0, qtd_count * sizeof(*qtd));
> > 
> >  	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe),
> >  	usb_pipeout(pipe));
> > 
> > @@ -291,31 +309,46 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned
> > long pipe, void *buffer, }
> > 
> >  	if (length > 0 || req == NULL) {
> > -		/*
> > -		 * Setup request qTD (3.5 in ehci-r10.pdf)
> > -		 *
> > -		 *   qt_next ................ 03-00 H
> > -		 *   qt_altnext ............. 07-04 H
> > -		 *   qt_token ............... 0B-08 H
> > -		 *
> > -		 *   [ buffer, buffer_hi ] loaded with "buffer".
> > -		 */
> > -		qtd[qtd_counter].qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
> > -		qtd[qtd_counter].qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
> > -		token = (toggle << 31) |
> > -		    (length << 16) |
> > -		    ((req == NULL ? 1 : 0) << 15) |
> > -		    (0 << 12) |
> > -		    (3 << 10) |
> > -		    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> > -		qtd[qtd_counter].qt_token = cpu_to_hc32(token);
> > -		if (ehci_td_buffer(&qtd[qtd_counter], buffer, length) != 0) {
> > -			printf("unable construct DATA td\n");
> > -			goto fail;
> > -		}
> > -		/* Update previous qTD! */
> > -		*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
> > -		tdp = &qtd[qtd_counter++].qt_next;
> > +		uint8_t *buf_ptr = buffer;
> > +		int left_length = length;
> > +
> > +		do {
> > +			int xfr_bytes = min(left_length,
> > +					    (QT_BUFFER_CNT * 4096 -
> > +					     ((uint32_t)buf_ptr & 4095)) &
> > +					    ~4095);
> 
> Magic formula yet again ... comment would again be welcome please.

OK.

> > +			/*
> > +			 * Setup request qTD (3.5 in ehci-r10.pdf)
> > +			 *
> > +			 *   qt_next ................ 03-00 H
> > +			 *   qt_altnext ............. 07-04 H
> > +			 *   qt_token ............... 0B-08 H
> > +			 *
> > +			 *   [ buffer, buffer_hi ] loaded with "buffer".
> > +			 */
> > +			qtd[qtd_counter].qt_next =
> > +					cpu_to_hc32(QT_NEXT_TERMINATE);
> > +			qtd[qtd_counter].qt_altnext =
> > +					cpu_to_hc32(QT_NEXT_TERMINATE);
> > +			token = (toggle << 31) |
> > +			    (xfr_bytes << 16) |
> > +			    ((req == NULL ? 1 : 0) << 15) |
> > +			    (0 << 12) |
> > +			    (3 << 10) |
> > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> 
> If you could fix all this magic afterwards (not in these patches),
> that'd be
> great.

Do you only mean #defining all those values?

> > +			qtd[qtd_counter].qt_token = cpu_to_hc32(token);
> > +			if (ehci_td_buffer(&qtd[qtd_counter], buf_ptr,
> > +						xfr_bytes) != 0) {
> > +				printf("unable construct DATA td\n");
> > +				goto fail;
> > +			}
> > +			/* Update previous qTD! */
> > +			*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
> > +			tdp = &qtd[qtd_counter++].qt_next;
> > +			buf_ptr += xfr_bytes;
> > +			left_length -= xfr_bytes;
> > +		} while (left_length > 0);
> >  	}
> > 
> >  	if (req != NULL) {
> > @@ -346,7 +379,8 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned long
> > pipe, void *buffer, flush_dcache_range((uint32_t)qh_list,
> >  		ALIGN_END_ADDR(struct QH, qh_list, 1));
> >  	flush_dcache_range((uint32_t)qh, ALIGN_END_ADDR(struct QH, qh,
> >  	1));
> > -	flush_dcache_range((uint32_t)qtd, ALIGN_END_ADDR(struct qTD, qtd,
> > 3));
> > +	flush_dcache_range((uint32_t)qtd,
> > +			   ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
> > 
> >  	/* Set async. queue head pointer. */
> >  	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
> > @@ -377,7 +411,7 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned long
> > pipe, void *buffer, invalidate_dcache_range((uint32_t)qh,
> >  			ALIGN_END_ADDR(struct QH, qh, 1));
> >  		invalidate_dcache_range((uint32_t)qtd,
> > -			ALIGN_END_ADDR(struct qTD, qtd, 3));
> > +			ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
> > 
> >  		token = hc32_to_cpu(vtd->qt_token);
> >  		if (!(token & 0x80))
> > @@ -450,9 +484,11 @@ ehci_submit_async(struct usb_device *dev,
> > unsigned
> > long pipe, void *buffer, ehci_readl(&hcor->or_portsc[1]));
> >  	}
> > 
> > +	free(qtd);
> >  	return (dev->status != USB_ST_NOT_PROC) ? 0 : -1;
> > 
> >  fail:
> > +	free(qtd);
> >  	return -1;
> >  }
> 

Best regards,
Benoît Thébaudeau
Marek Vasut July 27, 2012, 2:01 p.m. UTC | #3
Dear Benoît Thébaudeau,

[...]

> > Ok, maybe you can please elaborate on this crazy calculation in here?
> > Or somehow
> > clarify it? Also, won't the macros in include/common.h help in a way?
> > (like
> > ROUND() etc).
> 
> I have already posted a v2 for this specific patch (only 2/5) that does
> exactly that. Please review only the latest versions of patches.
> 
> > I don't really graps what you're trying to calculate here, so maybe
> > even a
> > comment would help.
> 
> I'll do that.
> 
> > > +		else					/* page-aligned */
> > > +			qtd_count += (length + QT_BUFFER_CNT * 4096 - 1) /
> > > +					(QT_BUFFER_CNT * 4096);
> > 
> > Same here, also please avoid using those 4096 and such constants ...
> > maybe
> > #define them in ehci.h ?
> 
> Yes. It was already everywhere, so I went on the same way.

I'm not exactly proud of the state of the usb stack, you know :(

> Do you think
> using PAGE_SIZE from <linux/compat.h> would be fine since these 4096 are
> nothing more than page sizes?

Isn't that intel-specific?

> > > +	}
> > > +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
> > 
> > So your code can alloc more than 5 qTDs ? How does it chain them
> > then? Into more
> > QHs ?
> 
> It's done in exactly the same way as for the original 3 qTDs, only with
> more qTDs, but still with 5 qt_buffers per qTD.

I'm starting to see what you're trying to do. That's really cool :)

[...]
> > > +			token = (toggle << 31) |
> > > +			    (xfr_bytes << 16) |
> > > +			    ((req == NULL ? 1 : 0) << 15) |
> > > +			    (0 << 12) |
> > > +			    (3 << 10) |
> > > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> > 
> > If you could fix all this magic afterwards (not in these patches),
> > that'd be
> > great.
> 
> Do you only mean #defining all those values?

Yes, but let's do this in a subsequent patch. It can wait for later.

[...]

> Best regards,
> Benoît Thébaudeau
Benoît Thébaudeau July 27, 2012, 2:13 p.m. UTC | #4
Dear Marek,

On Fri, Jul 27, 2012 at 04:01:11 PM, Marek Vasut wrote:
> > Do you think
> > using PAGE_SIZE from <linux/compat.h> would be fine since these
> > 4096 are
> > nothing more than page sizes?
> 
> Isn't that intel-specific?

I don't know. The code does not say so. What is sure is that page sizes should
be arch-specific, even with several possible page sizes per arch. But this
#define seems to fit our needs, so why not use it? The only thing that would
make me reluctant to using it is that this code might change without further
notice.

> > > > +	}
> > > > +	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct
> > > > qTD));
> > > 
> > > So your code can alloc more than 5 qTDs ? How does it chain them
> > > then? Into more
> > > QHs ?
> > 
> > It's done in exactly the same way as for the original 3 qTDs, only
> > with
> > more qTDs, but still with 5 qt_buffers per qTD.
> 
> I'm starting to see what you're trying to do. That's really cool :)

OK.

> [...]
> > > > +			token = (toggle << 31) |
> > > > +			    (xfr_bytes << 16) |
> > > > +			    ((req == NULL ? 1 : 0) << 15) |
> > > > +			    (0 << 12) |
> > > > +			    (3 << 10) |
> > > > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> > > 
> > > If you could fix all this magic afterwards (not in these
> > > patches),
> > > that'd be
> > > great.
> > 
> > Do you only mean #defining all those values?
> 
> Yes, but let's do this in a subsequent patch. It can wait for later.

OK.

Regards,
Benoît
Marek Vasut July 27, 2012, 2:31 p.m. UTC | #5
Dear Benoît Thébaudeau,

> Dear Marek,
> 
> On Fri, Jul 27, 2012 at 04:01:11 PM, Marek Vasut wrote:
> > > Do you think
> > > using PAGE_SIZE from <linux/compat.h> would be fine since these
> > > 4096 are
> > > nothing more than page sizes?
> > 
> > Isn't that intel-specific?
> 
> I don't know. The code does not say so. What is sure is that page sizes
> should be arch-specific, even with several possible page sizes per arch.
> But this #define seems to fit our needs, so why not use it? The only thing
> that would make me reluctant to using it is that this code might change
> without further notice.

That's exactly my point. And it'll break anything with pages smaller than 4k. 
Please define it in ehci.h
[..]
Best regards,
Marek Vasut
Benoît Thébaudeau July 29, 2012, 12:58 a.m. UTC | #6
Dear Marek,

On Fri, Jul 27, 2012 at 04:13:45 PM, Benoît Thébaudeau wrote:
> On Fri, Jul 27, 2012 at 04:01:11 PM, Marek Vasut wrote:
> > [...]
> > > > > +			token = (toggle << 31) |
> > > > > +			    (xfr_bytes << 16) |
> > > > > +			    ((req == NULL ? 1 : 0) << 15) |
> > > > > +			    (0 << 12) |
> > > > > +			    (3 << 10) |
> > > > > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
> > > > 
> > > > If you could fix all this magic afterwards (not in these
> > > > patches),
> > > > that'd be
> > > > great.
> > > 
> > > Do you only mean #defining all those values?
> > 
> > Yes, but let's do this in a subsequent patch. It can wait for
> > later.
> 
> OK.

What would you think about merging that together with the definition of 4096
into the current patch 1/5? In the next version, this patch would thus become a
general cosmetic patch for EHCI to define all used constants.

Best regards,
Benoît
Marek Vasut July 29, 2012, 1:40 a.m. UTC | #7
Dear Benoît Thébaudeau,

> Dear Marek,
> 
> On Fri, Jul 27, 2012 at 04:13:45 PM, Benoît Thébaudeau wrote:
> > On Fri, Jul 27, 2012 at 04:01:11 PM, Marek Vasut wrote:
> > > [...]
> > > 
> > > > > > +			token = (toggle << 31) |
> > > > > > +			    (xfr_bytes << 16) |
> > > > > > +			    ((req == NULL ? 1 : 0) << 15) |
> > > > > > +			    (0 << 12) |
> > > > > > +			    (3 << 10) |
> > > > > > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 
0);
> > > > > 
> > > > > If you could fix all this magic afterwards (not in these
> > > > > patches),
> > > > > that'd be
> > > > > great.
> > > > 
> > > > Do you only mean #defining all those values?
> > > 
> > > Yes, but let's do this in a subsequent patch. It can wait for
> > > later.
> > 
> > OK.
> 
> What would you think about merging that together with the definition of
> 4096 into the current patch 1/5? In the next version, this patch would
> thus become a general cosmetic patch for EHCI to define all used
> constants.

That's all right with me.

> Best regards,
> Benoît

Best regards,
Marek Vasut
Benoît Thébaudeau July 29, 2012, 2:14 p.m. UTC | #8
Dear Marek Vasut,

On Sun, Jul 29, 2012 at 03:40:32 AM, Marek Vasut wrote:
> > On Fri, Jul 27, 2012 at 04:13:45 PM, Benoît Thébaudeau wrote:
> > > On Fri, Jul 27, 2012 at 04:01:11 PM, Marek Vasut wrote:
> > > > [...]
> > > > 
> > > > > > > +			token = (toggle << 31) |
> > > > > > > +			    (xfr_bytes << 16) |
> > > > > > > +			    ((req == NULL ? 1 : 0) << 15) |
> > > > > > > +			    (0 << 12) |
> > > > > > > +			    (3 << 10) |
> > > > > > > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 <<
> 0);
> > > > > > 
> > > > > > If you could fix all this magic afterwards (not in these
> > > > > > patches),
> > > > > > that'd be
> > > > > > great.
> > > > > 
> > > > > Do you only mean #defining all those values?
> > > > 
> > > > Yes, but let's do this in a subsequent patch. It can wait for
> > > > later.
> > > 
> > > OK.
> > 
> > What would you think about merging that together with the
> > definition of
> > 4096 into the current patch 1/5? In the next version, this patch
> > would
> > thus become a general cosmetic patch for EHCI to define all used
> > constants.
> 
> That's all right with me.

Great. There are also questions for you in my answer to Stefan. Please take a
look at these. I know, it's long to read, sorry.

Best regards,
Benoît
Marek Vasut July 29, 2012, 6:08 p.m. UTC | #9
Dear Benoît Thébaudeau,

> Dear Marek Vasut,
> 
> On Sun, Jul 29, 2012 at 03:40:32 AM, Marek Vasut wrote:
> > > On Fri, Jul 27, 2012 at 04:13:45 PM, Benoît Thébaudeau wrote:
> > > > On Fri, Jul 27, 2012 at 04:01:11 PM, Marek Vasut wrote:
> > > > > [...]
> > > > > 
> > > > > > > > +			token = (toggle << 31) |
> > > > > > > > +			    (xfr_bytes << 16) |
> > > > > > > > +			    ((req == NULL ? 1 : 0) << 15) |
> > > > > > > > +			    (0 << 12) |
> > > > > > > > +			    (3 << 10) |
> > > > > > > > +			    ((usb_pipein(pipe) ? 1 : 0) << 8) | 
(0x80 <<
> > 
> > 0);
> > 
> > > > > > > If you could fix all this magic afterwards (not in these
> > > > > > > patches),
> > > > > > > that'd be
> > > > > > > great.
> > > > > > 
> > > > > > Do you only mean #defining all those values?
> > > > > 
> > > > > Yes, but let's do this in a subsequent patch. It can wait for
> > > > > later.
> > > > 
> > > > OK.
> > > 
> > > What would you think about merging that together with the
> > > definition of
> > > 4096 into the current patch 1/5? In the next version, this patch
> > > would
> > > thus become a general cosmetic patch for EHCI to define all used
> > > constants.
> > 
> > That's all right with me.
> 
> Great. There are also questions for you in my answer to Stefan. Please take
> a look at these. I know, it's long to read, sorry.

I'm still trying to make some sense of it, gimme one more day please.

> Best regards,
> Benoît

Best regards,
Marek Vasut
diff mbox

Patch

diff --git u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
index 5b3b906..b5645fa 100644
--- u-boot-usb-1b4bd0e.orig/drivers/usb/host/ehci-hcd.c
+++ u-boot-usb-1b4bd0e/drivers/usb/host/ehci-hcd.c
@@ -208,7 +208,8 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		   int length, struct devrequest *req)
 {
 	ALLOC_ALIGN_BUFFER(struct QH, qh, 1, USB_DMA_MINALIGN);
-	ALLOC_ALIGN_BUFFER(struct qTD, qtd, 3, USB_DMA_MINALIGN);
+	struct qTD *qtd;
+	int qtd_count = 0;
 	int qtd_counter = 0;
 
 	volatile struct qTD *vtd;
@@ -229,8 +230,25 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		      le16_to_cpu(req->value), le16_to_cpu(req->value),
 		      le16_to_cpu(req->index));
 
+	if (req != NULL)			/* SETUP + ACK */
+		qtd_count += 1 + 1;
+	if (length > 0 || req == NULL) {	/* buffer */
+		if ((uint32_t)buffer & 4095)		/* page-unaligned */
+			qtd_count += (((uint32_t)buffer & 4095) + length +
+					(QT_BUFFER_CNT - 1) * 4096 - 1) /
+						((QT_BUFFER_CNT - 1) * 4096);
+		else					/* page-aligned */
+			qtd_count += (length + QT_BUFFER_CNT * 4096 - 1) /
+					(QT_BUFFER_CNT * 4096);
+	}
+	qtd = memalign(USB_DMA_MINALIGN, qtd_count * sizeof(struct qTD));
+	if (qtd == NULL) {
+		printf("unable to allocate TDs\n");
+		return -1;
+	}
+
 	memset(qh, 0, sizeof(struct QH));
-	memset(qtd, 0, 3 * sizeof(*qtd));
+	memset(qtd, 0, qtd_count * sizeof(*qtd));
 
 	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
 
@@ -291,31 +309,46 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	}
 
 	if (length > 0 || req == NULL) {
-		/*
-		 * Setup request qTD (3.5 in ehci-r10.pdf)
-		 *
-		 *   qt_next ................ 03-00 H
-		 *   qt_altnext ............. 07-04 H
-		 *   qt_token ............... 0B-08 H
-		 *
-		 *   [ buffer, buffer_hi ] loaded with "buffer".
-		 */
-		qtd[qtd_counter].qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
-		qtd[qtd_counter].qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
-		token = (toggle << 31) |
-		    (length << 16) |
-		    ((req == NULL ? 1 : 0) << 15) |
-		    (0 << 12) |
-		    (3 << 10) |
-		    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
-		qtd[qtd_counter].qt_token = cpu_to_hc32(token);
-		if (ehci_td_buffer(&qtd[qtd_counter], buffer, length) != 0) {
-			printf("unable construct DATA td\n");
-			goto fail;
-		}
-		/* Update previous qTD! */
-		*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
-		tdp = &qtd[qtd_counter++].qt_next;
+		uint8_t *buf_ptr = buffer;
+		int left_length = length;
+
+		do {
+			int xfr_bytes = min(left_length,
+					    (QT_BUFFER_CNT * 4096 -
+					     ((uint32_t)buf_ptr & 4095)) &
+					    ~4095);
+
+			/*
+			 * Setup request qTD (3.5 in ehci-r10.pdf)
+			 *
+			 *   qt_next ................ 03-00 H
+			 *   qt_altnext ............. 07-04 H
+			 *   qt_token ............... 0B-08 H
+			 *
+			 *   [ buffer, buffer_hi ] loaded with "buffer".
+			 */
+			qtd[qtd_counter].qt_next =
+					cpu_to_hc32(QT_NEXT_TERMINATE);
+			qtd[qtd_counter].qt_altnext =
+					cpu_to_hc32(QT_NEXT_TERMINATE);
+			token = (toggle << 31) |
+			    (xfr_bytes << 16) |
+			    ((req == NULL ? 1 : 0) << 15) |
+			    (0 << 12) |
+			    (3 << 10) |
+			    ((usb_pipein(pipe) ? 1 : 0) << 8) | (0x80 << 0);
+			qtd[qtd_counter].qt_token = cpu_to_hc32(token);
+			if (ehci_td_buffer(&qtd[qtd_counter], buf_ptr,
+						xfr_bytes) != 0) {
+				printf("unable construct DATA td\n");
+				goto fail;
+			}
+			/* Update previous qTD! */
+			*tdp = cpu_to_hc32((uint32_t)&qtd[qtd_counter]);
+			tdp = &qtd[qtd_counter++].qt_next;
+			buf_ptr += xfr_bytes;
+			left_length -= xfr_bytes;
+		} while (left_length > 0);
 	}
 
 	if (req != NULL) {
@@ -346,7 +379,8 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	flush_dcache_range((uint32_t)qh_list,
 		ALIGN_END_ADDR(struct QH, qh_list, 1));
 	flush_dcache_range((uint32_t)qh, ALIGN_END_ADDR(struct QH, qh, 1));
-	flush_dcache_range((uint32_t)qtd, ALIGN_END_ADDR(struct qTD, qtd, 3));
+	flush_dcache_range((uint32_t)qtd,
+			   ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
 
 	/* Set async. queue head pointer. */
 	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
@@ -377,7 +411,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		invalidate_dcache_range((uint32_t)qh,
 			ALIGN_END_ADDR(struct QH, qh, 1));
 		invalidate_dcache_range((uint32_t)qtd,
-			ALIGN_END_ADDR(struct qTD, qtd, 3));
+			ALIGN_END_ADDR(struct qTD, qtd, qtd_count));
 
 		token = hc32_to_cpu(vtd->qt_token);
 		if (!(token & 0x80))
@@ -450,9 +484,11 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		      ehci_readl(&hcor->or_portsc[1]));
 	}
 
+	free(qtd);
 	return (dev->status != USB_ST_NOT_PROC) ? 0 : -1;
 
 fail:
+	free(qtd);
 	return -1;
 }