diff mbox

[U-Boot] ehci-hcd.c, musb_core, usb.h: Add USB_DMA_MINALIGN define for cache alignment

Message ID 1341407039-6018-1-git-send-email-ilya.yanok@cogentembedded.com
State Superseded
Delegated to: Marek Vasut
Headers show

Commit Message

Ilya Yanok July 4, 2012, 1:03 p.m. UTC
From: Tom Rini <trini@ti.com>

The USB spec says that 32 bytes is the minimum required alignment.
However on some platforms we have a larger minimum requirement for cache
coherency.  In those cases, use that value rather than the USB spec
minimum.  We add a cpp check to <usb.h> to define USB_DMA_MINALIGN and
make use of it in ehci-hcd.c and musb_core.h.  We cannot use MAX() here
as we are not allowed to have tests inside of align(...).

Cc: Marek Vasut <marex@denx.de>
Signed-off-by: Tom Rini <trini@ti.com>
[ilya.yanok]: fix size alignment, drop (incorrect) rounding
when invalidating the buffer. If we got unaligned buffer from the
upper layer -- that's definetely a bug so it's good to buzz
about it. But we have to align the buffer length -- upper layers
should take care to reserve enough space.
Signed-off-by: Ilya Yanok <ilya.yanok@cogentembedded.com>
---
Changes from Tom's V4:
 - Internal buffers should be not only address but also size aligned
 - Don't try to fix unalignment of external buffer
 - Fix also debug() checks in ehci_td_buffer() (though size check is
   meaningless: in the current API only size of transfer is passed
   which is not always the same as size of underlying buffer and
   can be unaligned.

 No bounce-buffering is implemented so unaligned buffers coming from
the upper layers will still result in invalidate_dcache_range() vows.
But I tested it with unaligned fatload and got strange result: no
errors from invalidate_dcache_range, I got "EHCI timed out on TD"
errors instead (the same situtation without this patch and cache
disabled). Looks like unaligned buffers are problem for EHCI even
without cache involved...
Aligned fatload works like a charm.

 drivers/usb/host/ehci-hcd.c  |   89 +++++++++++++++++++++++++-----------------
 drivers/usb/musb/musb_core.h |    2 +-
 include/usb.h                |   10 +++++
 3 files changed, 65 insertions(+), 36 deletions(-)

Comments

Marek Vasut July 4, 2012, 8:24 p.m. UTC | #1
Dear Ilya Yanok,

> From: Tom Rini <trini@ti.com>
> 
> The USB spec says that 32 bytes is the minimum required alignment.
> However on some platforms we have a larger minimum requirement for cache
> coherency.  In those cases, use that value rather than the USB spec
> minimum.  We add a cpp check to <usb.h> to define USB_DMA_MINALIGN and
> make use of it in ehci-hcd.c and musb_core.h.  We cannot use MAX() here
> as we are not allowed to have tests inside of align(...).
> 
> Cc: Marek Vasut <marex@denx.de>
> Signed-off-by: Tom Rini <trini@ti.com>
> [ilya.yanok]: fix size alignment, drop (incorrect) rounding
> when invalidating the buffer. If we got unaligned buffer from the
> upper layer -- that's definetely a bug so it's good to buzz
> about it. But we have to align the buffer length -- upper layers
> should take care to reserve enough space.
> Signed-off-by: Ilya Yanok <ilya.yanok@cogentembedded.com>
> ---
> Changes from Tom's V4:
>  - Internal buffers should be not only address but also size aligned
>  - Don't try to fix unalignment of external buffer
>  - Fix also debug() checks in ehci_td_buffer() (though size check is
>    meaningless: in the current API only size of transfer is passed
>    which is not always the same as size of underlying buffer and
>    can be unaligned.
> 
>  No bounce-buffering is implemented so unaligned buffers coming from
> the upper layers will still result in invalidate_dcache_range() vows.
> But I tested it with unaligned fatload and got strange result: no
> errors from invalidate_dcache_range, I got "EHCI timed out on TD"
> errors instead (the same situtation without this patch and cache
> disabled). Looks like unaligned buffers are problem for EHCI even
> without cache involved...
> Aligned fatload works like a charm.
> 
>  drivers/usb/host/ehci-hcd.c  |   89
> +++++++++++++++++++++++++----------------- drivers/usb/musb/musb_core.h | 
>   2 +-
>  include/usb.h                |   10 +++++
>  3 files changed, 65 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
> index 04300be..74a5c76 100644
> --- a/drivers/usb/host/ehci-hcd.c
> +++ b/drivers/usb/host/ehci-hcd.c
> @@ -34,7 +34,9 @@ struct ehci_hccr *hccr;	/* R/O registers, not need for
> volatile */ volatile struct ehci_hcor *hcor;
> 
>  static uint16_t portreset;
> -static struct QH qh_list __attribute__((aligned(32)));
> +static char __qh_list[ALIGN(sizeof(struct QH), USB_DMA_MINALIGN)]
> +			__attribute__((aligned(USB_DMA_MINALIGN)));
> +static struct QH *qh_list = (struct QH *)__qh_list;

Maybe we should create DEFINE_ALIGNED_VARIABLE as a common.h macro?

>  static struct descriptor {
>  	struct usb_hub_descriptor hub;
> @@ -172,13 +174,13 @@ static int ehci_td_buffer(struct qTD *td, void *buf,
> size_t sz) {
>  	uint32_t delta, next;
>  	uint32_t addr = (uint32_t)buf;
> -	size_t rsz = roundup(sz, 32);
> +	size_t rsz = roundup(sz, USB_DMA_MINALIGN);
>  	int idx;
> 
>  	if (sz != rsz)
>  		debug("EHCI-HCD: Misaligned buffer size (%08x)\n", sz);
> 
> -	if (addr & 31)
> +	if (addr != ALIGN(addr, USB_DMA_MINALIGN))
>  		debug("EHCI-HCD: Misaligned buffer address (%p)\n", buf);

Good :)

>  	idx = 0;
> @@ -207,8 +209,12 @@ static int
>  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void
> *buffer, int length, struct devrequest *req)
>  {
> -	static struct QH qh __attribute__((aligned(32)));
> -	static struct qTD qtd[3] __attribute__((aligned (32)));
> +	static char *__qh[ALIGN(sizeof(struct QH), USB_DMA_MINALIGN)]
> +				__attribute__((aligned(USB_DMA_MINALIGN)));
> +	struct QH *qh = (struct QH *)__qh;
> +	static char *__qtd[ALIGN(3*sizeof(struct qTD), USB_DMA_MINALIGN)]
> +				__attribute__((aligned(USB_DMA_MINALIGN)));
> +	struct qTD *qtd = (struct qTD *)__qtd;
>  	int qtd_counter = 0;
> 
>  	volatile struct qTD *vtd;
> @@ -229,8 +235,8 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, le16_to_cpu(req->value), le16_to_cpu(req->value),
>  		      le16_to_cpu(req->index));
> 
> -	memset(&qh, 0, sizeof(struct QH));
> -	memset(qtd, 0, sizeof(qtd));
> +	memset(qh, 0, sizeof(struct QH));
> +	memset(qtd, 0, 3 * sizeof(*qtd));
> 
>  	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
> 
> @@ -244,7 +250,7 @@ ehci_submit_async(struct usb_device *dev, unsigned long
> pipe, void *buffer, *   qh_overlay.qt_next ...... 13-10 H
>  	 * - qh_overlay.qt_altnext
>  	 */
> -	qh.qh_link = cpu_to_hc32((uint32_t)&qh_list | QH_LINK_TYPE_QH);
> +	qh->qh_link = cpu_to_hc32((uint32_t)qh_list | QH_LINK_TYPE_QH);
>  	c = (usb_pipespeed(pipe) != USB_SPEED_HIGH &&
>  	     usb_pipeendpoint(pipe) == 0) ? 1 : 0;
>  	endpt = (8 << 28) |
> @@ -255,14 +261,14 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, (usb_pipespeed(pipe) << 12) |
>  	    (usb_pipeendpoint(pipe) << 8) |
>  	    (0 << 7) | (usb_pipedevice(pipe) << 0);
> -	qh.qh_endpt1 = cpu_to_hc32(endpt);
> +	qh->qh_endpt1 = cpu_to_hc32(endpt);
>  	endpt = (1 << 30) |
>  	    (dev->portnr << 23) |
>  	    (dev->parent->devnum << 16) | (0 << 8) | (0 << 0);
> -	qh.qh_endpt2 = cpu_to_hc32(endpt);
> -	qh.qh_overlay.qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
> +	qh->qh_endpt2 = cpu_to_hc32(endpt);
> +	qh->qh_overlay.qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
> 
> -	tdp = &qh.qh_overlay.qt_next;
> +	tdp = &qh->qh_overlay.qt_next;
> 
>  	if (req != NULL) {
>  		/*
> @@ -340,13 +346,15 @@ ehci_submit_async(struct usb_device *dev, unsigned
> long pipe, void *buffer, tdp = &qtd[qtd_counter++].qt_next;
>  	}
> 
> -	qh_list.qh_link = cpu_to_hc32((uint32_t)&qh | QH_LINK_TYPE_QH);
> +	qh_list->qh_link = cpu_to_hc32((uint32_t)qh | QH_LINK_TYPE_QH);
> 
>  	/* Flush dcache */
> -	flush_dcache_range((uint32_t)&qh_list,
> -		(uint32_t)&qh_list + sizeof(struct QH));
> -	flush_dcache_range((uint32_t)&qh, (uint32_t)&qh + sizeof(struct QH));
> -	flush_dcache_range((uint32_t)qtd, (uint32_t)qtd + sizeof(qtd));
> +	flush_dcache_range((uint32_t)qh_list,
> +		(uint32_t)qh_list + ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
> +	flush_dcache_range((uint32_t)qh, (uint32_t)qh +
> +		ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
> +	flush_dcache_range((uint32_t)qtd, (uint32_t)qtd +
> +		ALIGN(3*sizeof(*qtd), USB_DMA_MINALIGN));

Maybe we should make a macro for those things here to prevent such spaghetti of 
code ?

[...]

Minor things really, otherwise it's really good :)
Tom Rini July 5, 2012, 5:15 p.m. UTC | #2
On Wed, Jul 04, 2012 at 05:03:59PM +0400, Ilya Yanok wrote:
`
> From: Tom Rini <trini@ti.com>
> 
> The USB spec says that 32 bytes is the minimum required alignment.
> However on some platforms we have a larger minimum requirement for cache
> coherency.  In those cases, use that value rather than the USB spec
> minimum.  We add a cpp check to <usb.h> to define USB_DMA_MINALIGN and
> make use of it in ehci-hcd.c and musb_core.h.  We cannot use MAX() here
> as we are not allowed to have tests inside of align(...).
> 
> Cc: Marek Vasut <marex@denx.de>
> Signed-off-by: Tom Rini <trini@ti.com>
> [ilya.yanok]: fix size alignment, drop (incorrect) rounding
> when invalidating the buffer. If we got unaligned buffer from the
> upper layer -- that's definetely a bug so it's good to buzz
> about it. But we have to align the buffer length -- upper layers
> should take care to reserve enough space.
> Signed-off-by: Ilya Yanok <ilya.yanok@cogentembedded.com>

So trying this on ethernet still gives a handful of unaligned areas.  Do
you have a beagleboard xM available?
Ilya Yanok July 5, 2012, 6:25 p.m. UTC | #3
Hi Tom,

On Thu, Jul 5, 2012 at 9:15 PM, Tom Rini <trini@ti.com> wrote:

> On Wed, Jul 04, 2012 at 05:03:59PM +0400, Ilya Yanok wrote:
> `
> > From: Tom Rini <trini@ti.com>
> >
> > The USB spec says that 32 bytes is the minimum required alignment.
> > However on some platforms we have a larger minimum requirement for cache
> > coherency.  In those cases, use that value rather than the USB spec
> > minimum.  We add a cpp check to <usb.h> to define USB_DMA_MINALIGN and
> > make use of it in ehci-hcd.c and musb_core.h.  We cannot use MAX() here
> > as we are not allowed to have tests inside of align(...).
> >
> > Cc: Marek Vasut <marex@denx.de>
> > Signed-off-by: Tom Rini <trini@ti.com>
> > [ilya.yanok]: fix size alignment, drop (incorrect) rounding
> > when invalidating the buffer. If we got unaligned buffer from the
> > upper layer -- that's definetely a bug so it's good to buzz
> > about it. But we have to align the buffer length -- upper layers
> > should take care to reserve enough space.
> > Signed-off-by: Ilya Yanok <ilya.yanok@cogentembedded.com>
>
> So trying this on ethernet still gives a handful of unaligned areas.  Do
>

 Not surprised. USB Ethernet drivers are unfixed wrt cache alignment.

you have a beagleboard xM available?
>

Unfortunately no.

Regards, Ilya.
Ilya Yanok July 5, 2012, 6:44 p.m. UTC | #4
Hi Marek,

On Thu, Jul 5, 2012 at 12:24 AM, Marek Vasut <marex@denx.de> wrote:

>
> > -static struct QH qh_list __attribute__((aligned(32)));
> > +static char __qh_list[ALIGN(sizeof(struct QH), USB_DMA_MINALIGN)]
> > +                     __attribute__((aligned(USB_DMA_MINALIGN)));
> > +static struct QH *qh_list = (struct QH *)__qh_list;
>
>
> Maybe we should create DEFINE_ALIGNED_VARIABLE as a common.h macro?
>

Yep. I even thought about this but decided not to do... can't recall why.
Now I think it's really a good idea.


>
> >  static struct descriptor {
> >       struct usb_hub_descriptor hub;
> > @@ -172,13 +174,13 @@ static int ehci_td_buffer(struct qTD *td, void
> *buf,
> > size_t sz) {
> >       uint32_t delta, next;
> >       uint32_t addr = (uint32_t)buf;
> > -     size_t rsz = roundup(sz, 32);
> > +     size_t rsz = roundup(sz, USB_DMA_MINALIGN);
> >       int idx;
> >
> >       if (sz != rsz)
> >               debug("EHCI-HCD: Misaligned buffer size (%08x)\n", sz);
> >
> > -     if (addr & 31)
> > +     if (addr != ALIGN(addr, USB_DMA_MINALIGN))
> >               debug("EHCI-HCD: Misaligned buffer address (%p)\n", buf);
>
> Good :)
>

Well, thinking more about this, it's actually a wrong place to check
this... It can be setup packet buffer, which can be unaligned (this is ok
cause it's only flushed and never invalidated). And size can always be
unaligned...



> >       /* Flush dcache */
> > -     flush_dcache_range((uint32_t)&qh_list,
> > -             (uint32_t)&qh_list + sizeof(struct QH));
> > -     flush_dcache_range((uint32_t)&qh, (uint32_t)&qh + sizeof(struct
> QH));
> > -     flush_dcache_range((uint32_t)qtd, (uint32_t)qtd + sizeof(qtd));
> > +     flush_dcache_range((uint32_t)qh_list,
> > +             (uint32_t)qh_list + ALIGN(sizeof(struct QH),
> USB_DMA_MINALIGN));
> > +     flush_dcache_range((uint32_t)qh, (uint32_t)qh +
> > +             ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
> > +     flush_dcache_range((uint32_t)qtd, (uint32_t)qtd +
> > +             ALIGN(3*sizeof(*qtd), USB_DMA_MINALIGN));
>
> Maybe we should make a macro for those things here to prevent such
> spaghetti of
> code ?
>

Hm.. Maybe. Ideas? ;) Actually I also thought about moving all this stuff
to a single proper aligned buffer and do flush/invalidate for a whole
buffer at once. It can save us some space... but it's BSS anyway... Don't
know if it's worth it...

Regards, Ilya.
Marek Vasut July 5, 2012, 6:47 p.m. UTC | #5
Dear Ilya Yanok,

> Hi Tom,
> 
> On Thu, Jul 5, 2012 at 9:15 PM, Tom Rini <trini@ti.com> wrote:
> > On Wed, Jul 04, 2012 at 05:03:59PM +0400, Ilya Yanok wrote:
> > `
> > 
> > > From: Tom Rini <trini@ti.com>
> > > 
> > > The USB spec says that 32 bytes is the minimum required alignment.
> > > However on some platforms we have a larger minimum requirement for
> > > cache coherency.  In those cases, use that value rather than the USB
> > > spec minimum.  We add a cpp check to <usb.h> to define
> > > USB_DMA_MINALIGN and make use of it in ehci-hcd.c and musb_core.h.  We
> > > cannot use MAX() here as we are not allowed to have tests inside of
> > > align(...).
> > > 
> > > Cc: Marek Vasut <marex@denx.de>
> > > Signed-off-by: Tom Rini <trini@ti.com>
> > > [ilya.yanok]: fix size alignment, drop (incorrect) rounding
> > > when invalidating the buffer. If we got unaligned buffer from the
> > > upper layer -- that's definetely a bug so it's good to buzz
> > > about it. But we have to align the buffer length -- upper layers
> > > should take care to reserve enough space.
> > > Signed-off-by: Ilya Yanok <ilya.yanok@cogentembedded.com>
> > 
> > So trying this on ethernet still gives a handful of unaligned areas.  Do
> 
>  Not surprised. USB Ethernet drivers are unfixed wrt cache alignment.

Well I fixed the ASIX, but now that I think about it, the fix might not be 
enough.

> you have a beagleboard xM available?


> Unfortunately no.

Any ARMv7 chip shall be OK.

> 
> Regards, Ilya.

Best regards,
Marek Vasut
Marek Vasut July 5, 2012, 7:58 p.m. UTC | #6
Dear Ilya Yanok,

> Hi Marek,
> 
> On Thu, Jul 5, 2012 at 12:24 AM, Marek Vasut <marex@denx.de> wrote:
> > > -static struct QH qh_list __attribute__((aligned(32)));
> > > +static char __qh_list[ALIGN(sizeof(struct QH), USB_DMA_MINALIGN)]
> > > +                     __attribute__((aligned(USB_DMA_MINALIGN)));
> > > +static struct QH *qh_list = (struct QH *)__qh_list;
> > 
> > Maybe we should create DEFINE_ALIGNED_VARIABLE as a common.h macro?
> 
> Yep. I even thought about this but decided not to do... can't recall why.
> Now I think it's really a good idea.

Like what's in common.h -- ALLOC_CACHE_ALIGN_BUFFER ?

[..]

> Hm.. Maybe. Ideas? ;) Actually I also thought about moving all this stuff
> to a single proper aligned buffer and do flush/invalidate for a whole
> buffer at once. It can save us some space... but it's BSS anyway... Don't
> know if it's worth it...

But if you copy stuff back and forth, it'll cause performance hit.

> Regards, Ilya.

Best regards,
Marek Vasut
Ilya Yanok July 5, 2012, 8:27 p.m. UTC | #7
Hi Marek,

On Thu, Jul 5, 2012 at 11:58 PM, Marek Vasut <marex@denx.de> wrote:

>
> >
> > Yep. I even thought about this but decided not to do... can't recall why.
> > Now I think it's really a good idea.
>
> > > Maybe we should create DEFINE_ALIGNED_VARIABLE as a common.h macro?
> Like what's in common.h -- ALLOC_CACHE_ALIGN_BUFFER ?
>

Yes, but for static variables and not hard-bound to ARCH_DMA_MINALIGN.


> > Hm.. Maybe. Ideas? ;) Actually I also thought about moving all this stuff
> > to a single proper aligned buffer and do flush/invalidate for a whole
> > buffer at once. It can save us some space... but it's BSS anyway... Don't
> > know if it's worth it...
>
> But if you copy stuff back and forth, it'll cause performance hit.
>

No, you talk about full bounce-buffering support and meant only one big
buffer for internal structs.

Regards, Ilya.
Marek Vasut July 5, 2012, 8:55 p.m. UTC | #8
Dear Ilya Yanok,

> Hi Marek,
> 
> On Thu, Jul 5, 2012 at 11:58 PM, Marek Vasut <marex@denx.de> wrote:
> > > Yep. I even thought about this but decided not to do... can't recall
> > > why. Now I think it's really a good idea.
> > > 
> > > > Maybe we should create DEFINE_ALIGNED_VARIABLE as a common.h macro?
> > 
> > Like what's in common.h -- ALLOC_CACHE_ALIGN_BUFFER ?
> 
> Yes, but for static variables and not hard-bound to ARCH_DMA_MINALIGN.

Good idea :)

> > > Hm.. Maybe. Ideas? ;) Actually I also thought about moving all this
> > > stuff to a single proper aligned buffer and do flush/invalidate for a
> > > whole buffer at once. It can save us some space... but it's BSS
> > > anyway... Don't know if it's worth it...
> > 
> > But if you copy stuff back and forth, it'll cause performance hit.
> 
> No, you talk about full bounce-buffering support and meant only one big
> buffer for internal structs.

Sure, but the internal structs can be aligned on their own, so there's no need 
for a buffer.

> Regards, Ilya.

Best regards,
Marek Vasut
diff mbox

Patch

diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 04300be..74a5c76 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -34,7 +34,9 @@  struct ehci_hccr *hccr;	/* R/O registers, not need for volatile */
 volatile struct ehci_hcor *hcor;
 
 static uint16_t portreset;
-static struct QH qh_list __attribute__((aligned(32)));
+static char __qh_list[ALIGN(sizeof(struct QH), USB_DMA_MINALIGN)]
+			__attribute__((aligned(USB_DMA_MINALIGN)));
+static struct QH *qh_list = (struct QH *)__qh_list;
 
 static struct descriptor {
 	struct usb_hub_descriptor hub;
@@ -172,13 +174,13 @@  static int ehci_td_buffer(struct qTD *td, void *buf, size_t sz)
 {
 	uint32_t delta, next;
 	uint32_t addr = (uint32_t)buf;
-	size_t rsz = roundup(sz, 32);
+	size_t rsz = roundup(sz, USB_DMA_MINALIGN);
 	int idx;
 
 	if (sz != rsz)
 		debug("EHCI-HCD: Misaligned buffer size (%08x)\n", sz);
 
-	if (addr & 31)
+	if (addr != ALIGN(addr, USB_DMA_MINALIGN))
 		debug("EHCI-HCD: Misaligned buffer address (%p)\n", buf);
 
 	idx = 0;
@@ -207,8 +209,12 @@  static int
 ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		   int length, struct devrequest *req)
 {
-	static struct QH qh __attribute__((aligned(32)));
-	static struct qTD qtd[3] __attribute__((aligned (32)));
+	static char *__qh[ALIGN(sizeof(struct QH), USB_DMA_MINALIGN)]
+				__attribute__((aligned(USB_DMA_MINALIGN)));
+	struct QH *qh = (struct QH *)__qh;
+	static char *__qtd[ALIGN(3*sizeof(struct qTD), USB_DMA_MINALIGN)]
+				__attribute__((aligned(USB_DMA_MINALIGN)));
+	struct qTD *qtd = (struct qTD *)__qtd;
 	int qtd_counter = 0;
 
 	volatile struct qTD *vtd;
@@ -229,8 +235,8 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		      le16_to_cpu(req->value), le16_to_cpu(req->value),
 		      le16_to_cpu(req->index));
 
-	memset(&qh, 0, sizeof(struct QH));
-	memset(qtd, 0, sizeof(qtd));
+	memset(qh, 0, sizeof(struct QH));
+	memset(qtd, 0, 3 * sizeof(*qtd));
 
 	toggle = usb_gettoggle(dev, usb_pipeendpoint(pipe), usb_pipeout(pipe));
 
@@ -244,7 +250,7 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	 *   qh_overlay.qt_next ...... 13-10 H
 	 * - qh_overlay.qt_altnext
 	 */
-	qh.qh_link = cpu_to_hc32((uint32_t)&qh_list | QH_LINK_TYPE_QH);
+	qh->qh_link = cpu_to_hc32((uint32_t)qh_list | QH_LINK_TYPE_QH);
 	c = (usb_pipespeed(pipe) != USB_SPEED_HIGH &&
 	     usb_pipeendpoint(pipe) == 0) ? 1 : 0;
 	endpt = (8 << 28) |
@@ -255,14 +261,14 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	    (usb_pipespeed(pipe) << 12) |
 	    (usb_pipeendpoint(pipe) << 8) |
 	    (0 << 7) | (usb_pipedevice(pipe) << 0);
-	qh.qh_endpt1 = cpu_to_hc32(endpt);
+	qh->qh_endpt1 = cpu_to_hc32(endpt);
 	endpt = (1 << 30) |
 	    (dev->portnr << 23) |
 	    (dev->parent->devnum << 16) | (0 << 8) | (0 << 0);
-	qh.qh_endpt2 = cpu_to_hc32(endpt);
-	qh.qh_overlay.qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
+	qh->qh_endpt2 = cpu_to_hc32(endpt);
+	qh->qh_overlay.qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
 
-	tdp = &qh.qh_overlay.qt_next;
+	tdp = &qh->qh_overlay.qt_next;
 
 	if (req != NULL) {
 		/*
@@ -340,13 +346,15 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		tdp = &qtd[qtd_counter++].qt_next;
 	}
 
-	qh_list.qh_link = cpu_to_hc32((uint32_t)&qh | QH_LINK_TYPE_QH);
+	qh_list->qh_link = cpu_to_hc32((uint32_t)qh | QH_LINK_TYPE_QH);
 
 	/* Flush dcache */
-	flush_dcache_range((uint32_t)&qh_list,
-		(uint32_t)&qh_list + sizeof(struct QH));
-	flush_dcache_range((uint32_t)&qh, (uint32_t)&qh + sizeof(struct QH));
-	flush_dcache_range((uint32_t)qtd, (uint32_t)qtd + sizeof(qtd));
+	flush_dcache_range((uint32_t)qh_list,
+		(uint32_t)qh_list + ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
+	flush_dcache_range((uint32_t)qh, (uint32_t)qh +
+		ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
+	flush_dcache_range((uint32_t)qtd, (uint32_t)qtd +
+		ALIGN(3*sizeof(*qtd), USB_DMA_MINALIGN));
 
 	usbsts = ehci_readl(&hcor->or_usbsts);
 	ehci_writel(&hcor->or_usbsts, (usbsts & 0x3f));
@@ -369,12 +377,15 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	timeout = USB_TIMEOUT_MS(pipe);
 	do {
 		/* Invalidate dcache */
-		invalidate_dcache_range((uint32_t)&qh_list,
-			(uint32_t)&qh_list + sizeof(struct QH));
-		invalidate_dcache_range((uint32_t)&qh,
-			(uint32_t)&qh + sizeof(struct QH));
+		invalidate_dcache_range((uint32_t)qh_list,
+			(uint32_t)qh_list +
+			ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
+		invalidate_dcache_range((uint32_t)qh,
+			(uint32_t)qh +
+			ALIGN(sizeof(struct QH), USB_DMA_MINALIGN));
 		invalidate_dcache_range((uint32_t)qtd,
-			(uint32_t)qtd + sizeof(qtd));
+			(uint32_t)qtd +
+			ALIGN(3*sizeof(*qtd), USB_DMA_MINALIGN));
 
 		token = hc32_to_cpu(vtd->qt_token);
 		if (!(token & 0x80))
@@ -382,9 +393,17 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		WATCHDOG_RESET();
 	} while (get_timer(ts) < timeout);
 
-	/* Invalidate the memory area occupied by buffer */
-	invalidate_dcache_range(((uint32_t)buffer & ~31),
-		((uint32_t)buffer & ~31) + roundup(length, 32));
+	/*
+	 * Invalidate the memory area occupied by buffer
+	 * Don't try to fix the buffer alignment, if it isn't properly
+	 * aligned it's upper layer's fault so let invalidate_dcache_range()
+	 * vow about it. But we have to fix the length as it's actual
+	 * transfer length and can be unaligned. This is potentially
+	 * dangerous operation, it's responsibility of the calling
+	 * code to make sure enough space is reserved.
+	 */
+	invalidate_dcache_range((uint32_t)buffer,
+		(uint32_t)buffer + ALIGN(length, ARCH_DMA_MINALIGN));
 
 	/* Check that the TD processing happened */
 	if (token & 0x80) {
@@ -403,9 +422,9 @@  ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 		goto fail;
 	}
 
-	qh_list.qh_link = cpu_to_hc32((uint32_t)&qh_list | QH_LINK_TYPE_QH);
+	qh_list->qh_link = cpu_to_hc32((uint32_t)qh_list | QH_LINK_TYPE_QH);
 
-	token = hc32_to_cpu(qh.qh_overlay.qt_token);
+	token = hc32_to_cpu(qh->qh_overlay.qt_token);
 	if (!(token & 0x80)) {
 		debug("TOKEN=%#x\n", token);
 		switch (token & 0xfc) {
@@ -733,16 +752,16 @@  int usb_lowlevel_init(void)
 #endif
 
 	/* Set head of reclaim list */
-	memset(&qh_list, 0, sizeof(qh_list));
-	qh_list.qh_link = cpu_to_hc32((uint32_t)&qh_list | QH_LINK_TYPE_QH);
-	qh_list.qh_endpt1 = cpu_to_hc32((1 << 15) | (USB_SPEED_HIGH << 12));
-	qh_list.qh_curtd = cpu_to_hc32(QT_NEXT_TERMINATE);
-	qh_list.qh_overlay.qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
-	qh_list.qh_overlay.qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
-	qh_list.qh_overlay.qt_token = cpu_to_hc32(0x40);
+	memset(qh_list, 0, sizeof(*qh_list));
+	qh_list->qh_link = cpu_to_hc32((uint32_t)qh_list | QH_LINK_TYPE_QH);
+	qh_list->qh_endpt1 = cpu_to_hc32((1 << 15) | (USB_SPEED_HIGH << 12));
+	qh_list->qh_curtd = cpu_to_hc32(QT_NEXT_TERMINATE);
+	qh_list->qh_overlay.qt_next = cpu_to_hc32(QT_NEXT_TERMINATE);
+	qh_list->qh_overlay.qt_altnext = cpu_to_hc32(QT_NEXT_TERMINATE);
+	qh_list->qh_overlay.qt_token = cpu_to_hc32(0x40);
 
 	/* Set async. queue head pointer. */
-	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)&qh_list);
+	ehci_writel(&hcor->or_asynclistaddr, (uint32_t)qh_list);
 
 	reg = ehci_readl(&hccr->cr_hcsparams);
 	descriptor.hub.bNbrPorts = HCS_N_PORTS(reg);
diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h
index a8adcce..e914369 100644
--- a/drivers/usb/musb/musb_core.h
+++ b/drivers/usb/musb/musb_core.h
@@ -145,7 +145,7 @@  struct musb_regs {
 		struct musb_epN_regs epN;
 	} ep[16];
 
-} __attribute__((packed, aligned(32)));
+} __attribute__((packed, aligned(USB_DMA_MINALIGN)));
 #endif
 
 /*
diff --git a/include/usb.h b/include/usb.h
index 6da91e7..ba3d169 100644
--- a/include/usb.h
+++ b/include/usb.h
@@ -29,6 +29,16 @@ 
 #include <usb_defs.h>
 #include <usbdescriptors.h>
 
+/*
+ * The EHCI spec says that we must align to at least 32 bytes.  However,
+ * some platforms require larger alignment.
+ */
+#if ARCH_DMA_MINALIGN > 32
+#define USB_DMA_MINALIGN	ARCH_DMA_MINALIGN
+#else
+#define USB_DMA_MINALIGN	32
+#endif
+
 /* Everything is aribtrary */
 #define USB_ALTSETTINGALLOC		4
 #define USB_MAXALTSETTING		128	/* Hard limit */