diff mbox

[U-Boot,4/6] usb:udc:samsung: Zero copy approach for data passed to Samsung's UDC driver

Message ID 1391170589-14340-5-git-send-email-l.majewski@samsung.com
State Superseded
Headers show

Commit Message

Łukasz Majewski Jan. 31, 2014, 12:16 p.m. UTC
The Samsung's UDC driver is not anymore copying data from USB requests to
data aligned internal buffers. Now it works directly in data allocated in
the upper layers like UMS, DFU, THOR.

This change is possible since those gadgets now take care to allocate
buffers aligned to cache line (CONFIG_SYS_CACHELINE_SIZE ).

Previously the UDC needed to copy this data to internal aligned buffer to
prevent from unaligned access exceptions.

Test condition
- test HW + measurement: Trats - Exynos4210 rev.1
- test HW Trats2 - Exynos4412 rev.1
400 MiB compressed rootfs image download with `thor 0 mmc 0`

Measurement:
Transmission speed: 27.04 MiB/s

Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
Cc: Marek Vasut <marex@denx.de>
---
 drivers/usb/gadget/s3c_udc_otg.c          |   19 +++++-----
 drivers/usb/gadget/s3c_udc_otg_xfer_dma.c |   54 ++++++++++++-----------------
 include/usb/s3c_udc.h                     |    5 +--
 3 files changed, 32 insertions(+), 46 deletions(-)

Comments

Marek Vasut Feb. 1, 2014, 2:55 a.m. UTC | #1
On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski wrote:
> The Samsung's UDC driver is not anymore copying data from USB requests to
> data aligned internal buffers. Now it works directly in data allocated in
> the upper layers like UMS, DFU, THOR.
> 
> This change is possible since those gadgets now take care to allocate
> buffers aligned to cache line (CONFIG_SYS_CACHELINE_SIZE ).
> 
> Previously the UDC needed to copy this data to internal aligned buffer to
> prevent from unaligned access exceptions.
> 
> Test condition
> - test HW + measurement: Trats - Exynos4210 rev.1
> - test HW Trats2 - Exynos4412 rev.1
> 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> 
> Measurement:
> Transmission speed: 27.04 MiB/s
> 
> Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> Cc: Marek Vasut <marex@denx.de>

You should use ROUND_UP(), not ROUND() throughout the patch. Otherwise you might 
fail to flush/invalidate the last little bit of data in some cacheline.

Best regards,
Marek Vasut
Lukasz Majewski Feb. 1, 2014, 11:05 a.m. UTC | #2
On Sat, 1 Feb 2014 03:55:20 +0100
Marek Vasut <marex@denx.de> wrote:

> On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski wrote:
> > The Samsung's UDC driver is not anymore copying data from USB
> > requests to data aligned internal buffers. Now it works directly in
> > data allocated in the upper layers like UMS, DFU, THOR.
> > 
> > This change is possible since those gadgets now take care to
> > allocate buffers aligned to cache line (CONFIG_SYS_CACHELINE_SIZE ).
> > 
> > Previously the UDC needed to copy this data to internal aligned
> > buffer to prevent from unaligned access exceptions.
> > 
> > Test condition
> > - test HW + measurement: Trats - Exynos4210 rev.1
> > - test HW Trats2 - Exynos4412 rev.1
> > 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> > 
> > Measurement:
> > Transmission speed: 27.04 MiB/s
> > 
> > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > Cc: Marek Vasut <marex@denx.de>
> 
> You should use ROUND_UP(), not ROUND() throughout the patch.
> Otherwise you might fail to flush/invalidate the last little bit of
> data in some cacheline.

I might overlooked something, so please correct me if needed.

I allocate buffers in gadgets which are aligned to cache line with
starting address and its size is a multiplication of cache line size
(so I will not trash data allocated next to it when I invalidate cache).

In the code I'm using ROUND to invalidate/flush more data than needed
(ROUND(176, 32) = 192). I'm prepared for this since buffer in gadget is
properly allocated (with DEFINE_CACHE_ALIGN_BUFFER() which uses
roundup() internally).

Best regards,
Lukasz Majewski

> 
> Best regards,
> Marek Vasut
> _______________________________________________
> U-Boot mailing list
> U-Boot@lists.denx.de
> http://lists.denx.de/mailman/listinfo/u-boot
Marek Vasut Feb. 1, 2014, 10:55 p.m. UTC | #3
On Saturday, February 01, 2014 at 12:05:29 PM, Lukasz Majewski wrote:
> On Sat, 1 Feb 2014 03:55:20 +0100
> 
> Marek Vasut <marex@denx.de> wrote:
> > On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski wrote:
> > > The Samsung's UDC driver is not anymore copying data from USB
> > > requests to data aligned internal buffers. Now it works directly in
> > > data allocated in the upper layers like UMS, DFU, THOR.
> > > 
> > > This change is possible since those gadgets now take care to
> > > allocate buffers aligned to cache line (CONFIG_SYS_CACHELINE_SIZE ).
> > > 
> > > Previously the UDC needed to copy this data to internal aligned
> > > buffer to prevent from unaligned access exceptions.
> > > 
> > > Test condition
> > > - test HW + measurement: Trats - Exynos4210 rev.1
> > > - test HW Trats2 - Exynos4412 rev.1
> > > 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> > > 
> > > Measurement:
> > > Transmission speed: 27.04 MiB/s
> > > 
> > > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > > Cc: Marek Vasut <marex@denx.de>
> > 
> > You should use ROUND_UP(), not ROUND() throughout the patch.
> > Otherwise you might fail to flush/invalidate the last little bit of
> > data in some cacheline.
> 
> I might overlooked something, so please correct me if needed.
> 
> I allocate buffers in gadgets which are aligned to cache line with
> starting address and its size is a multiplication of cache line size
> (so I will not trash data allocated next to it when I invalidate cache).
> 
> In the code I'm using ROUND to invalidate/flush more data than needed
> (ROUND(176, 32) = 192). I'm prepared for this since buffer in gadget is
> properly allocated (with DEFINE_CACHE_ALIGN_BUFFER() which uses
> roundup() internally).

The problem is in case you receive buffer which is aligned to cacheline with 
it's start, but is [(k * cacheline_size) + (cacheline_size / 2) - 1] big. I 
think it's unlikely, but if this happens, you will get corruption, right ? You 
might actually want to check for this condition and throw a warning in such a 
case.

I understand your argument with trying to not trash data, but then you will get 
a corruption during transfer, right ?

Best regards,
Marek Vasut
Łukasz Majewski Feb. 3, 2014, 11:06 a.m. UTC | #4
Hi Marek,

> On Saturday, February 01, 2014 at 12:05:29 PM, Lukasz Majewski wrote:
> > On Sat, 1 Feb 2014 03:55:20 +0100
> > 
> > Marek Vasut <marex@denx.de> wrote:
> > > On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski wrote:
> > > > The Samsung's UDC driver is not anymore copying data from USB
> > > > requests to data aligned internal buffers. Now it works
> > > > directly in data allocated in the upper layers like UMS, DFU,
> > > > THOR.
> > > > 
> > > > This change is possible since those gadgets now take care to
> > > > allocate buffers aligned to cache line
> > > > (CONFIG_SYS_CACHELINE_SIZE ).
> > > > 
> > > > Previously the UDC needed to copy this data to internal aligned
> > > > buffer to prevent from unaligned access exceptions.
> > > > 
> > > > Test condition
> > > > - test HW + measurement: Trats - Exynos4210 rev.1
> > > > - test HW Trats2 - Exynos4412 rev.1
> > > > 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> > > > 
> > > > Measurement:
> > > > Transmission speed: 27.04 MiB/s
> > > > 
> > > > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > > > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > > > Cc: Marek Vasut <marex@denx.de>
> > > 
> > > You should use ROUND_UP(), not ROUND() throughout the patch.
> > > Otherwise you might fail to flush/invalidate the last little bit
> > > of data in some cacheline.
> > 
> > I might overlooked something, so please correct me if needed.
> > 
> > I allocate buffers in gadgets which are aligned to cache line with
> > starting address and its size is a multiplication of cache line size
> > (so I will not trash data allocated next to it when I invalidate
> > cache).
> > 
> > In the code I'm using ROUND to invalidate/flush more data than
> > needed (ROUND(176, 32) = 192). I'm prepared for this since buffer
> > in gadget is properly allocated (with DEFINE_CACHE_ALIGN_BUFFER()
> > which uses roundup() internally).
> 
> The problem is in case you receive buffer which is aligned to
> cacheline with it's start, but is [(k * cacheline_size) +
> (cacheline_size / 2) - 1] big. I think it's unlikely, but if this
> happens, you will get corruption, right ? 

Let's suppose, that I will receive 2063B = [(64 * 32) + 16 -1] from UDC.
If the passed buffer was exactly 2063 B in size, then we would have
here a data corruption.

However this situation will not happen since the buffer at gadget is
allocated with DEFINE_CACHE_ALIGN_BUFFER() or is an aligned
multiplication of cache line size (like 1MiB).

I think, that it is the responsibility of gadget developer to allocate
buffers with proper alignment and size.

> You might actually want to
> check for this condition and throw a warning in such a case.

The check is already implemented at ./arch/arm/cpu/armv7/cache_v7.c.

It complains with "ERROR" message when start or end address is not
aligned (that is how I've discovered the unaligned buffers at UMS).

> 
> I understand your argument with trying to not trash data, but then
> you will get a corruption during transfer, right ?

After applying those patches, the cache management would be performed
when the USB request is completed (in the UDC).

The only requirement for UDC is the correctly allocated buffer at
gadget.

> 
> Best regards,
> Marek Vasut
Marek Vasut Feb. 3, 2014, 6:11 p.m. UTC | #5
On Monday, February 03, 2014 at 12:06:59 PM, Lukasz Majewski wrote:
> Hi Marek,
> 
> > On Saturday, February 01, 2014 at 12:05:29 PM, Lukasz Majewski wrote:
> > > On Sat, 1 Feb 2014 03:55:20 +0100
> > > 
> > > Marek Vasut <marex@denx.de> wrote:
> > > > On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski wrote:
> > > > > The Samsung's UDC driver is not anymore copying data from USB
> > > > > requests to data aligned internal buffers. Now it works
> > > > > directly in data allocated in the upper layers like UMS, DFU,
> > > > > THOR.
> > > > > 
> > > > > This change is possible since those gadgets now take care to
> > > > > allocate buffers aligned to cache line
> > > > > (CONFIG_SYS_CACHELINE_SIZE ).
> > > > > 
> > > > > Previously the UDC needed to copy this data to internal aligned
> > > > > buffer to prevent from unaligned access exceptions.
> > > > > 
> > > > > Test condition
> > > > > - test HW + measurement: Trats - Exynos4210 rev.1
> > > > > - test HW Trats2 - Exynos4412 rev.1
> > > > > 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> > > > > 
> > > > > Measurement:
> > > > > Transmission speed: 27.04 MiB/s
> > > > > 
> > > > > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > > > > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > > > > Cc: Marek Vasut <marex@denx.de>
> > > > 
> > > > You should use ROUND_UP(), not ROUND() throughout the patch.
> > > > Otherwise you might fail to flush/invalidate the last little bit
> > > > of data in some cacheline.
> > > 
> > > I might overlooked something, so please correct me if needed.
> > > 
> > > I allocate buffers in gadgets which are aligned to cache line with
> > > starting address and its size is a multiplication of cache line size
> > > (so I will not trash data allocated next to it when I invalidate
> > > cache).
> > > 
> > > In the code I'm using ROUND to invalidate/flush more data than
> > > needed (ROUND(176, 32) = 192). I'm prepared for this since buffer
> > > in gadget is properly allocated (with DEFINE_CACHE_ALIGN_BUFFER()
> > > which uses roundup() internally).
> > 
> > The problem is in case you receive buffer which is aligned to
> > cacheline with it's start, but is [(k * cacheline_size) +
> > (cacheline_size / 2) - 1] big. I think it's unlikely, but if this
> > happens, you will get corruption, right ?
> 
> Let's suppose, that I will receive 2063B = [(64 * 32) + 16 -1] from UDC.
> If the passed buffer was exactly 2063 B in size, then we would have
> here a data corruption.
> 
> However this situation will not happen

_Should_ not happen ... I am absolutelly positive someone will be bitten by such 
assumption. I think this assumption about buffer alignment should really be 
documented somewhere.

> since the buffer at gadget is
> allocated with DEFINE_CACHE_ALIGN_BUFFER() or is an aligned
> multiplication of cache line size (like 1MiB).
> 
> I think, that it is the responsibility of gadget developer to allocate
> buffers with proper alignment and size.

Document that please, I doubt this is documented anywhere, but it's clearly part 
of the API. Also, some checks might be put in place for the alignment , they 
might be in #ifdef DEBUG for all I care, but it would be nice to have such a 
check, since I'm worried someone will really be bitten.

> > You might actually want to
> > check for this condition and throw a warning in such a case.
> 
> The check is already implemented at ./arch/arm/cpu/armv7/cache_v7.c.

Yeah, for arm926ejs core as well. Maybe that check shall be shifted into the 
cache management routine prototypes somehow ... not all CPUs implement that 
check :-(

> It complains with "ERROR" message when start or end address is not
> aligned (that is how I've discovered the unaligned buffers at UMS).

Yes.

> > I understand your argument with trying to not trash data, but then
> > you will get a corruption during transfer, right ?
> 
> After applying those patches, the cache management would be performed
> when the USB request is completed (in the UDC).
> 
> The only requirement for UDC is the correctly allocated buffer at
> gadget.

Got it.
Łukasz Majewski Feb. 4, 2014, 7:29 a.m. UTC | #6
Hi Marek,

> On Monday, February 03, 2014 at 12:06:59 PM, Lukasz Majewski wrote:
> > Hi Marek,
> > 
> > > On Saturday, February 01, 2014 at 12:05:29 PM, Lukasz Majewski
> > > wrote:
> > > > On Sat, 1 Feb 2014 03:55:20 +0100
> > > > 
> > > > Marek Vasut <marex@denx.de> wrote:
> > > > > On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski
> > > > > wrote:
> > > > > > The Samsung's UDC driver is not anymore copying data from
> > > > > > USB requests to data aligned internal buffers. Now it works
> > > > > > directly in data allocated in the upper layers like UMS,
> > > > > > DFU, THOR.
> > > > > > 
> > > > > > This change is possible since those gadgets now take care to
> > > > > > allocate buffers aligned to cache line
> > > > > > (CONFIG_SYS_CACHELINE_SIZE ).
> > > > > > 
> > > > > > Previously the UDC needed to copy this data to internal
> > > > > > aligned buffer to prevent from unaligned access exceptions.
> > > > > > 
> > > > > > Test condition
> > > > > > - test HW + measurement: Trats - Exynos4210 rev.1
> > > > > > - test HW Trats2 - Exynos4412 rev.1
> > > > > > 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> > > > > > 
> > > > > > Measurement:
> > > > > > Transmission speed: 27.04 MiB/s
> > > > > > 
> > > > > > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > > > > > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > > > > > Cc: Marek Vasut <marex@denx.de>
> > > > > 
> > > > > You should use ROUND_UP(), not ROUND() throughout the patch.
> > > > > Otherwise you might fail to flush/invalidate the last little
> > > > > bit of data in some cacheline.
> > > > 
> > > > I might overlooked something, so please correct me if needed.
> > > > 
> > > > I allocate buffers in gadgets which are aligned to cache line
> > > > with starting address and its size is a multiplication of cache
> > > > line size (so I will not trash data allocated next to it when I
> > > > invalidate cache).
> > > > 
> > > > In the code I'm using ROUND to invalidate/flush more data than
> > > > needed (ROUND(176, 32) = 192). I'm prepared for this since
> > > > buffer in gadget is properly allocated (with
> > > > DEFINE_CACHE_ALIGN_BUFFER() which uses roundup() internally).
> > > 
> > > The problem is in case you receive buffer which is aligned to
> > > cacheline with it's start, but is [(k * cacheline_size) +
> > > (cacheline_size / 2) - 1] big. I think it's unlikely, but if this
> > > happens, you will get corruption, right ?
> > 
> > Let's suppose, that I will receive 2063B = [(64 * 32) + 16 -1] from
> > UDC. If the passed buffer was exactly 2063 B in size, then we would
> > have here a data corruption.
> > 
> > However this situation will not happen
> 
> _Should_ not happen ... I am absolutelly positive someone will be
> bitten by such assumption. I think this assumption about buffer
> alignment should really be documented somewhere.

I will add verbose commit message for this.

> 
> > since the buffer at gadget is
> > allocated with DEFINE_CACHE_ALIGN_BUFFER() or is an aligned
> > multiplication of cache line size (like 1MiB).
> > 
> > I think, that it is the responsibility of gadget developer to
> > allocate buffers with proper alignment and size.
> 
> Document that please, I doubt this is documented anywhere, but it's
> clearly part of the API. Also, some checks might be put in place for
> the alignment , they might be in #ifdef DEBUG for all I care, but it
> would be nice to have such a check, since I'm worried someone will
> really be bitten.

I rely on the code embedded at cache_v7.c. It works and saved me a lot
of trouble (since it always print "ERROR" when buffer alignment and
size is wrong). 

Also I think, that those checks shall be done at invalidate_* and
flush_* cache routines, not at USB driver.

> 
> > > You might actually want to
> > > check for this condition and throw a warning in such a case.
> > 
> > The check is already implemented at ./arch/arm/cpu/armv7/cache_v7.c.
> 
> Yeah, for arm926ejs core as well. Maybe that check shall be shifted
> into the cache management routine prototypes somehow ... not all CPUs
> implement that check :-(

Maybe other ARM architectures shall have the cache management code
updated to work in a similar way to cache_v7.c ?

> 
> > It complains with "ERROR" message when start or end address is not
> > aligned (that is how I've discovered the unaligned buffers at UMS).
> 
> Yes.
> 
> > > I understand your argument with trying to not trash data, but then
> > > you will get a corruption during transfer, right ?
> > 
> > After applying those patches, the cache management would be
> > performed when the USB request is completed (in the UDC).
> > 
> > The only requirement for UDC is the correctly allocated buffer at
> > gadget.
> 
> Got it.

I will emphasize the need of correct buffer allocation at commit
message and add some comment to UDC in the v2.
Marek Vasut Feb. 4, 2014, 8:21 p.m. UTC | #7
On Tuesday, February 04, 2014 at 08:29:49 AM, Lukasz Majewski wrote:
> Hi Marek,
> 
> > On Monday, February 03, 2014 at 12:06:59 PM, Lukasz Majewski wrote:
> > > Hi Marek,
> > > 
> > > > On Saturday, February 01, 2014 at 12:05:29 PM, Lukasz Majewski
> > > > 
> > > > wrote:
> > > > > On Sat, 1 Feb 2014 03:55:20 +0100
> > > > > 
> > > > > Marek Vasut <marex@denx.de> wrote:
> > > > > > On Friday, January 31, 2014 at 01:16:27 PM, Lukasz Majewski
> > > > > > 
> > > > > > wrote:
> > > > > > > The Samsung's UDC driver is not anymore copying data from
> > > > > > > USB requests to data aligned internal buffers. Now it works
> > > > > > > directly in data allocated in the upper layers like UMS,
> > > > > > > DFU, THOR.
> > > > > > > 
> > > > > > > This change is possible since those gadgets now take care to
> > > > > > > allocate buffers aligned to cache line
> > > > > > > (CONFIG_SYS_CACHELINE_SIZE ).
> > > > > > > 
> > > > > > > Previously the UDC needed to copy this data to internal
> > > > > > > aligned buffer to prevent from unaligned access exceptions.
> > > > > > > 
> > > > > > > Test condition
> > > > > > > - test HW + measurement: Trats - Exynos4210 rev.1
> > > > > > > - test HW Trats2 - Exynos4412 rev.1
> > > > > > > 400 MiB compressed rootfs image download with `thor 0 mmc 0`
> > > > > > > 
> > > > > > > Measurement:
> > > > > > > Transmission speed: 27.04 MiB/s
> > > > > > > 
> > > > > > > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > > > > > > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > > > > > > Cc: Marek Vasut <marex@denx.de>
> > > > > > 
> > > > > > You should use ROUND_UP(), not ROUND() throughout the patch.
> > > > > > Otherwise you might fail to flush/invalidate the last little
> > > > > > bit of data in some cacheline.
> > > > > 
> > > > > I might overlooked something, so please correct me if needed.
> > > > > 
> > > > > I allocate buffers in gadgets which are aligned to cache line
> > > > > with starting address and its size is a multiplication of cache
> > > > > line size (so I will not trash data allocated next to it when I
> > > > > invalidate cache).
> > > > > 
> > > > > In the code I'm using ROUND to invalidate/flush more data than
> > > > > needed (ROUND(176, 32) = 192). I'm prepared for this since
> > > > > buffer in gadget is properly allocated (with
> > > > > DEFINE_CACHE_ALIGN_BUFFER() which uses roundup() internally).
> > > > 
> > > > The problem is in case you receive buffer which is aligned to
> > > > cacheline with it's start, but is [(k * cacheline_size) +
> > > > (cacheline_size / 2) - 1] big. I think it's unlikely, but if this
> > > > happens, you will get corruption, right ?
> > > 
> > > Let's suppose, that I will receive 2063B = [(64 * 32) + 16 -1] from
> > > UDC. If the passed buffer was exactly 2063 B in size, then we would
> > > have here a data corruption.
> > > 
> > > However this situation will not happen
> > 
> > _Should_ not happen ... I am absolutelly positive someone will be
> > bitten by such assumption. I think this assumption about buffer
> > alignment should really be documented somewhere.
> 
> I will add verbose commit message for this.

The commit message will get lost as time moves on, this should be clearly 
documented in some doc/ or code comment.

> > > since the buffer at gadget is
> > > allocated with DEFINE_CACHE_ALIGN_BUFFER() or is an aligned
> > > multiplication of cache line size (like 1MiB).
> > > 
> > > I think, that it is the responsibility of gadget developer to
> > > allocate buffers with proper alignment and size.
> > 
> > Document that please, I doubt this is documented anywhere, but it's
> > clearly part of the API. Also, some checks might be put in place for
> > the alignment , they might be in #ifdef DEBUG for all I care, but it
> > would be nice to have such a check, since I'm worried someone will
> > really be bitten.
> 
> I rely on the code embedded at cache_v7.c. It works and saved me a lot
> of trouble (since it always print "ERROR" when buffer alignment and
> size is wrong).
> 
> Also I think, that those checks shall be done at invalidate_* and
> flush_* cache routines, not at USB driver.

Not all CPUs are ARMv7 though.

> > > > You might actually want to
> > > > check for this condition and throw a warning in such a case.
> > > 
> > > The check is already implemented at ./arch/arm/cpu/armv7/cache_v7.c.
> > 
> > Yeah, for arm926ejs core as well. Maybe that check shall be shifted
> > into the cache management routine prototypes somehow ... not all CPUs
> > implement that check :-(
> 
> Maybe other ARM architectures shall have the cache management code
> updated to work in a similar way to cache_v7.c ?

Not all CPUs are ARM architecture ... there're others, you know ;-)

> > > It complains with "ERROR" message when start or end address is not
> > > aligned (that is how I've discovered the unaligned buffers at UMS).
> > 
> > Yes.
> > 
> > > > I understand your argument with trying to not trash data, but then
> > > > you will get a corruption during transfer, right ?
> > > 
> > > After applying those patches, the cache management would be
> > > performed when the USB request is completed (in the UDC).
> > > 
> > > The only requirement for UDC is the correctly allocated buffer at
> > > gadget.
> > 
> > Got it.
> 
> I will emphasize the need of correct buffer allocation at commit
> message and add some comment to UDC in the v2.
Lukasz Majewski Feb. 4, 2014, 9:49 p.m. UTC | #8
On Tue, 4 Feb 2014 21:21:16 +0100
Marek Vasut <marex@denx.de> wrote:

> On Tuesday, February 04, 2014 at 08:29:49 AM, Lukasz Majewski wrote:
> > Hi Marek,
> > 
> > > On Monday, February 03, 2014 at 12:06:59 PM, Lukasz Majewski
> > > wrote:
> > > > Hi Marek,
> > > > 
> > > > > On Saturday, February 01, 2014 at 12:05:29 PM, Lukasz Majewski
> > > > > 
> > > > > wrote:
> > > > > > On Sat, 1 Feb 2014 03:55:20 +0100
> > > > > > 
> > > > > > Marek Vasut <marex@denx.de> wrote:
> > > > > > > On Friday, January 31, 2014 at 01:16:27 PM, Lukasz
> > > > > > > Majewski
> > > > > > > 
> > > > > > > wrote:
> > > > > > > > The Samsung's UDC driver is not anymore copying data
> > > > > > > > from USB requests to data aligned internal buffers. Now
> > > > > > > > it works directly in data allocated in the upper layers
> > > > > > > > like UMS, DFU, THOR.
> > > > > > > > 
> > > > > > > > This change is possible since those gadgets now take
> > > > > > > > care to allocate buffers aligned to cache line
> > > > > > > > (CONFIG_SYS_CACHELINE_SIZE ).
> > > > > > > > 
> > > > > > > > Previously the UDC needed to copy this data to internal
> > > > > > > > aligned buffer to prevent from unaligned access
> > > > > > > > exceptions.
> > > > > > > > 
> > > > > > > > Test condition
> > > > > > > > - test HW + measurement: Trats - Exynos4210 rev.1
> > > > > > > > - test HW Trats2 - Exynos4412 rev.1
> > > > > > > > 400 MiB compressed rootfs image download with `thor 0
> > > > > > > > mmc 0`
> > > > > > > > 
> > > > > > > > Measurement:
> > > > > > > > Transmission speed: 27.04 MiB/s
> > > > > > > > 
> > > > > > > > Change-Id: I1df1fbafc72ec703f0367ddee3fedf3a3f5523ed
> > > > > > > > Signed-off-by: Lukasz Majewski <l.majewski@samsung.com>
> > > > > > > > Cc: Marek Vasut <marex@denx.de>
> > > > > > > 
> > > > > > > You should use ROUND_UP(), not ROUND() throughout the
> > > > > > > patch. Otherwise you might fail to flush/invalidate the
> > > > > > > last little bit of data in some cacheline.
> > > > > > 
> > > > > > I might overlooked something, so please correct me if
> > > > > > needed.
> > > > > > 
> > > > > > I allocate buffers in gadgets which are aligned to cache
> > > > > > line with starting address and its size is a multiplication
> > > > > > of cache line size (so I will not trash data allocated next
> > > > > > to it when I invalidate cache).
> > > > > > 
> > > > > > In the code I'm using ROUND to invalidate/flush more data
> > > > > > than needed (ROUND(176, 32) = 192). I'm prepared for this
> > > > > > since buffer in gadget is properly allocated (with
> > > > > > DEFINE_CACHE_ALIGN_BUFFER() which uses roundup()
> > > > > > internally).
> > > > > 
> > > > > The problem is in case you receive buffer which is aligned to
> > > > > cacheline with it's start, but is [(k * cacheline_size) +
> > > > > (cacheline_size / 2) - 1] big. I think it's unlikely, but if
> > > > > this happens, you will get corruption, right ?
> > > > 
> > > > Let's suppose, that I will receive 2063B = [(64 * 32) + 16 -1]
> > > > from UDC. If the passed buffer was exactly 2063 B in size, then
> > > > we would have here a data corruption.
> > > > 
> > > > However this situation will not happen
> > > 
> > > _Should_ not happen ... I am absolutelly positive someone will be
> > > bitten by such assumption. I think this assumption about buffer
> > > alignment should really be documented somewhere.
> > 
> > I will add verbose commit message for this.
> 
> The commit message will get lost as time moves on, this should be
> clearly documented in some doc/ or code comment.

Ok. I will add code comment.

> 
> > > > since the buffer at gadget is
> > > > allocated with DEFINE_CACHE_ALIGN_BUFFER() or is an aligned
> > > > multiplication of cache line size (like 1MiB).
> > > > 
> > > > I think, that it is the responsibility of gadget developer to
> > > > allocate buffers with proper alignment and size.
> > > 
> > > Document that please, I doubt this is documented anywhere, but
> > > it's clearly part of the API. Also, some checks might be put in
> > > place for the alignment , they might be in #ifdef DEBUG for all I
> > > care, but it would be nice to have such a check, since I'm
> > > worried someone will really be bitten.
> > 
> > I rely on the code embedded at cache_v7.c. It works and saved me a
> > lot of trouble (since it always print "ERROR" when buffer alignment
> > and size is wrong).
> > 
> > Also I think, that those checks shall be done at invalidate_* and
> > flush_* cache routines, not at USB driver.
> 
> Not all CPUs are ARMv7 though.
> 
> > > > > You might actually want to
> > > > > check for this condition and throw a warning in such a case.
> > > > 
> > > > The check is already implemented
> > > > at ./arch/arm/cpu/armv7/cache_v7.c.
> > > 
> > > Yeah, for arm926ejs core as well. Maybe that check shall be
> > > shifted into the cache management routine prototypes somehow ...
> > > not all CPUs implement that check :-(
> > 
> > Maybe other ARM architectures shall have the cache management code
> > updated to work in a similar way to cache_v7.c ?
> 
> Not all CPUs are ARM architecture ... there're others, you know ;-)

:-) ... but who cares about the rest :-)

To be serious (quite), I do believe that checking if unaligned
cache flush/invalidation is performed, shall be handled at the code
which is responsible for cache management.

Conceptually, it shall not be done at UDC code.

> 
> > > > It complains with "ERROR" message when start or end address is
> > > > not aligned (that is how I've discovered the unaligned buffers
> > > > at UMS).
> > > 
> > > Yes.
> > > 
> > > > > I understand your argument with trying to not trash data, but
> > > > > then you will get a corruption during transfer, right ?
> > > > 
> > > > After applying those patches, the cache management would be
> > > > performed when the USB request is completed (in the UDC).
> > > > 
> > > > The only requirement for UDC is the correctly allocated buffer
> > > > at gadget.
> > > 
> > > Got it.
> > 
> > I will emphasize the need of correct buffer allocation at commit
> > message and add some comment to UDC in the v2.

Best regards,
Lukasz Majewski
Marek Vasut Feb. 5, 2014, 2:35 a.m. UTC | #9
On Tuesday, February 04, 2014 at 10:49:24 PM, Lukasz Majewski wrote:

[...]

> > > Maybe other ARM architectures shall have the cache management code
> > > updated to work in a similar way to cache_v7.c ?
> > 
> > Not all CPUs are ARM architecture ... there're others, you know ;-)
> :
> :-) ... but who cares about the rest :-)

Me, duh. I have mips, sparc and m68k devices here and I'm not afraid to use 
them!

> To be serious (quite), I do believe that checking if unaligned
> cache flush/invalidation is performed, shall be handled at the code
> which is responsible for cache management.
> 
> Conceptually, it shall not be done at UDC code.

You have a point, ACK. Even better, we should have a wrapper for that which will 
only then call the cache management routine.
[...]

Best regards,
Marek Vasut
diff mbox

Patch

diff --git a/drivers/usb/gadget/s3c_udc_otg.c b/drivers/usb/gadget/s3c_udc_otg.c
index ba17a04..63d4487 100644
--- a/drivers/usb/gadget/s3c_udc_otg.c
+++ b/drivers/usb/gadget/s3c_udc_otg.c
@@ -843,7 +843,7 @@  static struct s3c_udc memory = {
 int s3c_udc_probe(struct s3c_plat_otg_data *pdata)
 {
 	struct s3c_udc *dev = &memory;
-	int retval = 0, i;
+	int retval = 0;
 
 	debug("%s: %p\n", __func__, pdata);
 
@@ -864,16 +864,15 @@  int s3c_udc_probe(struct s3c_plat_otg_data *pdata)
 
 	the_controller = dev;
 
-	for (i = 0; i < S3C_MAX_ENDPOINTS+1; i++) {
-		dev->dma_buf[i] = memalign(CONFIG_SYS_CACHELINE_SIZE,
-					   DMA_BUFFER_SIZE);
-		dev->dma_addr[i] = (dma_addr_t) dev->dma_buf[i];
-		invalidate_dcache_range((unsigned long) dev->dma_buf[i],
-					(unsigned long) (dev->dma_buf[i]
-							 + DMA_BUFFER_SIZE));
+	usb_ctrl = memalign(CONFIG_SYS_CACHELINE_SIZE,
+			    ROUND(sizeof(struct usb_ctrlrequest),
+				  CONFIG_SYS_CACHELINE_SIZE));
+	if (!usb_ctrl) {
+		error("No memory available for UDC!\n");
+		return -ENOMEM;
 	}
-	usb_ctrl = dev->dma_buf[0];
-	usb_ctrl_dma_addr = dev->dma_addr[0];
+
+	usb_ctrl_dma_addr = (dma_addr_t) usb_ctrl;
 
 	udc_reinit(dev);
 
diff --git a/drivers/usb/gadget/s3c_udc_otg_xfer_dma.c b/drivers/usb/gadget/s3c_udc_otg_xfer_dma.c
index 274b68c..3f9febc 100644
--- a/drivers/usb/gadget/s3c_udc_otg_xfer_dma.c
+++ b/drivers/usb/gadget/s3c_udc_otg_xfer_dma.c
@@ -75,8 +75,9 @@  static inline void s3c_ep0_complete_out(void)
 		"%s : Prepare Complete Out packet.\n", __func__);
 
 	invalidate_dcache_range((unsigned long) usb_ctrl_dma_addr,
-				(unsigned long) usb_ctrl_dma_addr
-				+ DMA_BUFFER_SIZE);
+				(unsigned long) usb_ctrl_dma_addr +
+				ROUND(sizeof(struct usb_ctrlrequest),
+				      CONFIG_SYS_CACHELINE_SIZE));
 
 	writel(DOEPT_SIZ_PKT_CNT(1) | sizeof(struct usb_ctrlrequest),
 	       &reg->out_endp[EP0_CON].doeptsiz);
@@ -114,8 +115,7 @@  static int setdma_rx(struct s3c_ep *ep, struct s3c_request *req)
 
 	ctrl =  readl(&reg->out_endp[ep_num].doepctl);
 
-	writel(the_controller->dma_addr[ep_index(ep)+1],
-	       &reg->out_endp[ep_num].doepdma);
+	writel((unsigned int) ep->dma_buf, &reg->out_endp[ep_num].doepdma);
 	writel(DOEPT_SIZ_PKT_CNT(pktcnt) | DOEPT_SIZ_XFER_SIZE(length),
 	       &reg->out_endp[ep_num].doeptsiz);
 	writel(DEPCTL_EPENA|DEPCTL_CNAK|ctrl, &reg->out_endp[ep_num].doepctl);
@@ -138,7 +138,6 @@  int setdma_tx(struct s3c_ep *ep, struct s3c_request *req)
 	u32 *buf, ctrl = 0;
 	u32 length, pktcnt;
 	u32 ep_num = ep_index(ep);
-	u32 *p = the_controller->dma_buf[ep_index(ep)+1];
 
 	buf = req->req.buf + req->req.actual;
 	length = req->req.length - req->req.actual;
@@ -148,10 +147,10 @@  int setdma_tx(struct s3c_ep *ep, struct s3c_request *req)
 
 	ep->len = length;
 	ep->dma_buf = buf;
-	memcpy(p, ep->dma_buf, length);
 
-	flush_dcache_range((unsigned long) p ,
-			   (unsigned long) p + DMA_BUFFER_SIZE);
+	flush_dcache_range((unsigned long) ep->dma_buf,
+			   (unsigned long) ep->dma_buf +
+			   ROUND(ep->len, CONFIG_SYS_CACHELINE_SIZE));
 
 	if (length == 0)
 		pktcnt = 1;
@@ -164,8 +163,7 @@  int setdma_tx(struct s3c_ep *ep, struct s3c_request *req)
 	while (readl(&reg->grstctl) & TX_FIFO_FLUSH)
 		;
 
-	writel(the_controller->dma_addr[ep_index(ep)+1],
-	       &reg->in_endp[ep_num].diepdma);
+	writel((unsigned long) ep->dma_buf, &reg->in_endp[ep_num].diepdma);
 	writel(DIEPT_SIZ_PKT_CNT(pktcnt) | DIEPT_SIZ_XFER_SIZE(length),
 	       &reg->in_endp[ep_num].dieptsiz);
 
@@ -198,7 +196,6 @@  static void complete_rx(struct s3c_udc *dev, u8 ep_num)
 	struct s3c_ep *ep = &dev->ep[ep_num];
 	struct s3c_request *req = NULL;
 	u32 ep_tsr = 0, xfer_size = 0, is_short = 0;
-	u32 *p = the_controller->dma_buf[ep_index(ep)+1];
 
 	if (list_empty(&ep->queue)) {
 		debug_cond(DEBUG_OUT_EP != 0,
@@ -218,10 +215,9 @@  static void complete_rx(struct s3c_udc *dev, u8 ep_num)
 
 	xfer_size = ep->len - xfer_size;
 
-	invalidate_dcache_range((unsigned long) p,
-				(unsigned long) p + DMA_BUFFER_SIZE);
-
-	memcpy(ep->dma_buf, p, ep->len);
+	invalidate_dcache_range((unsigned long) ep->dma_buf,
+				(unsigned long) ep->dma_buf +
+				ROUND(xfer_size, CONFIG_SYS_CACHELINE_SIZE));
 
 	req->req.actual += min(xfer_size, req->req.length - req->req.actual);
 	is_short = (xfer_size < ep->ep.maxpacket);
@@ -715,19 +711,14 @@  static int write_fifo_ep0(struct s3c_ep *ep, struct s3c_request *req)
 
 int s3c_fifo_read(struct s3c_ep *ep, u32 *cp, int max)
 {
-	u32 bytes;
-
-	bytes = sizeof(struct usb_ctrlrequest);
-
-	invalidate_dcache_range((unsigned long) ep->dev->dma_buf[ep_index(ep)],
-				(unsigned long) ep->dev->dma_buf[ep_index(ep)]
-				+ DMA_BUFFER_SIZE);
+	invalidate_dcache_range((unsigned long)cp, (unsigned long)cp +
+				ROUND(max, CONFIG_SYS_CACHELINE_SIZE));
 
 	debug_cond(DEBUG_EP0 != 0,
-		   "%s: bytes=%d, ep_index=%d %p\n", __func__,
-		   bytes, ep_index(ep), ep->dev->dma_buf[ep_index(ep)]);
+		   "%s: bytes=%d, ep_index=%d 0x%p\n", __func__,
+		   max, ep_index(ep), cp);
 
-	return bytes;
+	return max;
 }
 
 /**
@@ -859,14 +850,12 @@  static int s3c_ep0_write(struct s3c_udc *dev)
 	return 1;
 }
 
-u16	g_status;
-
 int s3c_udc_get_status(struct s3c_udc *dev,
 		struct usb_ctrlrequest *crq)
 {
 	u8 ep_num = crq->wIndex & 0x7F;
+	u16 g_status = 0;
 	u32 ep_ctrl;
-	u32 *p = the_controller->dma_buf[1];
 
 	debug_cond(DEBUG_SETUP != 0,
 		   "%s: *** USB_REQ_GET_STATUS\n", __func__);
@@ -904,12 +893,13 @@  int s3c_udc_get_status(struct s3c_udc *dev,
 		return 1;
 	}
 
-	memcpy(p, &g_status, sizeof(g_status));
+	memcpy(usb_ctrl, &g_status, sizeof(g_status));
 
-	flush_dcache_range((unsigned long) p,
-			   (unsigned long) p + DMA_BUFFER_SIZE);
+	flush_dcache_range((unsigned long) usb_ctrl,
+			   (unsigned long) usb_ctrl +
+			   ROUND(sizeof(g_status), CONFIG_SYS_CACHELINE_SIZE));
 
-	writel(the_controller->dma_addr[1], &reg->in_endp[EP0_CON].diepdma);
+	writel(usb_ctrl_dma_addr, &reg->in_endp[EP0_CON].diepdma);
 	writel(DIEPT_SIZ_PKT_CNT(1) | DIEPT_SIZ_XFER_SIZE(2),
 	       &reg->in_endp[EP0_CON].dieptsiz);
 
diff --git a/include/usb/s3c_udc.h b/include/usb/s3c_udc.h
index 734c6cd..6dead2f 100644
--- a/include/usb/s3c_udc.h
+++ b/include/usb/s3c_udc.h
@@ -19,7 +19,7 @@ 
 
 /*-------------------------------------------------------------------------*/
 /* DMA bounce buffer size, 16K is enough even for mass storage */
-#define DMA_BUFFER_SIZE	(4096*4)
+#define DMA_BUFFER_SIZE	(16*SZ_1K)
 
 #define EP0_FIFO_SIZE		64
 #define EP_FIFO_SIZE		512
@@ -81,9 +81,6 @@  struct s3c_udc {
 
 	struct s3c_plat_otg_data *pdata;
 
-	void *dma_buf[S3C_MAX_ENDPOINTS+1];
-	dma_addr_t dma_addr[S3C_MAX_ENDPOINTS+1];
-
 	int ep0state;
 	struct s3c_ep ep[S3C_MAX_ENDPOINTS];