[v2] ARC: io.h: Implement reads{x}()/writes{x}()

Message ID 19fb2e394afcb073bbc109e432417fbbc03323f6.1543499759.git.joabreu@synopsys.com
State New
Series [v2] ARC: io.h: Implement reads{x}()/writes{x}()

Commit Message

Jose Abreu Nov. 29, 2018, 2:29 p.m. UTC
Some ARC CPUs do not support unaligned loads/stores. Currently, the generic
implementation of reads{b/w/l}()/writes{b/w/l}() is used on ARC. This can
lead to malfunction of some drivers, as the generic functions do a plain
dereference of a pointer that may be unaligned.

Let's use the {get/put}_unaligned() helpers instead of a plain pointer
dereference in order to fix this.

Changes from v1:
- Check if buffer is already aligned (David)
- Remove 64 bit mention (Alexey)

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Tested-by: Vitor Soares <soares@synopsys.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Alexey Brodkin <abrodkin@synopsys.com>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: David Laight <David.Laight@ACULAB.COM>
---
 arch/arc/include/asm/io.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
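
For context, the generic fallback that this patch replaces stores each element
through a plain pointer dereference; paraphrased from include/asm-generic/io.h
(readsl() shown), it looks roughly like this:

--->8---
/* Roughly the asm-generic readsl(): note the plain *buf++ store, which is
 * not safe on ARC cores without unaligned load/store support whenever
 * 'buffer' is not 32-bit aligned.
 */
static inline void readsl(const volatile void __iomem *addr, void *buffer,
			  unsigned int count)
{
	if (count) {
		u32 *buf = buffer;

		do {
			u32 x = __raw_readl(addr);
			*buf++ = x;
		} while (--count);
	}
}
--->8---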

Comments

David Laight Nov. 29, 2018, 2:38 p.m. UTC | #1
From: Jose Abreu
> Sent: 29 November 2018 14:29
> 
> Some ARC CPU's do not support unaligned loads/stores. Currently, generic
> implementation of reads{b/w/l}()/writes{b/w/l}() is being used with ARC.
> This can lead to misfunction of some drivers as generic functions do a
> plain dereference of a pointer that can be unaligned.
> 
> Let's use {get/put}_unaligned() helper instead of plain dereference of
> pointer in order to fix this.
...
> +#define __raw_readsx(t,f) \
> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
> +				  void *buffer, unsigned int count) \
> +{ \
> +	if (count) { \
> +		const unsigned long bptr = (unsigned long)buffer; \
> +		u##t *buf = buffer; \
> +\
> +		do { \
> +			u##t x = __raw_read##f(addr); \
> +\
> +			/* Some ARC CPU's don't support unaligned accesses */ \
> +			if (bptr % ((t) / 8)) { \
> +				put_unaligned(x, buf++); \
> +			} else { \
> +				*buf++ = x; \
> +			} \
> +		} while (--count); \
> +	} \
> +}

Does the compiler move the alignment test outside the loop?
You really want two copies of the loop body.

	David

Jose Abreu Nov. 29, 2018, 2:42 p.m. UTC | #2
On 29-11-2018 14:38, David Laight wrote:
> From: Jose Abreu
>> Sent: 29 November 2018 14:29
>>
>> Some ARC CPU's do not support unaligned loads/stores. Currently, generic
>> implementation of reads{b/w/l}()/writes{b/w/l}() is being used with ARC.
>> This can lead to misfunction of some drivers as generic functions do a
>> plain dereference of a pointer that can be unaligned.
>>
>> Let's use {get/put}_unaligned() helper instead of plain dereference of
>> pointer in order to fix this.
> ...
>> +#define __raw_readsx(t,f) \
>> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
>> +				  void *buffer, unsigned int count) \
>> +{ \
>> +	if (count) { \
>> +		const unsigned long bptr = (unsigned long)buffer; \
>> +		u##t *buf = buffer; \
>> +\
>> +		do { \
>> +			u##t x = __raw_read##f(addr); \
>> +\
>> +			/* Some ARC CPU's don't support unaligned accesses */ \
>> +			if (bptr % ((t) / 8)) { \
>> +				put_unaligned(x, buf++); \
>> +			} else { \
>> +				*buf++ = x; \
>> +			} \
>> +		} while (--count); \
>> +	} \
>> +}
> Does the compiler move the alignment test outside the loop?
> You really want two copies of the loop body.

Hmm, I would expect so because the if condition takes two const
args ... I will try to check that.

Thanks and Best Regards,
Jose Miguel Abreu

>
> 	David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
>
Jose Abreu Nov. 29, 2018, 4:13 p.m. UTC | #3
On 29-11-2018 14:42, Jose Abreu wrote:
> On 29-11-2018 14:38, David Laight wrote:
>> From: Jose Abreu
>>> Sent: 29 November 2018 14:29
>>>
>>> Some ARC CPU's do not support unaligned loads/stores. Currently, generic
>>> implementation of reads{b/w/l}()/writes{b/w/l}() is being used with ARC.
>>> This can lead to misfunction of some drivers as generic functions do a
>>> plain dereference of a pointer that can be unaligned.
>>>
>>> Let's use {get/put}_unaligned() helper instead of plain dereference of
>>> pointer in order to fix this.
>> ...
>>> +#define __raw_readsx(t,f) \
>>> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
>>> +				  void *buffer, unsigned int count) \
>>> +{ \
>>> +	if (count) { \
>>> +		const unsigned long bptr = (unsigned long)buffer; \
>>> +		u##t *buf = buffer; \
>>> +\
>>> +		do { \
>>> +			u##t x = __raw_read##f(addr); \
>>> +\
>>> +			/* Some ARC CPU's don't support unaligned accesses */ \
>>> +			if (bptr % ((t) / 8)) { \
>>> +				put_unaligned(x, buf++); \
>>> +			} else { \
>>> +				*buf++ = x; \
>>> +			} \
>>> +		} while (--count); \
>>> +	} \
>>> +}
>> Does the compiler move the alignment test outside the loop?
>> You really want two copies of the loop body.
> Hmm, I would expect so because the if condition takes two const
> args ... I will try check that.

And it did optimize :)

Sample C Source:
--->8--
static noinline void test_readsl(char *buf, int len)
{
        readsl(0xdeadbeef, buf, len);
}
--->8---

And the disassembly:
--->8---
00000e88 <test_readsl>:
 e88:    breq.d r1,0,eac <0xeac>       /* if (count) */
 e8c:    and r2,r0,3

 e90:    mov_s lp_count,r1            /* r1 = count */
 e92:    brne r2,0,eb0 <0xeb0>        /* if (bptr % ((t) / 8)) */

 e96:    sub r0,r0,4
 e9a:    nop_s
 
 e9c:    lp eac <0xeac>                /* first loop */
 ea0:    ld r2,[0xdeadbeef]
 ea8:    st.a r2,[r0,4]
 eac:    j_s [blink]
 eae:    nop_s

 eb0:    lp ed6 <0xed6>                /* second loop */
 eb4:    ld r2,[0xdeadbeef]
 ebc:    lsr r5,r2,8
 ec0:    lsr r4,r2,16
 ec4:    lsr r3,r2,24
 ec8:    stb_s r2,[r0,0]
 eca:    stb r5,[r0,1]
 ece:    stb r4,[r0,2]
 ed2:    stb_s r3,[r0,3]
 ed4:    add_s r0,r0,4
 ed6:    j_s [blink]

--->8---

See how the if condition added in this version is checked in
<test_readsl+0xe92> and then it takes two different loops.

Thanks and Best Regards,
Jose Miguel Abreu

>
> Thanks and Best Regards,
> Jose Miguel Abreu
>
>> 	David
>>
>> -
>> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
>> Registration No: 1397386 (Wales)
>>
Vineet Gupta Nov. 29, 2018, 9:16 p.m. UTC | #4
On 11/29/18 6:29 AM, Jose Abreu wrote:
> Some ARC CPU's do not support unaligned loads/stores. Currently, generic
> implementation of reads{b/w/l}()/writes{b/w/l}() is being used with ARC.
> This can lead to misfunction of some drivers as generic functions do a
> plain dereference of a pointer that can be unaligned.
>
> Let's use {get/put}_unaligned() helper instead of plain dereference of
> pointer in order to fix this. 

Please add a few lines of comment about what the accessor does in general - maybe
copy-paste from the generic file if needed.
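
Something along these lines would do, as a sketch of the kind of comment being
asked for (wording is illustrative, not taken from the generic file):

--->8---
/*
 * __raw_reads{b,w,l}() / __raw_writes{b,w,l}(): read from or write to the
 * same MMIO address repeatedly, copying one element per iteration to/from
 * a memory buffer.  These back the readsb()/readsw()/readsl() and
 * writesb()/writesw()/writesl() string accessors used for FIFO-style
 * peripherals.
 */
--->8---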

> +#define __raw_readsx(t,f) \
> +static inline void __raw_reads##f(const volatile void __iomem *addr, \
> +				  void *buffer, unsigned int count) \
> +{ \
> +	if (count) { \

Could you reduce one level of indentation by checking for !count and returning early?
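
I.e. something like this at the top of the helper (a sketch of the early return
only, leaving out the unaligned-access handling discussed above):

--->8---
static inline void __raw_readsl(const volatile void __iomem *addr,
				void *buffer, unsigned int count)
{
	u32 *buf = buffer;

	if (!count)	/* early return drops one level of indentation */
		return;

	do {
		*buf++ = __raw_readl(addr);
	} while (--count);
}
--->8---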

> +		const unsigned long bptr = (unsigned long)buffer; \
> +		u##t *buf = buffer; \
> +\
> +		do { \
> +			u##t x = __raw_read##f(addr); \
> +\
> +			/* Some ARC CPU's don't support unaligned accesses */ \
> +			if (bptr % ((t) / 8)) { \

This math is loop-invariant, so instead of using bptr here, have something like
is_aligned = (unsigned long)buffer % ((t) / 8) outside the loop. A reasonable
compiler already does that, but this way is easier on the eyes.

> +				put_unaligned(x, buf++); \
> +			} else { \
> +				*buf++ = x; \
> +			} \
> +		} while (--count); \
> +	} \
> +}

Per your later post the compiler is doing a good job here, but sometimes it
might not. Adding a condition inside a loop is not a good idea in general. Better
to code the two loops separately.
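
A minimal sketch of the suggested shape (untested; early return, alignment test
hoisted out of the loop, and two separate loops), shown for the read side only;
the write side would follow the same pattern:

--->8---
#define __raw_readsx(t,f) \
static inline void __raw_reads##f(const volatile void __iomem *addr, \
				  void *buffer, unsigned int count) \
{ \
	bool is_aligned = ((unsigned long)buffer % ((t) / 8)) == 0; \
	u##t *buf = buffer; \
\
	if (!count) \
		return; \
\
	/* Some ARC CPU's don't support unaligned accesses */ \
	if (is_aligned) { \
		do { \
			*buf++ = __raw_read##f(addr); \
		} while (--count); \
	} else { \
		do { \
			put_unaligned(__raw_read##f(addr), buf++); \
		} while (--count); \
	} \
}
--->8---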

> +
> +#define __raw_readsb __raw_readsb
> +__raw_readsx(8, b);

The trailing ; is not needed

> +#define __raw_readsw __raw_readsw
> +__raw_readsx(16, w);
> +#define __raw_readsl __raw_readsl
> +__raw_readsx(32, l);

> +
>  #define __raw_writeb __raw_writeb
>  static inline void __raw_writeb(u8 b, volatile void __iomem *addr)
>  {
> @@ -126,6 +155,32 @@ static inline void __raw_writel(u32 w, volatile void __iomem *addr)
>  
>  }
>  
> +#define __raw_writesx(t,f) \
> +static inline void __raw_writes##f(volatile void __iomem *addr, \
> +				   const void *buffer, unsigned int count) \
> +{ \
> +	if (count) { \
> +		const unsigned long bptr = (unsigned long)buffer; \
> +		const u##t *buf = buffer; \
> +\
> +		do { \
> +			/* Some ARC CPU's don't support unaligned accesses */ \
> +			if (bptr % ((t) / 8)) { \
> +				__raw_write##f(get_unaligned(buf++), addr); \
> +			} else { \
> +				__raw_write##f(*buf++, addr); \
> +			} \
> +		} while (--count); \
> +	} \
> +}

Same as for read !

> +
> +#define __raw_writesb __raw_writesb
> +__raw_writesx(8, b);
> +#define __raw_writesw __raw_writesw
> +__raw_writesx(16, w);
> +#define __raw_writesl __raw_writesl
> +__raw_writesx(32, l);

Ditto !
Arnd Bergmann Nov. 29, 2018, 9:20 p.m. UTC | #5
On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:

> --->8--
> static noinline void test_readsl(char *buf, int len)
> {
>         readsl(0xdeadbeef, buf, len);
> }
> --->8---
>
> And the disassembly:
> --->8---
> 00000e88 <test_readsl>:
>  e88:    breq.dr1,0,eac <0xeac>        /* if (count) */
>  e8c:    and r2,r0,3
>
>  e90:    mov_s lp_count,r1            /* r1 = count */
>  e92:    brne r2,0,eb0 <0xeb0>        /* if (bptr % ((t) / 8)) */
>
>  e96:    sub r0,r0,4
>  e9a:    nop_s
>
>  e9c:    lp eac <0xeac>                /* first loop */
>  ea0:    ld r2,[0xdeadbeef]
>  ea8:    st.a r2,[r0,4]
>  eac:    j_s [blink]
>  eae:    nop_s
>
>  eb0:    lp ed6 <0xed6>                /* second loop */
>  eb4:    ld r2,[0xdeadbeef]
>  ebc:    lsr r5,r2,8
>  ec0:    lsr r4,r2,16
>  ec4:    lsr r3,r2,24
>  ec8:    stb_s r2,[r0,0]
>  eca:    stb r5,[r0,1]
>  ece:    stb r4,[r0,2]
>  ed2:    stb_s r3,[r0,3]
>  ed4:    add_s r0,r0,4
>  ed6:    j_s [blink]
>
> --->8---
>
> See how the if condition added in this version is checked in
> <test_readsl+0xe92> and then it takes two different loops.

This looks good to me. I wonder what the result is for CPUs
that /do/ support unaligned accesses. Normally put_unaligned()
should fall back to a simple store in that case, but I'm not
sure it can fold the two stores back into one and skip the
alignment check. Probably not worth overoptimizing for that
case (the MMIO access latency should be much higher than
anything you could gain here), but I'm still curious about
how well our get/put_unaligned macros work.

       Arnd
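
For reference (not part of the patch under review), the generic helpers behind
get_unaligned()/put_unaligned() in kernels of this vintage are built on the
packed-struct trick, roughly as in include/linux/unaligned/packed_struct.h;
architectures that declare efficient unaligned access get plain load/store
versions instead:

--->8---
/* Sketch of the packed-struct fallback: marking the field packed tells the
 * compiler it may be misaligned, so it emits byte accesses (or native
 * unaligned-capable instructions) as appropriate for the target.
 */
struct __una_u32 { u32 x; } __packed;

static inline u32 __get_unaligned_cpu32(const void *p)
{
	const struct __una_u32 *ptr = p;

	return ptr->x;
}

static inline void __put_unaligned_cpu32(u32 val, void *p)
{
	struct __una_u32 *ptr = p;

	ptr->x = val;
}
--->8---
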
Jose Abreu Nov. 30, 2018, 8:56 a.m. UTC | #6
On 29-11-2018 21:20, Arnd Bergmann wrote:
> On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:
>
>> --->8--
>> static noinline void test_readsl(char *buf, int len)
>> {
>>         readsl(0xdeadbeef, buf, len);
>> }
>> --->8---
>>
>> And the disassembly:
>> --->8---
>> 00000e88 <test_readsl>:
>>  e88:    breq.dr1,0,eac <0xeac>        /* if (count) */
>>  e8c:    and r2,r0,3
>>
>>  e90:    mov_s lp_count,r1            /* r1 = count */
>>  e92:    brne r2,0,eb0 <0xeb0>        /* if (bptr % ((t) / 8)) */
>>
>>  e96:    sub r0,r0,4
>>  e9a:    nop_s
>>
>>  e9c:    lp eac <0xeac>                /* first loop */
>>  ea0:    ld r2,[0xdeadbeef]
>>  ea8:    st.a r2,[r0,4]
>>  eac:    j_s [blink]
>>  eae:    nop_s
>>
>>  eb0:    lp ed6 <0xed6>                /* second loop */
>>  eb4:    ld r2,[0xdeadbeef]
>>  ebc:    lsr r5,r2,8
>>  ec0:    lsr r4,r2,16
>>  ec4:    lsr r3,r2,24
>>  ec8:    stb_s r2,[r0,0]
>>  eca:    stb r5,[r0,1]
>>  ece:    stb r4,[r0,2]
>>  ed2:    stb_s r3,[r0,3]
>>  ed4:    add_s r0,r0,4
>>  ed6:    j_s [blink]
>>
>> --->8---
>>
>> See how the if condition added in this version is checked in
>> <test_readsl+0xe92> and then it takes two different loops.
> This looks good to me. I wonder what the result is for CPUs
> that /do/ support unaligned accesses. Normally put_unaligned()
> should fall back to a simple store in that case, but I'm not
> sure it can fold the two stores back into one and skip the
> alignment check. Probably not worth overoptimizing for that
> case (the MMIO access latency should be much higher than
> anything you could gain here), but I'm still curious about
> how well our get/put_unaligned macros work.

Here is disassembly for an ARC CPU that supports unaligned accesses:

-->8---
00000d48 <test_readsl>:
 d48:    breq_s r1,0,28            /* if (count) */
 d4a:    tst    r0,0x3
 d4e:    bne_s 32                /* if (bptr % ((t) / 8)) */
 
 d50:    ld r2,[0xdeadbeef]        /* first loop */
 d58:    sub_s r1,r1,0x1
 d5a:    tst_s r1,r1
 d5c:    bne.d -12
 d60:    st.ab r2,[r0,4]
 
 d64:    dmb    0x1                    /* common exit point */
 d68:    j_s    [blink]
 d6a:    nop_s
 
 d6c:    ld r2,[0xdeadbeef]        /* second loop */
 d74:    sub_s r1,r1,0x1
 d76:    tst_s r1,r1
 d78:    bne.d -12
 d7c:    st.ab r2,[r0,4]

 d80:    b_s -28                    /* jmp to 0xd64 */
 d82:    nop_s
--->8---

Notice how the first and second loops are exactly equal ...

Thanks and Best Regards,
Jose Miguel Abreu

>
>        Arnd
Arnd Bergmann Nov. 30, 2018, 1:44 p.m. UTC | #7
On Fri, Nov 30, 2018 at 9:57 AM Jose Abreu <jose.abreu@synopsys.com> wrote:
> On 29-11-2018 21:20, Arnd Bergmann wrote:
> > On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:
> >> See how the if condition added in this version is checked in
> >> <test_readsl+0xe92> and then it takes two different loops.
> > This looks good to me. I wonder what the result is for CPUs
> > that /do/ support unaligned accesses. Normally put_unaligned()
> > should fall back to a simple store in that case, but I'm not
> > sure it can fold the two stores back into one and skip the
> > alignment check. Probably not worth overoptimizing for that
> > case (the MMIO access latency should be much higher than
> > anything you could gain here), but I'm still curious about
> > how well our get/put_unaligned macros work.
>
> Here is disassembly for an ARC CPU that supports unaligned accesses:
>
> -->8---
> 00000d48 <test_readsl>:
>  d48:    breq_s r1,0,28            /* if (count) */
>  d4a:    tst    r0,0x3
>  d4e:    bne_s 32                /* if (bptr % ((t) / 8)) */
>
>  d50:    ld r2,[0xdeadbeef]        /* first loop */
>  d58:    sub_s r1,r1,0x1
>  d5a:    tst_s r1,r1
>  d5c:    bne.d -12
>  d60:    st.ab r2,[r0,4]
>
>  d64:    dmb    0x1                    /* common exit point */
>  d68:    j_s    [blink]
>  d6a:    nop_s
>
>  d6c:    ld r2,[0xdeadbeef]        /* second loop */
>  d74:    sub_s r1,r1,0x1
>  d76:    tst_s r1,r1
>  d78:    bne.d -12
>  d7c:    st.ab r2,[r0,4]
>
>  d80:    b_s -28                    /* jmp to 0xd64 */
>  d82:    nop_s
> --->8---
>
> Notice how first and second loop are exactly equal ...

Ok, so it's halfway there: it managed to optimize the unaligned
case correctly, but it failed to notice that both sides are
identical now.

      Arnd
David Laight Nov. 30, 2018, 1:57 p.m. UTC | #8
From: Arnd Bergmann
> Sent: 30 November 2018 13:44
> 
> On Fri, Nov 30, 2018 at 9:57 AM Jose Abreu <jose.abreu@synopsys.com> wrote:
> > On 29-11-2018 21:20, Arnd Bergmann wrote:
> > > On Thu, Nov 29, 2018 at 5:14 PM Jose Abreu <jose.abreu@synopsys.com> wrote:
> > >> See how the if condition added in this version is checked in
> > >> <test_readsl+0xe92> and then it takes two different loops.
> > > This looks good to me. I wonder what the result is for CPUs
> > > that /do/ support unaligned accesses. Normally put_unaligned()
> > > should fall back to a simple store in that case, but I'm not
> > > sure it can fold the two stores back into one and skip the
> > > alignment check. Probably not worth overoptimizing for that
> > > case (the MMIO access latency should be much higher than
> > > anything you could gain here), but I'm still curious about
> > > how well our get/put_unaligned macros work.
> >
> > Here is disassembly for an ARC CPU that supports unaligned accesses:
> >
> > -->8---
> > 00000d48 <test_readsl>:
> >  d48:    breq_s r1,0,28            /* if (count) */
> >  d4a:    tst    r0,0x3
> >  d4e:    bne_s 32                /* if (bptr % ((t) / 8)) */
> >
> >  d50:    ld r2,[0xdeadbeef]        /* first loop */
> >  d58:    sub_s r1,r1,0x1
> >  d5a:    tst_s r1,r1
> >  d5c:    bne.d -12
> >  d60:    st.ab r2,[r0,4]
> >
> >  d64:    dmb    0x1                    /* common exit point */
> >  d68:    j_s    [blink]
> >  d6a:    nop_s
> >
> >  d6c:    ld r2,[0xdeadbeef]        /* second loop */
> >  d74:    sub_s r1,r1,0x1
> >  d76:    tst_s r1,r1
> >  d78:    bne.d -12
> >  d7c:    st.ab r2,[r0,4]
> >
> >  d80:    b_s -28                    /* jmp to 0xd64 */
> >  d82:    nop_s
> > --->8---
> >
> > Notice how first and second loop are exactly equal ...
> 
> Ok, so it's halfway there: it managed to optimize the the unaligned
> case correctly, but it failed to notice that both sides are
> identical now.

There're even identical opcodes...
The barrier() (etc) in the asm output probably stopped the optimisation.

It also seems to have used a different type of loop to the
other example, probably less efficient.
(Not that I'm an expert on ARC opcodes.)

	David

Vineet Gupta Nov. 30, 2018, 7 p.m. UTC | #9
On 11/30/18 5:57 AM, David Laight wrote:
> There're even identical opcodes...
> The barrier() (etc) in the asm output probably stopped the optimisation.
>
> It also seems to have used a different type of loop to the
> other example, probably less efficient.
> (Not that I'm an expert on ARC opcodes.)

The difference is due to the ISA and the corresponding ARC gcc backends. ARCompact-based
cores don't support unaligned access, and the loop there is a ZOL (zero-delay loop). On
ARCv2-based cores, the gcc backend has been tweaked to generate fewer ZOLs, hence
you see the more canonical tst-and-branch style loop.

-Vineet
David Laight Dec. 3, 2018, 10:10 a.m. UTC | #10
From: Vineet Gupta
...
> > It also seems to have used a different type of loop to the
> > other example, probably less efficient.
> > (Not that I'm an expert on ARC opcodes.)
> 
> The difference is due to ISA and ensuing ARC gcc backends. ARCompact based cores
> don't support unaligned access and the loop there was ZOL (Zero delay loop). In
> ARCv2 based cores, the gcc backend has been tweaked to generate fewer ZOLs hence
> you see the more canonical tst and branch style loop.

Is this another case of the hardware implementing 'hardware' loop
instructions that execute slower than ones made of simple instructions?

The worst example has to be the x86 'loop' (dec cx and jump nz)
instruction which is microcoded on intel cpus.
That makes it very difficult to use the new addx instruction to
get two dependency chains through a loop.

	David

Vineet Gupta Dec. 3, 2018, 5:31 p.m. UTC | #11
On 12/3/18 2:10 AM, David Laight wrote:
> From: Vineet Gupta
> ...
>>> It also seems to have used a different type of loop to the
>>> other example, probably less efficient.
>>> (Not that I'm an expert on ARC opcodes.)
>> The difference is due to ISA and ensuing ARC gcc backends. ARCompact based cores
>> don't support unaligned access and the loop there was ZOL (Zero delay loop). In
>> ARCv2 based cores, the gcc backend has been tweaked to generate fewer ZOLs hence
>> you see the more canonical tst and branch style loop.
> Is this another case of the hardware implementing 'hardware' loop
> instructions that execute slower than ones made of simple instructions?

Not really. ZOLs allow for hardware loops with no instruction/cycle overhead in
general. However, as micro-arches get more complicated, there are newer "gizmos"
added to the machinery which sometimes make it harder for the compilers to
optimize for all the cases. The ARCv2 ISA has a new DBNZ instruction (similar to the
x86 one you refer to below) to implement loops, and that is preferred over the ZOL.

> The worst example has to be the x86 'loop' (dec cx and jump nz)
> instruction which is microcoded on intel cpus.
> That makes it very difficult to use the new addx instruction to
> get two dependency chains through a loop.

Patch

diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h
index c22b181e8206..949759a45cff 100644
--- a/arch/arc/include/asm/io.h
+++ b/arch/arc/include/asm/io.h
@@ -12,6 +12,7 @@ 
 #include <linux/types.h>
 #include <asm/byteorder.h>
 #include <asm/page.h>
+#include <asm/unaligned.h>
 
 #ifdef CONFIG_ISA_ARCV2
 #include <asm/barrier.h>
@@ -94,6 +95,34 @@  static inline u32 __raw_readl(const volatile void __iomem *addr)
 	return w;
 }
 
+#define __raw_readsx(t,f) \
+static inline void __raw_reads##f(const volatile void __iomem *addr, \
+				  void *buffer, unsigned int count) \
+{ \
+	if (count) { \
+		const unsigned long bptr = (unsigned long)buffer; \
+		u##t *buf = buffer; \
+\
+		do { \
+			u##t x = __raw_read##f(addr); \
+\
+			/* Some ARC CPU's don't support unaligned accesses */ \
+			if (bptr % ((t) / 8)) { \
+				put_unaligned(x, buf++); \
+			} else { \
+				*buf++ = x; \
+			} \
+		} while (--count); \
+	} \
+}
+
+#define __raw_readsb __raw_readsb
+__raw_readsx(8, b);
+#define __raw_readsw __raw_readsw
+__raw_readsx(16, w);
+#define __raw_readsl __raw_readsl
+__raw_readsx(32, l);
+
 #define __raw_writeb __raw_writeb
 static inline void __raw_writeb(u8 b, volatile void __iomem *addr)
 {
@@ -126,6 +155,32 @@  static inline void __raw_writel(u32 w, volatile void __iomem *addr)
 
 }
 
+#define __raw_writesx(t,f) \
+static inline void __raw_writes##f(volatile void __iomem *addr, \
+				   const void *buffer, unsigned int count) \
+{ \
+	if (count) { \
+		const unsigned long bptr = (unsigned long)buffer; \
+		const u##t *buf = buffer; \
+\
+		do { \
+			/* Some ARC CPU's don't support unaligned accesses */ \
+			if (bptr % ((t) / 8)) { \
+				__raw_write##f(get_unaligned(buf++), addr); \
+			} else { \
+				__raw_write##f(*buf++, addr); \
+			} \
+		} while (--count); \
+	} \
+}
+
+#define __raw_writesb __raw_writesb
+__raw_writesx(8, b);
+#define __raw_writesw __raw_writesw
+__raw_writesx(16, w);
+#define __raw_writesl __raw_writesl
+__raw_writesx(32, l);
+
 /*
  * MMIO can also get buffered/optimized in micro-arch, so barriers needed
  * Based on ARM model for the typical use case
@@ -141,10 +196,16 @@  static inline void __raw_writel(u32 w, volatile void __iomem *addr)
 #define readb(c)		({ u8  __v = readb_relaxed(c); __iormb(); __v; })
 #define readw(c)		({ u16 __v = readw_relaxed(c); __iormb(); __v; })
 #define readl(c)		({ u32 __v = readl_relaxed(c); __iormb(); __v; })
+#define readsb(p,d,l)		({ __raw_readsb(p,d,l); __iormb(); })
+#define readsw(p,d,l)		({ __raw_readsw(p,d,l); __iormb(); })
+#define readsl(p,d,l)		({ __raw_readsl(p,d,l); __iormb(); })
 
 #define writeb(v,c)		({ __iowmb(); writeb_relaxed(v,c); })
 #define writew(v,c)		({ __iowmb(); writew_relaxed(v,c); })
 #define writel(v,c)		({ __iowmb(); writel_relaxed(v,c); })
+#define writesb(p,d,l)		({ __iowmb(); __raw_writesb(p,d,l); })
+#define writesw(p,d,l)		({ __iowmb(); __raw_writesw(p,d,l); })
+#define writesl(p,d,l)		({ __iowmb(); __raw_writesl(p,d,l); })
 
 /*
  * Relaxed API for drivers which can handle barrier ordering themselves