
[1/1] sparc: support for -mmisalign in the SPARC M8

Message ID 1501687671-143345-1-git-send-email-qing.zhao@oracle.com
State New

Commit Message

Qing Zhao Aug. 2, 2017, 3:27 p.m. UTC
This patch adds support to GCC for the misaligned load/store
    instructions introduced in the Oracle SPARC Architecture 2017 and
    implemented by the SPARC M8 processor.

    A new command-line option -mmisalign is added that activates the
    use of the new instructions.

    The SPARC backend is modified to use the misaligned load/store
    instructions when loading/storing data from/to addresses that are
    known to be misaligned at compile time (such as in packed structs).
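
    For illustration (example mine, not part of the patch), a minimal
    case where the misalignment is known at compile time:

        struct __attribute__((packed)) s { char c; int i; }; /* i at offset 1 */

        int load (struct s *p)
        {
          return p->i; /* with -mmisalign this can become a single ldmsw */
        }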

    New tests are added to check that the proper instructions are used
    when loading and storing from/to packed structs.

    The GCC manual is expanded to cover the new command-line option.

    gcc/ChangeLog:

        * config/sparc/constraints.md: New constraint B for memory
        references whose addresses are misaligned.
        * config/sparc/m8.md: New insn reservation for misaligned
        load/store.
        * config/sparc/sparc-protos.h (memory_is_misaligned): New.
        * config/sparc/sparc.c (dump_target_flag_bits): Dump
        MASK_MISALIGN.
        (sparc_option_override): Honour MASK_MISALIGN; let
        target_flags explicitly specified on the command line
        override the CPU-default target_flags.
        (RTX_OK_FOR_OFFSET_P): For TARGET_MISALIGN, treat only
        10-bit immediates as legal.
        (sparc_legitimate_address_p): Prohibit LO_SUM+IMM for
        TARGET_MISALIGN.
        (memory_is_misaligned): New.
        * config/sparc/sparc.h (STRICT_ALIGNMENT): Set to
        !(TARGET_MISALIGN).
        * config/sparc/sparc.md (cpu_feature): Add new feature
        misalign.
        (enabled): Handle misalign.
        (type): New insn types load_mis, store_mis, fpload_mis,
        fpstore_mis.
        ("*movhi_insn"): Add new alternatives for misaligned memory
        accesses that use the M8 misaligned load/store insns; update
        the corresponding attributes.
        ("*movsi_insn"): Likewise.
        ("*movdi_insn_sp32"): Likewise.
        ("*movdi_insn_sp64"): Likewise.
        ("*movsf_insn"): Likewise.
        ("*movdf_insn_sp32"): Likewise.
        ("*movdf_insn_sp64"): Likewise.
        ("*mov<VM32:mode>_insn"): Likewise.
        ("*mov<VM64:mode>_insn_sp64"): Likewise.
        ("*mov<VM64:mode>_insn_sp32"): Likewise.
        (define_split): Disable splitting DI "memory_operand" from
        "const_zero_operand" for TARGET_MISALIGN.
        (define_split): Likewise for DF "memory_operand" from
        "const_zero_operand".
        (define_split): Likewise for VM64 "memory_operand" from
        "const_zero_operand".
        * config/sparc/sparc.opt (mmisalign): New option.
        * doc/invoke.texi (Option Summary): Document -mmisalign and
        -mno-misalign.
        (SPARC Options): Likewise.

    gcc/testsuite/ChangeLog:

        * gcc.target/sparc/misalign-1.c: New test for misaligned ld/st.
        * gcc.target/sparc/misalign-2.c: Likewise.
        * gcc.target/sparc/misalign-3.c: Likewise.
        * gcc.target/sparc/misalign-4.c: Likewise.
        * gcc.target/sparc/misalign-5.c: Likewise.
        * gcc.target/sparc/misalign-run-1.c: Likewise.
        * gcc.target/sparc/misalign-run-2.c: Likewise.
        * gcc.target/sparc/misalign-run-3.c: Likewise.
        * lib/target-supports.exp (check_effective_target_misalign_hw):
        New procedure.
---
 gcc/config/sparc/constraints.md                 |   12 ++-
 gcc/config/sparc/m8.md                          |   16 +-
 gcc/config/sparc/sparc-protos.h                 |    1 +
 gcc/config/sparc/sparc.c                        |   60 +++++++-
 gcc/config/sparc/sparc.h                        |    5 +-
 gcc/config/sparc/sparc.md                       |  183 +++++++++++++++--------
 gcc/config/sparc/sparc.opt                      |    4 +
 gcc/doc/invoke.texi                             |   10 ++
 gcc/testsuite/gcc.target/sparc/misalign-1.c     |   45 ++++++
 gcc/testsuite/gcc.target/sparc/misalign-2.c     |   23 +++
 gcc/testsuite/gcc.target/sparc/misalign-3.c     |   42 +++++
 gcc/testsuite/gcc.target/sparc/misalign-4.c     |   23 +++
 gcc/testsuite/gcc.target/sparc/misalign-5.c     |   19 +++
 gcc/testsuite/gcc.target/sparc/misalign-run-1.c |   34 ++++
 gcc/testsuite/gcc.target/sparc/misalign-run-2.c |   23 +++
 gcc/testsuite/gcc.target/sparc/misalign-run-3.c |   53 +++++++
 gcc/testsuite/lib/target-supports.exp           |   15 ++
 17 files changed, 485 insertions(+), 83 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-1.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-2.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-3.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-4.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-5.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-run-1.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-run-2.c
 create mode 100644 gcc/testsuite/gcc.target/sparc/misalign-run-3.c

Comments

David Miller Aug. 2, 2017, 4:42 p.m. UTC | #1
From: qinzhao <qing.zhao@oracle.com>
Date: Wed,  2 Aug 2017 10:27:51 -0500

>     This patch adds support to GCC for the misaligned load/store
>     instructions introduced in the Oracle SPARC Architecture 2017 and
>     implemented by the SPARC M8 processor.
> 
>     A new command-line option -mmisalign is added that activates the
>     use of the new instructions.
> 
>     The SPARC backend is modified to use the misaligned load/store
>     instructions when loading/storing data from/to addresses that are
>     known to be misaligned at compile time (such as in packed structs).
> 
>     New tests are added to check that the proper instructions are used
>     when loading and storing from/to packed structs.
> 
>     The GCC manual is expanded to cover the new command-line option.

STRICT_ALIGNMENT has a lot of implications.

I think just because we happen to have misaligned loads and stores
available doesn't mean we want all of the side effects associated
with STRICT_ALIGNMENT being true.
Qing Zhao Aug. 2, 2017, 8:16 p.m. UTC | #2
Hi, David,

thanks a lot for your comment.

see my reply below

> STRICT_ALIGNMENT has a lot of implications.

from the definition of STRICT_ALIGNMENT:

/* Set this nonzero if move instructions will actually fail to work
   when given unaligned data.  */
#define STRICT_ALIGNMENT 1

for TARGET_MISALIGN, it’s clear that move instructions will NOT fail to work when given unaligned data,
so it’s reasonable to set STRICT_ALIGNMENT to 0 for TARGET_MISALIGN.

> 
> I think just because we happen to have misaligned loads and stores
> available doesn't mean we want all of the side effects associated
> with STRICT_ALIGNMENT being true.

so, could you please specify what kind of side effects there will be
when STRICT_ALIGNMENT is set to true on TARGET_MISALIGN?

thanks a lot.

Qing
David Miller Aug. 2, 2017, 11:17 p.m. UTC | #3
From: Qing Zhao <qing.zhao@oracle.com>
Date: Wed, 2 Aug 2017 14:41:51 -0500

> so, could you please specify what kind of side effects there will be
> when STRICT_ALIGNMENT is set to true on TARGET_MISALIGN?

Why don't you read the code rather than just relying upon what
high level description is given by the documentation instead?

Thanks.
Qing Zhao Aug. 3, 2017, 1:44 p.m. UTC | #4
> On Aug 2, 2017, at 6:17 PM, David Miller <davem@davemloft.net> wrote:
> 
> From: Qing Zhao <qing.zhao@oracle.com>
> Date: Wed, 2 Aug 2017 14:41:51 -0500
> 
>> so, could you please specify what kind of side effects there will be
>> when STRICT_ALIGNMENT is set to true on TARGET_MISALIGN?
> 
> Why don't you read the code rather than just relying upon what
> high level description is given by the documentation instead?

I read the code before making the change; that’s why I asked you to specify clearly the bad side effects that I haven’t considered yet.

thanks.

Qing
> 
> Thanks.
Qing Zhao Aug. 3, 2017, 3:37 p.m. UTC | #5
To be more specific: reading all the code corresponding to “STRICT_ALIGNMENT” and “SLOW_UNALIGNED_ACCESS” in gcc
(NOTE: SLOW_UNALIGNED_ACCESS is the same as STRICT_ALIGNMENT when it is NOT defined explicitly, which is the case for SPARC),

we can draw the following summary:

all the special handling of STRICT_ALIGNMENT or SLOW_UNALIGNED_ACCESS in this code follows the same common logic:

if a memory access is known at compile time to be misaligned, and the target platform does NOT support fast unaligned memory
access, the compiler will try to make the access properly aligned. Otherwise, if the target platform does support fast unaligned
memory access, the compiler leaves the compile-time-known misaligned access as is, and the hardware support kicks in for it later.

this behavior is consistent with the high-level definition of STRICT_ALIGNMENT.

And it is also consistent with the M8 misaligned support:

if the target is NOT TARGET_MISALIGN, STRICT_ALIGNMENT is 1, and all compile-time-known misaligned memory accesses are adjusted
into aligned accesses before RTL generation;
on the other hand, if the target is TARGET_MISALIGN, STRICT_ALIGNMENT is 0, and the compile-time-known misaligned memory accesses
are NOT adjusted. After RTL generation we still have compile-time-known misaligned memory accesses, and we can use the new
misaligned ld/st hardware insns to implement them.
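
as an illustration (a sketch of mine, not from the patch), here is roughly what the two settings mean for a 32-bit load from a
pointer that is known to be misaligned (SPARC is big-endian):

    unsigned int load_strict (const unsigned char *p)
    {
      /* STRICT_ALIGNMENT == 1: byte loads plus shifts, always safe.  */
      return ((unsigned int) p[0] << 24) | ((unsigned int) p[1] << 16)
             | ((unsigned int) p[2] << 8) | (unsigned int) p[3];
    }

    unsigned int load_relaxed (const void *p)
    {
      /* STRICT_ALIGNMENT == 0: one (possibly misaligned) word load; the
         hardware (e.g. ldmsw on M8) handles the misalignment.  */
      return *(const unsigned int *) p;
    }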

hope this is clear.

thanks.

Qing


>> 
>> Why don't you read the code rather than just relying upon what
>> high level description is given by the documentation instead?
> 
> I read the code before making the change; that’s why I asked you to specify clearly the bad side effects that I haven’t considered yet.
> 
> thanks.
> 
> Qing
David Miller Aug. 3, 2017, 4:40 p.m. UTC | #6
From: Qing Zhao <qing.zhao@oracle.com>
Date: Thu, 3 Aug 2017 10:37:15 -0500

> all the special handling of STRICT_ALIGNMENT or
> SLOW_UNALIGNED_ACCESS in this code follows the same common
> logic:
> 
> if a memory access is known at compile time to be misaligned,
> and the target platform does NOT support fast unaligned memory
> access, the compiler will try to make the access properly
> aligned. Otherwise, if the target platform does support fast
> unaligned memory access, the compiler leaves the
> compile-time-known misaligned access as is, and the hardware
> support kicks in for it later.
> 
> this behavior is consistent with the high-level definition of STRICT_ALIGNMENT.

That's exactly the problem.

What you want with this M8 feature is simply to let the compiler know
that if it is completely impossible to make some memory object
aligned, then the cpu can handle this with special instructions.

You still want the compiler to make the effort to align data when it
can because the accesses will be faster than if it used the unaligned
loads and stores.

This is incredibly important for on-stack objects.
Qing Zhao Aug. 3, 2017, 6:49 p.m. UTC | #7
> On Aug 3, 2017, at 11:40 AM, David Miller <davem@davemloft.net> wrote:
> 
> From: Qing Zhao <qing.zhao@oracle.com>
> Date: Thu, 3 Aug 2017 10:37:15 -0500
> 
>> all the special handling of STRICT_ALIGNMENT or
>> SLOW_UNALIGNED_ACCESS in this code follows the same common
>> logic:
>> 
>> if a memory access is known at compile time to be misaligned,
>> and the target platform does NOT support fast unaligned memory
>> access, the compiler will try to make the access properly
>> aligned. Otherwise, if the target platform does support fast
>> unaligned memory access, the compiler leaves the
>> compile-time-known misaligned access as is, and the hardware
>> support kicks in for it later.
>> 
>> this behavior is consistent with the high-level definition of STRICT_ALIGNMENT.
> 
> That's exactly the problem.
> 
> What you want with this M8 feature is simply to let the compiler know
> that if it is completely impossible to make some memory object
> aligned, then the cpu can handle this with special instructions.

> 
> You still want the compiler to make the effort to align data when it
> can because the accesses will be faster than if it used the unaligned
> loads and stores.

I don’t think the above is true.

first, a compile-time-known misaligned memory access can always be emulated with aligned accesses (byte-size loads/stores); then no
compile-time-known misaligned memory accesses would be left for the special misaligned ld/st insns.
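
for instance (a sketch of mine, not from the patch), a misaligned 8-byte store can always be emulated with byte stores, in SPARC’s
big-endian byte order:

    void store_ll (unsigned char *p, unsigned long long v)
    {
      /* eight aligned byte stores in place of one misaligned 64-bit store */
      for (int i = 0; i < 8; i++)
        p[i] = (unsigned char) (v >> (8 * (7 - i)));
    }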

second, there is always an overhead cost for the compile-time effort to turn a compile-time-known unaligned memory access into
aligned accesses (adding extra padding, or splitting the unaligned multi-byte access into single-byte loads/stores); all such
overhead might be even bigger than the overhead of the special misaligned load/store itself.

to decide which is better (software emulation or the hardware misaligned load/store insns), experiments might be needed to quantify the performance impact.

This set of changes provides a way to use the misaligned load/store insns to implement compile-time-known unaligned memory accesses;  -mno-misalign can be used
to disable this behavior very easily if our performance data shows that the misaligned load/store insns are slower than the current software emulation.
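
for example (hypothetical invocations; the file name is a placeholder):

    gcc -mcpu=m8 -O2 test.c                  # misaligned insns are on by default for m8
    gcc -mcpu=m8 -mno-misalign -O2 test.c    # fall back to the software emulation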

Qing


> 
> This is incredibly important for on-stack objects.
Qing Zhao Sept. 6, 2017, 7:41 p.m. UTC | #8
Just a followup on this patch.

We did some internal run-time performance testing of this set of changes
on a SPARC M8 machine, with -mmisalign and -mno-misalign, based on the
latest upstream gcc.

for the CPU2017 C/C++ SPEED run:

***without -O, -mmisalign slows down run-time performance by about 4% on
average

This is mainly due to the following workaround for the misaligned support
in M8 (config/sparc/sparc.c):

+/* For the misaligned ld/st insns provided by M8, the IMM field is 10 bits
+   wide rather than the 13 bits for regular ld/st.
+   The best solution for this problem is to distinguish for each ld/st
+   whether it is aligned or misaligned. However, due to the current
+   design of the common routine TARGET_LEGITIMATE_ADDRESS_P, only
+   the ADDR of a ld/st is passed to the routine; the alignment info
+   carried by the corresponding MEM is NOT passed in. Without changing
+   the prototype of TARGET_LEGITIMATE_ADDRESS_P, we cannot use this
+   best solution.
+   As a workaround, we have to conservatively treat the IMM field of
+   ALL ld/st insns on a MISALIGNED target as 10 bits wide.
+   The side effect of this workaround is that there will be an additional
+   REG<-IMM insn generated for regular ld/st when -mmisalign is ON.
+   However, such additional reload insns should be easily removed by
+   optimizations whenever -O is specified.
+*/
+#define RTX_OK_FOR_OFFSET_P(X, MODE)                     \
+  (CONST_INT_P (X)                                       \
+   && ((!TARGET_MISALIGN                                 \
+        && INTVAL (X) >= -0x1000                         \
+        && INTVAL (X) <= (0x1000 - GET_MODE_SIZE (MODE)))\
+    || (TARGET_MISALIGN                                  \
+        && INTVAL (X) >= -0x0400                         \
+        && INTVAL (X) <= (0x0400 - GET_MODE_SIZE (MODE)))))
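
to illustrate the cost (a sketch of mine, with hypothetical register choices): a well-aligned load whose offset fits the regular
13-bit immediate but not the conservative 10-bit range now needs an extra set-up instruction:

    struct big { char pad[0x800]; long long ll; } a;  /* a.ll is aligned */

    long long get (void)
    {
      return a.ll;
      /* hypothetical codegen, with %o0 holding &a:
         without -mmisalign:  ldx  [%o0 + 0x800], %o0
         with    -mmisalign:  mov  0x800, %g1          ! extra reg<-imm insn
                              ldx  [%o0 + %g1], %o0 */
    }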

Because the run-time regression introduced by this workaround is not
trivial, we decided to put this set of changes on hold at this time.

Thanks.

Qing

> 
> This set of changes provides a way to use the misaligned load/store insns to implement compile-time-known unaligned memory accesses;  -mno-misalign can be used
> to disable this behavior very easily if our performance data shows that the misaligned load/store insns are slower than the current software emulation.
> 
> Qing

Patch

diff --git a/gcc/config/sparc/constraints.md b/gcc/config/sparc/constraints.md
index cff5a61..ba15233 100644
--- a/gcc/config/sparc/constraints.md
+++ b/gcc/config/sparc/constraints.md
@@ -18,7 +18,6 @@ 
 ;; <http://www.gnu.org/licenses/>.
 
 ;;; Unused letters:
-;;;     B
 ;;;    a        jkl        uv xyz
 
 
@@ -201,6 +200,17 @@ 
   "A memory with only a base register"
   (match_operand 0 "mem_noofs_operand"))
 
+;; We need a special memory constraint for the misaligned memory access
+;; This is only for TARGET_MISALIGN target
+;; However, due to a bug in the current special_memory_constraint handling
+;; in lra-constraints.c, we have to define this special_memory_constraint
+;; as a regular constraint as a workaround.
+(define_constraint "B"
+ "Memory reference whose address is misaligned"
+ (and (match_code "mem")
+      (match_test "TARGET_MISALIGN")
+      (match_test "memory_is_misaligned (op, mode)")))
+
 (define_constraint "Y"
  "The vector zero constant"
  (and (match_code "const_vector")
diff --git a/gcc/config/sparc/m8.md b/gcc/config/sparc/m8.md
index f0fe1b2..9542f01 100644
--- a/gcc/config/sparc/m8.md
+++ b/gcc/config/sparc/m8.md
@@ -125,10 +125,10 @@ 
                  (eq_attr "subtype" "regular"))))
   "m8_slot0, nothing*2")
 
-;; (define_insn_reservation "m8_load_misalign" 11
-;;  (and (eq_attr "cpu" "m8")
-;;       (eq_attr "type" "load_mis,fpload_mis"))
-;;  "m8_slot0, nothing*10")
+(define_insn_reservation "m8_load_misalign" 11
+ (and (eq_attr "cpu" "m8")
+      (eq_attr "type" "load_mis,fpload_mis"))
+ "m8_slot0, nothing*10")
 
 (define_insn_reservation "m8_prefetch" 1
   (and (eq_attr "cpu" "m8")
@@ -147,10 +147,10 @@ 
        (eq_attr "type" "store,fpstore"))
   "m8_slot1")
 
-;; (define_insn_reservation "m8_store_misalign" 3
-;;   (and (eq_attr "cpu" "m8")
-;;        (eq_attr "type" "store_mis,fpstore_mis"))
-;;   "m8_slot1, nothing*2")
+(define_insn_reservation "m8_store_misalign" 3
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "store_mis,fpstore_mis"))
+  "m8_slot1, nothing*2")
 
 ;; Control-transfer instructions execute in the Branch Unit in the
 ;; slot1.
diff --git a/gcc/config/sparc/sparc-protos.h b/gcc/config/sparc/sparc-protos.h
index d453c1a..9003586 100644
--- a/gcc/config/sparc/sparc-protos.h
+++ b/gcc/config/sparc/sparc-protos.h
@@ -100,6 +100,7 @@  extern int mem_min_alignment (rtx, int);
 extern int pic_address_needs_scratch (rtx);
 extern int register_ok_for_ldd (rtx);
 extern int memory_ok_for_ldd (rtx);
+extern int memory_is_misaligned (rtx, machine_mode);
 extern int v9_regcmp_p (enum rtx_code);
 /* Function used for V8+ code generation.  Returns 1 if the high
    32 bits of REG are 0 before INSN.  */   
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index 59761aa..455ebf5 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -1342,6 +1342,8 @@  dump_target_flag_bits (const int flags)
     fprintf (stderr, "V8 ");
   if (flags & MASK_V9)
     fprintf (stderr, "V9 ");
+  if (flags & MASK_MISALIGN)
+    fprintf (stderr, "MISALIGN ");
 }
 
 static void
@@ -1449,10 +1451,14 @@  sparc_option_override (void)
       MASK_V9|MASK_POPC|MASK_VIS4|MASK_FMAF|MASK_CBCOND|MASK_SUBXC },
     /* UltraSPARC M8 */
     { "m8",		MASK_ISA,
-      MASK_V9|MASK_POPC|MASK_VIS4|MASK_FMAF|MASK_CBCOND|MASK_SUBXC|MASK_VIS4B }
+      MASK_V9|MASK_POPC|MASK_VIS4|MASK_FMAF|MASK_CBCOND|MASK_SUBXC
+      |MASK_VIS4B|MASK_MISALIGN }
   };
   const struct cpu_table *cpu;
   unsigned int i;
+  unsigned int target_flags_explicit_init;
+  unsigned int target_flags_explicit_enable;
+  unsigned int target_flags_explicit_disable;
 
   if (sparc_debug_string != NULL)
     {
@@ -1493,10 +1499,16 @@  sparc_option_override (void)
   if (TARGET_FPU && !(target_flags_explicit & MASK_FSMULD))
     target_flags |= MASK_FSMULD;
 
+  target_flags_explicit_init = target_flags & target_flags_explicit;
+  target_flags_explicit_enable = target_flags_explicit_init & target_flags_explicit;
+  target_flags_explicit_disable = ~target_flags_explicit_init & target_flags_explicit;
   if (TARGET_DEBUG_OPTIONS)
     {
       dump_target_flags("Initial target_flags", target_flags);
       dump_target_flags("target_flags_explicit", target_flags_explicit);
+      dump_target_flags("target_flags_explicit_init", target_flags_explicit_init);
+      dump_target_flags("target_flags_explicit_enable", target_flags_explicit_enable);
+      dump_target_flags("target_flags_explicit_disable", target_flags_explicit_disable);
     }
 
 #ifdef SUBTARGET_OVERRIDE_OPTIONS
@@ -1572,8 +1584,8 @@  sparc_option_override (void)
       dump_target_flags ("cpu->enable", cpu->enable);
     }
 
-  target_flags &= ~cpu->disable;
-  target_flags |= (cpu->enable
+  target_flags &= ~(cpu->disable & ~target_flags_explicit_enable);
+  target_flags |= (cpu->enable & ~target_flags_explicit_disable 
 #ifndef HAVE_AS_FMAF_HPC_VIS3
 		   & ~(MASK_FMAF | MASK_VIS3)
 #endif
@@ -1584,7 +1596,7 @@  sparc_option_override (void)
 		   & ~(MASK_VIS4 | MASK_SUBXC)
 #endif
 #ifndef HAVE_AS_SPARC6
-		   & ~(MASK_VIS4B)
+		   & ~(MASK_VIS4B | MASK_MISALIGN)
 #endif
 #ifndef HAVE_AS_LEON
 		   & ~(MASK_LEON | MASK_LEON3)
@@ -4099,10 +4111,30 @@  legitimate_pic_operand_p (rtx x)
   return true;
 }
 
-#define RTX_OK_FOR_OFFSET_P(X, MODE)			\
-  (CONST_INT_P (X)					\
-   && INTVAL (X) >= -0x1000				\
-   && INTVAL (X) <= (0x1000 - GET_MODE_SIZE (MODE)))
+/* For the misaligned ld/st insns provided by M8, the IMM field is 10 bits
+   wide rather than the 13 bits for regular ld/st.
+   The best solution for this problem is to distinguish for each ld/st
+   whether it is aligned or misaligned. However, due to the current
+   design of the common routine TARGET_LEGITIMATE_ADDRESS_P, only
+   the ADDR of a ld/st is passed to the routine; the alignment info
+   carried by the corresponding MEM is NOT passed in. Without changing
+   the prototype of TARGET_LEGITIMATE_ADDRESS_P, we cannot use this
+   best solution.
+   As a workaround, we have to conservatively treat the IMM field of
+   ALL ld/st insns on a MISALIGNED target as 10 bits wide.
+   The side effect of this workaround is that there will be an additional
+   REG<-IMM insn generated for regular ld/st when -mmisalign is ON.
+   However, such additional reload insns should be easily removed by
+   optimizations whenever -O is specified.
+*/
+#define RTX_OK_FOR_OFFSET_P(X, MODE)                     \
+  (CONST_INT_P (X)                                       \
+   && ((!TARGET_MISALIGN                                 \
+        && INTVAL (X) >= -0x1000                         \
+        && INTVAL (X) <= (0x1000 - GET_MODE_SIZE (MODE)))\
+    || (TARGET_MISALIGN                                  \
+        && INTVAL (X) >= -0x0400                         \
+        && INTVAL (X) <= (0x0400 - GET_MODE_SIZE (MODE)))))
 
 #define RTX_OK_FOR_OLO10_P(X, MODE)			\
   (CONST_INT_P (X)					\
@@ -4179,10 +4211,12 @@  sparc_legitimate_address_p (machine_mode mode, rtx addr, bool strict)
 	      && (mode == DFmode || mode == DImode))
 	    return 0;
 	}
+      /* We prohibit LO_SUM + IMM on TARGET_MISALIGN since it is not supported */
       else if (USE_AS_OFFSETABLE_LO10
 	       && GET_CODE (rs1) == LO_SUM
 	       && TARGET_ARCH64
 	       && ! TARGET_CM_MEDMID
+               && ! TARGET_MISALIGN
 	       && RTX_OK_FOR_OLO10_P (rs2, mode))
 	{
 	  rs2 = NULL;
@@ -8910,6 +8944,16 @@  memory_ok_for_ldd (rtx op)
 
   return 1;
 }
+
+/* Return 1 if OP, a MEM, has an address which is known to be
+   misaligned */
+
+int
+memory_is_misaligned (rtx op, machine_mode mode)
+{
+  return (MEM_ALIGN (op) < GET_MODE_BITSIZE (mode));
+}
+
 
 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */
 
diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h
index 15a6217..5425d01 100644
--- a/gcc/config/sparc/sparc.h
+++ b/gcc/config/sparc/sparc.h
@@ -595,8 +595,9 @@  extern enum cmodel sparc_cmodel;
 #define LOCAL_ALIGNMENT(TYPE, ALIGN) DATA_ALIGNMENT (TYPE, ALIGN)
 
 /* Set this nonzero if move instructions will actually fail to work
-   when given unaligned data.  */
-#define STRICT_ALIGNMENT 1
+   when given unaligned data.  
+   When TARGET_MISALIGN, this should be zero.  */
+#define STRICT_ALIGNMENT !(TARGET_MISALIGN)
 
 /* Things that must be doubleword aligned cannot go in the text section,
    because the linker fails to align the text section enough!
diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md
index 925b49e..44cad42 100644
--- a/gcc/config/sparc/sparc.md
+++ b/gcc/config/sparc/sparc.md
@@ -258,7 +258,7 @@ 
 	 (symbol_ref "TARGET_SPARCLET") (const_string "sparclet")]
 	(const_string "v7"))))
 
-(define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4,vis4b"
+(define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4,vis4b,misalign"
   (const_string "none"))
 
 (define_attr "lra" "disabled,enabled"
@@ -273,7 +273,8 @@ 
          (eq_attr "cpu_feature" "vis") (symbol_ref "TARGET_VIS")
          (eq_attr "cpu_feature" "vis3") (symbol_ref "TARGET_VIS3")
          (eq_attr "cpu_feature" "vis4") (symbol_ref "TARGET_VIS4")
-         (eq_attr "cpu_feature" "vis4b") (symbol_ref "TARGET_VIS4B")]
+         (eq_attr "cpu_feature" "vis4b") (symbol_ref "TARGET_VIS4B")
+         (eq_attr "cpu_feature" "misalign") (symbol_ref "TARGET_MISALIGN")]
         (const_int 0)))
 
 ;; The SPARC instructions used by the backend are organized into a
@@ -302,8 +303,12 @@ 
 ;; load/prefetch: PREFETCH
 ;; fpload: LDF LDDF LDQF
 ;; sload: LD{SB,SH,SW}
+;; load_mis: LDM{SH,UH,SW,UW,X}[A]
+;; fpload_mis: LDMF{S,D}[A]
 ;; store: ST{B,H,W,X} STFSR
 ;; fpstore: STF STDF STQF
+;; store_mis: STM{H,W,X}[A]
+;; fpstore_mis: STMF{S,D}[A]
 ;; cbcond: CWB{NE,E,G,LE,GE,L,GU,LEU,CC,CS,POS,NEG,VC,VS}
 ;;         CXB{NE,E,G,LE,GE,L,GU,LEU,CC,CS,POS,NEG,VC,VS}
 ;; uncond_branch: BA BPA JMPL
@@ -361,10 +366,12 @@ 
 (define_attr "type"
   "ialu,compare,shift,
    load,sload,store,
+   load_mis,store_mis,
    uncond_branch,branch,call,sibcall,call_no_delay_slot,return,
    cbcond,uncond_cbcond,
    imul,idiv,
    fpload,fpstore,
+   fpload_mis,fpstore_mis,
    fp,fpmove,
    fpcmove,fpcrmove,
    fpcmp,
@@ -1621,18 +1628,21 @@ 
 })
 
 (define_insn "*movhi_insn"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
-	(match_operand:HI 1 "input_operand"   "rI,K,m,rJ"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,r, B,m")
+	(match_operand:HI 1 "input_operand"   "rI,K,B,m,rJ,rJ"))]
   "(register_operand (operands[0], HImode)
     || register_or_zero_operand (operands[1], HImode))"
   "@
    mov\t%1, %0
    sethi\t%%hi(%a1), %0
+   ldmuh\t%1, %0
    lduh\t%1, %0
+   stmh\t%r1, %0
    sth\t%r1, %0"
-  [(set_attr "type" "*,*,load,store")
-   (set_attr "subtype" "*,*,regular,*")
-   (set_attr "us3load_type" "*,*,3cycle,*")])
+  [(set_attr "type" "*,*,load_mis,load,store_mis,store")
+   (set_attr "subtype" "*,*,*,regular,*,*")
+   (set_attr "cpu_feature" "*,*,misalign,*,misalign,*")
+   (set_attr "us3load_type" "*,*,*,3cycle,*,*")])
 
 ;; We always work with constants here.
 (define_insn "*movhi_lo_sum"
@@ -1652,25 +1662,31 @@ 
 })
 
 (define_insn "*movsi_insn"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r, m, r,*f,*f,*f, m,d,d")
-	(match_operand:SI 1 "input_operand"        "rI,K,m,rJ,*f, r, f, m,*f,J,P"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,r, B, m, r,*f,*f,*f,*f, B, m,d,d")
+        (match_operand:SI 1 "input_operand"        "rI,K,B,m,rJ,rJ,*f, r, f, B, m,*f,*f,J,P"))]
+
   "register_operand (operands[0], SImode)
    || register_or_zero_or_all_ones_operand (operands[1], SImode)"
   "@
    mov\t%1, %0
    sethi\t%%hi(%a1), %0
+   ldmsw\t%1, %0
    ld\t%1, %0
+   stmw\t%r1, %0
    st\t%r1, %0
    movstouw\t%1, %0
    movwtos\t%1, %0
    fmovs\t%1, %0
+   ldmfs\t%1, %0
    ld\t%1, %0
+   stmfs\t%1, %0
    st\t%1, %0
    fzeros\t%0
    fones\t%0"
-  [(set_attr "type" "*,*,load,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl")
-   (set_attr "subtype" "*,*,regular,*,movstouw,single,*,*,*,single,single")
-   (set_attr "cpu_feature" "*,*,*,*,vis3,vis3,*,*,*,vis,vis")])
+  [(set_attr "type" "*,*,load_mis,load,store_mis,store,vismv,vismv,fpmove,fpload_mis,fpload,fpstore_mis,fpstore,visl,visl")
+   (set_attr "subtype" "*,*,*,regular,*,*,*,*,*,*,*,*,*,*,*")
+   (set_attr "cpu_feature" "*,*,misalign,*,misalign,*,vis3,vis3,*,misalign,*,misalign,*,vis,vis")])
+
 
 (define_insn "*movsi_lo_sum"
   [(set (match_operand:SI 0 "register_operand" "=r")
@@ -1810,13 +1826,15 @@ 
 
 (define_insn "*movdi_insn_sp32"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-			    "=T,o,U,T,r,o,r,r,?*f,?T,?*f,?o,?*e,?*e,  r,?*f,?*e,?T,*b,*b")
+                            "=B,T,o,U,T,r,o,r,r,?*f,?T,?*f,?o,?*e,?*e,  r,?*f,?*e,?T,*b,*b")
         (match_operand:DI 1 "input_operand"
-			    " J,J,T,U,o,r,i,r,  T,*f,  o,*f, *e, *e,?*f,  r,  T,*e, J, P"))]
+                            " J,J,J,T,U,o,r,i,r,  T,*f,  o,*f, *e, *e,?*f,  r,  T,*e, J, P"))]
+
   "TARGET_ARCH32
    && (register_operand (operands[0], DImode)
        || register_or_zero_operand (operands[1], DImode))"
   "@
+   stmx\t%r1, %0
    stx\t%r1, %0
    #
    ldd\t%1, %0
@@ -1837,24 +1855,26 @@ 
    std\t%1, %0
    fzero\t%0
    fone\t%0"
-  [(set_attr "type" "store,*,load,store,load,store,*,*,fpload,fpstore,*,*,fpmove,*,*,*,fpload,fpstore,visl,
-visl")
-   (set_attr "subtype" "*,*,regular,*,regular,*,*,*,*,*,*,*,*,*,*,*,*,*,double,double")
-   (set_attr "length" "*,2,*,*,*,*,2,2,*,*,2,2,*,2,2,2,*,*,*,*")
-   (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double")
-   (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis")
-   (set_attr "lra" "*,*,disabled,disabled,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
+  [(set_attr "type" "store_mis,store,*,load,store,load,store,*,*,fpload,fpstore,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl")
+   (set_attr "subtype" "*,*,*,*,regular,*,*,*,regular,*,*,*,*,*,*,*,*,*,*,*,*")
+   (set_attr "length" "*,*,2,*,*,*,*,2,2,*,*,2,2,*,2,2,2,*,*,*,*")
+   (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double")
+   (set_attr "cpu_feature" "misalign,*,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis")
+   (set_attr "lra" "*,*,*,disabled,disabled,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
+
 
 (define_insn "*movdi_insn_sp64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r, m, r,*e,?*e,?*e,?W,b,b")
-        (match_operand:DI 1 "input_operand"        "rI,N,m,rJ,*e, r, *e,  W,*e,J,P"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r,r, B, m, r,*e,?*e,?*e,?W,b,b")
+        (match_operand:DI 1 "input_operand"        "rI,N,B,m,rJ,rJ,*e, r, *e,  W,*e,J,P"))]
   "TARGET_ARCH64
    && (register_operand (operands[0], DImode)
        || register_or_zero_or_all_ones_operand (operands[1], DImode))"
   "@
    mov\t%1, %0
    sethi\t%%hi(%a1), %0
+   ldmx\t%1, %0
    ldx\t%1, %0
+   stmx\t%r1, %0
    stx\t%r1, %0
    movdtox\t%1, %0
    movxtod\t%1, %0
@@ -1863,10 +1883,11 @@  visl")
    std\t%1, %0
    fzero\t%0
    fone\t%0"
-  [(set_attr "type" "*,*,load,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl")
-   (set_attr "subtype" "*,*,regular,*,movdtox,movxtod,*,*,*,double,double")
-   (set_attr "fptype" "*,*,*,*,*,*,double,*,*,double,double")
-   (set_attr "cpu_feature" "*,*,*,*,vis3,vis3,*,*,*,vis,vis")])
+  [(set_attr "type" "*,*,load_mis,load,store_mis,store,vismv,vismv,fpmove,fpload,fpstore,visl,visl")
+   (set_attr "subtype" "*,*,*,regular,*,*,movdtox,movxtod,*,*,*,*,*")
+   (set_attr "fptype" "*,*,*,*,*,*,*,*,double,*,*,double,double")
+   (set_attr "cpu_feature" "*,*,misalign,*,misalign,*,vis3,vis3,*,*,*,vis,vis")])
+
 
 (define_expand "movdi_pic_label_ref"
   [(set (match_dup 3) (high:DI
@@ -2170,6 +2191,7 @@  visl")
   "reload_completed
    && (!TARGET_V9
        || (TARGET_ARCH32
+           && !TARGET_MISALIGN
 	   && !mem_min_alignment (operands[0], 8)))
    && offsettable_memref_p (operands[0])"
   [(clobber (const_int 0))]
@@ -2350,8 +2372,9 @@  visl")
 })
 
 (define_insn "*movsf_insn"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=d,d,f, *r,*r,*r,*r, f,f,*r,m,  m")
-	(match_operand:SF 1 "input_operand"         "G,C,f,*rR, Q, S, f,*r,m, m,f,*rG"))]
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=d,d,f, *r,*r,*r,*r, f,f,f,*r,*r,B,m,  B,  m")
+       (match_operand:SF 1 "input_operand"         "G,C,f,*rR, Q, S, f,*r,B,m, B, m,f,f,*rG,*rG"))]
+
   "(register_operand (operands[0], SFmode)
     || register_or_zero_or_all_ones_operand (operands[1], SFmode))"
 {
@@ -2385,18 +2408,29 @@  visl")
     case 7:
       return "movwtos\t%1, %0";
     case 8:
+      return "ldmfs\t%1, %0";
     case 9:
       return "ld\t%1, %0";
     case 10:
+      return "ldmsw\t%1, %0";
     case 11:
+      return "ld\t%1, %0";
+    case 12:
+      return "stmfs\t%r1, %0";
+    case 13:
       return "st\t%r1, %0";
+    case 14:
+      return "stm\t%r1, %0";
+    case 15:
+       return "st\t%r1, %0";
     default:
       gcc_unreachable ();
     }
 }
-  [(set_attr "type" "visl,visl,fpmove,*,*,*,vismv,vismv,fpload,load,fpstore,store")
-   (set_attr "subtype" "single,single,*,*,*,*,movstouw,single,*,regular,*,*")
-   (set_attr "cpu_feature" "vis,vis,fpu,*,*,*,vis3,vis3,fpu,*,fpu,*")])
+  [(set_attr "type" "visl,visl,fpmove,*,*,*,vismv,vismv,fpload_mis,fpload,load_mis,load,fpstore_mis,fpstore,store_mis,store")
+   (set_attr "subtype" "single,single,*,*,*,*,movstouw,single,*,*,*,regular,*,*,*,*")
+   (set_attr "cpu_feature" "vis,vis,fpu,*,*,*,vis3,vis3,misalign,fpu,misalign,*,misalign,fpu,misalign,*")])
+
 
 ;; The following 3 patterns build SFmode constants in integer registers.
 
@@ -2443,13 +2477,14 @@  visl")
 
 (define_insn "*movdf_insn_sp32"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-			    "=T,o,b,b,e,e,*r, f,  e,T,U,T,  f,o, *r,*r, o")
+                            "=B,T,o,b,b,e,e,*r, f,  e,T,U,T,  f,o, *r,*r, o")
 	(match_operand:DF 1 "input_operand"
-			    " G,G,G,C,e,e, f,*r,T#F,e,T,U,o#F,f,*rF, o,*r"))]
+                            " G,G,G,G,C,e,e, f,*r,T#F,e,T,U,o#F,f,*rF, o,*r"))]
   "TARGET_ARCH32
    && (register_operand (operands[0], DFmode)
        || register_or_zero_or_all_ones_operand (operands[1], DFmode))"
   "@
+  stmx\t%r1, %0
   stx\t%r1, %0
   #
   fzero\t%0
@@ -2467,16 +2502,17 @@  visl")
   #
   ldd\t%1, %0
   std\t%1, %0"
-  [(set_attr "type" "store,*,visl,visl,fpmove,*,*,*,fpload,fpstore,load,store,*,*,*,load,store")
-   (set_attr "subtype" "*,*,double,double,*,*,*,*,*,*,regular,*,*,*,*,regular,*")
-   (set_attr "length" "*,2,*,*,*,2,2,2,*,*,*,*,2,2,2,*,*")
-   (set_attr "fptype" "*,*,double,double,double,*,*,*,*,*,*,*,*,*,*,*,*")
-   (set_attr "cpu_feature" "v9,*,vis,vis,v9,fpunotv9,vis3,vis3,fpu,fpu,*,*,fpu,fpu,*,*,*")
-   (set_attr "lra" "*,*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")])
+  [(set_attr "type" "store_mis,store,*,visl,visl,fpmove,*,*,*,fpload,fpstore,load,store,*,*,*,load,store")
+   (set_attr "subtype" "*,*,*,double,double,*,*,*,*,*,*,regular,*,*,*,*,regular,*")
+   (set_attr "length" "*,*,2,*,*,*,2,2,2,*,*,*,*,2,2,2,*,*")
+   (set_attr "fptype" "*,*,*,double,double,double,*,*,*,*,*,*,*,*,*,*,*,*")
+   (set_attr "cpu_feature" "misalign,v9,*,vis,vis,v9,fpunotv9,vis3,vis3,fpu,fpu,*,*,fpu,fpu,*,*,*")
+   (set_attr "lra" "*,*,*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")])
+
 
 (define_insn "*movdf_insn_sp64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=b,b,e,*r, e,  e,W, *r,*r,  m,*r")
-	(match_operand:DF 1 "input_operand"         "G,C,e, e,*r,W#F,e,*rG, m,*rG, F"))]
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=b,b,e,*r, e,e,  e,B,W, *r,*r,  m,*r")
+        (match_operand:DF 1 "input_operand"        " G,C,e, e,*r,B,W#F,e,e,*rG, m,*rG, F"))]
   "TARGET_ARCH64
    && (register_operand (operands[0], DFmode)
        || register_or_zero_or_all_ones_operand (operands[1], DFmode))"
@@ -2486,17 +2522,19 @@  visl")
   fmovd\t%1, %0
   movdtox\t%1, %0
   movxtod\t%1, %0
+  ldmfd\t%1, %0
   ldd\t%1, %0
+  stmfd\t%1, %0
   std\t%1, %0
   mov\t%r1, %0
   ldx\t%1, %0
   stx\t%r1, %0
   #"
-  [(set_attr "type" "visl,visl,fpmove,vismv,vismv,load,store,*,load,store,*")
-   (set_attr "subtype" "double,double,*,movdtox,movxtod,regular,*,*,regular,*,*")
-   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,2")
-   (set_attr "fptype" "double,double,double,double,double,*,*,*,*,*,*")
-   (set_attr "cpu_feature" "vis,vis,fpu,vis3,vis3,fpu,fpu,*,*,*,*")])
+  [(set_attr "type" "visl,visl,fpmove,vismv,vismv,fpload_mis,fpload,fpstore_mis,fpstore,*,load,store,*")
+   (set_attr "subtype" "double,double,*,movdtox,movxtod,*,regular,*,*,*,regular,*,*")
+   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,2")
+   (set_attr "fptype" "double,double,double,double,double,*,*,*,*,*,*,*,*")
+   (set_attr "cpu_feature" "vis,vis,fpu,vis3,vis3,misalign,fpu,misalign,fpu,*,*,*,*")])
 
 ;; This pattern builds DFmode constants in integer registers.
 (define_split
@@ -2603,6 +2641,7 @@  visl")
   "reload_completed
    && (!TARGET_V9
        || (TARGET_ARCH32
+           && !TARGET_MISALIGN
 	   && !mem_min_alignment (operands[0], 8)))
    && offsettable_memref_p (operands[0])"
   [(clobber (const_int 0))]
@@ -8634,8 +8673,8 @@  visl")
 })
 
 (define_insn "*mov<VM32:mode>_insn"
-  [(set (match_operand:VM32 0 "nonimmediate_operand" "=f,f,f,f,m,m,*r, m,*r,*r, f")
-	(match_operand:VM32 1 "input_operand"         "Y,Z,f,m,f,Y, m,*r,*r, f,*r"))]
+  [(set (match_operand:VM32 0 "nonimmediate_operand" "=f,f,f,f,f,B,m,B,m,*r,*r, B, m,*r,*r, f")
+        (match_operand:VM32 1 "input_operand"         "Y,Z,f,B,m,f,f,Y,Y, B, m,*r,*r,*r, f,*r"))]
   "TARGET_VIS
    && (register_operand (operands[0], <VM32:MODE>mode)
        || register_or_zero_or_all_ones_operand (operands[1], <VM32:MODE>mode))"
@@ -8643,21 +8682,27 @@  visl")
   fzeros\t%0
   fones\t%0
   fsrc2s\t%1, %0
+  ldmfs\t%1, %0
   ld\t%1, %0
+  stmfs\t%1, %0
   st\t%1, %0
+  stmw\t%r1, %0
   st\t%r1, %0
+  ldmsw\t%1, %0
   ld\t%1, %0
+  stmw\t%1, %0
   st\t%1, %0
   mov\t%1, %0
   movstouw\t%1, %0
   movwtos\t%1, %0"
-  [(set_attr "type" "visl,visl,vismv,fpload,fpstore,store,load,store,*,vismv,vismv")
-   (set_attr "subtype" "single,single,single,*,*,*,regular,*,*,movstouw,single")
-   (set_attr "cpu_feature" "vis,vis,vis,*,*,*,*,*,*,vis3,vis3")])
+  [(set_attr "type" "visl,visl,vismv,fpload_mis,fpload,fpstore_mis,fpstore,store_mis,store,load_mis,load,store_mis,store,*,vismv,vismv")
+   (set_attr "subtype" "single,single,single,*,*,*,*,*,*,*,regular,*,*,*,movstouw,single")
+   (set_attr "cpu_feature" "vis,vis,vis,misalign,*,misalign,*,misalign,*,misalign,*,misalign,*,*,vis3,vis3")])
+
 
 (define_insn "*mov<VM64:mode>_insn_sp64"
-  [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,e,W,m,*r, m,*r, e,*r")
-	(match_operand:VM64 1 "input_operand"         "Y,Z,e,W,e,Y, m,*r, e,*r,*r"))]
+  [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,e,e,B,W,B,m,*r,*r, B, m,*r, e,*r")
+        (match_operand:VM64 1 "input_operand"         "Y,Z,e,B,W,e,e,Y,Y, B, m,*r,*r, e,*r,*r"))]
   "TARGET_VIS
    && TARGET_ARCH64
    && (register_operand (operands[0], <VM64:MODE>mode)
@@ -8666,28 +8711,36 @@  visl")
   fzero\t%0
   fone\t%0
   fsrc2\t%1, %0
+  ldmfd\t%1, %0
   ldd\t%1, %0
+  stmfd\t%1, %0
   std\t%1, %0
+  stmx\t%r1, %0
   stx\t%r1, %0
+  ldmx\t%1, %0
   ldx\t%1, %0
+  stmx\t%1, %0
   stx\t%1, %0
   movdtox\t%1, %0
   movxtod\t%1, %0
   mov\t%1, %0"
-  [(set_attr "type" "visl,visl,vismv,fpload,fpstore,store,load,store,vismv,vismv,*")
-   (set_attr "subtype" "double,double,double,*,*,*,regular,*,movdtox,movxtod,*")
-   (set_attr "cpu_feature" "vis,vis,vis,*,*,*,*,*,vis3,vis3,*")])
+  [(set_attr "type" "visl,visl,vismv,fpload_mis,fpload,fpstore_mis,fpstore,store_mis,store,load_mis,load,store_mis,store,vismv,vismv,*")
+   (set_attr "subtype" "double,double,double,*,*,*,*,*,*,*,regular,*,*,movdtox,movxtod,*")
+   (set_attr "cpu_feature" "vis,vis,vis,misalign,*,misalign,*,misalign,*,misalign,*,misalign,*,vis3,vis3,*")])
+
+
 
 (define_insn "*mov<VM64:mode>_insn_sp32"
   [(set (match_operand:VM64 0 "nonimmediate_operand"
-			      "=T,o,e,e,e,*r, f,e,T,U,T,f,o,*r,*r, o")
+                              "=B,T,o,e,e,e,*r, f,e,T,U,T,f,o,*r,*r, o")
 	(match_operand:VM64 1 "input_operand"
-			      " Y,Y,Y,Z,e, f,*r,T,e,T,U,o,f,*r, o,*r"))]
+                              " Y,Y,Y,Y,Z,e, f,*r,T,e,T,U,o,f,*r, o,*r"))]
   "TARGET_VIS
    && TARGET_ARCH32
    && (register_operand (operands[0], <VM64:MODE>mode)
        || register_or_zero_or_all_ones_operand (operands[1], <VM64:MODE>mode))"
   "@
+  stmx\t%r1, %0
   stx\t%r1, %0
   #
   fzero\t%0
@@ -8704,11 +8757,12 @@  visl")
   #
   ldd\t%1, %0
   std\t%1, %0"
-  [(set_attr "type" "store,*,visl,visl,vismv,*,*,fpload,fpstore,load,store,*,*,*,load,store")
-   (set_attr "subtype" "*,*,double,double,double,*,*,*,*,regular,*,*,*,*,regular,*")
-   (set_attr "length" "*,2,*,*,*,2,2,*,*,*,*,2,2,2,*,*")
-   (set_attr "cpu_feature" "*,*,vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*,*,*")
-   (set_attr "lra" "*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")])
+  [(set_attr "type" "store_mis,store,*,visl,visl,vismv,*,*,fpload,fpstore,load,store,*,*,*,load,store")
+   (set_attr "subtype" "*,*,*,double,double,double,*,*,*,*,regular,*,*,*,*,regular,*")
+   (set_attr "length" "*,*,2,*,*,*,2,2,*,*,*,*,2,2,2,*,*")
+   (set_attr "cpu_feature" "misalign,*,*,vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*,*,*")
+   (set_attr "lra" "*,*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")])
+
 
 (define_split
   [(set (match_operand:VM64 0 "register_operand" "")
@@ -8755,6 +8809,7 @@  visl")
   "reload_completed
    && TARGET_VIS
    && TARGET_ARCH32
+   && !TARGET_MISALIGN
    && !mem_min_alignment (operands[0], 8)
    && offsettable_memref_p (operands[0])"
   [(clobber (const_int 0))]
diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt
index 22267f5..3b86576 100644
--- a/gcc/config/sparc/sparc.opt
+++ b/gcc/config/sparc/sparc.opt
@@ -85,6 +85,10 @@  mvis4b
 Target Report Mask(VIS4B)
 Use additional VIS instructions introduced in OSA2017.
 
+mmisalign
+Target Report Mask(MISALIGN)
+Use OSA2017 misaligned load and store instructions.
+
 mcbcond
 Target Report Mask(CBCOND)
 Use UltraSPARC Compare-and-Branch extensions.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5ae9dc4..525218c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1123,6 +1123,7 @@  See RS/6000 and PowerPC Options.
 -mv8plus  -mno-v8plus  -mvis  -mno-vis @gol
 -mvis2  -mno-vis2  -mvis3  -mno-vis3 @gol
 -mvis4 -mno-vis4 -mvis4b -mno-vis4b @gol
+-mmisalign -mno-misalign @gol
 -mcbcond  -mno-cbcond  -mfmaf  -mno-fmaf  -mfsmuld  -mno-fsmuld  @gol
 -mpopc  -mno-popc  -msubxc  -mno-subxc @gol
 -mfix-at697f  -mfix-ut699  -mfix-ut700  -mfix-gr712rc @gol
@@ -24045,6 +24046,15 @@  cpu that supports such instructions, such as m8 and later.  Setting
 @option{-mvis4b} also sets @option{-mvis4}, @option{-mvis3},
 @option{-mvis2} and @option{-mvis}.
 
+@item -mmisalign
+@itemx -mno-misalign
+@opindex mmisalign
+@opindex mno-misalign
+With @option{-mmisalign}, GCC generates code that takes advantage of
+the misaligned load and store instructions introduced in the Oracle
+SPARC Architecture 2017.  The default is @option{-mmisalign} when
+targeting a cpu that supports such instructions, such as m8 and later.
+
 @item -mcbcond
 @itemx -mno-cbcond
 @opindex mcbcond
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-1.c b/gcc/testsuite/gcc.target/sparc/misalign-1.c
new file mode 100644
index 0000000..764905f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-1.c
@@ -0,0 +1,45 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mmisalign" } */
+
+typedef struct __attribute__((packed)) {
+  char old;
+  short ip;
+  int new;
+  long long ll;
+  float fp;
+  double dp;
+} NCO;
+
+NCO a;
+short eip;
+int enew;
+long long ell;
+float efp;
+double edp;
+
+int main(void) {
+
+  a.old = 'c';
+  a.ip = 10;
+  a.new = 20;
+  a.ll = 50;
+  a.fp = 30.0;
+  a.dp = 40.0;
+  eip = a.ip;
+  enew = a.new;
+  ell = a.ll;
+  efp = a.fp;
+  edp = a.dp;
+  return 0;
+}
+
+/* { dg-final { scan-assembler "stmh" } } */
+/* { dg-final { scan-assembler "stmw" } } */
+/* { dg-final { scan-assembler "stmx" } } */
+/* { dg-final { scan-assembler "stmfs" } } */
+/* { dg-final { scan-assembler "stmfd" } } */
+/* { dg-final { scan-assembler "ldmuh" } } */
+/* { dg-final { scan-assembler "ldmsw" } } */
+/* { dg-final { scan-assembler "ldmx" } } */
+/* { dg-final { scan-assembler "ldmfs" } } */
+/* { dg-final { scan-assembler "ldmfd" } } */
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-2.c b/gcc/testsuite/gcc.target/sparc/misalign-2.c
new file mode 100644
index 0000000..d94959d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-2.c
@@ -0,0 +1,23 @@ 
+/* this is to verify that a store of constant zero to a misaligned memory address of type
+ * long long or double uses stmx insns on m8 with -m32 */
+
+/* { dg-do compile } */
+/* { dg-options "-mcpu=m8 -m32" } */
+
+typedef struct __attribute__((packed)) {
+  char old;
+  long long ll;
+  double dp;
+} NCO;
+
+NCO a;
+
+int main(void) {
+
+  a.old = 'c';
+  a.ll = 0;
+  a.dp = 0.0;
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "stmx" 2 } } */
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-3.c b/gcc/testsuite/gcc.target/sparc/misalign-3.c
new file mode 100644
index 0000000..eac2adb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-3.c
@@ -0,0 +1,42 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mmisalign" } */
+
+typedef int   v1si __attribute__ ((vector_size (4),aligned(2)));
+typedef short v2hi __attribute__ ((vector_size (4),aligned(1)));
+
+typedef long long v1di __attribute__ ((vector_size (8),aligned(4)));
+typedef int       v2si __attribute__ ((vector_size (8),aligned(2)));
+typedef short     v4hi __attribute__ ((vector_size (8),aligned(1)));
+
+v1si a1 = {1};
+v1si b1 = {2};
+v1si c1;
+v2hi a2 = {1,2};
+v2hi b2 = {2,1};
+v2hi c2;
+
+v1di a3 = {1};
+v1di b3 = {2};
+v1di c3;
+v2si a4 = {1,2};
+v2si b4 = {2,1};
+v2si c4;
+v4hi a5 = {1,2,3,4};
+v4hi b5 = {4,3,2,1};
+v4hi c5;
+
+int main()
+{
+  c1 = a1 + b1;
+  c2 = a2 - b2;
+
+  c3 = a3 * b3;
+  c4 = a4 / b4;
+  c5 = a5 == b5;
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "ldm" 10} } */
+/* { dg-final { scan-assembler-times "stm" 5} } */
+
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-4.c b/gcc/testsuite/gcc.target/sparc/misalign-4.c
new file mode 100644
index 0000000..23e9b3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-4.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mmisalign" } */
+
+/* this test case is added to verify that misaligned ld/st insns do NOT
+ * support the REG + IMM address mode when the IMM is too big to be represented by a
+ * 10-bit signed integer */
+typedef struct __attribute__((packed)) {
+  char old;
+  char pad[1024];
+  long ll;
+} NCO;
+
+NCO a;
+extern long el;
+
+int main(void) {
+
+  a.ll = 0;
+  el = a.ll;
+  return 0;
+}
+
+/* { dg-final { scan-assembler "stmx" } } */
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-5.c b/gcc/testsuite/gcc.target/sparc/misalign-5.c
new file mode 100644
index 0000000..2b9d069
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-5.c
@@ -0,0 +1,19 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mmisalign -O1" } */
+
+/* this test case is added to verify that misaligned ld/st insns do NOT
+ * support the LOSUM + IMM address mode */
+typedef struct __attribute__((packed)) {
+  char old;
+  long long ll;
+} NCO;
+
+NCO a;
+
+int main(void) {
+
+  a.ll = 0;
+  return 0;
+}
+
+/* { dg-final { scan-assembler "stmx" } } */
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-run-1.c b/gcc/testsuite/gcc.target/sparc/misalign-run-1.c
new file mode 100644
index 0000000..61f007e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-run-1.c
@@ -0,0 +1,34 @@ 
+/* { dg-do run} */
+/* { dg-require-effective-target misalign_hw } */
+/* { dg-options "-mcpu=m8" } */
+
+extern void abort (void);
+
+typedef struct __attribute__((packed)) {
+  char old;
+  short ip;
+  int new;
+  long long ll;
+  float fp;
+  double dp;
+} NCO;
+
+NCO a;
+
+int main(void) {
+
+  a.old = 'c';
+  a.ip = 10;
+  a.new = 20;
+  a.ll = 50;
+  a.fp = 30.0;
+  a.dp = 40.0;
+  if ((a.ip != 10)
+      || (a.new != 20)
+      || (a.ll != 50)
+      || (a.fp != 30.0)
+      || (a.dp != 40.0))
+    abort();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-run-2.c b/gcc/testsuite/gcc.target/sparc/misalign-run-2.c
new file mode 100644
index 0000000..1679860
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-run-2.c
@@ -0,0 +1,23 @@ 
+/* { dg-do run} */
+/* { dg-require-effective-target misalign_hw } */
+/* { dg-options "-mcpu=m8 -m32" } */
+
+extern void abort (void);
+
+typedef struct __attribute__((packed)) {
+  char old;
+  long long ll;
+  double dp;
+} NCO;
+
+NCO a;
+
+int main(void) {
+
+  a.old = 'c';
+  a.ll = 0;
+  a.dp = 0.0;
+  if ((a.ll != 0) || (a.dp != 0.0))
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/sparc/misalign-run-3.c b/gcc/testsuite/gcc.target/sparc/misalign-run-3.c
new file mode 100644
index 0000000..18ba8d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sparc/misalign-run-3.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run} */
+/* { dg-require-effective-target misalign_hw } */
+/* { dg-options "-mmisalign" } */
+
+typedef int   v1si __attribute__ ((vector_size (4),aligned(2)));
+typedef short v2hi __attribute__ ((vector_size (4),aligned(1)));
+
+typedef long long v1di __attribute__ ((vector_size (8),aligned(4)));
+typedef int       v2si __attribute__ ((vector_size (8),aligned(2)));
+typedef short     v4hi __attribute__ ((vector_size (8),aligned(1)));
+
+v1si a1 = {1};
+v1si b1 = {2};
+v1si c1;
+v2hi a2 = {1,2};
+v2hi b2 = {2,1};
+v2hi c2;
+
+v1di a3 = {1};
+v1di b3 = {2};
+v1di c3;
+v2si a4 = {1,2};
+v2si b4 = {2,1};
+v2si c4;
+v4hi a5 = {1,2,3,4};
+v4hi b5 = {4,3,2,1};
+v4hi c5;
+
+extern void abort (void);
+
+int main()
+{
+  c1 = a1 + b1;
+  c2 = a2 - b2;
+
+  c3 = a3 * b3;
+  c4 = a4 / b4;
+  c5 = a5 == b5;
+
+  if ((c1[0] != 3)
+      || (c2[0] != -1)
+      || (c2[1] != 1)
+      || (c3[0] != 2)
+      || (c4[0] != 0)
+      || (c4[1] != 2)
+      || (c5[0] != 0)
+      || (c5[1] != 0)
+      || (c5[2] != 0)
+      || (c5[3] != 0))
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5a65627..d7fe3a0 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -8292,6 +8292,21 @@  proc check_effective_target_offload_nvptx { } {
     } "-foffload=nvptx-none" ]
 }
 
+# Return 1 if the target supports the following misaligned load instruction:
+# ldmx o1, o2
+proc check_effective_target_misalign_hw { } {
+    return [check_runtime misalign_hw {
+       int main (void)
+       {
+           register void *p __asm__ ("o1") = &main;
+           register long res __asm__ ("o2");
+           asm volatile (".word 0xd58a5400");
+           return 0;
+       }
+    } "-mmisalign"]
+}
+
+
 # Return 1 if the compiler has been configured with hsa offloading.
 
 proc check_effective_target_offload_hsa { } {