
Detect a pack-unpack pattern in the GCC vectorizer and optimize it.

Message ID CAK=A3=2ZcCUrMi60=ViCsr8qe0M8eMXiAsTjog3q1HBDuiBPLQ@mail.gmail.com
State New

Commit Message

Cong Hou April 24, 2014, 5:24 p.m. UTC
Given the following loop:

int a[N];
short b[N*2];

for (int i = 0; i < N; ++i)
  a[i] = b[i*2];


After being vectorized, the access to b[i*2] will be compiled into
several packing statements, while the type promotion from short to int
will be compiled into several unpacking statements. With this patch,
each pair of pack/unpack statements will be replaced by less expensive
statements (with shift or bit-and operations).

On x86_64, the loop above will be compiled into the following assembly
(with -O2 -ftree-vectorize):

movdqu 0x10(%rcx),%xmm3
movdqu -0x20(%rcx),%xmm0
movdqa %xmm0,%xmm2
punpcklwd %xmm3,%xmm0
punpckhwd %xmm3,%xmm2
movdqa %xmm0,%xmm3
punpcklwd %xmm2,%xmm0
punpckhwd %xmm2,%xmm3
movdqa %xmm1,%xmm2
punpcklwd %xmm3,%xmm0
pcmpgtw %xmm0,%xmm2
movdqa %xmm0,%xmm3
punpckhwd %xmm2,%xmm0
punpcklwd %xmm2,%xmm3
movups %xmm0,-0x10(%rdx)
movups %xmm3,-0x20(%rdx)


With this patch, the generated assembly is shown below:

movdqu 0x10(%rcx),%xmm0
movdqu -0x20(%rcx),%xmm1
pslld  $0x10,%xmm0
psrad  $0x10,%xmm0
pslld  $0x10,%xmm1
movups %xmm0,-0x10(%rdx)
psrad  $0x10,%xmm1
movups %xmm1,-0x20(%rdx)
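
That is, once the two short vectors are reinterpreted as int vectors,
the even/odd extraction plus promotion collapses into shifts (or a
bit-and).  A minimal sketch of the signed even-element case using GCC
vector extensions (only an illustration of the idea; it assumes
128-bit vectors and little-endian lane order, where the even short of
each int lane is the low half):

typedef short v8hi __attribute__ ((vector_size (16)));
typedef int   v4si __attribute__ ((vector_size (16)));

/* Sign-extend the even-indexed shorts of X (b[0], b[2], ...) to ints:
   shift each low short into the high half, then shift back
   arithmetically -- the pslld/psrad pairs in the assembly above.  */
static inline v4si
even_shorts_to_ints (v8hi x)
{
  v4si t = (v4si) x;		/* reinterpret 8 shorts as 4 ints */
  return (t << 16) >> 16;
}

For unsigned even elements a single bit-and with 0xffff per lane is
enough, and for the odd elements a single right shift by 16 (logical
or arithmetic, depending on signedness) suffices.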


Bootstrapped and tested on x86-64. OK for trunk?


thanks,
Cong

Comments

Richard Biener April 28, 2014, 11:04 a.m. UTC | #1
On Thu, 24 Apr 2014, Cong Hou wrote:

> Given the following loop:
> 
> int a[N];
> short b[N*2];
> 
> for (int i = 0; i < N; ++i)
>   a[i] = b[i*2];
> 
> 
> After being vectorized, the access to b[i*2] will be compiled into
> several packing statements, while the type promotion from short to int
> will be compiled into several unpacking statements. With this patch,
> each pair of pack/unpack statements will be replaced by less expensive
> statements (with shift or bit-and operations).
> 
> On x86_64, the loop above will be compiled into the following assembly
> (with -O2 -ftree-vectorize):
> 
> movdqu 0x10(%rcx),%xmm3
> movdqu -0x20(%rcx),%xmm0
> movdqa %xmm0,%xmm2
> punpcklwd %xmm3,%xmm0
> punpckhwd %xmm3,%xmm2
> movdqa %xmm0,%xmm3
> punpcklwd %xmm2,%xmm0
> punpckhwd %xmm2,%xmm3
> movdqa %xmm1,%xmm2
> punpcklwd %xmm3,%xmm0
> pcmpgtw %xmm0,%xmm2
> movdqa %xmm0,%xmm3
> punpckhwd %xmm2,%xmm0
> punpcklwd %xmm2,%xmm3
> movups %xmm0,-0x10(%rdx)
> movups %xmm3,-0x20(%rdx)
> 
> 
> With this patch, the generated assembly is shown below:
> 
> movdqu 0x10(%rcx),%xmm0
> movdqu -0x20(%rcx),%xmm1
> pslld  $0x10,%xmm0
> psrad  $0x10,%xmm0
> pslld  $0x10,%xmm1
> movups %xmm0,-0x10(%rdx)
> psrad  $0x10,%xmm1
> movups %xmm1,-0x20(%rdx)
> 
> 
> Bootstrapped and tested on x86-64. OK for trunk?

This is an odd place to implement such a transform.  Also, whether it
is faster or not depends on the exact ISA you target - for example,
ppc has constraints on the maximum number of shifts carried out in
parallel, and the above has 4 in very short succession, especially
for the sign-extend path.

So this looks more like an opportunity for a post-vectorizer
transform on RTL or for the vectorizer special-casing
widening loads with a vectorizer pattern.

Richard.
Cong Hou May 3, 2014, 12:39 a.m. UTC | #2
On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
> On Thu, 24 Apr 2014, Cong Hou wrote:
>
>> Given the following loop:
>>
>> int a[N];
>> short b[N*2];
>>
>> for (int i = 0; i < N; ++i)
>>   a[i] = b[i*2];
>>
>>
>> After being vectorized, the access to b[i*2] will be compiled into
>> several packing statements, while the type promotion from short to int
>> will be compiled into several unpacking statements. With this patch,
>> each pair of pack/unpack statements will be replaced by less expensive
>> statements (with shift or bit-and operations).
>>
>> On x86_64, the loop above will be compiled into the following assembly
>> (with -O2 -ftree-vectorize):
>>
>> movdqu 0x10(%rcx),%xmm3
>> movdqu -0x20(%rcx),%xmm0
>> movdqa %xmm0,%xmm2
>> punpcklwd %xmm3,%xmm0
>> punpckhwd %xmm3,%xmm2
>> movdqa %xmm0,%xmm3
>> punpcklwd %xmm2,%xmm0
>> punpckhwd %xmm2,%xmm3
>> movdqa %xmm1,%xmm2
>> punpcklwd %xmm3,%xmm0
>> pcmpgtw %xmm0,%xmm2
>> movdqa %xmm0,%xmm3
>> punpckhwd %xmm2,%xmm0
>> punpcklwd %xmm2,%xmm3
>> movups %xmm0,-0x10(%rdx)
>> movups %xmm3,-0x20(%rdx)
>>
>>
>> With this patch, the generated assembly is shown below:
>>
>> movdqu 0x10(%rcx),%xmm0
>> movdqu -0x20(%rcx),%xmm1
>> pslld  $0x10,%xmm0
>> psrad  $0x10,%xmm0
>> pslld  $0x10,%xmm1
>> movups %xmm0,-0x10(%rdx)
>> psrad  $0x10,%xmm1
>> movups %xmm1,-0x20(%rdx)
>>
>>
>> Bootstrapped and tested on x86-64. OK for trunk?
>
> This is an odd place to implement such transform.  Also if it
> is faster or not depends on the exact ISA you target - for
> example ppc has constraints on the maximum number of shifts
> carried out in parallel and the above has 4 in very short
> succession.  Esp. for the sign-extend path.

Thank you for the information about ppc. If this is an issue, I think
we can do it in a target-dependent way.


>
> So this looks more like an opportunity for a post-vectorizer
> transform on RTL or for the vectorizer special-casing
> widening loads with a vectorizer pattern.

I am not sure whether the RTL transform would be more difficult to
implement. I prefer the widening-load method, which can be detected
in a pattern recognizer. The target-related issue can be resolved by
expanding the widening load only on those targets where this pattern
is beneficial. But this requires new tree operations to be defined.
What is your suggestion?

I apologize for the delayed reply.


thanks,
Cong

>
> Richard.
Richard Biener June 24, 2014, 11:05 a.m. UTC | #3
On Sat, May 3, 2014 at 2:39 AM, Cong Hou <congh@google.com> wrote:
> On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
>> On Thu, 24 Apr 2014, Cong Hou wrote:
>>
>>> Given the following loop:
>>>
>>> int a[N];
>>> short b[N*2];
>>>
>>> for (int i = 0; i < N; ++i)
>>>   a[i] = b[i*2];
>>>
>>>
>>> After being vectorized, the access to b[i*2] will be compiled into
>>> several packing statements, while the type promotion from short to int
>>> will be compiled into several unpacking statements. With this patch,
>>> each pair of pack/unpack statements will be replaced by less expensive
>>> statements (with shift or bit-and operations).
>>>
>>> On x86_64, the loop above will be compiled into the following assembly
>>> (with -O2 -ftree-vectorize):
>>>
>>> movdqu 0x10(%rcx),%xmm3
>>> movdqu -0x20(%rcx),%xmm0
>>> movdqa %xmm0,%xmm2
>>> punpcklwd %xmm3,%xmm0
>>> punpckhwd %xmm3,%xmm2
>>> movdqa %xmm0,%xmm3
>>> punpcklwd %xmm2,%xmm0
>>> punpckhwd %xmm2,%xmm3
>>> movdqa %xmm1,%xmm2
>>> punpcklwd %xmm3,%xmm0
>>> pcmpgtw %xmm0,%xmm2
>>> movdqa %xmm0,%xmm3
>>> punpckhwd %xmm2,%xmm0
>>> punpcklwd %xmm2,%xmm3
>>> movups %xmm0,-0x10(%rdx)
>>> movups %xmm3,-0x20(%rdx)
>>>
>>>
>>> With this patch, the generated assembly is shown below:
>>>
>>> movdqu 0x10(%rcx),%xmm0
>>> movdqu -0x20(%rcx),%xmm1
>>> pslld  $0x10,%xmm0
>>> psrad  $0x10,%xmm0
>>> pslld  $0x10,%xmm1
>>> movups %xmm0,-0x10(%rdx)
>>> psrad  $0x10,%xmm1
>>> movups %xmm1,-0x20(%rdx)
>>>
>>>
>>> Bootstrapped and tested on x86-64. OK for trunk?
>>
>> This is an odd place to implement such transform.  Also if it
>> is faster or not depends on the exact ISA you target - for
>> example ppc has constraints on the maximum number of shifts
>> carried out in parallel and the above has 4 in very short
>> succession.  Esp. for the sign-extend path.
>
> Thank you for the information about ppc. If this is an issue, I think
> we can do it in a target dependent way.
>
>
>>
>> So this looks more like an opportunity for a post-vectorizer
>> transform on RTL or for the vectorizer special-casing
>> widening loads with a vectorizer pattern.
>
> I am not sure if the RTL transform is more difficult to implement. I
> prefer the widening loads method, which can be detected in a pattern
> recognizer. The target related issue will be resolved by only
> expanding the widening load on those targets where this pattern is
> beneficial. But this requires new tree operations to be defined. What
> is your suggestion?
>
> I apologize for the delayed reply.

Likewise ;)

I suggest implementing this optimization during vector lowering in
tree-vect-generic.c.  For your example, that pass sees

  vect__5.7_32 = MEM[symbol: b, index: ivtmp.15_13, offset: 0B];
  vect__5.8_34 = MEM[symbol: b, index: ivtmp.15_13, offset: 16B];
  vect_perm_even_35 = VEC_PERM_EXPR <vect__5.7_32, vect__5.8_34, { 0,
2, 4, 6, 8, 10, 12, 14 }>;
  vect__6.9_37 = [vec_unpack_lo_expr] vect_perm_even_35;
  vect__6.9_38 = [vec_unpack_hi_expr] vect_perm_even_35;

where you can apply the pattern matching and transform (after checking
with the target, of course).
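
Concretely, the selector check needed here is the one the posted patch
already performs: the VEC_PERM_EXPR mask must pick only even or only
odd source elements.  A standalone sketch of that test (illustrative
names, not an actual GCC interface):

#include <stdbool.h>

/* SEL holds the NELTS selector indices of the half of the permutation
   mask that feeds the unpack, normalized to start at 0.  Return true
   if they select only even or only odd source elements, i.e.
   sel[i] == 2*i or sel[i] == 2*i + 1 for all i, and record which case
   applies in *ODD_P.  */
static bool
selector_is_even_or_odd (const int *sel, int nelts, bool *odd_p)
{
  int odd = 0;
  for (int i = 0; i < nelts; i++)
    {
      if (sel[i] / 2 != i)
	return false;
      odd += sel[i] & 1;
    }
  if (odd != 0 && odd != nelts)
    return false;	/* mixed even/odd elements: not this pattern */
  *odd_p = (odd == nelts);
  return true;
}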

Richard.

>
> thanks,
> Cong
>
>>
>> Richard.
Cong Hou June 25, 2014, 5:34 p.m. UTC | #4
On Tue, Jun 24, 2014 at 4:05 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Sat, May 3, 2014 at 2:39 AM, Cong Hou <congh@google.com> wrote:
>> On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
>>> On Thu, 24 Apr 2014, Cong Hou wrote:
>>>
>>>> Given the following loop:
>>>>
>>>> int a[N];
>>>> short b[N*2];
>>>>
>>>> for (int i = 0; i < N; ++i)
>>>>   a[i] = b[i*2];
>>>>
>>>>
>>>> After being vectorized, the access to b[i*2] will be compiled into
>>>> several packing statements, while the type promotion from short to int
>>>> will be compiled into several unpacking statements. With this patch,
>>>> each pair of pack/unpack statements will be replaced by less expensive
>>>> statements (with shift or bit-and operations).
>>>>
>>>> On x86_64, the loop above will be compiled into the following assembly
>>>> (with -O2 -ftree-vectorize):
>>>>
>>>> movdqu 0x10(%rcx),%xmm3
>>>> movdqu -0x20(%rcx),%xmm0
>>>> movdqa %xmm0,%xmm2
>>>> punpcklwd %xmm3,%xmm0
>>>> punpckhwd %xmm3,%xmm2
>>>> movdqa %xmm0,%xmm3
>>>> punpcklwd %xmm2,%xmm0
>>>> punpckhwd %xmm2,%xmm3
>>>> movdqa %xmm1,%xmm2
>>>> punpcklwd %xmm3,%xmm0
>>>> pcmpgtw %xmm0,%xmm2
>>>> movdqa %xmm0,%xmm3
>>>> punpckhwd %xmm2,%xmm0
>>>> punpcklwd %xmm2,%xmm3
>>>> movups %xmm0,-0x10(%rdx)
>>>> movups %xmm3,-0x20(%rdx)
>>>>
>>>>
>>>> With this patch, the generated assembly is shown below:
>>>>
>>>> movdqu 0x10(%rcx),%xmm0
>>>> movdqu -0x20(%rcx),%xmm1
>>>> pslld  $0x10,%xmm0
>>>> psrad  $0x10,%xmm0
>>>> pslld  $0x10,%xmm1
>>>> movups %xmm0,-0x10(%rdx)
>>>> psrad  $0x10,%xmm1
>>>> movups %xmm1,-0x20(%rdx)
>>>>
>>>>
>>>> Bootstrapped and tested on x86-64. OK for trunk?
>>>
>>> This is an odd place to implement such transform.  Also if it
>>> is faster or not depends on the exact ISA you target - for
>>> example ppc has constraints on the maximum number of shifts
>>> carried out in parallel and the above has 4 in very short
>>> succession.  Esp. for the sign-extend path.
>>
>> Thank you for the information about ppc. If this is an issue, I think
>> we can do it in a target dependent way.
>>
>>
>>>
>>> So this looks more like an opportunity for a post-vectorizer
>>> transform on RTL or for the vectorizer special-casing
>>> widening loads with a vectorizer pattern.
>>
>> I am not sure if the RTL transform is more difficult to implement. I
>> prefer the widening loads method, which can be detected in a pattern
>> recognizer. The target related issue will be resolved by only
>> expanding the widening load on those targets where this pattern is
>> beneficial. But this requires new tree operations to be defined. What
>> is your suggestion?
>>
>> I apologize for the delayed reply.
>
> Likewise ;)
>
> I suggest to implement this optimization in vector lowering in
> tree-vect-generic.c.  This sees for your example
>
>   vect__5.7_32 = MEM[symbol: b, index: ivtmp.15_13, offset: 0B];
>   vect__5.8_34 = MEM[symbol: b, index: ivtmp.15_13, offset: 16B];
>   vect_perm_even_35 = VEC_PERM_EXPR <vect__5.7_32, vect__5.8_34, { 0,
> 2, 4, 6, 8, 10, 12, 14 }>;
>   vect__6.9_37 = [vec_unpack_lo_expr] vect_perm_even_35;
>   vect__6.9_38 = [vec_unpack_hi_expr] vect_perm_even_35;
>
> where you can apply the pattern matching and transform (after checking
> with the target, of course).

This sounds good to me! I'll try to make a patch following your suggestion.

Thank you!


Cong

>
> Richard.
>
>>
>> thanks,
>> Cong
>>
>>>
>>> Richard.
Evgeny Stupachenko Nov. 21, 2014, 11:50 a.m. UTC | #5
Hi,

Please note that currently the test:

int a[N];
short b[N*2];

for (int i = 0; i < N; ++i)
  a[i] = b[i*2];

is compiled to (with -march=corei7 -O2 -ftree-vectorize):

        movdqa  b(%rax), %xmm0
        movdqa  b-16(%rax), %xmm2
        pand    %xmm1, %xmm0
        pand    %xmm1, %xmm2
        packusdw        %xmm2, %xmm0
        pmovsxwd        %xmm0, %xmm2
        psrldq  $8, %xmm0
        pmovsxwd        %xmm0, %xmm0
        movaps  %xmm2, a-32(%rax)
        movaps  %xmm0, a-16(%rax)

This is closer to the requested sequence.

Thanks,
Evgeny


On Wed, Jun 25, 2014 at 8:34 PM, Cong Hou <congh@google.com> wrote:
> On Tue, Jun 24, 2014 at 4:05 AM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Sat, May 3, 2014 at 2:39 AM, Cong Hou <congh@google.com> wrote:
>>> On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
>>>> On Thu, 24 Apr 2014, Cong Hou wrote:
>>>>
>>>>> Given the following loop:
>>>>>
>>>>> int a[N];
>>>>> short b[N*2];
>>>>>
>>>>> for (int i = 0; i < N; ++i)
>>>>>   a[i] = b[i*2];
>>>>>
>>>>>
>>>>> After being vectorized, the access to b[i*2] will be compiled into
>>>>> several packing statements, while the type promotion from short to int
>>>>> will be compiled into several unpacking statements. With this patch,
>>>>> each pair of pack/unpack statements will be replaced by less expensive
>>>>> statements (with shift or bit-and operations).
>>>>>
>>>>> On x86_64, the loop above will be compiled into the following assembly
>>>>> (with -O2 -ftree-vectorize):
>>>>>
>>>>> movdqu 0x10(%rcx),%xmm3
>>>>> movdqu -0x20(%rcx),%xmm0
>>>>> movdqa %xmm0,%xmm2
>>>>> punpcklwd %xmm3,%xmm0
>>>>> punpckhwd %xmm3,%xmm2
>>>>> movdqa %xmm0,%xmm3
>>>>> punpcklwd %xmm2,%xmm0
>>>>> punpckhwd %xmm2,%xmm3
>>>>> movdqa %xmm1,%xmm2
>>>>> punpcklwd %xmm3,%xmm0
>>>>> pcmpgtw %xmm0,%xmm2
>>>>> movdqa %xmm0,%xmm3
>>>>> punpckhwd %xmm2,%xmm0
>>>>> punpcklwd %xmm2,%xmm3
>>>>> movups %xmm0,-0x10(%rdx)
>>>>> movups %xmm3,-0x20(%rdx)
>>>>>
>>>>>
>>>>> With this patch, the generated assembly is shown below:
>>>>>
>>>>> movdqu 0x10(%rcx),%xmm0
>>>>> movdqu -0x20(%rcx),%xmm1
>>>>> pslld  $0x10,%xmm0
>>>>> psrad  $0x10,%xmm0
>>>>> pslld  $0x10,%xmm1
>>>>> movups %xmm0,-0x10(%rdx)
>>>>> psrad  $0x10,%xmm1
>>>>> movups %xmm1,-0x20(%rdx)
>>>>>
>>>>>
>>>>> Bootstrapped and tested on x86-64. OK for trunk?
>>>>
>>>> This is an odd place to implement such transform.  Also if it
>>>> is faster or not depends on the exact ISA you target - for
>>>> example ppc has constraints on the maximum number of shifts
>>>> carried out in parallel and the above has 4 in very short
>>>> succession.  Esp. for the sign-extend path.
>>>
>>> Thank you for the information about ppc. If this is an issue, I think
>>> we can do it in a target dependent way.
>>>
>>>
>>>>
>>>> So this looks more like an opportunity for a post-vectorizer
>>>> transform on RTL or for the vectorizer special-casing
>>>> widening loads with a vectorizer pattern.
>>>
>>> I am not sure if the RTL transform is more difficult to implement. I
>>> prefer the widening loads method, which can be detected in a pattern
>>> recognizer. The target related issue will be resolved by only
>>> expanding the widening load on those targets where this pattern is
>>> beneficial. But this requires new tree operations to be defined. What
>>> is your suggestion?
>>>
>>> I apologize for the delayed reply.
>>
>> Likewise ;)
>>
>> I suggest to implement this optimization in vector lowering in
>> tree-vect-generic.c.  This sees for your example
>>
>>   vect__5.7_32 = MEM[symbol: b, index: ivtmp.15_13, offset: 0B];
>>   vect__5.8_34 = MEM[symbol: b, index: ivtmp.15_13, offset: 16B];
>>   vect_perm_even_35 = VEC_PERM_EXPR <vect__5.7_32, vect__5.8_34, { 0,
>> 2, 4, 6, 8, 10, 12, 14 }>;
>>   vect__6.9_37 = [vec_unpack_lo_expr] vect_perm_even_35;
>>   vect__6.9_38 = [vec_unpack_hi_expr] vect_perm_even_35;
>>
>> where you can apply the pattern matching and transform (after checking
>> with the target, of course).
>
> This sounds good to me! I'll try to make a patch following your suggestion.
>
> Thank you!
>
>
> Cong
>
>>
>> Richard.
>>
>>>
>>> thanks,
>>> Cong
>>>
>>>>
>>>> Richard.

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 117cdd0..e7143f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@ 
+2014-04-23  Cong Hou  <congh@google.com>
+
+	* tree-vect-stmts.c (detect_pack_unpack_pattern): New function.
+	(vect_gen_widened_results_half): Call detect_pack_unpack_pattern.
+
 2014-04-23  David Malcolm  <dmalcolm@redhat.com>
 
 	* is-a.h: Update comments to reflect the following changes to the
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 62b07f4..a8755b3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@ 
+2014-04-23  Cong Hou  <congh@google.com>
+
+	* gcc.dg/vect/vect-125.c: New test.
+
 2014-04-23  Jeff Law  <law@redhat.com>
 
 	PR tree-optimization/60902
diff --git a/gcc/testsuite/gcc.dg/vect/vect-125.c b/gcc/testsuite/gcc.dg/vect/vect-125.c
new file mode 100644
index 0000000..988dea6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-125.c
@@ -0,0 +1,122 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <limits.h>
+#include "tree-vect.h"
+
+#define N 64
+
+char b[N];
+unsigned char c[N];
+short d[N];
+unsigned short e[N];
+
+__attribute__((noinline)) void
+test1 ()
+{
+  int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+__attribute__((noinline)) void
+test2 ()
+{
+  unsigned int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+int
+main ()
+{
+  b[0] = CHAR_MIN;
+  c[0] = UCHAR_MAX;
+  d[0] = SHRT_MIN;
+  e[0] = USHRT_MAX;
+
+  int i;
+  for (i = 1; i < N; i++)
+    {
+      b[i] = b[i-1] + 1;
+      c[i] = c[i-1] - 1;
+      d[i] = d[i-1] + 1;
+      e[i] = e[i-1] - 1;
+    }
+
+  test1 ();
+  test2 ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "A pack-unpack pattern is recognized" 32 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1a51d6d..d0cf1f4 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -3191,6 +3191,174 @@  vectorizable_simd_clone_call (gimple stmt, gimple_stmt_iterator *gsi,
 }
 
 
+/* Function detect_pack_unpack_pattern
+
+   Detect the following pattern:
+
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 0, 2, 4, ... }>;
+   or
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 1, 3, 5, ... }>;
+
+   S2  vect4 = [vec_unpack_lo_expr] vect3;
+   or/and
+   S3  vect5 = [vec_unpack_hi_expr] vect3;
+
+   S1 is usually generated from accessing even/odd elements of an array
+   and S2/S3 are generated from type promotion.  In this case S1 and S2/S3 can
+   be replaced by less expensive statements.  */
+
+static gimple
+detect_pack_unpack_pattern (enum tree_code code, tree vec_oprnd, int op_type,
+			    tree vec_dest, gimple_stmt_iterator *gsi,
+			    gimple stmt)
+{
+  if ((code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR)
+      || op_type != unary_op)
+    return NULL;
+
+  if (TREE_CODE (vec_oprnd) != SSA_NAME)
+    return NULL;
+
+  gimple def_stmt = SSA_NAME_DEF_STMT (vec_oprnd);
+  if (!is_gimple_assign (def_stmt))
+    return NULL;
+  enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
+  if (def_code != VEC_PERM_EXPR)
+    return NULL;
+
+  bool unpack_lo = (code == VEC_UNPACK_LO_EXPR);
+  tree perm_vec = gimple_assign_rhs3 (def_stmt);
+  gcc_assert (TREE_CODE (perm_vec) == VECTOR_CST);
+
+  /* Check if VEC_PERM_EXPR is generated from accessing even/odd elements of
+     an array.  */
+  int nelts = VECTOR_CST_NELTS (perm_vec);
+  int odd_num = 0;
+  for (int i = 0; i < nelts / 2; i++)
+    {
+      tree elt = VECTOR_CST_ELT (perm_vec,
+				 unpack_lo ? i : i + nelts / 2);
+      int elt_val = (HOST_WIDE_INT) TREE_INT_CST_LOW (elt);
+      if (!unpack_lo)
+	elt_val -= nelts;
+
+      if (elt_val / 2 != i)
+	return NULL;
+      odd_num += elt_val & 1;
+    }
+
+  bool even_elem = false;
+  if (odd_num == 0)
+    even_elem = true;
+  else if (odd_num != nelts / 2)
+    return NULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "A pack-unpack pattern is recognized.\n");
+
+  tree vec_oprnd0 = unpack_lo ? gimple_assign_rhs1 (def_stmt)
+			      : gimple_assign_rhs2 (def_stmt);
+  tree vec_type = TREE_TYPE (vec_oprnd0);
+  tree scalar_type = lang_hooks.types.type_for_mode (
+		       GET_MODE_INNER (TYPE_MODE (vec_type)),
+		       TYPE_UNSIGNED (vec_type));
+  tree vectype_out = TREE_TYPE (vec_dest);
+
+  if (even_elem)
+    {
+      /* If even elements are packed, and if the original values are unsigned,
+	 build a bit_and statement to replace the pack-unpack statements.
+	 If the original values are signed, build a lshift statement and a
+	 rshift statement to replace the pack-unpack statements.  */
+
+      if (TYPE_UNSIGNED (vec_type))
+	{
+	  tree *mask = XALLOCAVEC (tree, nelts);
+	  unsigned HOST_WIDE_INT max_val = TREE_INT_CST_LOW (
+					     TYPE_MAX_VALUE (scalar_type));
+	  for (int k = 0; k < nelts; ++k)
+	    mask[k] = build_int_cst (scalar_type, (k & 1) ? 0 : max_val);
+	  tree vec_oprnd1 = build_vector (TREE_TYPE (vec_oprnd0), mask);
+	  tree temp = make_temp_ssa_name (vec_type, NULL, "vect_temp");
+
+	  gimple new_stmt1, new_stmt2;
+	  new_stmt1 = gimple_build_assign_with_ops (BIT_AND_EXPR, temp,
+						    vec_oprnd0, vec_oprnd1);
+	  new_stmt2 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						    temp, NULL_TREE);
+	  gimple_assign_set_lhs (new_stmt2,
+				 make_ssa_name (vec_dest, new_stmt2));
+
+	  vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+
+	  return new_stmt2;
+	}
+      else
+	{
+	  tree scalar_itype = lang_hooks.types.type_for_mode (
+				GET_MODE_INNER (TYPE_MODE (vectype_out)), 0);
+	  tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+	  tree shift_oprnd = build_int_cst (scalar_type,
+					    TYPE_PRECISION (scalar_type));
+
+	  tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+	  tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+	  tree temp3 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+
+	  gimple new_stmt1, new_stmt2, new_stmt3, new_stmt4;
+	  new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+						    vec_oprnd0, NULL);
+	  new_stmt2 = gimple_build_assign_with_ops (LSHIFT_EXPR, temp2,
+						    temp1, shift_oprnd);
+	  new_stmt3 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp3,
+						    temp2, shift_oprnd);
+	  new_stmt4 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						    temp3, NULL);
+	  gimple_assign_set_lhs (new_stmt4,
+				 make_ssa_name (vec_dest, new_stmt4));
+
+	  vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt4, gsi);
+
+	  return new_stmt4;
+	}
+    }
+  else
+    {
+      /* If odd elements are packed, build a rshift statement to replace
+	 the pack-unpack statements.  */
+
+      gimple new_stmt1, new_stmt2, new_stmt3;
+      tree scalar_itype = lang_hooks.types.type_for_mode (
+			    GET_MODE_INNER (TYPE_MODE (vectype_out)),
+			    TYPE_UNSIGNED (scalar_type));
+      tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+      tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree shift_oprnd = build_int_cst (scalar_type,
+					TYPE_PRECISION (scalar_type));
+
+      new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+						vec_oprnd0, NULL);
+      new_stmt2 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp2,
+						temp1, shift_oprnd);
+      new_stmt3 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						temp2, NULL);
+      gimple_assign_set_lhs (new_stmt3,
+			     make_ssa_name (vec_dest, new_stmt3));
+
+      vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+
+      return new_stmt3;
+    }
+}
+
 /* Function vect_gen_widened_results_half
 
    Create a vector stmt whose code, type, number of arguments, and result
@@ -3223,10 +3391,15 @@  vect_gen_widened_results_half (enum tree_code code,
     }
   else
     {
+      if ((new_stmt = detect_pack_unpack_pattern (
+			  code, vec_oprnd0, op_type, vec_dest, gsi, stmt)))
+        return new_stmt;
+
       /* Generic support */
       gcc_assert (op_type == TREE_CODE_LENGTH (code));
       if (op_type != binary_op)
 	vec_oprnd1 = NULL;
+
       new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
 					       vec_oprnd1);
       new_temp = make_ssa_name (vec_dest, new_stmt);