[v2] aarch64: Optimized memset for Kunpeng processor.
diff mbox series

Message ID 20191101131038.31348-1-zhangxuelei4@huawei.com
State New
Headers show
Series
  • [v2] aarch64: Optimized memset for Kunpeng processor.
Related show

Commit Message

Xuelei Zhang Nov. 1, 2019, 1:10 p.m. UTC
Due to a branch-prediction issue on the Kunpeng processor, we found that
memset_generic performs poorly when setting medium-sized buffers, so we
restructured the logic and expanded (unrolled) the loop in set_long by
3 times to solve the problem; sizes below 1K also benefit.

Another change is that DC ZVA (DZ_ZVA) does not seem to be effective when
setting zero, so we dropped it and use set_long to set zero instead. Fewer
branches and predictions also slightly improve the zero case.

Here's the part of the result:
                                    SIMPLE_MEMSET	__memset_falkor	__memset_emag	__memset_kunpeng	__memset_generic
========================================================================================================================
                  length=16, char=65:         9.85 (-146.38%)	        4.00 (  0.06%)	        4.01 ( -0.24%)	        3.61 (  9.71%)	        4.00
                  length=17, char=65:        10.24 (-154.44%)	        3.99 (  0.79%)	        3.99 (  0.85%)	        3.61 ( 10.40%)	        4.02
                  length=17, char=65:        10.24 (-156.51%)	        4.00 ( -0.09%)	        3.99 (  0.06%)	        3.61 (  9.57%)	        3.99
                  length=18, char=65:        10.63 (-166.37%)	        3.99 (  0.00%)	        3.99 (  0.03%)	        3.61 (  9.55%)	        3.99
                  length=18, char=65:        10.63 (-166.43%)	        4.00 ( -0.15%)	        3.99 ( -0.06%)	        3.61 (  9.55%)	        3.99
                  length=19, char=65:        11.01 (-176.02%)	        4.00 ( -0.18%)	        3.99 (  0.00%)	        3.61 (  9.61%)	        3.99
                  length=19, char=65:        11.02 (-176.50%)	        3.99 ( -0.24%)	        3.99 ( -0.15%)	        3.60 (  9.59%)	        3.98
                  length=20, char=65:        11.40 (-185.69%)	        3.99 (  0.00%)	        3.99 (  0.09%)	        3.61 (  9.51%)	        3.99
                  length=20, char=65:        11.41 (-185.78%)	        4.02 ( -0.73%)	        3.99 (  0.06%)	        3.60 (  9.79%)	        3.99
                  length=21, char=65:        11.82 (-196.30%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.61 (  9.58%)	        3.99
                  length=21, char=65:        11.81 (-196.08%)	        4.00 ( -0.24%)	        3.99 ( -0.12%)	        3.61 (  9.52%)	        3.99
                  length=22, char=65:        12.19 (-204.73%)	        3.99 (  0.12%)	        3.99 (  0.15%)	        3.61 (  9.80%)	        4.00
                  length=22, char=65:        12.19 (-205.45%)	        3.99 ( -0.06%)	        3.99 (  0.00%)	        3.61 (  9.52%)	        3.99
                  length=23, char=65:        12.58 (-215.43%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.61 (  9.40%)	        3.99
                  length=23, char=65:        12.57 (-215.18%)	        3.99 ( -0.03%)	        3.99 ( -0.03%)	        3.64 (  8.84%)	        3.99
                  length=24, char=65:        12.96 (-224.85%)	        3.99 ( -0.12%)	        3.99 (  0.00%)	        3.61 (  9.49%)	        3.99
                  length=24, char=65:        12.96 (-223.23%)	        4.00 (  0.24%)	        4.00 (  0.37%)	        3.62 (  9.77%)	        4.01
                  length=25, char=65:        13.36 (-234.64%)	        4.00 ( -0.15%)	        3.99 (  0.03%)	        3.61 (  9.60%)	        3.99
                  length=25, char=65:        13.35 (-234.37%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.60 (  9.72%)	        3.99
                  length=26, char=65:        13.74 (-243.99%)	        4.00 ( -0.12%)	        3.99 (  0.03%)	        3.61 (  9.60%)	        3.99
                  length=26, char=65:        13.74 (-241.73%)	        4.00 (  0.49%)	        3.99 (  0.79%)	        3.61 ( 10.26%)	        4.02
                  length=27, char=65:        14.12 (-254.01%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.61 (  9.61%)	        3.99
                  length=27, char=65:        14.12 (-251.99%)	        3.99 (  0.52%)	        3.99 (  0.64%)	        4.13 ( -2.83%)	        4.01
                  length=28, char=65:        14.52 (-263.56%)	        4.00 ( -0.06%)	        4.00 ( -0.12%)	        3.61 (  9.69%)	        3.99
                  length=28, char=65:        14.52 (-263.84%)	        4.00 ( -0.12%)	        3.99 ( -0.09%)	        3.61 (  9.57%)	        3.99
                  length=29, char=65:        14.90 (-273.05%)	        4.00 ( -0.03%)	        3.99 (  0.06%)	        3.61 (  9.62%)	        4.00
                  length=29, char=65:        14.90 (-273.18%)	        3.99 ( -0.03%)	        3.99 (  0.09%)	        3.61 (  9.66%)	        3.99
                  length=30, char=65:        15.29 (-283.12%)	        4.02 ( -0.80%)	        3.99 (  0.03%)	        3.60 (  9.76%)	        3.99
                  length=30, char=65:        15.29 (-282.94%)	        3.99 (  0.09%)	        3.99 (  0.03%)	        3.61 (  9.57%)	        3.99
                  length=31, char=65:        15.68 (-293.08%)	        4.00 ( -0.21%)	        3.99 ( -0.12%)	        3.61 (  9.55%)	        3.99
                  length=31, char=65:        15.68 (-292.93%)	        4.00 ( -0.15%)	        4.00 ( -0.15%)	        3.61 (  9.55%)	        3.99
                  length=32, char=65:        16.07 (-345.68%)	        3.62 ( -0.34%)	        3.63 ( -0.74%)	        3.23 ( 10.29%)	        3.60
                  length=32, char=65:        16.07 (-345.73%)	        3.61 ( -0.07%)	        3.61 (  0.00%)	        3.22 ( 10.66%)	        3.61
                  length=64, char=65:        28.49 (-689.25%)	        3.60 (  0.17%)	        3.61 (  0.03%)	        3.64 ( -0.74%)	        3.61
                  length=64, char=65:        29.11 (-706.87%)	        3.62 ( -0.34%)	        3.61 ( -0.14%)	        3.62 ( -0.27%)	        3.61
                  length=96, char=65:        40.88 (-1032.78%)	        3.61 ( -0.07%)	        3.61 ( -0.10%)	        3.23 ( 10.49%)	        3.61
                  length=96, char=65:        40.87 (-1034.21%)	        3.61 ( -0.07%)	        3.61 ( -0.24%)	        3.23 ( 10.33%)	        3.60
                 length=128, char=65:        53.31 (-1234.78%)	        4.00 ( -0.12%)	        3.63 (  9.23%)	        4.00 ( -0.15%)	        3.99
                 length=128, char=65:        53.32 (-1234.47%)	        4.01 ( -0.40%)	        3.61 (  9.59%)	        4.00 (  0.00%)	        4.00
                 length=160, char=65:        70.67 (-1253.63%)	        5.22 (  0.00%)	        5.54 ( -6.08%)	        4.39 ( 15.95%)	        5.22
                 length=160, char=65:        71.30 (-1266.84%)	        5.24 ( -0.40%)	        5.64 ( -8.05%)	        4.39 ( 15.91%)	        5.22
                 length=192, char=65:        82.85 (-1487.67%)	        5.21 (  0.07%)	        5.18 (  0.82%)	        5.34 ( -2.39%)	        5.22
                 length=192, char=65:        82.85 (-1486.81%)	        5.21 (  0.16%)	        5.16 (  1.08%)	        5.35 ( -2.43%)	        5.22
                 length=224, char=65:        95.35 (-471.44%)	       16.68 (  0.01%)	        6.45 ( 61.36%)	        5.99 ( 64.11%)	       16.69
                 length=224, char=65:        96.00 (-475.09%)	       16.70 ( -0.04%)	        6.44 ( 61.41%)	        5.99 ( 64.14%)	       16.69
                 length=256, char=65:       107.47 (-544.11%)	       16.69 ( -0.01%)	        6.83 ( 59.04%)	        6.89 ( 58.70%)	       16.68
                 length=256, char=65:       107.47 (-544.19%)	       16.68 ( -0.01%)	        6.85 ( 58.94%)	        6.88 ( 58.76%)	       16.68
                 length=288, char=65:       120.57 (-1349.57%)	        8.33 ( -0.09%)	        7.58 (  8.85%)	        7.54 (  9.35%)	        8.32
                 length=288, char=65:       120.03 (-1347.51%)	        8.29 (  0.01%)	        7.56 (  8.88%)	        7.51 (  9.39%)	        8.29
                 length=320, char=65:       132.08 (-1492.79%)	        8.29 ( -0.01%)	        8.24 (  0.62%)	        8.42 ( -1.52%)	        8.29
                 length=320, char=65:       132.53 (-1497.78%)	        8.31 ( -0.19%)	        8.22 (  0.88%)	        8.42 ( -1.49%)	        8.29
                 length=352, char=65:       144.78 (-1372.65%)	        9.84 ( -0.06%)	        9.10 (  7.44%)	        9.23 (  6.15%)	        9.83
                 length=352, char=65:       144.77 (-1372.49%)	        9.83 (  0.00%)	        9.09 (  7.55%)	        9.10 (  7.49%)	        9.83
                 length=384, char=65:       157.91 (-1505.36%)	        9.85 ( -0.15%)	        9.78 (  0.58%)	        9.95 ( -1.18%)	        9.84
                 length=384, char=65:       156.82 (-1494.91%)	        9.83 (  0.01%)	        9.76 (  0.71%)	        9.95 ( -1.19%)	        9.83
                 length=416, char=65:       169.83 (-1392.95%)	       11.38 ( -0.08%)	       10.65 (  6.35%)	       10.81 (  4.98%)	       11.38
                 length=416, char=65:       169.00 (-1386.09%)	       11.37 (  0.02%)	       10.64 (  6.46%)	       10.62 (  6.61%)	       11.37
                 length=448, char=65:       181.98 (-1500.22%)	       11.38 ( -0.06%)	       11.32 (  0.42%)	       11.51 ( -1.25%)	       11.37
                 length=448, char=65:       181.51 (-1496.96%)	       11.37 ( -0.06%)	       11.30 (  0.56%)	       11.50 ( -1.17%)	       11.37
                 length=480, char=65:       194.46 (-1394.52%)	       13.01 ( -0.01%)	       12.19 (  6.34%)	       12.24 (  5.93%)	       13.01
                 length=480, char=65:       194.00 (-1377.70%)	       13.13 (  0.00%)	       12.18 (  7.22%)	       12.13 (  7.57%)	       13.13
				   length=16, char=0:         9.85 (-146.59%)	        4.03 ( -0.95%)	        4.00 ( -0.12%)	        3.61 (  9.72%)	        4.00
                   length=17, char=0:        10.24 (-156.42%)	        3.99 (  0.12%)	        4.00 ( -0.06%)	        3.61 (  9.69%)	        3.99
                   length=17, char=0:        10.24 (-156.78%)	        3.99 ( -0.15%)	        3.99 ( -0.15%)	        3.60 (  9.61%)	        3.99
                   length=18, char=0:        10.62 (-166.15%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.61 (  9.60%)	        3.99
                   length=18, char=0:        10.63 (-166.29%)	        4.62 (-15.78%)	        4.00 ( -0.31%)	        3.61 (  9.57%)	        3.99
                   length=19, char=0:        11.02 (-176.16%)	        3.99 ( -0.12%)	        3.99 ( -0.06%)	        3.61 (  9.52%)	        3.99
                   length=19, char=0:        11.01 (-175.51%)	        3.99 (  0.09%)	        3.99 (  0.21%)	        3.64 (  9.01%)	        4.00
                   length=20, char=0:        11.40 (-185.45%)	        3.99 (  0.06%)	        3.99 (  0.06%)	        3.62 (  9.26%)	        3.99
                   length=20, char=0:        11.40 (-185.72%)	        3.99 (  0.09%)	        3.99 ( -0.03%)	        3.60 (  9.72%)	        3.99
                   length=21, char=0:        11.79 (-195.41%)	        3.99 ( -0.03%)	        3.99 (  0.03%)	        3.61 (  9.64%)	        3.99
                   length=21, char=0:        11.79 (-195.23%)	        3.99 (  0.03%)	        3.99 (  0.12%)	        3.60 (  9.75%)	        3.99
                   length=22, char=0:        12.18 (-205.05%)	        4.00 ( -0.09%)	        3.99 (  0.03%)	        3.61 (  9.63%)	        3.99
                   length=22, char=0:        12.18 (-205.14%)	        4.00 ( -0.15%)	        3.99 ( -0.03%)	        3.61 (  9.61%)	        3.99
                   length=23, char=0:        12.60 (-215.60%)	        4.00 ( -0.15%)	        3.99 (  0.12%)	        3.60 (  9.72%)	        3.99
                   length=23, char=0:        12.56 (-214.71%)	        3.99 ( -0.03%)	        3.99 (  0.03%)	        3.61 (  9.60%)	        3.99
                   length=24, char=0:        12.95 (-224.30%)	        3.99 (  0.03%)	        3.99 (  0.06%)	        3.61 (  9.60%)	        3.99
                   length=24, char=0:        12.95 (-224.27%)	        3.99 (  0.00%)	        3.99 (  0.18%)	        3.61 (  9.69%)	        3.99
                   length=25, char=0:        13.34 (-233.74%)	        3.99 (  0.09%)	        4.51 (-12.82%)	        3.61 (  9.68%)	        4.00
                   length=25, char=0:        13.34 (-234.52%)	        3.99 ( -0.12%)	        3.99 ( -0.12%)	        3.61 (  9.58%)	        3.99
                   length=26, char=0:        13.74 (-244.40%)	        4.00 ( -0.18%)	        4.02 ( -0.83%)	        3.61 (  9.49%)	        3.99
                   length=26, char=0:        13.73 (-244.28%)	        3.99 ( -0.03%)	        3.99 ( -0.09%)	        3.60 (  9.64%)	        3.99
                   length=27, char=0:        14.12 (-253.92%)	        4.00 ( -0.18%)	        3.99 (  0.03%)	        3.60 (  9.67%)	        3.99
                   length=27, char=0:        14.12 (-254.03%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.60 (  9.64%)	        3.99
                   length=28, char=0:        14.51 (-263.04%)	        3.99 (  0.15%)	        3.99 (  0.12%)	        3.61 (  9.77%)	        4.00
                   length=28, char=0:        14.51 (-263.50%)	        3.99 (  0.03%)	        3.99 (  0.03%)	        3.61 (  9.69%)	        3.99
                   length=29, char=0:        14.90 (-270.95%)	        3.99 (  0.58%)	        3.99 (  0.70%)	        3.60 ( 10.30%)	        4.02
                   length=29, char=0:        14.90 (-273.29%)	        3.99 (  0.00%)	        3.99 ( -0.06%)	        3.61 (  9.57%)	        3.99
                   length=30, char=0:        15.29 (-282.76%)	        3.99 (  0.09%)	        4.00 ( -0.06%)	        3.61 (  9.69%)	        3.99
                   length=30, char=0:        15.29 (-283.14%)	        4.00 ( -0.15%)	        3.99 (  0.00%)	        3.61 (  9.48%)	        3.99
                   length=31, char=0:        15.68 (-293.14%)	        3.99 ( -0.12%)	        3.99 ( -0.12%)	        3.61 (  9.58%)	        3.99
                   length=31, char=0:        15.69 (-293.36%)	        3.99 ( -0.15%)	        3.99 (  0.03%)	        3.61 (  9.52%)	        3.99
                   length=32, char=0:        16.07 (-345.22%)	        3.62 ( -0.20%)	        3.61 (  0.00%)	        3.23 ( 10.62%)	        3.61
                   length=32, char=0:        16.10 (-346.63%)	        3.62 ( -0.30%)	        3.61 ( -0.03%)	        3.22 ( 10.74%)	        3.60
                   length=64, char=0:        28.49 (-691.26%)	        3.60 ( -0.10%)	        3.61 ( -0.34%)	        3.61 ( -0.20%)	        3.60
                   length=64, char=0:        28.49 (-690.31%)	        3.60 (  0.00%)	        3.61 ( -0.10%)	        3.61 ( -0.03%)	        3.60
                   length=96, char=0:        55.09 (-1427.24%)	        3.61 ( -0.14%)	        3.60 (  0.07%)	        3.24 ( 10.12%)	        3.61
                   length=96, char=0:        51.76 (-1334.40%)	        3.61 (  0.00%)	        3.60 (  0.14%)	        3.23 ( 10.52%)	        3.61
                  length=128, char=0:        64.00 (-1501.44%)	        4.00 (  0.03%)	        3.63 (  9.25%)	        4.00 ( -0.18%)	        4.00
                  length=128, char=0:        64.72 (-1519.77%)	        4.01 ( -0.24%)	        3.65 (  8.77%)	        3.99 (  0.03%)	        4.00
                  length=160, char=0:        76.39 (-1365.53%)	        5.21 (  0.02%)	        5.64 ( -8.22%)	        4.39 ( 15.71%)	        5.21
                  length=160, char=0:        76.39 (-1367.17%)	        5.21 ( -0.14%)	        5.64 ( -8.25%)	        4.38 ( 15.87%)	        5.21
                  length=192, char=0:        88.72 (-1603.26%)	        5.22 ( -0.16%)	        5.18 (  0.47%)	        5.34 ( -2.60%)	        5.21
                  length=192, char=0:        89.05 (-1608.39%)	        5.23 ( -0.30%)	        5.18 (  0.61%)	        5.34 ( -2.51%)	        5.21
                  length=224, char=0:        95.34 (-471.58%)	       16.68 ( -0.01%)	        6.44 ( 61.37%)	        5.99 ( 64.08%)	       16.68
                  length=224, char=0:        95.34 (-470.55%)	       16.68 (  0.17%)	        6.44 ( 61.47%)	        5.98 ( 64.21%)	       16.71
                  length=256, char=0:       107.46 (-1175.45%)	        8.42 (  0.06%)	        6.69 ( 20.62%)	        6.88 ( 18.33%)	        8.43
                  length=256, char=0:       107.99 (-1182.66%)	        8.44 ( -0.28%)	        6.69 ( 20.57%)	        6.88 ( 18.27%)	        8.42
                  length=288, char=0:       120.03 (-1325.00%)	        8.42 ( -0.01%)	        7.55 ( 10.30%)	        7.53 ( 10.61%)	        8.42
                  length=288, char=0:       120.03 (-1324.60%)	        8.42 (  0.07%)	        7.55 ( 10.33%)	        7.52 ( 10.75%)	        8.43
                  length=320, char=0:       132.58 (-1471.50%)	        8.43 (  0.03%)	        8.24 (  2.37%)	        8.42 (  0.17%)	        8.44
                  length=320, char=0:       132.09 (-1465.48%)	        8.44 (  0.01%)	        8.22 (  2.55%)	        8.42 (  0.25%)	        8.44
                  length=352, char=0:       144.77 (-729.18%)	       10.62 ( 39.17%)	        9.10 ( 47.87%)	        9.07 ( 48.04%)	       17.46
                  length=352, char=0:       144.77 (-729.17%)	        9.96 ( 42.95%)	        9.09 ( 47.95%)	        9.06 ( 48.10%)	       17.46
                  length=384, char=0:       156.81 (-797.96%)	        9.95 ( 43.00%)	        9.77 ( 44.07%)	        9.96 ( 42.97%)	       17.46
                  length=384, char=0:       157.56 (-802.27%)	        9.97 ( 42.89%)	        9.77 ( 44.06%)	        9.96 ( 42.96%)	       17.46
                  length=416, char=0:       169.38 (-774.13%)	       17.46 (  9.91%)	       10.64 ( 45.09%)	       10.81 ( 44.20%)	       19.38
                  length=416, char=0:       169.58 (-830.31%)	       17.47 (  4.17%)	       10.64 ( 41.61%)	       10.81 ( 40.70%)	       18.23
                  length=448, char=0:       185.67 (-857.80%)	       17.45 (  9.97%)	       11.31 ( 41.65%)	       11.54 ( 40.45%)	       19.38
                  length=448, char=0:       182.09 (-839.56%)	       17.47 (  9.86%)	       11.30 ( 41.70%)	       11.49 ( 40.69%)	       19.38
                  length=480, char=0:       194.00 (-1456.58%)	       13.42 ( -7.69%)	       12.18 (  2.25%)	       12.23 (  1.89%)	       12.46
                  length=480, char=0:       194.25 (-1411.94%)	       13.43 ( -4.53%)	       12.19 (  5.15%)	       12.13 (  5.56%)	       12.85
---
 sysdeps/aarch64/multiarch/Makefile          |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   1 +
 sysdeps/aarch64/multiarch/memset.c          |   5 +-
 sysdeps/aarch64/multiarch/memset_kunpeng.S  | 115 ++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset_kunpeng.S

Comments

Wilco Dijkstra Nov. 1, 2019, 3:33 p.m. UTC | #1
Hi Xuelei,

+L(set127):
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       stp     q0,q0, [dst, 16]
+       str     q0, [dst, 48]
+       stp     q0, q0, [dstend, -64]
+       stp     q0, q0, [dstend, -32]
+       ret

This won't work, when you align like that you need an extra unaligned store at the end.
The easiest solution is not to align and just keep the stp and str using dstin.

+2:     stp     q0, q0, [dst, 32]
+       stp     q0, q0, [dstend, -32]
+       ret

Both of these must use dstend since this is the 1-64 bytes for the tail part.

Wilco

Patch
diff mbox series

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a902..8378107c78e 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,7 +1,7 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
-		   memset_generic memset_falkor memset_emag \
+		   memset_generic memset_falkor memset_emag memset_kunpeng \
 		   memchr_generic memchr_nosimd \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e58..bcbd90d0c41 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 		 can do a comparative analysis with __memset_generic.  */
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
   IFUNC_IMPL (i, name, memchr,
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index e9cdd385f26..4cc34b9b99a 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -30,10 +30,13 @@  extern __typeof (__redirect_memset) __libc_memset;
 
 extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
 extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
+extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
 libc_ifunc (__libc_memset,
-	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+	    IS_KUNPENG (midr)
+	    ?__memset_kunpeng
+	    : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
 	     ? __memset_falkor
 	     : (IS_EMAG (midr) && zva_size == 64
 	       ? __memset_emag
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
new file mode 100644
index 00000000000..d5eaf069501
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -0,0 +1,115 @@ 
+/* Optimized memset for Huawei Kunpeng processor.
+   Copyright (C) 2012-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+#if IS_IN (libc)
+# define MEMSET __memset_kunpeng
+
+/* memset: store the low byte of valw to the count bytes starting at dstin.
+ *
+ * Assumptions: ARMv8-a, AArch64, unaligned accesses.
+ * dstin, valw, count, dstend are not defined in this file; presumably they
+ * are register aliases from the included memset-reg.h -- confirm there.  */
+
+ENTRY_ALIGN (MEMSET, 6)
+
+	DELOUSE (0)
+	DELOUSE (2)
+
+	dup	v0.16B, valw		/* q0 = fill byte in all 16 lanes.  */
+	add	dstend, dstin, count	/* One past the last byte to set.  */
+
+	cmp	count, 128
+	b.hs	L(set_long)		/* count >= 128.  */
+
+	cmp	count, 16
+	b.lo	L(less16)		/* count < 16.  */
+
+	/* Set 16..127 bytes.  Head and tail stores may overlap.  */
+	str	q0, [dstin]
+	tbnz	count, 6, L(set127)	/* Bit 6 set: count is 64..127.  */
+	str	q0, [dstend, -16]	/* First + last 16 cover 16..32.  */
+	tbz	count, 5, 1f		/* Bit 5 clear: count is 16..31.  */
+	str	q0, [dstin, 16]		/* 32..63: second 16 from each end.  */
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..127 bytes.  Write 64 bytes from the start and
+	   64 bytes from the end.  The caller has already stored
+	   [dstin, dstin + 16).  The head stores must be based on dstin:
+	   basing them on dstin rounded down leaves up to 14 bytes unset.  */
+L(set127):
+	stp	q0, q0, [dstin, 16]
+	str	q0, [dstin, 48]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+	/* Set 0..15 bytes.  One store from each end; they may overlap.  */
+L(less16):
+	tbz	count, 3, L(less8)	/* Bit 3 clear: count is 0..7.  */
+	str	d0, [dstin]		/* 8..15: first and last 8 bytes.  */
+	str	d0, [dstend, -8]
+	ret
+L(less8):
+	tbz	count, 2, 2f		/* Bit 2 clear: count is 0..3.  */
+	str	s0, [dstin]		/* 4..7: first and last 4 bytes.  */
+	str	s0, [dstend, -4]
+	ret
+2:	cbz	count, 3f		/* Nothing to store for count 0.  */
+	str	b0, [dstin]		/* 1..3: first byte.  */
+	tbz	count, 1, 3f		/* Bit 1 clear: count is 1, done.  */
+	str	h0, [dstend, -2]	/* 2..3: last 2 bytes.  */
+3:	ret
+	
+	.p2align 4
+L(set_long):	/* Set 128 or more bytes, 64 per step, unrolled 4 times.  */
+	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
+	str	q0, [dstin]		/* Head; covers up to dst + 16.  */
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16 + 1 /* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst, 32]	/* Set the next 64 bytes ...  */
+	stp	q0, q0, [dst, 64]!	/* ... and advance dst by 64.  */
+	subs	count, count, 64
+	b.lo	2f			/* At most 64 bytes are left.  */
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.lo	2f
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.lo	2f
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hs	1b
+	/* Tail: 1..64 bytes remain, so anchor both stores at dstend.  */
+2:	stp	q0, q0, [dstend, -64]	/* [dst, 32] could pass dstend.  */
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+#endif