[v3] aarch64: Optimized memset for Kunpeng processor.
diff mbox series

Message ID 20191104084939.30876-1-zhangxuelei4@huawei.com
State New
Headers show
Series
  • [v3] aarch64: Optimized memset for Kunpeng processor.
Related show

Commit Message

Xuelei Zhang Nov. 4, 2019, 8:49 a.m. UTC
Due to a branch prediction issue on the Kunpeng processor, we found that
memset_generic performs poorly when setting middle sizes, so we
restructured the logic and unrolled the loop in set_long by 4 times to
solve the problem. Even sizes below 1K benefit from this change.

Another change is that DC ZVA does not seem to work when setting zero, so
we discarded it and use set_long to set zero instead. Fewer branches and
predictions also give the zero case a slight improvement.

Here is part of the benchmark result:
                                    SIMPLE_MEMSET	__memset_falkor	__memset_emag	__memset_kunpeng	__memset_generic
========================================================================================================================
                  length=16, char=65:         9.84 (-146.83%)	        4.00 ( -0.18%)	        3.99 ( -0.12%)	        3.61 (  9.52%)	        3.99
                  length=17, char=65:        10.24 (-156.58%)	        3.99 (  0.03%)	        4.00 ( -0.15%)	        3.61 (  9.55%)	        3.99
                  length=17, char=65:        10.23 (-156.24%)	        3.99 (  0.09%)	        3.99 (  0.12%)	        3.60 (  9.78%)	        3.99
                  length=18, char=65:        10.62 (-166.20%)	        3.99 (  0.06%)	        3.99 (  0.00%)	        3.60 (  9.73%)	        3.99
                  length=18, char=65:        10.62 (-166.27%)	        4.02 ( -0.83%)	        3.99 (  0.00%)	        3.61 (  9.58%)	        3.99
                  length=19, char=65:        11.01 (-176.06%)	        3.99 ( -0.06%)	        3.99 ( -0.03%)	        3.61 (  9.49%)	        3.99
                  length=19, char=65:        11.01 (-175.96%)	        4.01 ( -0.43%)	        3.99 ( -0.03%)	        3.60 (  9.76%)	        3.99
                  length=20, char=65:        11.39 (-185.67%)	        3.99 (  0.00%)	        3.99 (  0.03%)	        3.60 (  9.67%)	        3.99
                  length=20, char=65:        11.40 (-185.65%)	        4.00 ( -0.18%)	        3.99 ( -0.03%)	        3.61 (  9.61%)	        3.99
                  length=21, char=65:        11.79 (-195.41%)	        3.99 (  0.09%)	        3.99 (  0.06%)	        3.60 (  9.73%)	        3.99
                  length=21, char=65:        11.80 (-195.90%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.63 (  8.88%)	        3.99
                  length=22, char=65:        12.17 (-205.14%)	        3.99 (  0.03%)	        3.99 ( -0.09%)	        3.60 (  9.64%)	        3.99
                  length=22, char=65:        12.17 (-204.99%)	        3.99 ( -0.03%)	        3.99 (  0.15%)	        3.60 (  9.76%)	        3.99
                  length=23, char=65:        12.57 (-215.12%)	        3.99 (  0.06%)	        3.99 ( -0.06%)	        3.60 (  9.61%)	        3.99
                  length=23, char=65:        12.56 (-214.93%)	        3.99 ( -0.06%)	        3.99 (  0.03%)	        3.60 (  9.64%)	        3.99
                  length=24, char=65:        12.95 (-224.47%)	        3.99 (  0.06%)	        3.99 (  0.03%)	        3.61 (  9.64%)	        3.99
                  length=24, char=65:        12.96 (-224.98%)	        3.99 ( -0.09%)	        4.00 ( -0.37%)	        3.64 (  8.72%)	        3.99
                  length=25, char=65:        13.34 (-234.35%)	        3.99 (  0.03%)	        3.99 (  0.03%)	        3.60 (  9.70%)	        3.99
                  length=25, char=65:        13.34 (-234.56%)	        3.99 (  0.06%)	        3.99 (  0.00%)	        3.60 (  9.70%)	        3.99
                  length=26, char=65:        13.74 (-244.40%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.60 (  9.64%)	        3.99
                  length=26, char=65:        13.73 (-244.11%)	        3.99 (  0.06%)	        3.99 (  0.06%)	        3.61 (  9.64%)	        3.99
                  length=27, char=65:        14.11 (-253.55%)	        3.99 (  0.12%)	        3.99 (  0.09%)	        3.60 (  9.76%)	        3.99
                  length=27, char=65:        14.12 (-253.28%)	        3.99 (  0.18%)	        4.48 (-12.10%)	        3.62 (  9.35%)	        4.00
                  length=28, char=65:        14.65 (-267.34%)	        3.99 ( -0.09%)	        3.99 (  0.03%)	        3.61 (  9.58%)	        3.99
                  length=28, char=65:        14.51 (-263.82%)	        3.99 ( -0.09%)	        3.99 ( -0.15%)	        3.60 (  9.64%)	        3.99
                  length=29, char=65:        14.89 (-273.15%)	        3.99 (  0.09%)	        3.99 (  0.09%)	        3.61 (  9.66%)	        3.99
                  length=29, char=65:        14.89 (-273.63%)	        3.99 (  0.00%)	        3.99 ( -0.12%)	        3.60 (  9.65%)	        3.99
                  length=30, char=65:        15.29 (-283.17%)	        3.99 (  0.09%)	        3.99 (  0.03%)	        3.60 (  9.64%)	        3.99
                  length=30, char=65:        15.28 (-283.02%)	        3.99 (  0.09%)	        3.99 (  0.06%)	        3.61 (  9.64%)	        3.99
                  length=31, char=65:        15.67 (-292.84%)	        3.99 (  0.09%)	        4.03 ( -0.92%)	        3.60 (  9.70%)	        3.99
                  length=31, char=65:        15.67 (-293.17%)	        3.99 (  0.00%)	        3.99 ( -0.12%)	        3.60 (  9.56%)	        3.99
                  length=32, char=65:        16.06 (-344.75%)	        3.62 ( -0.30%)	        3.61 (  0.00%)	        3.24 ( 10.41%)	        3.61
                  length=32, char=65:        16.06 (-344.41%)	        3.60 (  0.37%)	        3.62 ( -0.27%)	        3.23 ( 10.77%)	        3.61
                  length=64, char=65:        28.48 (-689.91%)	        3.61 (  0.00%)	        3.61 ( -0.07%)	        3.35 (  7.21%)	        3.61
                  length=64, char=65:        28.94 (-702.30%)	        3.62 ( -0.37%)	        3.61 ( -0.10%)	        3.22 ( 10.63%)	        3.61
                  length=96, char=65:        40.87 (-1023.42%)	        3.61 (  0.87%)	        3.61 (  0.77%)	        3.24 ( 10.84%)	        3.64
                  length=96, char=65:        40.86 (-1033.97%)	        3.60 (  0.03%)	        3.61 ( -0.10%)	        3.23 ( 10.47%)	        3.60
                 length=128, char=65:        53.30 (-1228.42%)	        4.01 (  0.09%)	        3.61 ( 10.10%)	        3.99 (  0.46%)	        4.01
                 length=128, char=65:        53.30 (-1226.43%)	        4.01 (  0.15%)	        3.61 ( 10.15%)	        4.00 (  0.43%)	        4.02
                 length=160, char=65:        70.66 (-1256.33%)	        5.21 (  0.07%)	        5.64 ( -8.27%)	        5.22 ( -0.16%)	        5.21
                 length=160, char=65:        71.22 (-1266.07%)	        5.24 ( -0.44%)	        5.63 ( -8.01%)	        5.21 (  0.05%)	        5.21
                 length=192, char=65:        82.84 (-1489.72%)	        5.21 (  0.00%)	        5.17 (  0.84%)	        5.21 (  0.07%)	        5.21
                 length=192, char=65:        82.85 (-1486.79%)	        5.24 ( -0.44%)	        5.17 (  1.05%)	        5.21 (  0.26%)	        5.22
                 length=224, char=65:        95.34 (-471.58%)	       16.68 (  0.01%)	        6.45 ( 61.36%)	        6.75 ( 59.53%)	       16.68
                 length=224, char=65:        95.81 (-469.71%)	       16.69 (  0.73%)	        6.30 ( 62.55%)	        6.75 ( 59.85%)	       16.82
                 length=256, char=65:       107.46 (-544.28%)	       16.68 ( -0.01%)	        6.69 ( 59.91%)	        6.75 ( 59.54%)	       16.68
                 length=256, char=65:       107.46 (-544.24%)	       16.68 ( -0.01%)	        6.68 ( 59.94%)	        6.75 ( 59.55%)	       16.68
                 length=288, char=65:       120.49 (-1350.50%)	        8.31 ( -0.07%)	        7.57 (  8.88%)	        8.31 ( -0.07%)	        8.31
                 length=288, char=65:       120.02 (-1348.91%)	        8.29 ( -0.07%)	        9.42 (-13.68%)	        8.29 ( -0.03%)	        8.28
                 length=320, char=65:       132.07 (-1493.15%)	        8.29 (  0.04%)	        8.22 (  0.82%)	        8.28 (  0.10%)	        8.29
                 length=320, char=65:       133.08 (-1505.07%)	        8.31 ( -0.22%)	        8.99 ( -8.44%)	        8.29 (  0.01%)	        8.29
                 length=352, char=65:       144.76 (-1373.36%)	        9.83 ( -0.07%)	        9.09 (  7.48%)	        9.84 ( -0.12%)	        9.83
                 length=352, char=65:       144.77 (-1370.10%)	        9.83 (  0.21%)	        9.08 (  7.76%)	       10.31 ( -4.70%)	        9.85
                 length=384, char=65:       156.82 (-1495.48%)	        9.83 (  0.02%)	        9.77 (  0.58%)	        9.82 (  0.05%)	        9.83
                 length=384, char=65:       156.81 (-1489.89%)	        9.83 (  0.35%)	        9.76 (  1.03%)	        9.83 (  0.36%)	        9.86
                 length=416, char=65:       169.39 (-1389.81%)	       11.38 ( -0.12%)	       10.92 (  3.94%)	       11.43 ( -0.49%)	       11.37
                 length=416, char=65:       169.00 (-1386.86%)	       11.36 (  0.02%)	       10.64 (  6.40%)	       11.42 ( -0.48%)	       11.37
                 length=448, char=65:       181.98 (-1501.07%)	       11.38 ( -0.10%)	       11.30 (  0.57%)	       11.44 ( -0.67%)	       11.37
                 length=448, char=65:       181.49 (-1496.98%)	       11.36 (  0.01%)	       11.30 (  0.59%)	       11.36 (  0.00%)	       11.36
                 length=480, char=65:       194.27 (-1394.63%)	       13.01 ( -0.08%)	       12.18 (  6.26%)	       13.13 ( -1.03%)	       13.00
                 length=480, char=65:       193.99 (-1377.78%)	       13.13 ( -0.02%)	       12.28 (  6.44%)	       13.00 (  0.98%)	       13.13
                   length=16, char=0:         9.85 (-146.35%)	        4.02 ( -0.40%)	        4.02 ( -0.55%)	        3.62 (  9.40%)	        4.00
                   length=17, char=0:        10.24 (-156.80%)	        3.99 ( -0.06%)	        3.99 ( -0.06%)	        3.60 (  9.58%)	        3.99
                   length=17, char=0:        10.23 (-156.55%)	        4.02 ( -0.70%)	        3.99 (  0.00%)	        3.61 (  9.52%)	        3.99
                   length=18, char=0:        10.63 (-166.64%)	        3.99 ( -0.18%)	        3.99 ( -0.03%)	        3.62 (  9.28%)	        3.99
                   length=18, char=0:        10.62 (-166.39%)	        3.99 (  0.00%)	        3.99 (  0.00%)	        3.60 (  9.67%)	        3.99
                   length=19, char=0:        11.01 (-176.01%)	        3.99 ( -0.03%)	        3.99 (  0.03%)	        3.60 (  9.64%)	        3.99
                   length=19, char=0:        11.01 (-141.13%)	        4.07 ( 10.83%)	        3.99 ( 12.62%)	        3.60 ( 21.02%)	        4.56
                   length=20, char=0:        11.42 (-186.11%)	        3.99 (  0.06%)	        3.99 (  0.06%)	        3.60 (  9.67%)	        3.99
                   length=20, char=0:        11.40 (-185.83%)	        3.99 (  0.06%)	        3.99 (  0.00%)	        3.64 (  8.85%)	        3.99
                   length=21, char=0:        11.79 (-195.44%)	        3.99 (  0.00%)	        3.98 (  0.12%)	        3.61 (  9.61%)	        3.99
                   length=21, char=0:        11.79 (-195.41%)	        3.99 (  0.03%)	        3.99 (  0.03%)	        3.60 (  9.73%)	        3.99
                   length=22, char=0:        12.18 (-205.11%)	        3.99 (  0.00%)	        3.99 (  0.09%)	        3.60 (  9.70%)	        3.99
                   length=22, char=0:        12.17 (-204.99%)	        3.99 (  0.03%)	        3.99 (  0.00%)	        3.60 (  9.76%)	        3.99
                   length=23, char=0:        12.56 (-215.16%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.61 (  9.53%)	        3.99
                   length=23, char=0:        12.56 (-215.16%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.60 (  9.61%)	        3.99
                   length=24, char=0:        12.98 (-225.53%)	        3.99 (  0.00%)	        3.99 (  0.03%)	        3.60 (  9.64%)	        3.99
                   length=24, char=0:        12.95 (-224.93%)	        3.99 ( -0.06%)	        3.99 ( -0.03%)	        3.60 (  9.59%)	        3.99
                   length=25, char=0:        13.34 (-234.39%)	        3.99 (  0.03%)	        3.99 ( -0.03%)	        3.60 (  9.67%)	        3.99
                   length=25, char=0:        13.34 (-234.64%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.60 (  9.62%)	        3.99
                   length=26, char=0:        13.73 (-242.86%)	        3.99 (  0.43%)	        3.99 (  0.40%)	        3.60 (  9.97%)	        4.00
                   length=26, char=0:        13.73 (-244.70%)	        3.99 ( -0.12%)	        3.99 ( -0.09%)	        3.60 (  9.53%)	        3.98
                   length=27, char=0:        14.12 (-253.83%)	        3.99 (  0.06%)	        4.02 ( -0.80%)	        3.60 (  9.70%)	        3.99
                   length=27, char=0:        14.11 (-254.01%)	        4.00 ( -0.24%)	        3.99 (  0.00%)	        3.60 (  9.58%)	        3.99
                   length=28, char=0:        14.51 (-263.79%)	        3.99 ( -0.03%)	        3.99 (  0.06%)	        3.61 (  9.55%)	        3.99
                   length=28, char=0:        14.51 (-263.98%)	        3.99 ( -0.12%)	        3.99 ( -0.06%)	        3.60 (  9.62%)	        3.99
                   length=29, char=0:        14.89 (-273.43%)	        3.99 (  0.03%)	        3.99 (  0.00%)	        3.61 (  9.55%)	        3.99
                   length=29, char=0:        14.89 (-273.40%)	        3.99 (  0.03%)	        3.99 (  0.06%)	        3.60 (  9.70%)	        3.99
                   length=30, char=0:        15.28 (-280.20%)	        3.99 (  0.79%)	        3.98 (  0.88%)	        3.60 ( 10.36%)	        4.02
                   length=30, char=0:        15.29 (-283.40%)	        3.99 ( -0.09%)	        3.99 ( -0.03%)	        3.60 (  9.58%)	        3.99
                   length=31, char=0:        15.67 (-293.08%)	        3.99 ( -0.03%)	        3.99 ( -0.15%)	        3.60 (  9.62%)	        3.99
                   length=31, char=0:        15.66 (-292.74%)	        3.99 (  0.00%)	        3.99 ( -0.06%)	        3.61 (  9.58%)	        3.99
                   length=32, char=0:        16.07 (-342.37%)	        3.62 (  0.40%)	        3.61 (  0.50%)	        3.24 ( 10.79%)	        3.63
                   length=32, char=0:        16.08 (-346.21%)	        3.62 ( -0.51%)	        3.60 (  0.00%)	        3.22 ( 10.64%)	        3.60
                   length=64, char=0:        28.48 (-689.84%)	        3.60 (  0.10%)	        3.60 (  0.10%)	        3.22 ( 10.60%)	        3.61
                   length=64, char=0:        28.51 (-691.29%)	        3.60 (  0.03%)	        3.61 ( -0.07%)	        3.22 ( 10.60%)	        3.60
                   length=96, char=0:        40.86 (-1035.06%)	        3.60 ( -0.10%)	        3.60 ( -0.10%)	        3.22 ( 10.51%)	        3.60
                   length=96, char=0:        40.86 (-1034.75%)	        3.60 (  0.00%)	        3.60 ( -0.03%)	        3.22 ( 10.58%)	        3.60
                  length=128, char=0:        53.30 (-1232.88%)	        4.00 ( -0.03%)	        3.61 (  9.71%)	        3.99 (  0.15%)	        4.00
                  length=128, char=0:        53.30 (-1237.74%)	        3.99 ( -0.21%)	        3.61 (  9.47%)	        3.99 ( -0.24%)	        3.98
                  length=160, char=0:        71.44 (-1270.17%)	        5.22 ( -0.12%)	        5.59 ( -7.19%)	        5.22 ( -0.09%)	        5.21
                  length=160, char=0:        70.67 (-1256.99%)	        5.21 (  0.02%)	        5.61 ( -7.76%)	        5.21 ( -0.05%)	        5.21
                  length=192, char=0:        82.85 (-1490.91%)	        5.21 ( -0.09%)	        5.16 (  0.84%)	        5.21 (  0.00%)	        5.21
                  length=192, char=0:        82.84 (-1490.79%)	        5.21 (  0.00%)	        5.16 (  0.89%)	        5.21 ( -0.05%)	        5.21
                  length=224, char=0:        95.34 (-514.50%)	       17.14 (-10.44%)	        6.45 ( 58.40%)	        6.76 ( 56.44%)	       15.52
                  length=224, char=0:        95.34 (-490.08%)	       16.68 ( -3.23%)	        6.44 ( 60.15%)	        6.75 ( 58.24%)	       16.16
                  length=256, char=0:       107.45 (-1177.01%)	        8.42 ( -0.04%)	        6.69 ( 20.54%)	        6.75 ( 19.79%)	        8.41
                  length=256, char=0:       107.45 (-1168.94%)	        8.42 (  0.58%)	        7.06 ( 16.62%)	        6.75 ( 20.34%)	        8.47
                  length=288, char=0:       120.54 (-1331.13%)	        8.43 ( -0.07%)	        7.56 ( 10.25%)	        8.29 (  1.57%)	        8.42
                  length=288, char=0:       120.02 (-1325.53%)	        8.42 ( -0.01%)	        7.55 ( 10.32%)	        8.29 (  1.54%)	        8.42
                  length=320, char=0:       132.08 (-1371.26%)	        8.42 (  6.24%)	        8.22 (  8.39%)	        8.28 (  7.74%)	        8.98
                  length=320, char=0:       136.44 (-1518.49%)	        8.44 ( -0.06%)	        8.22 (  2.49%)	        8.28 (  1.72%)	        8.43
                  length=352, char=0:       144.76 (-729.19%)	        9.96 ( 42.97%)	        9.09 ( 47.96%)	        9.83 ( 43.71%)	       17.46
                  length=352, char=0:       145.45 (-732.82%)	        9.98 ( 42.86%)	       10.15 ( 41.85%)	        9.83 ( 43.72%)	       17.46
                  length=384, char=0:       156.82 (-798.42%)	       10.00 ( 42.74%)	        9.76 ( 44.07%)	        9.82 ( 43.72%)	       17.45
                  length=384, char=0:       156.81 (-797.96%)	        9.95 ( 43.00%)	       11.85 ( 32.16%)	        9.85 ( 43.60%)	       17.46
                  length=416, char=0:       168.99 (-772.17%)	       17.46 (  9.90%)	       10.53 ( 45.64%)	       11.42 ( 41.04%)	       19.38
                  length=416, char=0:       168.99 (-771.29%)	       17.45 ( 10.01%)	       10.53 ( 45.71%)	       11.97 ( 38.28%)	       19.40
                  length=448, char=0:       181.50 (-836.83%)	       17.46 (  9.90%)	       11.30 ( 41.66%)	       11.37 ( 41.30%)	       19.37
                  length=448, char=0:       185.66 (-858.07%)	       18.01 (  7.04%)	       11.31 ( 41.62%)	       11.36 ( 41.37%)	       19.38
                  length=480, char=0:       194.01 (-1410.62%)	       13.42 ( -4.48%)	       12.19 (  5.12%)	       13.14 ( -2.30%)	       12.84
                  length=480, char=0:       193.62 (-1407.27%)	       13.42 ( -4.46%)	       12.83 (  0.13%)	       13.01 ( -1.24%)	       12.85
---
 sysdeps/aarch64/multiarch/Makefile          |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   1 +
 sysdeps/aarch64/multiarch/memset.c          |   5 +-
 sysdeps/aarch64/multiarch/memset_kunpeng.S  | 113 ++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset_kunpeng.S

Patch
diff mbox series

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a902..8378107c78e 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,7 +1,7 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
-		   memset_generic memset_falkor memset_emag \
+		   memset_generic memset_falkor memset_emag memset_kunpeng \
 		   memchr_generic memchr_nosimd \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e58..bcbd90d0c41 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 		 can do a comparative analysis with __memset_generic.  */
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
   IFUNC_IMPL (i, name, memchr,
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index e9cdd385f26..4cc34b9b99a 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -30,10 +30,13 @@  extern __typeof (__redirect_memset) __libc_memset;
 
 extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
 extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
+extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
 libc_ifunc (__libc_memset,
-	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+	    IS_KUNPENG (midr)
+	    ?__memset_kunpeng
+	    : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
 	     ? __memset_falkor
 	     : (IS_EMAG (midr) && zva_size == 64
 	       ? __memset_emag
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
new file mode 100644
index 00000000000..a03441ae72f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -0,0 +1,113 @@ 
+/* Optimized memset for Huawei Kunpeng processor.
+   Copyright (C) 2012-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+#if IS_IN (libc)
+# define MEMSET __memset_kunpeng
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+ENTRY_ALIGN (MEMSET, 6)
+	/* Fill count bytes at dstin with the low byte of valw; dstin itself is never modified.  */
+	DELOUSE (0)			/* Macro defined outside this file; presumably normalizes argument 0 - confirm.  */
+	DELOUSE (2)			/* Likewise for argument 2 - confirm.  */
+
+	dup	v0.16B, valw		/* Broadcast the fill byte to all 16 lanes of v0.  */
+	add	dstend, dstin, count	/* dstend = one past the last byte to set.  */
+
+	cmp	count, 128
+	b.hs	L(set_long)		/* 128 bytes or more: use the unrolled loop.  */
+
+	cmp	count, 16
+	b.lo	L(less16)
+
+	/* Set 16..127 bytes.  */
+	str	q0, [dstin]
+	tbnz	count, 6, L(set127)	/* Bit 6 set: count is 64..127.  */
+	str	q0, [dstend, -16]	/* 16..63: last 16 bytes, may overlap the first 16.  */
+	tbz	count, 5, 1f		/* Bit 5 clear: count is 16..31, done.  */
+	str	q0, [dstin, 16]		/* 32..63: second 16 bytes from each end.  */
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..127 bytes.  Write 64 bytes from the start and
+	   64 bytes from the end.  */
+L(set127):
+	stp	q0, q0, [dstin, 16]	/* Bytes 0..15 were already stored before the branch here.  */
+	str	q0, [dstin, 48]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+	/* Set 0..15 bytes.  */
+L(less16):
+	tbz	count, 3, L(less8)
+	str	d0, [dstin]		/* 8..15: two 8-byte stores that may overlap.  */
+	str	d0, [dstend, -8]
+	ret
+L(less8):
+	tbz	count, 2, 2f
+	str	s0, [dstin]		/* 4..7: two 4-byte stores that may overlap.  */
+	str	s0, [dstend, -4]
+	ret
+2:	cbz	count, 3f		/* 0..3: nothing to store when count is zero.  */
+	str	b0, [dstin]
+	tbz	count, 1, 3f		/* Bit 1 clear: count is 1, done.  */
+	str	h0, [dstend, -2]	/* 2..3: the last two bytes.  */
+3:	ret
+
+	.p2align 4
+L(set_long):
+	bic	dst, dstin, 15		/* dst = dstin rounded down to a 16-byte boundary.  */
+	str	q0, [dstin]		/* Head: 16 bytes at dstin, reaching at least dst + 16.  */
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16 + 1 /* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst, 32]	/* Loop head: each step stores 64 bytes; the step is unrolled 4 times.  */
+	stp	q0, q0, [dst, 64]!	/* Pre-index writeback advances dst by 64.  */
+	subs	count, count, 64	/* count tracks (bytes left - 65), so a borrow means <= 64 left.  */
+	b.lo	1f			/* At most 64 bytes left: finish in the tail below.  */
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.lo	1f
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.lo	1f
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hs	1b			/* More than 64 bytes left: back to the loop head.  */
+
+1:	stp	q0, q0, [dstend, -64]	/* Tail: the last 64 bytes, may overlap earlier stores.  */
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+#endif