diff mbox series

[12/14,LoongArch] Add optimized string functions

Message ID CAKjxQHm63V0hfkv7_Q1-aEmB8gnDoMRxw6bH5gSkO+0Ev8HeWg@mail.gmail.com
State New
Series None

Commit Message

Paul Hua Aug. 19, 2021, 4:18 a.m. UTC
From c51b254809ebf7d2471497ff8e1053e4a30d1baf Mon Sep 17 00:00:00 2001
From: caiyinyu <caiyinyu@loongson.cn>
Date: Tue, 27 Jul 2021 16:18:43 +0800
Subject: [PATCH 12/14] LoongArch: Add optimized string functions

        * sysdeps/loongarch/lp64/strchr.S: New file.
        * sysdeps/loongarch/lp64/strchrnul.S: Likewise.
        * sysdeps/loongarch/lp64/strcmp.S: Likewise.
        * sysdeps/loongarch/lp64/strcpy.S: Likewise.
        * sysdeps/loongarch/lp64/strlen.S: Likewise.
        * sysdeps/loongarch/lp64/strncmp.S: Likewise.
        * sysdeps/loongarch/lp64/strnlen.S: Likewise.
---
 sysdeps/loongarch/lp64/strchr.S    | 148 +++++++++++++++
 sysdeps/loongarch/lp64/strchrnul.S | 163 +++++++++++++++++
 sysdeps/loongarch/lp64/strcmp.S    | 211 +++++++++++++++++++++
 sysdeps/loongarch/lp64/strcpy.S    | 224 +++++++++++++++++++++++
 sysdeps/loongarch/lp64/strlen.S    | 151 +++++++++++++++
 sysdeps/loongarch/lp64/strncmp.S   | 282 +++++++++++++++++++++++++++++
 sysdeps/loongarch/lp64/strnlen.S   | 168 +++++++++++++++++
 7 files changed, 1347 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/strchr.S
 create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
 create mode 100644 sysdeps/loongarch/lp64/strcmp.S
 create mode 100644 sysdeps/loongarch/lp64/strcpy.S
 create mode 100644 sysdeps/loongarch/lp64/strlen.S
 create mode 100644 sysdeps/loongarch/lp64/strncmp.S
 create mode 100644 sysdeps/loongarch/lp64/strnlen.S

Patch

diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S
new file mode 100644
index 0000000000..de02859500
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchr.S
@@ -0,0 +1,148 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+
+/* Basic algorithm:
+
+   +.   use ld.d and a mask to handle the first 8 bytes or fewer;
+
+   +.   replicate c into all 8 bytes of a1 (with bstrins.d);
+
+   +.   xor the data word with a1: a zero byte in the result marks a match;
+
+   +.   if ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0,
+   v0 contains a \0 byte, otherwise it contains none.
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+#define L_ADDIU addi.d
+#define L_ADDU add.d
+#define L_SUBU sub.d
+
+#define STRCHR  strchr
+#define MOVN(rd,rs,rt) \
+ maskeqz t6, rs, rt;\
+ masknez rd, rd, rt;\
+ or rd, rd, t6
+
+#define MOVN2(rd,rt) \
+ masknez rd, rd, rt;\
+ or rd, rd, rt
+
+
+/* char * strchr (const char *s1, int c); */
+
+LEAF(STRCHR)
+ .align 6
+
+ li.w t4, 0x7
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+ andi t0, a0, 0x7
+
+ ori a2, a2, 0x101
+ andn t4, a0, t4
+ slli.w t1, t0, 3
+
+ ld.d t4, t4, 0
+
+
+ nor t8, zero, zero
+ bstrins.d a1, a1, 31, 16
+ srl.d t4, t4, t1
+
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a2, a2, 63, 32
+ srl.d a7, t8, t1
+
+ li.w t1, 8
+ nor t8, a7, zero
+ slli.d a3, a2, 7
+ or t5, t8, t4
+ and t3, a7, a1
+
+ sub.w t1, t1, t0
+ nor a3, a3, zero
+ xor t2, t5, t3
+ sub.d a7, t5, a2
+ nor a6, t5, a3
+
+ sub.d a5, t2, a2
+ nor a4, t2, a3
+
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+ L_ADDU a0, a0, t1
+L(_aloop):
+ ld.d t4, a0, 0
+
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+ ld.d t4, a0, 8
+ L_ADDIU a0, a0, 16
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ beqz a7, L(_aloop)
+
+ L_ADDIU a0, a0, -8
+L(_mc8_a):
+
+ ctz.d t0, a5
+ ctz.d t2, a6
+
+ srli.w t0, t0, 3
+ srli.w t2, t2, 3
+ sltu t1, t2, t0
+ L_ADDU v0, a0, t0
+ masknez v0, v0, t1
+ jr ra
+END(STRCHR)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strchr)
+weak_alias (strchr, index)
+#endif
+#endif
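
For reference, the zero-byte test that drives both loops above can be
modeled in C.  This is only a sketch of the bit trick, not part of the
patch; has_zero_byte and word_strchr_step are made-up names.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Nonzero iff the 64-bit word x contains a zero byte: a byte can only
   borrow in (x - 0x01...01) while staying clear in (x | 0x7f...7f) if
   it was 0x00 to begin with.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
}

/* One aligned step of the strchr loop: word holds 8 string bytes, c8 is
   the target character replicated into all 8 bytes (what the assembly
   builds in a1 with bstrins.d).  XOR turns matching bytes into zero
   bytes, so the same test finds them.  */
static inline int
word_strchr_step (uint64_t word, uint64_t c8,
                  uint64_t *nul_mask, uint64_t *chr_mask)
{
  *nul_mask = has_zero_byte (word);
  *chr_mask = has_zero_byte (word ^ c8);
  return (*nul_mask | *chr_mask) != 0;
}

int
main (void)
{
  uint64_t w;
  memcpy (&w, "abcdefg", 8);    /* 7 chars plus the trailing '\0' */
  uint64_t c8 = 0x0101010101010101ULL * (unsigned char) 'd';
  uint64_t nul, chr;
  word_strchr_step (w, c8, &nul, &chr);
  /* On little-endian LoongArch, ctz(mask) >> 3 is the byte index, as in
     the assembly's ctz.d / srli.w ..., 3 sequence.  */
  printf ("'d' at byte %d, '\\0' at byte %d\n",
          __builtin_ctzll (chr) >> 3, __builtin_ctzll (nul) >> 3);
  return 0;
}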
diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
new file mode 100644
index 0000000000..23f91c93dd
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchrnul.S
@@ -0,0 +1,163 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+
+/* Basic algorithm:
+
+    +.  use ld.d and a mask to handle the first 8 bytes or fewer;
+
+    +.  replicate c into all 8 bytes of a1 (with bstrins.d);
+
+    +.  xor the data word with a1: a zero byte in the result marks a match;
+
+    +.  if ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0,
+    v0 contains a \0 byte, otherwise it contains none.
+
+ */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+
+#define L_ADDIU addi.d
+#define L_ADDU add.d
+#define L_SUBU sub.d
+
+#define STRCHRNUL   __strchrnul
+
+#define MOVN(rd,rs,rt) \
+ maskeqz t6, rs, rt;\
+ masknez rd, rd, rt;\
+ or rd, rd, t6
+
+#define MOVZ(rd,rs,rt) \
+ masknez t6, rs, rt;\
+ maskeqz rd, rd, rt;\
+ or rd, rd, t6
+
+
+#define MOVN2(rd,rt) \
+ masknez rd, rd, rt;\
+ or rd, rd, rt
+
+/* char * strchrnul (const char *s1, int c); */
+
+LEAF(STRCHRNUL)
+ .align 6
+
+ li.w t4, 0x7
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+ andi t0, a0, 0x7
+
+ ori a2, a2, 0x101
+ andn t4, a0, t4
+ slli.w t1, t0, 3
+
+ /* ldr t4, 0(a0) */
+ ld.d t4, t4, 0
+
+
+ nor t8, zero, zero
+ bstrins.d a1, a1, 31, 16
+ srl.d t4, t4, t1
+
+ preld 0, a0, 32
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a2, a2, 63, 32
+ srl.d a7, t8, t1
+
+ nor t8, a7, zero
+ slli.d a3, a2, 7
+ or t5, t8, t4
+ and t3, a7, a1
+
+ nor a3, a3, zero
+ xor t2, t5, t3
+ sub.d a7, t5, a2
+ nor a6, t5, a3
+
+ li.w t1, 8
+ sub.d a5, t2, a2
+ nor a4, t2, a3
+
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+
+ sub.w t1, t1, t0
+ L_ADDU a0, a0, t1
+L(_aloop):
+ ld.d t4, a0, 0
+
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+ ld.d t4, a0, 8
+ L_ADDIU a0, a0, 16
+
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+
+ or a7, a6, a5
+ beqz a7, L(_aloop)
+
+ L_ADDIU a0, a0, -8
+L(_mc8_a):
+
+ ctz.d t0, a5
+ ctz.d t2, a6
+
+ srli.w t0, t0, 3
+ srli.w t2, t2, 3
+ slt t1, t0, t2
+
+ MOVZ(t0,t2,t1)
+
+ L_ADDU v0, a0, t0
+ jr ra
+END(STRCHRNUL)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+weak_alias(__strchrnul, strchrnul)
+libc_hidden_builtin_def (__strchrnul)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
new file mode 100644
index 0000000000..b59f36f578
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcmp.S
@@ -0,0 +1,211 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+
+/* Basic algorithm:
+
+   +.  let t0, t1 point to a0, a1; if the low 3 bits of a0 are smaller than
+   those of a1, set a4 to 1 and let t0 point to the string whose low 3 bits
+   are larger;
+
+   +.  if the low 3 bits of a0 equal the low 3 bits of a1, use ldr once and
+   ld from then on;
+
+   +.  if not, load partial words into t2 and t3 and check whether t2
+   contains a \0;
+
+   +.  then use ld for t0 and ldr for t1;
+
+   +.  if the partial 8 bytes from t1 contain a \0, compare them with the
+   8 bytes from t0 under the mask in a7;
+
+   +.  if not, use ldl to load the other part of t1 and compare the full
+   8 bytes from t1 with the 8 bytes from t0;
+
+   +.  if ((v0 - 0x0101010101010101) & ~v0 & 0x8080808080808080) != 0, v0
+   contains a \0 byte, otherwise it contains none;
+
+   +.  for the partial 8 bytes from ldr t3, 0(a0), preload t3 with
+   0xffffffffffffffff.
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRCMP  strcmp
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define src1 a0
+#define src2 a1
+#define result v0
+/* Note: v0 = a0 in lp64 ABI */
+
+
+/* Internal variable */
+#define data1 t0
+#define data2 t1
+#define has_nul t2
+#define diff t3
+#define syndrome t4
+#define zeroones t5
+#define sevenf t6
+#define pos t7
+#define exchange t8
+#define tmp1 a4
+#define tmp2 a5
+#define tmp3 a6
+#define src1_off a2
+#define src2_off a3
+#define tmp4 a7
+
+/* rd <- rc ? ra : rb; clobbers tmp3.  */
+
+#define CONDITIONSEL(rd,rc,ra,rb)\
+ masknez tmp3, rb, rc;\
+ maskeqz rd,   ra, rc;\
+ or rd,   rd, tmp3
+
+
+
+/* int strcmp (const char *s1, const char *s2); */
+
+LEAF(STRCMP)
+ .align 4
+
+ xor tmp1, src1, src2
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ andi src1_off, src1, 0x7
+ ori zeroones, zeroones, 0x101
+ ori sevenf, sevenf, 0xf7f
+ andi tmp1, tmp1, 0x7
+ bstrins.d zeroones, zeroones, 63, 32
+ bstrins.d sevenf, sevenf, 63, 32
+ bnez tmp1, strcmp_misaligned8
+ bnez src1_off, strcmp_mutual_align
+strcmp_loop_aligned:
+ ld.d data1, src1, 0
+ addi.d src1, src1, 8
+ ld.d data2, src2, 0
+ addi.d src2, src2, 8
+strcmp_start_realigned:
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ beqz syndrome, strcmp_loop_aligned
+
+strcmp_end:
+ ctz.d pos, syndrome
+ bstrins.d pos, zero, 2, 0
+ srl.d data1, data1, pos
+ srl.d data2, data2, pos
+ andi data1, data1, 0xff
+ andi data2, data2, 0xff
+ sub.d result, data1, data2
+ jr ra
+strcmp_mutual_align:
+ bstrins.d src1, zero, 2, 0
+ bstrins.d src2, zero, 2, 0
+ slli.d tmp1, src1_off,  0x3
+ ld.d data1, src1, 0
+ sub.d tmp1, zero, tmp1
+ ld.d data2, src2, 0
+ addi.d src1, src1, 8
+ addi.d src2, src2, 8
+ nor tmp2, zero, zero
+ srl.d tmp2, tmp2, tmp1
+ or data1, data1, tmp2
+ or data2, data2, tmp2
+ b strcmp_start_realigned
+
+strcmp_misaligned8:
+
+/* check if ((src1_off != 0) && ((src2_off == 0) || (src1_off < src2_off)));
+   if so, exchange src1 and src2
+*/
+ andi src2_off, src2, 0x7
+ slt tmp2, src1_off, src2_off
+ CONDITIONSEL(tmp2,src2_off,tmp2,tmp1)
+ maskeqz exchange, tmp2, src1_off
+ xor tmp3, src1, src2
+ maskeqz tmp3, tmp3, exchange
+ xor src1, src1, tmp3
+ xor src2, src2, tmp3
+
+ andi src1_off, src1, 0x7
+ beqz src1_off, strcmp_loop_misaligned
+strcmp_do_misaligned:
+ ld.bu data1, src1, 0
+ ld.bu data2, src2, 0
+ xor tmp3, data1, data2
+ addi.d src1, src1, 1
+ masknez tmp3, data1, tmp3
+ addi.d src2, src2, 1
+ beqz tmp3, strcmp_done
+ andi src1_off, src1, 0x7
+ bnez src1_off, strcmp_do_misaligned
+
+strcmp_loop_misaligned:
+ andi tmp1, src2, 0xff8
+ xori tmp1, tmp1, 0xff8
+ beqz tmp1, strcmp_do_misaligned
+ ld.d data1, src1, 0
+ ld.d data2, src2, 0
+ addi.d src1, src1, 8
+ addi.d src2, src2, 8
+
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ beqz syndrome, strcmp_loop_misaligned
+
+strcmp_misalign_end:
+ ctz.d pos, syndrome
+ bstrins.d pos, zero, 2, 0
+ srl.d data1, data1, pos
+ srl.d data2, data2, pos
+ andi data1, data1, 0xff
+ andi data2, data2, 0xff
+ sub.d tmp1, data1, data2
+ sub.d tmp2, data2, data1
+ CONDITIONSEL(result,exchange,tmp2,tmp1)
+ jr ra
+
+strcmp_done:
+ sub.d tmp1, data1, data2
+ sub.d tmp2, data2, data1
+ CONDITIONSEL(result,exchange,tmp2,tmp1)
+ jr ra
+END(STRCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strcmp)
+#endif
+#endif
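
The aligned comparison loop above can be sketched in C as follows.  An
illustrative model only (strcmp_word_loop is a made-up name); it assumes
both pointers may safely be read in 8-byte chunks, which the real code
guarantees with its alignment and page checks.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Core of the aligned loop: stop when the two words differ (diff) or
   data1 contains a NUL (has_nul); syndrome combines both, and ctz
   locates the first interesting byte on a little-endian machine.  */
static int
strcmp_word_loop (const char *s1, const char *s2)
{
  const uint64_t zeroones = 0x0101010101010101ULL;
  const uint64_t sevenf = 0x7f7f7f7f7f7f7f7fULL;

  for (;;)
    {
      uint64_t data1, data2;
      memcpy (&data1, s1, 8);   /* the assembly uses aligned ld.d */
      memcpy (&data2, s2, 8);
      s1 += 8;
      s2 += 8;

      uint64_t diff = data1 ^ data2;
      uint64_t has_nul = (data1 - zeroones) & ~(data1 | sevenf);
      uint64_t syndrome = diff | has_nul;
      if (syndrome == 0)
        continue;

      /* Round the bit position down to a byte boundary, as the
         assembly's "bstrins.d pos, zero, 2, 0" does, then compare the
         single differing (or NUL) byte.  */
      int pos = __builtin_ctzll (syndrome) & ~7;
      return (int) ((data1 >> pos) & 0xff) - (int) ((data2 >> pos) & 0xff);
    }
}

int
main (void)
{
  char a[16] = "abcdefgh_x";    /* zero padding keeps the 8-byte reads safe */
  char b[16] = "abcdefgh_y";
  printf ("%d\n", strcmp_word_loop (a, b));     /* negative: 'x' < 'y' */
  return 0;
}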
diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
new file mode 100644
index 0000000000..e75eeb06cf
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcpy.S
@@ -0,0 +1,224 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+
+/* Basic algorithm:
+
+   +.  if src is aligned, just run the copy loop; if not, do the cross-page
+   check, copy one doubleword, and then advance src to an aligned address.
+
+   +.  if ((v0 - 0x0101010101010101) & ~v0 & 0x8080808080808080) != 0, v0
+   contains a \0 byte, otherwise it contains none.
+
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRCPY  strcpy
+
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define dest a0
+#define src a1
+#define result v0
+/* Note: v0 = a0 in lp64 ABI */
+
+
+/* Internal variable */
+#define data t0
+#define data1 t1
+#define has_nul t2
+#define diff t3
+#define syndrome t4
+#define zeroones t5
+#define sevenf t6
+#define pos t7
+#define dest_backup t8
+#define tmp1 a4
+#define tmp2 a5
+#define tmp3 a6
+#define dest_off a2
+#define src_off a3
+#define tmp4 a7
+
+/* rd <- rc ? ra : rb; clobbers tmp3.  */
+#define CONDITIONSEL(rd,rc,ra,rb)\
+ masknez tmp3, rb, rc;\
+ maskeqz rd,   ra, rc;\
+ or rd,   rd, tmp3
+
+
+
+/* char * strcpy (char *dest, const char *src); */
+LEAF(STRCPY)
+ .align 4
+
+ move dest_backup, dest
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ ori zeroones, zeroones, 0x101
+ ori sevenf, sevenf, 0xf7f
+ bstrins.d zeroones, zeroones, 63, 32
+ bstrins.d sevenf, sevenf, 63, 32
+ andi src_off, src, 0x7
+ beqz src_off, strcpy_loop_aligned_1
+ b strcpy_mutual_align
+strcpy_loop_aligned:
+ st.d data, dest, 0
+ addi.d dest, dest, 8
+strcpy_loop_aligned_1:
+ ld.d data, src, 0
+ addi.d src, src, 8
+strcpy_start_realigned:
+ sub.d tmp1, data, zeroones
+ or tmp2, data, sevenf
+ andn has_nul, tmp1, tmp2
+ beqz has_nul, strcpy_loop_aligned
+
+strcpy_end:
+
+/* 8 4 2 1 */
+ ctz.d pos, has_nul
+ srli.d pos, pos, 3
+ addi.d pos, pos, 1
+
+/*
+   Do the final 8/4/2/1-byte copy based on pos.
+   pos is the number of bytes to copy, including the final \0, so it is
+   at least 1 and at most 8.
+ */
+strcpy_end_8:
+ andi tmp1, pos, 0x8
+ beqz tmp1, strcpy_end_4
+ st.d data, dest, 0
+ move dest, dest_backup
+ jr ra
+strcpy_end_4:
+ andi tmp1, pos, 0x4
+ beqz tmp1, strcpy_end_2
+ st.w data, dest, 0
+ srli.d data, data, 32
+ addi.d dest, dest, 4
+strcpy_end_2:
+ andi tmp1, pos, 0x2
+ beqz tmp1, strcpy_end_1
+ st.h data, dest, 0
+ srli.d data, data, 16
+ addi.d dest, dest, 2
+strcpy_end_1:
+ andi tmp1, pos, 0x1
+ beqz tmp1, strcpy_end_ret
+ st.b data, dest, 0
+strcpy_end_ret:
+ move result, dest_backup
+ jr ra
+
+strcpy_mutual_align:
+
+/*
+   Check whether src is close to a page boundary.
+   If not, fall through to strcpy_page_cross_ok.
+   If it is, do the further check at strcpy_page_cross.
+   tmp2 caches the comparison constant.
+ */
+ li.w tmp2, 0xff8
+ andi tmp1, src,  0xff8
+ beq tmp1, tmp2, strcpy_page_cross
+
+strcpy_page_cross_ok:
+
+/*
+   Load a misaligned doubleword and check whether it contains a \0.
+   If not, do a misaligned doubleword store.
+   If it does, calculate the number of available bytes,
+   then jump to the 8/4/2/1 ending.
+ */
+ ld.d data, src, 0
+ sub.d tmp1, data, zeroones
+ or tmp2, data, sevenf
+ andn has_nul, tmp1, tmp2
+ bnez has_nul, strcpy_end
+strcpy_mutual_align_finish:
+
+/*
+   Before jumping back to the aligned loop, make dest/src aligned.
+   This stores a few bytes between the first and the second doubleword
+   twice, which is harmless.
+ */
+ li.w tmp1, 8
+ st.d data, dest, 0
+ sub.d tmp1, tmp1, src_off
+ add.d src,  src,  tmp1
+ add.d dest, dest, tmp1
+
+ b strcpy_loop_aligned_1
+
+strcpy_page_cross:
+
+/*
+   ld.d from the aligned address (src & ~0x7).
+   Check whether the high bytes contain a \0.
+   If not, go back to strcpy_page_cross_ok, since in that case the string
+   really does cross the page boundary.
+   If they do, shift data right so that it looks like a doubleword loaded
+   directly from src, then go to the 8/4/2/1 strcpy ending.
+
+   tmp4 is the 0xffff...ffff mask,
+   tmp2 is the shift count for the bytes to be masked out:
+   tmp2 = src_off << 3
+   data = data >> (src_off * 8) | -1 << (64 - src_off * 8)
+   where
+   -1 << (64 - src_off * 8)  ==  ~(-1 >> (src_off * 8))
+
+ */
+ li.w tmp1, 0x7
+ andn tmp3, src,  tmp1
+ ld.d data, tmp3, 0
+ li.w tmp4, -1
+ slli.d tmp2, src_off, 3
+ srl.d tmp4, tmp4, tmp2
+ srl.d data, data, tmp2
+ nor tmp4, tmp4, zero
+ or data, data, tmp4
+ sub.d tmp1, data, zeroones
+ or tmp2, data, sevenf
+ andn has_nul, tmp1, tmp2
+ beqz has_nul, strcpy_page_cross_ok
+ b strcpy_end
+END(STRCPY)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strcpy)
+#endif
+#endif
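
The strcpy_page_cross path above can be modeled in C: load from the
aligned address, shift the string bytes down, and fill the vacated high
bytes with 0xff so they can never look like '\0'.  A sketch with made-up
names, not part of the patch:

#include <stdint.h>
#include <string.h>

/* Safe replacement for an 8-byte load at src when the load might cross
   into an unmapped page: read from src & ~0x7 instead (same page as
   src), then shift and mask.  Only bytes actually taken from the string
   can test as zero afterwards.  */
static uint64_t
load_across_page (const char *src)
{
  unsigned off = (uintptr_t) src & 0x7;
  unsigned shift = off * 8;     /* the assembly's tmp2 = src_off << 3 */
  uint64_t data;
  memcpy (&data, src - off, 8); /* aligned ld.d */
  /* data >> shift | ~(-1 >> shift); as the comment above derives,
     -1 << (64 - off*8) == ~(-1 >> (off*8)).  */
  return (data >> shift) | ~(~0ULL >> shift);
}

int
main (void)
{
  union { uint64_t align; char bytes[16]; } buf = { 0 };
  strcpy (buf.bytes + 5, "hi");
  uint64_t w = load_across_page (buf.bytes + 5);
  /* low bytes are 'h', 'i', '\0'; everything above them reads as 0xff */
  return ((w & 0xff) == 'h' && ((w >> 16) & 0xff) == 0) ? 0 : 1;
}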
diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S
new file mode 100644
index 0000000000..ec4bad21a6
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strlen.S
@@ -0,0 +1,151 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+/*
+algorithm:
+
+    #.  use ld/ldr to access a word/partial word in the string
+
+    #.  use ((x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)) != 0 to
+    check whether x contains a zero byte
+
+    #.  use ctz((x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)) >> 3
+    to get the index of the first zero byte in dword x;
+
+    #.  use dctz(x) = 64 - dclz(~x & (x-1));
+
+    #.  subtract the pointer to the start of the string from the pointer
+    to the first zero byte to get the length of the string
+
+*/
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+
+#define L_ADDIU addi.d
+#define L_ADDU add.d
+#define L_SUBU sub.d
+
+#define STRLEN strlen
+#define L(x) x
+
+
+/* size_t strlen (const char *s1); */
+
+ .text;
+ .globl strlen;
+ .align 5;
+ cfi_startproc;
+ .type strlen, @function;
+strlen:
+
+ /* LEAF(strlen) */
+ #preld 0, a0, 0
+
+ nor t4, zero, zero
+ lu12i.w a2, 0x01010
+ andi t5, a0, 0x7
+
+ li.w t7, 0x7
+ slli.d t6, t5, 0x3
+ andn t7, a0, t7
+ ld.d a1, t7, 0
+ sub.d t7, zero, t6
+ sll.d t4, t4, t7
+ maskeqz t4, t4, t6
+ srl.d a1, a1, t6
+ or a1, a1, t4
+
+
+ ori a2, a2, 0x101
+ nor t1, a1, zero
+ li.w a4, 8
+
+ #preld 0, a0, 32
+ bstrins.d a2, a2, 63, 32
+ sub.d a5, a4, t5
+ move t5, a0
+
+ sub.d t0, a1, a2
+ slli.d t4, a2, 7
+ nor a3, zero, t4
+ nor t1, a1, a3
+
+ and t0, t0, t1
+ #preld 0, a0, 64
+
+/* instead of using bnel with daddu a0, a0, a5 in its branch delay slot */
+
+ bnez t0, strlen_count1
+ L_ADDU a0, a0, a5
+strlen_loop:
+ ld.d a1, a0, 0
+ sub.d t0, a1, a2
+ and t1, t0, t4
+ bnez t1, strlen_count_pre
+ ld.d a1, a0, 8
+ sub.d t0, a1, a2
+ and t1, t0, t4
+ L_ADDIU a0, a0, 16
+ beqz t1, strlen_loop
+strlen_count:
+ addi.d a0, a0, -8
+strlen_count_pre:
+ nor t1, a1, a3
+ and t0, t0, t1
+ beqz t0, strlen_noascii_start
+strlen_count1:
+ ctz.d t1, t0
+ L_SUBU v0, a0, t5
+ srli.w t1, t1, 3
+ L_ADDU v0, v0, t1
+ jr ra
+strlen_noascii_start:
+ addi.d a0, a0, 8
+strlen_loop_noascii:
+ ld.d a1, a0, 0
+ sub.d t0, a1, a2
+ nor t1, a1, a3
+ and t0, t0, t1
+ bnez t0, strlen_count1
+ ld.d a1, a0, 8
+ sub.d t0, a1, a2
+ nor t1, a1, a3
+ and t0, t0, t1
+ L_ADDIU a0, a0, 16
+ beqz t0, strlen_loop_noascii
+ addi.d a0, a0, -8
+ ctz.d t1, t0
+ L_SUBU v0, a0, t5
+ srli.w t1, t1, 3
+ L_ADDU v0, v0, t1
+ jr ra
+END(STRLEN)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strlen)
+#endif
+#endif
+
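
The dctz identity quoted in the algorithm notes can be checked with a
small C program.  LoongArch has a native ctz.d (used above), so the
identity mainly matters on ISAs that only provide a leading-zero count;
ctz64_via_clz is a made-up name for this self-check.

#include <assert.h>
#include <stdint.h>

/* ctz(x) = 64 - clz(~x & (x - 1)) for nonzero x: ~x & (x - 1) isolates
   exactly the bits below the lowest set bit of x.  */
static int
ctz64_via_clz (uint64_t x)
{
  uint64_t below = ~x & (x - 1);
  return 64 - (below ? __builtin_clzll (below) : 64);
}

int
main (void)
{
  for (int k = 0; k < 64; k++)
    assert (ctz64_via_clz (1ULL << k) == k);
  assert (ctz64_via_clz (0xf000) == 12);
  return 0;
}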
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
new file mode 100644
index 0000000000..e95f5f7a36
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strncmp.S
@@ -0,0 +1,282 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+
+/* Basic algorithm:
+
+   +.   let t0, t1 point to a0, a1; if the low 3 bits of a0 are smaller than
+   those of a1, set a4 to 1 and let t0 point to the string whose low 3 bits
+   are larger;
+
+   +.   if the low 3 bits of a0 equal the low 3 bits of a1, use ldr once and
+   ld from then on;
+
+   +.   if not, load partial words into t2 and t3 and check whether t2
+   contains a \0;
+
+   +.   then use ld for t0 and ldr for t1;
+
+   +.   if the partial 8 bytes from t1 contain a \0, compare them with the
+   8 bytes from t0 under the mask in a7;
+
+   +.   if not, use ldl to load the other part of t1 and compare the full
+   8 bytes from t1 with the 8 bytes from t0;
+
+   +.   if ((v0 - 0x0101010101010101) & ~v0 & 0x8080808080808080) != 0, v0
+   contains a \0 byte, otherwise it contains none;
+
+   +.   for the partial 8 bytes from ldr t3, 0(a0), preload t3 with
+   0xffffffffffffffff.
+
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRNCMP strncmp
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define src1 a0
+#define src2 a1
+#define limit a2
+/* Note: v0 = a0 in lp64 ABI */
+#define result v0
+
+
+/* Internal variable */
+#define data1 t0
+#define data2 t1
+#define has_nul t2
+#define diff t3
+#define syndrome t4
+#define zeroones t5
+#define sevenf t6
+#define pos t7
+#define exchange t8
+#define tmp1 a5
+#define tmp2 a6
+#define tmp3 a7
+#define src1_off a3
+#define limit_wd a4
+
+
+/* int strncmp (const char *s1, const char *s2, size_t n); */
+
+LEAF(STRNCMP)
+ .align 4
+ beqz limit, strncmp_ret0
+
+ xor tmp1, src1, src2
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ andi src1_off, src1, 0x7
+ ori zeroones, zeroones, 0x101
+ andi tmp1, tmp1, 0x7
+ ori sevenf, sevenf, 0xf7f
+ bstrins.d zeroones, zeroones, 63, 32
+ bstrins.d sevenf, sevenf, 63, 32
+ bnez tmp1, strncmp_misaligned8
+ bnez src1_off, strncmp_mutual_align
+
+ addi.d limit_wd, limit, -1
+ srli.d limit_wd, limit_wd, 3
+
+strncmp_loop_aligned:
+ ld.d data1, src1, 0
+ addi.d src1, src1, 8
+ ld.d data2, src2, 0
+ addi.d src2, src2, 8
+
+strncmp_start_realigned:
+ addi.d limit_wd, limit_wd, -1
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ srli.d tmp1, limit_wd, 63
+ or syndrome, diff, has_nul
+ or tmp2, syndrome, tmp1
+ beqz tmp2, strncmp_loop_aligned
+
+ /* the limit has not been reached */
+ bge limit_wd, zero, strncmp_not_limit
+ /* the limit has been reached */
+ andi limit, limit, 0x7
+ li.w tmp1, 0x8
+ sub.d limit, tmp1, limit
+ slli.d limit, limit, 0x3
+ li.d tmp1, -1
+ srl.d tmp1, tmp1, limit
+ and data1, data1, tmp1
+ and data2, data2, tmp1
+ orn syndrome, syndrome, tmp1
+
+
+strncmp_not_limit:
+ ctz.d pos, syndrome
+ bstrins.d pos, zero, 2, 0
+ srl.d data1, data1, pos
+ srl.d data2, data2, pos
+ andi data1, data1, 0xff
+ andi data2, data2, 0xff
+ sub.d result, data1, data2
+ jr ra
+
+
+
+strncmp_mutual_align:
+ bstrins.d src1, zero, 2, 0
+ bstrins.d src2, zero, 2, 0
+ slli.d tmp1, src1_off,  0x3
+ ld.d data1, src1, 0
+ ld.d data2, src2, 0
+ addi.d src2, src2, 8
+ addi.d src1, src1, 8
+
+ addi.d limit_wd, limit, -1
+ andi tmp3, limit_wd, 0x7
+ srli.d limit_wd, limit_wd, 3
+ add.d limit, limit, src1_off
+ add.d tmp3, tmp3, src1_off
+ srli.d tmp3, tmp3, 3
+ add.d limit_wd, limit_wd, tmp3
+
+ sub.d tmp1, zero, tmp1
+ nor tmp2, zero, zero
+ srl.d tmp2, tmp2, tmp1
+ or data1, data1, tmp2
+ or data2, data2, tmp2
+ b strncmp_start_realigned
+
+strncmp_misaligned8:
+ li.w tmp1, 0x10
+ bge limit, tmp1, strncmp_try_words
+
+strncmp_byte_loop:
+ ld.bu data1, src1, 0
+ ld.bu data2, src2, 0
+ addi.d limit, limit, -1
+ xor tmp1, data1, data2
+ masknez tmp1, data1, tmp1
+ maskeqz tmp1, limit, tmp1
+ beqz tmp1, strncmp_done
+
+ ld.bu data1, src1, 1
+ ld.bu data2, src2, 1
+ addi.d src1, src1, 2
+ addi.d src2, src2, 2
+ addi.d limit, limit, -1
+ xor tmp1, data1, data2
+ masknez tmp1, data1, tmp1
+ maskeqz tmp1, limit, tmp1
+ bnez tmp1, strncmp_byte_loop
+
+
+strncmp_done:
+ sub.d result, data1, data2
+ jr ra
+
+strncmp_try_words:
+ srli.d limit_wd, limit, 3
+ beqz src1_off, strncmp_do_misaligned
+
+ sub.d src1_off, zero, src1_off
+ andi src1_off, src1_off, 0x7
+ sub.d limit, limit, src1_off
+ srli.d limit_wd, limit, 0x3
+
+
+strncmp_page_end_loop:
+ ld.bu data1, src1, 0
+ ld.bu data2, src2, 0
+ addi.d src1, src1, 1
+ addi.d src2, src2, 1
+ xor tmp1, data1, data2
+ masknez tmp1, data1, tmp1
+ beqz tmp1, strncmp_done
+ andi tmp1, src1, 0x7
+ bnez tmp1, strncmp_page_end_loop
+strncmp_do_misaligned:
+ li.w src1_off, 0x8
+ addi.d limit_wd, limit_wd, -1
+ blt limit_wd, zero, strncmp_done_loop
+
+strncmp_loop_misaligned:
+ andi tmp2, src2, 0xff8
+ xori tmp2, tmp2, 0xff8
+ beqz tmp2, strncmp_page_end_loop
+
+ ld.d data1, src1, 0
+ ld.d data2, src2, 0
+ addi.d src1, src1, 8
+ addi.d src2, src2, 8
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ bnez syndrome, strncmp_not_limit
+ addi.d limit_wd, limit_wd, -1
+ bge limit_wd, zero, strncmp_loop_misaligned
+
+strncmp_done_loop:
+ andi limit, limit, 0x7
+ beqz limit, strncmp_not_limit
+ /* Read the last doubleword, but first check whether that read would
+    cross a page boundary.  */
+ andi tmp1, src2, 0x7
+ andi tmp2, src2, 0xff8
+ add.d tmp1, tmp1, limit
+ xori tmp2, tmp2, 0xff8
+ andi tmp1, tmp1, 0x8
+ masknez tmp1, tmp1, tmp2
+ bnez tmp1, strncmp_byte_loop
+ addi.d src1, src1, -8
+ addi.d src2, src2, -8
+ ldx.d data1, src1, limit
+ ldx.d data2, src2, limit
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ bnez syndrome, strncmp_not_limit
+
+strncmp_ret0:
+ move result, zero
+ jr ra
+
+END(STRNCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strncmp)
+#endif
+#endif
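
The misaligned loop above never lets the unaligned ld.d of src2 touch
the next page: it falls back to byte compares whenever src2 is within
the last 8 bytes of a 4 KiB page.  The guard can be written in C as
below (a sketch; near_page_end is a made-up name).  Note the test is
slightly conservative: it also fires when src2 is exactly 8-byte
aligned at offset 0xff8, where the load would in fact still fit.

#include <stdint.h>

/* 1 iff an 8-byte load at p could run into the next 4 KiB page, i.e. p
   lies in the page's last 8 bytes.  This is the assembly's
   "andi tmp, src2, 0xff8; xori tmp, tmp, 0xff8; beqz" test.  */
static int
near_page_end (const void *p)
{
  return ((uintptr_t) p & 0xff8) == 0xff8;
}

int
main (void)
{
  return (near_page_end ((void *) 0x1ffd)
          && !near_page_end ((void *) 0x1f80)) ? 0 : 1;
}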
diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S
new file mode 100644
index 0000000000..b475938daf
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strnlen.S
@@ -0,0 +1,168 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   Contributed by Loongson Technology Corporation Limited.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch
+ * ABI: lp64
+ */
+/*
+algorithm:
+
+    #.  use ld/ldr to access a word/partial word in the string
+
+    #.  use ((x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)) != 0 to
+    check whether x contains a zero byte
+
+    #.  use ctz((x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)) >> 3
+    to get the index of the first zero byte in dword x;
+
+    #.  use dctz(x) = 64 - dclz(~x & (x-1));
+
+    #.  subtract the pointer to the start of the string from the pointer
+    to the first zero byte to get the length of the string
+
+ */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+
+#define L_ADDIU addi.d
+#define L_ADDU add.d
+#define L_SUBU sub.d
+
+#define STRNLEN __strnlen
+#define L(x) x
+/* rd <- rc ? ra : rb; clobbers a5.  */
+
+#define CONDITIONSEL(rd,ra,rb,rc)\
+ masknez a5, rb, rc;\
+ maskeqz rd, ra, rc;\
+ or rd, rd, a5
+
+
+/* Parameters and Results */
+#define srcin a0
+#define limit a1
+#define len v0
+
+
+/* Internal variable */
+#define data1 t0
+#define data2 t1
+#define has_nul1 t2
+#define has_nul2 t3
+#define src t4
+#define zeroones t5
+#define sevenf t6
+#define data2a t7
+#define tmp6 t7
+#define pos t8
+#define tmp1 a2
+#define tmp2 a3
+#define tmp3 a4
+#define tmp4 a5
+#define tmp5 a6
+#define limit_wd a7
+
+
+
+/* size_t strnlen (const char *s1, size_t maxlen); */
+
+LEAF(STRNLEN)
+
+ .align 4
+ beqz limit, L(_hit_limit)
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ ori zeroones, zeroones, 0x101
+ ori sevenf, sevenf, 0xf7f
+ bstrins.d zeroones, zeroones, 63, 32
+ bstrins.d sevenf, sevenf, 63, 32
+ andi tmp1, srcin, 15
+ sub.d src, srcin, tmp1
+ bnez tmp1, L(misaligned)
+ addi.d limit_wd, limit, -1
+ srli.d limit_wd, limit_wd, 4
+L(_loop):
+ ld.d data1, src, 0
+ ld.d data2, src, 8
+ addi.d src, src, 16
+L(_realigned):
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ sub.d tmp3, data2, zeroones
+ or tmp4, data2, sevenf
+ andn has_nul1, tmp1, tmp2
+ andn has_nul2, tmp3, tmp4
+ addi.d limit_wd, limit_wd, -1
+ srli.d tmp1, limit_wd, 63
+ or tmp2, has_nul1, has_nul2
+ or tmp3, tmp1, tmp2
+ beqz tmp3, L(_loop)
+ beqz tmp2, L(_hit_limit)
+ sub.d len, src, srcin
+ beqz has_nul1, L(_nul_in_data2)
+ move has_nul2, has_nul1
+ addi.d len, len, -8
+L(_nul_in_data2):
+ ctz.d pos, has_nul2
+ srli.d pos, pos, 3
+ addi.d len, len, -8
+ add.d len, len, pos
+ sltu tmp1, len, limit
+ CONDITIONSEL(len,len,limit,tmp1)
+ jr ra
+
+
+L(misaligned):
+ addi.d limit_wd, limit, -1
+ sub.d tmp4, zero, tmp1
+ andi tmp3, limit_wd, 15
+ srli.d limit_wd, limit_wd, 4
+ li.d tmp5, -1
+ ld.d data1, src, 0
+ ld.d data2, src, 8
+ addi.d src, src, 16
+ slli.d tmp4, tmp4, 3
+ add.d tmp3, tmp3, tmp1
+ srl.d tmp2, tmp5, tmp4
+ srli.d tmp3, tmp3, 4
+ add.d limit_wd, limit_wd, tmp3
+ or data1, data1, tmp2
+ or data2a, data2, tmp2
+ li.w tmp3, 9
+ sltu tmp1, tmp1, tmp3
+ CONDITIONSEL(data1,data1,tmp5,tmp1)
+ CONDITIONSEL(data2,data2,data2a,tmp1)
+ b L(_realigned)
+
+
+L(_hit_limit):
+ move len, limit
+ jr ra
+END(STRNLEN)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
+libc_hidden_def (__strnlen)
+#endif
+#endif
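
Finally, all seven files build branchless selects out of the
maskeqz/masknez pair (the MOVN, MOVZ and CONDITIONSEL macros above).
The construction, modeled in C with made-up names:

#include <assert.h>
#include <stdint.h>

/* LoongArch semantics:
   maskeqz rd, rj, rk  ->  rd = (rk != 0) ? rj : 0
   masknez rd, rj, rk  ->  rd = (rk == 0) ? rj : 0
   ORing the two gives the select the CONDITIONSEL macro implements:
   rd = rc ? ra : rb, with no branch.  */
static uint64_t
maskeqz (uint64_t rj, uint64_t rk) { return rk != 0 ? rj : 0; }

static uint64_t
masknez (uint64_t rj, uint64_t rk) { return rk == 0 ? rj : 0; }

static uint64_t
conditionsel (uint64_t rc, uint64_t ra, uint64_t rb)
{
  return maskeqz (ra, rc) | masknez (rb, rc);
}

int
main (void)
{
  assert (conditionsel (1, 10, 20) == 10);
  assert (conditionsel (0, 10, 20) == 20);
  return 0;
}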