[PATCH 1/1] LoongArch: Add optimized memmove, strchr, strchrnul, strcmp and strncmp.
dengjianbo
dengjianbo@loongson.cn
Fri Sep 2 08:39:08 GMT 2022
---
sysdeps/loongarch/lp64/memmove.S | 491 +++++++++++++++++++++++++++++
sysdeps/loongarch/lp64/strchr.S | 145 +++++++++
sysdeps/loongarch/lp64/strchrnul.S | 160 ++++++++++
sysdeps/loongarch/lp64/strcmp.S | 210 ++++++++++++
sysdeps/loongarch/lp64/strncmp.S | 281 +++++++++++++++++
5 files changed, 1287 insertions(+)
create mode 100644 sysdeps/loongarch/lp64/memmove.S
create mode 100644 sysdeps/loongarch/lp64/strchr.S
create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
create mode 100644 sysdeps/loongarch/lp64/strcmp.S
create mode 100644 sysdeps/loongarch/lp64/strncmp.S
diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
new file mode 100644
index 0000000000..632820404e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/memmove.S
@@ -0,0 +1,491 @@
+/* Assembly implementation of memmove.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+/* Allow the routine to be named something else if desired. */
+#ifndef MEMMOVE_NAME
+#define MEMMOVE_NAME memmove
+#endif
+
+#define LD_64(reg, n) \
+ ld.d t0, reg, n; \
+ ld.d t1, reg, n+8; \
+ ld.d t2, reg, n+16; \
+ ld.d t3, reg, n+24; \
+ ld.d t4, reg, n+32; \
+ ld.d t5, reg, n+40; \
+ ld.d t6, reg, n+48; \
+ ld.d t7, reg, n+56;
+
+
+#define ST_64(reg, n) \
+ st.d t0, reg, n; \
+ st.d t1, reg, n+8; \
+ st.d t2, reg, n+16; \
+ st.d t3, reg, n+24; \
+ st.d t4, reg, n+32; \
+ st.d t5, reg, n+40; \
+ st.d t6, reg, n+48; \
+ st.d t7, reg, n+56;
+
+#define LDST_1024 \
+ LD_64(a1, 0); \
+ ST_64(a0, 0); \
+ LD_64(a1, 64); \
+ ST_64(a0, 64); \
+ LD_64(a1, 128); \
+ ST_64(a0, 128); \
+ LD_64(a1, 192); \
+ ST_64(a0, 192); \
+ LD_64(a1, 256); \
+ ST_64(a0, 256); \
+ LD_64(a1, 320); \
+ ST_64(a0, 320); \
+ LD_64(a1, 384); \
+ ST_64(a0, 384); \
+ LD_64(a1, 448); \
+ ST_64(a0, 448); \
+ LD_64(a1, 512); \
+ ST_64(a0, 512); \
+ LD_64(a1, 576); \
+ ST_64(a0, 576); \
+ LD_64(a1, 640); \
+ ST_64(a0, 640); \
+ LD_64(a1, 704); \
+ ST_64(a0, 704); \
+ LD_64(a1, 768); \
+ ST_64(a0, 768); \
+ LD_64(a1, 832); \
+ ST_64(a0, 832); \
+ LD_64(a1, 896); \
+ ST_64(a0, 896); \
+ LD_64(a1, 960); \
+ ST_64(a0, 960);
+
+#define LDST_1024_BACK \
+ LD_64(a4, -64); \
+ ST_64(a3, -64); \
+ LD_64(a4, -128); \
+ ST_64(a3, -128); \
+ LD_64(a4, -192); \
+ ST_64(a3, -192); \
+ LD_64(a4, -256); \
+ ST_64(a3, -256); \
+ LD_64(a4, -320); \
+ ST_64(a3, -320); \
+ LD_64(a4, -384); \
+ ST_64(a3, -384); \
+ LD_64(a4, -448); \
+ ST_64(a3, -448); \
+ LD_64(a4, -512); \
+ ST_64(a3, -512); \
+ LD_64(a4, -576); \
+ ST_64(a3, -576); \
+ LD_64(a4, -640); \
+ ST_64(a3, -640); \
+ LD_64(a4, -704); \
+ ST_64(a3, -704); \
+ LD_64(a4, -768); \
+ ST_64(a3, -768); \
+ LD_64(a4, -832); \
+ ST_64(a3, -832); \
+ LD_64(a4, -896); \
+ ST_64(a3, -896); \
+ LD_64(a4, -960); \
+ ST_64(a3, -960); \
+ LD_64(a4, -1024); \
+ ST_64(a3, -1024);
+
+#ifdef ANDROID_CHANGES
+LEAF(MEMMOVE_NAME, 0)
+#else
+LEAF(MEMMOVE_NAME)
+#endif
+
+/* 1st arg: dest pointer: void *dest       $r4 a0 */
+/* 2nd arg: src pointer:  const void *src  $r5 a1 */
+/* 3rd arg: byte count:   size_t n             a2 */
+/* t0~t8 are used as temporaries */
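+
+/* Copy strategy: sizes below 64 bytes are handled with possibly
+   overlapping loads of the head and the tail of the buffer; for 64 bytes
+   and more the direction is chosen from the unsigned distance between
+   dest and src: if (uint64_t) (dest - src) < n, dest lies inside
+   [src, src + n) and the copy runs backward, otherwise forward.  */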
+
+ add.d a4, a1, a2
+ add.d a3, a0, a2
+ beq a1, a0, less_1bytes
+ move t8, a0
+ srai.d a6, a2, 4 #num/16
+ beqz a6, less_16bytes #num<16
+ srai.d a6, a2, 6 #num/64
+	bnez a6, more_64bytes #num>=64
+ srai.d a6, a2, 5
+ beqz a6, less_32bytes #num<32
+
+	ld.d t0, a1, 0 #32<=num<64
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+ ld.d t4, a4, -32
+ ld.d t5, a4, -24
+ ld.d t6, a4, -16
+ ld.d t7, a4, -8
+ st.d t0, a0, 0
+ st.d t1, a0, 8
+ st.d t2, a0, 16
+ st.d t3, a0, 24
+ st.d t4, a3, -32
+ st.d t5, a3, -24
+ st.d t6, a3, -16
+ st.d t7, a3, -8
+
+ jr ra
+
+less_32bytes:
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a4, -16
+ ld.d t3, a4, -8
+ st.d t0, a0, 0
+ st.d t1, a0, 8
+ st.d t2, a3, -16
+ st.d t3, a3, -8
+
+ jr ra
+
+less_16bytes:
+ srai.d a6, a2, 3 #num/8
+ beqz a6, less_8bytes
+
+ ld.d t0, a1, 0
+ ld.d t1, a4, -8
+ st.d t0, a0, 0
+ st.d t1, a3, -8
+
+ jr ra
+
+less_8bytes:
+ srai.d a6, a2, 2
+ beqz a6, less_4bytes
+
+ ld.w t0, a1, 0
+ ld.w t1, a4, -4
+ st.w t0, a0, 0
+ st.w t1, a3, -4
+
+ jr ra
+
+less_4bytes:
+ srai.d a6, a2, 1
+ beqz a6, less_2bytes
+
+ ld.h t0, a1, 0
+ ld.h t1, a4, -2
+ st.h t0, a0, 0
+ st.h t1, a3, -2
+
+ jr ra
+
+less_2bytes:
+ beqz a2, less_1bytes
+
+ ld.b t0, a1, 0
+ st.b t0, a0, 0
+
+ jr ra
+
+less_1bytes:
+ jr ra
+
+more_64bytes:
+ sub.d a7, a0, a1
+ bltu a7, a2, copy_backward
+
+copy_forward:
+ srli.d a0, a0, 3
+ slli.d a0, a0, 3
+ beq a0, t8, all_align
+ addi.d a0, a0, 0x8
+ sub.d a7, t8, a0
+ sub.d a1, a1, a7
+ add.d a2, a7, a2
+
+start_unalign_proc:
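+	/* Computed jump into the byte-copy ladder below: a7 holds minus the
+	   number of head bytes needed to 8-byte-align dest, and each ladder
+	   entry is two 4-byte instructions, so adding a7 * 8 to the address
+	   just past the ladder (pcaddi t1, 18) selects the entry that copies
+	   exactly that many bytes.  The backward path and the tail handling
+	   use the same technique.  */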
+ pcaddi t1, 18
+ slli.d a6, a7, 3
+ add.d t1, t1, a6
+ jirl zero, t1, 0
+
+start_7_unalign:
+ ld.b t0, a1, -7
+ st.b t0, a0, -7
+start_6_unalign:
+ ld.b t0, a1, -6
+ st.b t0, a0, -6
+start_5_unalign:
+ ld.b t0, a1, -5
+ st.b t0, a0, -5
+start_4_unalign:
+ ld.b t0, a1, -4
+ st.b t0, a0, -4
+start_3_unalign:
+ ld.b t0, a1, -3
+ st.b t0, a0, -3
+start_2_unalign:
+ ld.b t0, a1, -2
+ st.b t0, a0, -2
+start_1_unalign:
+ ld.b t0, a1, -1
+ st.b t0, a0, -1
+
+start_over:
+ addi.d a2, a2, -0x80
+ blt a2, zero, end_unalign_proc
+
+loop_less:
+ LD_64(a1, 0)
+ ST_64(a0, 0)
+ LD_64(a1, 64)
+ ST_64(a0, 64)
+
+ addi.d a0, a0, 0x80
+ addi.d a1, a1, 0x80
+ addi.d a2, a2, -0x80
+ bge a2, zero, loop_less
+
+end_unalign_proc:
+ addi.d a2, a2, 0x80
+
+ pcaddi t1, 36
+ andi t2, a2, 0x78
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ sub.d t1, t1, t2
+ jirl zero, t1, 0
+
+end_120_128_unalign:
+ ld.d t0, a1, -120
+ st.d t0, a0, -120
+end_112_120_unalign:
+ ld.d t0, a1, -112
+ st.d t0, a0, -112
+end_104_112_unalign:
+ ld.d t0, a1, -104
+ st.d t0, a0, -104
+end_96_104_unalign:
+ ld.d t0, a1, -96
+ st.d t0, a0, -96
+end_88_96_unalign:
+ ld.d t0, a1, -88
+ st.d t0, a0, -88
+end_80_88_unalign:
+ ld.d t0, a1, -80
+ st.d t0, a0, -80
+end_72_80_unalign:
+ ld.d t0, a1, -72
+ st.d t0, a0, -72
+end_64_72_unalign:
+ ld.d t0, a1, -64
+ st.d t0, a0, -64
+end_56_64_unalign:
+ ld.d t0, a1, -56
+ st.d t0, a0, -56
+end_48_56_unalign:
+ ld.d t0, a1, -48
+ st.d t0, a0, -48
+end_40_48_unalign:
+ ld.d t0, a1, -40
+ st.d t0, a0, -40
+end_32_40_unalign:
+ ld.d t0, a1, -32
+ st.d t0, a0, -32
+end_24_32_unalign:
+ ld.d t0, a1, -24
+ st.d t0, a0, -24
+end_16_24_unalign:
+ ld.d t0, a1, -16
+ st.d t0, a0, -16
+end_8_16_unalign:
+ ld.d t0, a1, -8
+ st.d t0, a0, -8
+end_0_8_unalign:
+ andi a2, a2, 0x7
+ pcaddi t1, 18
+ slli.d a2, a2, 3
+ sub.d t1, t1, a2
+ jirl zero, t1, 0
+end_7_unalign:
+ ld.b t0, a4, -7
+ st.b t0, a3, -7
+end_6_unalign:
+ ld.b t0, a4, -6
+ st.b t0, a3, -6
+end_5_unalign:
+ ld.b t0, a4, -5
+ st.b t0, a3, -5
+end_4_unalign:
+ ld.b t0, a4, -4
+ st.b t0, a3, -4
+end_3_unalign:
+ ld.b t0, a4, -3
+ st.b t0, a3, -3
+end_2_unalign:
+ ld.b t0, a4, -2
+ st.b t0, a3, -2
+end_1_unalign:
+ ld.b t0, a4, -1
+ st.b t0, a3, -1
+end:
+ move v0, t8
+ jr ra
+
+all_align:
+ addi.d a1, a1, 0x8
+ addi.d a0, a0, 0x8
+ ld.d t0, a1, -8
+ st.d t0, a0, -8
+ addi.d a2, a2, -8
+ b start_over
+
+all_align_back:
+ addi.d a4, a4, -0x8
+ addi.d a3, a3, -0x8
+ ld.d t0, a4, 0
+ st.d t0, a3, 0
+ addi.d a2, a2, -8
+ b start_over_back
+
+copy_backward:
+ move a5, a3
+ srli.d a3, a3, 3
+ slli.d a3, a3, 3
+ beq a3, a5, all_align_back
+ sub.d a7, a3, a5
+ add.d a4, a4, a7
+ add.d a2, a7, a2
+
+ pcaddi t1, 18
+ slli.d a6, a7, 3
+ add.d t1, t1, a6
+ jirl zero, t1, 0
+
+ ld.b t0, a4, 6
+ st.b t0, a3, 6
+ ld.b t0, a4, 5
+ st.b t0, a3, 5
+ ld.b t0, a4, 4
+ st.b t0, a3, 4
+ ld.b t0, a4, 3
+ st.b t0, a3, 3
+ ld.b t0, a4, 2
+ st.b t0, a3, 2
+ ld.b t0, a4, 1
+ st.b t0, a3, 1
+ ld.b t0, a4, 0
+ st.b t0, a3, 0
+
+start_over_back:
+ addi.d a2, a2, -0x80
+ blt a2, zero, end_unalign_proc_back
+
+loop_less_back:
+ LD_64(a4, -64)
+ ST_64(a3, -64)
+ LD_64(a4, -128)
+ ST_64(a3, -128)
+
+ addi.d a4, a4, -0x80
+ addi.d a3, a3, -0x80
+ addi.d a2, a2, -0x80
+ bge a2, zero, loop_less_back
+
+end_unalign_proc_back:
+ addi.d a2, a2, 0x80
+
+ pcaddi t1, 36
+ andi t2, a2, 0x78
+ sub.d a4, a4, t2
+ sub.d a3, a3, t2
+ sub.d t1, t1, t2
+ jirl zero, t1, 0
+
+ ld.d t0, a4, 112
+ st.d t0, a3, 112
+ ld.d t0, a4, 104
+ st.d t0, a3, 104
+ ld.d t0, a4, 96
+ st.d t0, a3, 96
+ ld.d t0, a4, 88
+ st.d t0, a3, 88
+ ld.d t0, a4, 80
+ st.d t0, a3, 80
+ ld.d t0, a4, 72
+ st.d t0, a3, 72
+ ld.d t0, a4, 64
+ st.d t0, a3, 64
+ ld.d t0, a4, 56
+ st.d t0, a3, 56
+ ld.d t0, a4, 48
+ st.d t0, a3, 48
+ ld.d t0, a4, 40
+ st.d t0, a3, 40
+ ld.d t0, a4, 32
+ st.d t0, a3, 32
+ ld.d t0, a4, 24
+ st.d t0, a3, 24
+ ld.d t0, a4, 16
+ st.d t0, a3, 16
+ ld.d t0, a4, 8
+ st.d t0, a3, 8
+ ld.d t0, a4, 0
+ st.d t0, a3, 0
+
+ andi a2, a2, 0x7
+ pcaddi t1, 18
+ slli.d a2, a2, 3
+ sub.d t1, t1, a2
+ jirl zero, t1, 0
+
+ ld.b t0, a1, 6
+ st.b t0, a0, 6
+ ld.b t0, a1, 5
+ st.b t0, a0, 5
+ ld.b t0, a1, 4
+ st.b t0, a0, 4
+ ld.b t0, a1, 3
+ st.b t0, a0, 3
+ ld.b t0, a1, 2
+ st.b t0, a0, 2
+ ld.b t0, a1, 1
+ st.b t0, a0, 1
+ ld.b t0, a1, 0
+ st.b t0, a0, 0
+
+ move v0, t8
+ jr ra
+
+END(MEMMOVE_NAME)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMMOVE_NAME)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S
new file mode 100644
index 0000000000..ffe3fbca62
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchr.S
@@ -0,0 +1,145 @@
+/* Assembly implementation of strchr.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. use ld.d and a mask for the first (at most 8) unaligned bytes;
+
+   +. replicate c into all 8 bytes of a1 with bstrins.d;
+
+   +. xor each loaded word with a1 to detect bytes equal to c;
+
+   +. a word v0 contains a \0 byte iff
+      ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0.
+ */
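+
+/* A rough C sketch of the zero-byte test above (illustrative only, not
+   part of the build; the helper name is made up):
+
+     #include <stdint.h>
+
+     static inline int
+     has_zero_byte (uint64_t v)
+     {
+       return ((v - 0x0101010101010101ULL)
+               & ~(v | 0x7f7f7f7f7f7f7f7fULL)) != 0;
+     }
+
+   Applying the same test to v ^ (c replicated into every byte) detects
+   bytes equal to c in the word.  */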
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+#define L_ADDIU addi.d
+#define L_ADDU add.d
+#define L_SUBU sub.d
+
+#define STRCHR strchr
+#define MOVN(rd,rs,rt) \
+ maskeqz t6, rs, rt;\
+ masknez rd, rd, rt;\
+ or rd, rd, t6
+
+#define MOVN2(rd,rt) \
+ masknez rd, rd, rt;\
+ or rd, rd, rt
+
+
+/* char * strchr (const char *s1, int c); */
+
+LEAF(STRCHR)
+ .align 6
+
+ li.w t4, 0x7
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+ andi t0, a0, 0x7
+
+ ori a2, a2, 0x101
+ andn t4, a0, t4
+ slli.w t1, t0, 3
+
+ ld.d t4, t4, 0
+
+ nor t8, zero, zero
+ bstrins.d a1, a1, 31, 16
+ srl.d t4, t4, t1
+
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a2, a2, 63, 32
+ srl.d a7, t8, t1
+
+ li.w t1, 8
+ nor t8, a7, zero
+ slli.d a3, a2, 7
+ or t5, t8, t4
+ and t3, a7, a1
+
+ sub.w t1, t1, t0
+ nor a3, a3, zero
+ xor t2, t5, t3
+ sub.d a7, t5, a2
+ nor a6, t5, a3
+
+ sub.d a5, t2, a2
+ nor a4, t2, a3
+
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+ L_ADDU a0, a0, t1
+L(_aloop):
+ ld.d t4, a0, 0
+
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+ ld.d t4, a0, 8
+ L_ADDIU a0, a0, 16
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ beqz a7, L(_aloop)
+
+ L_ADDIU a0, a0, -8
+L(_mc8_a):
+
+ ctz.d t0, a5
+ ctz.d t2, a6
+
+ srli.w t0, t0, 3
+ srli.w t2, t2, 3
+ sltu t1, t2, t0
+ L_ADDU v0, a0, t0
+ masknez v0, v0, t1
+ jr ra
+END(STRCHR)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strchr)
+weak_alias (strchr, index)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
new file mode 100644
index 0000000000..dcbfded765
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchrnul.S
@@ -0,0 +1,160 @@
+/* Assembly implementation of strchrnul.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. use ld.d and a mask for the first (at most 8) unaligned bytes;
+
+   +. replicate c into all 8 bytes of a1 with bstrins.d;
+
+   +. xor each loaded word with a1 to detect bytes equal to c;
+
+   +. a word v0 contains a \0 byte iff
+      ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0.
+ */
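+
+/* Rough C sketch of how the return position is chosen once the loop finds
+   a word containing either \0 or c (illustrative only, not part of the
+   build; the helper name is made up).  Each detection value has its
+   lowest set bit inside the first matching byte, so counting trailing
+   zeros and dividing by 8 yields that byte's index:
+
+     #include <stdint.h>
+
+     static inline unsigned int
+     first_match_index (uint64_t mask)
+     {
+       return mask ? (unsigned int) __builtin_ctzll (mask) >> 3 : 8;
+     }
+
+   The function returns s + min (first_match_index (nul_mask),
+   first_match_index (chr_mask)): the first occurrence of c or, failing
+   that, the terminating \0.  */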
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+
+#define L_ADDIU addi.d
+#define L_ADDU add.d
+#define L_SUBU sub.d
+
+#define STRCHRNUL __strchrnul
+
+#define MOVN(rd,rs,rt) \
+ maskeqz t6, rs, rt;\
+ masknez rd, rd, rt;\
+ or rd, rd, t6
+
+#define MOVZ(rd,rs,rt) \
+ masknez t6, rs, rt;\
+ maskeqz rd, rd, rt;\
+ or rd, rd, t6
+
+
+#define MOVN2(rd,rt) \
+ masknez rd, rd, rt;\
+ or rd, rd, rt
+
+/* char * strchrnul (const char *s1, int c); */
+
+LEAF(STRCHRNUL)
+ .align 6
+
+ li.w t4, 0x7
+ lu12i.w a2, 0x01010
+ bstrins.d a1, a1, 15, 8
+ andi t0, a0, 0x7
+
+ ori a2, a2, 0x101
+ andn t4, a0, t4
+ slli.w t1, t0, 3
+
+ ld.d t4, t4, 0
+
+ nor t8, zero, zero
+ bstrins.d a1, a1, 31, 16
+ srl.d t4, t4, t1
+
+ preld 0, a0, 32
+ bstrins.d a1, a1, 63, 32
+ bstrins.d a2, a2, 63, 32
+ srl.d a7, t8, t1
+
+ nor t8, a7, zero
+ slli.d a3, a2, 7
+ or t5, t8, t4
+ and t3, a7, a1
+
+ nor a3, a3, zero
+ xor t2, t5, t3
+ sub.d a7, t5, a2
+ nor a6, t5, a3
+
+ li.w t1, 8
+ sub.d a5, t2, a2
+ nor a4, t2, a3
+
+ and a6, a7, a6
+ and a5, a5, a4
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+
+ sub.w t1, t1, t0
+ L_ADDU a0, a0, t1
+L(_aloop):
+ ld.d t4, a0, 0
+
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+
+ or a7, a6, a5
+ bnez a7, L(_mc8_a)
+
+ ld.d t4, a0, 8
+ L_ADDIU a0, a0, 16
+
+ xor t2, t4, a1
+ sub.d a7, t4, a2
+ nor a6, t4, a3
+ sub.d a5, t2, a2
+
+ nor a4, t2, a3
+ and a6, a7, a6
+ and a5, a5, a4
+
+ or a7, a6, a5
+ beqz a7, L(_aloop)
+
+ L_ADDIU a0, a0, -8
+L(_mc8_a):
+
+ ctz.d t0, a5
+ ctz.d t2, a6
+
+ srli.w t0, t0, 3
+ srli.w t2, t2, 3
+ slt t1, t0, t2
+
+ MOVZ(t0,t2,t1)
+
+ L_ADDU v0, a0, t0
+ jr ra
+END(STRCHRNUL)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+weak_alias(__strchrnul, strchrnul)
+libc_hidden_builtin_def (__strchrnul)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
new file mode 100644
index 0000000000..8160ae432d
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcmp.S
@@ -0,0 +1,210 @@
+/* Assembly implementation of strcmp.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. if src1 and src2 have the same low 3 bits, align both pointers down
+      to an 8-byte boundary (when they are not already aligned), force the
+      bytes before the real start to 0xff in both loaded words, and then
+      compare 8 bytes per iteration;
+
+   +. otherwise, possibly exchange src1 and src2 (the swap is recorded in
+      `exchange' so the sign of the result can be fixed up on return),
+      compare byte by byte until src1 is 8-byte aligned, then 8 bytes per
+      iteration, dropping back to byte compares whenever the unaligned
+      load from src2 could cross a page boundary;
+
+   +. a word v0 contains a \0 byte iff
+      ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0;
+
+   +. the loop stops as soon as the two words differ or one contains \0;
+      ctz.d locates the first differing or \0 byte and the difference of
+      those two bytes is returned.
+ */
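+
+/* Rough C sketch of one step of the aligned loop below (illustrative
+   only, not part of the build; the function name is made up).  On
+   little-endian LoongArch the first differing or \0 byte corresponds to
+   the lowest set bit of the syndrome:
+
+     #include <stdint.h>
+
+     static inline int
+     compare_words (uint64_t w1, uint64_t w2, int *result)
+     {
+       uint64_t has_nul = (w1 - 0x0101010101010101ULL)
+                          & ~(w1 | 0x7f7f7f7f7f7f7f7fULL);
+       uint64_t syndrome = (w1 ^ w2) | has_nul;
+       if (syndrome == 0)
+         return 0;
+       unsigned int shift = __builtin_ctzll (syndrome) & ~7u;
+       *result = (int) ((w1 >> shift) & 0xff) - (int) ((w2 >> shift) & 0xff);
+       return 1;
+     }
+
+   A zero return means the words are equal and contain no \0, so the loop
+   continues; otherwise *result already holds the final comparison value.  */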
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRCMP strcmp
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define src1 a0
+#define src2 a1
+#define result v0
+/* Note: v0 = a0 in lp64 ABI */
+
+
+/* Internal variable */
+#define data1 t0
+#define data2 t1
+#define has_nul t2
+#define diff t3
+#define syndrome t4
+#define zeroones t5
+#define sevenf t6
+#define pos t7
+#define exchange t8
+#define tmp1 a4
+#define tmp2 a5
+#define tmp3 a6
+#define src1_off a2
+#define src2_off a3
+#define tmp4 a7
+
+/* rd = rc ? ra : rb; clobbers tmp3.  */
+
+#define CONDITIONSEL(rd,rc,ra,rb)\
+ masknez tmp3, rb, rc;\
+ maskeqz rd, ra, rc;\
+ or rd, rd, tmp3
+
+
+
+/* int strcmp (const char *s1, const char *s2); */
+
+LEAF(STRCMP)
+ .align 4
+
+ xor tmp1, src1, src2
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ andi src1_off, src1, 0x7
+ ori zeroones, zeroones, 0x101
+ ori sevenf, sevenf, 0xf7f
+ andi tmp1, tmp1, 0x7
+ bstrins.d zeroones, zeroones, 63, 32
+ bstrins.d sevenf, sevenf, 63, 32
+ bnez tmp1, strcmp_misaligned8
+ bnez src1_off, strcmp_mutual_align
+strcmp_loop_aligned:
+ ld.d data1, src1, 0
+ addi.d src1, src1, 8
+ ld.d data2, src2, 0
+ addi.d src2, src2, 8
+strcmp_start_realigned:
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ beqz syndrome, strcmp_loop_aligned
+
+strcmp_end:
+ ctz.d pos, syndrome
+ bstrins.d pos, zero, 2, 0
+ srl.d data1, data1, pos
+ srl.d data2, data2, pos
+ andi data1, data1, 0xff
+ andi data2, data2, 0xff
+ sub.d result, data1, data2
+ jr ra
+strcmp_mutual_align:
+ bstrins.d src1, zero, 2, 0
+ bstrins.d src2, zero, 2, 0
+ slli.d tmp1, src1_off, 0x3
+ ld.d data1, src1, 0
+ sub.d tmp1, zero, tmp1
+ ld.d data2, src2, 0
+ addi.d src1, src1, 8
+ addi.d src2, src2, 8
+ nor tmp2, zero, zero
+ srl.d tmp2, tmp2, tmp1
+ or data1, data1, tmp2
+ or data2, data2, tmp2
+ b strcmp_start_realigned
+
+strcmp_misaligned8:
+
+/* If src1_off != 0 && (src2_off == 0 || src1_off < src2_off),
+   exchange src1 and src2; the swap is recorded in `exchange' so the
+   sign of the result can be fixed up on return.  */
+ andi src2_off, src2, 0x7
+ slt tmp2, src1_off, src2_off
+ CONDITIONSEL(tmp2,src2_off,tmp2,tmp1)
+ maskeqz exchange, tmp2, src1_off
+ xor tmp3, src1, src2
+ maskeqz tmp3, tmp3, exchange
+ xor src1, src1, tmp3
+ xor src2, src2, tmp3
+
+ andi src1_off, src1, 0x7
+ beqz src1_off, strcmp_loop_misaligned
+strcmp_do_misaligned:
+ ld.bu data1, src1, 0
+ ld.bu data2, src2, 0
+ xor tmp3, data1, data2
+ addi.d src1, src1, 1
+ masknez tmp3, data1, tmp3
+ addi.d src2, src2, 1
+ beqz tmp3, strcmp_done
+ andi src1_off, src1, 0x7
+ bnez src1_off, strcmp_do_misaligned
+
+strcmp_loop_misaligned:
+ andi tmp1, src2, 0xff8
+ xori tmp1, tmp1, 0xff8
+ beqz tmp1, strcmp_do_misaligned
+ ld.d data1, src1, 0
+ ld.d data2, src2, 0
+ addi.d src1, src1, 8
+ addi.d src2, src2, 8
+
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ beqz syndrome, strcmp_loop_misaligned
+
+strcmp_misalign_end:
+ ctz.d pos, syndrome
+ bstrins.d pos, zero, 2, 0
+ srl.d data1, data1, pos
+ srl.d data2, data2, pos
+ andi data1, data1, 0xff
+ andi data2, data2, 0xff
+ sub.d tmp1, data1, data2
+ sub.d tmp2, data2, data1
+ CONDITIONSEL(result,exchange,tmp2,tmp1)
+ jr ra
+
+strcmp_done:
+ sub.d tmp1, data1, data2
+ sub.d tmp2, data2, data1
+ CONDITIONSEL(result,exchange,tmp2,tmp1)
+ jr ra
+END(STRCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strcmp)
+#endif
+#endif
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
new file mode 100644
index 0000000000..a72b280170
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strncmp.S
@@ -0,0 +1,281 @@
+/* Assembly implementation of strncmp.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. if src1 and src2 have the same low 3 bits, align both pointers down
+      to an 8-byte boundary (when they are not already aligned), force the
+      bytes before the real start to 0xff in both loaded words, and then
+      compare 8 bytes per iteration;
+
+   +. otherwise, when the limit is below 16 bytes, compare byte by byte;
+      for longer limits compare byte by byte until src1 is 8-byte aligned,
+      then 8 bytes per iteration, dropping back to byte compares whenever
+      the unaligned load from src2 could cross a page boundary;
+
+   +. a word v0 contains a \0 byte iff
+      ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0;
+
+   +. the limit is tracked in whole 8-byte words (limit_wd); once it is
+      reached, the bytes of the final word beyond the limit are masked off
+      so they cannot affect the result.
+ */
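+
+/* Rough C sketch of how the last, possibly partial, word is masked once
+   the byte limit is reached (illustrative only, not part of the build;
+   the helper name is made up).  Little-endian order means the in-limit
+   bytes are the low-order ones:
+
+     #include <stdint.h>
+
+     static inline uint64_t
+     limit_mask (uint64_t limit)
+     {
+       uint64_t keep = limit & 7;
+       return keep ? ~0ULL >> ((8 - keep) * 8) : ~0ULL;
+     }
+
+   Both data words are ANDed with the mask and the syndrome bits of the
+   out-of-range bytes are forced on, so bytes past the limit can never
+   produce a nonzero result.  */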
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+
+#define STRNCMP strncmp
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and Results */
+#define src1 a0
+#define src2 a1
+#define limit a2
+/* Note: v0 = a0 in lp64 ABI */
+#define result v0
+
+
+/* Internal variable */
+#define data1 t0
+#define data2 t1
+#define has_nul t2
+#define diff t3
+#define syndrome t4
+#define zeroones t5
+#define sevenf t6
+#define pos t7
+#define exchange t8
+#define tmp1 a5
+#define tmp2 a6
+#define tmp3 a7
+#define src1_off a3
+#define limit_wd a4
+
+
+/* int strncmp (const char *s1, const char *s2, size_t n); */
+
+LEAF(STRNCMP)
+ .align 4
+ beqz limit, strncmp_ret0
+
+ xor tmp1, src1, src2
+ lu12i.w zeroones, 0x01010
+ lu12i.w sevenf, 0x7f7f7
+ andi src1_off, src1, 0x7
+ ori zeroones, zeroones, 0x101
+ andi tmp1, tmp1, 0x7
+ ori sevenf, sevenf, 0xf7f
+ bstrins.d zeroones, zeroones, 63, 32
+ bstrins.d sevenf, sevenf, 63, 32
+ bnez tmp1, strncmp_misaligned8
+ bnez src1_off, strncmp_mutual_align
+
+ addi.d limit_wd, limit, -1
+ srli.d limit_wd, limit_wd, 3
+
+strncmp_loop_aligned:
+ ld.d data1, src1, 0
+ addi.d src1, src1, 8
+ ld.d data2, src2, 0
+ addi.d src2, src2, 8
+
+strncmp_start_realigned:
+ addi.d limit_wd, limit_wd, -1
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ srli.d tmp1, limit_wd, 63
+ or syndrome, diff, has_nul
+ or tmp2, syndrome, tmp1
+ beqz tmp2, strncmp_loop_aligned
+
+	/* The limit does not end in this word: act on the syndrome.  */
+	bge limit_wd, zero, strncmp_not_limit
+	/* The limit ends in this word: mask off the bytes beyond it.  */
+ andi limit, limit, 0x7
+ li.w tmp1, 0x8
+ sub.d limit, tmp1, limit
+ slli.d limit, limit, 0x3
+ li.d tmp1, -1
+ srl.d tmp1, tmp1, limit
+ and data1, data1, tmp1
+ and data2, data2, tmp1
+ orn syndrome, syndrome, tmp1
+
+
+strncmp_not_limit:
+ ctz.d pos, syndrome
+ bstrins.d pos, zero, 2, 0
+ srl.d data1, data1, pos
+ srl.d data2, data2, pos
+ andi data1, data1, 0xff
+ andi data2, data2, 0xff
+ sub.d result, data1, data2
+ jr ra
+
+
+
+strncmp_mutual_align:
+ bstrins.d src1, zero, 2, 0
+ bstrins.d src2, zero, 2, 0
+ slli.d tmp1, src1_off, 0x3
+ ld.d data1, src1, 0
+ ld.d data2, src2, 0
+ addi.d src2, src2, 8
+ addi.d src1, src1, 8
+
+ addi.d limit_wd, limit, -1
+ andi tmp3, limit_wd, 0x7
+ srli.d limit_wd, limit_wd, 3
+ add.d limit, limit, src1_off
+ add.d tmp3, tmp3, src1_off
+ srli.d tmp3, tmp3, 3
+ add.d limit_wd, limit_wd, tmp3
+
+ sub.d tmp1, zero, tmp1
+ nor tmp2, zero, zero
+ srl.d tmp2, tmp2, tmp1
+ or data1, data1, tmp2
+ or data2, data2, tmp2
+ b strncmp_start_realigned
+
+strncmp_misaligned8:
+ li.w tmp1, 0x10
+ bge limit, tmp1, strncmp_try_words
+
+strncmp_byte_loop:
+ ld.bu data1, src1, 0
+ ld.bu data2, src2, 0
+ addi.d limit, limit, -1
+ xor tmp1, data1, data2
+ masknez tmp1, data1, tmp1
+ maskeqz tmp1, limit, tmp1
+ beqz tmp1, strncmp_done
+
+ ld.bu data1, src1, 1
+ ld.bu data2, src2, 1
+ addi.d src1, src1, 2
+ addi.d src2, src2, 2
+ addi.d limit, limit, -1
+ xor tmp1, data1, data2
+ masknez tmp1, data1, tmp1
+ maskeqz tmp1, limit, tmp1
+ bnez tmp1, strncmp_byte_loop
+
+
+strncmp_done:
+ sub.d result, data1, data2
+ jr ra
+
+strncmp_try_words:
+ srli.d limit_wd, limit, 3
+ beqz src1_off, strncmp_do_misaligned
+
+ sub.d src1_off, zero, src1_off
+ andi src1_off, src1_off, 0x7
+ sub.d limit, limit, src1_off
+ srli.d limit_wd, limit, 0x3
+
+
+strncmp_page_end_loop:
+ ld.bu data1, src1, 0
+ ld.bu data2, src2, 0
+ addi.d src1, src1, 1
+ addi.d src2, src2, 1
+ xor tmp1, data1, data2
+ masknez tmp1, data1, tmp1
+ beqz tmp1, strncmp_done
+ andi tmp1, src1, 0x7
+ bnez tmp1, strncmp_page_end_loop
+strncmp_do_misaligned:
+ li.w src1_off, 0x8
+ addi.d limit_wd, limit_wd, -1
+ blt limit_wd, zero, strncmp_done_loop
+
+strncmp_loop_misaligned:
+ andi tmp2, src2, 0xff8
+ xori tmp2, tmp2, 0xff8
+ beqz tmp2, strncmp_page_end_loop
+
+ ld.d data1, src1, 0
+ ld.d data2, src2, 0
+ addi.d src1, src1, 8
+ addi.d src2, src2, 8
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ bnez syndrome, strncmp_not_limit
+ addi.d limit_wd, limit_wd, -1
+ bge limit_wd, zero, strncmp_loop_misaligned
+
+strncmp_done_loop:
+ andi limit, limit, 0x7
+ beqz limit, strncmp_not_limit
+	/* Read the last (possibly partial) double word.  */
+	/* Check whether that load from src2 could cross a page boundary;
+	   if it might, fall back to the byte loop.  */
+ andi tmp1, src2, 0x7
+ andi tmp2, src2, 0xff8
+ add.d tmp1, tmp1, limit
+ xori tmp2, tmp2, 0xff8
+ andi tmp1, tmp1, 0x8
+ masknez tmp1, tmp1, tmp2
+ bnez tmp1, strncmp_byte_loop
+ addi.d src1, src1, -8
+ addi.d src2, src2, -8
+ ldx.d data1, src1, limit
+ ldx.d data2, src2, limit
+ sub.d tmp1, data1, zeroones
+ or tmp2, data1, sevenf
+ xor diff, data1, data2
+ andn has_nul, tmp1, tmp2
+ or syndrome, diff, has_nul
+ bnez syndrome, strncmp_not_limit
+
+strncmp_ret0:
+ move result, zero
+ jr ra
+
+END(STRNCMP)
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strncmp)
+#endif
+#endif
--
2.20.1