This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH v2] aarch64: Optimized strlen for strlen_asimd
- From: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>
- To: Xuelei Zhang <zhangxuelei4 at huawei dot com>, "libc-alpha at sourceware dot org" <libc-alpha at sourceware dot org>, "siddhesh at gotplt dot org" <siddhesh at gotplt dot org>, Szabolcs Nagy <Szabolcs dot Nagy at arm dot com>, "jiangyikun at huawei dot com" <jiangyikun at huawei dot com>, "yikunkero at gmail dot com" <yikunkero at gmail dot com>
- Cc: nd <nd at arm dot com>
- Date: Tue, 22 Oct 2019 16:33:52 +0000
- Subject: Re: [PATCH v2] aarch64: Optimized strlen for strlen_asimd
- References: <20191022094118.11468-1-zhangxuelei4@huawei.com>
Hi Xuelei,
> Optimize the strlen implementation by using vector operations and
> loop unrolling in the main loop. Compared to __strlen_generic, it reduces
> the latency of bench-strlen cases by 7-18% when the length of src
> is greater than 128 bytes, with gains throughout the benchmark.
This is a good improvement, OK to commit. Also, given that it uses integer
arithmetic for the first 16 bytes, it can never be worse than the generic
variant for small inputs.
Wilco
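The integer check referred to above is the classic zero-byte trick that the
REP8_01 and REP8_7f constants added below exist for: a byte of x is zero
exactly when the corresponding byte of (x - 0x01..01) & ~x & 0x80..80 is set.
A minimal C sketch of the idea (the helper names are invented, this is not
the committed code):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Nonzero iff some byte of x is zero.  Note ~(x | REP8_7f) equals
   ~x & 0x80..80, which is what the orr/bics pair in the patch computes.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Byte index of the first NUL, for a little-endian 8-byte load; the
   0x80 marker of byte i sits at bit 8 * i + 7.  Assumes
   has_zero_byte (x) != 0.  */
static inline unsigned
first_zero_byte (uint64_t x)
{
  return __builtin_ctzll (has_zero_byte (x)) >> 3;
}

The patch locates the first NUL from the same mask using rev plus clz rather
than the count-trailing-zeros shown here; the two are equivalent up to byte
order.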
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01babeec..abf6513eeea 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG (midr)
+	     ? __strlen_asimd
+	     : __strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
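The strlen.c change above only widens the ifunc selection condition; the
dispatch itself is GNU ifunc. As a rough illustration of that mechanism, a
self-contained sketch with made-up names and a stubbed CPU probe (not what
the real libc_ifunc macro expands to):

#include <stddef.h>

/* All names below are invented for illustration.  */
static size_t
strlen_portable (const char *s)
{
  const char *p = s;
  while (*p != '\0')
    p++;
  return (size_t) (p - s);
}

/* Stand-in for a tuned variant such as __strlen_asimd.  */
static size_t
strlen_tuned (const char *s)
{
  return strlen_portable (s);
}

/* Stand-in for the real probe, USE_ASIMD_STRLEN () || IS_KUNPENG (midr).  */
static int
cpu_prefers_tuned (void)
{
  return 0;
}

/* Runs once at load time; the dynamic linker binds my_strlen to whichever
   implementation the resolver returns.  */
static size_t (*my_strlen_resolver (void)) (const char *)
{
  return cpu_prefers_tuned () ? strlen_tuned : strlen_portable;
}

size_t my_strlen (const char *s)
     __attribute__ ((ifunc ("my_strlen_resolver")));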
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6abb825..1de6cd3a173 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__
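Rendered as C with ACLE intrinsics, the unrolled main loop in the hunk above
has roughly the following shape. This is a sketch under the same assumption
the assembly relies on (aligned 16-byte loads never cross a page, so they
cannot fault before the NUL), not the committed code:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

size_t
strlen_asimd_sketch (const char *s)
{
  const uint8_t *start = (const uint8_t *) s;

  /* First 16 bytes: a plain byte loop stands in for the integer
     bit-trick prologue of the real code.  */
  for (size_t i = 0; i < 16; i++)
    if (start[i] == '\0')
      return i;

  /* Continue from the next 16-byte boundary.  */
  const uint8_t *p
    = (const uint8_t *) (((uintptr_t) start + 16) & ~(uintptr_t) 15);

  /* Unrolled main loop, 32 bytes per iteration.  vminvq_u8 mirrors
     uminv: the minimum byte is zero iff the vector contains a NUL.  */
  for (;;)
    {
      if (vminvq_u8 (vld1q_u8 (p)) == 0)
        break;
      if (vminvq_u8 (vld1q_u8 (p + 16)) == 0)
        {
          p += 16;
          break;
        }
      p += 32;
    }

  /* The NUL is within the 16 bytes at p.  */
  while (*p != '\0')
    p++;
  return (size_t) (p - start);
}

The second 16-byte probe before branching back is what gives the loop its
32-bytes-per-iteration cadence while keeping a single back-edge.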