This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v3] aarch64: Optimized memset for Kunpeng processor.
- From: Xuelei Zhang <zhangxuelei4 at huawei dot com>
- To: <libc-alpha at sourceware dot org>, <Wilco dot Dijkstra at arm dot com>, <siddhesh at gotplt dot org>, <Szabolcs dot Nagy at arm dot com>, <jiangyikun at huawei dot com>, <yikunkero at gmail dot com>
- Date: Mon, 4 Nov 2019 16:49:39 +0800
- Subject: [PATCH v3] aarch64: Optimized memset for Kunpeng processor.
Due to a branch prediction issue on the Kunpeng processor, we found that
memset_generic performs poorly when setting medium sizes, so we
restructured the logic and unrolled the loop in set_long 4 times to
solve the problem. Even sets of less than 1K bytes benefit.
Another change is that DC ZVA does not seem to be effective when setting
zero, so we discarded it and used set_long to set zero instead. Fewer
branches and predictions also give the zero case a slight improvement.
Here is part of the results:
SIMPLE_MEMSET __memset_falkor __memset_emag __memset_kunpeng __memset_generic
========================================================================================================================
length=16, char=65: 9.84 (-146.83%) 4.00 ( -0.18%) 3.99 ( -0.12%) 3.61 ( 9.52%) 3.99
length=17, char=65: 10.24 (-156.58%) 3.99 ( 0.03%) 4.00 ( -0.15%) 3.61 ( 9.55%) 3.99
length=17, char=65: 10.23 (-156.24%) 3.99 ( 0.09%) 3.99 ( 0.12%) 3.60 ( 9.78%) 3.99
length=18, char=65: 10.62 (-166.20%) 3.99 ( 0.06%) 3.99 ( 0.00%) 3.60 ( 9.73%) 3.99
length=18, char=65: 10.62 (-166.27%) 4.02 ( -0.83%) 3.99 ( 0.00%) 3.61 ( 9.58%) 3.99
length=19, char=65: 11.01 (-176.06%) 3.99 ( -0.06%) 3.99 ( -0.03%) 3.61 ( 9.49%) 3.99
length=19, char=65: 11.01 (-175.96%) 4.01 ( -0.43%) 3.99 ( -0.03%) 3.60 ( 9.76%) 3.99
length=20, char=65: 11.39 (-185.67%) 3.99 ( 0.00%) 3.99 ( 0.03%) 3.60 ( 9.67%) 3.99
length=20, char=65: 11.40 (-185.65%) 4.00 ( -0.18%) 3.99 ( -0.03%) 3.61 ( 9.61%) 3.99
length=21, char=65: 11.79 (-195.41%) 3.99 ( 0.09%) 3.99 ( 0.06%) 3.60 ( 9.73%) 3.99
length=21, char=65: 11.80 (-195.90%) 3.99 ( -0.12%) 3.99 ( -0.03%) 3.63 ( 8.88%) 3.99
length=22, char=65: 12.17 (-205.14%) 3.99 ( 0.03%) 3.99 ( -0.09%) 3.60 ( 9.64%) 3.99
length=22, char=65: 12.17 (-204.99%) 3.99 ( -0.03%) 3.99 ( 0.15%) 3.60 ( 9.76%) 3.99
length=23, char=65: 12.57 (-215.12%) 3.99 ( 0.06%) 3.99 ( -0.06%) 3.60 ( 9.61%) 3.99
length=23, char=65: 12.56 (-214.93%) 3.99 ( -0.06%) 3.99 ( 0.03%) 3.60 ( 9.64%) 3.99
length=24, char=65: 12.95 (-224.47%) 3.99 ( 0.06%) 3.99 ( 0.03%) 3.61 ( 9.64%) 3.99
length=24, char=65: 12.96 (-224.98%) 3.99 ( -0.09%) 4.00 ( -0.37%) 3.64 ( 8.72%) 3.99
length=25, char=65: 13.34 (-234.35%) 3.99 ( 0.03%) 3.99 ( 0.03%) 3.60 ( 9.70%) 3.99
length=25, char=65: 13.34 (-234.56%) 3.99 ( 0.06%) 3.99 ( 0.00%) 3.60 ( 9.70%) 3.99
length=26, char=65: 13.74 (-244.40%) 3.99 ( -0.03%) 3.99 ( 0.00%) 3.60 ( 9.64%) 3.99
length=26, char=65: 13.73 (-244.11%) 3.99 ( 0.06%) 3.99 ( 0.06%) 3.61 ( 9.64%) 3.99
length=27, char=65: 14.11 (-253.55%) 3.99 ( 0.12%) 3.99 ( 0.09%) 3.60 ( 9.76%) 3.99
length=27, char=65: 14.12 (-253.28%) 3.99 ( 0.18%) 4.48 (-12.10%) 3.62 ( 9.35%) 4.00
length=28, char=65: 14.65 (-267.34%) 3.99 ( -0.09%) 3.99 ( 0.03%) 3.61 ( 9.58%) 3.99
length=28, char=65: 14.51 (-263.82%) 3.99 ( -0.09%) 3.99 ( -0.15%) 3.60 ( 9.64%) 3.99
length=29, char=65: 14.89 (-273.15%) 3.99 ( 0.09%) 3.99 ( 0.09%) 3.61 ( 9.66%) 3.99
length=29, char=65: 14.89 (-273.63%) 3.99 ( 0.00%) 3.99 ( -0.12%) 3.60 ( 9.65%) 3.99
length=30, char=65: 15.29 (-283.17%) 3.99 ( 0.09%) 3.99 ( 0.03%) 3.60 ( 9.64%) 3.99
length=30, char=65: 15.28 (-283.02%) 3.99 ( 0.09%) 3.99 ( 0.06%) 3.61 ( 9.64%) 3.99
length=31, char=65: 15.67 (-292.84%) 3.99 ( 0.09%) 4.03 ( -0.92%) 3.60 ( 9.70%) 3.99
length=31, char=65: 15.67 (-293.17%) 3.99 ( 0.00%) 3.99 ( -0.12%) 3.60 ( 9.56%) 3.99
length=32, char=65: 16.06 (-344.75%) 3.62 ( -0.30%) 3.61 ( 0.00%) 3.24 ( 10.41%) 3.61
length=32, char=65: 16.06 (-344.41%) 3.60 ( 0.37%) 3.62 ( -0.27%) 3.23 ( 10.77%) 3.61
length=64, char=65: 28.48 (-689.91%) 3.61 ( 0.00%) 3.61 ( -0.07%) 3.35 ( 7.21%) 3.61
length=64, char=65: 28.94 (-702.30%) 3.62 ( -0.37%) 3.61 ( -0.10%) 3.22 ( 10.63%) 3.61
length=96, char=65: 40.87 (-1023.42%) 3.61 ( 0.87%) 3.61 ( 0.77%) 3.24 ( 10.84%) 3.64
length=96, char=65: 40.86 (-1033.97%) 3.60 ( 0.03%) 3.61 ( -0.10%) 3.23 ( 10.47%) 3.60
length=128, char=65: 53.30 (-1228.42%) 4.01 ( 0.09%) 3.61 ( 10.10%) 3.99 ( 0.46%) 4.01
length=128, char=65: 53.30 (-1226.43%) 4.01 ( 0.15%) 3.61 ( 10.15%) 4.00 ( 0.43%) 4.02
length=160, char=65: 70.66 (-1256.33%) 5.21 ( 0.07%) 5.64 ( -8.27%) 5.22 ( -0.16%) 5.21
length=160, char=65: 71.22 (-1266.07%) 5.24 ( -0.44%) 5.63 ( -8.01%) 5.21 ( 0.05%) 5.21
length=192, char=65: 82.84 (-1489.72%) 5.21 ( 0.00%) 5.17 ( 0.84%) 5.21 ( 0.07%) 5.21
length=192, char=65: 82.85 (-1486.79%) 5.24 ( -0.44%) 5.17 ( 1.05%) 5.21 ( 0.26%) 5.22
length=224, char=65: 95.34 (-471.58%) 16.68 ( 0.01%) 6.45 ( 61.36%) 6.75 ( 59.53%) 16.68
length=224, char=65: 95.81 (-469.71%) 16.69 ( 0.73%) 6.30 ( 62.55%) 6.75 ( 59.85%) 16.82
length=256, char=65: 107.46 (-544.28%) 16.68 ( -0.01%) 6.69 ( 59.91%) 6.75 ( 59.54%) 16.68
length=256, char=65: 107.46 (-544.24%) 16.68 ( -0.01%) 6.68 ( 59.94%) 6.75 ( 59.55%) 16.68
length=288, char=65: 120.49 (-1350.50%) 8.31 ( -0.07%) 7.57 ( 8.88%) 8.31 ( -0.07%) 8.31
length=288, char=65: 120.02 (-1348.91%) 8.29 ( -0.07%) 9.42 (-13.68%) 8.29 ( -0.03%) 8.28
length=320, char=65: 132.07 (-1493.15%) 8.29 ( 0.04%) 8.22 ( 0.82%) 8.28 ( 0.10%) 8.29
length=320, char=65: 133.08 (-1505.07%) 8.31 ( -0.22%) 8.99 ( -8.44%) 8.29 ( 0.01%) 8.29
length=352, char=65: 144.76 (-1373.36%) 9.83 ( -0.07%) 9.09 ( 7.48%) 9.84 ( -0.12%) 9.83
length=352, char=65: 144.77 (-1370.10%) 9.83 ( 0.21%) 9.08 ( 7.76%) 10.31 ( -4.70%) 9.85
length=384, char=65: 156.82 (-1495.48%) 9.83 ( 0.02%) 9.77 ( 0.58%) 9.82 ( 0.05%) 9.83
length=384, char=65: 156.81 (-1489.89%) 9.83 ( 0.35%) 9.76 ( 1.03%) 9.83 ( 0.36%) 9.86
length=416, char=65: 169.39 (-1389.81%) 11.38 ( -0.12%) 10.92 ( 3.94%) 11.43 ( -0.49%) 11.37
length=416, char=65: 169.00 (-1386.86%) 11.36 ( 0.02%) 10.64 ( 6.40%) 11.42 ( -0.48%) 11.37
length=448, char=65: 181.98 (-1501.07%) 11.38 ( -0.10%) 11.30 ( 0.57%) 11.44 ( -0.67%) 11.37
length=448, char=65: 181.49 (-1496.98%) 11.36 ( 0.01%) 11.30 ( 0.59%) 11.36 ( 0.00%) 11.36
length=480, char=65: 194.27 (-1394.63%) 13.01 ( -0.08%) 12.18 ( 6.26%) 13.13 ( -1.03%) 13.00
length=480, char=65: 193.99 (-1377.78%) 13.13 ( -0.02%) 12.28 ( 6.44%) 13.00 ( 0.98%) 13.13
length=16, char=0: 9.85 (-146.35%) 4.02 ( -0.40%) 4.02 ( -0.55%) 3.62 ( 9.40%) 4.00
length=17, char=0: 10.24 (-156.80%) 3.99 ( -0.06%) 3.99 ( -0.06%) 3.60 ( 9.58%) 3.99
length=17, char=0: 10.23 (-156.55%) 4.02 ( -0.70%) 3.99 ( 0.00%) 3.61 ( 9.52%) 3.99
length=18, char=0: 10.63 (-166.64%) 3.99 ( -0.18%) 3.99 ( -0.03%) 3.62 ( 9.28%) 3.99
length=18, char=0: 10.62 (-166.39%) 3.99 ( 0.00%) 3.99 ( 0.00%) 3.60 ( 9.67%) 3.99
length=19, char=0: 11.01 (-176.01%) 3.99 ( -0.03%) 3.99 ( 0.03%) 3.60 ( 9.64%) 3.99
length=19, char=0: 11.01 (-141.13%) 4.07 ( 10.83%) 3.99 ( 12.62%) 3.60 ( 21.02%) 4.56
length=20, char=0: 11.42 (-186.11%) 3.99 ( 0.06%) 3.99 ( 0.06%) 3.60 ( 9.67%) 3.99
length=20, char=0: 11.40 (-185.83%) 3.99 ( 0.06%) 3.99 ( 0.00%) 3.64 ( 8.85%) 3.99
length=21, char=0: 11.79 (-195.44%) 3.99 ( 0.00%) 3.98 ( 0.12%) 3.61 ( 9.61%) 3.99
length=21, char=0: 11.79 (-195.41%) 3.99 ( 0.03%) 3.99 ( 0.03%) 3.60 ( 9.73%) 3.99
length=22, char=0: 12.18 (-205.11%) 3.99 ( 0.00%) 3.99 ( 0.09%) 3.60 ( 9.70%) 3.99
length=22, char=0: 12.17 (-204.99%) 3.99 ( 0.03%) 3.99 ( 0.00%) 3.60 ( 9.76%) 3.99
length=23, char=0: 12.56 (-215.16%) 3.99 ( -0.03%) 3.99 ( 0.00%) 3.61 ( 9.53%) 3.99
length=23, char=0: 12.56 (-215.16%) 3.99 ( -0.03%) 3.99 ( 0.00%) 3.60 ( 9.61%) 3.99
length=24, char=0: 12.98 (-225.53%) 3.99 ( 0.00%) 3.99 ( 0.03%) 3.60 ( 9.64%) 3.99
length=24, char=0: 12.95 (-224.93%) 3.99 ( -0.06%) 3.99 ( -0.03%) 3.60 ( 9.59%) 3.99
length=25, char=0: 13.34 (-234.39%) 3.99 ( 0.03%) 3.99 ( -0.03%) 3.60 ( 9.67%) 3.99
length=25, char=0: 13.34 (-234.64%) 3.99 ( -0.12%) 3.99 ( -0.03%) 3.60 ( 9.62%) 3.99
length=26, char=0: 13.73 (-242.86%) 3.99 ( 0.43%) 3.99 ( 0.40%) 3.60 ( 9.97%) 4.00
length=26, char=0: 13.73 (-244.70%) 3.99 ( -0.12%) 3.99 ( -0.09%) 3.60 ( 9.53%) 3.98
length=27, char=0: 14.12 (-253.83%) 3.99 ( 0.06%) 4.02 ( -0.80%) 3.60 ( 9.70%) 3.99
length=27, char=0: 14.11 (-254.01%) 4.00 ( -0.24%) 3.99 ( 0.00%) 3.60 ( 9.58%) 3.99
length=28, char=0: 14.51 (-263.79%) 3.99 ( -0.03%) 3.99 ( 0.06%) 3.61 ( 9.55%) 3.99
length=28, char=0: 14.51 (-263.98%) 3.99 ( -0.12%) 3.99 ( -0.06%) 3.60 ( 9.62%) 3.99
length=29, char=0: 14.89 (-273.43%) 3.99 ( 0.03%) 3.99 ( 0.00%) 3.61 ( 9.55%) 3.99
length=29, char=0: 14.89 (-273.40%) 3.99 ( 0.03%) 3.99 ( 0.06%) 3.60 ( 9.70%) 3.99
length=30, char=0: 15.28 (-280.20%) 3.99 ( 0.79%) 3.98 ( 0.88%) 3.60 ( 10.36%) 4.02
length=30, char=0: 15.29 (-283.40%) 3.99 ( -0.09%) 3.99 ( -0.03%) 3.60 ( 9.58%) 3.99
length=31, char=0: 15.67 (-293.08%) 3.99 ( -0.03%) 3.99 ( -0.15%) 3.60 ( 9.62%) 3.99
length=31, char=0: 15.66 (-292.74%) 3.99 ( 0.00%) 3.99 ( -0.06%) 3.61 ( 9.58%) 3.99
length=32, char=0: 16.07 (-342.37%) 3.62 ( 0.40%) 3.61 ( 0.50%) 3.24 ( 10.79%) 3.63
length=32, char=0: 16.08 (-346.21%) 3.62 ( -0.51%) 3.60 ( 0.00%) 3.22 ( 10.64%) 3.60
length=64, char=0: 28.48 (-689.84%) 3.60 ( 0.10%) 3.60 ( 0.10%) 3.22 ( 10.60%) 3.61
length=64, char=0: 28.51 (-691.29%) 3.60 ( 0.03%) 3.61 ( -0.07%) 3.22 ( 10.60%) 3.60
length=96, char=0: 40.86 (-1035.06%) 3.60 ( -0.10%) 3.60 ( -0.10%) 3.22 ( 10.51%) 3.60
length=96, char=0: 40.86 (-1034.75%) 3.60 ( 0.00%) 3.60 ( -0.03%) 3.22 ( 10.58%) 3.60
length=128, char=0: 53.30 (-1232.88%) 4.00 ( -0.03%) 3.61 ( 9.71%) 3.99 ( 0.15%) 4.00
length=128, char=0: 53.30 (-1237.74%) 3.99 ( -0.21%) 3.61 ( 9.47%) 3.99 ( -0.24%) 3.98
length=160, char=0: 71.44 (-1270.17%) 5.22 ( -0.12%) 5.59 ( -7.19%) 5.22 ( -0.09%) 5.21
length=160, char=0: 70.67 (-1256.99%) 5.21 ( 0.02%) 5.61 ( -7.76%) 5.21 ( -0.05%) 5.21
length=192, char=0: 82.85 (-1490.91%) 5.21 ( -0.09%) 5.16 ( 0.84%) 5.21 ( 0.00%) 5.21
length=192, char=0: 82.84 (-1490.79%) 5.21 ( 0.00%) 5.16 ( 0.89%) 5.21 ( -0.05%) 5.21
length=224, char=0: 95.34 (-514.50%) 17.14 (-10.44%) 6.45 ( 58.40%) 6.76 ( 56.44%) 15.52
length=224, char=0: 95.34 (-490.08%) 16.68 ( -3.23%) 6.44 ( 60.15%) 6.75 ( 58.24%) 16.16
length=256, char=0: 107.45 (-1177.01%) 8.42 ( -0.04%) 6.69 ( 20.54%) 6.75 ( 19.79%) 8.41
length=256, char=0: 107.45 (-1168.94%) 8.42 ( 0.58%) 7.06 ( 16.62%) 6.75 ( 20.34%) 8.47
length=288, char=0: 120.54 (-1331.13%) 8.43 ( -0.07%) 7.56 ( 10.25%) 8.29 ( 1.57%) 8.42
length=288, char=0: 120.02 (-1325.53%) 8.42 ( -0.01%) 7.55 ( 10.32%) 8.29 ( 1.54%) 8.42
length=320, char=0: 132.08 (-1371.26%) 8.42 ( 6.24%) 8.22 ( 8.39%) 8.28 ( 7.74%) 8.98
length=320, char=0: 136.44 (-1518.49%) 8.44 ( -0.06%) 8.22 ( 2.49%) 8.28 ( 1.72%) 8.43
length=352, char=0: 144.76 (-729.19%) 9.96 ( 42.97%) 9.09 ( 47.96%) 9.83 ( 43.71%) 17.46
length=352, char=0: 145.45 (-732.82%) 9.98 ( 42.86%) 10.15 ( 41.85%) 9.83 ( 43.72%) 17.46
length=384, char=0: 156.82 (-798.42%) 10.00 ( 42.74%) 9.76 ( 44.07%) 9.82 ( 43.72%) 17.45
length=384, char=0: 156.81 (-797.96%) 9.95 ( 43.00%) 11.85 ( 32.16%) 9.85 ( 43.60%) 17.46
length=416, char=0: 168.99 (-772.17%) 17.46 ( 9.90%) 10.53 ( 45.64%) 11.42 ( 41.04%) 19.38
length=416, char=0: 168.99 (-771.29%) 17.45 ( 10.01%) 10.53 ( 45.71%) 11.97 ( 38.28%) 19.40
length=448, char=0: 181.50 (-836.83%) 17.46 ( 9.90%) 11.30 ( 41.66%) 11.37 ( 41.30%) 19.37
length=448, char=0: 185.66 (-858.07%) 18.01 ( 7.04%) 11.31 ( 41.62%) 11.36 ( 41.37%) 19.38
length=480, char=0: 194.01 (-1410.62%) 13.42 ( -4.48%) 12.19 ( 5.12%) 13.14 ( -2.30%) 12.84
length=480, char=0: 193.62 (-1407.27%) 13.42 ( -4.46%) 12.83 ( 0.13%) 13.01 ( -1.24%) 12.85
---
sysdeps/aarch64/multiarch/Makefile | 2 +-
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 1 +
sysdeps/aarch64/multiarch/memset.c | 5 +-
sysdeps/aarch64/multiarch/memset_kunpeng.S | 113 ++++++++++++++++++++++++++++
4 files changed, 119 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/memset_kunpeng.S
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a902..8378107c78e 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,7 +1,7 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor \
- memset_generic memset_falkor memset_emag \
+ memset_generic memset_falkor memset_emag memset_kunpeng \
memchr_generic memchr_nosimd \
strlen_generic strlen_asimd
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e58..bcbd90d0c41 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
can do a comparative analysis with __memset_generic. */
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
IFUNC_IMPL (i, name, memchr,
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index e9cdd385f26..4cc34b9b99a 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -30,10 +30,13 @@ extern __typeof (__redirect_memset) __libc_memset;
extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
+extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
libc_ifunc (__libc_memset,
- ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+ IS_KUNPENG (midr)
+ ?__memset_kunpeng
+ : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
? __memset_falkor
: (IS_EMAG (midr) && zva_size == 64
? __memset_emag
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
new file mode 100644
index 00000000000..a03441ae72f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -0,0 +1,113 @@
+/* Optimized memset for Huawei Kunpeng processor.
+ Copyright (C) 2012-2019 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+#if IS_IN (libc)
+# define MEMSET __memset_kunpeng
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+ENTRY_ALIGN (MEMSET, 6)
+/* Stores the low byte of valw into count bytes starting at dstin; register aliases presumably come from memset-reg.h -- confirm. */
+ DELOUSE (0) /* DELOUSE is defined outside this file (presumably sysdep.h); its effect is not visible here. */
+ DELOUSE (2)
+
+ dup v0.16B, valw /* Replicate the fill byte into all 16 byte lanes of v0. */
+ add dstend, dstin, count /* dstend = one past the last byte to set. */
+
+ cmp count, 128
+ b.hs L(set_long) /* 128 bytes or more: use the unrolled loop. */
+
+ cmp count, 16
+ b.lo L(less16)
+
+ /* Set 16..127 bytes. */
+ str q0, [dstin]
+ tbnz count, 6, L(set127) /* Bit 6 set: count is 64..127. */
+ str q0, [dstend, -16] /* Head and tail stores may overlap; together they cover 16..32 bytes. */
+ tbz count, 5, 1f /* Bit 5 clear: count is 16..31, already covered. */
+ str q0, [dstin, 16] /* count is 32..63: second 16 bytes from each end. */
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..127 bytes. Write 64 bytes from the start and
+ 64 bytes from the end. */
+L(set127):
+ stp q0, q0, [dstin, 16] /* Bytes 0..15 were already stored before branching here. */
+ str q0, [dstin, 48]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Set 0..15 bytes. */
+L(less16):
+ tbz count, 3, L(less8) /* Bit 3 clear: count is 0..7. */
+ str d0, [dstin] /* count is 8..15: two possibly overlapping 8-byte stores. */
+ str d0, [dstend, -8]
+ ret
+L(less8):
+ tbz count, 2, 2f /* Bit 2 clear: count is 0..3. */
+ str s0, [dstin] /* count is 4..7: two possibly overlapping 4-byte stores. */
+ str s0, [dstend, -4]
+ ret
+2: cbz count, 3f /* count == 0: nothing to store. */
+ str b0, [dstin]
+ tbz count, 1, 3f /* Bit 1 clear: count == 1, the byte store was enough. */
+ str h0, [dstend, -2] /* count is 2 or 3: halfword at the end covers the rest. */
+3: ret
+
+ .p2align 4
+L(set_long):
+ bic dst, dstin, 15 /* dst = dstin rounded down to a 16-byte boundary. */
+ str q0, [dstin] /* Unaligned store covers the first 16 bytes. */
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 + 1 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst, 32] /* Loop body is repeated 4 times; each copy stores 64 bytes. */
+ stp q0, q0, [dst, 64]! /* Pre-index writeback advances dst by 64. */
+ subs count, count, 64
+ b.lo 1f /* Borrow: exit to the tail stores at the next `1:' label. */
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.lo 1f
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.lo 1f
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hs 1b /* No borrow: go back to the first copy of the loop body. */
+
+1: stp q0, q0, [dstend, -64] /* Tail: last 64 bytes addressed from dstend; may overlap loop stores. */
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+#endif
--
2.14.1.windows.1