This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] aarch64: Optimized memset for Kunpeng processor.


Because of a branch prediction issue on the Kunpeng processor, we found
that memset_generic performs poorly when setting medium sizes, so we
restructured the logic and unrolled the loop in set_long three times
to solve the problem; sizes below 1K benefit from this as well.

Another change is that DC ZVA does not seem to work well when setting
zero, so we dropped it and use set_long to set zero instead. Fewer branches
and branch predictions also slightly improve the zero case.

Here is part of the results:
                                    SIMPLE_MEMSET	__memset_falkor	__memset_emag	__memset_kunpeng	__memset_generic
========================================================================================================================
                  length=16, char=65:         9.85 (-146.38%)	        4.00 (  0.06%)	        4.01 ( -0.24%)	        3.61 (  9.71%)	        4.00
                  length=17, char=65:        10.24 (-154.44%)	        3.99 (  0.79%)	        3.99 (  0.85%)	        3.61 ( 10.40%)	        4.02
                  length=17, char=65:        10.24 (-156.51%)	        4.00 ( -0.09%)	        3.99 (  0.06%)	        3.61 (  9.57%)	        3.99
                  length=18, char=65:        10.63 (-166.37%)	        3.99 (  0.00%)	        3.99 (  0.03%)	        3.61 (  9.55%)	        3.99
                  length=18, char=65:        10.63 (-166.43%)	        4.00 ( -0.15%)	        3.99 ( -0.06%)	        3.61 (  9.55%)	        3.99
                  length=19, char=65:        11.01 (-176.02%)	        4.00 ( -0.18%)	        3.99 (  0.00%)	        3.61 (  9.61%)	        3.99
                  length=19, char=65:        11.02 (-176.50%)	        3.99 ( -0.24%)	        3.99 ( -0.15%)	        3.60 (  9.59%)	        3.98
                  length=20, char=65:        11.40 (-185.69%)	        3.99 (  0.00%)	        3.99 (  0.09%)	        3.61 (  9.51%)	        3.99
                  length=20, char=65:        11.41 (-185.78%)	        4.02 ( -0.73%)	        3.99 (  0.06%)	        3.60 (  9.79%)	        3.99
                  length=21, char=65:        11.82 (-196.30%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.61 (  9.58%)	        3.99
                  length=21, char=65:        11.81 (-196.08%)	        4.00 ( -0.24%)	        3.99 ( -0.12%)	        3.61 (  9.52%)	        3.99
                  length=22, char=65:        12.19 (-204.73%)	        3.99 (  0.12%)	        3.99 (  0.15%)	        3.61 (  9.80%)	        4.00
                  length=22, char=65:        12.19 (-205.45%)	        3.99 ( -0.06%)	        3.99 (  0.00%)	        3.61 (  9.52%)	        3.99
                  length=23, char=65:        12.58 (-215.43%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.61 (  9.40%)	        3.99
                  length=23, char=65:        12.57 (-215.18%)	        3.99 ( -0.03%)	        3.99 ( -0.03%)	        3.64 (  8.84%)	        3.99
                  length=24, char=65:        12.96 (-224.85%)	        3.99 ( -0.12%)	        3.99 (  0.00%)	        3.61 (  9.49%)	        3.99
                  length=24, char=65:        12.96 (-223.23%)	        4.00 (  0.24%)	        4.00 (  0.37%)	        3.62 (  9.77%)	        4.01
                  length=25, char=65:        13.36 (-234.64%)	        4.00 ( -0.15%)	        3.99 (  0.03%)	        3.61 (  9.60%)	        3.99
                  length=25, char=65:        13.35 (-234.37%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.60 (  9.72%)	        3.99
                  length=26, char=65:        13.74 (-243.99%)	        4.00 ( -0.12%)	        3.99 (  0.03%)	        3.61 (  9.60%)	        3.99
                  length=26, char=65:        13.74 (-241.73%)	        4.00 (  0.49%)	        3.99 (  0.79%)	        3.61 ( 10.26%)	        4.02
                  length=27, char=65:        14.12 (-254.01%)	        3.99 ( -0.12%)	        3.99 ( -0.03%)	        3.61 (  9.61%)	        3.99
                  length=27, char=65:        14.12 (-251.99%)	        3.99 (  0.52%)	        3.99 (  0.64%)	        4.13 ( -2.83%)	        4.01
                  length=28, char=65:        14.52 (-263.56%)	        4.00 ( -0.06%)	        4.00 ( -0.12%)	        3.61 (  9.69%)	        3.99
                  length=28, char=65:        14.52 (-263.84%)	        4.00 ( -0.12%)	        3.99 ( -0.09%)	        3.61 (  9.57%)	        3.99
                  length=29, char=65:        14.90 (-273.05%)	        4.00 ( -0.03%)	        3.99 (  0.06%)	        3.61 (  9.62%)	        4.00
                  length=29, char=65:        14.90 (-273.18%)	        3.99 ( -0.03%)	        3.99 (  0.09%)	        3.61 (  9.66%)	        3.99
                  length=30, char=65:        15.29 (-283.12%)	        4.02 ( -0.80%)	        3.99 (  0.03%)	        3.60 (  9.76%)	        3.99
                  length=30, char=65:        15.29 (-282.94%)	        3.99 (  0.09%)	        3.99 (  0.03%)	        3.61 (  9.57%)	        3.99
                  length=31, char=65:        15.68 (-293.08%)	        4.00 ( -0.21%)	        3.99 ( -0.12%)	        3.61 (  9.55%)	        3.99
                  length=31, char=65:        15.68 (-292.93%)	        4.00 ( -0.15%)	        4.00 ( -0.15%)	        3.61 (  9.55%)	        3.99
                  length=32, char=65:        16.07 (-345.68%)	        3.62 ( -0.34%)	        3.63 ( -0.74%)	        3.23 ( 10.29%)	        3.60
                  length=32, char=65:        16.07 (-345.73%)	        3.61 ( -0.07%)	        3.61 (  0.00%)	        3.22 ( 10.66%)	        3.61
                  length=64, char=65:        28.49 (-689.25%)	        3.60 (  0.17%)	        3.61 (  0.03%)	        3.64 ( -0.74%)	        3.61
                  length=64, char=65:        29.11 (-706.87%)	        3.62 ( -0.34%)	        3.61 ( -0.14%)	        3.62 ( -0.27%)	        3.61
                  length=96, char=65:        40.88 (-1032.78%)	        3.61 ( -0.07%)	        3.61 ( -0.10%)	        3.23 ( 10.49%)	        3.61
                  length=96, char=65:        40.87 (-1034.21%)	        3.61 ( -0.07%)	        3.61 ( -0.24%)	        3.23 ( 10.33%)	        3.60
                 length=128, char=65:        53.31 (-1234.78%)	        4.00 ( -0.12%)	        3.63 (  9.23%)	        4.00 ( -0.15%)	        3.99
                 length=128, char=65:        53.32 (-1234.47%)	        4.01 ( -0.40%)	        3.61 (  9.59%)	        4.00 (  0.00%)	        4.00
                 length=160, char=65:        70.67 (-1253.63%)	        5.22 (  0.00%)	        5.54 ( -6.08%)	        4.39 ( 15.95%)	        5.22
                 length=160, char=65:        71.30 (-1266.84%)	        5.24 ( -0.40%)	        5.64 ( -8.05%)	        4.39 ( 15.91%)	        5.22
                 length=192, char=65:        82.85 (-1487.67%)	        5.21 (  0.07%)	        5.18 (  0.82%)	        5.34 ( -2.39%)	        5.22
                 length=192, char=65:        82.85 (-1486.81%)	        5.21 (  0.16%)	        5.16 (  1.08%)	        5.35 ( -2.43%)	        5.22
                 length=224, char=65:        95.35 (-471.44%)	       16.68 (  0.01%)	        6.45 ( 61.36%)	        5.99 ( 64.11%)	       16.69
                 length=224, char=65:        96.00 (-475.09%)	       16.70 ( -0.04%)	        6.44 ( 61.41%)	        5.99 ( 64.14%)	       16.69
                 length=256, char=65:       107.47 (-544.11%)	       16.69 ( -0.01%)	        6.83 ( 59.04%)	        6.89 ( 58.70%)	       16.68
                 length=256, char=65:       107.47 (-544.19%)	       16.68 ( -0.01%)	        6.85 ( 58.94%)	        6.88 ( 58.76%)	       16.68
                 length=288, char=65:       120.57 (-1349.57%)	        8.33 ( -0.09%)	        7.58 (  8.85%)	        7.54 (  9.35%)	        8.32
                 length=288, char=65:       120.03 (-1347.51%)	        8.29 (  0.01%)	        7.56 (  8.88%)	        7.51 (  9.39%)	        8.29
                 length=320, char=65:       132.08 (-1492.79%)	        8.29 ( -0.01%)	        8.24 (  0.62%)	        8.42 ( -1.52%)	        8.29
                 length=320, char=65:       132.53 (-1497.78%)	        8.31 ( -0.19%)	        8.22 (  0.88%)	        8.42 ( -1.49%)	        8.29
                 length=352, char=65:       144.78 (-1372.65%)	        9.84 ( -0.06%)	        9.10 (  7.44%)	        9.23 (  6.15%)	        9.83
                 length=352, char=65:       144.77 (-1372.49%)	        9.83 (  0.00%)	        9.09 (  7.55%)	        9.10 (  7.49%)	        9.83
                 length=384, char=65:       157.91 (-1505.36%)	        9.85 ( -0.15%)	        9.78 (  0.58%)	        9.95 ( -1.18%)	        9.84
                 length=384, char=65:       156.82 (-1494.91%)	        9.83 (  0.01%)	        9.76 (  0.71%)	        9.95 ( -1.19%)	        9.83
                 length=416, char=65:       169.83 (-1392.95%)	       11.38 ( -0.08%)	       10.65 (  6.35%)	       10.81 (  4.98%)	       11.38
                 length=416, char=65:       169.00 (-1386.09%)	       11.37 (  0.02%)	       10.64 (  6.46%)	       10.62 (  6.61%)	       11.37
                 length=448, char=65:       181.98 (-1500.22%)	       11.38 ( -0.06%)	       11.32 (  0.42%)	       11.51 ( -1.25%)	       11.37
                 length=448, char=65:       181.51 (-1496.96%)	       11.37 ( -0.06%)	       11.30 (  0.56%)	       11.50 ( -1.17%)	       11.37
                 length=480, char=65:       194.46 (-1394.52%)	       13.01 ( -0.01%)	       12.19 (  6.34%)	       12.24 (  5.93%)	       13.01
                 length=480, char=65:       194.00 (-1377.70%)	       13.13 (  0.00%)	       12.18 (  7.22%)	       12.13 (  7.57%)	       13.13
                   length=16, char=0:         9.85 (-146.59%)	        4.03 ( -0.95%)	        4.00 ( -0.12%)	        3.61 (  9.72%)	        4.00
                   length=17, char=0:        10.24 (-156.42%)	        3.99 (  0.12%)	        4.00 ( -0.06%)	        3.61 (  9.69%)	        3.99
                   length=17, char=0:        10.24 (-156.78%)	        3.99 ( -0.15%)	        3.99 ( -0.15%)	        3.60 (  9.61%)	        3.99
                   length=18, char=0:        10.62 (-166.15%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.61 (  9.60%)	        3.99
                   length=18, char=0:        10.63 (-166.29%)	        4.62 (-15.78%)	        4.00 ( -0.31%)	        3.61 (  9.57%)	        3.99
                   length=19, char=0:        11.02 (-176.16%)	        3.99 ( -0.12%)	        3.99 ( -0.06%)	        3.61 (  9.52%)	        3.99
                   length=19, char=0:        11.01 (-175.51%)	        3.99 (  0.09%)	        3.99 (  0.21%)	        3.64 (  9.01%)	        4.00
                   length=20, char=0:        11.40 (-185.45%)	        3.99 (  0.06%)	        3.99 (  0.06%)	        3.62 (  9.26%)	        3.99
                   length=20, char=0:        11.40 (-185.72%)	        3.99 (  0.09%)	        3.99 ( -0.03%)	        3.60 (  9.72%)	        3.99
                   length=21, char=0:        11.79 (-195.41%)	        3.99 ( -0.03%)	        3.99 (  0.03%)	        3.61 (  9.64%)	        3.99
                   length=21, char=0:        11.79 (-195.23%)	        3.99 (  0.03%)	        3.99 (  0.12%)	        3.60 (  9.75%)	        3.99
                   length=22, char=0:        12.18 (-205.05%)	        4.00 ( -0.09%)	        3.99 (  0.03%)	        3.61 (  9.63%)	        3.99
                   length=22, char=0:        12.18 (-205.14%)	        4.00 ( -0.15%)	        3.99 ( -0.03%)	        3.61 (  9.61%)	        3.99
                   length=23, char=0:        12.60 (-215.60%)	        4.00 ( -0.15%)	        3.99 (  0.12%)	        3.60 (  9.72%)	        3.99
                   length=23, char=0:        12.56 (-214.71%)	        3.99 ( -0.03%)	        3.99 (  0.03%)	        3.61 (  9.60%)	        3.99
                   length=24, char=0:        12.95 (-224.30%)	        3.99 (  0.03%)	        3.99 (  0.06%)	        3.61 (  9.60%)	        3.99
                   length=24, char=0:        12.95 (-224.27%)	        3.99 (  0.00%)	        3.99 (  0.18%)	        3.61 (  9.69%)	        3.99
                   length=25, char=0:        13.34 (-233.74%)	        3.99 (  0.09%)	        4.51 (-12.82%)	        3.61 (  9.68%)	        4.00
                   length=25, char=0:        13.34 (-234.52%)	        3.99 ( -0.12%)	        3.99 ( -0.12%)	        3.61 (  9.58%)	        3.99
                   length=26, char=0:        13.74 (-244.40%)	        4.00 ( -0.18%)	        4.02 ( -0.83%)	        3.61 (  9.49%)	        3.99
                   length=26, char=0:        13.73 (-244.28%)	        3.99 ( -0.03%)	        3.99 ( -0.09%)	        3.60 (  9.64%)	        3.99
                   length=27, char=0:        14.12 (-253.92%)	        4.00 ( -0.18%)	        3.99 (  0.03%)	        3.60 (  9.67%)	        3.99
                   length=27, char=0:        14.12 (-254.03%)	        3.99 ( -0.03%)	        3.99 (  0.00%)	        3.60 (  9.64%)	        3.99
                   length=28, char=0:        14.51 (-263.04%)	        3.99 (  0.15%)	        3.99 (  0.12%)	        3.61 (  9.77%)	        4.00
                   length=28, char=0:        14.51 (-263.50%)	        3.99 (  0.03%)	        3.99 (  0.03%)	        3.61 (  9.69%)	        3.99
                   length=29, char=0:        14.90 (-270.95%)	        3.99 (  0.58%)	        3.99 (  0.70%)	        3.60 ( 10.30%)	        4.02
                   length=29, char=0:        14.90 (-273.29%)	        3.99 (  0.00%)	        3.99 ( -0.06%)	        3.61 (  9.57%)	        3.99
                   length=30, char=0:        15.29 (-282.76%)	        3.99 (  0.09%)	        4.00 ( -0.06%)	        3.61 (  9.69%)	        3.99
                   length=30, char=0:        15.29 (-283.14%)	        4.00 ( -0.15%)	        3.99 (  0.00%)	        3.61 (  9.48%)	        3.99
                   length=31, char=0:        15.68 (-293.14%)	        3.99 ( -0.12%)	        3.99 ( -0.12%)	        3.61 (  9.58%)	        3.99
                   length=31, char=0:        15.69 (-293.36%)	        3.99 ( -0.15%)	        3.99 (  0.03%)	        3.61 (  9.52%)	        3.99
                   length=32, char=0:        16.07 (-345.22%)	        3.62 ( -0.20%)	        3.61 (  0.00%)	        3.23 ( 10.62%)	        3.61
                   length=32, char=0:        16.10 (-346.63%)	        3.62 ( -0.30%)	        3.61 ( -0.03%)	        3.22 ( 10.74%)	        3.60
                   length=64, char=0:        28.49 (-691.26%)	        3.60 ( -0.10%)	        3.61 ( -0.34%)	        3.61 ( -0.20%)	        3.60
                   length=64, char=0:        28.49 (-690.31%)	        3.60 (  0.00%)	        3.61 ( -0.10%)	        3.61 ( -0.03%)	        3.60
                   length=96, char=0:        55.09 (-1427.24%)	        3.61 ( -0.14%)	        3.60 (  0.07%)	        3.24 ( 10.12%)	        3.61
                   length=96, char=0:        51.76 (-1334.40%)	        3.61 (  0.00%)	        3.60 (  0.14%)	        3.23 ( 10.52%)	        3.61
                  length=128, char=0:        64.00 (-1501.44%)	        4.00 (  0.03%)	        3.63 (  9.25%)	        4.00 ( -0.18%)	        4.00
                  length=128, char=0:        64.72 (-1519.77%)	        4.01 ( -0.24%)	        3.65 (  8.77%)	        3.99 (  0.03%)	        4.00
                  length=160, char=0:        76.39 (-1365.53%)	        5.21 (  0.02%)	        5.64 ( -8.22%)	        4.39 ( 15.71%)	        5.21
                  length=160, char=0:        76.39 (-1367.17%)	        5.21 ( -0.14%)	        5.64 ( -8.25%)	        4.38 ( 15.87%)	        5.21
                  length=192, char=0:        88.72 (-1603.26%)	        5.22 ( -0.16%)	        5.18 (  0.47%)	        5.34 ( -2.60%)	        5.21
                  length=192, char=0:        89.05 (-1608.39%)	        5.23 ( -0.30%)	        5.18 (  0.61%)	        5.34 ( -2.51%)	        5.21
                  length=224, char=0:        95.34 (-471.58%)	       16.68 ( -0.01%)	        6.44 ( 61.37%)	        5.99 ( 64.08%)	       16.68
                  length=224, char=0:        95.34 (-470.55%)	       16.68 (  0.17%)	        6.44 ( 61.47%)	        5.98 ( 64.21%)	       16.71
                  length=256, char=0:       107.46 (-1175.45%)	        8.42 (  0.06%)	        6.69 ( 20.62%)	        6.88 ( 18.33%)	        8.43
                  length=256, char=0:       107.99 (-1182.66%)	        8.44 ( -0.28%)	        6.69 ( 20.57%)	        6.88 ( 18.27%)	        8.42
                  length=288, char=0:       120.03 (-1325.00%)	        8.42 ( -0.01%)	        7.55 ( 10.30%)	        7.53 ( 10.61%)	        8.42
                  length=288, char=0:       120.03 (-1324.60%)	        8.42 (  0.07%)	        7.55 ( 10.33%)	        7.52 ( 10.75%)	        8.43
                  length=320, char=0:       132.58 (-1471.50%)	        8.43 (  0.03%)	        8.24 (  2.37%)	        8.42 (  0.17%)	        8.44
                  length=320, char=0:       132.09 (-1465.48%)	        8.44 (  0.01%)	        8.22 (  2.55%)	        8.42 (  0.25%)	        8.44
                  length=352, char=0:       144.77 (-729.18%)	       10.62 ( 39.17%)	        9.10 ( 47.87%)	        9.07 ( 48.04%)	       17.46
                  length=352, char=0:       144.77 (-729.17%)	        9.96 ( 42.95%)	        9.09 ( 47.95%)	        9.06 ( 48.10%)	       17.46
                  length=384, char=0:       156.81 (-797.96%)	        9.95 ( 43.00%)	        9.77 ( 44.07%)	        9.96 ( 42.97%)	       17.46
                  length=384, char=0:       157.56 (-802.27%)	        9.97 ( 42.89%)	        9.77 ( 44.06%)	        9.96 ( 42.96%)	       17.46
                  length=416, char=0:       169.38 (-774.13%)	       17.46 (  9.91%)	       10.64 ( 45.09%)	       10.81 ( 44.20%)	       19.38
                  length=416, char=0:       169.58 (-830.31%)	       17.47 (  4.17%)	       10.64 ( 41.61%)	       10.81 ( 40.70%)	       18.23
                  length=448, char=0:       185.67 (-857.80%)	       17.45 (  9.97%)	       11.31 ( 41.65%)	       11.54 ( 40.45%)	       19.38
                  length=448, char=0:       182.09 (-839.56%)	       17.47 (  9.86%)	       11.30 ( 41.70%)	       11.49 ( 40.69%)	       19.38
                  length=480, char=0:       194.00 (-1456.58%)	       13.42 ( -7.69%)	       12.18 (  2.25%)	       12.23 (  1.89%)	       12.46
                  length=480, char=0:       194.25 (-1411.94%)	       13.43 ( -4.53%)	       12.19 (  5.15%)	       12.13 (  5.56%)	       12.85
---
 sysdeps/aarch64/multiarch/Makefile          |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   1 +
 sysdeps/aarch64/multiarch/memset.c          |   5 +-
 sysdeps/aarch64/multiarch/memset_kunpeng.S  | 124 ++++++++++++++++++++++++++++
 4 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset_kunpeng.S

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a902..8378107c78e 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,7 +1,7 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
-		   memset_generic memset_falkor memset_emag \
+		   memset_generic memset_falkor memset_emag memset_kunpeng \
 		   memchr_generic memchr_nosimd \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e58..bcbd90d0c41 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 		 can do a comparative analysis with __memset_generic.  */
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
   IFUNC_IMPL (i, name, memchr,
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index e9cdd385f26..4cc34b9b99a 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -30,10 +30,13 @@ extern __typeof (__redirect_memset) __libc_memset;
 
 extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
 extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
+extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
 libc_ifunc (__libc_memset,
-	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+	    IS_KUNPENG (midr)
+	    ?__memset_kunpeng
+	    : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
 	     ? __memset_falkor
 	     : (IS_EMAG (midr) && zva_size == 64
 	       ? __memset_emag
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
new file mode 100644
index 00000000000..a862b13045f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -0,0 +1,124 @@
+/* Optimized memset for Huawei Kunpeng processor.
+   Copyright (C) 2012-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+#if IS_IN (libc)
+# define MEMSET __memset_kunpeng
+
+/* memset tuned for Kunpeng.  Counts of 0..127 bytes use straight-line,
+ * possibly overlapping stores; 128 bytes and up use an unrolled loop.
+ *
+ * Assumptions: ARMv8-a, AArch64, unaligned accesses
+ */
+
+ENTRY_ALIGN (MEMSET, 6)
+
+	DELOUSE (0)
+	DELOUSE (2)
+
+	dup	v0.16B, valw		/* Fill byte in all 16 lanes of q0.  */
+	add	dstend, dstin, count	/* One past the last byte to set.  */
+
+	cmp	count, 128
+	b.hs	L(set_long)		/* 128 bytes or more.  */
+
+	cmp	count, 16
+	b.lo	L(less16)		/* Fewer than 16 bytes.  */
+
+	/* Set 16..127 bytes.  */
+	str	q0, [dstin]		/* First 16 bytes.  */
+	tbnz	count, 6, L(set112)	/* Bit 6 set: count is 64..127.  */
+	str	q0, [dstend, -16]	/* Last 16 bytes.  */
+	tbz	count, 5, 1f		/* Bit 5 clear: 16..31, done.  */
+	str	q0, [dstin, 16]		/* 32..63: second 16 bytes ...  */
+	str	q0, [dstend, -32]	/* ... and second-to-last 16.  */
+1:	ret
+
+	.p2align 4
+	/* Set 64..127 bytes.  Fill up to offset 64 or 96 from dstin rounded
+	   down to 16, then the last 32 bytes (48 if dstin is unaligned).  */
+L(set112):
+	ands	tmp1, dstin, 15		/* Is dstin 16-byte aligned?  */
+	bne	2f			/* No: take the path at 2.  */
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]	/* Bytes 0..63 are now set.  */
+	tbz	count, 5, 1f		/* Bit 5 clear: count is 64..95.  */
+	stp	q0, q0, [dstin, 64]	/* 96..127: bytes 64..95.  */
+1:	stp	q0, q0, [dstend, -32]	/* Last 32 bytes.  */
+	ret
+	.p2align 4
+2:	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
+	stp	q0,q0, [dst, 16]	/* Aligned stores: dst+16..dst+47.  */
+	str	q0, [dst, 48]		/* dst+48..dst+63.  */
+	tbz	count, 5, 3f		/* Bit 5 clear: count is 64..95.  */
+	stp	q0, q0, [dst, 64]	/* 96..127: dst+64..dst+95.  */
+3:	stp	q0, q0, [dstend, -48]	/* Last 48 bytes, in two stores.  */
+	str	q0, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Set 0..15 bytes.  */
+L(less16):
+	tbz	count, 3, L(less8)	/* Bit 3 clear: count is 0..7.  */
+	str	d0, [dstin]		/* 8..15: first 8 bytes ...  */
+	str	d0, [dstend, -8]	/* ... and last 8 (may overlap).  */
+	ret
+L(less8):
+	tbz	count, 2, 2f		/* Bit 2 clear: count is 0..3.  */
+	str	s0, [dstin]		/* 4..7: first 4 bytes ...  */
+	str	s0, [dstend, -4]	/* ... and last 4 (may overlap).  */
+	ret
+2:	cbz	count, 3f		/* Nothing to store for count 0.  */
+	str	b0, [dstin]		/* 1..3: first byte.  */
+	tbz	count, 1, 3f		/* Bit 1 clear: count is 1, done.  */
+	str	h0, [dstend, -2]	/* 2..3: last two bytes.  */
+3:	ret
+	
+	.p2align 4
+L(set_long):
+	and	valw, valw, 255		/* NOTE(review): valw unused below.  */
+	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
+	str	q0, [dstin]		/* First 16 bytes, maybe unaligned.  */
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16 + 1 /* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst, 32]	/* Loop unrolled three times.  */
+	stp	q0, q0, [dst, 64]!	/* 64 bytes per step; dst += 64.  */
+	subs	count, count, 64
+	b.lo	1f			/* Borrow: at most 64 bytes left.  */
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.lo	1f			/* Borrow: at most 64 bytes left.  */
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hs	1b			/* No borrow: keep looping.  */
+
+1:	tbz	count, 5, 2f		/* Bit 5 clear: at most 32 left.  */
+	str	q0, [dst, 32]		/* 33..64 left: 32 more bytes.  */
+	str	q0, [dst, 48]
+2:	stp	q0, q0, [dstend, -32]	/* Last 32 bytes.  */
+	ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+#endif
-- 
2.14.1.windows.1



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]