Re: [PATCH] aarch64: optimized memcpy implementation for thunderx2
- From: Anton Youdkevitch <anton dot youdkevitch at bell-sw dot com>
- To: Siddhesh Poyarekar <siddhesh at gotplt dot org>, Steve Ellcey <sellcey at cavium dot com>, libc-alpha at sourceware dot org
- Date: Mon, 1 Oct 2018 19:22:19 +0300
- Subject: Re: [PATCH] aarch64: optimized memcpy implementation for thunderx2
- References: <2063a582-d65f-9e9f-50f5-80e4502edbd8@gotplt.org> <1538408223.18948.85.camel@cavium.com> <0899c6de-9462-8cca-5283-adc263d4b650@gotplt.org>
Below is the benchmark data along with the slightly
modified implementation.
On Mon, Oct 01, 2018 at 09:11:47PM +0530, Siddhesh Poyarekar wrote:
> On 01/10/18 9:07 PM, Steve Ellcey wrote:
> >Anton is doing this work under a contract with Cavium.  Cavium has a
> >corporate copyright assignment on file with FSF so I am hoping that
> >is sufficient for Anton to make contributions.  I don't have my login
> >to fencepost anymore so I can't double check the specifics of the
> >Cavium copyright assignment myself but my understanding is that it is
> >a general one that does not restrict contributions to be from just
> >a list of specific people.
>
> Thanks for the clarification Steve. That should be sufficient assuming that
> Anton's work is owned by Cavium.
>
> As for the patch itself, I'll leave it to you for a deeper review since
> you're the better judge for tx2 performance. It would still be nice to see
> the results from memcpy-walk and not just memcpy-large, assuming that's what
> Anton tested with.
>
> The patch needs a ChangeLog too.
>
> Thanks,
> Siddhesh
memcpy-large, with the original T2 implementation as the baseline:
Length: Old: New:
65543: 3592.50 3432.19 ( 4.46%)
65551: 6554.06 4035.31 ( 38.43%)
65567: 6563.75 4028.75 ( 38.62%)
65599: 6727.19 4028.44 ( 40.12%)
131079: 7289.06 7002.81 ( 3.93%)
131087: 12587.80 8230.94 ( 34.61%)
131103: 12563.10 8226.56 ( 34.52%)
131135: 12901.60 8247.50 ( 36.07%)
262151: 20497.20 19074.10 ( 6.94%)
262159: 25104.10 20833.80 ( 17.01%)
262175: 24629.10 20776.60 ( 15.64%)
262207: 25103.80 20922.50 ( 16.66%)
524295: 41298.10 38061.90 ( 7.84%)
524303: 50114.10 41634.70 ( 16.92%)
524319: 49232.80 41417.50 ( 15.87%)
524351: 50357.20 41590.00 ( 17.41%)
1048583: 81622.80 75968.80 ( 6.93%)
1048591: 100746.00 82978.10 ( 17.64%)
1048607: 98254.70 82763.40 ( 15.77%)
1048639: 100326.00 83549.40 ( 16.72%)
2097159: 163283.00 152319.00 ( 6.00%)
2097167: 201279.00 165965.00 ( 17.00%)
2097183: 196811.00 165500.00 ( 15.00%)
2097215: 200769.00 166158.00 ( 17.00%)
4194311: 326463.00 303574.00 ( 7.00%)
4194319: 401840.00 331810.00 ( 17.00%)
4194335: 394120.00 330547.00 ( 16.00%)
4194367: 401803.00 331893.00 ( 17.00%)
8388615: 652982.00 607675.00 ( 6.00%)
8388623: 802170.00 663562.00 ( 17.00%)
8388639: 786795.00 661482.00 ( 15.00%)
8388671: 802130.00 663625.00 ( 17.00%)
16777223: 1404170.00 1272160.00 ( 9.40%)
16777231: 1611260.00 1339180.00 ( 16.89%)
16777247: 1573870.00 1326400.00 ( 15.72%)
16777279: 1604780.00 1330200.00 ( 17.11%)
33554439: 3700280.00 3167250.00 ( 14.41%)
33554447: 4085260.00 3647450.00 ( 10.72%)
33554463: 4006190.00 3636820.00 ( 9.22%)
33554495: 4081450.00 3635760.00 ( 10.92%)
memcpy-walk, with the original T2 implementation as the baseline:
Length: Old: New:
128: 10.56 9.87 ( 6.59%)
144: 10.68 10.35 ( 3.13%)
129: 14.68 12.56 ( 14.47%)
143: 16.75 13.70 ( 18.18%)
130: 14.03 12.19 ( 13.08%)
142: 15.82 13.21 ( 16.50%)
131: 15.05 12.91 ( 14.21%)
141: 16.57 13.63 ( 17.72%)
132: 12.74 11.46 ( 10.05%)
140: 13.92 12.14 ( 12.79%)
133: 15.36 13.07 ( 14.88%)
139: 16.18 13.49 ( 16.63%)
134: 14.67 12.64 ( 13.86%)
138: 15.21 12.99 ( 14.59%)
135: 15.65 13.24 ( 15.43%)
137: 15.91 13.42 ( 15.63%)
256: 32.36 19.82 ( 38.75%)
272: 20.73 20.53 ( 0.97%)
257: 23.91 22.61 ( 5.46%)
271: 25.90 23.71 ( 8.46%)
258: 23.41 21.59 ( 7.77%)
270: 25.10 22.83 ( 9.03%)
259: 23.97 22.08 ( 7.90%)
269: 25.44 23.17 ( 8.89%)
260: 22.24 21.08 ( 5.22%)
268: 23.47 21.72 ( 7.45%)
261: 24.35 22.12 ( 9.14%)
267: 25.22 23.18 ( 8.07%)
262: 23.65 21.83 ( 7.72%)
266: 25.04 22.17 ( 11.45%)
263: 24.60 22.29 ( 9.41%)
265: 25.13 22.53 ( 10.34%)
512: 60.10 39.10 ( 34.94%)
528: 38.97 38.98 ( -0.03%)
513: 43.74 41.41 ( 5.31%)
527: 45.35 42.24 ( 6.85%)
514: 43.55 41.21 ( 5.38%)
526: 44.93 41.88 ( 6.79%)
515: 44.34 41.69 ( 5.98%)
525: 45.37 42.39 ( 6.56%)
516: 42.98 41.34 ( 3.81%)
524: 43.61 41.62 ( 4.57%)
517: 44.76 41.70 ( 6.85%)
523: 45.29 42.03 ( 7.19%)
518: 43.98 41.17 ( 6.39%)
522: 44.47 41.37 ( 6.96%)
519: 44.62 41.90 ( 6.10%)
521: 44.82 41.84 ( 6.65%)
1024: 92.67 78.24 ( 15.57%)
1040: 75.14 75.48 ( -0.46%)
1025: 82.23 78.58 ( 4.44%)
1039: 80.86 78.48 ( 2.95%)
1026: 81.74 78.61 ( 3.83%)
1038: 81.07 78.64 ( 3.00%)
1027: 82.62 78.24 ( 5.30%)
1037: 81.87 78.26 ( 4.41%)
1028: 81.50 76.85 ( 5.71%)
1036: 80.41 77.37 ( 3.78%)
1029: 82.85 77.57 ( 6.38%)
1035: 81.88 77.52 ( 5.33%)
1030: 82.27 76.73 ( 6.73%)
1034: 80.61 76.51 ( 5.08%)
1031: 82.95 77.28 ( 6.84%)
1033: 82.22 77.46 ( 5.79%)
2048: 162.40 143.79 ( 11.46%)
2064: 141.59 139.96 ( 1.16%)
2049: 151.28 143.47 ( 5.16%)
2063: 147.76 143.20 ( 3.09%)
2050: 150.12 143.27 ( 4.57%)
2062: 147.95 142.99 ( 3.35%)
2051: 150.78 143.21 ( 5.02%)
2061: 149.21 142.89 ( 4.24%)
2052: 149.21 141.79 ( 4.97%)
2060: 147.81 142.17 ( 3.82%)
2053: 151.31 142.50 ( 5.82%)
2059: 149.39 142.10 ( 4.88%)
2054: 150.49 141.93 ( 5.69%)
2058: 149.70 141.49 ( 5.49%)
2055: 150.97 142.20 ( 5.81%)
2057: 149.50 142.35 ( 4.79%)
4096: 303.24 281.71 ( 7.10%)
4112: 276.36 272.07 ( 1.55%)
4097: 289.49 279.64 ( 3.40%)
4111: 285.08 278.72 ( 2.23%)
4098: 288.09 279.75 ( 2.90%)
4110: 285.33 277.88 ( 2.61%)
4099: 288.90 279.66 ( 3.20%)
4109: 285.42 277.08 ( 2.92%)
4100: 286.11 276.89 ( 3.22%)
4108: 285.30 275.74 ( 3.35%)
4101: 287.57 276.41 ( 3.88%)
4107: 287.46 275.66 ( 4.11%)
4102: 287.98 274.84 ( 4.56%)
4106: 287.78 276.39 ( 3.96%)
4103: 287.88 275.74 ( 4.21%)
4105: 286.37 275.38 ( 3.84%)
8192: 584.91 546.80 ( 6.52%)
8208: 567.20 548.22 ( 3.35%)
8193: 570.20 550.95 ( 3.38%)
8207: 575.19 552.11 ( 4.01%)
8194: 570.07 552.84 ( 3.02%)
8206: 574.37 551.27 ( 4.02%)
8195: 571.65 550.30 ( 3.74%)
8205: 573.86 550.05 ( 4.15%)
8196: 569.58 550.20 ( 3.40%)
8204: 571.56 548.25 ( 4.08%)
8197: 571.27 549.32 ( 3.84%)
8203: 574.61 549.93 ( 4.29%)
8198: 574.07 549.76 ( 4.23%)
8202: 574.54 548.60 ( 4.51%)
8199: 574.75 550.24 ( 4.26%)
8201: 572.30 549.55 ( 3.97%)
16384: 1119.46 1071.00 ( 4.33%)
16400: 1103.77 1074.23 ( 2.68%)
16385: 1105.45 1078.87 ( 2.40%)
16399: 1106.27 1080.64 ( 2.32%)
16386: 1103.68 1075.02 ( 2.60%)
16398: 1105.78 1073.91 ( 2.88%)
16387: 1105.48 1074.46 ( 2.81%)
16397: 1111.44 1074.85 ( 3.29%)
16388: 1108.83 1074.96 ( 3.05%)
16396: 1110.16 1075.69 ( 3.10%)
16389: 1104.51 1073.80 ( 2.78%)
16395: 1105.41 1075.22 ( 2.73%)
16390: 1104.02 1082.29 ( 1.97%)
16394: 1104.55 1079.86 ( 2.24%)
16391: 1104.67 1079.74 ( 2.26%)
16393: 1106.43 1075.68 ( 2.78%)
32768: 2166.84 2120.03 ( 2.16%)
32784: 2151.61 2123.45 ( 1.31%)
32769: 2159.75 2120.54 ( 1.82%)
32783: 2161.50 2122.79 ( 1.79%)
32770: 2150.59 2123.10 ( 1.28%)
32782: 2151.40 2122.74 ( 1.33%)
32771: 2154.45 2135.24 ( 0.89%)
32781: 2150.22 2132.97 ( 0.80%)
32772: 2151.94 2125.86 ( 1.21%)
32780: 2152.97 2125.27 ( 1.29%)
32773: 2152.49 2123.85 ( 1.33%)
32779: 2163.28 2124.11 ( 1.81%)
32774: 2162.51 2122.53 ( 1.85%)
32778: 2151.93 2123.59 ( 1.32%)
32775: 2151.68 2123.54 ( 1.31%)
32777: 2149.72 2132.82 ( 0.79%)
65536: 4295.72 4239.57 ( 1.31%)
65552: 4283.87 4222.39 ( 1.44%)
65537: 4274.55 4214.65 ( 1.40%)
65551: 4280.03 4221.12 ( 1.38%)
65538: 4297.00 4219.75 ( 1.80%)
65550: 4315.21 4219.55 ( 2.22%)
65539: 4278.71 4220.25 ( 1.37%)
65549: 4277.82 4222.09 ( 1.30%)
65540: 4279.59 4245.00 ( 0.81%)
65548: 4281.26 4247.10 ( 0.80%)
65541: 4280.69 4224.98 ( 1.30%)
65547: 4279.46 4221.36 ( 1.36%)
65542: 4292.06 4232.92 ( 1.38%)
65546: 4299.94 4218.97 ( 1.88%)
65543: 4303.29 4223.02 ( 1.87%)
65545: 4280.18 4221.31 ( 1.38%)
131072: 8539.06 8407.54 ( 1.54%)
131088: 8531.11 8466.35 ( 0.76%)
131073: 8530.33 8455.33 ( 0.88%)
131087: 8533.24 8415.62 ( 1.38%)
131074: 8527.70 8412.99 ( 1.35%)
131086: 8533.75 8413.98 ( 1.40%)
131075: 8570.35 8412.75 ( 1.84%)
131085: 8575.33 8420.96 ( 1.80%)
131076: 8529.16 8414.49 ( 1.34%)
131084: 8530.96 8450.14 ( 0.95%)
131077: 8527.81 8455.23 ( 0.85%)
131083: 8530.29 8412.71 ( 1.38%)
131078: 8530.68 8415.02 ( 1.36%)
131082: 8526.46 8412.15 ( 1.34%)
131079: 8573.24 8415.12 ( 1.84%)
131081: 8563.96 8409.38 ( 1.81%)
262144: 17040.60 16801.50 ( 1.40%)
262160: 17051.10 16815.00 ( 1.38%)
262145: 17047.60 16902.20 ( 0.85%)
262159: 17042.10 16893.20 ( 0.87%)
262146: 17039.40 16800.20 ( 1.40%)
262158: 17042.50 16807.00 ( 1.38%)
262147: 17038.40 16798.30 ( 1.41%)
262157: 17116.00 16808.10 ( 1.80%)
262148: 17109.80 16800.40 ( 1.81%)
262156: 17040.50 16807.30 ( 1.37%)
262149: 17029.20 16878.90 ( 0.88%)
262155: 17029.70 16886.00 ( 0.84%)
262150: 17035.20 16799.50 ( 1.38%)
262154: 17035.60 16803.10 ( 1.36%)
262151: 17037.70 16802.10 ( 1.38%)
262153: 17106.10 16812.70 ( 1.72%)
524288: 34204.90 33576.70 ( 1.84%)
524304: 34076.60 33594.10 ( 1.42%)
524289: 34072.60 33616.60 ( 1.34%)
524303: 34040.40 33733.00 ( 0.90%)
524290: 34064.90 33754.80 ( 0.91%)
524302: 34059.30 33602.60 ( 1.34%)
524291: 34077.50 33598.00 ( 1.41%)
524301: 34060.20 33585.50 ( 1.39%)
524292: 34220.10 33601.80 ( 1.81%)
524300: 34051.60 33600.30 ( 1.33%)
524293: 34071.20 33594.80 ( 1.40%)
524299: 34054.70 33740.30 ( 0.92%)
524294: 34067.20 33762.40 ( 0.89%)
524298: 34066.80 33597.30 ( 1.38%)
524295: 34049.90 33587.70 ( 1.36%)
524297: 34048.30 33578.80 ( 1.38%)
1048576: 68512.70 67215.50 ( 1.89%)
1048592: 68456.10 67137.00 ( 1.93%)
1048577: 68161.20 67191.10 ( 1.42%)
1048591: 68101.60 67146.60 ( 1.40%)
1048578: 68105.50 67527.30 ( 0.85%)
1048590: 68120.60 67532.50 ( 0.86%)
1048579: 68123.30 67158.90 ( 1.42%)
1048589: 68109.40 67128.10 ( 1.44%)
1048580: 68420.80 67141.20 ( 1.87%)
1048588: 68387.70 67105.90 ( 1.87%)
1048581: 68111.40 67163.60 ( 1.39%)
1048587: 68100.20 67156.40 ( 1.39%)
1048582: 68079.20 67464.20 ( 0.90%)
1048586: 68092.00 67579.80 ( 0.75%)
1048583: 68103.40 67150.20 ( 1.40%)
1048585: 68100.30 67154.80 ( 1.39%)
2097152: 135942.00 134168.00 ( 1.00%)
2097168: 136859.00 134261.00 ( 1.00%)
2097153: 137141.00 134278.00 ( 2.00%)
2097167: 136145.00 134289.00 ( 1.00%)
2097154: 136326.00 134327.00 ( 1.00%)
2097166: 136221.00 134941.00 ( 0.00%)
2097155: 136244.00 134299.00 ( 1.00%)
2097165: 136273.00 134367.00 ( 1.00%)
2097156: 136221.00 134286.00 ( 1.00%)
2097164: 136793.00 134281.00 ( 1.00%)
2097157: 136947.00 134346.00 ( 1.00%)
2097163: 136241.00 134288.00 ( 1.00%)
2097158: 136256.00 134288.00 ( 1.00%)
2097162: 136229.00 134982.00 ( 0.00%)
2097159: 136227.00 134913.00 ( 0.00%)
2097161: 136176.00 134265.00 ( 1.00%)
4194304: 271842.00 268390.00 ( 1.00%)
4194320: 272394.00 268479.00 ( 1.00%)
4194305: 274327.00 268534.00 ( 2.00%)
4194319: 273587.00 268584.00 ( 1.00%)
4194306: 272976.00 268614.00 ( 1.00%)
4194318: 272294.00 269878.00 ( 0.00%)
4194307: 272749.00 269894.00 ( 1.00%)
4194317: 272459.00 268622.00 ( 1.00%)
4194308: 272632.00 268565.00 ( 1.00%)
4194316: 272439.00 268609.00 ( 1.00%)
4194309: 273908.00 268734.00 ( 1.00%)
4194315: 273827.00 268492.00 ( 1.00%)
4194310: 272508.00 268573.00 ( 1.00%)
4194314: 272512.00 268566.00 ( 1.00%)
4194311: 272395.00 269761.00 ( 0.00%)
4194313: 272305.00 269994.00 ( 0.00%)
8388608: 543440.00 536600.00 ( 1.00%)
8388624: 544635.00 536972.00 ( 1.00%)
8388609: 548560.00 537180.00 ( 2.00%)
8388623: 547458.00 537086.00 ( 1.00%)
8388610: 545852.00 537179.00 ( 1.00%)
8388622: 544720.00 536976.00 ( 1.00%)
8388611: 545748.00 539724.00 ( 1.00%)
8388621: 544660.00 539579.00 ( 0.00%)
8388612: 545815.00 536950.00 ( 1.00%)
8388620: 544914.00 537276.00 ( 1.00%)
8388613: 545538.00 537080.00 ( 1.00%)
8388619: 549975.00 537315.00 ( 2.00%)
8388614: 548025.00 537379.00 ( 1.00%)
8388618: 544850.00 538511.00 ( 1.00%)
8388615: 546555.00 538226.00 ( 1.00%)
8388617: 544815.00 540246.00 ( 0.00%)
16777216: 1086650.00 1072510.00 ( 1.30%)
16777232: 1090790.00 1073660.00 ( 1.57%)
16777217: 1091930.00 1074220.00 ( 1.62%)
16777231: 1096050.00 1074300.00 ( 1.98%)
16777218: 1096680.00 1074520.00 ( 2.02%)
16777230: 1090850.00 1074100.00 ( 1.54%)
16777219: 1091580.00 1074500.00 ( 1.56%)
16777229: 1090860.00 1079600.00 ( 1.03%)
16777220: 1091380.00 1079380.00 ( 1.10%)
16777228: 1091350.00 1074350.00 ( 1.56%)
16777221: 1091300.00 1074090.00 ( 1.58%)
16777227: 1090770.00 1074680.00 ( 1.48%)
16777222: 1096710.00 1074380.00 ( 2.04%)
16777226: 1095200.00 1073920.00 ( 1.94%)
16777223: 1091620.00 1074550.00 ( 1.56%)
16777225: 1091020.00 1074570.00 ( 1.51%)
33554432: 2174840.00 2156700.00 ( 0.83%)
33554448: 2182480.00 2148180.00 ( 1.57%)
33554433: 2183340.00 2147960.00 ( 1.62%)
33554447: 2184400.00 2149360.00 ( 1.60%)
33554434: 2192560.00 2149040.00 ( 1.98%)
33554446: 2194490.00 2149140.00 ( 2.07%)
33554435: 2183870.00 2146940.00 ( 1.69%)
33554445: 2183470.00 2148720.00 ( 1.59%)
33554436: 2183370.00 2158320.00 ( 1.15%)
33554444: 2182820.00 2159310.00 ( 1.08%)
33554437: 2183770.00 2149160.00 ( 1.58%)
33554443: 2183190.00 2147080.00 ( 1.65%)
33554438: 2183250.00 2148280.00 ( 1.60%)
33554442: 2193750.00 2148190.00 ( 2.08%)
33554439: 2183280.00 2147770.00 ( 1.63%)
33554441: 2183830.00 2148430.00 ( 1.62%)
2018-10-01  Anton Youdkevitch  <anton.youdkevitch@bell-sw.com>

	* sysdeps/aarch64/multiarch/memcpy_thunderx.S: Remove the
	USE_THUNDERX2-specific code paths.
	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Rewrite the
	implementation taking the ThunderX2 chip specifics into account.
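For readers who want to sanity-check the small-size logic before diving
into the assembly, here is a minimal C sketch of the 0..16 byte path
(L(memcopy16)): two possibly overlapping accesses taken from both ends
of the buffer, and a branchless byte sequence for 0..3 bytes. The
function name is hypothetical and the memcpy calls merely model the
unaligned ldr/str pairs; this is an illustration, not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
copy16_sketch (char *dst, const char *src, size_t count)
{
  const char *srcend = src + count;
  char *dstend = dst + count;

  if (count >= 8)
    {
      /* 8..16 bytes: one 8-byte move from each end; the two moves
         overlap when count < 16.  */
      uint64_t a, b;
      memcpy (&a, src, 8);
      memcpy (&b, srcend - 8, 8);
      memcpy (dst, &a, 8);
      memcpy (dstend - 8, &b, 8);
    }
  else if (count & 4)
    {
      /* 4..7 bytes: the same trick with 4-byte moves.  */
      uint32_t a, b;
      memcpy (&a, src, 4);
      memcpy (&b, srcend - 4, 4);
      memcpy (dst, &a, 4);
      memcpy (dstend - 4, &b, 4);
    }
  else if (count != 0)
    {
      /* 1..3 bytes, branchless: bytes 0, count/2 and count-1 cover
         every case; count == 1 stores the same byte 3 times and
         count == 2 stores the 2nd byte twice.  */
      size_t mid = count >> 1;
      char a = src[0], m = src[mid], z = srcend[-1];
      dst[0] = a;
      dst[mid] = m;
      dstend[-1] = z;
    }
}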
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index de494d9..6000365 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
#if IS_IN (libc)
-# ifndef USE_THUNDERX2
# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
-# endif
ENTRY_ALIGN (MEMMOVE, 6)
@@ -182,8 +179,6 @@ L(copy96):
.p2align 4
L(copy_long):
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
/* On thunderx, large memcpy's are helped by software prefetching.
This loop is identical to the one below it but with prefetching
instructions included. For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
-# if defined(USE_THUNDERX)
prfm pldl1strm, [src, 384]
-# elif defined(USE_THUNDERX2)
- prfm pldl1strm, [src, 256]
-# endif
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
@@ -210,13 +201,9 @@ L(copy_long):
subs count, count, 128 + 16 /* Test and readjust count. */
L(prefetch_loop64):
-# if defined(USE_THUNDERX)
tbz src, #6, 1f
prfm pldl1strm, [src, 512]
1:
-# elif defined(USE_THUNDERX2)
- prfm pldl1strm, [src, 256]
-# endif
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
@@ -230,7 +217,6 @@ L(prefetch_loop64):
b L(last64)
L(copy_long_without_prefetch):
-# endif
and tmp1, dstin, 15
bic dst, dstin, 15
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index 8501abf..945d1e8 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -20,8 +20,997 @@
/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
The only real differences are with the prefetching instructions. */
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp2 x6
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define G_l count
+#define G_h dst
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+#define I_q q16
+#define J_q q17
+
+#define A_v v0
+#define B_v v1
+#define C_v v2
+#define D_v v3
+#define E_v v4
+#define F_v v5
+#define G_v v6
+#define H_v v7
+#define I_v v16
+#define J_v v17
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+#undef MEMCPY
+#undef MEMMOVE
#define MEMCPY __memcpy_thunderx2
#define MEMMOVE __memmove_thunderx2
-#define USE_THUNDERX2
-#include "memcpy_thunderx.S"
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled, and large
+ copies of more than 96 bytes which align the destination and use an
+ unrolled loop processing 64 bytes per iteration.
+ The current optimized memcpy implementation is not compatible with
+ memmove and is completely separate from it. See below.
+ Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
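+ /* For example, count==3 stores bytes 0, 1 and 2, while count==1
+ stores byte 0 three times. */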
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 65..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+
+
+/* The memcpy implementation below is not compatible with memmove
+ because of its pipelined loads and stores, which are faster but
+ cannot be used when the source and destination buffers overlap. */
+
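+/* Software prefetch distance: the copy loops below issue pldl1strm
+ prefetches this many bytes ahead of the current src. */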
+#define MEMCPY_PREFETCH_LDR 640
+
+ENTRY (MEMCPY)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ add srcend, src, count
+ cmp count, 16
+ b.ls L(memcopy16)
+ ldr A_q, [src], #16
+ add dstend, dstin, count
+ and tmp1, src, 15
+ cmp count, 96
+ b.hi L(memcopy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ ldr E_q, [srcend, -16]
+ cmp count, 64
+ b.gt L(memcpy_copy96)
+ cmp count, 48
+ b.le L(bytes_17_to_48)
+ /* 49..64 bytes */
+ ldp B_q, C_q, [src]
+ str E_q, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ str C_q, [dstin, 32]
+ ret
+
+L(bytes_17_to_48):
+ /* 17..48 bytes. */
+ cmp count, 32
+ b.gt L(bytes_32_to_48)
+ /* 17..32 bytes. */
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
+ ret
+
+L(bytes_32_to_48):
+ /* 32..48 bytes. */
+ ldr B_q, [src]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
+ str B_q, [dstin, 16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(memcopy16):
+ cmp count, 8
+ b.lo L(bytes_0_to_8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ add dstend, dstin, count
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+
+L(bytes_0_to_8):
+ tbz count, 2, L(bytes_0_to_3)
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ add dstend, dstin, count
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+L(bytes_0_to_3):
+ cbz count, L(end)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ add dstend, dstin, count
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+L(end): ret
+
+ .p2align 4
+
+L(memcpy_copy96):
+ /* Copy 65..96 bytes. A_q (the first 16 bytes) and
+ E_q (the last 16 bytes) are already loaded.
+
+ The size is large enough to benefit from aligned
+ loads. */
+ bic src, src, 15
+ ldp B_q, C_q, [src]
+ str A_q, [dstin]
+ /* 64 bytes have been loaded; the second 16-byte chunk
+ may overlap the first chunk by tmp1 bytes.
+ 16 bytes have been stored. */
+ sub dst, dstin, tmp1
+ add count, count, tmp1
+ /* count, which was in [65..96], is now in [65..111]
+ after tmp1 [0..15] has been added to it;
+ count is now <bytes-left-to-load> + 48. */
+ cmp count, 80
+ b.gt L(copy96_medium)
+ ldr D_q, [src, 32]
+ stp B_q, C_q, [dst, 16]
+ str E_q, [dstend, -16]
+ str D_q, [dst, 48]
+ ret
+
+ .p2align 4
+L(copy96_medium):
+ ldp D_q, A_q, [src, 32]
+ str B_q, [dst, 16]
+ cmp count, 96
+ b.gt L(copy96_large)
+ str E_q, [dstend, -16]
+ stp C_q, D_q, [dst, 32]
+ str A_q, [dst, 64]
+ ret
+
+L(copy96_large):
+ ldr F_q, [src, 64]
+ stp C_q, D_q, [dst, 32]
+ str E_q, [dstend, -16]
+ stp A_q, F_q, [dst, 64]
+ ret
+
+ .p2align 4
+L(memcopy_long):
+ bic src, src, 15
+ ldp B_q, C_q, [src], #32
+ str A_q, [dstin]
+ sub dst, dstin, tmp1
+ add count, count, tmp1
+ add dst, dst, 16
+ and tmp1, dst, 15
+ ldp D_q, E_q, [src], #32
+ str B_q, [dst], #16
+
+ /* 64 + 16 bytes have already been loaded. Check whether
+ at least 64 more bytes are left. */
+ subs count, count, 64+64+16
+ b.lt L(loop128_exit2)
+ cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
+ b.lt L(loop128)
+ cbnz tmp1, L(dst_unaligned)
+ sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+
+ .p2align 4
+
+L(loop128_prefetch):
+ str C_q, [dst], #16
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ str D_q, [dst], #16
+ ldp F_q, G_q, [src], #32
+ str E_q, [dst], #16
+ ldp H_q, A_q, [src], #32
+ str F_q, [dst], #16
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ str G_q, [dst], #16
+ ldp B_q, C_q, [src], #32
+ str H_q, [dst], #16
+ ldp D_q, E_q, [src], #32
+ stp A_q, B_q, [dst], #32
+ subs count, count, 128
+ b.ge L(loop128_prefetch)
+
+L(preloop128):
+ add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+ .p2align 4
+L(loop128):
+ ldp F_q, G_q, [src], #32
+ str C_q, [dst], #16
+ ldp B_q, A_q, [src], #32
+ str D_q, [dst], #16
+ stp E_q, F_q, [dst], #32
+ stp G_q, B_q, [dst], #32
+ subs count, count, 64
+ b.lt L(loop128_exit1)
+L(loop128_proceed):
+ ldp B_q, C_q, [src], #32
+ str A_q, [dst], #16
+ ldp D_q, E_q, [src], #32
+ str B_q, [dst], #16
+ subs count, count, 64
+ b.ge L(loop128)
+
+ .p2align 4
+L(loop128_exit2):
+ stp C_q, D_q, [dst], #32
+ str E_q, [dst], #16
+ b L(copy_long_check32)
+
+L(loop128_exit1):
+ /* A_q has not been stored yet and 0..63 bytes are left,
+ so count is in -64..-1.
+ Check whether fewer than 32 bytes are left (count < -32). */
+ str A_q, [dst], #16
+L(copy_long_check32):
+ cmn count, 64
+ b.eq L(copy_long_done)
+ cmn count, 32
+ b.le L(copy_long_last32)
+ ldp B_q, C_q, [src]
+ stp B_q, C_q, [dst]
+
+L(copy_long_last32):
+ ldp F_q, G_q, [srcend, -32]
+ stp F_q, G_q, [dstend, -32]
+
+L(copy_long_done):
+ ret
+
+L(dst_unaligned):
+ /* For the unaligned store case the code loads two
+ aligned chunks and then merges them using the ext
+ instruction. This can be up to 30% faster than
+ a simple unaligned store.
+
+ Current state: tmp1 = dst % 16; C_q, D_q, E_q
+ contain data yet to be stored; src and dst point
+ to the next data to be processed; A_q, B_q contain
+ data already stored earlier; count = bytes left to
+ be loaded, decremented by 64.
+
+ Control is passed here if at least 64 bytes are left
+ to be loaded. The code does two aligned loads and then
+ extracts (16-tmp1) bytes from the first register and
+ tmp1 bytes from the next register, forming the value
+ for the aligned store.
+
+ As the ext instruction can only have its index encoded
+ as an immediate, 15 code chunks process each possible
+ index value. A computed goto is used to reach the
+ required chunk. */
+
+ /* Store the 16 bytes to dst and align dst for further
+ operations; several bytes will be stored at this
+ address once more. */
+ str C_q, [dst], #16
+ ldp F_q, G_q, [src], #32
+ bic dst, dst, 15
+ adr tmp2, L(load_and_merge)
+ add tmp2, tmp2, tmp1, LSL 7
+ sub tmp2, tmp2, 128
+ br tmp2
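+ /* Each of the 15 merge chunks below is 128-byte aligned
+ (.p2align 7) and handles one value of tmp1 (1..15), so the
+ required chunk lives at L(load_and_merge) + (tmp1 - 1) * 128,
+ which is the address computed into tmp2 above. */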
+
+.p2align 7
+L(load_and_merge):
+#define EXT_SIZE 1
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 2
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 3
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 4
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 5
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 6
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 7
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 8
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 9
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 10
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 11
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 12
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 13
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 14
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 15
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+ subs count, count, 32
+ b.ge 2f
+1:
+ stp A_q, B_q, [dst], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ stp H_q, I_q, [dst], #16
+ add dst, dst, tmp1
+ str G_q, [dst], #16
+ b L(copy_long_check32)
+2:
+ stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp D_q, J_q, [src], #32
+ ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+ ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+ mov C_v.16b, G_v.16b
+ stp H_q, I_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+ ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+ mov E_v.16b, J_v.16b
+ subs count, count, 64
+ b.ge 2b
+ b 1b
+#undef EXT_SIZE
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+#endif
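For reference, the byte shuffle each of the 15 chunks performs can be
modelled in C as below. ext16 is a hypothetical helper, not glibc code:
it mimics "ext Vd.16b, Vn.16b, Vm.16b, #q", which produces the 16 bytes
starting at byte q of the concatenated pair (Vn, Vm). The chunks use
q = 16 - tmp1, keeping the top tmp1 bytes of the previous aligned block
and taking the remaining 16 - tmp1 bytes from the next one.

#include <stdint.h>

static void
ext16 (uint8_t out[16], const uint8_t vn[16], const uint8_t vm[16],
       unsigned q)
{
  /* out = bytes q..15 of vn followed by bytes 0..q-1 of vm,
     i.e. (vm:vn) >> (8 * q) viewed as a 256-bit value.  */
  for (unsigned i = 0; i < 16; i++)
    out[i] = (i + q < 16) ? vn[i + q] : vm[i + q - 16];
}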