sourceware.org Git - glibc.git/commitdiff
powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
author Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Wed, 25 Oct 2017 15:13:53 +0000 (13:13 -0200)
committer Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Wed, 25 Oct 2017 15:14:30 +0000 (13:14 -0200)
POWER9 DD2.1 and earlier revisions have an issue where some cache-inhibited
vector loads trap to the kernel, causing a performance degradation.  To
handle this in memcpy and memmove, lvx/stvx is used for aligned
addresses instead of lxvd2x/stxvd2x.
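
For illustration only (not part of the patch), a minimal sketch of one
aligned 16-byte copy step in the style of the loops below, assuming r4
holds a 16-byte-aligned source pointer, r11 a 16-byte-aligned destination
pointer, and vector register 6 is free as scratch.  Unlike lxvd2x/stxvd2x,
the VMX lvx/stvx instructions ignore the low four bits of the effective
address, so this substitution is only valid on the aligned paths touched
by this patch:

	lvx	6,0,r4		/* Load 16 bytes from the aligned address in r4.  */
	stvx	6,0,r11		/* Store those 16 bytes to the aligned address in r11.  */
	addi	r4,r4,16	/* Advance the source pointer.  */
	addi	r11,r11,16	/* Advance the destination pointer.  */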

Reference: https://patchwork.ozlabs.org/patch/814059/

* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
lxvd2x/stxvd2x with lvx/stvx.
* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
ChangeLog
sysdeps/powerpc/powerpc64/power7/memcpy.S
sysdeps/powerpc/powerpc64/power7/memmove.S

index bb29b732d50c6d9bbe3b476f24e88222c6ecd1e7..76c592e5d336e3e150978c3eddad7d3148b91a76 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-10-25  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+
+       * sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
+       lxvd2x/stxvd2x with lvx/stvx.
+       * sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
+
 2017-10-25  H.J. Lu  <hongjiu.lu@intel.com>
 
        * include/alloc_buffer.h: Replace "if if " with "if " in
index 1ccbc2e1535b3ee4de12a3e41dc75a969ef5099b..a7cdf8b326955f280b0af9b7823b016e5d3257d9 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -91,63 +91,63 @@ L(aligned_copy):
        srdi    12,cnt,7
        cmpdi   12,0
        beq     L(aligned_tail)
-       lxvd2x  6,0,src
-       lxvd2x  7,src,6
+       lvx     6,0,src
+       lvx     7,src,6
        mtctr   12
        b       L(aligned_128loop)
 
        .align  4
 L(aligned_128head):
        /* for the 2nd + iteration of this loop. */
-       lxvd2x  6,0,src
-       lxvd2x  7,src,6
+       lvx     6,0,src
+       lvx     7,src,6
 L(aligned_128loop):
-       lxvd2x  8,src,7
-       lxvd2x  9,src,8
-       stxvd2x 6,0,dst
+       lvx     8,src,7
+       lvx     9,src,8
+       stvx    6,0,dst
        addi    src,src,64
-       stxvd2x 7,dst,6
-       stxvd2x 8,dst,7
-       stxvd2x 9,dst,8
-       lxvd2x  6,0,src
-       lxvd2x  7,src,6
+       stvx    7,dst,6
+       stvx    8,dst,7
+       stvx    9,dst,8
+       lvx     6,0,src
+       lvx     7,src,6
        addi    dst,dst,64
-       lxvd2x  8,src,7
-       lxvd2x  9,src,8
+       lvx     8,src,7
+       lvx     9,src,8
        addi    src,src,64
-       stxvd2x 6,0,dst
-       stxvd2x 7,dst,6
-       stxvd2x 8,dst,7
-       stxvd2x 9,dst,8
+       stvx    6,0,dst
+       stvx    7,dst,6
+       stvx    8,dst,7
+       stvx    9,dst,8
        addi    dst,dst,64
        bdnz    L(aligned_128head)
 
 L(aligned_tail):
        mtocrf  0x01,cnt
        bf      25,32f
-       lxvd2x  6,0,src
-       lxvd2x  7,src,6
-       lxvd2x  8,src,7
-       lxvd2x  9,src,8
+       lvx     6,0,src
+       lvx     7,src,6
+       lvx     8,src,7
+       lvx     9,src,8
        addi    src,src,64
-       stxvd2x 6,0,dst
-       stxvd2x 7,dst,6
-       stxvd2x 8,dst,7
-       stxvd2x 9,dst,8
+       stvx    6,0,dst
+       stvx    7,dst,6
+       stvx    8,dst,7
+       stvx    9,dst,8
        addi    dst,dst,64
 32:
        bf      26,16f
-       lxvd2x  6,0,src
-       lxvd2x  7,src,6
+       lvx     6,0,src
+       lvx     7,src,6
        addi    src,src,32
-       stxvd2x 6,0,dst
-       stxvd2x 7,dst,6
+       stvx    6,0,dst
+       stvx    7,dst,6
        addi    dst,dst,32
 16:
        bf      27,8f
-       lxvd2x  6,0,src
+       lvx     6,0,src
        addi    src,src,16
-       stxvd2x 6,0,dst
+       stvx    6,0,dst
        addi    dst,dst,16
 8:
        bf      28,4f
index 93baa69ee26c61328384dbba3d661eb072248506..667c6e20925ec327fafcb45d887acbe7b4207687 100644
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -92,63 +92,63 @@ L(aligned_copy):
        srdi    12,r5,7
        cmpdi   12,0
        beq     L(aligned_tail)
-       lxvd2x  6,0,r4
-       lxvd2x  7,r4,6
+       lvx     6,0,r4
+       lvx     7,r4,6
        mtctr   12
        b       L(aligned_128loop)
 
        .align  4
 L(aligned_128head):
        /* for the 2nd + iteration of this loop. */
-       lxvd2x  6,0,r4
-       lxvd2x  7,r4,6
+       lvx     6,0,r4
+       lvx     7,r4,6
 L(aligned_128loop):
-       lxvd2x  8,r4,7
-       lxvd2x  9,r4,8
-       stxvd2x 6,0,r11
+       lvx     8,r4,7
+       lvx     9,r4,8
+       stvx    6,0,r11
        addi    r4,r4,64
-       stxvd2x 7,r11,6
-       stxvd2x 8,r11,7
-       stxvd2x 9,r11,8
-       lxvd2x  6,0,r4
-       lxvd2x  7,r4,6
+       stvx    7,r11,6
+       stvx    8,r11,7
+       stvx    9,r11,8
+       lvx     6,0,r4
+       lvx     7,r4,6
        addi    r11,r11,64
-       lxvd2x  8,r4,7
-       lxvd2x  9,r4,8
+       lvx     8,r4,7
+       lvx     9,r4,8
        addi    r4,r4,64
-       stxvd2x 6,0,r11
-       stxvd2x 7,r11,6
-       stxvd2x 8,r11,7
-       stxvd2x 9,r11,8
+       stvx    6,0,r11
+       stvx    7,r11,6
+       stvx    8,r11,7
+       stvx    9,r11,8
        addi    r11,r11,64
        bdnz    L(aligned_128head)
 
 L(aligned_tail):
        mtocrf  0x01,r5
        bf      25,32f
-       lxvd2x  6,0,r4
-       lxvd2x  7,r4,6
-       lxvd2x  8,r4,7
-       lxvd2x  9,r4,8
+       lvx     6,0,r4
+       lvx     7,r4,6
+       lvx     8,r4,7
+       lvx     9,r4,8
        addi    r4,r4,64
-       stxvd2x 6,0,r11
-       stxvd2x 7,r11,6
-       stxvd2x 8,r11,7
-       stxvd2x 9,r11,8
+       stvx    6,0,r11
+       stvx    7,r11,6
+       stvx    8,r11,7
+       stvx    9,r11,8
        addi    r11,r11,64
 32:
        bf      26,16f
-       lxvd2x  6,0,r4
-       lxvd2x  7,r4,6
+       lvx     6,0,r4
+       lvx     7,r4,6
        addi    r4,r4,32
-       stxvd2x 6,0,r11
-       stxvd2x 7,r11,6
+       stvx    6,0,r11
+       stvx    7,r11,6
        addi    r11,r11,32
 16:
        bf      27,8f
-       lxvd2x  6,0,r4
+       lvx     6,0,r4
        addi    r4,r4,16
-       stxvd2x 6,0,r11
+       stvx    6,0,r11
        addi    r11,r11,16
 8:
        bf      28,4f
@@ -488,63 +488,63 @@ L(aligned_copy_bwd):
        srdi    r12,r5,7
        cmpdi   r12,0
        beq     L(aligned_tail_bwd)
-       lxvd2x  v6,r4,r6
-       lxvd2x  v7,r4,r7
+       lvx     v6,r4,r6
+       lvx     v7,r4,r7
        mtctr   12
        b       L(aligned_128loop_bwd)
 
        .align  4
 L(aligned_128head_bwd):
        /* for the 2nd + iteration of this loop. */
-       lxvd2x  v6,r4,r6
-       lxvd2x  v7,r4,r7
+       lvx     v6,r4,r6
+       lvx     v7,r4,r7
 L(aligned_128loop_bwd):
-       lxvd2x  v8,r4,r8
-       lxvd2x  v9,r4,r9
-       stxvd2x v6,r11,r6
+       lvx     v8,r4,r8
+       lvx     v9,r4,r9
+       stvx    v6,r11,r6
        subi    r4,r4,64
-       stxvd2x v7,r11,r7
-       stxvd2x v8,r11,r8
-       stxvd2x v9,r11,r9
-       lxvd2x  v6,r4,r6
-       lxvd2x  v7,r4,7
+       stvx    v7,r11,r7
+       stvx    v8,r11,r8
+       stvx    v9,r11,r9
+       lvx     v6,r4,r6
+       lvx     v7,r4,7
        subi    r11,r11,64
-       lxvd2x  v8,r4,r8
-       lxvd2x  v9,r4,r9
+       lvx     v8,r4,r8
+       lvx     v9,r4,r9
        subi    r4,r4,64
-       stxvd2x v6,r11,r6
-       stxvd2x v7,r11,r7
-       stxvd2x v8,r11,r8
-       stxvd2x v9,r11,r9
+       stvx    v6,r11,r6
+       stvx    v7,r11,r7
+       stvx    v8,r11,r8
+       stvx    v9,r11,r9
        subi    r11,r11,64
        bdnz    L(aligned_128head_bwd)
 
 L(aligned_tail_bwd):
        mtocrf  0x01,r5
        bf      25,32f
-       lxvd2x  v6,r4,r6
-       lxvd2x  v7,r4,r7
-       lxvd2x  v8,r4,r8
-       lxvd2x  v9,r4,r9
+       lvx     v6,r4,r6
+       lvx     v7,r4,r7
+       lvx     v8,r4,r8
+       lvx     v9,r4,r9
        subi    r4,r4,64
-       stxvd2x v6,r11,r6
-       stxvd2x v7,r11,r7
-       stxvd2x v8,r11,r8
-       stxvd2x v9,r11,r9
+       stvx    v6,r11,r6
+       stvx    v7,r11,r7
+       stvx    v8,r11,r8
+       stvx    v9,r11,r9
        subi    r11,r11,64
 32:
        bf      26,16f
-       lxvd2x  v6,r4,r6
-       lxvd2x  v7,r4,r7
+       lvx     v6,r4,r6
+       lvx     v7,r4,r7
        subi    r4,r4,32
-       stxvd2x v6,r11,r6
-       stxvd2x v7,r11,r7
+       stvx    v6,r11,r6
+       stvx    v7,r11,r7
        subi    r11,r11,32
 16:
        bf      27,8f
-       lxvd2x  v6,r4,r6
+       lvx     v6,r4,r6
        subi    r4,r4,16
-       stxvd2x v6,r11,r6
+       stvx    v6,r11,r6
        subi    r11,r11,16
 8:
        bf      28,4f