Update x86-64 mpn routines from GMP 5.0.1.

author Ulrich Drepper <drepper@redhat.com>

Fri, 3 Sep 2010 06:36:25 +0000 (23:36 -0700)

committer Ulrich Drepper <drepper@redhat.com>

Fri, 3 Sep 2010 06:36:25 +0000 (23:36 -0700)
author Ulrich Drepper <drepper@redhat.com>
Fri, 3 Sep 2010 06:36:25 +0000 (23:36 -0700)
committer Ulrich Drepper <drepper@redhat.com>
Fri, 3 Sep 2010 06:36:25 +0000 (23:36 -0700)
diff --git a/ChangeLog b/ChangeLog

index fd4b7751ca90a0732443bcba46321203f4262d76..fe512db42bf776d2ee835d292008221346a4ea55 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,22 @@
+2010-09-02  Ulrich Drepper  <drepper@redhat.com>
+
+       * sysdeps/x86_64/add_n.S: Update from GMP 5.0.1.
+       * sysdeps/x86_64/addmul_1.S: Likewise.
+       * sysdeps/x86_64/lshift.S: Likewise.
+       * sysdeps/x86_64/mul_1.S: Likewise.
+       * sysdeps/x86_64/rshift.S: Likewise.
+       * sysdeps/x86_64/sub_n.S: Likewise.
+       * sysdeps/x86_64/submul_1.S: Likewise.
+
  2010-09-01  Samuel Thibault  <samuel.thibault@ens-lyon.org>
  
-        This aligns bits/sched.h onto sysdeps/unix/sysv/linux/bits/sched.h:
-        Define __sched_param instead of SCHED_* and sched_param when
+       This aligns bits/sched.h onto sysdeps/unix/sysv/linux/bits/sched.h:
+       Define __sched_param instead of SCHED_* and sched_param when
         <bits/sched.h> is included with __need_schedparam defined.
-        * bits/sched.h [__need_schedparam]
+       * bits/sched.h [__need_schedparam]
         (SCHED_OTHER, SCHED_FIFO, SCHED_RR, sched_param): Do not define.
-        [!__defined_schedparam && (__need_schedparam || _SCHED_H)]
-        (__defined_schedparam): Define to 1.
+       [!__defined_schedparam && (__need_schedparam || _SCHED_H)]
+       (__defined_schedparam): Define to 1.
         (__sched_param): New structure, identical to sched_param.
         (__need_schedparam): Undefine.
  
diff --git a/sysdeps/x86_64/add_n.S b/sysdeps/x86_64/add_n.S

index 7883f6c8406b9e81260c55a7a59135a406886499..f0b4c3f78c6530fa5a720a40fe5de20313133470 100644 (file)
--- a/sysdeps/x86_64/add_n.S
+++ b/sysdeps/x86_64/add_n.S
@@ -1,6 +1,6 @@
-/* Add two limb vectors of the same length > 0 and store sum in a third
-   limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,22 +21,81 @@
  #include "sysdep.h"
  #include "asm-syntax.h"
  
+#define rp     %rdi
+#define up     %rsi
+#define vp     %rdx
+#define n      %rcx
+#define cy     %r8
+
+#ifndef func
+# define func __mpn_add_n
+# define ADCSBB adc
+#endif
+
         .text
-ENTRY (__mpn_add_n)
-       leaq    (%rsi,%rcx,8), %rsi
-       leaq    (%rdi,%rcx,8), %rdi
-       leaq    (%rdx,%rcx,8), %rdx
-       negq    %rcx
-       xorl    %eax, %eax                      # clear cy
-       .p2align 2
-L(loop):
-       movq    (%rsi,%rcx,8), %rax
-       movq    (%rdx,%rcx,8), %r10
-       adcq    %r10, %rax
-       movq    %rax, (%rdi,%rcx,8)
-       incq    %rcx
-       jne     L(loop)
-       movq    %rcx, %rax                      # zero %rax
-       adcq    %rax, %rax
+ENTRY (func)
+       xor     %r8, %r8        /* r8 = 0; its low bit is shifted into CF below */
+       mov     (up), %r10      /* preload first limb of up */
+       mov     (vp), %r11      /* preload first limb of vp */
+
+       lea     -8(up,n,8), up  /* up -> last limb (index n-1) */
+       lea     -8(vp,n,8), vp  /* vp -> last limb (index n-1) */
+       lea     -16(rp,n,8), rp /* rp -> limb n-2; L(end) stores at 8(rp) */
+       mov     %ecx, %eax      /* eax = low 32 bits of n */
+       neg     n               /* n = -n, counts up toward 0 */
+       and     $3, %eax        /* eax = n mod 4, selects the loop entry */
+       je      L(b00)
+       add     %rax, n         /* clear low rcx bits for jrcxz */
+       cmp     $2, %eax
+       jl      L(b01)          /* n mod 4 == 1 */
+       je      L(b10)          /* n mod 4 == 2 */
+
+L(b11):        shr     %r8             /* set cy from r8 (0 here, so CF is cleared) */
+       jmp     L(e11)
+
+L(b00):        shr     %r8             /* set cy from r8 (0 here, so CF is cleared) */
+       mov     %r10, %r8       /* e00 consumes the limb pair in r8/r9 */
+       mov     %r11, %r9
+       lea     4(n), n         /* lea leaves CF untouched */
+       jmp     L(e00)
+
+L(b01):        shr     %r8             /* set cy from r8 (0 here, so CF is cleared) */
+       jmp     L(e01)
+
+L(b10):        shr     %r8             /* set cy from r8 (0 here, so CF is cleared) */
+       mov     %r10, %r8       /* e10 consumes the limb pair in r8/r9 */
+       mov     %r11, %r9
+       jmp     L(e10)
+
+L(end):        ADCSBB  %r11, %r10      /* last limb, using the pending CF */
+       mov     %r10, 8(rp)
+       mov     %ecx, %eax      /* clear eax, ecx contains 0 */
+       adc     %eax, %eax      /* eax = final CF (carry or borrow out) */
         ret
-END (__mpn_add_n)
+
+       .p2align 4              /* 4-way unrolled loop; entered at e00/e11/e10/e01 */
+L(top):
+       mov     -24(up,n,8), %r8
+       mov     -24(vp,n,8), %r9
+       ADCSBB  %r11, %r10
+       mov     %r10, -24(rp,n,8)
+L(e00):
+       mov     -16(up,n,8), %r10
+       mov     -16(vp,n,8), %r11
+       ADCSBB  %r9, %r8
+       mov     %r8, -16(rp,n,8)
+L(e11):
+       mov     -8(up,n,8), %r8
+       mov     -8(vp,n,8), %r9
+       ADCSBB  %r11, %r10
+       mov     %r10, -8(rp,n,8)
+L(e10):
+       mov     (up,n,8), %r10
+       mov     (vp,n,8), %r11
+       ADCSBB  %r9, %r8
+       mov     %r8, (rp,n,8)
+L(e01):
+       jrcxz   L(end)          /* exit once n (rcx) is 0; jrcxz does not alter CF */
+       lea     4(n), n         /* advance four limbs; lea leaves CF untouched */
+       jmp     L(top)
+END (func)
diff --git a/sysdeps/x86_64/addmul_1.S b/sysdeps/x86_64/addmul_1.S

index bdb5226a333132fee1e6746accfd1782ffef41e0..e99789670376cb9c046a99b8679cd810a02110fb 100644 (file)
--- a/sysdeps/x86_64/addmul_1.S
+++ b/sysdeps/x86_64/addmul_1.S
@@ -1,6 +1,6 @@
-/* AMD64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
     the result to a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,26 +21,95 @@
  #include "sysdep.h"
  #include "asm-syntax.h"
  
+#define rp     %rdi
+#define up     %rsi
+#define n      %rdx
+#define v0     %rcx
+
+#ifndef func
+# define func __mpn_addmul_1
+# define ADDSUB add
+#endif
+
         .text
-ENTRY (__mpn_addmul_1)
-       movq    %rdx, %r11
-       leaq    (%rsi,%rdx,8), %rsi
-       leaq    (%rdi,%rdx,8), %rdi
-       negq    %r11
-       xorl    %r8d, %r8d
-       xorl    %r10d, %r10d
-       .p2align 2
-L(loop):
-       movq    (%rsi,%r11,8), %rax
-       mulq    %rcx
-       addq    (%rdi,%r11,8), %rax
-       adcq    %r10, %rdx
-       addq    %r8, %rax
-       movq    %r10, %r8
-       movq    %rax, (%rdi,%r11,8)
-       adcq    %rdx, %r8
-       incq    %r11
-       jne     L(loop)
-       movq    %r8, %rax
+ENTRY (func)
+       push    %rbx            /* save rbx; NOTE(review): no cfi_adjust_cfa_offset as in mul_1.S -- confirm */
+       push    %rbp            /* save rbp (used below as a scratch limb) */
+       lea     (%rdx), %rbx    /* rbx = n (lea used as a plain register copy) */
+       neg     %rbx            /* rbx = -n, counts up toward 0 */
+
+       mov     (up), %rax      /* rax = up[0] */
+       mov     (rp), %r10      /* r10 = rp[0] */
+
+       lea     -16(rp,%rdx,8), rp      /* rp -> limb n-2 */
+       lea     (up,%rdx,8), up /* up -> one past the last limb */
+       mul     %rcx            /* rdx:rax = up[0] * v0 */
+
+       bt      $0, %ebx        /* CF = low bit of -n */
+       jc      L(odd)          /* taken when n is odd */
+
+       lea     (%rax), %r11
+       mov     8(up,%rbx,8), %rax
+       lea     (%rdx), %rbp
+       mul     %rcx
+       add     $2, %rbx
+       jns     L(n2)           /* n == 2: straight to the two-limb tail */
+
+       lea     (%rax), %r8
+       mov     (up,%rbx,8), %rax
+       lea     (%rdx), %r9
+       jmp     L(mid)
+
+L(odd):        add     $1, %rbx
+       jns     L(n1)           /* n == 1: straight to the one-limb tail */
+
+       lea     (%rax), %r8
+       mov     (up,%rbx,8), %rax
+       lea     (%rdx), %r9
+       mul     %rcx
+       lea     (%rax), %r11
+       mov     8(up,%rbx,8), %rax
+       lea     (%rdx), %rbp
+       jmp     L(e)
+
+       .p2align 4              /* main loop: two multiplies (two limbs) per pass */
+L(top):        mul     %rcx
+       ADDSUB  %r8, %r10
+       lea     (%rax), %r8
+       mov     (up,%rbx,8), %rax
+       adc     %r9, %r11
+       mov     %r10, -8(rp,%rbx,8)
+       mov     (rp,%rbx,8), %r10
+       lea     (%rdx), %r9
+       adc     $0, %rbp
+L(mid):        mul     %rcx
+       ADDSUB  %r11, %r10
+       lea     (%rax), %r11
+       mov     8(up,%rbx,8), %rax
+       adc     %rbp, %r8
+       mov     %r10, (rp,%rbx,8)
+       mov     8(rp,%rbx,8), %r10
+       lea     (%rdx), %rbp
+       adc     $0, %r9
+L(e):  add     $2, %rbx
+       js      L(top)          /* loop while the index is still negative */
+
+       mul     %rcx
+       ADDSUB  %r8, %r10
+       adc     %r9, %r11
+       mov     %r10, -8(rp)
+       adc     $0, %rbp
+L(n2): mov     (rp), %r10
+       ADDSUB  %r11, %r10
+       adc     %rbp, %rax
+       mov     %r10, (rp)
+       adc     $0, %rdx
+L(n1): mov     8(rp), %r10
+       ADDSUB  %rax, %r10
+       mov     %r10, 8(rp)
+       mov     %ebx, %eax      /* zero rax */
+       adc     %rdx, %rax      /* rax = high product limb plus CF */
+       pop     %rbp
+       pop     %rbx
         ret
-END (__mpn_addmul_1)
+END (func)
diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S

index 5ac66f0a365fdbf86f359fdedb13eb0917a03be7..f89d3e09b3a4e0b82ff25e07f94c65d252a34a58 100644 (file)
--- a/sysdeps/x86_64/lshift.S
+++ b/sysdeps/x86_64/lshift.S
@@ -1,5 +1,5 @@
-/* AMD64 __mpn_lshift --
-   Copyright 2004, 2006 Free Software Foundation, Inc.
+/* x86-64 __mpn_lshift --
+   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -20,41 +20,98 @@
  #include "sysdep.h"
  #include "asm-syntax.h"
  
+#define rp     %rdi
+#define up     %rsi
+#define n      %rdx
+#define cnt    %cl
  
         .text
  ENTRY (__mpn_lshift)
-       movq    -8(%rsi,%rdx,8), %mm7
-       movd    %ecx, %mm1
-       movl    $64, %eax
-       subl    %ecx, %eax
-       movd    %eax, %mm0
-       movq    %mm7, %mm3
-       psrlq   %mm0, %mm7
-       movd    %mm7, %rax
-       subq    $2, %rdx
-       jl      L(endo)
-       .p2align 2
-L(loop):
-       movq    (%rsi,%rdx,8), %mm6
-       movq    %mm6, %mm2
-       psrlq   %mm0, %mm6
-       psllq   %mm1, %mm3
-       por     %mm6, %mm3
-       movq    %mm3, 8(%rdi,%rdx,8)
-       je      L(ende)
-       movq    -8(%rsi,%rdx,8), %mm7
-       movq    %mm7, %mm3
-       psrlq   %mm0, %mm7
-       psllq   %mm1, %mm2
-       por     %mm7, %mm2
-       movq    %mm2, (%rdi,%rdx,8)
-       subq    $2, %rdx
-       jge     L(loop)
-L(endo):
-       movq    %mm3, %mm2
-L(ende):
-       psllq   %mm1, %mm2
-       movq    %mm2, (%rdi)
-       emms
+       lea     -8(rp,n,8), rp  /* rp -> highest result limb; work proceeds downward */
+       lea     -8(up,n,8), up  /* up -> highest source limb */
+
+       mov     %edx, %eax
+       and     $3, %eax        /* eax = n mod 4, selects the loop entry */
+       jne     L(nb00)
+L(b00):        /* n = 4, 8, 12, ... */
+       mov     (up), %r10
+       mov     -8(up), %r11
+       xor     %eax, %eax
+       shld    %cl, %r10, %rax /* rax = bits shifted out of the top limb; left in rax at ret */
+       mov     -16(up), %r8
+       lea     24(rp), rp      /* bias rp so -24(rp) at L(00) is the top limb */
+       sub     $4, n
+       jmp     L(00)
+
+L(nb00):/* n = 1, 5, 9, ... */
+       cmp     $2, %eax
+       jae     L(nb01)
+L(b01):        mov     (up), %r9
+       xor     %eax, %eax
+       shld    %cl, %r9, %rax  /* rax = bits shifted out of the top limb; left in rax at ret */
+       sub     $2, n
+       jb      L(le1)          /* borrow means n == 1 */
+       mov     -8(up), %r10
+       mov     -16(up), %r11
+       lea     -8(up), up
+       lea     16(rp), rp      /* bias rp so -16(rp) at L(01) is the top limb */
+       jmp     L(01)
+L(le1):        shl     %cl, %r9        /* single limb: zeros enter at the bottom */
+       mov     %r9, (rp)
+       ret
+
+L(nb01):/* n = 2, 6, 10, ... */
+       jne     L(b11)          /* flags still from cmp $2, %eax above */
+L(b10):        mov     (up), %r8
+       mov     -8(up), %r9
+       xor     %eax, %eax
+       shld    %cl, %r8, %rax  /* rax = bits shifted out of the top limb; left in rax at ret */
+       sub     $3, n
+       jb      L(le2)          /* borrow means n == 2 */
+       mov     -16(up), %r10
+       lea     -16(up), up
+       lea     8(rp), rp       /* bias rp so -8(rp) at L(10) is the top limb */
+       jmp     L(10)
+L(le2):        shld    %cl, %r9, %r8
+       mov     %r8, (rp)
+       shl     %cl, %r9
+       mov     %r9, -8(rp)
+       ret
+
+       .p2align 4              /* performance critical! */
+L(b11):        /* n = 3, 7, 11, ... */
+       mov     (up), %r11
+       mov     -8(up), %r8
+       xor     %eax, %eax
+       shld    %cl, %r11, %rax /* rax = bits shifted out of the top limb; left in rax at ret */
+       mov     -16(up), %r9
+       lea     -24(up), up
+       sub     $4, n
+       jb      L(end)          /* borrow means n == 3: only the tail remains */
+
+       .p2align 4              /* 4-way unrolled loop; entered at top/10/01/00 */
+L(top):        shld    %cl, %r8, %r11
+       mov     (up), %r10
+       mov     %r11, (rp)
+L(10): shld    %cl, %r9, %r8
+       mov     -8(up), %r11
+       mov     %r8, -8(rp)
+L(01): shld    %cl, %r10, %r9
+       mov     -16(up), %r8
+       mov     %r9, -16(rp)
+L(00): shld    %cl, %r11, %r10
+       mov     -24(up), %r9
+       mov     %r10, -24(rp)
+       add     $-32, up
+       lea     -32(rp), rp
+       sub     $4, n
+       jnc     L(top)          /* loop until the count borrows */
+
+L(end):        shld    %cl, %r8, %r11  /* tail: last three limbs held in r11, r8, r9 */
+       mov     %r11, (rp)
+       shld    %cl, %r9, %r8
+       mov     %r8, -8(rp)
+       shl     %cl, %r9        /* lowest limb: zeros enter at the bottom */
+       mov     %r9, -16(rp)
         ret
  END (__mpn_lshift)
diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S

index 978916b72c5d57cbff5bf97d03475259f6d029b1..676afd175523882c9e46cc767faa8134429eb01e 100644 (file)
--- a/sysdeps/x86_64/mul_1.S
+++ b/sysdeps/x86_64/mul_1.S
@@ -1,6 +1,6 @@
  /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
     the result in a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,22 +21,109 @@
  #include <sysdep.h>
  #include "asm-syntax.h"
  
+#define rp     %rdi
+#define up     %rsi
+#define n_param        %rdx
+#define vl     %rcx
+
+#define n      %r11
+
         .text
  ENTRY (__mpn_mul_1)
-       movq    %rdx, %r11
-       leaq    (%rsi,%rdx,8), %rsi
-       leaq    (%rdi,%rdx,8), %rdi
-       negq    %r11
-       xorl    %r8d, %r8d
-L(loop):
-       movq    (%rsi,%r11,8), %rax
-       mulq    %rcx
-       addq    %r8, %rax
-       movl    $0, %r8d
-       adcq    %rdx, %r8
-       movq    %rax, (%rdi,%r11,8)
-       incq    %r11
-       jne     L(loop)
-       movq    %r8, %rax
+       push    %rbx
+       cfi_adjust_cfa_offset (8)
+       cfi_rel_offset (%rbx, 0)
+       xor     %r10, %r10
+       mov     (up), %rax              /* read first u limb early */
+       mov     n_param, %rbx           /* move away n from rdx, mul uses it */
+       mul     vl                      /* rdx:rax = up[0] * vl */
+       mov     %rbx, %r11              /* n (r11) = limb count */
+
+       add     %r10, %rax              /* r10 is 0 here; presumably kept from a carry-in variant */
+       adc     $0, %rdx
+
+       and     $3, %ebx                /* ebx = n mod 4, selects the loop entry */
+       jz      L(b0)
+       cmp     $2, %ebx
+       jz      L(b2)
+       jg      L(b3)
+
+L(b1): dec     n
+       jne     L(gt1)
+       mov     %rax, (rp)              /* n == 1: store low limb, high limb returned */
+       jmp     L(ret)
+L(gt1):        lea     8(up,n,8), up
+       lea     -8(rp,n,8), rp
+       neg     n
+       xor     %r10, %r10
+       xor     %ebx, %ebx
+       mov     %rax, %r9
+       mov     (up,n,8), %rax
+       mov     %rdx, %r8
+       jmp     L(L1)
+
+L(b0): lea     (up,n,8), up
+       lea     -16(rp,n,8), rp
+       neg     n
+       xor     %r10, %r10
+       mov     %rax, %r8
+       mov     %rdx, %rbx
+       jmp     L(L0)
+
+L(b3): lea     -8(up,n,8), up
+       lea     -24(rp,n,8), rp
+       neg     n
+       mov     %rax, %rbx
+       mov     %rdx, %r10
+       jmp     L(L3)
+
+L(b2): lea     -16(up,n,8), up
+       lea     -32(rp,n,8), rp
+       neg     n
+       xor     %r8, %r8
+       xor     %ebx, %ebx
+       mov     %rax, %r10
+       mov     24(up,n,8), %rax
+       mov     %rdx, %r9
+       jmp     L(L2)
+
+       .p2align 4                      /* 4-way unrolled loop; entered at L1/L0/L3/L2 */
+L(top): mov    %r10, (rp,n,8)
+       add     %rax, %r9
+       mov     (up,n,8), %rax
+       adc     %rdx, %r8
+       mov     $0, %r10d
+L(L1): mul     vl
+       mov     %r9, 8(rp,n,8)
+       add     %rax, %r8
+       adc     %rdx, %rbx
+L(L0): mov     8(up,n,8), %rax
+       mul     vl
+       mov     %r8, 16(rp,n,8)
+       add     %rax, %rbx
+       adc     %rdx, %r10
+L(L3): mov     16(up,n,8), %rax
+       mul     vl
+       mov     %rbx, 24(rp,n,8)
+       mov     $0, %r8d                # zero
+       mov     %r8, %rbx               # zero
+       add     %rax, %r10
+       mov     24(up,n,8), %rax
+       mov     %r8, %r9                # zero
+       adc     %rdx, %r9
+L(L2): mul     vl
+       add     $4, n
+       js      L(top)                  /* loop while the index is still negative */
+
+       mov     %r10, (rp,n,8)
+       add     %rax, %r9
+       adc     %r8, %rdx               /* r8 is 0 here: folds CF into the high limb */
+       mov     %r9, 8(rp,n,8)
+       add     %r8, %rdx               /* r8 is 0 here: leaves rdx unchanged */
+L(ret):        mov     %rdx, %rax              /* return the most significant limb */
+
+       pop     %rbx
+       cfi_adjust_cfa_offset (-8)
+       cfi_restore (%rbx)
         ret
  END (__mpn_mul_1)
diff --git a/sysdeps/x86_64/rshift.S b/sysdeps/x86_64/rshift.S

index ee0c8aa15ca1a16e1847eb3342c6e6f52d0f81a9..8ff055169aed59e0a7b1726c7151bbfe2a216bc7 100644 (file)
--- a/sysdeps/x86_64/rshift.S
+++ b/sysdeps/x86_64/rshift.S
@@ -1,5 +1,5 @@
-/* AMD64 __mpn_rshift --
-   Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+/* x86-64 __mpn_rshift --
+   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -20,43 +20,96 @@
  #include "sysdep.h"
  #include "asm-syntax.h"
  
+#define rp     %rdi
+#define up     %rsi
+#define n      %rdx
+#define cnt    %cl
+
         .text
  ENTRY (__mpn_rshift)
-       movq    (%rsi), %mm7
-       movd    %ecx, %mm1
-       movl    $64, %eax
-       subl    %ecx, %eax
-       movd    %eax, %mm0
-       movq    %mm7, %mm3
-       psllq   %mm0, %mm7
-       movd    %mm7, %rax
-       leaq    (%rsi,%rdx,8), %rsi
-       leaq    (%rdi,%rdx,8), %rdi
-       negq    %rdx
-       addq    $2, %rdx
-       jg      L(endo)
-       .p2align 2
-L(loop):
-       movq    -8(%rsi,%rdx,8), %mm6
-       movq    %mm6, %mm2
-       psllq   %mm0, %mm6
-       psrlq   %mm1, %mm3
-       por     %mm6, %mm3
-       movq    %mm3, -16(%rdi,%rdx,8)
-       je      L(ende)
-       movq    (%rsi,%rdx,8), %mm7
-       movq    %mm7, %mm3
-       psllq   %mm0, %mm7
-       psrlq   %mm1, %mm2
-       por     %mm7, %mm2
-       movq    %mm2, -8(%rdi,%rdx,8)
-       addq    $2, %rdx
-       jle     L(loop)
-L(endo):
-       movq    %mm3, %mm2
-L(ende):
-       psrlq   %mm1, %mm2
-       movq    %mm2, -8(%rdi)
-       emms
+       mov     %edx, %eax
+       and     $3, %eax        /* eax = n mod 4, selects the loop entry */
+       jne     L(nb00)
+L(b00):        /* n = 4, 8, 12, ... */
+       mov     (up), %r10
+       mov     8(up), %r11
+       xor     %eax, %eax
+       shrd    %cl, %r10, %rax /* rax = bits shifted out of the lowest limb; left in rax at ret */
+       mov     16(up), %r8
+       lea     8(up), up
+       lea     -24(rp), rp     /* bias rp so 24(rp) at L(00) is the lowest limb */
+       sub     $4, n
+       jmp     L(00)
+
+L(nb00):/* n = 1, 5, 9, ... */
+       cmp     $2, %eax
+       jae     L(nb01)
+L(b01):        mov     (up), %r9
+       xor     %eax, %eax
+       shrd    %cl, %r9, %rax  /* rax = bits shifted out of the lowest limb; left in rax at ret */
+       sub     $2, n
+       jb      L(le1)          /* borrow means n == 1 */
+       mov     8(up), %r10
+       mov     16(up), %r11
+       lea     16(up), up
+       lea     -16(rp), rp     /* bias rp so 16(rp) at L(01) is the lowest limb */
+       jmp     L(01)
+L(le1): shr    %cl, %r9        /* single limb: zeros enter at the top */
+       mov     %r9, (rp)
+       ret
+
+L(nb01):/* n = 2, 6, 10, ... */
+       jne     L(b11)          /* flags still from cmp $2, %eax above */
+L(b10):        mov     (up), %r8
+       mov     8(up), %r9
+       xor     %eax, %eax
+       shrd    %cl, %r8, %rax  /* rax = bits shifted out of the lowest limb; left in rax at ret */
+       sub     $3, n
+       jb      L(le2)          /* borrow means n == 2 */
+       mov     16(up), %r10
+       lea     24(up), up
+       lea     -8(rp), rp      /* bias rp so 8(rp) at L(10) is the lowest limb */
+       jmp     L(10)
+L(le2): shrd   %cl, %r9, %r8
+       mov     %r8, (rp)
+       shr     %cl, %r9
+       mov     %r9, 8(rp)
+       ret
+
+       .p2align 4
+L(b11):        /* n = 3, 7, 11, ... */
+       mov     (up), %r11
+       mov     8(up), %r8
+       xor     %eax, %eax
+       shrd    %cl, %r11, %rax /* rax = bits shifted out of the lowest limb; left in rax at ret */
+       mov     16(up), %r9
+       lea     32(up), up
+       sub     $4, n
+       jb      L(end)          /* borrow means n == 3: only the tail remains */
+
+       .p2align 4              /* 4-way unrolled loop; entered at top/10/01/00 */
+L(top):        shrd    %cl, %r8, %r11
+       mov     -8(up), %r10
+       mov     %r11, (rp)
+L(10): shrd    %cl, %r9, %r8
+       mov     (up), %r11
+       mov     %r8, 8(rp)
+L(01): shrd    %cl, %r10, %r9
+       mov     8(up), %r8
+       mov     %r9, 16(rp)
+L(00): shrd    %cl, %r11, %r10
+       mov     16(up), %r9
+       mov     %r10, 24(rp)
+       add     $32, up
+       lea     32(rp), rp
+       sub     $4, n
+       jnc     L(top)          /* loop until the count borrows */
+
+L(end):        shrd    %cl, %r8, %r11  /* tail: last three limbs held in r11, r8, r9 */
+       mov     %r11, (rp)
+       shrd    %cl, %r9, %r8
+       mov     %r8, 8(rp)
+       shr     %cl, %r9        /* highest limb: zeros enter at the top */
+       mov     %r9, 16(rp)
         ret
  END (__mpn_rshift)
diff --git a/sysdeps/x86_64/sub_n.S b/sysdeps/x86_64/sub_n.S

index 48e1a2e0f4512a6fb1ca7951bd2e59da0bb9c5b9..60c15fc3e19a6e3dd05b01050fc38ee18286bc52 100644 (file)
--- a/sysdeps/x86_64/sub_n.S
+++ b/sysdeps/x86_64/sub_n.S
@@ -1,6 +1,6 @@
-/* AMD64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
+/* x86-64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
-   sum in a third limb vector.
+   store the difference in a third limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -18,25 +18,7 @@
     the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
     MA 02111-1307, USA. */
  
-#include "sysdep.h"
-#include "asm-syntax.h"
+#define func __mpn_sub_n
+#define ADCSBB sbb
  
-       .text
-ENTRY (__mpn_sub_n)
-       leaq    (%rsi,%rcx,8), %rsi
-       leaq    (%rdi,%rcx,8), %rdi
-       leaq    (%rdx,%rcx,8), %rdx
-       negq    %rcx
-       xorl    %eax, %eax              # clear cy
-       .p2align 2
-L(loop):
-       movq    (%rsi,%rcx,8), %rax
-       movq    (%rdx,%rcx,8), %r10
-       sbbq    %r10, %rax
-       movq    %rax, (%rdi,%rcx,8)
-       incq    %rcx
-       jne     L(loop)
-       movq    %rcx, %rax              # zero %rax
-       adcq    %rax, %rax
-       ret
-END (__mpn_sub_n)
+#include "add_n.S"
diff --git a/sysdeps/x86_64/submul_1.S b/sysdeps/x86_64/submul_1.S

index e94c9a7bee6b354ed526fab03d3627c008b09e6b..150a92762fb052e9de8f48b6064a32805e4026a9 100644 (file)
--- a/sysdeps/x86_64/submul_1.S
+++ b/sysdeps/x86_64/submul_1.S
@@ -1,6 +1,6 @@
-/* AMD64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
     the result from a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc.
     This file is part of the GNU MP Library.
  
     The GNU MP Library is free software; you can redistribute it and/or modify
@@ -18,29 +18,7 @@
     the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
     MA 02111-1307, USA. */
  
-#include "sysdep.h"
-#include "asm-syntax.h"
+#define func __mpn_submul_1
+#define ADDSUB sub
  
-       .text
-ENTRY (__mpn_submul_1)
-       movq    %rdx, %r11
-       leaq    (%rsi,%r11,8), %rsi
-       leaq    (%rdi,%r11,8), %rdi
-       negq    %r11
-       xorl    %r8d, %r8d
-       .p2align 3
-L(loop):
-       movq    (%rsi,%r11,8), %rax
-       movq    (%rdi,%r11,8), %r10
-       mulq    %rcx
-       subq    %r8, %r10
-       movl    $0, %r8d
-       adcl    %r8d, %r8d
-       subq    %rax, %r10
-       adcq    %rdx, %r8
-       movq    %r10, (%rdi,%r11,8)
-       incq    %r11
-       jne     L(loop)
-       movq    %r8, %rax
-       ret
-END (__mpn_submul_1)
+#include "addmul_1.S"
author	Ulrich Drepper <drepper@redhat.com>
	Fri, 3 Sep 2010 06:36:25 +0000 (23:36 -0700)
committer	Ulrich Drepper <drepper@redhat.com>
	Fri, 3 Sep 2010 06:36:25 +0000 (23:36 -0700)
ChangeLog		patch \| blob \| blame \| history
sysdeps/x86_64/add_n.S		patch \| blob \| blame \| history
sysdeps/x86_64/addmul_1.S		patch \| blob \| blame \| history
sysdeps/x86_64/lshift.S		patch \| blob \| blame \| history
sysdeps/x86_64/mul_1.S		patch \| blob \| blame \| history
sysdeps/x86_64/rshift.S		patch \| blob \| blame \| history
sysdeps/x86_64/sub_n.S		patch \| blob \| blame \| history
sysdeps/x86_64/submul_1.S		patch \| blob \| blame \| history