x86_64: prepare shared lib/memset.S
author	Thomas Gleixner <tglx@linutronix.de>	Thu, 11 Oct 2007 09:15:55 +0000 (11:15 +0200)
committer	Thomas Gleixner <tglx@linutronix.de>	Thu, 11 Oct 2007 09:15:55 +0000 (11:15 +0200)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86_64/lib/Makefile
arch/x86_64/lib/memset.S [deleted file]
arch/x86_64/lib/memset_64.S [new file with mode: 0644]

diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index 6bdf7d824f4c61d72aed29f6d138ad87f5482cee..09c1ffa974a995025fb0d8873366ae1d7320a5ec 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)     += msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
        usercopy_64.o getuser_64.o putuser_64.o  \
        thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset.o copy_user.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
deleted file mode 100644
index 2c59481..0000000
--- a/arch/x86_64/lib/memset.S
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-/*
- * ISO C memset - set a memory block to a byte value.
- *     
- * rdi   destination
- * rsi   value (char) 
- * rdx   count (bytes) 
- * 
- * rax   original destination
- */    
-       ALIGN
-memset_c:
-       CFI_STARTPROC
-       movq %rdi,%r9
-       movl %edx,%r8d
-       andl $7,%r8d
-       movl %edx,%ecx
-       shrl $3,%ecx
-       /* expand byte value  */
-       movzbl %sil,%esi
-       movabs $0x0101010101010101,%rax
-       mulq %rsi               /* with rax, clobbers rdx */
-       rep stosq
-       movl %r8d,%ecx
-       rep stosb
-       movq %r9,%rax
-       ret
-       CFI_ENDPROC
-ENDPROC(memset_c)
-
-ENTRY(memset)
-ENTRY(__memset)
-       CFI_STARTPROC
-       movq %rdi,%r10
-       movq %rdx,%r11
-
-       /* expand byte value  */
-       movzbl %sil,%ecx
-       movabs $0x0101010101010101,%rax
-       mul    %rcx             /* with rax, clobbers rdx */
-
-       /* align dst */
-       movl  %edi,%r9d
-       andl  $7,%r9d
-       jnz  .Lbad_alignment
-       CFI_REMEMBER_STATE
-.Lafter_bad_alignment:
-
-       movl %r11d,%ecx
-       shrl $6,%ecx
-       jz       .Lhandle_tail
-
-       .p2align 4
-.Lloop_64:
-       decl   %ecx
-       movq  %rax,(%rdi)
-       movq  %rax,8(%rdi)
-       movq  %rax,16(%rdi)
-       movq  %rax,24(%rdi)
-       movq  %rax,32(%rdi)
-       movq  %rax,40(%rdi)
-       movq  %rax,48(%rdi)
-       movq  %rax,56(%rdi)
-       leaq  64(%rdi),%rdi
-       jnz    .Lloop_64
-
-       /* Handle tail in loops. The loops should be faster than hard
-          to predict jump tables. */
-       .p2align 4
-.Lhandle_tail:
-       movl    %r11d,%ecx
-       andl    $63&(~7),%ecx
-       jz              .Lhandle_7
-       shrl    $3,%ecx
-       .p2align 4
-.Lloop_8:
-       decl   %ecx
-       movq  %rax,(%rdi)
-       leaq  8(%rdi),%rdi
-       jnz    .Lloop_8
-
-.Lhandle_7:
-       movl    %r11d,%ecx
-       andl    $7,%ecx
-       jz      .Lende
-       .p2align 4
-.Lloop_1:
-       decl    %ecx
-       movb    %al,(%rdi)
-       leaq    1(%rdi),%rdi
-       jnz     .Lloop_1
-
-.Lende:
-       movq    %r10,%rax
-       ret
-
-       CFI_RESTORE_STATE
-.Lbad_alignment:
-       cmpq $7,%r11
-       jbe     .Lhandle_7
-       movq %rax,(%rdi)        /* unaligned store */
-       movq $8,%r8
-       subq %r9,%r8
-       addq %r8,%rdi
-       subq %r8,%r11
-       jmp .Lafter_bad_alignment
-.Lfinal:
-       CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-       /* Some CPUs run faster using the string instructions.
-          It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                              /* jmp <disp8> */
-       .byte (memset_c - memset) - (2f - 1b)   /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       .align 8
-       .quad memset
-       .quad 1b
-       .byte X86_FEATURE_REP_GOOD
-       .byte .Lfinal - memset
-       .byte 2b - 1b
-       .previous
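
The open-coded __memset above expands the fill byte into a full 64-bit pattern with a single multiply by 0x0101010101010101, streams it out 64 bytes per iteration of .Lloop_64, and then drains the remainder with the 8-byte and 1-byte tail loops. A rough C rendering of that idea (function name and structure are illustrative only, not kernel code) could look like this:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Illustrative C sketch of the open-coded memset strategy above:
 * replicate the byte across a 64-bit word, store it a qword at a
 * time, then finish the sub-qword tail byte by byte.  Destination
 * alignment is ignored here; the assembly fixes it up separately
 * via .Lbad_alignment. */
static void *memset_sketch(void *dst, int c, size_t n)
{
	uint64_t pattern = (uint8_t)c * UINT64_C(0x0101010101010101);
	uint8_t *p = dst;

	while (n >= 8) {		/* the assembly unrolls this 8x */
		memcpy(p, &pattern, 8);	/* one qword store */
		p += 8;
		n -= 8;
	}
	while (n--)			/* remaining 0..7 bytes */
		*p++ = (uint8_t)c;

	return dst;
}

memset_c is the same pattern collapsed onto rep stosq followed by rep stosb; the .altinstructions block at the end of the file patches it in at boot on CPUs that advertise X86_FEATURE_REP_GOOD.
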
diff --git a/arch/x86_64/lib/memset_64.S b/arch/x86_64/lib/memset_64.S
new file mode 100644
index 0000000..2c59481
--- /dev/null
+++ b/arch/x86_64/lib/memset_64.S
@@ -0,0 +1,133 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *     
+ * rdi   destination
+ * rsi   value (char) 
+ * rdx   count (bytes) 
+ * 
+ * rax   original destination
+ */    
+       ALIGN
+memset_c:
+       CFI_STARTPROC
+       movq %rdi,%r9
+       movl %edx,%r8d
+       andl $7,%r8d
+       movl %edx,%ecx
+       shrl $3,%ecx
+       /* expand byte value  */
+       movzbl %sil,%esi
+       movabs $0x0101010101010101,%rax
+       mulq %rsi               /* with rax, clobbers rdx */
+       rep stosq
+       movl %r8d,%ecx
+       rep stosb
+       movq %r9,%rax
+       ret
+       CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+       CFI_STARTPROC
+       movq %rdi,%r10
+       movq %rdx,%r11
+
+       /* expand byte value  */
+       movzbl %sil,%ecx
+       movabs $0x0101010101010101,%rax
+       mul    %rcx             /* with rax, clobbers rdx */
+
+       /* align dst */
+       movl  %edi,%r9d
+       andl  $7,%r9d
+       jnz  .Lbad_alignment
+       CFI_REMEMBER_STATE
+.Lafter_bad_alignment:
+
+       movl %r11d,%ecx
+       shrl $6,%ecx
+       jz       .Lhandle_tail
+
+       .p2align 4
+.Lloop_64:
+       decl   %ecx
+       movq  %rax,(%rdi)
+       movq  %rax,8(%rdi)
+       movq  %rax,16(%rdi)
+       movq  %rax,24(%rdi)
+       movq  %rax,32(%rdi)
+       movq  %rax,40(%rdi)
+       movq  %rax,48(%rdi)
+       movq  %rax,56(%rdi)
+       leaq  64(%rdi),%rdi
+       jnz    .Lloop_64
+
+       /* Handle tail in loops. The loops should be faster than hard
+          to predict jump tables. */
+       .p2align 4
+.Lhandle_tail:
+       movl    %r11d,%ecx
+       andl    $63&(~7),%ecx
+       jz              .Lhandle_7
+       shrl    $3,%ecx
+       .p2align 4
+.Lloop_8:
+       decl   %ecx
+       movq  %rax,(%rdi)
+       leaq  8(%rdi),%rdi
+       jnz    .Lloop_8
+
+.Lhandle_7:
+       movl    %r11d,%ecx
+       andl    $7,%ecx
+       jz      .Lende
+       .p2align 4
+.Lloop_1:
+       decl    %ecx
+       movb    %al,(%rdi)
+       leaq    1(%rdi),%rdi
+       jnz     .Lloop_1
+
+.Lende:
+       movq    %r10,%rax
+       ret
+
+       CFI_RESTORE_STATE
+.Lbad_alignment:
+       cmpq $7,%r11
+       jbe     .Lhandle_7
+       movq %rax,(%rdi)        /* unaligned store */
+       movq $8,%r8
+       subq %r9,%r8
+       addq %r8,%rdi
+       subq %r8,%r11
+       jmp .Lafter_bad_alignment
+.Lfinal:
+       CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
+
+       /* Some CPUs run faster using the string instructions.
+          It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+       .section .altinstr_replacement,"ax"
+1:     .byte 0xeb                              /* jmp <disp8> */
+       .byte (memset_c - memset) - (2f - 1b)   /* offset */
+2:
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad memset
+       .quad 1b
+       .byte X86_FEATURE_REP_GOOD
+       .byte .Lfinal - memset
+       .byte 2b - 1b
+       .previous
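
The three-part sequence at the end of the file (an .altinstr_replacement stub holding a two-byte jmp to memset_c, plus an .altinstructions record) is what lets boot code rewrite memset in place on CPUs where the string instructions win. Each record stores the address of the original code, the address of the replacement, the gating CPU feature bit, and the two lengths. A hedged sketch of how such a record is laid out and consumed follows; the field names and the NOP-padding detail are assumptions for illustration, not code taken from the tree:

#include <stdint.h>
#include <stdbool.h>
#include <string.h>

/* Approximate shape of one .altinstructions record as emitted above:
 * .quad original, .quad replacement, one feature byte, two length bytes. */
struct alt_instr_sketch {
	uint8_t *instr;		/* .quad memset: code to be patched          */
	uint8_t *replacement;	/* .quad 1b: the two-byte "jmp memset_c"     */
	uint8_t  cpuid;		/* .byte X86_FEATURE_REP_GOOD                */
	uint8_t  instrlen;	/* .byte .Lfinal - memset: patchable length  */
	uint8_t  replacementlen;/* .byte 2b - 1b: length of the replacement  */
};

/* Sketch of what boot-time alternative patching does with one record. */
static void apply_one_alternative(struct alt_instr_sketch *a,
				  bool cpu_has_feature)
{
	if (!cpu_has_feature)
		return;				/* keep the open-coded loop */

	memcpy(a->instr, a->replacement, a->replacementlen);
	memset(a->instr + a->replacementlen, 0x90,	/* pad with NOPs */
	       a->instrlen - a->replacementlen);
}

With X86_FEATURE_REP_GOOD set, the first two bytes of memset become a jmp straight into memset_c and the rest of the original body is never executed.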