From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 11 Oct 2007 09:15:39 +0000 (+0200)
Subject: x86_64: prepare shared lib/memcpy.S
X-Git-Tag: firefly_0821_release~26156
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=ff8e90da7c3ccf2e74e7ab4a57a0ff6e0de027b0;p=firefly-linux-kernel-4.4.55.git

x86_64: prepare shared lib/memcpy.S

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index d9d715d3ec9c..dce49348cebe 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial.o csum-copy_64.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock_64.o copy_user_nocache.o
+lib-y += memcpy_64.o memmove.o memset.o copy_user.o rwlock_64.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
deleted file mode 100644
index c22981fa2f3a..000000000000
--- a/arch/x86_64/lib/memcpy.S
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright 2002 Andi Kleen */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/cpufeature.h>
-
-/*
- * memcpy - Copy a memory block.
- *
- * Input:	
- * rdi destination
- * rsi source
- * rdx count
- * 
- * Output:
- * rax original destination
- */	
-
-	ALIGN
-memcpy_c:
-	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
-	rep movsq
-	movl %edx,%ecx
-	rep movsb
-	ret
-	CFI_ENDPROC
-ENDPROC(memcpy_c)
-
-ENTRY(__memcpy)
-ENTRY(memcpy)
-	CFI_STARTPROC
-	pushq %rbx
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rbx, 0
-	movq %rdi,%rax
-
-	movl %edx,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
-
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
-
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
-
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
-
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
-
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	jnz  .Lloop_64
-
-.Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-
-.Lende:
-	popq %rbx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rbx
-	ret
-.Lfinal:
-	CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb				/* jmp <disp8> */
-	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
-2:
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad memcpy
-	.quad 1b
-	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
-	.byte 2b - 1b
-	.byte 2b - 1b
-	.previous
diff --git a/arch/x86_64/lib/memcpy_64.S b/arch/x86_64/lib/memcpy_64.S
new file mode 100644
index 000000000000..c22981fa2f3a
--- /dev/null
+++ b/arch/x86_64/lib/memcpy_64.S
@@ -0,0 +1,131 @@
+/* Copyright 2002 Andi Kleen */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:	
+ * rdi destination
+ * rsi source
+ * rdx count
+ * 
+ * Output:
+ * rax original destination
+ */	
+
+	ALIGN
+memcpy_c:
+	CFI_STARTPROC
+	movq %rdi,%rax
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx
+	rep movsq
+	movl %edx,%ecx
+	rep movsb
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	CFI_STARTPROC
+	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	movq %rdi,%rax
+
+	movl %edx,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+
+	movq (%rsi),%r11
+	movq 8(%rsi),%r8
+
+	movq %r11,(%rdi)
+	movq %r8,1*8(%rdi)
+
+	movq 2*8(%rsi),%r9
+	movq 3*8(%rsi),%r10
+
+	movq %r9,2*8(%rdi)
+	movq %r10,3*8(%rdi)
+
+	movq 4*8(%rsi),%r11
+	movq 5*8(%rsi),%r8
+
+	movq %r11,4*8(%rdi)
+	movq %r8,5*8(%rdi)
+
+	movq 6*8(%rsi),%r9
+	movq 7*8(%rsi),%r10
+
+	movq %r9,6*8(%rdi)
+	movq %r10,7*8(%rdi)
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	jnz  .Lloop_64
+
+.Lhandle_tail:
+	movl %edx,%ecx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq (%rsi),%r8
+	movq %r8,(%rdi)
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz  .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	movb (%rsi),%r8b
+	movb %r8b,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+.Lende:
+	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
+	ret
+.Lfinal:
+	CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+
+	/* Some CPUs run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad memcpy
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	/* Replace only beginning, memcpy is used to apply alternatives, so it
+	 * is silly to overwrite itself with nops - reboot is only outcome... */
+	.byte 2b - 1b
+	.byte 2b - 1b
+	.previous