From: Thomas Gleixner
Date: Thu, 11 Oct 2007 09:15:32 +0000 (+0200)
Subject: x86_64: prepare shared lib/copy_page.S
X-Git-Tag: firefly_0821_release~26161
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=7d96027379c0f43f7169019c814cadca7487d507;p=firefly-linux-kernel-4.4.55.git

x86_64: prepare shared lib/copy_page.S

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---

diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index ea20b1077ff0..c22087f66d5f 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,5 +9,5 @@ obj-$(CONFIG_SMP) += msr-on-cpu.o
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
        usercopy.o getuser.o putuser.o \
-       thunk_64.o clear_page.o copy_page.o bitstr.o bitops.o
+       thunk_64.o clear_page.o copy_page_64.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
deleted file mode 100644
index 727a5d46d2fc..000000000000
--- a/arch/x86_64/lib/copy_page.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-        ALIGN
-copy_page_c:
-        CFI_STARTPROC
-        movl $4096/8,%ecx
-        rep movsq
-        ret
-        CFI_ENDPROC
-ENDPROC(copy_page_c)
-
-/* Don't use streaming store because it's better when the target
-   ends up in cache. */
-
-/* Could vary the prefetch distance based on SMP/UP */
-
-ENTRY(copy_page)
-        CFI_STARTPROC
-        subq $3*8,%rsp
-        CFI_ADJUST_CFA_OFFSET 3*8
-        movq %rbx,(%rsp)
-        CFI_REL_OFFSET rbx, 0
-        movq %r12,1*8(%rsp)
-        CFI_REL_OFFSET r12, 1*8
-        movq %r13,2*8(%rsp)
-        CFI_REL_OFFSET r13, 2*8
-
-        movl $(4096/64)-5,%ecx
-        .p2align 4
-.Loop64:
-        dec %rcx
-
-        movq (%rsi), %rax
-        movq 8 (%rsi), %rbx
-        movq 16 (%rsi), %rdx
-        movq 24 (%rsi), %r8
-        movq 32 (%rsi), %r9
-        movq 40 (%rsi), %r10
-        movq 48 (%rsi), %r11
-        movq 56 (%rsi), %r12
-
-        prefetcht0 5*64(%rsi)
-
-        movq %rax, (%rdi)
-        movq %rbx, 8 (%rdi)
-        movq %rdx, 16 (%rdi)
-        movq %r8, 24 (%rdi)
-        movq %r9, 32 (%rdi)
-        movq %r10, 40 (%rdi)
-        movq %r11, 48 (%rdi)
-        movq %r12, 56 (%rdi)
-
-        leaq 64 (%rsi), %rsi
-        leaq 64 (%rdi), %rdi
-
-        jnz .Loop64
-
-        movl $5,%ecx
-        .p2align 4
-.Loop2:
-        decl %ecx
-
-        movq (%rsi), %rax
-        movq 8 (%rsi), %rbx
-        movq 16 (%rsi), %rdx
-        movq 24 (%rsi), %r8
-        movq 32 (%rsi), %r9
-        movq 40 (%rsi), %r10
-        movq 48 (%rsi), %r11
-        movq 56 (%rsi), %r12
-
-        movq %rax, (%rdi)
-        movq %rbx, 8 (%rdi)
-        movq %rdx, 16 (%rdi)
-        movq %r8, 24 (%rdi)
-        movq %r9, 32 (%rdi)
-        movq %r10, 40 (%rdi)
-        movq %r11, 48 (%rdi)
-        movq %r12, 56 (%rdi)
-
-        leaq 64(%rdi),%rdi
-        leaq 64(%rsi),%rsi
-
-        jnz .Loop2
-
-        movq (%rsp),%rbx
-        CFI_RESTORE rbx
-        movq 1*8(%rsp),%r12
-        CFI_RESTORE r12
-        movq 2*8(%rsp),%r13
-        CFI_RESTORE r13
-        addq $3*8,%rsp
-        CFI_ADJUST_CFA_OFFSET -3*8
-        ret
-.Lcopy_page_end:
-        CFI_ENDPROC
-ENDPROC(copy_page)
-
-        /* Some CPUs run faster using the string copy instructions.
-           It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-        .section .altinstr_replacement,"ax"
-1:      .byte 0xeb /* jmp */
-        .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
-2:
-        .previous
-        .section .altinstructions,"a"
-        .align 8
-        .quad copy_page
-        .quad 1b
-        .byte X86_FEATURE_REP_GOOD
-        .byte .Lcopy_page_end - copy_page
-        .byte 2b - 1b
-        .previous
diff --git a/arch/x86_64/lib/copy_page_64.S b/arch/x86_64/lib/copy_page_64.S
new file mode 100644
index 000000000000..727a5d46d2fc
--- /dev/null
+++ b/arch/x86_64/lib/copy_page_64.S
@@ -0,0 +1,119 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+        ALIGN
+copy_page_c:
+        CFI_STARTPROC
+        movl $4096/8,%ecx
+        rep movsq
+        ret
+        CFI_ENDPROC
+ENDPROC(copy_page_c)
+
+/* Don't use streaming store because it's better when the target
+   ends up in cache. */
+
+/* Could vary the prefetch distance based on SMP/UP */
+
+ENTRY(copy_page)
+        CFI_STARTPROC
+        subq $3*8,%rsp
+        CFI_ADJUST_CFA_OFFSET 3*8
+        movq %rbx,(%rsp)
+        CFI_REL_OFFSET rbx, 0
+        movq %r12,1*8(%rsp)
+        CFI_REL_OFFSET r12, 1*8
+        movq %r13,2*8(%rsp)
+        CFI_REL_OFFSET r13, 2*8
+
+        movl $(4096/64)-5,%ecx
+        .p2align 4
+.Loop64:
+        dec %rcx
+
+        movq (%rsi), %rax
+        movq 8 (%rsi), %rbx
+        movq 16 (%rsi), %rdx
+        movq 24 (%rsi), %r8
+        movq 32 (%rsi), %r9
+        movq 40 (%rsi), %r10
+        movq 48 (%rsi), %r11
+        movq 56 (%rsi), %r12
+
+        prefetcht0 5*64(%rsi)
+
+        movq %rax, (%rdi)
+        movq %rbx, 8 (%rdi)
+        movq %rdx, 16 (%rdi)
+        movq %r8, 24 (%rdi)
+        movq %r9, 32 (%rdi)
+        movq %r10, 40 (%rdi)
+        movq %r11, 48 (%rdi)
+        movq %r12, 56 (%rdi)
+
+        leaq 64 (%rsi), %rsi
+        leaq 64 (%rdi), %rdi
+
+        jnz .Loop64
+
+        movl $5,%ecx
+        .p2align 4
+.Loop2:
+        decl %ecx
+
+        movq (%rsi), %rax
+        movq 8 (%rsi), %rbx
+        movq 16 (%rsi), %rdx
+        movq 24 (%rsi), %r8
+        movq 32 (%rsi), %r9
+        movq 40 (%rsi), %r10
+        movq 48 (%rsi), %r11
+        movq 56 (%rsi), %r12
+
+        movq %rax, (%rdi)
+        movq %rbx, 8 (%rdi)
+        movq %rdx, 16 (%rdi)
+        movq %r8, 24 (%rdi)
+        movq %r9, 32 (%rdi)
+        movq %r10, 40 (%rdi)
+        movq %r11, 48 (%rdi)
+        movq %r12, 56 (%rdi)
+
+        leaq 64(%rdi),%rdi
+        leaq 64(%rsi),%rsi
+
+        jnz .Loop2
+
+        movq (%rsp),%rbx
+        CFI_RESTORE rbx
+        movq 1*8(%rsp),%r12
+        CFI_RESTORE r12
+        movq 2*8(%rsp),%r13
+        CFI_RESTORE r13
+        addq $3*8,%rsp
+        CFI_ADJUST_CFA_OFFSET -3*8
+        ret
+.Lcopy_page_end:
+        CFI_ENDPROC
+ENDPROC(copy_page)
+
+        /* Some CPUs run faster using the string copy instructions.
+           It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+        .section .altinstr_replacement,"ax"
+1:      .byte 0xeb /* jmp */
+        .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+2:
+        .previous
+        .section .altinstructions,"a"
+        .align 8
+        .quad copy_page
+        .quad 1b
+        .byte X86_FEATURE_REP_GOOD
+        .byte .Lcopy_page_end - copy_page
+        .byte 2b - 1b
+        .previous
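
As a rough guide to what the moved assembly does, the C sketch below mirrors the unrolled .Loop64/.Loop2 bodies: a 4096-byte page is copied as 64-byte blocks, with eight 64-bit loads issued before the eight stores. It deliberately leaves out the prefetcht0 hint, the split into a 59-block prefetching loop plus a 5-block tail, and the altinstructions patching; the function and macro names are invented for this illustration and are not kernel interfaces.

#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE   4096   /* bytes per page, as in the asm */
#define SKETCH_BLOCK_WORDS 8      /* eight quadword load/store pairs per 64-byte block */

/* Hypothetical helper, not the kernel's copy_page(): copy one page in
 * 64-byte blocks, loading eight quadwords before storing them, the same
 * grouping the assembly loop bodies use. */
static void sketch_copy_page(void *dst, const void *src)
{
        uint64_t *d = dst;
        const uint64_t *s = src;
        size_t nblocks = SKETCH_PAGE_SIZE / (SKETCH_BLOCK_WORDS * sizeof(uint64_t));

        while (nblocks--) {
                uint64_t t0 = s[0], t1 = s[1], t2 = s[2], t3 = s[3];
                uint64_t t4 = s[4], t5 = s[5], t6 = s[6], t7 = s[7];

                d[0] = t0; d[1] = t1; d[2] = t2; d[3] = t3;
                d[4] = t4; d[5] = t5; d[6] = t6; d[7] = t7;

                s += SKETCH_BLOCK_WORDS;
                d += SKETCH_BLOCK_WORDS;
        }
}

On CPUs that set X86_FEATURE_REP_GOOD, the altinstructions entry above patches copy_page to jump to copy_page_c instead, which is simply a rep movsq of 4096/8 = 512 quadwords.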