Otherwise perf profiles don't charge time to memcpy
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
;;; if size <= 8
cmp r2, 8
;;; if size <= 8
cmp r2, 8
mov.f lp_count, r2
and.f r4, r0, 0x03
rsub lp_count, r4, 4
mov.f lp_count, r2
and.f r4, r0, 0x03
rsub lp_count, r4, 4
+ lpnz @.Laligndestination
;; LOOP BEGIN
ldb.ab r5, [r1,1]
sub r2, r2, 1
stb.ab r5, [r3,1]
;; LOOP BEGIN
ldb.ab r5, [r1,1]
sub r2, r2, 1
stb.ab r5, [r3,1]
;;; Check the alignment of the source
and.f r4, r1, 0x03
;;; Check the alignment of the source
and.f r4, r1, 0x03
+ bnz.d @.Lsourceunaligned
;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to Dwords, unfold x4
lsr.f lp_count, r2, ZOLSHFT
;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to Dwords, unfold x4
lsr.f lp_count, r2, ZOLSHFT
;; LOOP START
LOADX (r6, r1)
PREFETCH_READ (r1)
;; LOOP START
LOADX (r6, r1)
PREFETCH_READ (r1)
STOREX (r8, r3)
STOREX (r10, r3)
STOREX (r4, r3)
STOREX (r8, r3)
STOREX (r10, r3)
STOREX (r4, r3)
and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes
and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes
-smallchunk:
- lpnz @copyremainingbytes
+.Lsmallchunk:
+ lpnz @.Lcopyremainingbytes
;; LOOP START
ldb.ab r5, [r1,1]
stb.ab r5, [r3,1]
;; LOOP START
ldb.ab r5, [r1,1]
stb.ab r5, [r3,1]
+ beq.d @.LunalignedOffby2
+ bhi.d @.LunalignedOffby3
ldb.ab r5, [r1, 1]
;;; CASE 1: The source is unaligned, off by 1
ldb.ab r5, [r1, 1]
;;; CASE 1: The source is unaligned, off by 1
or r5, r5, r6
;; Both src and dst are aligned
or r5, r5, r6
;; Both src and dst are aligned
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
;; Write back the remaining 16bits
EXTRACT_1 (r6, r5, 16)
;; Write back the remaining 16bits
EXTRACT_1 (r6, r5, 16)
stb.ab r5, [r3, 1]
and.f lp_count, r2, 0x07 ;Last 8bytes
stb.ab r5, [r3, 1]
and.f lp_count, r2, 0x07 ;Last 8bytes
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
;;; CASE 2: The source is unaligned, off by 2
ldh.ab r5, [r1, 2]
sub r2, r2, 1
;;; CASE 2: The source is unaligned, off by 2
ldh.ab r5, [r1, 2]
sub r2, r2, 1
#ifdef __BIG_ENDIAN__
asl.nz r5, r5, 16
#endif
#ifdef __BIG_ENDIAN__
asl.nz r5, r5, 16
#endif
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
#ifdef __BIG_ENDIAN__
lsr.nz r5, r5, 16
#ifdef __BIG_ENDIAN__
lsr.nz r5, r5, 16
sth.ab r5, [r3, 2]
and.f lp_count, r2, 0x07 ;Last 8bytes
sth.ab r5, [r3, 2]
and.f lp_count, r2, 0x07 ;Last 8bytes
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
;;; CASE 3: The source is unaligned, off by 3
;;; Hence, I need to read 1byte to achieve the 32bit alignment
;;; CASE 3: The source is unaligned, off by 3
;;; Hence, I need to read 1byte to achieve the 32bit alignment
#ifdef __BIG_ENDIAN__
asl.ne r5, r5, 24
#endif
#ifdef __BIG_ENDIAN__
asl.ne r5, r5, 24
#endif
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
;; LOOP START
ld.ab r6, [r1, 4]
prefetch [r1, 28] ;Prefetch the next read location
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
#ifdef __BIG_ENDIAN__
lsr.nz r5, r5, 24
#ifdef __BIG_ENDIAN__
lsr.nz r5, r5, 24
stb.ab r5, [r3, 1]
and.f lp_count, r2, 0x07 ;Last 8bytes
stb.ab r5, [r3, 1]
and.f lp_count, r2, 0x07 ;Last 8bytes
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]