/* Part of the "copies and fills" library by Simon Hall.
   The inner loop of the misaligned path is derived from the GNU libc ARM port.
   The rest is my own work.
   This code is licensed under the GNU Lesser General Public License version 2.1.

   void *memcpy(void *dest, const void *source, size_t count);
*/
.global memcpy
.func memcpy
memcpy:
    cmp r2, #0
    pld [r1]
    bxeq lr                     /* get straight out on zero, NB: count is unsigned */

    cmp r2, #4                  /* basic copy for four bytes */
    pld [r1, #32]
    ldreq r3, [r1]              /* we're relying on the cpu misalignment support here */
    streq r3, [r0]
    bxeq lr

    cmp r2, #8                  /* basic copy for eight bytes, with fall through for < 8 */
    pld [r1, #64]
    ldreq r3, [r1]              /* can't use ldrd without checking alignment... can't trust os alignment handling */
    streq r3, [r0]              /* if we do trust the os then r2 is free for ldrd */
    ldreq r3, [r1, #4]
    streq r3, [r0, #4]
    bxeq lr

    cmp r2, #32
    pld [r1, #96]
    blt byte_at_a_time_no_pld   /* fast path for small sizes, no stack push */

    push {r0, r4-r11}           /* memcpy returns the original destination, hence push r0 */

/* compute the dest pointer alignment */
.if 0
    and r3, r0, #3              /* slightly slower compared to conditional version below */
    cmp r3, #3                  /* three bytes misaligned, one to do */
    beq head_1
    cmp r3, #2
    beq head_2                  /* two bytes misaligned, two to do */
    cmp r3, #1
    beq head_3                  /* one byte misaligned, three to do */
.else
    ands r3, r0, #3
    beq skip_byte_realignment

    rsb r4, r3, #4              /* how many bytes need to be read */

    cmp r4, #2
    ldrgtb r5, [r1], #1         /* three bytes */
    ldrgeb r6, [r1], #1         /* two+ bytes */
    ldrb r7, [r1], #1           /* one+ byte */

    strgtb r5, [r0], #1
    strgeb r6, [r0], #1
    strb r7, [r0], #1

    sub r2, r4
skip_byte_realignment:
.endif

.if 0
    eor r3, r0, r1              /* check the 4b alignment of the two pointers */
    tst r3, #3                  /* ideally the bottom two bits should line up */
.else
    ands r3, r1, #3
.endif
    bne misaligned

/* dest pointer now 4b aligned */
/* let's try and 32b align the destination */
    tst r0, #31
    beq pre_fast_loop

align_up:
.if 1
    ldr r3, [r1], #4
    add r0, #4
    sub r2, #4
    tst r0, #31                 /* do it early for the next run */
    str r3, [r0, #-4]
    bne align_up
.else
    and r3, r0, #31             /* jump based on the number of bytes to do - slower than the loop above */
    add pc, pc, r3
    nop; nop
    ldr r4, [r1], #4
    ldr r5, [r1], #4
    ldr r6, [r1], #4
    ldr r7, [r1], #4
    ldr r8, [r1], #4
    ldr r9, [r1], #4
    ldr r10, [r1], #4
    add pc, pc, r3
    nop; nop
    str r4, [r0], #4
    str r5, [r0], #4
    str r6, [r0], #4
    str r7, [r0], #4
    str r8, [r0], #4
    str r9, [r0], #4
    str r10, [r0], #4
    rsb r3, #32
    sub r2, r3
.endif

pre_fast_loop:
    bics r3, r2, #31            /* round byte count down to nearest 32 */
    and r2, #31                 /* compute the spare */
    beq post_fast_loop          /* nothing to do in the main loop */

/* work through 32b at a time */
fast_loop:
.if 0
    ldmia r1!, {r4-r11}         /* original version */
    subs r3, #32
    stmia r0!, {r4-r11}
    pld [r1, #128]
    bne fast_loop
.else
    ldmia r1!, {r4-r7}          /* slightly faster version suggested by tufty */
    ldmia r1!, {r8-r11}
    stmia r0!, {r4-r7}
    pld [r1, #128]
    subs r3, #32
    stmia r0!, {r8-r11}
    bne fast_loop
.endif

/* handle the spare bytes, up to 32 of them */
post_fast_loop:
    cmp r2, #0                  /* there might be none */
    beq full_out

    bics r3, r2, #3
    and r2, #3
    beq tail_fast_loop_byte

tail_fast_loop:
    ldr r4, [r1], #4
    subs r3, #4
    str r4, [r0], #4
    bne tail_fast_loop

    cmp r2, #0
    beq full_out

tail_fast_loop_byte:
    subs r2, #1
    ldrb r3, [r1], #1
    strb r3, [r0], #1
    bne tail_fast_loop_byte

full_out:
    pop {r0, r4-r11}
    bx lr

byte_at_a_time_no_pld:
    subs r2, #1
    ldrb r3, [r1, r2]           /* one byte at a time, so we don't have to check for odd */
    strb r3, [r0, r2]           /* sizes and alignments etc; also no stack push necessary */
    bne byte_at_a_time_no_pld

    bx lr                       /* leaving r0 intact */

/*
head_3:
    ldrb r3, [r1], #1
    strb r3, [r0], #1
    sub r2, #1
head_2:
    ldrb r3, [r1], #1
    strb r3, [r0], #1
    sub r2, #1
head_1:
    ldrb r3, [r1], #1
    strb r3, [r0], #1
    sub r2, #1
    b pre_fast_loop
*/

misaligned:
    bic r1, #3                  /* align down r1, with r3 containing the r1 misalignment */

    cmp r3, #2
    ldr r11, [r1], #4
    beq misaligned_2
    bgt misaligned_3

misaligned_1:
    cmp r2, #32
    blt post_misalignment_1

mis_1_loop:
    lsr r3, r11, #8             /* we want the high three bytes of this */

    ldmia r1!, {r4-r11}

    sub r2, #32
    cmp r2, #32

    orr r3, r4, lsl #24
    lsr r4, #8;  orr r4, r5, lsl #24
    lsr r5, #8;  orr r5, r6, lsl #24
    lsr r6, #8;  orr r6, r7, lsl #24
    lsr r7, #8;  orr r7, r8, lsl #24
    lsr r8, #8;  orr r8, r9, lsl #24
    lsr r9, #8;  orr r9, r10, lsl #24
    lsr r10, #8; orr r10, r11, lsl #24

    pld [r1, #128]

    stmia r0!, {r3-r10}

    bge mis_1_loop

post_misalignment_1:
    cmp r2, #0
    beq full_out

    lsr r11, #8
    mov r3, #3                  /* three valid bytes left in r11 */

post_misalignment_1_loop:
    cmp r3, #0
    ldreq r11, [r1], #4
    moveq r3, #4

    strb r11, [r0], #1
    sub r3, #1
    subs r2, #1
    lsr r11, #8
    bne post_misalignment_1_loop

    b full_out

misaligned_2:
    cmp r2, #32
    blt post_misalignment_2

mis_2_loop:
    lsr r3, r11, #16            /* we want the high two bytes of this */

    ldmia r1!, {r4-r11}

    sub r2, #32
    cmp r2, #32

    orr r3, r4, lsl #16
    lsr r4, #16;  orr r4, r5, lsl #16
    lsr r5, #16;  orr r5, r6, lsl #16
    lsr r6, #16;  orr r6, r7, lsl #16
    lsr r7, #16;  orr r7, r8, lsl #16
    lsr r8, #16;  orr r8, r9, lsl #16
    lsr r9, #16;  orr r9, r10, lsl #16
    lsr r10, #16; orr r10, r11, lsl #16

    pld [r1, #128]

    stmia r0!, {r3-r10}

    bge mis_2_loop

post_misalignment_2:
    cmp r2, #0
    beq full_out

    lsr r11, #16
    mov r3, #2                  /* two valid bytes left in r11 */

post_misalignment_2_loop:
    cmp r3, #0
    ldreq r11, [r1], #4
    moveq r3, #4

    strb r11, [r0], #1
    sub r3, #1
    subs r2, #1
    lsr r11, #8
    bne post_misalignment_2_loop

    b full_out

misaligned_3:
    cmp r2, #32
    blt post_misalignment_3

mis_3_loop:
    lsr r3, r11, #24            /* we want the high byte of this */

    ldmia r1!, {r4-r11}

    sub r2, #32
    cmp r2, #32

    orr r3, r4, lsl #8
    lsr r4, #24;  orr r4, r5, lsl #8
    lsr r5, #24;  orr r5, r6, lsl #8
    lsr r6, #24;  orr r6, r7, lsl #8
    lsr r7, #24;  orr r7, r8, lsl #8
    lsr r8, #24;  orr r8, r9, lsl #8
    lsr r9, #24;  orr r9, r10, lsl #8
    lsr r10, #24; orr r10, r11, lsl #8

    pld [r1, #128]

    stmia r0!, {r3-r10}

    bge mis_3_loop

post_misalignment_3:
    cmp r2, #0
    beq full_out

    lsr r11, #24
    mov r3, #1                  /* one valid byte left in r11 */

post_misalignment_3_loop:
    cmp r3, #0
    ldreq r11, [r1], #4
    moveq r3, #4

    strb r11, [r0], #1
    sub r3, #1
    subs r2, #1
    lsr r11, #8
    bne post_misalignment_3_loop

    b full_out
.endfunc

/* Raj: Added this to enable no exec stack */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
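
/*
 * For reference: a rough C sketch of the shift-and-merge technique used
 * by the misaligned loops above, shown for the one-byte-misaligned case
 * and assuming a little-endian target (which the lsr/orr pairing
 * implies). This is an illustration only; the function and variable
 * names are hypothetical and not part of this library.
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     // dst32 is word-aligned; src32 is the source pointer aligned
 *     // down to a word boundary, as done by "bic r1, #3" above.
 *     static void copy_misaligned_by_1(uint32_t *dst32,
 *                                      const uint32_t *src32,
 *                                      size_t words)
 *     {
 *         uint32_t prev = *src32++;                  // ldr r11, [r1], #4
 *         while (words--) {
 *             uint32_t next = *src32++;              // one word of the ldmia
 *             *dst32++ = (prev >> 8) | (next << 24); // lsr #8 then orr with lsl #24
 *             prev = next;
 *         }
 *     }
 *
 * Each store merges the high three bytes of the previous aligned word
 * with the low byte of the next one, so every load and store stays
 * word-aligned; the assembly unrolls this eight words per iteration and
 * feeds the loads with a single ldmia.
 */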