/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef MEMCPY_NEON

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE    32

#endif

#define ALIGN(addr, align) addr:align
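/* ALIGN (reg, 64) expands to "reg:64", the address-alignment qualifier
   used in NEON element load/store addresses (the value is in bits).  It
   is only applied where the pointer is known to be suitably aligned.  */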

#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
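
/* As a rough C illustration of the dispatch idea (a sketch only; it is
   not used or assembled here, and dispatch_7_dword_c is a hypothetical
   name): jump into an unrolled copy so that exactly the requested
   number of trailing units is transferred, falling through the
   remaining cases, much like Duff's device.

     #include <stddef.h>
     #include <stdint.h>

     static void
     dispatch_7_dword_c (uint64_t *dst, const uint64_t *src, size_t bytes)
     {
       size_t units = bytes / 8;    // bytes is 0..56, a multiple of 8
       switch (units)               // the computed jump
         {
         case 7: dst[units - 7] = src[units - 7];  // fall through
         case 6: dst[units - 6] = src[units - 6];  // fall through
         case 5: dst[units - 5] = src[units - 5];  // fall through
         case 4: dst[units - 4] = src[units - 4];  // fall through
         case 3: dst[units - 3] = src[units - 3];  // fall through
         case 2: dst[units - 2] = src[units - 2];  // fall through
         case 1: dst[units - 1] = src[units - 1];  // fall through
         case 0: break;
         }
     }

   The assembly versions below avoid the per-step index arithmetic by
   pre-advancing SRC and DST past the tail so that each step can use a
   fixed negative offset (or a post-increment in the NEON variant).  */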

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled
# endif
	.macro dispatch_7_dword
	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
	add	pc, pc, tmp1
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add	pc, pc, tmp1, lsl #1
	dispatch_step 15
	dispatch_step 14
	dispatch_step 13
	dispatch_step 12
	dispatch_step 11
	dispatch_step 10
	dispatch_step 9
	dispatch_step 8
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled
# endif
	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb	tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
			      + ((1f - PC_OFS - 0f) \
				 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
0:	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
	bx	tmp1
	.p2align ARM_BX_ALIGN_LOG2
1:
	.endm
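
/* For example, with ARM_BX_ALIGN_LOG2 == 3 each padded step in the
   macros below occupies 8 bytes.  If dispatch_7_dword is entered with
   TMP1 == 16, dispatch_helper leaves (56 - 16) plus the code-distance
   correction in TMP1; the ADD then yields the address of the step
   labelled 2 (five 8-byte steps past the 1: label) and the BX lands
   there, so steps 2 and 1 copy exactly 16 bytes.  */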

	.macro dispatch_7_dword
	dispatch_helper 7, 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	dispatch_helper 15, 2
	dispatch_step 15
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 14
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 13
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 12
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 11
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 10
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 9
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 8
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define	D_l	r10
#define	D_h	r11
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5
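/* With 64-byte lines this amounts to 320 bytes of look-ahead in the
   copy loops below.  */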

#ifdef USE_VFP
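/* Copy the 64-byte line at offset \base.  The stores drain data loaded
   by the previous invocation (or by the setup code before the loop),
   rotating through \vreg and d0-d2; the matching loads run 32 bytes
   ahead (SRC is pre-advanced by 32), and \vreg itself is refilled from
   prefetch_lines * 64 - 32 bytes further on, so each line's leading
   double-word is read well in advance and doubles as a software
   prefetch.  cpy_tail_vfp is identical except that it does not refill
   \vreg from the distant line (no read-ahead).  */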
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.p2align 6
ENTRY(memcpy)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	vld1.8	{d0}, [\reg]!
	.endm
	.macro neon_store_d0 reg
	vst1.8	{d0}, [\reg]!
	.endm

	and	tmp1, count, #0x38
	.macro dispatch_step i
	neon_load_d0 src
	neon_store_d0 dst
	.endm
	dispatch_7_dword

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr	tmp1, [src, #-(\i * 4)]
	str	tmp1, [dst, #-(\i * 4)]
	.endm
	dispatch_15_word
#endif

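	/* LSLS puts bit 1 of COUNT into the carry flag and leaves a
	   non-zero result exactly when bit 0 is set, so the CS (halfword)
	   and NE (byte) conditional copies below finish off any remaining
	   one to three bytes.  */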
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)
	cfi_remember_state
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
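	/* The shift-and-flags trick below decodes 8 - (DST & 7), the number
	   of bytes needed to reach 64-bit alignment: the MI copy handles a
	   4-byte chunk, then CS and NE handle 2 and 1 bytes.  For example,
	   with DST & 7 == 3, five bytes (4 + 1) are copied here.  */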
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	.Ltail63aligned

	cmp	tmp2, #512
	bhs	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
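	/* Copy 64 bytes per iteration, ping-ponging between d0 and d1 so
	   that each store drains a value loaded a couple of instructions
	   earlier and the FP load and store pipes stay busy.  */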
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	.macro dispatch_step i
	vldr	d0, [src, #-(\i * 8)]
	vstr	d0, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr

	cfi_restore_state
	cfi_remember_state
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	.macro dispatch_step i
	ldrd	A_l, A_h, [src, #-(\i * 8)]
	strd	A_l, A_h, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#endif

	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr

	cfi_restore_state
	cfi_remember_state

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
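	/* d3-d7 are primed with the leading double-word of each of the
	   next five 64-byte lines (offsets 0, 64, 128, 192 and 256), so by
	   the time cpy_line_vfp copies a line its first double-word has
	   already been read; the macro then refills the register from
	   five lines (320 bytes) further on.  */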

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
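	/* "SMS" here refers to software modulo scheduling: the loop is
	   software pipelined so that each iteration stores the four
	   register pairs loaded by the previous one while loading the
	   next four, keeping loads and stores overlapped.  B, C and D
	   live in callee-saved registers, so they are spilled to the
	   frame first.  */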
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr
#endif

	cfi_restore_state
	cfi_remember_state

.Lcpy_notaligned:
	pld	[src, #0]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
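	/* Same shift-and-flags trick as in the mutually aligned path above:
	   the low three bits of DST select a 4-, 2- and/or 1-byte
	   pre-copy.  */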
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8	{\reglist}, [\basereg]!
	.endm
	.macro neon_store_multi reglist, basereg
	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
	.endm

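	/* DST is 64-bit aligned at this point, so the stores carry a :64
	   alignment hint via the ALIGN macro; SRC has unknown alignment,
	   so the loads do not.  */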
	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs	count, count, #64
	bhs	1b
2:
	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
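	/* SRC may have arbitrary byte alignment here and LDRD does not
	   tolerate unaligned addresses, so the loads use pairs of LDR
	   (which do, given the unaligned-access assumption above); DST has
	   been brought to 64-bit alignment, so the stores can still use
	   STRD.  */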
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bne	.Ltail63unaligned
	bx	lr

END(memcpy)
libc_hidden_builtin_def (memcpy)