/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  arch/arm/include/asm/xor.h
 *
 *  Copyright (C) 2001 Russell King
 */
#include <linux/hardirq.h>
#include <asm-generic/xor.h>
#include <asm/hwcap.h>
#include <asm/neon.h>

#define __XOR(a1, a2) a1 ^= a2

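/*
 * The helpers below use ARM load/store-multiple instructions to move a
 * whole block of words per instruction.  GET_BLOCK_{2,4} load two or
 * four words from the destination buffer (without advancing it),
 * XOR_BLOCK_{2,4} load the same number of words from a source buffer
 * (advancing it via the "!" writeback) and XOR them in, and
 * PUT_BLOCK_{2,4} store the result and advance the destination.  The
 * stores are marked volatile so the compiler cannot discard them.
 */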
#define GET_BLOCK_2(dst) \
	__asm__("ldmia	%0, {%1, %2}" \
		: "=r" (dst), "=r" (a1), "=r" (a2) \
		: "0" (dst))

#define GET_BLOCK_4(dst) \
	__asm__("ldmia	%0, {%1, %2, %3, %4}" \
		: "=r" (dst), "=r" (a1), "=r" (a2), "=r" (a3), "=r" (a4) \
		: "0" (dst))

#define XOR_BLOCK_2(src) \
	__asm__("ldmia	%0!, {%1, %2}" \
		: "=r" (src), "=r" (b1), "=r" (b2) \
		: "0" (src)); \
	__XOR(a1, b1); __XOR(a2, b2);

#define XOR_BLOCK_4(src) \
	__asm__("ldmia	%0!, {%1, %2, %3, %4}" \
		: "=r" (src), "=r" (b1), "=r" (b2), "=r" (b3), "=r" (b4) \
		: "0" (src)); \
	__XOR(a1, b1); __XOR(a2, b2); __XOR(a3, b3); __XOR(a4, b4)

#define PUT_BLOCK_2(dst) \
	__asm__ __volatile__("stmia	%0!, {%2, %3}" \
		: "=r" (dst) \
		: "0" (dst), "r" (a1), "r" (a2))

#define PUT_BLOCK_4(dst) \
	__asm__ __volatile__("stmia	%0!, {%2, %3, %4, %5}" \
		: "=r" (dst) \
		: "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4))

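/*
 * Scalar XOR routines.  The 2- and 3-source versions below process
 * four words per source per loop iteration; the a/b temporaries are
 * pinned to specific registers with register-asm so that every
 * ldm/stm operand lives in a known, distinct core register.
 */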
static void
xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2)
{
	unsigned int lines = bytes / sizeof(unsigned long) / 4;
	register unsigned int a1 __asm__("r4");
	register unsigned int a2 __asm__("r5");
	register unsigned int a3 __asm__("r6");
	register unsigned int a4 __asm__("r10");
	register unsigned int b1 __asm__("r8");
	register unsigned int b2 __asm__("r9");
	register unsigned int b3 __asm__("ip");
	register unsigned int b4 __asm__("lr");

	do {
		GET_BLOCK_4(p1);
		XOR_BLOCK_4(p2);
		PUT_BLOCK_4(p1);
	} while (--lines);
}

static void
xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3)
{
	unsigned int lines = bytes / sizeof(unsigned long) / 4;
	register unsigned int a1 __asm__("r4");
	register unsigned int a2 __asm__("r5");
	register unsigned int a3 __asm__("r6");
	register unsigned int a4 __asm__("r10");
	register unsigned int b1 __asm__("r8");
	register unsigned int b2 __asm__("r9");
	register unsigned int b3 __asm__("ip");
	register unsigned int b4 __asm__("lr");

	do {
		GET_BLOCK_4(p1);
		XOR_BLOCK_4(p2);
		XOR_BLOCK_4(p3);
		PUT_BLOCK_4(p1);
	} while (--lines);
}

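/*
 * The 4- and 5-source versions work on two words per source per
 * iteration instead of four, which leaves enough core registers free
 * for the additional source pointers.
 */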
static void
xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4)
{
	unsigned int lines = bytes / sizeof(unsigned long) / 2;
	register unsigned int a1 __asm__("r8");
	register unsigned int a2 __asm__("r9");
	register unsigned int b1 __asm__("ip");
	register unsigned int b2 __asm__("lr");

	do {
		GET_BLOCK_2(p1);
		XOR_BLOCK_2(p2);
		XOR_BLOCK_2(p3);
		XOR_BLOCK_2(p4);
		PUT_BLOCK_2(p1);
	} while (--lines);
}

static void
xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
	       const unsigned long * __restrict p2,
	       const unsigned long * __restrict p3,
	       const unsigned long * __restrict p4,
	       const unsigned long * __restrict p5)
{
	unsigned int lines = bytes / sizeof(unsigned long) / 2;
	register unsigned int a1 __asm__("r8");
	register unsigned int a2 __asm__("r9");
	register unsigned int b1 __asm__("ip");
	register unsigned int b2 __asm__("lr");

	do {
		GET_BLOCK_2(p1);
		XOR_BLOCK_2(p2);
		XOR_BLOCK_2(p3);
		XOR_BLOCK_2(p4);
		XOR_BLOCK_2(p5);
		PUT_BLOCK_2(p1);
	} while (--lines);
}

static struct xor_block_template xor_block_arm4regs = {
	.name	= "arm4regs",
	.do_2	= xor_arm4regs_2,
	.do_3	= xor_arm4regs_3,
	.do_4	= xor_arm4regs_4,
	.do_5	= xor_arm4regs_5,
};

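/*
 * XOR_TRY_TEMPLATES is expanded by the generic xor code in
 * crypto/xor.c: each xor_speed() call benchmarks one candidate and the
 * fastest template is then used for xor_blocks().  NEON_TEMPLATES is
 * defined further down; it expands to nothing unless kernel-mode NEON
 * support is configured.
 */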
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
	do {					\
		xor_speed(&xor_block_arm4regs);	\
		xor_speed(&xor_block_8regs);	\
		xor_speed(&xor_block_32regs);	\
		NEON_TEMPLATES;			\
	} while (0)

#ifdef CONFIG_KERNEL_MODE_NEON

extern struct xor_block_template const xor_block_neon_inner;

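/*
 * NEON wrappers: kernel_neon_begin()/kernel_neon_end() must not be
 * called from interrupt context, so fall back to the scalar arm4regs
 * routines there and only use the NEON inner implementation from
 * process context.
 */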
static void
xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2)
{
	if (in_interrupt()) {
		xor_arm4regs_2(bytes, p1, p2);
	} else {
		kernel_neon_begin();
		xor_block_neon_inner.do_2(bytes, p1, p2);
		kernel_neon_end();
	}
}

static void
xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2,
	   const unsigned long * __restrict p3)
{
	if (in_interrupt()) {
		xor_arm4regs_3(bytes, p1, p2, p3);
	} else {
		kernel_neon_begin();
		xor_block_neon_inner.do_3(bytes, p1, p2, p3);
		kernel_neon_end();
	}
}

static void
xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2,
	   const unsigned long * __restrict p3,
	   const unsigned long * __restrict p4)
{
	if (in_interrupt()) {
		xor_arm4regs_4(bytes, p1, p2, p3, p4);
	} else {
		kernel_neon_begin();
		xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
		kernel_neon_end();
	}
}

static void
xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
	   const unsigned long * __restrict p2,
	   const unsigned long * __restrict p3,
	   const unsigned long * __restrict p4,
	   const unsigned long * __restrict p5)
{
	if (in_interrupt()) {
		xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
	} else {
		kernel_neon_begin();
		xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
		kernel_neon_end();
	}
}

static struct xor_block_template xor_block_neon = {
	.name	= "neon",
	.do_2	= xor_neon_2,
	.do_3	= xor_neon_3,
	.do_4	= xor_neon_4,
	.do_5	= xor_neon_5
};

#define NEON_TEMPLATES	\
	do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
#else
#define NEON_TEMPLATES
#endif