/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
***************************************************************************/

/*
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * by cpp. Register usage follows the comments at the two entry points.
 */

.file "twofish-x86_64-asm.S"
.text

#include <asm/asm-offsets.h>

/* byte offsets of the four 32 bit words a, b, c, d inside a block */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets) */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/*
 * Register aliases to allow macro substitution: for each Rn the suffixes
 * D, B and H name the 32 bit, low 8 bit and high 8 bit (bits 8-15) views.
 * The round macros build these names by token pasting (a ## B and so on).
 */

#define R0	%rax
#define R0D	%eax
#define R0B	%al
#define R0H	%ah

#define R1	%rbx
#define R1D	%ebx
#define R1B	%bl
#define R1H	%bh

#define R2	%rcx
#define R2D	%ecx
#define R2B	%cl
#define R2H	%ch

#define R3	%rdx
#define R3D	%edx
#define R3B	%dl
#define R3H	%dh


/*
 * performs input whitening:
 * xors the whitening key at w+offset of the context into src
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening:
 * xors the whitening key at w+16+offset of the context into src
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 *
 * %r11 must hold the context address; a displacement-free table access
 * (%r11,%rdi,4) reads the S0 array, which sits at offset 0.
 * round is the byte offset of the round key pair relative to k.
 * Scratch registers: %edi (table index), %r8d (sum for b), %r9d (sum for a).
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * a input register containing a(rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening:
 * %r10 is loaded with b in its upper and a in its lower 32 bits
 *
 * %r11 must hold the context address, as in encrypt_round.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 *
 * %r11 must hold the context address; a displacement-free table access
 * (%r11,%rdi,4) reads the S0 array, which sits at offset 0.
 * round is the byte offset of the round key pair relative to k.
 * Scratch registers: %edi (table index), %r8d (sum for b), %r9d (sum for a).
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16, same lookup order as in
 *   decrypt_round)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening:
 * %r10 is loaded with b in its upper and a in its lower 32 bits
 *
 * %r11 must hold the context address, as in decrypt_round.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

.align 8
.global twofish_enc_blk
.global twofish_dec_blk
.type twofish_enc_blk,@function
.type twofish_dec_blk,@function

/*
 * twofish_enc_blk: encrypts one 16 byte block
 *
 * In:       %rdi  crypto tfm address
 *           %rsi  output address
 *           %rdx  input address
 * Out:      %rax  always 1
 * Saved:    %rbx  (R1, pushed on entry and popped before return)
 * Clobbers: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
twofish_enc_blk:
	pushq	R1

	/* %rdi contains the crypto tfm address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two quadwords; each 64 bit xor whitens two words */
	movq	(R3),		R1
	movq	8(R3),		R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/* split into R0D = a (rotated 16), R1D = b, R2D = c, R3D = d (rol $1) */
	mov	R1D,		R0D
	rol	$16,		R0D
	shr	$32,		R1
	mov	R3D,		R2D
	shr	$32,		R3
	rol	$1,		R3D

	/* the two register pairs swap roles from one round to the next */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,1*8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/* %r10 was packed by the last round (R3D upper, R2D lower half) */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,		(%rsi)

	/* pack R1D (upper half) and R0D (lower half) into one quadword */
	shl	$32,		R1
	xor	R0,		R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,		8(%rsi)

	popq	R1
	movl	$1,		%eax	/* 32 bit write zero extends into %rax */
	ret
.size twofish_enc_blk,.-twofish_enc_blk

/*
 * twofish_dec_blk: decrypts one 16 byte block
 *
 * In:       %rdi  crypto tfm address
 *           %rsi  output address
 *           %rdx  input address
 * Out:      %rax  always 1
 * Saved:    %rbx  (R1, pushed on entry and popped before return)
 * Clobbers: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
twofish_dec_blk:
	pushq	R1

	/* %rdi contains the crypto tfm address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two quadwords and undo the output whitening */
	movq	(R3),		R1
	movq	8(R3),		R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/* split into R0D = a, R1D = b (rotated 16), R2D = c (rol $1), R3D = d */
	mov	R1D,		R0D
	shr	$32,		R1
	rol	$16,		R1D
	mov	R3D,		R2D
	shr	$32,		R3
	rol	$1,		R2D

	/* round keys are applied in reverse order; register pairs alternate */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* %r10 was packed by the last round (R3D upper, R2D lower half) */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,		(%rsi)

	/* pack R1D (upper half) and R0D (lower half) into one quadword */
	shl	$32,		R1
	xor	R0,		R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,		8(%rsi)

	popq	R1
	movl	$1,		%eax	/* 32 bit write zero extends into %rax */
	ret
.size twofish_dec_blk,.-twofish_dec_blk