/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * Structure of crypto context: byte offsets relative to CTX.
 *
 * s0..s3 are the bases of four lookup tables spaced 1024 bytes apart; the
 * code indexes them with a zero-extended byte scaled by 4.  w and k are the
 * bases of the key words XORed around the rounds (w) and added inside each
 * round (k).
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  3-way twofish
 **********************************************************************/

/* Pointer registers: context, and the buffer currently read or written. */
#define CTX	%rdi
#define RIO	%rdx

/*
 * Per-lane working registers.  Lane N (N = 0..2) handles the Nth of the
 * three blocks processed together.  For every lane:
 *   RABn - first 8 bytes of the block (64-bit, 32-bit "d", byte 1 "bh",
 *          byte 0 "bl" views; the byte views are needed for table indexing)
 *   RCDn - second 8 bytes of the block
 *   RXn  - first 32-bit accumulator built from the table lookups
 *   RYn  - second 32-bit accumulator built from the table lookups
 */

/* lane 0 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bh	%ah
#define RAB0bl	%al
#define RCD0	%r8
#define RCD0d	%r8d
#define RX0	%rbp
#define RX0d	%ebp
#define RY0	%r13
#define RY0d	%r13d

/* lane 1 */
#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bh	%bh
#define RAB1bl	%bl
#define RCD1	%r9
#define RCD1d	%r9d
#define RX1	%r11
#define RX1d	%r11d
#define RY1	%r14
#define RY1d	%r14d

/* lane 2 */
#define RAB2	%rcx
#define RAB2d	%ecx
#define RAB2bh	%ch
#define RAB2bl	%cl
#define RCD2	%r10
#define RCD2d	%r10d
#define RX2	%r12
#define RX2d	%r12d
#define RY2	%r15
#define RY2d	%r15d

/*
 * Scratch registers for table indices.  RT0 is the same register as RIO
 * (%rdx), so RIO is only valid until the first round macro runs and has to
 * be reloaded before the output is written.
 */
#define RT0	%rdx
#define RT0d	%edx
#define RT1	%rsi
#define RT1d	%esi
/*
 * do16bit_ror: consume the two low bytes of `ab` as table indices.
 *
 *   tmp2 = byte 0 of ab		(index into table T0)
 *   tmp1 = byte 1 of ab		(index into table T1)
 *   ab   = ab rotated right by `rot` bits
 *   dst  = dst <op1> T0[tmp2]
 *   dst  = dst <op2> T1[tmp1]
 *
 * op1/op2 are mnemonic stems (mov or xor) that get an `l` suffix; with
 * op1 = mov the previous value of dst is discarded.  tmp2 may be the same
 * register as dst because the index is consumed by the very instruction
 * that writes dst.  Clobbers tmp1, tmp2 and flags.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each of the three lanes, x is built from bytes 0,1,2,3 of `ab`
 * (tables Tx0..Tx3) and y from bytes 4,5,6,7 (tables Ty1,Ty2,Ty3,Ty0).
 * The rotate amounts per lane are 32, 48, 32, 16, i.e. 128 bits in total,
 * so `ab` is back to its original value when it is finally exchanged with
 * `cd`.  After this macro the register named ab ## N therefore holds the
 * former cd ## N and vice versa.
 *
 * The first pass uses the x/y registers themselves as index temporaries
 * (they are overwritten by the mov); the second pass must preserve x/y and
 * uses RT1 instead.  Clobbers RT0, RT1 and flags.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	xchgq cd ## 0,			ab ## 0; \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	xchgq cd ## 1,			ab ## 1; \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	xchgq cd ## 2,			ab ## 2;

/*
 * Finish one encryption round for a single lane.
 *
 *   x += y; y += x;
 *   x += k[2n];   y += k[2n+1];
 *   low  half of ab = ror32(low half of ab ^ x, 1)
 *   high half of ab = rol32(high half of ab, 1) ^ y
 *
 * `ab` is the 64-bit register as named after the exchange done by g1g2_3.
 * The 32-bit rorl zero-extends x, so the final orq only fills the low half.
 * Clobbers x, y and flags.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;

/*
 * Finish one decryption round for a single lane.
 *
 *   x += y; y += x;
 *   x += k[2n];   y += k[2n+1];
 *   low  half of ba = ror32(low half of ba ^ y, 1)
 *   high half of ba = rol32(high half of ba, 1) ^ x
 *
 * Same structure as enc_round_end with the roles of x and y swapped in the
 * final mixing.  Clobbers x, y and flags.
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;

/* One encryption round (round-key pair n) on all three lanes. */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round (round-key pair n) on all three lanes.  Compared to
 * encryption the x/y accumulators are passed to g1g2_3 in swapped order and
 * the table order is shifted.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/*
 * Two consecutive rounds, 2n then 2n+1.  `n` is parenthesized so that an
 * expression argument is scaled as a whole.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);

/* Two consecutive rounds in reverse order, 2n+1 then 2n. */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));

/*
 * Load one 8-byte half of each of three consecutive 16-byte blocks and XOR
 * it with the 8 bytes of key material at w + 4*m.
 *
 *   in - pointer register holding the source address
 *   n  - 32-bit word offset of the half inside a block (0 or 2)
 *   xy - register name stem (lane suffix 0..2 is appended)
 *   m  - 32-bit word index into w; parenthesized so expressions are safe
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;

/*
 * XOR one 8-byte half of each lane with the key material at w + 4*m and
 * write it to three consecutive 16-byte blocks.  `op` is the mnemonic stem
 * used for the write: mov stores the value, xor combines it with what is
 * already in memory.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);

/* Encryption input: both halves of three blocks from RIO, using w[0..3]. */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/*
 * Encryption output to RIO using w[4..7].  RCD goes to the first half of
 * each block and RAB to the second.
 */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * Decryption input: both halves of three blocks from RIO, using w[4..7],
 * then the two 32-bit words inside every register are swapped.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * Decryption output: swap the 32-bit words back, then store to RIO using
 * w[0..3].  RCD goes to the first half of each block and RAB to the second.
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);
.align 8
.global __twofish_enc_blk_3way
.type   __twofish_enc_blk_3way,@function;

/*
 * __twofish_enc_blk_3way: encrypt three consecutive 16-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src, RIO
 *	%rcx: bool, if true: xor output
 *
 * Reads 48 bytes from src and writes 48 bytes to dst.  When the bool is
 * non-zero (only its low byte is tested) the result is XORed into dst
 * instead of being stored.
 *
 * %rbx, %rbp and %r12-%r15 are saved and restored.  %rax, %rcx, %rdx,
 * %rsi, %r8-%r11 and flags are clobbered; no return value is set up.
 */
__twofish_enc_blk_3way:
	pushq %r15;
	pushq %r14;
	pushq %r13;
	pushq %r12;
	pushq %rbp;
	pushq %rbx;

	/*
	 * %rcx is RAB2 and %rsi is RT1, so both arguments are overwritten by
	 * the rounds below; park them on the stack.
	 */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	encrypt_cycle3(RAB, RCD, 0);
	encrypt_cycle3(RAB, RCD, 1);
	encrypt_cycle3(RAB, RCD, 2);
	encrypt_cycle3(RAB, RCD, 3);
	encrypt_cycle3(RAB, RCD, 4);
	encrypt_cycle3(RAB, RCD, 5);
	encrypt_cycle3(RAB, RCD, 6);
	encrypt_cycle3(RAB, RCD, 7);

	/* RIO (%rdx) doubled as RT0 during the rounds; reload it with dst. */
	popq RIO; /* dst */
	/* %rbp (RX0) is free after the last round; use it for the flag. */
	popq %rbp; /* bool xor */

	testb %bpl, %bpl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;

	/* Assembler-local label: keeps this branch target out of the symbol table. */
.L__enc_xor3:
	outunpack_enc3(xor);

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;

.size   __twofish_enc_blk_3way,.-__twofish_enc_blk_3way

.align 8
.global twofish_dec_blk_3way
.type   twofish_dec_blk_3way,@function;

/*
 * twofish_dec_blk_3way: decrypt three consecutive 16-byte blocks.
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src, RIO
 *
 * Reads 48 bytes from src and stores 48 bytes to dst.
 *
 * %rbx, %rbp and %r12-%r15 are saved and restored.  %rax, %rcx, %rdx,
 * %rsi, %r8-%r11 and flags are clobbered; no return value is set up.
 */
twofish_dec_blk_3way:
	pushq %r15;
	pushq %r14;
	pushq %r13;
	pushq %r12;
	pushq %rbp;
	pushq %rbx;

	/* %rsi is RT1 and gets overwritten by the rounds; park dst. */
	pushq %rsi; /* dst */

	inpack_dec3();

	decrypt_cycle3(RAB, RCD, 7);
	decrypt_cycle3(RAB, RCD, 6);
	decrypt_cycle3(RAB, RCD, 5);
	decrypt_cycle3(RAB, RCD, 4);
	decrypt_cycle3(RAB, RCD, 3);
	decrypt_cycle3(RAB, RCD, 2);
	decrypt_cycle3(RAB, RCD, 1);
	decrypt_cycle3(RAB, RCD, 0);

	/* RIO (%rdx) doubled as RT0 during the rounds; reload it with dst. */
	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %rbp;
	popq %r12;
	popq %r13;
	popq %r14;
	popq %r15;
	ret;

.size   twofish_dec_blk_3way,.-twofish_dec_blk_3way