2 ; ---------------------------------------------------------------------------
\r
3 ; Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved.
\r
7 ; The free distribution and use of this software in both source and binary
\r
8 ; form is allowed (with or without changes) provided that:
\r
10 ; 1. distributions of this source code include the above copyright
\r
11 ; notice, this list of conditions and the following disclaimer;
\r
13 ; 2. distributions in binary form include the above copyright
\r
14 ; notice, this list of conditions and the following disclaimer
\r
15 ; in the documentation and/or other associated materials;
\r
17 ; 3. the copyright holder's name is not used to endorse products
\r
18 ; built using this software without specific written permission.
\r
20 ; ALTERNATIVELY, provided that this notice is retained in full, this product
\r
21 ; may be distributed under the terms of the GNU General Public License (GPL),
\r
22 ; in which case the provisions of the GPL apply INSTEAD OF those given above.
\r
26 ; This software is provided 'as is' with no explicit or implied warranties
\r
27 ; in respect of its properties, including, but not limited to, correctness
\r
28 ; and/or fitness for purpose.
\r
29 ; ---------------------------------------------------------------------------
\r
32 ; I am grateful to Dag Arne Osvik for many discussions of the techniques that
\r
33 ; can be used to optimise AES assembler code on AMD64/EM64T architectures.
\r
34 ; Some of the techniques used in this implementation are the result of
\r
35 ; suggestions made by him for which I am most grateful.
\r
37 ; An AES implementation for AMD64 processors using the YASM assembler. This
\r
38 ; implementation provides only encryption and decryption and hence requires key
\r
39 ; scheduling support in C. It uses 8k bytes of tables but its encryption and
\r
40 ; decryption performance is very close to that obtained using large tables.
\r
41 ; It can use either Windows or Gnu/Linux calling conventions, which are as
\r
47 ; context (cx) r8 rdx
\r
49 ; preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
\r
50 ; registers rdi - on both
\r
52 ; destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
\r
53 ; registers - rdi on both
\r
55 ; The default convention is that for windows, the gnu/linux convention being
\r
56 ; used if __GNUC__ is defined.
\r
58 ; This code provides the standard AES block size (128 bits, 16 bytes) and the
\r
59 ; three standard AES key sizes (128, 192 and 256 bits). It has the same call
\r
60 ; interface as my C implementation. It uses the Microsoft C AMD64 calling
\r
61 ; conventions in which the three parameters are placed in rcx, rdx and r8
\r
62 ; respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
\r
64 ; AES_RETURN aes_encrypt(const unsigned char in_blk[],
\r
65 ; unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
\r
67 ; AES_RETURN aes_decrypt(const unsigned char in_blk[],
\r
68 ; unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
\r
70 ; AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
\r
71 ; const aes_encrypt_ctx cx[1]);
\r
73 ; AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
\r
74 ; const aes_decrypt_ctx cx[1]);
\r
76 ; AES_RETURN aes_encrypt_key(const unsigned char key[],
\r
77 ; unsigned int len, const aes_encrypt_ctx cx[1]);
\r
79 ; AES_RETURN aes_decrypt_key(const unsigned char key[],
\r
80 ; unsigned int len, const aes_decrypt_ctx cx[1]);
\r
82 ; where <NNN> is 128, 192 or 256. In the last two calls the length can be in
\r
83 ; either bits or bytes.
\r
85 ; Comment in/out the following lines to obtain the desired subroutines. These
\r
86 ; selections MUST match those in the C header file aes.h
\r
88 %define AES_128 ; define if AES with 128 bit keys is needed
\r
89 %define AES_192 ; define if AES with 192 bit keys is needed
\r
90 %define AES_256 ; define if AES with 256 bit keys is needed
\r
91 %define AES_VAR ; define if a variable key size is needed
\r
92 %define ENCRYPTION ; define if encryption is needed
\r
93 %define DECRYPTION ; define if decryption is needed
\r
94 %define AES_REV_DKS ; define if key decryption schedule is reversed
\r
95 %define LAST_ROUND_TABLES ; define for the faster version using extra tables
\r
97 ; The encryption key schedule has the following in memory layout where N is the
\r
98 ; number of rounds (10, 12 or 14):
\r
100 ; lo: | input key (round 0) | ; each round is four 32-bit words
\r
101 ; | encryption round 1 |
\r
102 ; | encryption round 2 |
\r
104 ; | encryption round N-1 |
\r
105 ; hi: | encryption round N |
\r
107 ; The decryption key schedule is normally set up so that it has the same
\r
108 ; layout as above by actually reversing the order of the encryption key
\r
109 ; schedule in memory (this happens when AES_REV_DKS is set):
\r
111 ; lo: | decryption round 0 | = | encryption round N |
\r
112 ; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
\r
113 ; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
\r
115 ; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
\r
116 ; hi: | decryption round N | = | input key (round 0) |
\r
118 ; with rounds except the first and last modified using inv_mix_column()
\r
119 ; But if AES_REV_DKS is NOT set the order of keys is left as it is for
\r
120 ; encryption so that it has to be accessed in reverse when used for
\r
121 ; decryption (although the inverse mix column modifications are done)
\r
123 ; lo: | decryption round 0 | = | input key (round 0) |
\r
124 ; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
\r
125 ; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
\r
127 ; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
\r
128 ; hi: | decryption round N | = | encryption round N |
\r
130 ; This layout is faster when the assembler key scheduling provided here is used.
\r
133 ; The DLL interface must use the _stdcall convention in which the number
\r
134 ; of bytes of parameter space is added after an @ to the subroutine's name.
\r
135 ; We must also remove our parameters from the stack before return (see
\r
136 ; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
\r
138 ;%define DLL_EXPORT
\r
140 ; End of user defines
\r
155 %define KS_LENGTH 60
\r
157 %define KS_LENGTH 60
\r
159 %define KS_LENGTH 52
\r
161 %define KS_LENGTH 44
\r
201 ; finite field multiplies by {02}, {04} and {08}
\r
; f2(x): multiply by {02}. Shifts x left one bit and, when bit 7 of x is set,
; xors in 0x11b, which cancels the bit that moved into position 8.
; x is substituted textually and is not parenthesised, so pass a simple value.
203 %define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
\r
; f4(x): multiply by {04}. Shifts x left two bits, then applies one conditional
; 0x11b term per bit pushed out of the low byte: bit 6 of x selects 1*0x11b and
; bit 7 of x selects 2*0x11b (the "&2" mask yields the factor of two directly).
204 %define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
\r
; f8(x): multiply by {08}. Shifts x left three bits, then applies one conditional
; 0x11b term per bit pushed out of the low byte: bits 5, 6 and 7 of x select
; 1*0x11b, 2*0x11b and 4*0x11b respectively (via the &1, &2 and &4 masks).
205 %define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
\r
207 ; finite field multiplies required in table generation
\r
; f3(x): multiply by {03}, formed as {02}*x xor x. The composite multiplies
; defined after this one (f9, fb, fd, fe) are built the same way, by xor-ing
; the f8/f4/f2 products and x that make up each constant.
209 %define f3(x) (f2(x) ^ x)
\r
210 %define f9(x) (f8(x) ^ x)
\r
211 %define fb(x) (f8(x) ^ f2(x) ^ x)
\r
212 %define fd(x) (f8(x) ^ f4(x) ^ x)
\r
213 %define fe(x) (f8(x) ^ f4(x) ^ f2(x))
\r
215 ; macro for expanding S-box data
\r
218 db %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
\r
219 db %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
\r
220 db %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
\r
221 db %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
\r
222 db %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
\r
223 db %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
\r
224 db %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
\r
225 db %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
\r
226 db %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
\r
227 db %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
\r
228 db %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
\r
229 db %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
\r
230 db %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
\r
231 db %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
\r
232 db %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
\r
233 db %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
\r
234 db %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
\r
235 db %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
\r
236 db %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
\r
237 db %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
\r
238 db %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
\r
239 db %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
\r
240 db %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
\r
241 db %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
\r
242 db %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
\r
243 db %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
\r
244 db %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
\r
245 db %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
\r
246 db %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
\r
247 db %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
\r
248 db %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
\r
249 db %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
\r
253 db %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
\r
254 db %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
\r
255 db %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
\r
256 db %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
\r
257 db %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
\r
258 db %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
\r
259 db %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
\r
260 db %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
\r
261 db %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
\r
262 db %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
\r
263 db %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
\r
264 db %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
\r
265 db %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
\r
266 db %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
\r
267 db %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
\r
268 db %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
\r
269 db %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
\r
270 db %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
\r
271 db %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
\r
272 db %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
\r
273 db %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
\r
274 db %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
\r
275 db %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
\r
276 db %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
\r
277 db %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
\r
278 db %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
\r
279 db %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
\r
280 db %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
\r
281 db %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
\r
282 db %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
\r
283 db %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
\r
284 db %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
\r
; u8(x): expands to eight comma-separated byte values for one table entry:
; {02}*x, x, x, {03}*x, followed by the same four values again.
; The tab_* defines index entries with an 8-byte stride, matching this size.
287 %define u8(x) f2(x), x, x, f3(x), f2(x), x, x, f3(x)
\r
; v8(x): expands to eight comma-separated byte values for one table entry:
; {0e}*x, {09}*x, {0d}*x, {0b}*x, then {0e}*x, {09}*x, {0d}*x and finally the
; plain value x in byte 7 (the offset that tab_i reads).
288 %define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
\r
; w8(x): expands to eight comma-separated byte values with x in bytes 0 and 4
; and zero elsewhere. A 32-bit read at entry offsets 0, 3, 2 or 1 then sees x
; in byte 0, 1, 2 or 3 of the dword respectively.
; NOTE(review): presumably the layout of the extra last-round tables - confirm
; against the table definitions, which are not visible here.
289 %define w8(x) x, 0, 0, 0, x, 0, 0, 0
\r
291 %define tptr rbp ; table pointer
\r
292 %define kptr r8 ; key schedule pointer
\r
293 %define fofs 128 ; adjust offset in key schedule to keep |disp| < 128
\r
; fk_ref(x,y): memory operand for 32-bit word y (4 bytes apart) of the round
; key that lies x 16-byte rounds below kptr, biased by fofs. The forward round
; macros are invoked with x counting down from 13 to 0.
294 %define fk_ref(x,y) [kptr-16*x+fofs+4*y]
\r
297 %define ik_ref(x,y) [kptr-16*x+rofs+4*y]
\r
300 %define ik_ref(x,y) [kptr+16*x+rofs+4*y]
\r
; tab_0(x): memory operand at byte offset 0 of the 8-byte table entry x.
; The sibling defines tab_1, tab_2 and tab_3 address the same entry at byte
; offsets 3, 2 and 1. With the u8/v8 layouts a 32-bit read at those offsets
; sees the entry's 4-byte pattern rotated - presumably how one table serves
; all four byte positions of a column (confirm against the table data).
303 %define tab_0(x) [tptr+8*x]
\r
304 %define tab_1(x) [tptr+8*x+3]
\r
305 %define tab_2(x) [tptr+8*x+2]
\r
306 %define tab_3(x) [tptr+8*x+1]
\r
; tab_f(x): single byte at offset 1 of table entry x; in the u8 layout that
; byte is the plain value x. Read with movzx by the byte-wise variant of the
; last forward round (fl_rnd).
307 %define tab_f(x) byte [tptr+8*x+1]
\r
; tab_i(x): single byte at offset 7 of table entry x; in the v8 layout that
; byte is the plain value x. Read with movzx by the byte-wise variant of the
; last inverse round (il_rnd).
308 %define tab_i(x) byte [tptr+8*x+7]
\r
; t_ref(x,r): selects one of the tab_* accessors by pasting x onto "tab_" with
; the %+ concatenation operator, then applies it to r. For example
; t_ref(0,rsi) expands to tab_0(rsi) and t_ref(f,rsi) to tab_f(rsi).
309 %define t_ref(x,r) tab_ %+ x(r)
\r
311 %macro ff_rnd 5 ; normal forward round
\r
312 mov %1d, fk_ref(%5,0)
\r
313 mov %2d, fk_ref(%5,1)
\r
314 mov %3d, fk_ref(%5,2)
\r
315 mov %4d, fk_ref(%5,3)
\r
320 xor %1d, t_ref(0,rsi)
\r
321 xor %4d, t_ref(1,rdi)
\r
324 xor %3d, t_ref(2,rsi)
\r
325 xor %2d, t_ref(3,rdi)
\r
330 xor %2d, t_ref(0,rsi)
\r
331 xor %1d, t_ref(1,rdi)
\r
334 xor %4d, t_ref(2,rsi)
\r
335 xor %3d, t_ref(3,rdi)
\r
340 xor %3d, t_ref(0,rsi)
\r
341 xor %2d, t_ref(1,rdi)
\r
344 xor %1d, t_ref(2,rsi)
\r
345 xor %4d, t_ref(3,rdi)
\r
350 xor %4d, t_ref(0,rsi)
\r
351 xor %3d, t_ref(1,rdi)
\r
354 xor %2d, t_ref(2,rsi)
\r
355 xor %1d, t_ref(3,rdi)
\r
363 %ifdef LAST_ROUND_TABLES
\r
365 %macro fl_rnd 5 ; last forward round
\r
367 mov %1d, fk_ref(%5,0)
\r
368 mov %2d, fk_ref(%5,1)
\r
369 mov %3d, fk_ref(%5,2)
\r
370 mov %4d, fk_ref(%5,3)
\r
375 xor %1d, t_ref(0,rsi)
\r
376 xor %4d, t_ref(1,rdi)
\r
379 xor %3d, t_ref(2,rsi)
\r
380 xor %2d, t_ref(3,rdi)
\r
385 xor %2d, t_ref(0,rsi)
\r
386 xor %1d, t_ref(1,rdi)
\r
389 xor %4d, t_ref(2,rsi)
\r
390 xor %3d, t_ref(3,rdi)
\r
395 xor %3d, t_ref(0,rsi)
\r
396 xor %2d, t_ref(1,rdi)
\r
399 xor %1d, t_ref(2,rsi)
\r
400 xor %4d, t_ref(3,rdi)
\r
405 xor %4d, t_ref(0,rsi)
\r
406 xor %3d, t_ref(1,rdi)
\r
409 xor %2d, t_ref(2,rsi)
\r
410 xor %1d, t_ref(3,rdi)
\r
415 %macro fl_rnd 5 ; last forward round
\r
416 mov %1d, fk_ref(%5,0)
\r
417 mov %2d, fk_ref(%5,1)
\r
418 mov %3d, fk_ref(%5,2)
\r
419 mov %4d, fk_ref(%5,3)
\r
424 movzx esi, t_ref(f,rsi)
\r
425 movzx edi, t_ref(f,rdi)
\r
431 movzx esi, t_ref(f,rsi)
\r
432 movzx edi, t_ref(f,rdi)
\r
441 movzx esi, t_ref(f,rsi)
\r
442 movzx edi, t_ref(f,rdi)
\r
448 movzx esi, t_ref(f,rsi)
\r
449 movzx edi, t_ref(f,rdi)
\r
457 movzx esi, t_ref(f,rsi)
\r
458 movzx edi, t_ref(f,rdi)
\r
465 movzx esi, t_ref(f,rsi)
\r
466 movzx edi, t_ref(f,rdi)
\r
474 movzx esi, t_ref(f,rsi)
\r
475 movzx edi, t_ref(f,rdi)
\r
482 movzx esi, t_ref(f,rsi)
\r
483 movzx edi, t_ref(f,rdi)
\r
492 %macro ii_rnd 5 ; normal inverse round
\r
493 mov %1d, ik_ref(%5,0)
\r
494 mov %2d, ik_ref(%5,1)
\r
495 mov %3d, ik_ref(%5,2)
\r
496 mov %4d, ik_ref(%5,3)
\r
501 xor %1d, t_ref(0,rsi)
\r
502 xor %2d, t_ref(1,rdi)
\r
505 xor %3d, t_ref(2,rsi)
\r
506 xor %4d, t_ref(3,rdi)
\r
511 xor %2d, t_ref(0,rsi)
\r
512 xor %3d, t_ref(1,rdi)
\r
515 xor %4d, t_ref(2,rsi)
\r
516 xor %1d, t_ref(3,rdi)
\r
521 xor %3d, t_ref(0,rsi)
\r
522 xor %4d, t_ref(1,rdi)
\r
525 xor %1d, t_ref(2,rsi)
\r
526 xor %2d, t_ref(3,rdi)
\r
531 xor %4d, t_ref(0,rsi)
\r
532 xor %1d, t_ref(1,rdi)
\r
535 xor %2d, t_ref(2,rsi)
\r
536 xor %3d, t_ref(3,rdi)
\r
544 %ifdef LAST_ROUND_TABLES
\r
546 %macro il_rnd 5 ; last inverse round
\r
548 mov %1d, ik_ref(%5,0)
\r
549 mov %2d, ik_ref(%5,1)
\r
550 mov %3d, ik_ref(%5,2)
\r
551 mov %4d, ik_ref(%5,3)
\r
556 xor %1d, t_ref(0,rsi)
\r
557 xor %2d, t_ref(1,rdi)
\r
560 xor %3d, t_ref(2,rsi)
\r
561 xor %4d, t_ref(3,rdi)
\r
566 xor %2d, t_ref(0,rsi)
\r
567 xor %3d, t_ref(1,rdi)
\r
570 xor %4d, t_ref(2,rsi)
\r
571 xor %1d, t_ref(3,rdi)
\r
576 xor %3d, t_ref(0,rsi)
\r
577 xor %4d, t_ref(1,rdi)
\r
580 xor %1d, t_ref(2,rsi)
\r
581 xor %2d, t_ref(3,rdi)
\r
586 xor %4d, t_ref(0,rsi)
\r
587 xor %1d, t_ref(1,rdi)
\r
590 xor %2d, t_ref(2,rsi)
\r
591 xor %3d, t_ref(3,rdi)
\r
596 %macro il_rnd 5 ; last inverse round
\r
597 mov %1d, ik_ref(%5,0)
\r
598 mov %2d, ik_ref(%5,1)
\r
599 mov %3d, ik_ref(%5,2)
\r
600 mov %4d, ik_ref(%5,3)
\r
604 movzx esi, t_ref(i,rsi)
\r
605 movzx edi, t_ref(i,rdi)
\r
612 movzx esi, t_ref(i,rsi)
\r
613 movzx edi, t_ref(i,rdi)
\r
621 movzx esi, t_ref(i,rsi)
\r
622 movzx edi, t_ref(i,rdi)
\r
629 movzx esi, t_ref(i,rsi)
\r
630 movzx edi, t_ref(i,rdi)
\r
638 movzx esi, t_ref(i,rsi)
\r
639 movzx edi, t_ref(i,rdi)
\r
646 movzx esi, t_ref(i,rsi)
\r
647 movzx edi, t_ref(i,rdi)
\r
655 movzx esi, t_ref(i,rsi)
\r
656 movzx edi, t_ref(i,rdi)
\r
663 movzx esi, t_ref(i,rsi)
\r
664 movzx edi, t_ref(i,rdi)
\r
680 section .data align=64
\r
684 %ifdef LAST_ROUND_TABLES
\r
688 section .text align=16
\r
693 sub rsp, 4*8 ; gnu/linux binary interface
\r
694 mov [rsp+0*8], rsi ; output pointer
\r
695 mov r8, rdx ; context
\r
697 sub rsp, 6*8 ; windows binary interface
\r
700 mov [rsp+0*8], rdx ; output pointer
\r
701 mov rdi, rcx ; input pointer
\r
703 mov [rsp+1*8], rbx ; input pointer in rdi
\r
704 mov [rsp+2*8], rbp ; output pointer in [rsp]
\r
705 mov [rsp+3*8], r12 ; context in r8
\r
707 movzx esi, byte [kptr+4*KS_LENGTH]
\r
708 lea tptr,[enc_tab wrt rip]
\r
716 xor eax, [kptr+fofs]
\r
717 xor ebx, [kptr+fofs+4]
\r
718 xor ecx, [kptr+fofs+8]
\r
719 xor edx, [kptr+fofs+12]
\r
721 lea kptr,[kptr+rsi]
\r
731 .1: ff_rnd r9, r10, r11, r12, 13
\r
732 ff_rnd r9, r10, r11, r12, 12
\r
733 .2: ff_rnd r9, r10, r11, r12, 11
\r
734 ff_rnd r9, r10, r11, r12, 10
\r
735 .3: ff_rnd r9, r10, r11, r12, 9
\r
736 ff_rnd r9, r10, r11, r12, 8
\r
737 ff_rnd r9, r10, r11, r12, 7
\r
738 ff_rnd r9, r10, r11, r12, 6
\r
739 ff_rnd r9, r10, r11, r12, 5
\r
740 ff_rnd r9, r10, r11, r12, 4
\r
741 ff_rnd r9, r10, r11, r12, 3
\r
742 ff_rnd r9, r10, r11, r12, 2
\r
743 ff_rnd r9, r10, r11, r12, 1
\r
744 fl_rnd r9, r10, r11, r12, 0
\r
774 section .data align=64
\r
778 %ifdef LAST_ROUND_TABLES
\r
782 section .text align=16
\r
787 sub rsp, 4*8 ; gnu/linux binary interface
\r
788 mov [rsp+0*8], rsi ; output pointer
\r
789 mov r8, rdx ; context
\r
791 sub rsp, 6*8 ; windows binary interface
\r
794 mov [rsp+0*8], rdx ; output pointer
\r
795 mov rdi, rcx ; input pointer
\r
797 mov [rsp+1*8], rbx ; input pointer in rdi
\r
798 mov [rsp+2*8], rbp ; output pointer in [rsp]
\r
799 mov [rsp+3*8], r12 ; context in r8
\r
801 movzx esi,byte[kptr+4*KS_LENGTH]
\r
802 lea tptr,[dec_tab wrt rip]
\r
812 lea kptr,[kptr+rsi]
\r
817 xor eax, [rdi+rofs]
\r
818 xor ebx, [rdi+rofs+4]
\r
819 xor ecx, [rdi+rofs+8]
\r
820 xor edx, [rdi+rofs+12]
\r
831 .1: ii_rnd r9, r10, r11, r12, 13
\r
832 ii_rnd r9, r10, r11, r12, 12
\r
833 .2: ii_rnd r9, r10, r11, r12, 11
\r
834 ii_rnd r9, r10, r11, r12, 10
\r
835 .3: ii_rnd r9, r10, r11, r12, 9
\r
836 ii_rnd r9, r10, r11, r12, 8
\r
837 ii_rnd r9, r10, r11, r12, 7
\r
838 ii_rnd r9, r10, r11, r12, 6
\r
839 ii_rnd r9, r10, r11, r12, 5
\r
840 ii_rnd r9, r10, r11, r12, 4
\r
841 ii_rnd r9, r10, r11, r12, 3
\r
842 ii_rnd r9, r10, r11, r12, 2
\r
843 ii_rnd r9, r10, r11, r12, 1
\r
844 il_rnd r9, r10, r11, r12, 0
\r
852 .4: mov rbx, [rsp+1*8]
\r