--- /dev/null
+\r
+; ---------------------------------------------------------------------------\r
+; Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved.\r
+;\r
+; LICENSE TERMS\r
+;\r
+; The free distribution and use of this software in both source and binary\r
+; form is allowed (with or without changes) provided that:\r
+;\r
+; 1. distributions of this source code include the above copyright\r
+; notice, this list of conditions and the following disclaimer;\r
+;\r
+; 2. distributions in binary form include the above copyright\r
+; notice, this list of conditions and the following disclaimer\r
+; in the documentation and/or other associated materials;\r
+;\r
+; 3. the copyright holder's name is not used to endorse products\r
+; built using this software without specific written permission.\r
+;\r
+; ALTERNATIVELY, provided that this notice is retained in full, this product\r
+; may be distributed under the terms of the GNU General Public License (GPL),\r
+; in which case the provisions of the GPL apply INSTEAD OF those given above.\r
+;\r
+; DISCLAIMER\r
+;\r
+; This software is provided 'as is' with no explicit or implied warranties\r
+; in respect of its properties, including, but not limited to, correctness\r
+; and/or fitness for purpose.\r
+; ---------------------------------------------------------------------------\r
+; Issue 09/09/2006\r
+\r
+; I am grateful to Dag Arne Osvik for many discussions of the techniques that\r
+; can be used to optimise AES assembler code on AMD64/EM64T architectures.\r
+; Some of the techniques used in this implementation are the result of\r
+; suggestions made by him for which I am most grateful.\r
+\r
+; An AES implementation for AMD64 processors using the YASM assembler. This\r
+; implementation provides only encryption and decryption and hence requires key\r
+; scheduling support in C. It uses 8k bytes of tables but its encryption and\r
+; decryption performance is very close to that obtained using large tables.\r
+; It can use either Windows or Gnu/Linux calling conventions, which are as\r
+; follows:\r
+; windows gnu/linux\r
+;\r
+; in_blk rcx rdi\r
+; out_blk rdx rsi\r
+; context (cx) r8 rdx\r
+;\r
+; preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15\r
+; registers rdi - on both\r
+;\r
+; destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11\r
+; registers - rdi on both\r
+;\r
+; The default convention is that for windows, the gnu/linux convention being\r
+; used if __GNUC__ is defined.\r
+;\r
+; This code provides the standard AES block size (128 bits, 16 bytes) and the\r
+; three standard AES key sizes (128, 192 and 256 bits). It has the same call\r
+; interface as my C implementation. It uses the Microsoft C AMD64 calling\r
+; conventions in which the three parameters are placed in rcx, rdx and r8\r
+; respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.\r
+;\r
+; AES_RETURN aes_encrypt(const unsigned char in_blk[],\r
+; unsigned char out_blk[], const aes_encrypt_ctx cx[1]);\r
+;\r
+; AES_RETURN aes_decrypt(const unsigned char in_blk[],\r
+; unsigned char out_blk[], const aes_decrypt_ctx cx[1]);\r
+;\r
+; AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],\r
+; const aes_encrypt_ctx cx[1]);\r
+;\r
+; AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],\r
+; const aes_decrypt_ctx cx[1]);\r
+;\r
+; AES_RETURN aes_encrypt_key(const unsigned char key[],\r
+; unsigned int len, const aes_encrypt_ctx cx[1]);\r
+;\r
+; AES_RETURN aes_decrypt_key(const unsigned char key[],\r
+; unsigned int len, const aes_decrypt_ctx cx[1]);\r
+;\r
+; where <NNN> is 128, 192 or 256. In the last two calls the length can be in\r
+; either bits or bytes.\r
+;\r
+; Comment in/out the following lines to obtain the desired subroutines. These\r
+; selections MUST match those in the C header file aes.h\r
+;\r
+; Notes on how the options interact within this file:\r
+; - AES_VAR forces AES_128, AES_192 and AES_256 to be defined as well (see\r
+; the conditional block that follows the user defines).\r
+; - ENCRYPTION / DECRYPTION each gate one routine together with its table.\r
+; - AES_REV_DKS selects how ik_ref addresses the decryption key schedule.\r
+; - LAST_ROUND_TABLES emits a second table after each main table and selects\r
+; the fl_rnd / il_rnd variants that use it.\r
+\r
+%define AES_128 ; define if AES with 128 bit keys is needed\r
+%define AES_192 ; define if AES with 192 bit keys is needed\r
+%define AES_256 ; define if AES with 256 bit keys is needed\r
+%define AES_VAR ; define if a variable key size is needed\r
+%define ENCRYPTION ; define if encryption is needed\r
+%define DECRYPTION ; define if decryption is needed\r
+%define AES_REV_DKS ; define if key decryption schedule is reversed\r
+%define LAST_ROUND_TABLES ; define for the faster version using extra tables\r
+\r
+; The encryption key schedule has the following in memory layout where N is the\r
+; number of rounds (10, 12 or 14):\r
+;\r
+; lo: | input key (round 0) | ; each round is four 32-bit words\r
+; | encryption round 1 |\r
+; | encryption round 2 |\r
+; ....\r
+; | encryption round N-1 |\r
+; hi: | encryption round N |\r
+;\r
+; The decryption key schedule is normally set up so that it has the same\r
+; layout as above by actually reversing the order of the encryption key\r
+; schedule in memory (this happens when AES_REV_DKS is set):\r
+;\r
+; lo: | decryption round 0 | = | encryption round N |\r
+; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]\r
+; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]\r
+; .... ....\r
+; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]\r
+; hi: | decryption round N | = | input key (round 0) |\r
+;\r
+; with rounds except the first and last modified using inv_mix_column()\r
+; But if AES_REV_DKS is NOT set the order of keys is left as it is for\r
+; encryption so that it has to be accessed in reverse when used for\r
+; decryption (although the inverse mix column modifications are done)\r
+;\r
+; lo: | decryption round 0 | = | input key (round 0) |\r
+; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]\r
+; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]\r
+; .... ....\r
+; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]\r
+; hi: | decryption round N | = | encryption round N |\r
+;\r
+; This layout is faster when the assembler key scheduling provided here\r
+; is used.\r
+;\r
+; The DLL interface must use the _stdcall convention in which the number\r
+; of bytes of parameter space is added after an @ to the subroutine's name.\r
+; We must also remove our parameters from the stack before return (see\r
+; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.\r
+\r
+;%define DLL_EXPORT\r
+\r
+; End of user defines\r
+\r
+; A variable key size build needs every fixed key size too, so switch on\r
+; whichever of them the user left undefined.\r
+%ifdef AES_VAR\r
+%ifndef AES_256\r
+%define AES_256\r
+%endif\r
+%ifndef AES_192\r
+%define AES_192\r
+%endif\r
+%ifndef AES_128\r
+%define AES_128\r
+%endif\r
+%endif\r
+\r
+; KS_LENGTH is the key schedule length in 32-bit words for the largest key\r
+; size enabled. AES_VAR needs no case of its own here because it has already\r
+; forced AES_256 on above, which selects the same value of 60.\r
+%ifdef AES_256\r
+%define KS_LENGTH 60\r
+%elifdef AES_192\r
+%define KS_LENGTH 52\r
+%else\r
+%define KS_LENGTH 44\r
+%endif\r
+\r
+; Alternative register names.\r
+; r0..r7 name the eight legacy 64-bit registers; the names with a 'd' or 'b'\r
+; suffix map a 64-bit register name to its 32-bit and low 8-bit forms; r0h..r3h\r
+; and r0d..r3d are the high-byte and 32-bit forms of r0..r3.\r
+; NOTE(review): presumably the suffixed names exist so that a macro appending\r
+; 'd' or 'b' to a 64-bit register argument (as the round macros do with %1d)\r
+; also works for the legacy registers. None of these aliases is referenced by\r
+; the code visible in this file - confirm before removing any of them.\r
+\r
+; numbered names for the legacy 64-bit registers\r
+%define r0 rax\r
+%define r1 rdx\r
+%define r2 rcx\r
+%define r3 rbx\r
+%define r4 rsi\r
+%define r5 rdi\r
+%define r6 rbp\r
+%define r7 rsp\r
+\r
+; 64-bit name + 'd' -> 32-bit register\r
+%define raxd eax\r
+%define rdxd edx\r
+%define rcxd ecx\r
+%define rbxd ebx\r
+%define rsid esi\r
+%define rdid edi\r
+%define rbpd ebp\r
+%define rspd esp\r
+\r
+; 64-bit name + 'b' -> low 8-bit register\r
+%define raxb al\r
+%define rdxb dl\r
+%define rcxb cl\r
+%define rbxb bl\r
+%define rsib sil\r
+%define rdib dil\r
+%define rbpb bpl\r
+%define rspb spl\r
+\r
+; numbered name + 'h' -> high byte of the low 16 bits (r0..r3 only)\r
+%define r0h ah\r
+%define r1h dh\r
+%define r2h ch\r
+%define r3h bh\r
+\r
+; numbered name + 'd' -> 32-bit register (r0..r3 only)\r
+%define r0d eax\r
+%define r1d edx\r
+%define r2d ecx\r
+%define r3d ebx\r
+\r
+; finite field multiplies by {02}, {04} and {08}\r
+;\r
+; Each macro shifts x left by 1, 2 or 3 bits and, for every input bit that the\r
+; shift moves past bit 7, XORs in the matching multiple of 0x11b (the\r
+; polynomial x^8 + x^4 + x^3 + x + 1). That clears the overflowed bit again,\r
+; so a byte-sized x gives a byte-sized result. These are evaluated by the\r
+; assembler when the tables are built, not at run time.\r
+\r
+%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))\r
+%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))\r
+%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))\r
+\r
+; finite field multiplies required in table generation\r
+; (each is the XOR of the power-of-two multiples that make up the constant:\r
+; {03} = {02}^{01}, {09} = {08}^{01}, {0b} = {08}^{02}^{01},\r
+; {0d} = {08}^{04}^{01}, {0e} = {08}^{04}^{02})\r
+\r
+%define f3(x) (f2(x) ^ x)\r
+%define f9(x) (f8(x) ^ x)\r
+%define fb(x) (f8(x) ^ f2(x) ^ x)\r
+%define fd(x) (f8(x) ^ f4(x) ^ x)\r
+%define fe(x) (f8(x) ^ f4(x) ^ f2(x))\r
+\r
+; macro for expanding S-box data\r
+;\r
+; enc_vals lists the 256 AES forward S-box values in index order (eight per\r
+; line) and passes each one through the single-parameter macro named by %1,\r
+; so every value is emitted as whatever db operand list %1 expands to. In\r
+; this file it is invoked with u8 and w8, which both give 8 bytes per value.\r
+\r
+%macro enc_vals 1\r
+ db %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)\r
+ db %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)\r
+ db %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)\r
+ db %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)\r
+ db %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)\r
+ db %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)\r
+ db %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)\r
+ db %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)\r
+ db %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)\r
+ db %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)\r
+ db %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)\r
+ db %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)\r
+ db %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)\r
+ db %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)\r
+ db %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)\r
+ db %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)\r
+ db %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)\r
+ db %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)\r
+ db %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)\r
+ db %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)\r
+ db %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)\r
+ db %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)\r
+ db %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)\r
+ db %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)\r
+ db %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)\r
+ db %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)\r
+ db %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)\r
+ db %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)\r
+ db %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)\r
+ db %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)\r
+ db %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)\r
+ db %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)\r
+%endmacro\r
+\r
+; dec_vals lists the 256 AES inverse S-box values in index order (eight per\r
+; line) and passes each one through the single-parameter macro named by %1,\r
+; so every value is emitted as whatever db operand list %1 expands to. In\r
+; this file it is invoked with v8 and w8, which both give 8 bytes per value.\r
+%macro dec_vals 1\r
+ db %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)\r
+ db %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)\r
+ db %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)\r
+ db %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)\r
+ db %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)\r
+ db %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)\r
+ db %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)\r
+ db %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)\r
+ db %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)\r
+ db %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)\r
+ db %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)\r
+ db %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)\r
+ db %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)\r
+ db %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)\r
+ db %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)\r
+ db %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)\r
+ db %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)\r
+ db %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)\r
+ db %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)\r
+ db %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)\r
+ db %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)\r
+ db %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)\r
+ db %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)\r
+ db %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)\r
+ db %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)\r
+ db %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)\r
+ db %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)\r
+ db %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)\r
+ db %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)\r
+ db %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)\r
+ db %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)\r
+ db %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)\r
+%endmacro\r
+\r
+; Eight-byte table entry layouts, one entry per S-box value x:\r
+; u8 : {02}x, x, x, {03}x, then the same four bytes again (enc_tab)\r
+; v8 : {0e}x, {09}x, {0d}x, {0b}x, {0e}x, {09}x, {0d}x, then x (dec_tab)\r
+; w8 : x, 0, 0, 0, then the same four bytes again (last round tables)\r
+%define u8(x) f2(x), x, x, f3(x), f2(x), x, x, f3(x)\r
+%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x\r
+%define w8(x) x, 0, 0, 0, x, 0, 0, 0\r
+\r
+%define tptr rbp ; table pointer\r
+%define kptr r8 ; key schedule pointer\r
+%define fofs 128 ; adjust offset in key schedule to keep |disp| < 128\r
+; fk_ref(x,y): 32-bit word y of the round key 16*x bytes below kptr+fofs\r
+%define fk_ref(x,y) [kptr-16*x+fofs+4*y]\r
+; ik_ref(x,y): with AES_REV_DKS the round keys are addressed downwards from\r
+; kptr+rofs exactly as in fk_ref; otherwise upwards from kptr+rofs\r
+%ifdef AES_REV_DKS\r
+%define rofs 128\r
+%define ik_ref(x,y) [kptr-16*x+rofs+4*y]\r
+%else\r
+%define rofs -128\r
+%define ik_ref(x,y) [kptr+16*x+rofs+4*y]\r
+%endif\r
+\r
+; Table references for entry x. A 32-bit read at byte offset 0, 3, 2 or 1 of\r
+; an entry returns the entry's first dword rotated left by 0, 8, 16 or 24\r
+; bits, because bytes 4..6 of every layout repeat bytes 0..2. The widest of\r
+; these reads covers bytes 3..6, so byte 7 is free to hold other data.\r
+%define tab_0(x) [tptr+8*x]\r
+%define tab_1(x) [tptr+8*x+3]\r
+%define tab_2(x) [tptr+8*x+2]\r
+%define tab_3(x) [tptr+8*x+1]\r
+; single byte reads: offset 1 is the plain value x in the u8 layout and\r
+; offset 7 is the plain value x in the v8 layout\r
+%define tab_f(x) byte [tptr+8*x+1]\r
+%define tab_i(x) byte [tptr+8*x+7]\r
+; t_ref(x,r) pastes x onto tab_ to select one of the references above\r
+%define t_ref(x,r) tab_ %+ x(r)\r
+\r
+; ff_rnd %1, %2, %3, %4, %5 : one normal forward round\r
+;\r
+; in: eax, ebx, ecx, edx = current state columns 0..3\r
+; %1..%4 = 64-bit register names; their 32-bit forms (%1d..%4d)\r
+; accumulate the new columns 0..3\r
+; %5 = round key selector handed to fk_ref\r
+; out: eax, ebx, ecx, edx = new state columns (copies of %1d..%4d)\r
+; uses: esi, edi as table indices; tptr and kptr through t_ref and fk_ref\r
+%macro ff_rnd 5 ; normal forward round\r
+; start each new column from its round key word\r
+ mov %1d, fk_ref(%5,0)\r
+ mov %2d, fk_ref(%5,1)\r
+ mov %3d, fk_ref(%5,2)\r
+ mov %4d, fk_ref(%5,3)\r
+\r
+; bytes 0..3 of eax go to new columns 0, 3, 2, 1\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ shr eax, 16\r
+ xor %1d, t_ref(0,rsi)\r
+ xor %4d, t_ref(1,rdi)\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ xor %3d, t_ref(2,rsi)\r
+ xor %2d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ebx go to new columns 1, 0, 3, 2\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ shr ebx, 16\r
+ xor %2d, t_ref(0,rsi)\r
+ xor %1d, t_ref(1,rdi)\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ xor %4d, t_ref(2,rsi)\r
+ xor %3d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ecx go to new columns 2, 1, 0, 3\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ shr ecx, 16\r
+ xor %3d, t_ref(0,rsi)\r
+ xor %2d, t_ref(1,rdi)\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ xor %1d, t_ref(2,rsi)\r
+ xor %4d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of edx go to new columns 3, 2, 1, 0\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ shr edx, 16\r
+ xor %4d, t_ref(0,rsi)\r
+ xor %3d, t_ref(1,rdi)\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ xor %2d, t_ref(2,rsi)\r
+ xor %1d, t_ref(3,rdi)\r
+\r
+; move the new state back into eax..edx ready for the next round macro\r
+ mov eax,%1d\r
+ mov ebx,%2d\r
+ mov ecx,%3d\r
+ mov edx,%4d\r
+%endmacro\r
+\r
+; fl_rnd %1, %2, %3, %4, %5 : last forward round, in two variants\r
+;\r
+; in: eax, ebx, ecx, edx = current state columns 0..3\r
+; %1..%4 = 64-bit register names; their 32-bit forms (%1d..%4d)\r
+; accumulate the output columns 0..3\r
+; %5 = round key selector handed to fk_ref\r
+; out: %1d..%4d = output columns (NOT copied back to eax..edx)\r
+; uses: esi, edi as scratch; eax..edx are shifted right by 16 on the way\r
+;\r
+; The byte-to-column routing is the same as in ff_rnd.\r
+%ifdef LAST_ROUND_TABLES\r
+\r
+; Table variant: same instruction pattern as ff_rnd, but tptr is first moved\r
+; on by 2048 bytes (256 entries of 8 bytes) to the w8 table that follows the\r
+; main table. tptr is left pointing at that second table.\r
+%macro fl_rnd 5 ; last forward round\r
+ add tptr, 2048\r
+ mov %1d, fk_ref(%5,0)\r
+ mov %2d, fk_ref(%5,1)\r
+ mov %3d, fk_ref(%5,2)\r
+ mov %4d, fk_ref(%5,3)\r
+\r
+; bytes 0..3 of eax go to output columns 0, 3, 2, 1\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ shr eax, 16\r
+ xor %1d, t_ref(0,rsi)\r
+ xor %4d, t_ref(1,rdi)\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ xor %3d, t_ref(2,rsi)\r
+ xor %2d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ebx go to output columns 1, 0, 3, 2\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ shr ebx, 16\r
+ xor %2d, t_ref(0,rsi)\r
+ xor %1d, t_ref(1,rdi)\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ xor %4d, t_ref(2,rsi)\r
+ xor %3d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ecx go to output columns 2, 1, 0, 3\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ shr ecx, 16\r
+ xor %3d, t_ref(0,rsi)\r
+ xor %2d, t_ref(1,rdi)\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ xor %1d, t_ref(2,rsi)\r
+ xor %4d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of edx go to output columns 3, 2, 1, 0\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ shr edx, 16\r
+ xor %4d, t_ref(0,rsi)\r
+ xor %3d, t_ref(1,rdi)\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ xor %2d, t_ref(2,rsi)\r
+ xor %1d, t_ref(3,rdi)\r
+%endmacro\r
+\r
+%else\r
+\r
+; Single table variant: each substituted byte is read on its own with tab_f\r
+; (zero extended), rotated left by 0, 8, 16 or 24 bits into its byte lane and\r
+; XORed into the output column. tptr is not changed.\r
+%macro fl_rnd 5 ; last forward round\r
+ mov %1d, fk_ref(%5,0)\r
+ mov %2d, fk_ref(%5,1)\r
+ mov %3d, fk_ref(%5,2)\r
+ mov %4d, fk_ref(%5,3)\r
+\r
+; bytes 0..3 of eax go to output columns 0, 3, 2, 1\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ shr eax, 16\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ xor %1d, esi\r
+ rol edi, 8\r
+ xor %4d, edi\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %3d, esi\r
+ xor %2d, edi\r
+\r
+; bytes 0..3 of ebx go to output columns 1, 0, 3, 2\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ shr ebx, 16\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ xor %2d, esi\r
+ rol edi, 8\r
+ xor %1d, edi\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %4d, esi\r
+ xor %3d, edi\r
+\r
+; bytes 0..3 of ecx go to output columns 2, 1, 0, 3\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ shr ecx, 16\r
+ xor %3d, esi\r
+ rol edi, 8\r
+ xor %2d, edi\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %1d, esi\r
+ xor %4d, edi\r
+\r
+; bytes 0..3 of edx go to output columns 3, 2, 1, 0\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ shr edx, 16\r
+ xor %4d, esi\r
+ rol edi, 8\r
+ xor %3d, edi\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ movzx esi, t_ref(f,rsi)\r
+ movzx edi, t_ref(f,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %2d, esi\r
+ xor %1d, edi\r
+%endmacro\r
+\r
+%endif\r
+\r
+; ii_rnd %1, %2, %3, %4, %5 : one normal inverse round\r
+;\r
+; in: eax, ebx, ecx, edx = current state columns 0..3\r
+; %1..%4 = 64-bit register names; their 32-bit forms (%1d..%4d)\r
+; accumulate the new columns 0..3\r
+; %5 = round key selector handed to ik_ref\r
+; out: eax, ebx, ecx, edx = new state columns (copies of %1d..%4d)\r
+; uses: esi, edi as table indices; tptr and kptr through t_ref and ik_ref\r
+%macro ii_rnd 5 ; normal inverse round\r
+; start each new column from its round key word\r
+ mov %1d, ik_ref(%5,0)\r
+ mov %2d, ik_ref(%5,1)\r
+ mov %3d, ik_ref(%5,2)\r
+ mov %4d, ik_ref(%5,3)\r
+\r
+; bytes 0..3 of eax go to new columns 0, 1, 2, 3\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ shr eax, 16\r
+ xor %1d, t_ref(0,rsi)\r
+ xor %2d, t_ref(1,rdi)\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ xor %3d, t_ref(2,rsi)\r
+ xor %4d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ebx go to new columns 1, 2, 3, 0\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ shr ebx, 16\r
+ xor %2d, t_ref(0,rsi)\r
+ xor %3d, t_ref(1,rdi)\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ xor %4d, t_ref(2,rsi)\r
+ xor %1d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ecx go to new columns 2, 3, 0, 1\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ shr ecx, 16\r
+ xor %3d, t_ref(0,rsi)\r
+ xor %4d, t_ref(1,rdi)\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ xor %1d, t_ref(2,rsi)\r
+ xor %2d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of edx go to new columns 3, 0, 1, 2\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ shr edx, 16\r
+ xor %4d, t_ref(0,rsi)\r
+ xor %1d, t_ref(1,rdi)\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ xor %2d, t_ref(2,rsi)\r
+ xor %3d, t_ref(3,rdi)\r
+\r
+; move the new state back into eax..edx ready for the next round macro\r
+ mov eax,%1d\r
+ mov ebx,%2d\r
+ mov ecx,%3d\r
+ mov edx,%4d\r
+%endmacro\r
+\r
+; il_rnd %1, %2, %3, %4, %5 : last inverse round, in two variants\r
+;\r
+; in: eax, ebx, ecx, edx = current state columns 0..3\r
+; %1..%4 = 64-bit register names; their 32-bit forms (%1d..%4d)\r
+; accumulate the output columns 0..3\r
+; %5 = round key selector handed to ik_ref\r
+; out: %1d..%4d = output columns (NOT copied back to eax..edx)\r
+; uses: esi, edi as scratch; eax..edx are shifted right by 16 on the way\r
+;\r
+; The byte-to-column routing is the same as in ii_rnd.\r
+%ifdef LAST_ROUND_TABLES\r
+\r
+; Table variant: same instruction pattern as ii_rnd, but tptr is first moved\r
+; on by 2048 bytes (256 entries of 8 bytes) to the w8 table that follows the\r
+; main table. tptr is left pointing at that second table.\r
+%macro il_rnd 5 ; last inverse round\r
+ add tptr, 2048\r
+ mov %1d, ik_ref(%5,0)\r
+ mov %2d, ik_ref(%5,1)\r
+ mov %3d, ik_ref(%5,2)\r
+ mov %4d, ik_ref(%5,3)\r
+\r
+; bytes 0..3 of eax go to output columns 0, 1, 2, 3\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ shr eax, 16\r
+ xor %1d, t_ref(0,rsi)\r
+ xor %2d, t_ref(1,rdi)\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ xor %3d, t_ref(2,rsi)\r
+ xor %4d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ebx go to output columns 1, 2, 3, 0\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ shr ebx, 16\r
+ xor %2d, t_ref(0,rsi)\r
+ xor %3d, t_ref(1,rdi)\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ xor %4d, t_ref(2,rsi)\r
+ xor %1d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of ecx go to output columns 2, 3, 0, 1\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ shr ecx, 16\r
+ xor %3d, t_ref(0,rsi)\r
+ xor %4d, t_ref(1,rdi)\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ xor %1d, t_ref(2,rsi)\r
+ xor %2d, t_ref(3,rdi)\r
+\r
+; bytes 0..3 of edx go to output columns 3, 0, 1, 2\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ shr edx, 16\r
+ xor %4d, t_ref(0,rsi)\r
+ xor %1d, t_ref(1,rdi)\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ xor %2d, t_ref(2,rsi)\r
+ xor %3d, t_ref(3,rdi)\r
+%endmacro\r
+\r
+%else\r
+\r
+; Single table variant: each substituted byte is read on its own with tab_i\r
+; (zero extended), rotated left by 0, 8, 16 or 24 bits into its byte lane and\r
+; XORed into the output column. tptr is not changed.\r
+%macro il_rnd 5 ; last inverse round\r
+ mov %1d, ik_ref(%5,0)\r
+ mov %2d, ik_ref(%5,1)\r
+ mov %3d, ik_ref(%5,2)\r
+ mov %4d, ik_ref(%5,3)\r
+\r
+; bytes 0..3 of eax go to output columns 0, 1, 2, 3\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ shr eax, 16\r
+ xor %1d, esi\r
+ rol edi, 8\r
+ xor %2d, edi\r
+ movzx esi, al\r
+ movzx edi, ah\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %3d, esi\r
+ xor %4d, edi\r
+\r
+; bytes 0..3 of ebx go to output columns 1, 2, 3, 0\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ shr ebx, 16\r
+ xor %2d, esi\r
+ rol edi, 8\r
+ xor %3d, edi\r
+ movzx esi, bl\r
+ movzx edi, bh\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %4d, esi\r
+ xor %1d, edi\r
+\r
+; bytes 0..3 of ecx go to output columns 2, 3, 0, 1\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ shr ecx, 16\r
+ xor %3d, esi\r
+ rol edi, 8\r
+ xor %4d, edi\r
+ movzx esi, cl\r
+ movzx edi, ch\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %1d, esi\r
+ xor %2d, edi\r
+\r
+; bytes 0..3 of edx go to output columns 3, 0, 1, 2\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ shr edx, 16\r
+ xor %4d, esi\r
+ rol edi, 8\r
+ xor %1d, edi\r
+ movzx esi, dl\r
+ movzx edi, dh\r
+ movzx esi, t_ref(i,rsi)\r
+ movzx edi, t_ref(i,rdi)\r
+ rol esi, 16\r
+ rol edi, 24\r
+ xor %2d, esi\r
+ xor %3d, edi\r
+%endmacro\r
+\r
+%endif\r
+\r
+%ifdef ENCRYPTION\r
+\r
+ global aes_encrypt\r
+%ifdef DLL_EXPORT\r
+ export aes_encrypt\r
+%endif\r
+\r
+; Forward tables: 256 eight-byte u8 entries, followed (when\r
+; LAST_ROUND_TABLES is defined) by 256 eight-byte w8 entries.\r
+ section .data align=64\r
+ align 64\r
+enc_tab:\r
+ enc_vals u8\r
+%ifdef LAST_ROUND_TABLES\r
+ enc_vals w8\r
+%endif\r
+\r
+; AES_RETURN aes_encrypt(in_blk, out_blk, cx)\r
+;\r
+; Encrypts the 16 bytes at in_blk into out_blk using the key schedule in cx.\r
+; Arguments arrive in rdi, rsi, rdx when __GNUC__ is defined and in\r
+; rcx, rdx, r8 otherwise. Returns rax = 0 on success, or rax = -1 (out_blk\r
+; untouched) when the length byte found at offset 4*KS_LENGTH in cx is not\r
+; 10*16, 12*16 or 14*16. rbx, rbp and r12 are saved and restored in both\r
+; builds, rsi and rdi only in the non-__GNUC__ build.\r
+ section .text align=16\r
+ align 16\r
+aes_encrypt:\r
+\r
+%ifdef __GNUC__\r
+ sub rsp, 4*8 ; frame slots: out_blk, rbx, rbp, r12\r
+ mov r8, rdx ; kptr = context\r
+ mov [rsp+0*8], rsi ; park out_blk, rsi becomes scratch\r
+%else\r
+ sub rsp, 6*8 ; frame slots: out_blk, rbx, rbp, r12, rsi, rdi\r
+ mov [rsp+5*8], rdi\r
+ mov [rsp+4*8], rsi\r
+ mov rdi, rcx ; rdi = in_blk, as in the other build\r
+ mov [rsp+0*8], rdx ; park out_blk\r
+%endif\r
+ mov [rsp+3*8], r12\r
+ mov [rsp+2*8], rbp\r
+ mov [rsp+1*8], rbx\r
+\r
+ lea tptr,[enc_tab wrt rip]\r
+ movzx esi, byte [kptr+4*KS_LENGTH] ; length byte, compared with 16*rounds below\r
+\r
+; state = input block XOR the first four key schedule words\r
+ mov eax, [rdi+0*4]\r
+ mov ebx, [rdi+1*4]\r
+ mov ecx, [rdi+2*4]\r
+ mov edx, [rdi+3*4]\r
+ xor eax, [kptr+0*4]\r
+ xor ebx, [kptr+1*4]\r
+ xor ecx, [kptr+2*4]\r
+ xor edx, [kptr+3*4]\r
+\r
+; bias kptr so that fk_ref(0,*) addresses the last round key\r
+ lea kptr,[kptr+rsi-fofs]\r
+\r
+; enter the unrolled rounds at the point matching the round count\r
+ cmp esi, 10*16\r
+ je .rounds10\r
+ cmp esi, 12*16\r
+ je .rounds12\r
+ cmp esi, 14*16\r
+ je .rounds14\r
+ mov rax, -1 ; unsupported length byte\r
+ jmp .done\r
+\r
+.rounds14:\r
+ ff_rnd r9, r10, r11, r12, 13\r
+ ff_rnd r9, r10, r11, r12, 12\r
+.rounds12:\r
+ ff_rnd r9, r10, r11, r12, 11\r
+ ff_rnd r9, r10, r11, r12, 10\r
+.rounds10:\r
+ ff_rnd r9, r10, r11, r12, 9\r
+ ff_rnd r9, r10, r11, r12, 8\r
+ ff_rnd r9, r10, r11, r12, 7\r
+ ff_rnd r9, r10, r11, r12, 6\r
+ ff_rnd r9, r10, r11, r12, 5\r
+ ff_rnd r9, r10, r11, r12, 4\r
+ ff_rnd r9, r10, r11, r12, 3\r
+ ff_rnd r9, r10, r11, r12, 2\r
+ ff_rnd r9, r10, r11, r12, 1\r
+ fl_rnd r9, r10, r11, r12, 0\r
+\r
+; fl_rnd leaves the result in r9d..r12d\r
+ mov rsi, [rsp+0*8] ; out_blk\r
+ mov [rsi+0*4], r9d\r
+ mov [rsi+1*4], r10d\r
+ mov [rsi+2*4], r11d\r
+ mov [rsi+3*4], r12d\r
+ xor eax, eax ; rax = 0 : success\r
+\r
+.done:\r
+ mov r12, [rsp+3*8]\r
+ mov rbp, [rsp+2*8]\r
+ mov rbx, [rsp+1*8]\r
+%ifdef __GNUC__\r
+ add rsp, 4*8\r
+%else\r
+ mov rdi, [rsp+5*8]\r
+ mov rsi, [rsp+4*8]\r
+ add rsp, 6*8\r
+%endif\r
+ ret\r
+\r
+%endif\r
+\r
+%ifdef DECRYPTION\r
+\r
+ global aes_decrypt\r
+%ifdef DLL_EXPORT\r
+ export aes_decrypt\r
+%endif\r
+\r
+; Inverse tables: 256 eight-byte v8 entries, followed (when\r
+; LAST_ROUND_TABLES is defined) by 256 eight-byte w8 entries.\r
+ section .data align=64\r
+ align 64\r
+dec_tab:\r
+ dec_vals v8\r
+%ifdef LAST_ROUND_TABLES\r
+ dec_vals w8\r
+%endif\r
+\r
+; AES_RETURN aes_decrypt(in_blk, out_blk, cx)\r
+;\r
+; Decrypts the 16 bytes at in_blk into out_blk using the key schedule in cx.\r
+; Arguments arrive in rdi, rsi, rdx when __GNUC__ is defined and in\r
+; rcx, rdx, r8 otherwise. Returns rax = 0 on success, or rax = -1 (out_blk\r
+; untouched) when the length byte found at offset 4*KS_LENGTH in cx is not\r
+; 10*16, 12*16 or 14*16. rbx, rbp and r12 are saved and restored in both\r
+; builds, rsi and rdi only in the non-__GNUC__ build.\r
+ section .text align=16\r
+ align 16\r
+aes_decrypt:\r
+\r
+%ifdef __GNUC__\r
+ sub rsp, 4*8 ; frame slots: out_blk, rbx, rbp, r12\r
+ mov r8, rdx ; kptr = context\r
+ mov [rsp+0*8], rsi ; park out_blk, rsi becomes scratch\r
+%else\r
+ sub rsp, 6*8 ; frame slots: out_blk, rbx, rbp, r12, rsi, rdi\r
+ mov [rsp+5*8], rdi\r
+ mov [rsp+4*8], rsi\r
+ mov rdi, rcx ; rdi = in_blk, as in the other build\r
+ mov [rsp+0*8], rdx ; park out_blk\r
+%endif\r
+ mov [rsp+3*8], r12\r
+ mov [rsp+2*8], rbp\r
+ mov [rsp+1*8], rbx\r
+\r
+ lea tptr,[dec_tab wrt rip]\r
+ movzx esi,byte[kptr+4*KS_LENGTH] ; length byte, compared with 16*rounds below\r
+ sub kptr, rofs ; bias kptr for ik_ref\r
+\r
+; fetch the input block before rdi is reused as a key pointer\r
+ mov eax, [rdi+0*4]\r
+ mov ebx, [rdi+1*4]\r
+ mov ecx, [rdi+2*4]\r
+ mov edx, [rdi+3*4]\r
+\r
+; rdi+rofs = the four key words XORed in first: the start of the schedule\r
+; when AES_REV_DKS is defined, otherwise the words rsi bytes into it\r
+%ifdef AES_REV_DKS\r
+ mov rdi, kptr\r
+ lea kptr,[kptr+rsi]\r
+%else\r
+ lea rdi,[kptr+rsi]\r
+%endif\r
+\r
+ xor eax, [rdi+rofs+0*4]\r
+ xor ebx, [rdi+rofs+1*4]\r
+ xor ecx, [rdi+rofs+2*4]\r
+ xor edx, [rdi+rofs+3*4]\r
+\r
+; enter the unrolled rounds at the point matching the round count\r
+ cmp esi, 10*16\r
+ je .rounds10\r
+ cmp esi, 12*16\r
+ je .rounds12\r
+ cmp esi, 14*16\r
+ je .rounds14\r
+ mov rax, -1 ; unsupported length byte\r
+ jmp .done\r
+\r
+.rounds14:\r
+ ii_rnd r9, r10, r11, r12, 13\r
+ ii_rnd r9, r10, r11, r12, 12\r
+.rounds12:\r
+ ii_rnd r9, r10, r11, r12, 11\r
+ ii_rnd r9, r10, r11, r12, 10\r
+.rounds10:\r
+ ii_rnd r9, r10, r11, r12, 9\r
+ ii_rnd r9, r10, r11, r12, 8\r
+ ii_rnd r9, r10, r11, r12, 7\r
+ ii_rnd r9, r10, r11, r12, 6\r
+ ii_rnd r9, r10, r11, r12, 5\r
+ ii_rnd r9, r10, r11, r12, 4\r
+ ii_rnd r9, r10, r11, r12, 3\r
+ ii_rnd r9, r10, r11, r12, 2\r
+ ii_rnd r9, r10, r11, r12, 1\r
+ il_rnd r9, r10, r11, r12, 0\r
+\r
+; il_rnd leaves the result in r9d..r12d\r
+ mov rbx, [rsp+0*8] ; out_blk (rbx is reloaded below)\r
+ mov [rbx+0*4], r9d\r
+ mov [rbx+1*4], r10d\r
+ mov [rbx+2*4], r11d\r
+ mov [rbx+3*4], r12d\r
+ xor eax, eax ; rax = 0 : success\r
+\r
+.done:\r
+ mov r12, [rsp+3*8]\r
+ mov rbp, [rsp+2*8]\r
+ mov rbx, [rsp+1*8]\r
+%ifdef __GNUC__\r
+ add rsp, 4*8\r
+%else\r
+ mov rdi, [rsp+5*8]\r
+ mov rsi, [rsp+4*8]\r
+ add rsp, 6*8\r
+%endif\r
+ ret\r
+\r
+%endif\r
+\r
+ end\r