/*

  sha256_x86.S

  Author: Pekka Riikonen <priikone@silcnet.org>

  Copyright (C) 2007 Pekka Riikonen

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; version 2 of the License.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

*/

/* SHA-256 x86 assembler implementation.  This implements only the SHA-256
   transform function and other parts are implemented in sha256.c.  The
   function preserves ebp, ebx, edi and esi but does not preserve other
   registers (eax, ecx and edx are clobbered).

   This implementation uses only 32-bit registers.  It does not use MMX or
   SSE registers which could be used to enhance the performance, especially
   when loading the W.  This is about as fast as we can get with fewer than
   8 32-bit registers on a 32-bit CPU.

   Benchmarks (megabytes (MB) per second), bigger is better:

   Code           P4 3.60 GHz      PM 1.60 GHz     Xeon 5160 3.00 GHz
   ----------------------------------------------------------------------
   SHA-256, asm   110.57 MB/sec    58.50 MB/sec    146.43 MB/sec
   SHA-256, gcc    49.07 MB/sec    39.55 MB/sec     82.14 MB/sec
   SHA-256, icc   109.97 MB/sec    55.69 MB/sec     N/A

   Notes:
   - Test program was lib/silccrypt/tests/test_hash
   - nice -n -20 was used with test_hash running as root
   - P4 is Pentium 4, PM is Pentium M, Xeon 5160 is 64-bit CPU but the OS
     had 32-bit kernel in the test.
   - ICC generates significantly better code compared to GCC for SSE2
     capable CPUs, and the generated code uses SSE registers.  Hence the
     comparable speed with the assembler code.  Note that the GCC code
     was also compiled with -msse2.  Note that this assembler code
     specifically does not use SSE or MMX, for better compatibility.

*/

#include "silcdefs.h"

#ifdef SILC_SHA256_X86

/* Stack frame layout, relative to %esp once the prologue of
   sha256_transform has run (four register pushes, then the subl):

     0 ... STACK_W - 1			message schedule W[0..63]
     STACK_W ... STACK_SIZE - 1		working copy of the state a..h
     STACK_SIZE + 0 ... + 15		the four saved registers
     STACK_SIZE + 16			return address
     STACK_SIZE + 20			first argument  (state pointer)
     STACK_SIZE + 24			second argument (input buffer pointer)

   STACK_SIZE is parenthesized so that it stays a single term wherever it
   is expanded.  ARG_STATE and ARG_BUF expand to complete memory operands
   and are only valid while %esp is at its post-prologue value. */
#define STACK_STATE	(8 * 4)
#define STACK_W		(64 * 4)
#define STACK_SIZE	(STACK_STATE + STACK_W)
#define ARG_STATE	STACK_SIZE + 20(%esp)
#define ARG_BUF		STACK_SIZE + 24(%esp)

/* Byte offsets of the eight 32-bit working variables a..h inside a state
   array.  The same offsets are used for the caller's state and for the
   working copy kept at STACK_W(%esp). */
#define A		(0 * 4)
#define B		(1 * 4)
#define C		(2 * 4)
#define D		(3 * 4)
#define E		(4 * 4)
#define F		(5 * 4)
#define G		(6 * 4)
#define H		(7 * 4)

/* Register allocation.  r0 and r1 carry the working variables a and e from
   one round to the next; r2..r5 are scratch.  r5 additionally holds the
   input buffer pointer while W[0..15] is being loaded. */
#define r0		%eax
#define r1		%ebx
#define r2		%ecx
#define r3		%edx
#define r4		%edi
#define r5		%esi

/* One round of SHA-256.

   Parameters: a and e name the registers holding the current a and e
   (every call site in this file passes r0 and r1); b, c, d, f, g and h are
   state offsets that are addressed through %ebp; W is the index of the
   message schedule word, addressed through %esp; ki is the round constant.

   The outputs are hard-coded regardless of the a/e arguments: r1 receives
   d + t0 (the next round's e) and r0 receives t0 + Sigma0(a) + Maj(a, b, c)
   (the next round's a).  r2, r3, r4 and r5 are clobbered.  Nothing is
   written back to memory here; see ROUND for that.

   In the per-line comments ">>=" on a rorl means rotate right, and the
   number in parentheses is the accumulated rotation. */
#define RND(a, b, c, d, e, f, g, h, W, ki)				\
	movl	f(%ebp), r2;		/* load f */			\
	movl	g(%ebp), r3;		/* load g */			\
									\
  	movl    e,   r4;		/* e to Sigma1 */		\
  	rorl    $6,  r4;		/* Sigma1 >>= 6 */		\
  	movl    r4,  r5;		/* Sigma1 to temp */		\
  	rorl    $5,  r4;		/* Sigma1 >>= 5 (11) */		\
  	xorl    r4,  r5;		/* temp ^= Sigma1 */		\
  	rorl    $14, r4;		/* Sigma1 >>= 14 (25) */	\
  	xorl    r5,  r4;		/* Sigma1 ^= temp */  		\
									\
 	movl    r3,  r5;		/* g to Ch */			\
  	xorl    r2,  r5;		/* Ch ^= f */			\
  	andl    e,   r5;		/* Ch &= e */			\
  	xorl    r3,  r5;		/* Ch ^= g */			\
									\
  	leal    ki(r4, r5), r4;		/* t0 = Sigma1 + Ch + ki */	\
	movl    h(%ebp), r3;		/* load h */			\
  	movl    d(%ebp), r1;		/* load d, old e is dead */	\
  	addl    W * 4(%esp), r4;	/* t0 += W[i] */		\
  	addl    r4, r3;			/* h += t0 */			\
  	addl    r3, r1;			/* d += h (t0) */		\
									\
        movl    a,   r4;		/* a to Sigma0 */		\
  	rorl    $2,  r4;		/* Sigma0 >>= 2 */		\
  	movl    r4,  r5;		/* Sigma0 to temp */		\
  	rorl    $11, r4;		/* Sigma0 >>= 11 (13) */	\
  	xorl    r4,  r5;		/* temp ^= Sigma0 */		\
  	rorl    $9,  r4;		/* Sigma0 >>= 9 (22) */		\
  	xorl    r5,  r4;		/* Sigma0 ^= temp */		\
									\
        addl    r3, r4;			/* t1 = Sigma0 + h (t0) */	\
 	movl    b(%ebp), r2;		/* load b */			\
  	movl    c(%ebp), r3;		/* load c */			\
									\
	movl    r2,  r5;		/* b to temp */			\
  	orl     a,   r5;		/* temp |= a */			\
  	andl    r3,  r5;		/* temp &= c */			\
  	andl    r2,  a;			/* a &= b */			\
  	orl     r5,  a;			/* a |= temp, a is now Maj */	\
  	addl    r4,  r0;		/* new a = Maj + t1, in r0 */

/* One round followed by write-back to the state addressed through %ebp.
   The new e (r1) overwrites the slot passed as d and the new a (r0)
   overwrites the slot passed as h.  The call sites then rotate the offset
   arguments by one position for the following round instead of moving the
   other six variables. */
#define ROUND(a, b, c, d, e, f, g, h, W, ki)				\
	RND(a, b, c, d, e, f, g, h, W, ki)				\
	movl    r1, d(%ebp);		/* Update d in stack */		\
	movl    r0, h(%ebp);		/* Update h in stack */

/* Get 64 bits (two 32-bit words) from the input buffer in MSB first order
   and store them as W[i] and W[i + 1] at (%esp).  Expects r5 to point to
   the input buffer.  Clobbers r3 and r4; on exit r4 holds W[i] and r3
   holds W[i + 1], which EXP_BUF0 relies on after the last GET_BUF. */
#define GET_BUF(i)							\
	movl    i * 4(r5), r4;		/* raw word i */		\
	movl    (i + 1) * 4(r5), r3;	/* raw word i + 1 */		\
	bswapl  r4;			/* byte swap to MSB first */	\
	bswapl  r3;							\
	movl    r4, i * 4(%esp);	/* W[i] */			\
	movl    r3, (i + 1) * 4(%esp);	/* W[i + 1] */

/* Expand the input: compute one message schedule word

     W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16]

   and store it at i * 4(%esp).  On entry both r2 and r4 must hold
   W[i - 2]; EXP_BUF0 and EXP_BUFX set that up.  Clobbers r2, r3, r4 and
   r5; r3 holds the new W[i] on exit.  In the per-line comments ">>=" on a
   rorl means rotate right (accumulated rotation in parentheses), while
   the shrl lines are plain logical shifts. */
#define EXP_BUF(i)							\
	rorl    $17, r4;		/* Gamma1 >>= 17 */		\
	movl    r4,  r5;		/* Gamma1 to temp */		\
	rorl    $2,  r4;		/* Gamma1 >>= 2 (19) */		\
	xorl    r4,  r5;		/* temp ^= Gamma1 */		\
	shrl	$10, r2;		/* w-2 >> 10 */			\
	xorl    r5,  r2;		/* Gamma1 = w-2 ^ temp */	\
									\
  	movl    (i - 15) * 4(%esp), r3;	/* load w-15 */			\
	movl    r3,  r4;		/* w-15 to Gamma0 */		\
	rorl    $7,  r4;		/* Gamma0 >>= 7 */		\
	movl    r4,  r5;		/* Gamma0 to temp */		\
	rorl    $11, r4;		/* Gamma0 >>= 11 (18) */	\
	xorl    r4,  r5;		/* temp ^= Gamma0 */		\
	shrl	$3,  r3;		/* w-15 >> 3 */			\
	xorl    r5,  r3;		/* Gamma0 = w-15 ^ temp */	\
									\
	addl    (i - 7) * 4(%esp), r2;	/* Gamma1 += w-7 */		\
	addl	(i - 16) * 4(%esp), r2;	/* Gamma1 += w-16 */		\
	addl    r2,  r3;		/* Gamma0 += Gamma1 */		\
	movl    r3, i * 4(%esp);	/* store W[i] */

/* First expansion step.  It assumes r4 already holds W[i - 2], which is
   the case for i = 16 directly after GET_BUF(14), so only the copy into
   r2 is needed before EXP_BUF. */
#define EXP_BUF0(i)							\
	movl    r4, r2;			/* w-2 copy for the shift */	\
	EXP_BUF(i)

/* General expansion step: reload W[i - 2] from the stack into both r2 and
   r4, then run EXP_BUF. */
#define EXP_BUFX(i)							\
	movl    (i - 2) * 4(%esp), r2;	/* load w-2 */			\
	movl    r2,  r4;		/* w-2 to Gamma1 */		\
  	EXP_BUF(i)


/*
 * sha256_transform -- SHA-256 compression of one input block.
 *
 * Arguments are read from the stack (see ARG_STATE and ARG_BUF):
 *   arg 1  pointer to the eight 32-bit state words a..h; the result of
 *          the 64 rounds is added to them in place
 *   arg 2  pointer to the input block; 16 32-bit words are read from it
 *          and byte swapped to MSB first order
 *
 * No return value is set up; eax holds a scratch value on exit.
 * Preserves ebx, ebp, edi and esi.  Clobbers eax, ecx, edx and the flags.
 *
 * Stack use: STACK_SIZE bytes of locals (W[0..63] at 0(%esp) followed by
 * a working copy of the state at STACK_W(%esp)) plus four saved
 * registers.  The locals are not cleared before returning.
 */
.text
.align 4
.globl sha256_transform
#ifdef __ELF__
.type sha256_transform, @function
#endif
sha256_transform:
	pushl	%ebp
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	subl	$STACK_SIZE, %esp

	/* State to stack.  r0 keeps a and, after the second group of
	   loads, r1 keeps e; both stay live until the first round. */
	movl	ARG_STATE, %ebp
	movl	A(%ebp), r0
	movl	B(%ebp), r1
	movl	C(%ebp), r2
	movl	D(%ebp), r3
	movl	r0, A + STACK_W(%esp)
	movl	r1, B + STACK_W(%esp)
	movl	r2, C + STACK_W(%esp)
	movl	r3, D + STACK_W(%esp)
	movl	E(%ebp), r1
	movl	F(%ebp), r2
	movl	G(%ebp), r3
	movl	H(%ebp), r4
	movl	r1, E + STACK_W(%esp)
	movl	r2, F + STACK_W(%esp)
	movl	r3, G + STACK_W(%esp)
	movl	r4, H + STACK_W(%esp)

	/* Get buf in MSB first order, W[0..15] */
	movl	ARG_BUF, r5
	GET_BUF(0) GET_BUF(2) GET_BUF(4) GET_BUF(6)
	GET_BUF(8) GET_BUF(10) GET_BUF(12) GET_BUF(14)

	/* Expand input, fill in W[16..63].  EXP_BUF0 uses the W[14] that
	   GET_BUF(14) left in r4.  r0 and r1 are not touched here. */
	EXP_BUF0(16) EXP_BUFX(17) EXP_BUFX(18) EXP_BUFX(19) EXP_BUFX(20)
	EXP_BUFX(21) EXP_BUFX(22) EXP_BUFX(23) EXP_BUFX(24) EXP_BUFX(25)
	EXP_BUFX(26) EXP_BUFX(27) EXP_BUFX(28) EXP_BUFX(29) EXP_BUFX(30)
	EXP_BUFX(31) EXP_BUFX(32) EXP_BUFX(33) EXP_BUFX(34) EXP_BUFX(35)
	EXP_BUFX(36) EXP_BUFX(37) EXP_BUFX(38) EXP_BUFX(39) EXP_BUFX(40)
	EXP_BUFX(41) EXP_BUFX(42) EXP_BUFX(43) EXP_BUFX(44) EXP_BUFX(45)
	EXP_BUFX(46) EXP_BUFX(47) EXP_BUFX(48) EXP_BUFX(49) EXP_BUFX(50)
	EXP_BUFX(51) EXP_BUFX(52) EXP_BUFX(53) EXP_BUFX(54) EXP_BUFX(55)
	EXP_BUFX(56) EXP_BUFX(57) EXP_BUFX(58) EXP_BUFX(59) EXP_BUFX(60)
	EXP_BUFX(61) EXP_BUFX(62) EXP_BUFX(63)

	/* Hash, r0 and r1 set above, ebp is base address to state.  The
	   offset arguments rotate by one position per round and return to
	   the starting arrangement every eight rounds. */
	leal	STACK_W(%esp), %ebp

	ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98);
	ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491);
	ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf);
	ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5);
	ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b);
	ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1);
	ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4);
	ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5);

	ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98);
	ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01);
	ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be);
	ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3);
	ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74);
	ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe);
	ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7);
	ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174);

	ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1);
	ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786);
	ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6);
	ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc);
	ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f);
	ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa);
	ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc);
	ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da);

	ROUND(r0, B, C, D, r1, F, G, H, 24, 0x983e5152);
	ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d);
	ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8);
	ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7);
	ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3);
	ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147);
	ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351);
	ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967);

	ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85);
	ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138);
	ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc);
	ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13);
	ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354);
	ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb);
	ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e);
	ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85);

	ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1);
	ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b);
	ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70);
	ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3);
	ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819);
	ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624);
	ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585);
	ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070);

	ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116);
	ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08);
	ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c);
	ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5);
	ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3);
	ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a);
	ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f);
	ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3);

	ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee);
	ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f);
	ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814);
	ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208);
	ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa);
	ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb);
	ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7);
	/* Last round: no write-back, the new a and e are consumed straight
	   from r0 and r1 below. */
	RND  (r0, C, D, E, r1, G, H, A, 63, 0xc67178f2);

	/* Update state from stack.  After 64 rounds the final b, c, d, f,
	   g and h sit in the stack slots of the same name; only a and e
	   are still in registers. */
	movl	ARG_STATE, %ebp
	addl	r0, A(%ebp)		/* a from last round */
	addl	r1, E(%ebp)		/* e from last round */
	movl	B + STACK_W(%esp), r0
	movl	C + STACK_W(%esp), r1
	movl	D + STACK_W(%esp), r2
	movl	F + STACK_W(%esp), r3
	movl	G + STACK_W(%esp), r4
	movl	H + STACK_W(%esp), r5
	addl	r0, B(%ebp)
	addl	r1, C(%ebp)
	addl	r2, D(%ebp)
	addl	r3, F(%ebp)
	addl	r4, G(%ebp)
	addl	r5, H(%ebp)

	addl	$STACK_SIZE, %esp
	popl	%esi
	popl	%edi
	popl	%ebx
	popl	%ebp

	ret
#ifdef __ELF__
.size sha256_transform, . - sha256_transform
#endif

#if defined(__linux__) && defined(__ELF__)
/* Declare that this object does not need an executable stack.  Without
   this note GNU ld may mark the stack of the whole program executable
   or warn about the missing section. */
.pushsection .note.GNU-stack,"",@progbits
.popsection
#endif

#endif /* SILC_SHA256_X86 */