From f92dde990eb20f0415b1b666a0809004049cbfd8 Mon Sep 17 00:00:00 2001
From: magnum <magnum>
Date: Mon, 25 Jul 2011 15:09:34 +0200
Subject: [PATCH 10/11] NT_fmt now uses the optimised x86-64 crypt unless in UTF-8 mode

---
 src/NT_fmt_plug.c |    8 ++++
 src/x86-64.S      |   93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 0 deletions(-)

diff --git a/src/NT_fmt_plug.c b/src/NT_fmt_plug.c
index aa64a70..1f59d66 100644
--- a/src/NT_fmt_plug.c
+++ b/src/NT_fmt_plug.c
@@ -111,6 +111,7 @@ static struct fmt_tests tests[] = {
 	#define ALGORITHM_NAME		"128/128 X2 SSE2-16"
 	#define NT_CRYPT_FUN		nt_crypt_all_x86_64
 	extern void nt_crypt_all_x86_64(int count);
+	extern void nt_crypt_all_8859_1_x86_64(int count);
 #elif defined (NT_SSE2)
 	#define NT_NUM_KEYS	40
 	#define NT_NUM_KEYS1	8
@@ -233,6 +234,9 @@ static void fmt_NT_init(struct fmt_main *pFmt)
 	memset(nt_buffer1x,0,16*4*NT_NUM_KEYS);
 #endif
 	if (options.utf8) {
+#if defined (NT_X86_64)
+		fmt_NT.methods.crypt_all = nt_crypt_all_x86_64;
+#endif
 		/* This avoids an if clause for every set_key */
 		fmt_NT.methods.set_key = set_key_utf8;
 		/* kick it up from 27. We will 'adjust' in the setkey_utf8 function.  */
@@ -244,6 +248,10 @@ static void fmt_NT_init(struct fmt_main *pFmt)
 		tests[4].plaintext = "\xE2\x82\xAC\xE2\x82\xAC";
 		tests[4].ciphertext = "$NT$682467b963bb4e61943e170a04f7db46";
 	}
+#if defined (NT_X86_64)
+	else
+		fmt_NT.methods.crypt_all = nt_crypt_all_8859_1_x86_64;
+#endif
 }
 
 static char * nt_split(char *ciphertext, int index)
diff --git a/src/x86-64.S b/src/x86-64.S
index 29a0a0f..a6abffa 100644
--- a/src/x86-64.S
+++ b/src/x86-64.S
@@ -990,15 +990,18 @@ DES_bs_crypt_LM_loop:
 
 #ifdef UNDERSCORES
 #define nt_crypt_all_x86_64 _nt_crypt_all_x86_64
+#define nt_crypt_all_8859_1_x86_64 _nt_crypt_all_8859_1_x86_64
 #define nt_buffer8x _nt_buffer8x
 #define output8x _output8x
 #endif
 
 /*
 extern nt_crypt_all_x86_64(int count);
+extern nt_crypt_all_8859_1_x86_64(int count);
 */
 
 .globl nt_crypt_all_x86_64
+.globl nt_crypt_all_8859_1_x86_64
 
 .data
 DO_ALIGN(6)
@@ -1123,6 +1126,85 @@ const_stage3:
 	por t1, aa;						\
 	por t13, aa3;
 
+#define NT_CRYPT_BODY_8859_1(base)					\
+	movdqa const_init_a(%rip), a;				\
+	movdqa const_init_a(%rip), a3;				\
+	movdqa const_init_b(%rip), b;				\
+	movdqa const_init_b(%rip), b3;				\
+	movdqa const_init_c(%rip), c;				\
+	movdqa const_init_c(%rip), c3;				\
+	movdqa const_init_d(%rip), d;				\
+	movdqa const_init_d(%rip), d3;				\
+	/* One block: base selects a 512-byte slice of nt_buffer8x and a 128-byte slice of output8x. Two register sets (a..d and a3..d3) start from the same const_init_* values. Next: a/a3 += the 16-byte rows at offsets 0 and 16 of the slice, then each 32-bit lane is shifted left by 3. */ \
+	paddd (512*base)+nt_buffer8x(%rip), a;			\
+	paddd (512*base)+16+nt_buffer8x(%rip), a3;		\
+	pslld $3, a;						\
+	pslld $3, a3;						\
+	/* 15 STEP1 expansions: 9th argument runs 1..15, 10th cycles 7,11,19,3 (presumably word index and rotate count; STEP1 is defined elsewhere, confirm there). */ \
+	STEP1(d, a, b, c, d3, a3, b3, c3, 1 , 7 , base)		\
+	STEP1(c, d, a, b, c3, d3, a3, b3, 2 , 11, base)		\
+	STEP1(b, c, d, a, b3, c3, d3, a3, 3 , 19, base)		\
+	STEP1(a, b, c, d, a3, b3, c3, d3, 4 , 3 , base)		\
+	STEP1(d, a, b, c, d3, a3, b3, c3, 5 , 7 , base)		\
+	STEP1(c, d, a, b, c3, d3, a3, b3, 6 , 11, base)		\
+	STEP1(b, c, d, a, b3, c3, d3, a3, 7 , 19, base)		\
+	STEP1(a, b, c, d, a3, b3, c3, d3, 8 , 3 , base)		\
+	STEP1(d, a, b, c, d3, a3, b3, c3, 9 , 7 , base)		\
+	STEP1(c, d, a, b, c3, d3, a3, b3, 10, 11, base)		\
+	STEP1(b, c, d, a, b3, c3, d3, a3, 11, 19, base)		\
+	STEP1(a, b, c, d, a3, b3, c3, d3, 12, 3 , base)		\
+	STEP1(d, a, b, c, d3, a3, b3, c3, 13, 7 , base)		\
+	STEP1(c, d, a, b, c3, d3, a3, b3, 14, 11, base)		\
+	STEP1(b, c, d, a, b3, c3, d3, a3, 15, 19, base)		\
+	/* 16 STEP2 expansions: 9th argument order 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15; 10th cycles 3,5,9,13. */ \
+	STEP2(a, b, c, d, a3, b3, c3, d3, 0 , 3 , base)		\
+	STEP2(d, a, b, c, d3, a3, b3, c3, 4 , 5 , base)		\
+	STEP2(c, d, a, b, c3, d3, a3, b3, 8 , 9 , base)		\
+	STEP2(b, c, d, a, b3, c3, d3, a3, 12, 13, base)		\
+	STEP2(a, b, c, d, a3, b3, c3, d3, 1 , 3 , base)		\
+	STEP2(d, a, b, c, d3, a3, b3, c3, 5 , 5 , base)		\
+	STEP2(c, d, a, b, c3, d3, a3, b3, 9 , 9 , base)		\
+	STEP2(b, c, d, a, b3, c3, d3, a3, 13, 13, base)		\
+	STEP2(a, b, c, d, a3, b3, c3, d3, 2 , 3 , base)		\
+	STEP2(d, a, b, c, d3, a3, b3, c3, 6 , 5 , base)		\
+	STEP2(c, d, a, b, c3, d3, a3, b3, 10, 9 , base)		\
+	STEP2(b, c, d, a, b3, c3, d3, a3, 14, 13, base)		\
+	STEP2(a, b, c, d, a3, b3, c3, d3, 3 , 3 , base)		\
+	STEP2(d, a, b, c, d3, a3, b3, c3, 7 , 5 , base)		\
+	STEP2(c, d, a, b, c3, d3, a3, b3, 11, 9 , base)		\
+	STEP2(b, c, d, a, b3, c3, d3, a3, 15, 13, base)		\
+	/* Only 11 STEP3 expansions, followed by a hand-written tail: b += the row at offset 416 of the slice, then b += (a ^ d ^ c), likewise for b3. 416 = 13*32, so this is presumably a partial step for word 13 with the remaining steps handled elsewhere; confirm against STEP3 and the compare code. */ \
+	STEP3(a, b, c, d, a3, b3, c3, d3, 0 , 3 , base)		\
+	STEP3(d, a, b, c, d3, a3, b3, c3, 8 , 9 , base)		\
+	STEP3(c, d, a, b, c3, d3, a3, b3, 4 , 11, base)		\
+	STEP3(b, c, d, a, b3, c3, d3, a3, 12, 15, base)		\
+	STEP3(a, b, c, d, a3, b3, c3, d3, 2 , 3 , base)		\
+	STEP3(d, a, b, c, d3, a3, b3, c3, 10, 9 , base)		\
+	STEP3(c, d, a, b, c3, d3, a3, b3, 6 , 11, base)		\
+	STEP3(b, c, d, a, b3, c3, d3, a3, 14, 15, base)		\
+	STEP3(a, b, c, d, a3, b3, c3, d3, 1 , 3 , base)		\
+	STEP3(d, a, b, c, d3, a3, b3, c3, 9 , 9 , base)		\
+	STEP3(c, d, a, b, c3, d3, a3, b3, 5 , 11, base)		\
+	movdqa a, t1;						\
+	movdqa a3, t13;						\
+	paddd (512*base)+416+nt_buffer8x(%rip), b;		\
+	paddd (512*base)+416+16+nt_buffer8x(%rip), b3;		\
+	pxor d, t1;						\
+	pxor d3,t13;						\
+	pxor c, t1;						\
+	pxor c3,t13;						\
+	paddd t1, b;						\
+	paddd t13,b3;						\
+	/* Store all eight state registers into the 128-byte output8x slice: a,a3 at +0/+16, b,b3 at +32/+48, c,c3 at +64/+80, d,d3 at +96/+112. */ \
+	movdqa a,  (128*base)+output8x(%rip);			\
+	movdqa a3,  (128*base)+16+output8x(%rip);		\
+	movdqa b, (128*base)+32+output8x(%rip);			\
+	movdqa b3, (128*base)+32+16+output8x(%rip);		\
+	movdqa c, (128*base)+64+output8x(%rip);			\
+	movdqa c3, (128*base)+64+16+output8x(%rip);		\
+	movdqa d, (128*base)+96+output8x(%rip);			\
+	movdqa d3, (128*base)+96+16+output8x(%rip);
+
 #define NT_CRYPT_BODY(base)					\
 	movdqa const_init_a(%rip), a;				\
 	movdqa const_init_a(%rip), a3;				\
@@ -1223,6 +1305,17 @@ nt_crypt_all_x86_64:
 
 	ret
 
+nt_crypt_all_8859_1_x86_64:	/* void nt_crypt_all_8859_1_x86_64(int count); the count argument is not referenced in the code visible here */
+	movdqa const_stage2(%rip), t3	/* preload t3 with const_stage2 (presumably read by STEP2, defined elsewhere) */
+	movdqa const_stage3(%rip), t4	/* preload t4 with const_stage3 (presumably read by STEP3, defined elsewhere) */
+	/* Expand the body for block indices 0..3; each index uses its own 512-byte slice of nt_buffer8x and 128-byte slice of output8x. */
+	NT_CRYPT_BODY_8859_1(0)
+	NT_CRYPT_BODY_8859_1(1)
+	NT_CRYPT_BODY_8859_1(2)
+	NT_CRYPT_BODY_8859_1(3)
+
+	ret
+
 #if defined(__ELF__) && defined(__linux__)
 .section .note.GNU-stack,"",@progbits
 #endif
-- 
1.7.4.1

