diff -urpN john-1.7.8.orig//src/Makefile john-1.7.8-mscash2cuda-0/src/Makefile
--- john-1.7.8.orig//src/Makefile	2011-05-04 18:52:31.000000000 +0000
+++ john-1.7.8-mscash2cuda-0/src/Makefile	2011-08-22 18:36:22.323678649 +0000
@@ -3,6 +3,7 @@
 # Copyright (c) 1996-2011 by Solar Designer
 #
 
+NVCC = nvcc
 CC = gcc
 AS = $(CC)
 LD = $(CC)
@@ -19,6 +20,12 @@ OMPFLAGS =
 #OMPFLAGS = -fopenmp
 # Sun Studio with OpenMP (set the OMP_NUM_THREADS env var at runtime)
 #OMPFLAGS = -xopenmp
+CUDAPATH=/usr/local/cuda/lib
+CUDA64PATH=/usr/local/cuda/lib64
+
+NVCC_FLAGS = -c -Xptxas -v -arch sm_10
+MSCASH_FLAGS = $(NVCC_FLAGS) --maxrregcount=60
+
 CFLAGS = -c -Wall -O2 -fomit-frame-pointer $(OMPFLAGS)
 ASFLAGS = -c $(OMPFLAGS)
 LDFLAGS = -s $(OMPFLAGS)
@@ -41,6 +48,9 @@ JOHN_OBJS = \
 	unafs.o \
 	unique.o
 
+CUDA_OBJS = \
+	mscash2cuda_fmt.o mscash2.o 
+
 BENCH_DES_OBJS_ORIG = \
 	DES_fmt.o DES_std.o
 
@@ -74,6 +84,8 @@ default:
 	@echo "To build John the Ripper, type:"
 	@echo "	make clean SYSTEM"
 	@echo "where SYSTEM can be one of the following:"
+	@echo "linux-x86-64-cuda        Linux, x86-64 with SSE2 and CUDA (experimental)"
+	@echo "linux-x86-cuda           Linux, x86 32-bit with SSE2 and CUDA (experimental)"
 	@echo "linux-x86-64             Linux, x86-64 with SSE2 (best tested)"
 	@echo "linux-x86-64-avx         Linux, x86-64 with AVX (experimental)"
 	@echo "linux-x86-64-xop         Linux, x86-64 with AVX and XOP (experimental)"
@@ -148,6 +160,20 @@ default:
 	@echo "beos-x86-any             BeOS, x86"
 	@echo "generic                  Any other Unix-like system with gcc"
 
+linux-x86-64-cuda:
+	$(LN) x86-64.h arch.h
+	$(MAKE) $(PROJ) \
+		JOHN_OBJS="$(JOHN_OBJS) $(CUDA_OBJS) c3_fmt.o x86-64.o" \
+		CFLAGS="$(CFLAGS) -DHAVE_CRYPT" \
+		LDFLAGS="$(LDFLAGS) -L$(CUDA64PATH) -lcudart  -lcrypt"
+
+linux-x86-cuda:
+	$(LN) x86-sse.h arch.h
+	$(MAKE) $(PROJ) \
+		JOHN_OBJS="$(JOHN_OBJS) $(CUDA_OBJS) c3_fmt.o x86.o x86-sse.o" \
+		CFLAGS="$(CFLAGS) -DHAVE_CRYPT" \
+		LDFLAGS="$(LDFLAGS) -L$(CUDAPATH) -lcudart -lcrypt"
+
 linux-x86-64:
 	$(LN) x86-64.h arch.h
 	$(MAKE) $(PROJ) \
@@ -811,6 +837,13 @@ generic.h:
 		"$(BENCH_MD5_OBJS_DEPEND)" \
 		"$(BENCH_BF_OBJS_DEPEND)"
 
+mscash2.o:  mscash2.h cuda/mscash2.cu
+	cd cuda; nvcc $(MSCASH_FLAGS) mscash2.cu
+
+mscash2cuda_fmt.o: mscash2.o mscash2cuda_fmt.c mscash2.h
+	cp cuda/mscash2.o mscash2.o
+	$(CC)  $(CFLAGS) mscash2cuda_fmt.c
+
 bench: $(BENCH_OBJS)
 	$(LD) $(BENCH_OBJS) $(LDFLAGS) -o bench
 
@@ -890,7 +923,7 @@ depend:
 
 clean:
 	$(RM) $(PROJ) $(PROJ_DOS) $(PROJ_WIN32)
-	$(RM) ../run/john.exe john-macosx-* *.o *.bak core
+	$(RM) ../run/john.exe john-macosx-* *.o *.bak *~ core cuda/*.o cuda/*~ 
 	$(RM) detect bench generic.h arch.h tmp.s
 	$(CP) $(NULL) Makefile.dep
 
diff -urpN john-1.7.8.orig//src/cuda/mscash2.cu john-1.7.8-mscash2cuda-0/src/cuda/mscash2.cu
--- john-1.7.8.orig//src/cuda/mscash2.cu	1970-01-01 00:00:00.000000000 +0000
+++ john-1.7.8-mscash2cuda-0/src/cuda/mscash2.cu	2011-08-22 04:28:45.240396371 +0000
@@ -0,0 +1,408 @@
+/*
+* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+* and it is hereby released to the general public under the following terms:
+* Redistribution and use in source and binary forms, with or without modification, are permitted.
+* Based on S3nf implementation http://openwall.info/wiki/john/MSCash2
+*/
+
+#include <stdio.h>
+#include "../mscash2.h"
+
+static void HandleError(cudaError_t err, const char *file, int line)
+{
+	if (err != cudaSuccess) {
+		printf("%s in %s at line %d\n", cudaGetErrorString(err), file,
+		    line);
+		exit(EXIT_FAILURE);
+	}
+}
+
+#define HANDLE_ERROR(err) (HandleError(err,__FILE__,__LINE__))
+
+extern "C" void mscash2_init(int gpuid);
+extern "C" void mscash2_gpu(mscash2_password *, mscash2_hash *, mscash2_salt *);
+
+__constant__ mscash2_salt cuda_salt[1];
+__host__ void mscash2_init(int gpuid)
+{
+	int count;
+	HANDLE_ERROR(cudaGetDeviceCount(&count));
+	if (gpuid < count)
+		cudaSetDevice(gpuid);
+	else {
+		printf("Invalid CUDA device id = %d\n", gpuid);
+		exit(1);
+	}
+}
+
+__host__ void md4_crypt(uint32_t * buffer, uint32_t * hash)
+{
+	uint32_t a;
+	uint32_t b;
+	uint32_t c;
+	uint32_t d;
+
+	a = 0xFFFFFFFF + buffer[0];
+	a = (a << 3) | (a >> 29);
+	d = INIT_D + (INIT_C ^ (a & 0x77777777)) + buffer[1];
+	d = (d << 7) | (d >> 25);
+	c = INIT_C + (INIT_B ^ (d & (a ^ INIT_B))) + buffer[2];
+	c = (c << 11) | (c >> 21);
+	b = INIT_B + (a ^ (c & (d ^ a))) + buffer[3];
+	b = (b << 19) | (b >> 13);
+
+	a += (d ^ (b & (c ^ d))) + buffer[4];
+	a = (a << 3) | (a >> 29);
+	d += (c ^ (a & (b ^ c))) + buffer[5];
+	d = (d << 7) | (d >> 25);
+	c += (b ^ (d & (a ^ b))) + buffer[6];
+	c = (c << 11) | (c >> 21);
+	b += (a ^ (c & (d ^ a))) + buffer[7];
+	b = (b << 19) | (b >> 13);
+
+	a += (d ^ (b & (c ^ d))) + buffer[8];
+	a = (a << 3) | (a >> 29);
+	d += (c ^ (a & (b ^ c))) + buffer[9];
+	d = (d << 7) | (d >> 25);
+	c += (b ^ (d & (a ^ b))) + buffer[10];
+	c = (c << 11) | (c >> 21);
+	b += (a ^ (c & (d ^ a))) + buffer[11];
+	b = (b << 19) | (b >> 13);
+
+	a += (d ^ (b & (c ^ d))) + buffer[12];
+	a = (a << 3) | (a >> 29);
+	d += (c ^ (a & (b ^ c))) + buffer[13];
+	d = (d << 7) | (d >> 25);
+	c += (b ^ (d & (a ^ b))) + buffer[14];
+	c = (c << 11) | (c >> 21);
+	b += (a ^ (c & (d ^ a))) + buffer[15];
+	b = (b << 19) | (b >> 13);
+
+	a += ((b & (c | d)) | (c & d)) + buffer[0] + SQRT_2;
+	a = (a << 3) | (a >> 29);
+	d += ((a & (b | c)) | (b & c)) + buffer[4] + SQRT_2;
+	d = (d << 5) | (d >> 27);
+	c += ((d & (a | b)) | (a & b)) + buffer[8] + SQRT_2;
+	c = (c << 9) | (c >> 23);
+	b += ((c & (d | a)) | (d & a)) + buffer[12] + SQRT_2;
+	b = (b << 13) | (b >> 19);
+
+	a += ((b & (c | d)) | (c & d)) + buffer[1] + SQRT_2;
+	a = (a << 3) | (a >> 29);
+	d += ((a & (b | c)) | (b & c)) + buffer[5] + SQRT_2;
+	d = (d << 5) | (d >> 27);
+	c += ((d & (a | b)) | (a & b)) + buffer[9] + SQRT_2;
+	c = (c << 9) | (c >> 23);
+	b += ((c & (d | a)) | (d & a)) + buffer[13] + SQRT_2;
+	b = (b << 13) | (b >> 19);
+
+	a += ((b & (c | d)) | (c & d)) + buffer[2] + SQRT_2;
+	a = (a << 3) | (a >> 29);
+	d += ((a & (b | c)) | (b & c)) + buffer[6] + SQRT_2;
+	d = (d << 5) | (d >> 27);
+	c += ((d & (a | b)) | (a & b)) + buffer[10] + SQRT_2;
+	c = (c << 9) | (c >> 23);
+	b += ((c & (d | a)) | (d & a)) + buffer[14] + SQRT_2;
+	b = (b << 13) | (b >> 19);
+
+	a += ((b & (c | d)) | (c & d)) + buffer[3] + SQRT_2;
+	a = (a << 3) | (a >> 29);
+	d += ((a & (b | c)) | (b & c)) + buffer[7] + SQRT_2;
+	d = (d << 5) | (d >> 27);
+	c += ((d & (a | b)) | (a & b)) + buffer[11] + SQRT_2;
+	c = (c << 9) | (c >> 23);
+	b += ((c & (d | a)) | (d & a)) + buffer[15] + SQRT_2;
+	b = (b << 13) | (b >> 19);
+
+	a += (d ^ c ^ b) + buffer[0] + SQRT_3;
+	a = (a << 3) | (a >> 29);
+	d += (c ^ b ^ a) + buffer[8] + SQRT_3;
+	d = (d << 9) | (d >> 23);
+	c += (b ^ a ^ d) + buffer[4] + SQRT_3;
+	c = (c << 11) | (c >> 21);
+	b += (a ^ d ^ c) + buffer[12] + SQRT_3;
+	b = (b << 15) | (b >> 17);
+
+	a += (d ^ c ^ b) + buffer[2] + SQRT_3;
+	a = (a << 3) | (a >> 29);
+	d += (c ^ b ^ a) + buffer[10] + SQRT_3;
+	d = (d << 9) | (d >> 23);
+	c += (b ^ a ^ d) + buffer[6] + SQRT_3;
+	c = (c << 11) | (c >> 21);
+	b += (a ^ d ^ c) + buffer[14] + SQRT_3;
+	b = (b << 15) | (b >> 17);
+
+	a += (d ^ c ^ b) + buffer[1] + SQRT_3;
+	a = (a << 3) | (a >> 29);
+	d += (c ^ b ^ a) + buffer[9] + SQRT_3;
+	d = (d << 9) | (d >> 23);
+	c += (b ^ a ^ d) + buffer[5] + SQRT_3;
+	c = (c << 11) | (c >> 21);
+	b += (a ^ d ^ c) + buffer[13] + SQRT_3;
+	b = (b << 15) | (b >> 17);
+
+	a += (d ^ c ^ b) + buffer[3] + SQRT_3;
+	a = (a << 3) | (a >> 29);
+
+	d += (c ^ b ^ a) + buffer[11] + SQRT_3;
+	d = (d << 9) | (d >> 23);
+	c += (b ^ a ^ d) + buffer[7] + SQRT_3;
+	c = (c << 11) | (c >> 21);
+	b += (a ^ d ^ c) + buffer[15] + SQRT_3;
+	b = (b << 15) | (b >> 17);
+
+	hash[0] = a + INIT_A;
+	hash[1] = b + INIT_B;
+	hash[2] = c + INIT_C;
+	hash[3] = d + INIT_D;
+}
+
+__device__ __host__ void preproc(const uint8_t * key, uint32_t keylen,
+    uint32_t * state, uint8_t var)
+{
+	int i;
+	uint32_t W[16], temp;
+	uint8_t ipad[64];
+
+	for (i = 0; i < 64; i++)
+		ipad[i] = var;
+
+	for (i = 0; i < keylen; i++)
+		ipad[i] = ipad[i] ^ key[i];
+
+#pragma unroll 16
+	for (i = 0; i < 16; i++)
+		GET_WORD_32_BE(W[i], ipad, i * 4);
+	
+	uint32_t A = INIT_A;
+	uint32_t B = INIT_B;
+	uint32_t C = INIT_C;
+	uint32_t D = INIT_D;
+	uint32_t E = INIT_E;
+
+	SHA1(A, B, C, D, E, W);
+
+	state[0] = A + INIT_A;
+	state[1] = B + INIT_B;
+	state[2] = C + INIT_C;
+	state[3] = D + INIT_D;
+	state[4] = E + INIT_E;
+
+}
+
+__device__ void hmac_sha1(const uint8_t * key, uint32_t keylen,
+    const uint8_t * input, uint32_t inputlen, uint8_t * output,
+    uint32_t * ipad_state, uint32_t * opad_state)
+{
+	int i;
+	uint32_t temp, W[16];
+	uint32_t A, B, C, D, E;
+	uint32_t state_A,state_B,state_C,state_D,state_E;
+	uint8_t buf[64];
+	uint32_t *src=(uint32_t*)buf;
+	i=64/4;
+	while(i--)
+	  *src++=0;
+
+	memcpy(buf, input, inputlen);
+	buf[inputlen] = 0x80;
+	PUT_WORD_32_BE((64 + inputlen) << 3, buf, 60);
+
+	A = ipad_state[0];
+	B = ipad_state[1];
+	C = ipad_state[2];
+	D = ipad_state[3];
+	E = ipad_state[4];
+	
+	state_A=A;
+	state_B=B;
+	state_C=C;
+	state_D=D;
+	state_E=E;
+
+	for (i = 0; i < 16; i++)
+		GET_WORD_32_BE(W[i], buf, i * 4);
+
+	SHA1(A, B, C, D, E, W);
+
+	A += state_A;
+	B += state_B;
+	C += state_C;
+	D += state_D;
+	E += state_E;
+
+	PUT_WORD_32_BE(A, buf, 0);
+	PUT_WORD_32_BE(B, buf, 4);
+	PUT_WORD_32_BE(C, buf, 8);
+	PUT_WORD_32_BE(D, buf, 12);
+	PUT_WORD_32_BE(E, buf, 16);
+
+	buf[20] = 0x80;
+	PUT_WORD_32_BE(0x2A0, buf, 60);
+
+	A = opad_state[0];
+	B = opad_state[1];
+	C = opad_state[2];
+	D = opad_state[3];
+	E = opad_state[4];
+	
+	state_A=A;
+	state_B=B;
+	state_C=C;
+	state_D=D;
+	state_E=E;
+
+	for (i = 0; i < 16; i++)
+		GET_WORD_32_BE(W[i], buf, i * 4);
+
+	SHA1(A, B, C, D, E, W);
+
+	A += state_A;
+	B += state_B;
+	C += state_C;
+	D += state_D;
+	E += state_E;
+
+	PUT_WORD_32_BE(A, output, 0);
+	PUT_WORD_32_BE(B, output, 4);
+	PUT_WORD_32_BE(C, output, 8);
+	PUT_WORD_32_BE(D, output, 12);
+	PUT_WORD_32_BE(E, output, 16);
+}
+
+__device__ void pbkdf2(const uint8_t * pass, const uint8_t * salt,
+    int saltlen, uint8_t * out)
+{
+	uint8_t temp[SHA1_DIGEST_LENGTH];
+	uint8_t buf[48];
+	uint32_t ipad_state[5];
+	uint32_t opad_state[5];
+	int i, j;
+	uint8_t tmp_out[16];
+
+	i=48/4;
+	uint32_t *src=(uint32_t*)buf;
+	while(i--)
+	  *src++=0;
+
+	memcpy(buf, salt, saltlen);
+	buf[saltlen + 3] = 0x01;
+
+	preproc(pass, 16, ipad_state, 0x36);
+	preproc(pass, 16, opad_state, 0x5c);
+
+	hmac_sha1(pass, 16, buf, saltlen + 4, temp, ipad_state, opad_state);
+
+	memcpy(tmp_out, temp, 20);
+
+	for (i = 1; i < ITERATIONS; i++) {
+		hmac_sha1(pass, 16, temp, SHA1_DIGEST_LENGTH, temp, ipad_state,
+		    opad_state);
+
+#pragma unroll 16
+		for (j = 0; j < 16; j++)
+			tmp_out[j] ^= temp[j];
+	}
+	memcpy(out, tmp_out, 20);
+}
+
+
+__global__ void pbkdf2_kernel(mscash2_password * inbuffer,
+    mscash2_hash * outbuffer)
+{
+	uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	uint32_t username_len = (uint32_t) cuda_salt[0].length;
+
+	pbkdf2((uint8_t *) inbuffer[idx].dcc_hash,
+	    cuda_salt[0].unicode_salt, username_len << 1,
+	    (uint8_t *) outbuffer[idx].v);
+
+}
+
+__host__ void mscash_cpu(mscash2_password * inbuffer, mscash2_hash * outbuffer,
+    mscash2_salt * host_salt)
+    {
+      
+      int i,idx = 0;
+	uint32_t buffer[16];
+	uint32_t nt_hash[16];
+	uint8_t salt[64];
+	memset(salt,0,64);
+	uint8_t *username = host_salt->salt;
+	uint32_t username_len = (uint32_t) host_salt->length;
+	
+
+	for (i = 0; i < (username_len >> 1) + 1; i++)
+		((uint32_t *) salt)[i] =
+		    username[2 * i] | (username[2 * i + 1] << 16);
+	memcpy(host_salt->unicode_salt, salt, 64);
+
+
+
+	for (idx = 0; idx < KEYS_PER_CRYPT; idx++) {
+
+		uint8_t *password = inbuffer[idx].v;
+		uint32_t password_len = inbuffer[idx].length;
+		memset(nt_hash, 0, 64);
+		memset(buffer, 0, 64);
+
+		for (i = 0; i < password_len >> 1; i++)
+			buffer[i] =
+			    password[2 * i] | (password[2 * i + 1] << 16);
+
+		if (password_len % 2 == 1)
+			buffer[i] = password[password_len - 1] | 0x800000;
+		else
+			buffer[i] = 0x80;
+
+		buffer[14] = password_len << 4;
+
+		md4_crypt(buffer, nt_hash);
+
+		memcpy((uint8_t *) nt_hash + 16, salt, username_len << 1);
+
+		i = username_len + 8;
+
+		if (username_len % 2 == 1)
+			nt_hash[i >> 1] =
+			    username[username_len - 1] | 0x800000;
+		else
+			nt_hash[i >> 1] = 0x80;
+
+		nt_hash[14] = i << 4;
+
+		md4_crypt(nt_hash, inbuffer[idx].dcc_hash);
+
+	}
+
+      
+    }
+__host__ void mscash2_gpu(mscash2_password * inbuffer, mscash2_hash * outbuffer,
+    mscash2_salt * host_salt)
+{
+	
+	mscash_cpu(inbuffer,outbuffer,host_salt);
+	mscash2_password *cuda_inbuffer;
+	mscash2_hash *cuda_outbuffer;
+	size_t insize = sizeof(mscash2_password) * KEYS_PER_CRYPT;
+	size_t outsize = sizeof(mscash2_hash) * KEYS_PER_CRYPT;
+	
+	HANDLE_ERROR(cudaMemcpyToSymbol(cuda_salt, host_salt,
+		sizeof(mscash2_salt)));
+	
+	HANDLE_ERROR(cudaMalloc(&cuda_inbuffer, insize));
+	HANDLE_ERROR(cudaMalloc(&cuda_outbuffer, outsize));
+
+	HANDLE_ERROR(cudaMemcpy(cuda_inbuffer, inbuffer, insize,
+		cudaMemcpyHostToDevice));
+
+	pbkdf2_kernel <<< BLOCKS, THREADS >>> (cuda_inbuffer, cuda_outbuffer);
+
+	HANDLE_ERROR(cudaMemcpy(outbuffer, cuda_outbuffer, outsize,
+		cudaMemcpyDeviceToHost));
+
+	HANDLE_ERROR(cudaFree(cuda_inbuffer));
+	HANDLE_ERROR(cudaFree(cuda_outbuffer));
+
+}
diff -urpN john-1.7.8.orig//src/john.c john-1.7.8-mscash2cuda-0/src/john.c
--- john-1.7.8.orig//src/john.c	2011-02-27 12:31:36.000000000 +0000
+++ john-1.7.8-mscash2cuda-0/src/john.c	2011-08-22 02:09:22.361521212 +0000
@@ -39,6 +39,7 @@ extern int CPU_detect(void);
 
 extern struct fmt_main fmt_DES, fmt_BSDI, fmt_MD5, fmt_BF;
 extern struct fmt_main fmt_AFS, fmt_LM;
+extern struct fmt_main fmt_MSCASH2CUDA;
 #ifdef HAVE_CRYPT
 extern struct fmt_main fmt_crypt;
 #endif
@@ -71,6 +72,7 @@ static void john_register_all(void)
 	john_register_one(&fmt_BF);
 	john_register_one(&fmt_AFS);
 	john_register_one(&fmt_LM);
+	john_register_one(&fmt_MSCASH2CUDA);
 #ifdef HAVE_CRYPT
 	john_register_one(&fmt_crypt);
 #endif
diff -urpN john-1.7.8.orig//src/mscash2.h john-1.7.8-mscash2cuda-0/src/mscash2.h
--- john-1.7.8.orig//src/mscash2.h	1970-01-01 00:00:00.000000000 +0000
+++ john-1.7.8-mscash2cuda-0/src/mscash2.h	2011-08-22 18:36:11.591678614 +0000
@@ -0,0 +1,202 @@
+/*
+* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+* and it is hereby released to the general public under the following terms:
+* Redistribution and use in source and binary forms, with or without modification, are permitted.
+* Based on S3nf implementation http://openwall.info/wiki/john/MSCash2
+*/
+#ifndef _MSCASH2_H
+#define _MSCASH2_H
+
+#define uint8_t			unsigned char
+#define uint16_t		unsigned short
+#define uint32_t		unsigned int
+
+#define THREADS			128
+#define BLOCKS			14
+#define	KEYS_PER_CRYPT		(THREADS)*(BLOCKS)
+
+#define BINARY_SIZE		16
+#define PLAINTEXT_LENGTH	15
+#define SALT_SIZE		sizeof(mscash2_salt)
+
+#define MIN_KEYS_PER_CRYPT	KEYS_PER_CRYPT
+#define MAX_KEYS_PER_CRYPT	KEYS_PER_CRYPT
+
+#define MAX(x,y)		((x) > (y) ? (x) : (y))
+#define MIN(x,y)		((x) < (y) ? (x) : (y))
+# define SWAP(n) \
+    (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
+
+#define ITERATIONS		10240
+
+#define INIT_A			0x67452301
+#define INIT_B			0xefcdab89
+#define INIT_C			0x98badcfe
+#define INIT_D			0x10325476
+#define INIT_E			0xc3d2e1f0
+
+#define SQRT_2			0x5a827999
+#define SQRT_3			0x6ed9eba1
+
+#define SHA1_DIGEST_LENGTH	20
+
+#define K1			0x5a827999
+#define K2			0x6ed9eba1
+#define K3			0x8f1bbcdc
+#define K4			0xca62c1d6
+
+#define F1(x,y,z)		(z ^ (x & (y ^ z)))
+#define F2(x,y,z)		(x ^ y ^ z)
+#define F3(x,y,z)		((x & y) | (z & (x | y)))
+#define F4(x,y,z)		(x ^ y ^ z)
+
+#ifndef GET_WORD_32_BE
+#define GET_WORD_32_BE(n,b,i)                           \
+{                                                       \
+    (n) = ( (unsigned long) (b)[(i)    ] << 24 )        \
+        | ( (unsigned long) (b)[(i) + 1] << 16 )        \
+        | ( (unsigned long) (b)[(i) + 2] <<  8 )        \
+        | ( (unsigned long) (b)[(i) + 3]       );       \
+}
+#endif
+
+#ifndef PUT_WORD_32_BE
+#define PUT_WORD_32_BE(n,b,i)                           \
+{                                                       \
+    (b)[(i)    ] = (unsigned char) ( (n) >> 24 );       \
+    (b)[(i) + 1] = (unsigned char) ( (n) >> 16 );       \
+    (b)[(i) + 2] = (unsigned char) ( (n) >>  8 );       \
+    (b)[(i) + 3] = (unsigned char) ( (n)       );       \
+}
+#endif
+
+#define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n)))
+
+#define R(t)                                            \
+(                                                       \
+    temp = W[(t -  3) & 0x0F] ^ W[(t - 8) & 0x0F] ^     \
+           W[(t - 14) & 0x0F] ^ W[ t      & 0x0F],      \
+    ( W[t & 0x0F] = S(temp,1) )                         \
+)
+
+#define P1(a,b,c,d,e,x)                                  \
+{                                                       \
+    e += S(a,5) + F1(b,c,d) + K1 + x; b = S(b,30);        \
+}
+
+#define P2(a,b,c,d,e,x)                                  \
+{                                                       \
+    e += S(a,5) + F2(b,c,d) + K2 + x; b = S(b,30);        \
+}
+
+#define P3(a,b,c,d,e,x)                                  \
+{                                                       \
+    e += S(a,5) + F3(b,c,d) + K3 + x; b = S(b,30);        \
+}
+
+#define P4(a,b,c,d,e,x)                                  \
+{                                                       \
+    e += S(a,5) + F4(b,c,d) + K4 + x; b = S(b,30);        \
+}
+
+#define SHA1(A,B,C,D,E,W) \
+    P1(A, B, C, D, E, W[0] );\
+    P1(E, A, B, C, D, W[1] );\
+    P1(D, E, A, B, C, W[2] );\
+    P1(C, D, E, A, B, W[3] );\
+    P1(B, C, D, E, A, W[4] );\
+    P1(A, B, C, D, E, W[5] );\
+    P1(E, A, B, C, D, W[6] );\
+    P1(D, E, A, B, C, W[7] );\
+    P1(C, D, E, A, B, W[8] );\
+    P1(B, C, D, E, A, W[9] );\
+    P1(A, B, C, D, E, W[10]);\
+    P1(E, A, B, C, D, W[11]);\
+    P1(D, E, A, B, C, W[12]);\
+    P1(C, D, E, A, B, W[13]);\
+    P1(B, C, D, E, A, W[14]);\
+    P1(A, B, C, D, E, W[15]);\
+    P1(E, A, B, C, D, R(16));\
+    P1(D, E, A, B, C, R(17));\
+    P1(C, D, E, A, B, R(18));\
+    P1(B, C, D, E, A, R(19));\
+    P2(A, B, C, D, E, R(20));\
+    P2(E, A, B, C, D, R(21));\
+    P2(D, E, A, B, C, R(22));\
+    P2(C, D, E, A, B, R(23));\
+    P2(B, C, D, E, A, R(24));\
+    P2(A, B, C, D, E, R(25));\
+    P2(E, A, B, C, D, R(26));\
+    P2(D, E, A, B, C, R(27));\
+    P2(C, D, E, A, B, R(28));\
+    P2(B, C, D, E, A, R(29));\
+    P2(A, B, C, D, E, R(30));\
+    P2(E, A, B, C, D, R(31));\
+    P2(D, E, A, B, C, R(32));\
+    P2(C, D, E, A, B, R(33));\
+    P2(B, C, D, E, A, R(34));\
+    P2(A, B, C, D, E, R(35));\
+    P2(E, A, B, C, D, R(36));\
+    P2(D, E, A, B, C, R(37));\
+    P2(C, D, E, A, B, R(38));\
+    P2(B, C, D, E, A, R(39));\
+    P3(A, B, C, D, E, R(40));\
+    P3(E, A, B, C, D, R(41));\
+    P3(D, E, A, B, C, R(42));\
+    P3(C, D, E, A, B, R(43));\
+    P3(B, C, D, E, A, R(44));\
+    P3(A, B, C, D, E, R(45));\
+    P3(E, A, B, C, D, R(46));\
+    P3(D, E, A, B, C, R(47));\
+    P3(C, D, E, A, B, R(48));\
+    P3(B, C, D, E, A, R(49));\
+    P3(A, B, C, D, E, R(50));\
+    P3(E, A, B, C, D, R(51));\
+    P3(D, E, A, B, C, R(52));\
+    P3(C, D, E, A, B, R(53));\
+    P3(B, C, D, E, A, R(54));\
+    P3(A, B, C, D, E, R(55));\
+    P3(E, A, B, C, D, R(56));\
+    P3(D, E, A, B, C, R(57));\
+    P3(C, D, E, A, B, R(58));\
+    P3(B, C, D, E, A, R(59));\
+    P4(A, B, C, D, E, R(60));\
+    P4(E, A, B, C, D, R(61));\
+    P4(D, E, A, B, C, R(62));\
+    P4(C, D, E, A, B, R(63));\
+    P4(B, C, D, E, A, R(64));\
+    P4(A, B, C, D, E, R(65));\
+    P4(E, A, B, C, D, R(66));\
+    P4(D, E, A, B, C, R(67));\
+    P4(C, D, E, A, B, R(68));\
+    P4(B, C, D, E, A, R(69));\
+    P4(A, B, C, D, E, R(70));\
+    P4(E, A, B, C, D, R(71));\
+    P4(D, E, A, B, C, R(72));\
+    P4(C, D, E, A, B, R(73));\
+    P4(B, C, D, E, A, R(74));\
+    P4(A, B, C, D, E, R(75));\
+    P4(E, A, B, C, D, R(76));\
+    P4(D, E, A, B, C, R(77));\
+    P4(C, D, E, A, B, R(78));\
+    P4(B, C, D, E, A, R(79));
+
+static const char mscash2_prefix[] = "$DCC2$";
+
+typedef struct {
+	uint8_t length;
+	uint8_t v[15];
+	uint32_t dcc_hash[4];
+} mscash2_password;
+
+typedef struct {
+	uint32_t v[8];
+} mscash2_hash;
+
+typedef struct {
+	uint8_t length;
+	uint8_t salt[15];
+	uint8_t unicode_salt[64];
+} mscash2_salt;
+
+#endif
diff -urpN john-1.7.8.orig//src/mscash2cuda_fmt.c john-1.7.8-mscash2cuda-0/src/mscash2cuda_fmt.c
--- john-1.7.8.orig//src/mscash2cuda_fmt.c	1970-01-01 00:00:00.000000000 +0000
+++ john-1.7.8-mscash2cuda-0/src/mscash2cuda_fmt.c	2011-09-09 06:55:01.174592599 +0000
@@ -0,0 +1,244 @@
+/*
+* This software is Copyright (c) 2011 Lukas Odzioba <lukas dot odzioba at gmail dot com> 
+* and it is hereby released to the general public under the following terms:
+* Redistribution and use in source and binary forms, with or without modification, are permitted.
+* Based on S3nf implementation http://openwall.info/wiki/john/MSCash2
+*/
+#include <string.h>
+#include "arch.h"
+#include "formats.h"
+#include "common.h"
+#include "misc.h"
+#include "mscash2.h"
+
+#define FORMAT_LABEL		"mscash2cuda"
+#define FORMAT_NAME		"MSCASH2CUDA"
+#define ALGORITHM_NAME		"GPU"
+
+#define BENCHMARK_COMMENT	""
+#define BENCHMARK_LENGTH	-1
+
+//#define _MSCASH2_DEBUG
+
+static mscash2_password inbuffer[MAX_KEYS_PER_CRYPT];
+static mscash2_hash outbuffer[MAX_KEYS_PER_CRYPT];
+static mscash2_salt currentsalt;
+
+static struct fmt_tests tests[] = {
+	{"$DCC2$test#a86012faf7d88d1fc037a69764a92cac", "password"},
+	//{"$DCC2$administrator#a150f71752b5d605ef0b2a1e98945611","a"},
+	//{"$DCC2$administrator#c14eb8279e4233ec14e9d393637b65e2","ab"},
+	//{"$DCC2$administrator#8ce9c0279b4e6f226f52d559f9c2c5f3","abc"},
+	//{"$DCC2$administrator#2fc788d09fad7e26a92d12356fa44bdf","abcd"},
+	//{"$DCC2$administrator#6aa19842ffea11f0f0c89f8ca8d245bd","abcde"},
+	{NULL}
+};
+
+extern void mscash2_init(int gpuid);
+extern void mscash2_gpu(mscash2_password *, mscash2_hash *, mscash2_salt *);
+
+static void init(void)
+{
+	mscash2_init(0);
+}
+
+static int valid(char *ciphertext)
+{
+	if (strncmp(ciphertext, mscash2_prefix, strlen(mscash2_prefix)) != 0)
+		return 0;
+	char *hash = strrchr(ciphertext, '#') + 1;
+	int hashlength = 0;
+	if (hash == NULL)
+		return 0;
+	while (hash < ciphertext + strlen(ciphertext)) {
+		if (atoi16[ARCH_INDEX(*hash++)] == 0x7f)
+			return 0;
+		hashlength++;
+	}
+	if (hashlength != 32)
+		return 0;
+	return 1;
+}
+
+static void *binary(char *ciphertext)
+{
+	static uint32_t binary[4];
+	char *hash = strrchr(ciphertext, '#') + 1;
+	if (hash == NULL)
+		return binary;
+	int i;
+	for (i = 0; i < 4; i++) {
+		sscanf(hash + (8 * i), "%08x", &binary[i]);
+		binary[i] = SWAP(binary[i]);
+	}
+	return binary;
+
+}
+
+static void *salt(char *ciphertext)
+{
+	static mscash2_salt salt;
+	char *pos = ciphertext + strlen(mscash2_prefix);
+	int length = 0;
+	while (*pos != '#')
+		salt.salt[length++] = *pos++;
+	salt.length = length;
+	return &salt;
+}
+
+static void set_salt(void *salt)
+{
+	memcpy(&currentsalt, salt, sizeof(mscash2_salt));
+}
+
+static void set_key(char *key, int index)
+{
+	uint8_t length = strlen(key);
+	inbuffer[index].length = length;
+	memcpy(inbuffer[index].v, key, length);
+}
+
+static char *get_key(int index)
+{
+	static char ret[PLAINTEXT_LENGTH + 1];
+	uint8_t length = inbuffer[index].length;
+	memcpy(ret, inbuffer[index].v, length);
+	ret[length] = '\0';
+	return ret;
+}
+
+static void crypt_all(int count)
+{
+	mscash2_gpu(inbuffer, outbuffer, &currentsalt);
+}
+
+static int binary_hash_0(void *binary)
+{
+#ifdef _MSCASH2_DEBUG
+	puts("binary");
+	uint32_t i, *b = binary;
+	for (i = 0; i < 4; i++)
+		printf("%08x ", b[i]);
+	puts("");
+#endif
+	return (((uint32_t *) binary)[0] & 0xf);
+}
+
+static int binary_hash_1(void *binary)
+{
+	return ((uint32_t *) binary)[0] & 0xff;
+}
+
+static int binary_hash_2(void *binary)
+{
+	return ((uint32_t *) binary)[0] & 0xfff;
+}
+
+static int binary_hash_3(void *binary)
+{
+	return ((uint32_t *) binary)[0] & 0xffff;
+}
+
+static int binary_hash_4(void *binary)
+{
+	return ((uint32_t *) binary)[0] & 0xfffff;
+}
+
+
+static int get_hash_0(int index)
+{
+#ifdef _MSCASH2_DEBUG
+	int i;
+	puts("get_hash");
+	for (i = 0; i < 4; i++)
+		printf("%08x ", outbuffer[index].v[i]);
+	puts("");
+#endif
+	return outbuffer[index].v[0] & 0xf;
+}
+
+static int get_hash_1(int index)
+{
+	return outbuffer[index].v[0] & 0xff;
+}
+
+static int get_hash_2(int index)
+{
+	return outbuffer[index].v[0] & 0xfff;
+}
+
+static int get_hash_3(int index)
+{
+	return outbuffer[index].v[0] & 0xffff;
+}
+
+static int get_hash_4(int index)
+{
+	return outbuffer[index].v[0] & 0xfffff;
+}
+
+static int cmp_all(void *binary, int count)
+{
+	uint32_t i, b = ((uint32_t *) binary)[0];
+	for (i = 0; i < count; i++)
+		if (b == outbuffer[i].v[0])
+			return 1;
+	return 0;
+}
+
+static int cmp_one(void *binary, int index)
+{
+	uint32_t i, *b = (uint32_t *) binary;
+	for (i = 0; i < 4; i++)
+		if (b[i] != outbuffer[index].v[i])
+			return 0;
+	return 1;
+}
+
+static int cmp_exact(char *source, int count)
+{
+	return 1;
+}
+
+struct fmt_main fmt_MSCASH2CUDA = {
+	{
+		    FORMAT_LABEL,
+		    FORMAT_NAME,
+		    ALGORITHM_NAME,
+		    BENCHMARK_COMMENT,
+		    BENCHMARK_LENGTH,
+		    PLAINTEXT_LENGTH,
+		    BINARY_SIZE,
+		    SALT_SIZE,
+		    MIN_KEYS_PER_CRYPT,
+		    MAX_KEYS_PER_CRYPT,
+		    FMT_CASE | FMT_8_BIT,
+	    tests},
+	{
+		    init,
+		    valid,
+		    fmt_default_split,
+		    binary,
+		    salt,
+		    {
+				binary_hash_0,
+				binary_hash_1,
+				binary_hash_2,
+				binary_hash_3,
+			binary_hash_4},
+		    fmt_default_salt_hash,
+		    set_salt,
+		    set_key,
+		    get_key,
+		    fmt_default_clear_keys,
+		    crypt_all,
+		    {
+				get_hash_0,
+				get_hash_1,
+				get_hash_2,
+				get_hash_3,
+			get_hash_4},
+		    cmp_all,
+		    cmp_one,
+	    cmp_exact}
+};
diff -urpN john-1.7.8.orig//src/options.c john-1.7.8-mscash2cuda-0/src/options.c
--- john-1.7.8.orig//src/options.c	2011-06-22 13:01:40.000000000 +0000
+++ john-1.7.8-mscash2cuda-0/src/options.c	2011-08-22 02:09:08.630270899 +0000
@@ -100,7 +100,7 @@ static struct opt_entry opt_list[] = {
 "--salts=[-]COUNT           load salts with[out] at least COUNT passwords " \
 	"only\n" \
 "--format=NAME              force hash type NAME: " \
-	"DES/BSDI/MD5/BF/AFS/LM" MAYBE_CRYPT "\n" \
+	"DES/BSDI/MD5/BF/AFS/LM/MSCASH2CUDA" MAYBE_CRYPT "\n" \
 "--save-memory=LEVEL        enable memory saving, at LEVEL 1..3\n"
 
 void opt_init(char *name, int argc, char **argv)
