From da11c45059b5b1dfa29f798e81e253816334ab9f Mon Sep 17 00:00:00 2001
From: magnum <magnum>
Date: Wed, 27 Jul 2011 23:30:25 +0200
Subject: [PATCH 12/12] Re-enable support for full UTF-16 (not just UCS-2) (not for NT and mscash1)

---
 src/unicode.c |   79 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 70 insertions(+), 9 deletions(-)
diff --git a/src/unicode.c b/src/unicode.c
index ac2ad3d..93bcd86 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -152,8 +152,8 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  * This table contains as many values as there might be trailing bytes
  * in a UTF-8 sequence. (Cut-down version, 4 and 5 are illegal).
  */
-const UTF32 offsetsFromUTF8[] = { 0x00000000UL, 0x00003080UL,
-    0x000E2080UL, 0x03C82080UL };
+const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 
 /*
  * Index into the table below with the first byte of a UTF-8 sequence to
@@ -169,7 +169,17 @@ const char opt_trailingBytesUTF8[64] = {
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
 };
 
-// Convert UTF-8 to NT UNICODE (UCS-2) string. Note that regardless of
+static const int halfShift  = 10; /* used for shifting by 10 bits */
+
+static const UTF32 halfBase = 0x0010000UL;
+static const UTF32 halfMask = 0x3FFUL;
+
+#define UNI_SUR_HIGH_START  (UTF32)0xD800
+#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
+#define UNI_SUR_LOW_START   (UTF32)0xDC00
+#define UNI_SUR_LOW_END     (UTF32)0xDFFF
+
+// Convert UTF-8 to NT UNICODE (UTF-16LE) string. Note that regardless of
 // processor type this must be in intel (little-endian) format.
 // This code is optimised for speed. Errors result in truncation.
 //
@@ -216,6 +226,9 @@ inline int utf8towcs(UTF16 * target, unsigned int len, const UTF8 * source,
 			return -1 * (source - sourceStart);
 		}
 		switch (extraBytesToRead) {
+		case 3:
+			ch <<= 6;
+			ch += *++source;
 		case 2:
 			ch <<= 6;
 			ch += *++source;
@@ -230,12 +243,39 @@ inline int utf8towcs(UTF16 * target, unsigned int len, const UTF8 * source,
 			return -1 * (source - sourceStart);
 		}
 		ch -= offsetsFromUTF8[extraBytesToRead];
+#if 0 /* This only supports UCS-2 */
 #if ARCH_LITTLE_ENDIAN
 		*target++ = (UTF16) ch;
 #else
 		SSVAL(target, 0, ch);
 		++target;
 #endif
+#else /* This supports full UTF-16 with surrogate pairs */
+		if (ch <= UNI_MAX_BMP) {  /* Target is a character <= 0xFFFF */
+#if ARCH_LITTLE_ENDIAN
+			*target++ = (UTF16) ch;
+#else
+			SSVAL(target, 0, ch);
+			++target;
+#endif
+		} else {  /* Target is a character in range 0x10000 - 0x10FFFF. */
+			if (target + 1 >= targetEnd) {
+				source -= (extraBytesToRead+1); /* Back up source pointer! */
+				*target = 0;
+				return -1 * (source - sourceStart);
+			}
+			ch -= halfBase;
+#if ARCH_LITTLE_ENDIAN
+			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
+#else
+			SSVAL(target, 0, (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
+			++target;
+			SSVAL(target, 0, (UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
+			++target;
+#endif
+		}
+#endif
 		if (*source == 0)
 			break;
 		if (target >= targetEnd) {
@@ -318,7 +358,7 @@ int E_md4hash(const UTF8 * passwd, unsigned int len, unsigned char *p16)
 #endif
 		static MD4_CTX ctx;
 
-		/* Password is converted to UTF-16LE (UCS-2) */
+		/* Password is converted to UTF-16LE */
 		trunclen = plaintowcs(wpwd, PLAINTEXT_BUFFER_SIZE, passwd, len);
 		if(trunclen < 0)
 			len = strlen16(wpwd); // From UTF-8 you can't know
@@ -338,7 +378,7 @@ int E_md4hash(const UTF8 * passwd, unsigned int len, unsigned char *p16)
 		unsigned int d = INIT_D;
 		unsigned int i = 0, md4_size = 0;
 
-		/* Password is converted to UTF-16LE (UCS-2) */
+		/* Password is converted to UTF-16LE */
 		trunclen = plaintowcs(wpwd, 27, passwd, len);
 		// We need to check this because it's not just a matter of truncating
 		// length, it can be malformed UTF-8
@@ -438,8 +478,8 @@ int E_md4hash(const UTF8 * passwd, unsigned int len, unsigned char *p16)
 // only used in get_key() as of now.
 // Non thread-safe version
 UTF8 * utf16toutf8 (const UTF16* source) {
-	static UTF8 ret_Key[PLAINTEXT_BUFFER_SIZE*3+1];
-	return utf16toutf8_r(ret_Key, PLAINTEXT_BUFFER_SIZE*3, source);
+	static UTF8 ret_Key[PLAINTEXT_BUFFER_SIZE + 1];
+	return utf16toutf8_r(ret_Key, PLAINTEXT_BUFFER_SIZE, source);
 }
 
 // Thread-safe version
@@ -453,6 +493,27 @@ UTF8 * utf16toutf8_r (UTF8 *dst, int dst_len, const UTF16* source) {
 		const UTF32 byteMark = 0x80;
 		const UTF16* oldSource = source; /* In case we have to back out */
 		ch = *source++;
+		/* If we have a surrogate pair, convert to UTF32 first. */
+		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+			/* If the 16 bits following the high surrogate are in the source buffer... */
+			if (source) {
+				UTF32 ch2 = *source;
+				/* If it's a low surrogate, convert to UTF32. */
+				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+						+ (ch2 - UNI_SUR_LOW_START) + halfBase;
+					++source;
+				} else { /* it's an unpaired high surrogate */
+					--source; /* return to the illegal value itself */
+					fprintf(stderr, "warning, utf16toutf8 failed (illegal) - this is a bug in JtR\n");
+					break;
+				}
+			} else { /* We don't have the 16 bits following the high surrogate. */
+				--source; /* return to the high surrogate */
+				fprintf(stderr, "warning, utf16toutf8 failed (no surrogate) - this is a bug in JtR\n");
+				break;
+			}
+		}
 		/* Figure out how many bytes the result will require */
 		if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
 		} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
@@ -483,8 +544,8 @@ UTF8 * utf16toutf8_r (UTF8 *dst, int dst_len, const UTF16* source) {
 // This is not optimised as it's only used in get_key() as of now.
 // Non thread-safe version
 UTF8 * utf16toplain (const UTF16* source) {
-	static UTF8 ret_Key[PLAINTEXT_BUFFER_SIZE*3+1];
-	return utf16toplain_r(ret_Key, PLAINTEXT_BUFFER_SIZE*3, source);
+	static UTF8 ret_Key[PLAINTEXT_BUFFER_SIZE + 1];
+	return utf16toplain_r(ret_Key, PLAINTEXT_BUFFER_SIZE, source);
 }
 
 // Thread-safe version
-- 
1.7.4.1