From 109d08086a7756642f898e7b5fcd79063079cb91 Mon Sep 17 00:00:00 2001
From: magnum <magnum>
Date: Fri, 23 Sep 2011 05:17:07 +0200
Subject: [PATCH 4/4] j7: AlwaysReportUTF8 also affects --show
     plus better column alignment for hashes cracked while running

---
 src/john.c    |    4 -
 src/loader.c  |   18 +++++-
 src/logger.c  |   15 ++++-
 src/options.c |   80 ------------------------
 src/unicode.c |  187 ++++++++++++++++++++++++++++++++++++++++++++++----------
 src/unicode.h |    6 ++-
 6 files changed, 187 insertions(+), 123 deletions(-)
diff --git a/src/john.c b/src/john.c
index 0705207..84e8388 100644
--- a/src/john.c
+++ b/src/john.c
@@ -329,10 +329,6 @@ static void john_load(void)
 				database.format->params.algorithm_name);
 		}
 
-		options.log_passwords = cfg_get_bool(SECTION_OPTIONS,
-		    NULL, "LogCrackedPasswords", 0);
-		options.report_utf8 = cfg_get_bool(SECTION_OPTIONS,
-		    NULL, "AlwaysReportUTF8", 0);
 		if (database.password_count) {
 			if (database.format->params.flags & FMT_UNICODE)
 				options.store_utf8 = cfg_get_bool(SECTION_OPTIONS,
diff --git a/src/loader.c b/src/loader.c
index b963e93..eb9677b 100644
--- a/src/loader.c
+++ b/src/loader.c
@@ -649,8 +649,14 @@ static void ldr_remove_marked(struct db_main *db)
 				}
 			} else {
 				last_pw = current_pw;
-				if (db->options->showuncracked)
-					printf("%s%c%s\n",current_pw->login,db->options->field_sep_char,current_pw->source);
+				if (db->options->showuncracked) {
+					if (!options.utf8 && options.report_utf8) {
+						UTF8 utf8login[PLAINTEXT_BUFFER_SIZE + 1];
+						enc_to_utf8_r(current_pw->login, utf8login);
+						printf("%s%c%s\n",utf8login,db->options->field_sep_char,current_pw->source);
+					} else
+						printf("%s%c%s\n",current_pw->login,db->options->field_sep_char,current_pw->source);
+				}
 			}
 		} while ((current_pw = current_pw->next));
 
@@ -932,6 +938,8 @@ static void ldr_show_pw_line(struct db_main *db, char *line)
 	int pass, found, chars;
 	int hash;
 	struct db_cracked *current;
+	UTF8 utf8login[LINE_BUFFER_SIZE];
+	UTF8 utf8source[LINE_BUFFER_SIZE];
 
 	format = NULL;
 	count = ldr_split_line(&login, &ciphertext, &gecos, &home,
@@ -943,6 +951,12 @@ static void ldr_show_pw_line(struct db_main *db, char *line)
 
 	show = !(db->options->flags & DB_PLAINTEXTS);
 
+	if (!options.utf8 && options.report_utf8) {
+		login = (char*)enc_to_utf8_r(login, utf8login);
+		enc_to_utf8_r(source, utf8source);
+		strnzcpy(source, (char*)utf8source, sizeof(source));
+	}
+
 	if (format) {
 		split = format->methods.split;
 		unify = format->params.flags & FMT_SPLIT_UNIFIES_CASE;
diff --git a/src/logger.c b/src/logger.c
index 2af1dd4..86ed40c 100644
--- a/src/logger.c
+++ b/src/logger.c
@@ -37,6 +37,7 @@
 #include "status.h"
 #include "config.h"
 #include "options.h"
+#include "unicode.h"
 #ifdef HAVE_MPI
 #include "john-mpi.h"
 #endif
@@ -179,14 +180,24 @@ void log_init(char *log_name, char *pot_name, char *session)
 void log_guess(char *login, char *ciphertext, char *rep_plain, char *store_plain, char field_sep)
 {
 	int count1, count2;
+	int len;
+	char spacer[] = "                ";
+
+	// Pad by hand: printf("%-16s") pads by octets, so multibyte UTF-8
+	// output does not line up. We need to count characters, not octets.
+	if (options.utf8 || (!options.utf8 && options.report_utf8))
+		len = strlen8((UTF8*)rep_plain);
+	else
+		len = strlen(rep_plain);
+	spacer[len > 16 ? 0 : 16 - len] = 0;
 
 #ifdef HAVE_MPI
 	// All but node 0 has stdout closed so we output to stderr
 	if (mpi_p > 1)
-		fprintf(stderr, "%-16s (%s)\n", rep_plain, login);
+		fprintf(stderr, "%s%s (%s)\n", rep_plain, spacer, login);
 	else
 #endif
-	printf("%-16s (%s)\n", rep_plain, login);
+		printf("%s%s (%s)\n", rep_plain, spacer, login);
 
 	in_logger = 1;
 
diff --git a/src/options.c b/src/options.c
index 910b4db..11469af 100644
--- a/src/options.c
+++ b/src/options.c
@@ -268,86 +268,6 @@ void opt_init(char *name, int argc, char **argv)
 		return;
 	}
 
-	options.utf8 = options.iso8859_1 = options.iso8859_7 =
-	  options.iso8859_15 = options.koi8_r = options.cp437 =
-	  options.cp737 = options.cp850 = options.cp858 = options.cp866 =
-	  options.cp1251 = options.cp1252 = options.cp1253 = 0;
-	// by 'default' we are setup in 7 bit ascii mode (for rules).
-	options.ascii = 1;
-	options.encodingStr = "";
-	if ( (options.flags & FLG_INP_ENCODING) && options.encoding) {
-		// Ok, check a 'few' valid things for utf8
-		options.ascii = 0;
-		if (!strcasecmp(options.encoding, "utf8")||!strcasecmp(options.encoding, "utf-8")) {
-			options.utf8 = 1;
-			options.encodingStr = "UTF-8";
-		} else
-			if (!strcasecmp(options.encoding, "ansi")||!strcasecmp(options.encoding, "iso-8859-1")||!strcasecmp(options.encoding, "8859-1")||!strcasecmp(options.encoding, "iso8859-1")) {
-			options.iso8859_1 = 1;
-			options.encodingStr = "ISO-8859-1";
-		} else
-		if (!strcasecmp(options.encoding, "iso-8859-7")||!strcasecmp(options.encoding, "8859-7")||!strcasecmp(options.encoding, "iso8859-7")) {
-			options.iso8859_7 = 1;
-			options.encodingStr = "ISO-8859-7";
-		} else
-		if (!strcasecmp(options.encoding, "iso-8859-15")||!strcasecmp(options.encoding, "8859-15")||!strcasecmp(options.encoding, "iso8859-15")) {
-			options.iso8859_15 = 1;
-			options.encodingStr = "ISO-8859-15";
-		} else
-		if (!strcasecmp(options.encoding, "koi8-r")||!strcasecmp(options.encoding, "koi8r")) {
-			options.koi8_r = 1;
-			options.encodingStr = "KOI8-R";
-		} else
-		if (!strcasecmp(options.encoding, "cp437")||!strcasecmp(options.encoding, "cp-437")) {
-			options.cp437 = 1;
-			options.encodingStr = "CP437";
-		} else
-		if (!strcasecmp(options.encoding, "cp737")||!strcasecmp(options.encoding, "cp-737")) {
-			options.cp737 = 1;
-			options.encodingStr = "CP737";
-		} else
-		if (!strcasecmp(options.encoding, "cp850")||!strcasecmp(options.encoding, "cp-850")) {
-			options.cp850 = 1;
-			options.encodingStr = "CP850";
-		} else
-		if (!strcasecmp(options.encoding, "cp858")||!strcasecmp(options.encoding, "cp-858")) {
-			options.cp858 = 1;
-			options.encodingStr = "CP858";
-		} else
-		if (!strcasecmp(options.encoding, "cp866")||!strcasecmp(options.encoding, "cp-866")) {
-			options.cp866 = 1;
-			options.encodingStr = "CP866";
-		} else
-		if (!strcasecmp(options.encoding, "cp1251")||!strcasecmp(options.encoding, "cp-1251")) {
-			options.cp1251 = 1;
-			options.encodingStr = "CP1251";
-		} else
-		if (!strcasecmp(options.encoding, "cp1252")||!strcasecmp(options.encoding, "cp-1252")) {
-			options.cp1252 = 1;
-			options.encodingStr = "CP1252";
-		} else
-		if (!strcasecmp(options.encoding, "cp1253")||!strcasecmp(options.encoding, "cp-1253")) {
-			options.cp1253 = 1;
-			options.encodingStr = "CP1253";
-		} else
-		if (strcasecmp(options.encoding, "raw") && strcasecmp(options.encoding, "ascii") && strcasecmp(options.encoding, "default")) {
-			fprintf (stderr, "Supported encodings within john are: raw, utf-8, iso-8859-1 (or ansi)"
-					",\niso-8859-7"
-					", iso-8859-15"
-					", koi8-r"
-					", cp437"
-					", cp737"
-					", cp850"
-					", cp858"
-					", cp866"
-					",\ncp1251"
-					", cp1252"
-					", cp1253"
-					"\n");
-			error();
-		}
-	}
-
 	if (options.subformat && !strcasecmp(options.subformat, "list"))
 	{
 		md5_gen_DISPLAY_ALL_FORMATS();
diff --git a/src/unicode.c b/src/unicode.c
index 6dd3213..dadcf96 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -72,9 +72,40 @@
 
 ------------------------------------------------------------------------ */
 
+#include <string.h>
+
+#include "common.h"
+#include "arch.h"
+#include "byteorder.h"
 #include "unicode.h"
 #include "UnicodeData.h"
 #include "encoding_data.h"
+#include "misc.h"
+#include "config.h"
+#include "md4.h"
+#if !defined (NOT_JOHN)
+#include "options.h"
+#else
+struct opts { int flags; };
+#define FLG_UTF8 1
+struct opts options;
+#endif
+
+#if !defined(uint16) && !defined(HAVE_UINT16_FROM_RPC_RPC_H)
+#if (SIZEOF_SHORT == 4)
+#define uint16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
+#else /* SIZEOF_SHORT != 4 */
+#define uint16 unsigned short
+#endif /* SIZEOF_SHORT != 4 */
+#endif
+
+#if !defined(int16) && !defined(HAVE_INT16_FROM_RPC_RPC_H)
+#if (SIZEOF_SHORT == 4)
+#define int16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
+#else /* SIZEOF_SHORT != 4 */
+#define int16 short
+#endif /* SIZEOF_SHORT != 4 */
+#endif
 
 UTF16 ucs2_upcase[0x10000];
 UTF16 ucs2_downcase[0x10000];
@@ -101,35 +132,6 @@ static int UnicodeType = -1;
 UTF8 CP_isLetter[0x100];
 UTF8 CP_isSeparator[0x100];
 
-#include "md4.h"
-#if !defined(uint16) && !defined(HAVE_UINT16_FROM_RPC_RPC_H)
-#if (SIZEOF_SHORT == 4)
-#define uint16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
-#else /* SIZEOF_SHORT != 4 */
-#define uint16 unsigned short
-#endif /* SIZEOF_SHORT != 4 */
-#endif
-
-#if !defined(int16) && !defined(HAVE_INT16_FROM_RPC_RPC_H)
-#if (SIZEOF_SHORT == 4)
-#define int16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
-#else /* SIZEOF_SHORT != 4 */
-#define int16 short
-#endif /* SIZEOF_SHORT != 4 */
-#endif
-
-#include <string.h>
-#include "common.h"
-#include "arch.h"
-#include "byteorder.h"
-#if !defined (NOT_JOHN)
-#include "options.h"
-#else
-struct opts { int flags; };
-#define FLG_UTF8 1
-struct opts options;
-#endif
-
 #if ARCH_LITTLE_ENDIAN
 #define BE_FIX(a) a
 #else
@@ -243,7 +245,6 @@ inline int utf8_to_utf16(UTF16 *target, unsigned int len, const UTF8 *source,
 		case 1:
 			ch <<= 6;
 			ch += *++source;
-		case 0:
 			++source;
 			break;
 		default:
@@ -345,7 +346,6 @@ int utf8_to_utf16_be(UTF16 *target, unsigned int len, const UTF8 *source,
 		case 1:
 			ch <<= 6;
 			ch += *++source;
-		case 0:
 			++source;
 			break;
 		default:
@@ -463,7 +463,8 @@ int enc_to_utf16_be(UTF16 *dst, unsigned int maxdstlen, const UTF8 *src,
 	}
 }
 
-// strlen of UTF16 (in characters, not bytes)
+// Strlen of UTF-16 (in 16-bit words, not octets)
+// Characters > U+FFFF are two 16-bit words
 inline unsigned int strlen16(const UTF16 *str)
 {
 	unsigned int len = 0;
@@ -472,6 +473,39 @@ inline unsigned int strlen16(const UTF16 *str)
 	return len;
 }
 
+// Length of a UTF-8 string in characters rather than octets.
+// Bad input may make the count stop early (a "truncated" length).
+inline unsigned int strlen8(const UTF8 *source)
+{
+	const UTF8 *end = source + strlen((char*)source);
+	unsigned int trail;
+	int count = 0;
+
+	while (source < end) {
+		if (*source >= 0xC0) {
+			// Lead octet. ConvertUTF.c uses a larger (slower)
+			// table that includes zeros; this compact one must
+			// only be indexed for octets >= 0xC0.
+			trail = opt_trailingBytesUTF8[*source & 0x3f];
+			// Bad data: too many trailing octets, or cut short
+			if (trail > 3 || source + trail >= end)
+				return count;
+			source += trail + 1;
+			count++;
+			if (*source == 0 || source >= end)
+				break;
+		} else {
+			// Single octet (stray continuation octets count too)
+			source++;
+			count++;
+			if (*source == 0)
+				break;
+		}
+	}
+
+	return count;
+}
+
 /*
  * Creates an MD4 Hash of the user's password in NT UNICODE.
  * This version honours the --encoding=utf8 flag and makes a couple
@@ -639,12 +673,12 @@ UTF8 *utf16_to_utf8_r (UTF8 *dst, int dst_len, const UTF16 *source) {
 					ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 						+ (ch2 - UNI_SUR_LOW_START) + halfBase;
 					++source;
-				} else { /* it's an unpaired high surrogate */
+//				} else { /* it's an unpaired high surrogate */
 //					--source; /* return to the illegal value itself */
 //					fprintf(stderr, "warning, utf16toutf8 failed (illegal) - this is a bug in JtR\n");
 //					break;
 				}
-			} else { /* We don't have the 16 bits following the high surrogate. */
+//			} else { /* We don't have the 16 bits following the high surrogate. */
 //				--source; /* return to the high surrogate */
 //				fprintf(stderr, "warning, utf16toutf8 failed (no surrogate) - this is a bug in JtR\n");
 //				break;
@@ -730,6 +764,91 @@ int initUnicode(int type) {
 
 	UnicodeType = type;
 
+	options.utf8 = options.iso8859_1 = options.iso8859_7 =
+	  options.iso8859_15 = options.koi8_r = options.cp437 =
+	  options.cp737 = options.cp850 = options.cp858 = options.cp866 =
+	  options.cp1251 = options.cp1252 = options.cp1253 = 0;
+	// by 'default' we are setup in 7 bit ascii mode (for rules).
+	options.ascii = 1;
+	options.encodingStr = "";
+	if ( (options.flags & FLG_INP_ENCODING) && options.encoding) {
+		// Ok, check a 'few' valid things for utf8
+		options.ascii = 0;
+		if (!strcasecmp(options.encoding, "utf8")||!strcasecmp(options.encoding, "utf-8")) {
+			options.utf8 = 1;
+			options.encodingStr = "UTF-8";
+		} else
+			if (!strcasecmp(options.encoding, "ansi")||!strcasecmp(options.encoding, "iso-8859-1")||!strcasecmp(options.encoding, "8859-1")||!strcasecmp(options.encoding, "iso8859-1")) {
+			options.iso8859_1 = 1;
+			options.encodingStr = "ISO-8859-1";
+		} else
+		if (!strcasecmp(options.encoding, "iso-8859-7")||!strcasecmp(options.encoding, "8859-7")||!strcasecmp(options.encoding, "iso8859-7")) {
+			options.iso8859_7 = 1;
+			options.encodingStr = "ISO-8859-7";
+		} else
+		if (!strcasecmp(options.encoding, "iso-8859-15")||!strcasecmp(options.encoding, "8859-15")||!strcasecmp(options.encoding, "iso8859-15")) {
+			options.iso8859_15 = 1;
+			options.encodingStr = "ISO-8859-15";
+		} else
+		if (!strcasecmp(options.encoding, "koi8-r")||!strcasecmp(options.encoding, "koi8r")) {
+			options.koi8_r = 1;
+			options.encodingStr = "KOI8-R";
+		} else
+		if (!strcasecmp(options.encoding, "cp437")||!strcasecmp(options.encoding, "cp-437")) {
+			options.cp437 = 1;
+			options.encodingStr = "CP437";
+		} else
+		if (!strcasecmp(options.encoding, "cp737")||!strcasecmp(options.encoding, "cp-737")) {
+			options.cp737 = 1;
+			options.encodingStr = "CP737";
+		} else
+		if (!strcasecmp(options.encoding, "cp850")||!strcasecmp(options.encoding, "cp-850")) {
+			options.cp850 = 1;
+			options.encodingStr = "CP850";
+		} else
+		if (!strcasecmp(options.encoding, "cp858")||!strcasecmp(options.encoding, "cp-858")) {
+			options.cp858 = 1;
+			options.encodingStr = "CP858";
+		} else
+		if (!strcasecmp(options.encoding, "cp866")||!strcasecmp(options.encoding, "cp-866")) {
+			options.cp866 = 1;
+			options.encodingStr = "CP866";
+		} else
+		if (!strcasecmp(options.encoding, "cp1251")||!strcasecmp(options.encoding, "cp-1251")) {
+			options.cp1251 = 1;
+			options.encodingStr = "CP1251";
+		} else
+		if (!strcasecmp(options.encoding, "cp1252")||!strcasecmp(options.encoding, "cp-1252")) {
+			options.cp1252 = 1;
+			options.encodingStr = "CP1252";
+		} else
+		if (!strcasecmp(options.encoding, "cp1253")||!strcasecmp(options.encoding, "cp-1253")) {
+			options.cp1253 = 1;
+			options.encodingStr = "CP1253";
+		} else
+		if (strcasecmp(options.encoding, "raw") && strcasecmp(options.encoding, "ascii") && strcasecmp(options.encoding, "default")) {
+			fprintf (stderr, "Supported encodings within john are: raw, utf-8, iso-8859-1 (or ansi)"
+					",\niso-8859-7"
+					", iso-8859-15"
+					", koi8-r"
+					", cp437"
+					", cp737"
+					", cp850"
+					", cp858"
+					", cp866"
+					",\ncp1251"
+					", cp1252"
+					", cp1253"
+					"\n");
+			error();
+		}
+	}
+
+	options.log_passwords = cfg_get_bool(SECTION_OPTIONS,
+	    NULL, "LogCrackedPasswords", 0);
+	options.report_utf8 = cfg_get_bool(SECTION_OPTIONS,
+	    NULL, "AlwaysReportUTF8", 0);
+
 	memset(ucs2_upcase, 0, sizeof(ucs2_upcase));
 	memset(ucs2_downcase, 0, sizeof(ucs2_downcase));
 
diff --git a/src/unicode.h b/src/unicode.h
index bebab99..171c8ca 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -88,10 +88,14 @@ extern UTF8 * utf16_to_enc_r (UTF8 *dst, int dst_len, const UTF16* source);
 
 /* These were in smbencrypt.c before: */
 
-/* Return length (in characters) of a UTF16 string */
+/* Return length (in 16-bit words, not characters) of a UTF-16 string */
 /* Number of octets is the result * sizeof(UTF16)  */
 extern unsigned int strlen16(const UTF16 * str);
 
+/* Return length (in characters) of a UTF-8 string */
+/* Will return a "truncated" length if fed with invalid data. */
+extern unsigned int strlen8(const UTF8 *source);
+
 /* Create an NT hash from a ISO-8859 or UTF-8 string (--encoding= aware) */
 extern int E_md4hash(const UTF8 * passwd, unsigned int len, unsigned char *p16);
 
-- 
1.7.4.1