From 2befd94b268782a90ecbad7eb77a9207f6ec7ef0 Mon Sep 17 00:00:00 2001
From: magnum <magnum>
Date: Fri, 29 Jul 2011 09:46:07 +0200
Subject: [PATCH 13/14] cmpt_cp.pl

---
 run/cmpt_cp.pl |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 86 insertions(+), 0 deletions(-)
 create mode 100755 run/cmpt_cp.pl

diff --git a/run/cmpt_cp.pl b/run/cmpt_cp.pl
new file mode 100755
index 0000000..01eabfb
--- /dev/null
+++ b/run/cmpt_cp.pl
@@ -0,0 +1,86 @@
+#!/usr/bin/perl -w
+use strict;
+use Encode;
+
+############################################################################
+# code page data builder, by magnum / JimF.   v1.01
+# coded July 27-29, 2011, as a tool to build codepage encoding data needed
+# for John the Ripper code page conversions.  The data output from this file 
+# is made to be directly placed into the ./src/encoding_data.h file in john's
+# source tree.
+# USAGE:  cmpt_cp.pl CODEPAGE
+# cmpt_cp.pl run without any arguments will show a list of possible code pages.
+############################################################################
+
+my $enc;
+if (@ARGV==1) {$enc=$ARGV[0];}
+else {
+	print "Supported encodings:\n", join(", ", Encode->encodings(":all")), "\n\n";
+	exit(0);
+}
+
+my $to_unicode_high128=""; my $lower=""; my $upper=""; my $lowonly=""; my $uponly="";
+my $encu = uc($enc);my $hs = "";
+$encu =~ s/-/_/g;
+################################# 
+# first step, compute the unicode array
+################################# 
+foreach my $i (0x80..0xFF) {
+	my $u = chr($i);
+	Encode::from_to($u, $enc, "utf8");
+	$hs .= $u;
+	$to_unicode_high128 .= "0x" . sprintf "%04X", ord Encode::decode("UTF-8", $u);
+	if ($i % 16 == 15 && $i != 255) { $to_unicode_high128 .= ",\n"; }
+	elsif ($i != 255) { $to_unicode_high128 .= ","; }
+}
+print "\n// ", $hs, "\n";
+print "\n// here is the $encu to Unicode conversion for $encu characters from 0x80 to 0xFF\n";
+print "static UTF16 ".$encu."_to_unicode_high128[] = {\n";
+print $to_unicode_high128 . " };\n";
+
+################################# 
+# Now build upcase/downcase data.
+################################# 
+foreach my $i (0x80..0xFF) {
+	my $c = chr($i);
+	# converts $c into utf8, from $enc code page, and 'sets' the 'flag' in perl that $c IS a utf8 char.
+	$c = Encode::decode($enc, $c);
+	# upcase and low case the utf8 chars
+	my $clc = lc $c; my $ulc = uc $c;
+	# reconvert the utf8 char's back into $enc code page.
+	$clc = Encode::encode($enc, $clc); $ulc = Encode::encode($enc, $ulc);
+	if (chr($i) eq $clc && $clc ne $ulc) { 
+		if (chr($i) ne $clc && chr($i) ne $ulc) {
+			printf("// *** WARNING, char at ord(0x%X) U+%04X needs to be looked into. Neither conversion gets back to original value!\n",$i,ord($c));
+		} elsif ( ord($ulc) == 0x3F) { # 0x3F is ?  In perl, if there IS a unicode conversion for a case, but there is no 'reverse' case char in the charset, 
+			$lowonly .= sprintf("\\x%02X", ord($clc));
+		} elsif ( ord($clc) == 0x3F) { # then the final encode gives us a '?'. Thus, we know this is a character, and know it's case, but we can not 'convert' it to the other case.
+			$uponly .= sprintf("\\x%02X", ord($ulc));
+		} elsif ( ord($clc) < 0x80 || ord($ulc) < 0x80) {
+			printf("// *** WARNING, char at ord(0x%X) U+%04X needs to be looked into.  Likely multi-byte conversion\n",$i,ord($c));
+		} else {
+			$lower .= sprintf("\\x%02X", ord($clc));
+			$upper .= sprintf("\\x%02X", ord($ulc));
+		}
+	}
+}
+print "#define CHARS_LOWER_".$encu." "; if (length($lower)>80) {print"\\\n\t";} print "\"".$lower."\"\n"; 
+print "#define CHARS_UPPER_".$encu." "; if (length($upper)>80) {print"\\\n\t";} print "\"".$upper."\"\n";
+if (length($lowonly)>0) {
+	print "#define CHARS_LOW_ONLY_".$encu." "; if (length($lowonly)>80) {print"\\\n\t";} print "\"".$lowonly."\"\n";
+}
+if (length($uponly)>0) {
+	print "#define CHARS_UP_ONLY_".$encu." "; if (length($uponly)>80) {print"\\\n\t";} print "\"".$uponly."\"\n";
+}
+
+################################# 
+# Ok, provide a check to see if any of the characters UNDER 0x80
+# are non-standard.  At this time, there is no plan on HOW to handle
+# this within john.  The information is simply listed at this time.
+################################# 
+foreach my $i (0x00..0x7F) {
+	my $u = chr($i);
+	Encode::from_to($u, $enc, "utf8");
+	my $str = sprintf "%04X", ord Encode::decode("UTF-8", $u);
+	if ( hex($str) != $i) { printf("WARNING, low character %X maps into Unicode 0x%s\n", $i, $str);}
+}
-- 
1.7.4.1

