#!/usr/bin/perl -w # Program to translate between a wide variety of internationalized character # formats. Copyright 2002 Paul Hoffman. You may use this program freely # for any purpose. No warranties of correctness are implied or given. # Version 6.0 # Changes from 5.1 to 6.0: # Removed all old formats use Unicode::String qw(utf8 ucs4 utf16 uchr); # From CPAN/authors/id/GAAS/Unicode-String use Getopt::Long; use MIME::Base64; @Formats = ('utf8', 'utf16', 'ucs4', 'punycode'. 'name', 'u+'); $PunyBasics = '-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'; $PunyMap = 'abcdefghijklmnopqrstuvwxyz0123456789'; $UnicodeStdErr = '/usr/web/imc/idna/thestdunicodeerr'; $UnicodeStdDie = '/usr/web/imc/idna/thestdunicodedie'; # Get the command line arguments and check the formats unless(&GetOptions( 'infmt=s' => \$InFmt, 'outfmt=s' => \$OutFmt, 'in=s' => \$InFile, 'out=s' => \$OutFile, 'bulk' => \$Bulk, 'bulkinout' => \$BulkInOut, 'debug' => \$Debug, 'hexin' => \$Hexin, 'base64in' => \$Base64In, 'base64out' => \$Base64Out, 'errtmp' => \$ErrTmp, 'h|help|?' => \$Help, )) { die "Error getting the options: $!\n" } if(defined($Help)) { print << "EOF"; This program uses the following command-line options: -infmt Specifies the format of the input. If this is not given, it defaults to utf8. -outfmt Specifies the format of the output. If this is not given, it defaults to utf16. -in Names the file for input. If this is not given, the input comes from STDIN. -out Names the file for output. If this is not given, the output goes to STDOUT. -bulk Reads multiple lines from the input file -debug Sends debugging info to STDERR. -hexin Takes the input (in any format) as hex instead of binary -base64in Takes the input (in any format) as Base64 instead of binary -base64out Makes the output Base64 instead of binary -errtmp Writes out STDERR to $UnicodeStdErr -? or -h This info. The formats are: utf8 utf16 ucs4 punycode draft-ietf-idn-punycode-00.txt name The character names; output only u+ The character hex values in U+ notation; output only EOF exit; } if(defined($ErrTmp)) { open(REALERR, ">$UnicodeStdErr") or die "Could not write to $UnicodeStdErr\n"; open(REALDIE, ">$UnicodeStdDie") or die "Could not write to $UnicodeStdDie\n"; } else { open(REALERR, '>&STDERR'); } unless(defined($OutFile)) { open(REALOUT, '>-') } else { open(REALOUT, ">$OutFile") or die "Could not open $OutFile for writing: $!\n" } unless(defined($InFile)) { open(THEIN, '-') } else { open(THEIN, $InFile) or die "Could not open $InFile for reading: $!\n" } binmode(THEIN); binmode(REALOUT); $InFmt = lc($InFmt); $OutFmt = lc($OutFmt); # Set the defaults if($InFmt eq '') { $InFmt = 'utf8'}; if($OutFmt eq '') { $OutFmt = 'utf16'}; unless(grep(/$InFmt/, @Formats)) { die "Invalid input format: $InFmt\n" } if(($InFmt eq 'name') or ($InFmt eq 'u+')) { die "You cannot use $InFmt as an input format.\n" } unless(grep(/$OutFmt/, @Formats)) { die "Invalid output format: $OutFmt\n" } if($Hexin and $Base64In) { die "Cannot have both -hexin and -base64in\n" } # Get the input @TheInArr = (); unless(defined($Bulk)) { $InStuff = ''; while() { $InStuff .= $_ } @TheInArr = ($InStuff); } else { # It's a bulk read while() { push(@TheInArr, $_) } &DebugOut("Doing bulk input on ", $#TheInArr+1, " lines\n"); } $BulkLoopCount = 0; foreach $TheIn (@TheInArr) { $BulkLoopCount++; &DebugOut("Input format: $InFmt\nOutput format: $OutFmt\n"); &DebugOut("Length of input: ", length($TheIn), "\n"); $OrigTheIn = $TheIn; chomp($OrigTheIn); if($Hexin) { &DebugOut("Input to hexin:\n$TheIn\n"); $TheIn = lc($TheIn); $TheIn =~ s/[uU+\\]//g; # Swallow all the "u+"s @AllHexIn = split(/\s+/, $TheIn); $HexOutString = ''; foreach $HexPartString (@AllHexIn) { if(length($HexPartString) > 5) { &DieOut("This program does not yet handle characters " . "> U+FFFFF."); } $HexOutString .= uchr(hex("0x$HexPartString"))->utf16; } $TheIn = $HexOutString; } if($Base64In) { &DebugOut("Input to base64:\n$TheIn\n"); $Decoded64 = decode_base64($TheIn); $TheIn = $Decoded64; } &DebugOut("Hex of input to unicode-test:\n", &HexOut($TheIn)); # Convert the input into UCS4 if($InFmt eq 'ucs4') { $TheUCS4 = ucs4($TheIn)->ucs4 } elsif($InFmt eq 'utf8') { $TheUCS4 = utf8($TheIn)->ucs4; } elsif($InFmt eq 'utf16') { $TheUCS4 = utf16($TheIn)->ucs4; } elsif($InFmt eq 'punycode') { $TheUCS4 = &PunytoUCS4($TheIn) } else { die "Weird error: Couldn't start to translate input.\n" } if($OutFmt eq 'ucs4') { $TheOut = ucs4($TheUCS4)->ucs4 } elsif($OutFmt eq 'utf8') { $TheOut = ucs4($TheUCS4)->utf8 } elsif($OutFmt eq 'utf16') { $TheOut = ucs4($TheUCS4)->utf16 } elsif($OutFmt eq 'punycode') { $TheOut = &UCS4toPuny($TheUCS4) } elsif($OutFmt eq 'name') { $TheOut = &UCS4toName($TheUCS4) } elsif($OutFmt eq 'u+') { $TheOut = &UCS4toUPlus($TheUCS4) } else { die "Weird error: Couldn't finish the translation.\n" } if($Base64Out) { &DebugOut("Input to base64 output:\n$TheOut\n"); $Encoded64 = encode_base64($TheOut); $TheOut = $Encoded64; } &DebugOut("Length of output: ", length($TheOut), "\n"); &DebugOut("Hex of output:\n", &HexOut($TheOut)); if(defined($BulkInOut)) { print REALOUT "$OrigTheIn|" } print REALOUT $TheOut; if(defined($Bulk)) { unless(index('ucs4utf8utf16', $OutFmt) > -1) { print REALOUT "\n" } } } exit; sub UCS4toName { my $InString = shift(@_); my @TheNames = ucs4($InString)->name; my $NameString = join("\n", @TheNames) . "\n"; return $NameString; } sub UCS4toUPlus { my $InString = shift(@_); my $TheHex = ucs4($InString)->hex . "\n"; $TheHex =~ s/ /\n/g; $TheHex = uc($TheHex); return $TheHex; } sub UCS4toPuny { my $InString = shift(@_); if(length($InString) == 0) { &DieOut("Input to UCS4toPuny had zero length; aborting.\n") } $Base = 36; $TMin = 1; $TMax = 26; $Skew = 38; $Damp = 700; $InitialBias = 72; $InitialN = 0x80; my ($n, $delta, $bias, $b, $TCount, $ThisFour, @InArr, $ThisOrd, $h, $OutStr, $m, $q, $k, $t, $x, $MapLoc, $ThisIndex, @SortedArr ); # $m and $n are the ords of the strings in question &DebugOut("Hex of input to UCS4toPuny:\n", &HexOut($InString)); $n = $InitialN; $delta = 0; $bias = $InitialBias; $b = 0; @InArr = (); for($TCount = 0; $TCount < length($InString); $TCount += 4) { $ThisFour = substr($InString, $TCount, 4); push(@InArr, ucs4($ThisFour)->ord); } foreach $ThisOrd (@InArr) { if(index($PunyBasics, uchr($ThisOrd)->utf8) > -1) { &DebugOut("Found basic char with ord", $ThisOrd, "\n"); $b++; } } &DebugOut("After basics checking, b is $b\n"); $h = $b; if($b > 0) { foreach $ThisOrd (@InArr) { if(index($PunyBasics, uchr($ThisOrd)->utf8) > -1) { $OutStr .= uchr($ThisOrd)->utf8; } } $OutStr .= '-'; } else { $OutStr = '' } &DebugOut("After prefix check, OutStr is '$OutStr'\n"); @SortedArr = reverse sort SubNum @InArr; while($h < (length($InString) / 4 )) { &DebugOut("At top of loop1, h is $h, delta is $delta, ", " n is 0x", sprintf('%lX', $n), "\n"); $m = $SortedArr[0]; foreach $SortCheck (@SortedArr) { last unless($SortCheck >= $n); $m = $SortCheck; } &DebugOut("After looking for m, m is 0x", sprintf('%lX', $m), "\n"); $delta += ($m - $n) * ($h + 1); $n = $m; &DebugOut("Before loop2, delta is $delta\n"); foreach $x (@InArr) { if($x < $n) { $delta++ }; if($x == $n) { $q = $delta; for ($k = $Base; 1; $k += $Base) { $t = (($k <= $bias) ? $TMin : ((($k - $bias) > $TMax) ? $TMax : ($k - $bias))); &DebugOut("In loop3, k is $k, bias is $bias, q is $q, ", "t is $t\n"); last if($q < $t); $MapLoc = $t + (($q - $t) % ($Base - $t)); $OutStr .= substr($PunyMap, $MapLoc, 1); &DebugOut("In loop3, OutStr is now '$OutStr;\n"); $q = int(($q - $t) / ($Base - $t)); &DebugOut("At the end of loop3, q is $q\n"); } $OutStr .= substr($PunyMap, $q, 1); &DebugOut("In loop2, OutStr is now '$OutStr'\n"); $bias = &PunyAdapt($delta, ($h + 1), ($h == $b)); $delta = 0; $h++; } } $delta++; $n++; } return("$OutStr"); } sub PunytoUCS4 { my $InString = shift(@_); $Base = 36; $TMin = 1; $TMax = 26; $Skew = 38; $Damp = 700; $InitialBias = 72; $InitialN = uchr(0x0080)->utf16; my ($n, $i, $bias, $TCount, $LastHypehnPos, $OutString, @TestOutArr, $ThisChar, $OrdN, ); &DebugOut("Hex of input to PunytoUCS4:\n", &HexOut($InString)); # Strip any whitespace $InString =~ s/\s*//g; # Strip of the prefix string &DebugOut("The string after stripping in PunytoUCS4: $InString\n"); $n = utf16($InitialN)->ucs4; $i = 0; $bias = $InitialBias; $LastHyphenPos = rindex($InString, '-'); if($LastHyphenPos > -1) { $OutString = utf8(substr($InString, 0, $LastHyphenPos))->ucs4; for($TCount = 0; $TCount < length($OutString); $TCount += 4) { $ThisFour = substr($OutString, $TCount, 4); push(@TestOutArr, ucs4($ThisFour)->utf8); } foreach $ThisChar (@TestOutArr) { next if(index($PunyBasics, $ThisChar) > -1); &DieOut("Found a non-basic char in the first section of the " . "input to PunytoUCS4\n"); } $InString = substr($InString, $LastHyphenPos); } else { $OutString = ''; } if(length($OutString) > 0) { substr($InString, 0, 1, ''); } &DebugOut("After prefix check, InString is $InString\n"); unless($OutString eq '') { &DebugOut("After prefix check, hex of " . "OutString:\n", &HexOut($OutString)); } while(length($InString) > 0) { $oldi = $i; $w = 1; $k = $Base; for($k = $Base; 1; $k += $Base) { if(length($InString) == 0) { &DieOut("Got to unexpected end of " . "string in the loop in PunytoUCS4\n"); } $ThisChar = substr($InString, 0, 1, ''); $digit = index($PunyMap, lc($ThisChar)); if($digit == -1) { &DieOut("Found a non-valid character in the " . "lop in PunytoUCS4\n"); } $i = $i + ($digit * $w); # It should check for overflow of $i here, but not clear how # let t = k <= bias ? tmin : k - bias > tmax ? tmax : k - bias $t = ($k <= $bias ? $TMin : ($k - $bias > $TMax ? $TMax : ($k - $bias))); last if($digit < $t); $w = ($w * ($Base - $t)); # It should check for overflow of $w here, but not clear how &DebugOut("At end of inner loop, digit is $digit, i is $i,\n" . "t is $t, w is $w, InString is '$InString'\n"); } $bias = &PunyAdapt(($i - $oldi), ((length($OutString) / 4) + 1), ($oldi == 0)); &DebugOut("After PunyAdapt, bias is $bias, oldi is $oldi, w is $w\n", " k is $k, bias is $bias, i is $i, hex of n is ", sprintf('%lX', ucs4($n)->ord), "\n"); $OrdN = ucs4($n)->ord; $OrdN += int($i / ((length($OutString) / 4 ) + 1)); if($OrdN > 0x10FFFF) { &DieOut("OrdN exceeded maximum allowed " . "value.\n"); } $n = uchr($OrdN)->ucs4; $i = $i % ((length($OutString) / 4) + 1); substr($OutString, $i*4, 0) = $n; $i++; &DebugOut("At end of outer loop, i is $i, ord of n is ", sprintf('%lX', ord($n)), "\n"); &DebugOut("Hex of OutString:\n", &HexOut($OutString)); } return $OutString; } sub PunyAdapt { my $Delta = shift(@_); my $NumPoints = shift(@_); my $FirstTime = shift(@_); my ($K); $Delta = int($Delta / ( $FirstTime ? $Damp : 2 )); $Delta += int($Delta / $NumPoints); $K = 0; while($Delta > int((($Base - $TMin) * $TMax) / 2)) { $Delta = int($Delta / ($Base - $TMin)); $K = $K + $Base; } return ($K + int((($Base - $TMin + 1) * $Delta) / ($Delta + $Skew))); } sub Base32Encode { my($ToEncode) = shift(@_); my ($i, $OutString, $CompBits, $FivePos, $FiveBitsString, $FiveIndex); &DebugOut("Hex of input to Base32Encode:\n", &HexOut($ToEncode)); # Turn the compressed string into a string that represents the bits as # 0 and 1. This is wasteful of space but easy to read and debug. $CompBits = ''; foreach $i (split(//, $ToEncode)) { $CompBits .= unpack("B8", $i) }; # Pad the value with enough 0's to make it a multiple of 5 if((length($CompBits) % 5) != 0) { $CompBits .= '0' x (5 - (length($CompBits) % 5)) }; # Step 1a &DebugOut("The compressed bits in Base32Encode after padding:\n" . "$CompBits\n"); $FivePos = 0; # Step 1b do { $FiveBitsString = substr($CompBits, $FivePos, 5); # Step 2 $FiveIndex = unpack("N", pack("B32", ('0' x 27) . $FiveBitsString)); $OutString .= substr($Base32Chars, $FiveIndex, 1); # Step 3 $FivePos += 5; # Step 4a } until($FivePos == length($CompBits)); # Step 4b &DebugOut("Output of Base32Encode:\n$OutString\n"); return $OutString; } sub Base32Decode { my ($ToDecode) = shift(@_); my ($InputCheck, $OutString, $DeCompBits, $DeCompIndex, @DeArr, $i, $PaddingLen, $PaddingContent); &DebugOut("Hex of input to Base32Decode:\n", &HexOut($ToDecode)); $InputCheck = length($ToDecode) % 8; # Step 1 if(($InputCheck == 1) or ($InputCheck == 3) or ($InputCheck == 6)) { &DieOut("Input to Base32Decode was a bad mod length: $InputCheck\n") } # $DeCompBits is a string that represents the bits as # 0 and 1. This is wasteful of space but easy to read and debug. $DeCompBits = ''; foreach $InChar (split(//, $ToDecode)) { if(index($Base32Chars, $InChar) == -1) # Character not in base set { &DieOut("Input to Base32Decode had a bad character: $InChar\n")}; $DeCompIndex = pack("N", index($Base32Chars, $InChar)); $DeCompBits .= substr(unpack("B32", $DeCompIndex), 27); } &DebugOut("The decompressed bits in Base32Decode:\n$DeCompBits\n"); &DebugOut("The number of bits in Base32Decode: " , length($DeCompBits), "\n"); # Step 5 $Padding = length($DeCompBits) % 8; $PaddingContent = substr($DeCompBits, (length($DeCompBits) - $Padding)); &DebugOut("The padding check in Base32Decode is \"$PaddingContent\"\n"); unless(index($PaddingContent, '1') == -1) { &DieOut("Found non-zero padding in Base32Decode\n") } # Break the decompressed string into octets for returning @DeArr = (); for($i = 0; $i < int(length($DeCompBits) / 8); $i++) { $DeArr[$i] = chr(unpack("N", pack("B32", ('0' x 24) . substr($DeCompBits, $i * 8, 8)))); } $OutString = join('', @DeArr); &DebugOut("Hex of the decompressed array:\n", &HexOut("$OutString")); return $OutString; } sub HexOut { my $AllInStr = shift(@_); my($HexIn, $HexOut, @AllOrd, $i, $j, $k, $OutReg, $SpOut); my($OctetIn, $LineCount); my $OutString = ''; @AllOrd = split(//, $AllInStr); $HexIn[23] = ''; while(@AllOrd) { for($i = 0; $i < 24; $i++) { $OctetIn[$i] = shift(@AllOrd); if(defined($OctetIn[$i])) { $HexIn[$i] = sprintf('%2.2x', ord($OctetIn[$i])); $LineCount = $i; } } for($j = 0; $j <= $LineCount; $j++ ) { $HexOut .= $HexIn[$j]; if(($j % 4) == 3) { $HexOut .= ' ' } if((ord($OctetIn[$j]) < 20) or (ord($OctetIn[$j]) > 126)) { $OutReg .= '.' } else { $OutReg .= $OctetIn[$j] } } for ($k=length($HexOut); $k < 56; $k++) { $SpOut .= ' ' } $OutString .= "$HexOut$SpOut$OutReg\n" ; $HexOut = ''; $OutReg = ''; $SpOut = ''; } return $OutString; } sub DebugOut { # Print out an error string if $Debug is set my $DebugTemp = join('', @_); if($Debug) { print REALERR $DebugTemp; } } sub DieOut { my $DieTemp = join('', @_); if(defined($Bulk)) { $DieTemp .= "Died at input line $BulkLoopCount\n" } print REALERR $DieTemp; if(defined($ErrTmp)) { print REALDIE $DieTemp; } close(REALDIE); close(REALERR); exit; } sub SubNum { $a <=> $b };