[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Implementing the draft: first partial attempt (Was: New Internet Draft on registering IDNs)
On Wed, Apr 02, 2003 at 06:35:40PM -0500,
Martin Duerst <duerst@xxxxxx> wrote
a message of 690 lines which said:
> >The format of the table is meant to be machine-readable but not
> >human-readable. It is fairly trivial
>
> For some people, writing a C program or a perl script is
> 'fairly trivial'. For others, it's not. It is easy to change
> the format to make it even more trivial.
Most humans do not program; they use shrink-wrapped software. Here is
a small script which parses a table in the draft format and generates
the bundle. WARNING: strings in the RHS are not really supported yet.
Example of use:
~/AFNIC/IDN % ./gen-bundles.py bar
bar
bär
bâr
bàr
bår
bãr
bár
#!/usr/bin/python
import re
import string
import sys
import unicodedata
import getopt
# Regular expression source for one code point in the variant table syntax,
# e.g. "u+00E4".  Raw string: "\+" is not a valid escape in a plain literal.
# Users compile it with re.IGNORECASE, so "U+00e4" is accepted as well.
udigit = r"u\+[0-9A-F]+"
# Path of the variant table loaded when the script starts.
file = "variant-table"
# Encoding used to decode the names given by the user and to encode output.
locale = "latin-1" # TODO: find it from the environment, instead
# Set to a true value to print the parsed variant table before the bundles.
dump_table = 0
def usage():
    """Print a short command-line usage summary on standard output."""
    # A single parenthesised argument prints the same text whether
    # ``print`` is a statement (Python 2) or a function (Python 3).
    # The space before "name..." separates it from the program name.
    print("Usage: " + sys.argv[0] + " name...")
    print(" (You can also send the names on standard input)")
class Bundles:
    """The bundle of a label: its canonical form plus every variant spelling.

    The ``variants`` argument must expose two mappings:

      characters      -- any known character -> its base character
      base_characters -- base character -> list of the characters sharing
                         that base (the base character itself included)

    The resulting strings are kept in the ``list`` attribute and returned
    by get_list(), canonical label first.
    """

    def __init__(self, variants, label, canonical=1):
        """Compute the bundle of ``label``.

        variants  -- the variant tables (see the class documentation).
        label     -- Unicode string to expand.
        canonical -- true if ``label`` is already made of base characters
                     only; otherwise each character is first replaced by
                     its base character.

        Raises ValueError if ``canonical`` is false and ``label`` contains
        a character that is not in ``variants.characters``.
        """
        if not canonical:
            label = self._canonicalize(variants, label)
        self.list = [label]
        for i, character in enumerate(label):
            head = label[:i]
            tail = None
            for variant in variants.base_characters[character]:
                if variant == character:
                    continue
                # Position i is the first one to differ from the canonical
                # label; what follows may vary freely, hence the recursion.
                # The tail only depends on i, so it is computed once.
                if tail is None:
                    tail = Bundles(variants, label[i + 1:])
                self.append(head + variant, tail)

    def _canonicalize(self, variants, label):
        """Return ``label`` with each character replaced by its base one."""
        canonic = []
        for character in label:
            if character not in variants.characters:
                # A real exception class: string exceptions are rejected
                # by Python 2.6 and later.  The message stays Unicode text;
                # encoding it is left to whoever displays it.
                raise ValueError(
                    u"Invalid character \"%s\" (%s)"
                    % (character,
                       unicodedata.name(character, u"Unknown character")))
            canonic.append(variants.characters[character])
        return u"".join(canonic)

    def append(self, prefix, bundle):
        """Add ``prefix`` + s to this bundle for every string s of ``bundle``."""
        self.list.extend([prefix + suffix for suffix in bundle.get_list()])

    def get_list(self):
        """Return the list of the strings of the bundle."""
        # TODO: flatten before returning
        return self.list
class Variants:
    """Variant table loaded from a file in the draft's table format.

    Each significant line reads ``BASE`` or ``BASE|VARIANT[:VARIANT...]``
    where every character is written ``u+`` followed by hexadecimal digits
    (upper or lower case).  Lines whose first non-blank character is ``#``
    and empty lines are ignored.

    Attributes:
      characters      -- maps every character of the table (base or
                         variant) to its base character.
      base_characters -- maps every base character to the list of the
                         characters sharing that base (itself included).
    """

    def __init__(self, file):
        """Parse the variant table stored at path ``file``.

        Raises ValueError on a malformed line (the message holds the line
        number), and whatever open() raises if the file cannot be read.
        """
        # Per-instance tables: dictionaries defined on the class would be
        # shared by all the instances and accumulate their entries.
        self.characters = {}
        self.base_characters = {}
        # Compiled once here rather than once per line of the table.
        comment = re.compile(r"^\s*#")
        expr = re.compile("^(" + udigit + r")(\|(" + udigit + ":?)+)?$",
                          re.IGNORECASE)
        first_digit = re.compile("^(" + udigit + ")", re.IGNORECASE)
        # "with" closes the file even when a malformed line raises.
        with open(file, "r") as fh:
            for num, line in enumerate(fh, 1):
                if comment.match(line):
                    continue
                line = line.strip()
                if not line:
                    continue
                found = expr.match(line)
                if not found:
                    raise ValueError("Invalid " + str(num) + " line: " + line)
                base = self._character(found.group(1))
                self.characters[base] = base
                self.base_characters[base] = []
                if found.group(2):
                    # Skip the base character and the "|" following it.
                    self._read_variants(line[found.end(1) + 1:], base,
                                        first_digit, num)
        # Invert the character -> base mapping.
        for character in self.characters:
            self.base_characters[self.characters[character]].append(character)

    def _read_variants(self, rest, base, first_digit, num):
        """Register every code point of ``rest`` as a variant of ``base``.

        ``rest`` is the part of line ``num`` that follows the "|".  Code
        points are separated by ":"; code points written back to back
        (a string) are each registered as a separate variant, since
        strings are not really supported yet.
        """
        while True:
            found = first_digit.match(rest)
            if not found:
                # Reached, for instance, after a trailing ":".
                raise ValueError("Invalid " + str(num) + " line: " + rest)
            self.characters[self._character(found.group(1))] = base
            end = found.end(1)
            if end >= len(rest):
                return
            if rest[end] == ":":
                rest = rest[end + 1:]
            elif rest[end] in ("u", "U"):
                rest = rest[end:]
            else:
                raise ValueError(
                    "Invalid character sequence in line " + str(num))

    def _character(self, code_point):
        """Return the character written as ``code_point`` (e.g. "u+00E4")."""
        # Every digit after "u+" is used: the syntax does not limit code
        # points to four hexadecimal digits.
        value = self.unhex(code_point[2:])
        try:
            return unichr(value)  # Python 2
        except NameError:
            return chr(value)  # Python 3 has no unichr()

    def unhex(self, s):
        """Get the integer value of a hexadecimal number.

        The conversion stops at the first character which is not a
        hexadecimal digit; the value read so far (0 if none) is returned.
        """
        bits = 0
        for c in s:
            if '0' <= c <= '9':
                i = ord('0')
            elif 'a' <= c <= 'f':
                i = ord('a') - 10
            elif 'A' <= c <= 'F':
                i = ord('A') - 10
            else:
                break
            bits = bits * 16 + (ord(c) - i)
        return bits
# Load the variant table once; every name is expanded against it.
variants = Variants (file)

if dump_table:
    # One line per character of the table, with its base character.
    for character in variants.characters:
        print(" ".join([
            "UTF-8: " + character.encode ('utf-8'),
            ", Locale: " + character.encode(locale, 'replace'),
            " (" + unicodedata.name(character, "Unknown character") + ")",
            ". Base character: ",
            variants.characters[character].encode(locale, 'replace'),
        ]))


def _print_bundle(name):
    """Print each member of the bundle of ``name``, one per line."""
    # TODO: we should nameprep before sending to variants.bundle()
    # NOTE: unicode() is the Python 2 built-in; the name is lower-cased
    # as a byte string and then decoded with the configured encoding.
    bundle = Bundles(variants,
                     unicode(string.lower(name),
                             locale),
                     canonical = 0)
    for variant in bundle.get_list():
        print(variant.encode (locale, 'replace'))


if len(sys.argv) > 1:
    for word in sys.argv[1:]:
        _print_bundle(word)
else:
    # readline() keeps the processing line by line (no read-ahead), so
    # each name is answered as soon as it is typed.  It returns "" only
    # at end of file, which ends the loop.
    for line in iter(sys.stdin.readline, ""):
        # Strip the end-of-line only if it is there: a last line without
        # a newline keeps its final character.
        name = line.rstrip("\r\n")
        # A blank line is skipped instead of ending the input.
        if name:
            _print_bundle(name)