[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Implementing the draft: first partial attempt (Was: New Internet Draft on registering IDNs)



On Wed, Apr 02, 2003 at 06:35:40PM -0500,
 Martin Duerst <duerst@xxxxxx> wrote 
 a message of 690 lines which said:

> >The format of the table is meant to be machine-readable but not
> >human-readable. It is fairly trivial
> 
> For some people, writing a C program or a perl script is
> 'fairly trivial'. For others, it's not. It is easy to change
> the format to make it even more trivial.

Most humans do not program; they use shrink-wrapped software. Here is
a small script which parses a table in the draft format and generates
the bundle. WARNING: strings in the RHS are not really supported yet.

Example of use:

~/AFNIC/IDN % ./gen-bundles.py bar
bar
br
br
br
br
br
br




#!/usr/bin/python

import re
import string
import sys
import unicodedata
import getopt

# Regular-expression fragment matching one code point written as "u+"
# followed by hexadecimal digits (it is compiled with re.IGNORECASE where it
# is used).  A raw string keeps the "\+" escape intact without relying on
# Python passing unknown escape sequences through unchanged.
udigit = r"u\+[0-9A-F]+"
# Name of the file holding the variant table.  NOTE: this shadows the
# Python 2 builtin ``file``; the name is kept because the rest of the script
# refers to it.
file = "variant-table"
# Encoding used to decode the names given by the user and to encode output.
locale = "latin-1" # TODO: find it from the environment, instead
# Set to a true value to print the parsed table before processing names.
dump_table = 0

def usage():
    """Print a short usage summary on standard output."""
    # The leading space in " name..." separates the program name from the
    # argument placeholder (it used to be glued to it).
    # Single-argument parenthesised prints behave the same as statements.
    print("Usage: " + sys.argv[0] + " name...")
    print("   (You can also send the names on standard input)")

class Bundles:
    """All the labels obtained from one label by substituting variants.

    The bundle starts with the label itself (reduced to base characters
    when asked to) and then holds every label obtained by replacing one or
    more of its characters by one of their variants.
    """

    # Class-level default kept for backward compatibility; __init__ always
    # replaces it with a list owned by the instance.
    list = []

    def __init__ (self, variants, label, canonical=1):
        """Compute the bundle of *label*.

        variants  -- table exposing ``characters`` (maps every known
                     character to its base character) and
                     ``base_characters`` (maps every base character to the
                     list of its variants).
        label     -- Unicode string to expand.
        canonical -- true if *label* is already made of base characters;
                     when false, every character is first replaced by its
                     base character.

        Raises ValueError if *canonical* is false and *label* contains a
        character which is not in the table.
        """
        if not canonical:
            label = self._canonicalize(variants, label)
        self.list = [label]
        for i in range(len(label)):
            character = label[i]
            head = label[:i]
            # The bundle of the remaining characters is the same for every
            # variant at this position: build it at most once, and only if
            # there is a variant to substitute.
            tail = None
            for variant in variants.base_characters[character]:
                if variant == character:
                    continue
                if tail is None:
                    tail = Bundles(variants, label[i+1:])
                # Position i is the first one to differ from the label; the
                # tail bundle enumerates every combination after it.
                self.append(head + variant, tail)

    def _canonicalize(self, variants, label):
        """Return *label* with each character replaced by its base character."""
        canonic_label = u""
        for character in label:
            if character not in variants.characters:
                # A real exception class: string exceptions are rejected by
                # Python 2.6 and later.
                raise ValueError("Invalid character \"" +
                                 character.encode(locale, 'replace') + "\" (" +
                                 unicodedata.name(character,
                                                  "Unknown character") + ")")
            canonic_label = canonic_label + variants.characters[character]
        return canonic_label

    def append(self, prefix, bundle):
        """Add every label of *bundle*, preceded by *prefix*, to this bundle."""
        for suffix in bundle.get_list():
            self.list.append(prefix + suffix)

    def get_list (self):
        """Return the list of labels in this bundle (the label itself first)."""
        # TODO: flatten before returning
        return self.list
            
class Variants:
    """Variant table loaded from a file in the draft's format.

    Lines which are blank or start with "#" are ignored.  Every other line
    names a base character as "u+" followed by hexadecimal digits,
    optionally followed by "|" and its variants, written the same way and
    separated by ":".  Code points written back to back without a ":" are
    each recorded as a separate variant.

    Attributes:
    characters      -- maps every character of the table (base characters
                       included) to its base character.
    base_characters -- maps every base character to the list of the
                       characters which have it as base character.
    """

    # Class-level defaults kept for backward compatibility; __init__ gives
    # each instance its own dictionaries so that two tables never share
    # (and corrupt) each other's entries.
    characters = {}
    base_characters = {}

    def __init__(self, file):
        """Load the table stored in the file named *file*.

        Raises ValueError if a line does not follow the table format.
        """
        self.characters = {}
        self.base_characters = {}
        # Compiled once here instead of once per line.
        expr = re.compile (r"^(" + udigit + r")(\|(" + udigit + r":?)+)?$",
                           re.IGNORECASE)
        first_digit = re.compile (r"^(" + udigit + r")", re.IGNORECASE)
        fh = open (file, "r")
        try:
            num = 0
            for line in fh:
                num = num + 1
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                self._parse_line(line, num, expr, first_digit)
        finally:
            # Close the file even when a line is rejected.
            fh.close()
        for character, base in self.characters.items():
            self.base_characters[base].append (character)

    def _parse_line(self, line, num, expr, first_digit):
        """Record the base character and the variants found on one line."""
        found = expr.match (line)
        if not found:
            raise ValueError("Invalid " + str(num) + " line: " + line)
        ubase_character = self._to_character(found.group(1))
        self.characters[ubase_character] = ubase_character
        self.base_characters[ubase_character] = []
        if not found.group(2): # No variant
            return
        # Skip the base character and the "|" which follows it.
        line = line[found.end(1)+1:]
        while 1:
            found = first_digit.match (line)
            if not found:
                raise ValueError("Invalid " + str(num) + " line: " + line)
            self.characters[self._to_character(found.group(1))] = \
                ubase_character
            end = found.end(1)
            if end >= len(line):
                break
            if line[end] == ":":
                line = line[end+1:]
            elif line[end] == "u" or line[end] == "U":
                # Next code point of a sequence: no separator to skip.
                line = line[end:]
            else:
                raise ValueError("Invalid character sequence in line " +
                                 str(num))

    def _to_character(self, token):
        """Return the Unicode character named by a "u+XXXX" token."""
        # Use every digit after the "u+" prefix, not only the first four,
        # so that longer code points are not silently truncated.
        code_point = self.unhex(token[2:])
        try:
            return unichr (code_point)
        except NameError:
            # Interpreters without unichr() (Python 3) provide chr() instead.
            return chr (code_point)

    def unhex(self, s):
        """Get the integer value of a hexadecimal number.

        Conversion stops at the first character which is not a hexadecimal
        digit; the value of the digits read so far (0 if none) is returned.
        """
        bits = 0
        for c in s:
            if c not in string.hexdigits:
                break
            bits = bits*16 + int(c, 16)
        return bits

def _print_bundle(variants, name):
    """Print, one per line, every label of the bundle of *name*.

    *name* is a byte string in the ``locale`` encoding.
    """
    # TODO: we should nameprep before sending to variants.bundle()
    # Decode first, then lower-case: lower-casing the undecoded bytes does
    # not reliably fold non-ASCII capital letters.
    label = unicode(name, locale).lower()
    bundle = Bundles(variants, label, canonical = 0)
    for variant in bundle.get_list():
        print(variant.encode (locale, 'replace'))

variants = Variants (file)
if dump_table:
    for character in variants.characters.keys():
        # One line per character; the pieces are separated by single spaces.
        print(" ".join([
            "UTF-8: " + character.encode ('utf-8'),
            ", Locale: " + character.encode(locale, 'replace'),
            " (" + unicodedata.name(character, "Unknown character") + ")",
            ". Base character: ",
            variants.characters[character].encode(locale, 'replace')]))
if len(sys.argv) > 1:
    for word in sys.argv[1:]:
        _print_bundle(variants, word)
else:
    # readline() is called for each name so that answers are printed as soon
    # as a name is typed; the loop stops at end of file only.
    for line in iter(sys.stdin.readline, ""):
        # Remove the end-of-line only: a last line without a newline keeps
        # all its characters.
        name = line.rstrip("\r\n")
        if not name:
            # A blank line is skipped instead of ending the processing.
            continue
        _print_bundle(variants, name)