#!/usr/bin/env python3
"""Generate utf8_gen.c: Unicode classification tables and lookup functions.

Reads data/UnicodeData.txt and writes C source that contains, for each
character class (space, lower, upper, title, letter, digit):

* a sorted table of inclusive code point ranges (``<name>_rtab``),
* a sorted table of isolated code points (``<name>_stab``),
* a predicate that binary-searches those tables with ``rbsearch``.
"""

# C helper written verbatim at the top of the generated file.  It binary
# searches a sorted table ``t`` of ``n`` entries, each ``nelem`` Runes wide,
# and returns the last entry whose first Rune is <= ``c`` (or 0 if none).
preamble = """
Rune*
rbsearch(Rune c, Rune *t, int n, int nelem)
{
	Rune *p;
	int m;

	while (n > 1) {
		m = n >> 1;
		p = t + m*nelem;
		if (c >= p[0]) {
			t = p;
			n -= m;
		} else
			n = m;
	}
	if (n && c >= t[0])
		return t;
	else
		return 0;
}
"""

# Largest valid Unicode code point.
RUNEMAX = 0x10FFFF

# Code points treated as spaces even though UnicodeData.txt classifies them
# as control/format characters rather than separators (category Z*):
# TAB, LF, VT, FF, CR, NEL and U+FEFF.
EXTRA_SPACES = (0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0xFEFF)


def findrange(s):
    """Split the integers in *s* into consecutive runs and isolated values.

    Args:
        s: iterable of ints (code points); order and duplicates-free-ness
            are not required of the caller beyond what ``sorted`` needs.

    Returns:
        A ``(ranges, singles)`` tuple.  ``ranges`` is a list of inclusive
        ``(first, last)`` tuples, one per run of two or more consecutive
        values; ``singles`` is a list of the values that belong to no run.
        Both lists are in ascending order.
    """
    ranges = []
    singles = []

    def flush(first, last):
        if first == last:
            singles.append(first)
        else:
            ranges.append((first, last))

    first = last = None
    for elt in sorted(s):
        if first is None:
            first = last = elt
        elif elt == last + 1:
            last = elt
        else:
            flush(first, last)
            first = last = elt

    # The run still open when the input ends must be emitted too; without
    # this the highest code points of every table would be silently lost.
    if first is not None:
        flush(first, last)
    return ranges, singles


def puttab(s, fd, name):
    """Write the C tables and membership predicate for one character class.

    Args:
        s: iterable of int code points belonging to the class.
        fd: text file-like object with a ``write`` method.
        name: predicate suffix, e.g. ``"IsSpace"``.  The lower-cased form
            prefixes the table names; the function is emitted as
            ``utf8`` + U+00B7 MIDDLE DOT + *name*.

    The generated code relies on ``rbsearch`` (see ``preamble``) and on an
    ``arrlen`` macro and ``Rune`` type that this script does not emit.
    """
    lname = name.lower()
    rs, ss = findrange(s)
    rtabn = "%s_rtab" % lname
    stabn = "%s_stab" % lname

    # Tables are only emitted when non-empty: an empty ``{}`` initializer is
    # not valid ISO C before C23.  findrange already returns sorted lists.
    if rs:
        fd.write("static Rune %s[] = {" % rtabn)
        for first, last in rs:
            fd.write("0x%04x, 0x%04x,\n" % (first, last))
        fd.write("};\n\n")

    if ss:
        fd.write("static Rune %s[] = {" % stabn)
        for i, elt in enumerate(ss):
            # Two singletons per output line.
            if i % 2 == 0:
                fd.write("\n")
            fd.write("0x%04x, " % elt)
        fd.write("\n};\n\n")

    fd.write("int\n")
    # \u00b7 is MIDDLE DOT; written as an escape so the identifier survives
    # any re-encoding of this script.
    fd.write("utf8\u00b7%s(Rune c)\n" % name)
    fd.write("{\n")
    fd.write("\tRune *p;\n\n")
    if rs:
        # Range table entries are pairs, hence arrlen/2 entries of width 2.
        fd.write("\tp = rbsearch(c, %s, arrlen(%s)/2, 2);\n" % (rtabn, rtabn))
        fd.write("\tif (p && c >= p[0] && c <= p[1])\n")
        fd.write("\t\t\treturn 1;\n")
    if ss:
        fd.write("\tp = rbsearch(c, %s, arrlen(%s), 1);\n" % (stabn, stabn))
        fd.write("\tif (p && c == p[0])\n")
        fd.write("\t\treturn 1;\n")
    fd.write("\n\treturn 0;\n")
    fd.write("}\n\n")


def readunicodedata(lines):
    """Classify the code points described by UnicodeData.txt records.

    Args:
        lines: iterable of text lines in UnicodeData.txt format
            (semicolon-separated, 15 fields); trailing newlines are
            optional and blank lines are skipped.

    Returns:
        A dict with set-valued keys ``"spaces"`` (category Z*),
        ``"alphas"`` (L*), ``"lowers"`` (Ll), ``"uppers"`` (Lu),
        ``"titles"`` (Lt) and ``"digits"`` (Nd), plus dict-valued keys
        ``"toupper"``, ``"tolower"`` and ``"totitle"`` mapping a letter to
        its simple case mapping (fields 12-14; recorded for letters only).

    Raises:
        ValueError: if a record's code point exceeds ``RUNEMAX``.
    """
    tables = {
        "spaces": set(),
        "alphas": set(),
        "lowers": set(),
        "uppers": set(),
        "titles": set(),
        "digits": set(),
        "toupper": {},
        "tolower": {},
        "totitle": {},
    }
    subclasses = {"Ll": "lowers", "Lu": "uppers", "Lt": "titles"}
    casefields = ((12, "toupper"), (13, "tolower"), (14, "totitle"))

    range_first = None
    for line in lines:
        line = line.rstrip("\r\n")
        if not line:
            continue
        fields = line.split(";")
        rune = int(fields[0], 16)
        charname = fields[1]
        kind = fields[2]
        if rune > RUNEMAX:
            raise ValueError("code point %X exceeds RUNEMAX" % rune)

        # Large blocks (CJK ideographs, Hangul syllables, ...) are listed as
        # a "<..., First>" / "<..., Last>" pair that stands for every code
        # point in between; expand the pair instead of keeping only its ends.
        if charname.endswith(", First>"):
            range_first = rune
            continue
        if charname.endswith(", Last>") and range_first is not None:
            runes = range(range_first, rune + 1)
        else:
            runes = (rune,)
        range_first = None

        if kind.startswith("Z"):
            tables["spaces"].update(runes)
        elif kind.startswith("L"):
            tables["alphas"].update(runes)
            if kind in subclasses:
                tables[subclasses[kind]].update(runes)
            for index, key in casefields:
                if fields[index] != "":
                    tables[key][rune] = int(fields[index], 16)
        elif kind == "Nd":
            tables["digits"].update(runes)
    return tables


def main():
    """Read data/UnicodeData.txt and write utf8_gen.c."""
    with open("data/UnicodeData.txt", encoding="utf-8") as src:
        tables = readunicodedata(src)

    # Some annoying exceptions: these are counted as control characters.
    tables["spaces"].update(EXTRA_SPACES)

    # TODO: Add toLower/toUpper/toTitle...
    with open("utf8_gen.c", "w", encoding="utf-8") as out:
        out.write(preamble)
        puttab(tables["spaces"], out, "IsSpace")
        puttab(tables["lowers"], out, "IsLower")
        puttab(tables["uppers"], out, "IsUpper")
        puttab(tables["titles"], out, "IsTitle")
        puttab(tables["alphas"], out, "IsLetter")
        puttab(tables["digits"], out, "IsDigit")


if __name__ == "__main__":
    main()