#!/usr/bin/env python3
"""Generate utf8_gen.c: Unicode character-class lookup tables and predicates.

Reads data/UnicodeData.txt and writes, for each character class, a sorted
table of inclusive ranges, a sorted table of isolated code points, and a C
predicate that binary-searches both tables.

Provenance: sys/libn/genutf8.py, added by commit
0d5942d8deaa70427df6df016b3ed9dedfb65b0d ("checkin: utf8 generation code",
Nicholas Noll, 2020-05-22).
"""

# C helper emitted verbatim at the top of the generated file. It binary
# searches a sorted table `t` of `n` entries, each `nelem` Runes wide, and
# returns the last entry whose first Rune is <= c (or 0 when there is none).
preamble = """
Rune*
rbsearch(Rune c, Rune *t, int n, int nelem)
{
    Rune *p;
    int m;

    while (n > 1) {
        m = n >> 1;
        p = t + m*nelem;
        if (c >= p[0]) {
            t = p;
            n -= m;
        } else n = m;
    }

    if (n && c >= t[0]) return t;
    else return 0;
}
"""


def findrange(s):
    """Split code points into contiguous ranges and isolated values.

    Args:
        s: iterable of integers (duplicates are tolerated).

    Returns:
        A pair (ranges, singles). `ranges` is a list of inclusive
        (first, last) tuples, one per run of two or more consecutive values;
        `singles` is a list of values with no neighbour. Both are ascending.
    """
    ranges = []
    singles = []

    def flush(first, last):
        if first == last:
            singles.append(first)
        else:
            ranges.append((first, last))

    first = last = None
    for elt in sorted(set(s)):
        if first is None:
            first = last = elt
        elif elt == last + 1:
            last = elt
        else:
            flush(first, last)
            first = last = elt

    # The run still open when the input ends must be emitted too; without
    # this the highest range/singleton of every table is silently lost.
    if first is not None:
        flush(first, last)

    return ranges, singles


def puttab(s, fd, name):
    """Write the lookup tables and the C predicate for one character class.

    Args:
        s: set of code points belonging to the class.
        fd: object with a write(str) method receiving the C source.
        name: predicate suffix; the function is emitted as utf8<middle dot>name
            and the tables are named after name.lower().
    """
    lname = name.lower()
    rs, ss = findrange(s)  # both already sorted ascending

    rtabn = "%s_rtab" % lname
    stabn = "%s_stab" % lname

    # An empty initializer is not valid C before C23, so a table is only
    # emitted (and only searched) when it has at least one entry.
    if rs:
        fd.write("static Rune %s[] = {" % rtabn)
        for first, last in rs:
            fd.write("0x%04x, 0x%04x,\n" % (first, last))
        fd.write("};\n\n")

    if ss:
        fd.write("static Rune %s[] = {" % stabn)
        for i, elt in enumerate(ss):
            if i % 2 == 0:
                fd.write("\n")
            fd.write("0x%04x, " % elt)
        fd.write("\n};\n\n")

    fd.write("int\n")
    # U+00B7 MIDDLE DOT separates namespace and name. It is spelled as an
    # escape so the script itself stays ASCII and cannot be re-mangled by a
    # wrong source encoding.
    fd.write("utf8\u00b7%s(Rune c)\n" % name)
    fd.write("{\n")
    fd.write("\tRune *p;\n\n")
    if rs:
        # Range table entries are two Runes wide: [first, last].
        fd.write("\tp = rbsearch(c, %s, arrlen(%s)/2, 2);\n" % (rtabn, rtabn))
        fd.write("\tif (p && c >= p[0] && c <= p[1])\n")
        fd.write("\t\t\treturn 1;\n")
    if ss:
        fd.write("\tp = rbsearch(c, %s, arrlen(%s), 1);\n" % (stabn, stabn))
        fd.write("\tif (p && c == p[0])\n")
        fd.write("\t\treturn 1;\n")
    fd.write("\n\treturn 0;\n")
    fd.write("}\n\n")


# Largest valid Unicode code point (inclusive).
RUNEMAX = 0x10FFFF


def _readunicodedata(path):
    """Yield (rune, fields) for every code point described by UnicodeData.txt.

    Large blocks are listed in the file only by a "<..., First>" and a
    "<..., Last>" line; every code point in between is yielded with the
    fields of the "Last" line.

    Raises:
        ValueError: if a code point exceeds RUNEMAX.
    """
    pending = None  # start of a First/Last range that is still open
    with open(path, encoding="utf-8") as fd:
        for line in fd:
            line = line.rstrip("\r\n")
            if not line:
                continue

            fields = line.split(";")
            rune = int(fields[0], 16)
            if rune > RUNEMAX:
                raise ValueError("code point out of range: 0x%x" % rune)

            charname = fields[1]
            if charname.endswith(", First>"):
                pending = rune
                continue
            if charname.endswith(", Last>") and pending is not None:
                start, pending = pending, None
            else:
                start = rune

            for r in range(start, rune + 1):
                yield r, fields


def main():
    """Classify every code point and write utf8_gen.c."""
    alphas = set()
    lowers = set()
    uppers = set()
    titles = set()
    digits = set()
    spaces = set()

    # Case mappings are collected for the TODO below; nothing consumes them
    # yet. They are only recorded for letters (general category L*).
    tolower = {}
    toupper = {}
    totitle = {}

    for rune, fields in _readunicodedata("data/UnicodeData.txt"):
        kind = fields[2]  # general category, e.g. "Lu", "Nd", "Zs"

        if kind[0] == 'Z':
            spaces.add(rune)
        elif kind[0] == 'L':
            alphas.add(rune)

            if kind[1] == "l":
                lowers.add(rune)
            elif kind[1] == "u":
                uppers.add(rune)
            elif kind[1] == "t":
                titles.add(rune)

            if fields[12] != "":
                toupper[rune] = int(fields[12], 16)

            if fields[13] != "":
                tolower[rune] = int(fields[13], 16)

            if fields[14] != "":
                totitle[rune] = int(fields[14], 16)

        elif kind == 'Nd':
            digits.add(rune)

    # Some annoying exceptions...
    # These are counted as control characters
    spaces.add(ord('\t'))
    spaces.add(ord('\n'))
    spaces.add(ord('\f'))
    spaces.add(ord('\v'))
    spaces.add(0x85)
    spaces.add(0xfeff)

    # TODO: Add toLower/toUpper/toTitle...
    # The generated identifiers contain a non-ASCII character, so the output
    # encoding is fixed instead of depending on the locale.
    with open("utf8_gen.c", "w", encoding="utf-8") as utf8:
        utf8.write(preamble)
        puttab(spaces, utf8, "IsSpace")
        puttab(lowers, utf8, "IsLower")
        puttab(uppers, utf8, "IsUpper")
        puttab(titles, utf8, "IsTitle")
        puttab(alphas, utf8, "IsLetter")
        puttab(digits, utf8, "IsDigit")


if __name__ == "__main__":
    main()