From 29b56ef4e4113bcd091b19d6926f18814162ca53 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 26 Oct 2021 21:01:41 -0700 Subject: Feat(libunicode): Added an explicit unicode library Refactored code to pull out utf8 functions from base into a standalone library. Also left the required function inside arg.c so that code that calls ARG_BEGIN doesn't have to link to libunicode. --- sys/base/genutf8.py | 140 ---------------------------------------------------- 1 file changed, 140 deletions(-) delete mode 100755 sys/base/genutf8.py (limited to 'sys/base/genutf8.py') diff --git a/sys/base/genutf8.py b/sys/base/genutf8.py deleted file mode 100755 index 9ce9975..0000000 --- a/sys/base/genutf8.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/python - -preamble = """ -Rune* -rbsearch(Rune c, Rune *t, int n, int nelem) -{ - Rune *p; - int m; - - while (n > 1) { - m = n >> 1; - p = t + m*nelem; - if (c >= p[0]) { - t = p; - n -= m; - } else n = m; - } - - if (n && c >= t[0]) return t; - else return 0; -} -""" - -def findrange(s): - rs = [] - ss = [] - b, c = None, None - for elt in sorted(s): - if b is None: - b, c = elt, elt - continue - if elt == c + 1: - c = elt - continue - else: - if b == c: - ss.append(b) - else: - rs.append( (b, c) ) - b, c = elt, elt - return rs, ss - -def puttab(s, fd, name): - lname = name.lower() - rs, ss = findrange(s) - - rtabn = "%s_rtab"%lname - stabn = "%s_stab"%lname - - fd.write("static Rune %s[] = {" % rtabn) - for i, elts in enumerate(sorted(rs)): - fd.write("0x%04x, 0x%04x,\n" % (elts[0], elts[1])) - fd.write("};\n\n") - - if len(ss) > 0: - fd.write("static Rune %s[] = {" % stabn) - for i, elt in enumerate(sorted(ss)): - if i % 2 == 0: - fd.write("\n") - fd.write("0x%04x, "%(elt)) - fd.write("\n};\n\n") - - fd.write("int\n") - fd.write("utf8·%s(Rune c)\n"%name) - fd.write("{\n") - fd.write("\tRune *p;\n\n") - fd.write("\tp = rbsearch(c, %s, arrlen(%s)/2, 2);\n"%(rtabn,rtabn)) - fd.write("\tif (p && c >= p[0] && c <= p[1])\n") - 
fd.write("\t\t\treturn 1;\n") - if len(ss) > 0: - fd.write("\tp = rbsearch(c, %s, arrlen(%s), 1);\n"%(stabn,stabn)) - fd.write("\tif (p && c == p[0])\n") - fd.write("\t\treturn 1;\n") - fd.write("\n\treturn 0;\n") - fd.write("}\n\n") - -RUNEMAX = 0x10FFFF - -if __name__ == "__main__": - alphas = set() - lowers = set() - uppers = set() - titles = set() - digits = set() - spaces = set() - - tolower = {} - toupper = {} - totitle = {} - - with open("data/UnicodeData.txt") as utf8: - for runedata in utf8: - fields = runedata[:-1].split(";") - rune = int(fields[0], 16) - kind = fields[2] - - assert rune < RUNEMAX - - if kind[0] == 'Z': - spaces.add(rune) - elif kind[0] == 'L': - alphas.add(rune) - - if kind[1] == "l": - lowers.add(rune) - elif kind[1] == "u": - uppers.add(rune) - elif kind[1] == "t": - titles.add(rune) - - if (fields[12] != ""): - toupper[rune] = int(fields[12], 16) - - if (fields[13] != ""): - tolower[rune] = int(fields[13], 16) - - if (fields[14] != ""): - totitle[rune] = int(fields[14], 16) - - elif kind == 'Nd': - digits.add(rune) - - # Some annoying exceptions... - # These are counted as control characters - spaces.add(ord('\t')) - spaces.add(ord('\n')) - spaces.add(ord('\f')) - spaces.add(ord('\v')) - spaces.add(0x85) - spaces.add(0xfeff) - - # TODO: Add toLower/toUpper/toTitle... - with open("utf8_gen.c", "w") as utf8: - utf8.write(preamble) - puttab(spaces, utf8, "IsSpace") - puttab(lowers, utf8, "IsLower") - puttab(uppers, utf8, "IsUpper") - puttab(titles, utf8, "IsTitle") - puttab(alphas, utf8, "IsLetter") - puttab(digits, utf8, "IsDigit") -- cgit v1.2.1