aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
author Nicholas Noll <nbnoll@eml.cc> 2020-05-22 16:47:40 -0700
committer Nicholas Noll <nbnoll@eml.cc> 2020-05-22 16:47:40 -0700
commit 0d5942d8deaa70427df6df016b3ed9dedfb65b0d (patch)
tree dc57c7b8c95b93116b34d3f5805df2123fb34e36 /sys
parent 50d3df1fafd26305742373a71022fdc4dd0d0ed4 (diff)
checkin: utf8 generation code
Diffstat (limited to 'sys')
-rwxr-xr-x sys/libn/genutf8.py 141
1 files changed, 141 insertions, 0 deletions
diff --git a/sys/libn/genutf8.py b/sys/libn/genutf8.py
new file mode 100755
index 0000000..f4b9f8c
--- /dev/null
+++ b/sys/libn/genutf8.py
@@ -0,0 +1,141 @@
+#!/bin/python
+
+preamble = """
+Rune*
+rbsearch(Rune c, Rune *t, int n, int nelem)
+{
+ Rune *p;
+ int m;
+
+ while (n > 1) {
+ m = n >> 1;
+ p = t + m*nelem;
+ if (c >= p[0]) {
+ t = p;
+ n -= m;
+ } else n = m;
+ }
+
+ if (n && c >= t[0]) return t;
+ else return 0;
+}
+"""  # C text written verbatim at the top of utf8_gen.c: rbsearch() binary-searches a table of n entries, nelem Runes each, and yields the last entry whose first Rune is <= c (0 if none); it presumes the table is sorted ascending.
+
def findrange(s):
    """Split a collection of integers into contiguous ranges and lone values.

    Parameters:
        s: iterable of integers. Order does not matter and duplicates are
           ignored.

    Returns:
        A tuple ``(rs, ss)`` where ``rs`` is a list of inclusive
        ``(first, last)`` pairs, one per run of two or more consecutive
        integers, and ``ss`` is a list of the integers that belong to no
        run. Both lists are in ascending order.
    """
    rs = []
    ss = []

    def flush(lo, hi):
        # A run of length one is a singleton; anything longer is a range.
        if lo == hi:
            ss.append(lo)
        else:
            rs.append((lo, hi))

    b = c = None
    # set() drops duplicates so a repeated value cannot end a run early and
    # show up in both output lists.
    for elt in sorted(set(s)):
        if b is None:
            b = c = elt
        elif elt == c + 1:
            c = elt
        else:
            flush(b, c)
            b = c = elt

    # The run still open when the input is exhausted must be emitted too;
    # otherwise the largest value (or range) would be silently lost.
    if b is not None:
        flush(b, c)

    return rs, ss
+
def puttab(s, fd, name):
    """Write the C lookup tables and predicate function for one rune class.

    The members of ``s`` are split by ``findrange`` into contiguous ranges
    and lone values. Up to two static ``Rune`` arrays are written to ``fd``
    (``<name lowercased>_rtab`` holding first/last pairs and
    ``<name lowercased>_stab`` holding single values), followed by a C
    function ``utf8·<name>(Rune c)`` that returns 1 when ``c`` is found in
    either table and 0 otherwise. A table with no entries is not emitted and
    its lookup is left out of the function.

    The generated code calls ``rbsearch`` and uses ``arrlen``; ``arrlen`` is
    not defined by this script and is presumably supplied by the C project
    that compiles the output.

    Parameters:
        s: iterable of integer code points belonging to the class.
        fd: writable text stream receiving the C source.
        name: suffix of the generated C function name, e.g. ``"IsSpace"``.
    """
    lname = name.lower()
    rs, ss = findrange(s)

    rtabn = "%s_rtab" % lname
    stabn = "%s_stab" % lname

    # An empty initializer list is not valid ISO C before C23, so a table is
    # only written when it has entries.
    if rs:
        fd.write("static Rune %s[] = {" % rtabn)
        for first, last in sorted(rs):
            fd.write("0x%04x, 0x%04x,\n" % (first, last))
        fd.write("};\n\n")

    if ss:
        fd.write("static Rune %s[] = {" % stabn)
        for i, elt in enumerate(sorted(ss)):
            # Two values per output line.
            if i % 2 == 0:
                fd.write("\n")
            fd.write("0x%04x, " % elt)
        fd.write("\n};\n\n")

    fd.write("int\n")
    # U+00B7 MIDDLE DOT separates namespace and function name; it is spelled
    # as an escape so the script itself stays ASCII-safe.
    fd.write("utf8\u00b7%s(Rune c)\n" % name)
    fd.write("{\n")
    if rs or ss:
        fd.write("\tRune *p;\n\n")
    if rs:
        # Range table: each entry is two Runes wide, hence arrlen/2 entries.
        fd.write("\tp = rbsearch(c, %s, arrlen(%s)/2, 2);\n" % (rtabn, rtabn))
        fd.write("\tif (p && c >= p[0] && c <= p[1])\n")
        fd.write("\t\t\treturn 1;\n")
    if ss:
        fd.write("\tp = rbsearch(c, %s, arrlen(%s), 1);\n" % (stabn, stabn))
        fd.write("\tif (p && c == p[0])\n")
        fd.write("\t\treturn 1;\n")
    fd.write("\n\treturn 0;\n")
    fd.write("}\n\n")
+
RUNEMAX = 0x10FFFF


def readunicodedata(lines):
    """Classify the code points described by UnicodeData.txt records.

    Each record is a ';'-separated line whose field 0 is the code point in
    hex, field 1 the name, field 2 the general category and fields 12-14 the
    simple upper/lower/title case mappings. A pair of records named
    ``<..., First>`` and ``<..., Last>`` stands for every code point between
    the two, inclusive, and is expanded accordingly.

    Parameters:
        lines: iterable of record strings, with or without trailing newline.
               Blank lines are skipped.

    Returns:
        A dict with set values under the keys ``"alphas"`` (category L*),
        ``"lowers"`` (Ll), ``"uppers"`` (Lu), ``"titles"`` (Lt),
        ``"digits"`` (Nd) and ``"spaces"`` (Z*), and dict values mapping
        rune to rune under ``"tolower"``, ``"toupper"`` and ``"totitle"``.

    Raises:
        ValueError: on a record with too few fields, a code point above
            RUNEMAX, or an unpaired First/Last range record.
    """
    alphas, lowers, uppers, titles = set(), set(), set(), set()
    digits, spaces = set(), set()
    tolower, toupper, totitle = {}, {}, {}

    lettersubsets = {"l": lowers, "u": uppers, "t": titles}
    casetables = ((12, toupper), (13, tolower), (14, totitle))

    first = None  # start of a <..., First> range awaiting its Last record
    for runedata in lines:
        # Strip only the line terminator; slicing off the last character
        # would corrupt a final line that has no newline.
        runedata = runedata.rstrip("\r\n")
        if not runedata:
            continue

        fields = runedata.split(";")
        if len(fields) < 15:
            raise ValueError("malformed UnicodeData record: %r" % runedata)

        rune = int(fields[0], 16)
        name = fields[1]
        kind = fields[2]

        if rune > RUNEMAX:
            raise ValueError("code point out of range: 0x%x" % rune)

        if name.endswith(", First>"):
            if first is not None:
                raise ValueError("nested range at: %r" % runedata)
            first = rune
            continue

        if name.endswith(", Last>"):
            if first is None:
                raise ValueError("range end without start: %r" % runedata)
            runes = range(first, rune + 1)
            first = None
        else:
            if first is not None:
                raise ValueError("unterminated range before: %r" % runedata)
            runes = (rune,)

        if kind[:1] == "Z":
            spaces.update(runes)
        elif kind[:1] == "L":
            alphas.update(runes)
            subset = lettersubsets.get(kind[1:2])
            if subset is not None:
                subset.update(runes)
        elif kind == "Nd":
            digits.update(runes)

        # Case mappings are recorded for any rune that carries one; they are
        # not yet used by the generator. Range records never carry them.
        for idx, table in casetables:
            if fields[idx] != "":
                table[rune] = int(fields[idx], 16)

    if first is not None:
        raise ValueError("unterminated range at end of input")

    return {
        "alphas": alphas,
        "lowers": lowers,
        "uppers": uppers,
        "titles": titles,
        "digits": digits,
        "spaces": spaces,
        "tolower": tolower,
        "toupper": toupper,
        "totitle": totitle,
    }


def main():
    """Read data/UnicodeData.txt and write the C tables to utf8_gen.c."""
    with open("data/UnicodeData.txt", encoding="utf-8") as src:
        tables = readunicodedata(src)

    # Some annoying exceptions...
    # These are counted as control characters (category Cc/Cf) in the data
    # file, so they have to be added to the whitespace class by hand.
    spaces = tables["spaces"]
    spaces.update(ord(ch) for ch in "\t\n\v\f\r")
    spaces.add(0x85)
    spaces.add(0xFEFF)

    # TODO: Add toLower/toUpper/toTitle...
    # The generated identifiers contain a non-ASCII character, so the output
    # encoding is fixed rather than left to the locale.
    with open("utf8_gen.c", "w", encoding="utf-8") as out:
        out.write(preamble)
        puttab(spaces, out, "IsSpace")
        puttab(tables["lowers"], out, "IsLower")
        puttab(tables["uppers"], out, "IsUpper")
        puttab(tables["titles"], out, "IsTitle")
        puttab(tables["alphas"], out, "IsLetter")
        puttab(tables["digits"], out, "IsDigit")


if __name__ == "__main__":
    main()