aboutsummaryrefslogtreecommitdiff
path: root/src/libutf
diff options
context:
space:
mode:
authorNicholas <nbnoll@eml.cc>2021-11-20 10:53:19 -0800
committerNicholas <nbnoll@eml.cc>2021-11-20 10:53:19 -0800
commita9bfe650038afea8b751175cac16f6027345e45f (patch)
tree9a7f9feb76a64bb3efe573036d80b7bdbf8a59a5 /src/libutf
parent1c8d4e69205fd875f6bec3fa3bd929c2e7f52f62 (diff)
Chore: reorganize libutf and libfmt into base
I found the split to be arbitrary. Better to include the functionality in the standard library. I also split the headers to allow for more granular inclusion (but the library is still monolithic). The only ugliness is the circular dependency introduced with libutf's generated functions. We put explicit prereqs with the necessary object files instead.
Diffstat (limited to 'src/libutf')
-rw-r--r--src/libutf/canfit.c23
-rw-r--r--src/libutf/decode.c98
-rw-r--r--src/libutf/decodeprev.c60
-rw-r--r--src/libutf/encode.c69
-rw-r--r--src/libutf/find.c31
-rw-r--r--src/libutf/findlast.c32
-rw-r--r--src/libutf/internal.h38
-rw-r--r--src/libutf/len.c21
-rw-r--r--src/libutf/rules.mk76
-rw-r--r--src/libutf/runelen.c8
-rw-r--r--src/libutf/vendor/common.c220
-rw-r--r--src/libutf/vendor/common.h46
-rw-r--r--src/libutf/vendor/mkgraphemedata.c24
-rw-r--r--src/libutf/vendor/mkrunetype.c390
-rw-r--r--src/libutf/vendor/mkrunewidth.c325
15 files changed, 0 insertions, 1461 deletions
diff --git a/src/libutf/canfit.c b/src/libutf/canfit.c
deleted file mode 100644
index 4579ab3..0000000
--- a/src/libutf/canfit.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "internal.h"
-
-/*
- * reports whether the n bytes starting at s are enough to hold the whole
- * sequence announced by the leading byte s[0].
- *
- * s: start of the (possibly truncated) byte sequence
- * n: number of readable bytes at s
- *
- * returns 1 if the sequence fits, 0 otherwise (including n <= 0).
- * bytes below TByte1 (ascii and stray continuation bytes) always fit, as
- * they are consumed one byte at a time.
- */
-int
-utf8·canfit(byte* s, int n)
-{
-	rune c;
-
-	if(n <= 0)
-		return 0;
-
-	c = *(ubyte*)s;
-	if(c < TByte1)		/* 0xxxxxxx or 10xxxxxx */
-		return 1;
-	if(c < TByte2)		/* 110xxxxx: 2 byte sequence */
-		return n >= 2;
-	if(c < TByte3)		/* 1110xxxx: 3 byte sequence */
-		return n >= 3;
-
-	/* 11110xxx and anything above: demand the longest sequence */
-	return n >= UTFmax;
-}
diff --git a/src/libutf/decode.c b/src/libutf/decode.c
deleted file mode 100644
index 01797f1..0000000
--- a/src/libutf/decode.c
+++ /dev/null
@@ -1,98 +0,0 @@
-#include "internal.h"
-
-/*
- * automaton states as stored in the transition table below.
- * utf8·decode keeps reading bytes while the state is greater than REJECT and
- * only reports success when it ends in ACCEPT.
- */
-#define ACCEPT 0
-#define REJECT 12
-
-/*
- * lookup table driving utf8·decode.
- * entries [0, 256) are indexed by byte value; the remaining entries are
- * indexed by 256 + state + class.
- * NOTE(review): layout looks like Bjoern Hoehrmann's DFA based decoder - confirm
- */
-static uint8 decode[] = {
- /*
- * the first part of the table maps bytes to character classes, which
- * reduces the size of the transition table and creates bitmasks
- */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
-
- /*
- * the second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state
- */
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12,
-};
-
-/*
- * decodes the rune starting at s using the table driven automaton.
- * on success stores the rune in *r and returns the number of bytes consumed;
- * on failure stores RuneErr in *r and returns 1.
- */
-int
-utf8·decode(char *s, rune *r)
-{
-	uint8 *src, cur, cls, state;
-	rune acc;
-	int used;
-
-	src = (uint8 *)s;
-
-	/* leading byte: its class doubles as the shift that strips the header bits */
-	cur = src[0];
-	cls = decode[cur];
-	acc = (0xFF >> cls) & cur;
-	state = decode[256+ACCEPT+cls];
-
-	/* continuation bytes: 6 payload bits each */
-	used = 1;
-	while(state > REJECT && used < UTFmax){
-		cur = src[used];
-		cls = decode[cur];
-		acc = (acc << 6) | (cur & TMask);
-		state = decode[256+state+cls];
-		used++;
-	}
-
-	if(state == ACCEPT){
-		*r = acc;
-		return used;
-	}
-
-	*r = RuneErr;
-	return 1;
-}
-
-#if 0
-/*
- * disabled alternative decoder: everything up to the matching #endif is
- * removed by the preprocessor.
- * it refers to names (Testx, Bitx, Tbyte, RuneX, Rune1, SurrogateMin, ...)
- * that the internal.h shown in this patch does not define.
- */
-int
-utf8·decode(byte *s, rune *r)
-{
- int c[UTFmax], i;
- rune l;
-
- c[0] = *(ubyte*)(s);
- if(c[0] < Tx){
- *r = c[0];
- return 1;
- }
-
- l = c[0];
- for(i = 1; i < UTFmax; i++){
- c[i] = *(ubyte*)(s+i);
- c[i] ^= Tx;
- if(c[i] & Testx) goto bad;
-
- l = (l << Bitx) | c[i];
- if(c[0] < Tbyte(i + 2)){
- l &= RuneX(i + 1);
- if(i == 1){
- if(c[0] < Tbyte(2) || l <= Rune1)
- goto bad;
- }else if(l <= RuneX(i) || l > RuneMax)
- goto bad;
-
- if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
- goto bad;
-
- *r = l;
- return i + 1;
- }
- }
-bad:
- *r = RuneErr;
- return 1;
-}
-#endif
diff --git a/src/libutf/decodeprev.c b/src/libutf/decodeprev.c
deleted file mode 100644
index 27dced6..0000000
--- a/src/libutf/decodeprev.c
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "internal.h"
-
-/*
- * automaton states as stored in the transition table below.
- * utf8·decodeprev keeps reading while the state is greater than REJECT and
- * only reports success when it ends in ACCEPT.
- */
-#define ACCEPT 0
-#define REJECT 12
-
-/*
- * lookup table driving utf8·decodeprev (bytes are consumed last to first).
- * entries [0, 256) are indexed by byte value; the remaining entries are
- * indexed by 256 + state + class.
- */
-static uint8 decode[] = {
- /*
- * the first part of the table maps bytes to character classes, which
- * reduces the size of the transition table and creates bitmasks.
- */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
- /*
- * The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state.
- * Each row below holds the 12 per-class entries of one state.
- */
- // 0 1 2 3 4 5 6 7 8 9 10 11
- 0,24,12,12,12,12,12,24,12,24,12,12,
- 0,24,12,12,12,12,12,24,12,24,12,12,
- 12,36, 0,12,12,12,12,48,12,36,12,12,
- 12,60,12, 0, 0,12,12,72,12,72,12,12,
- 12,60,12, 0,12,12,12,72,12,72, 0,12,
- 12,12,12,12,12, 0, 0,12,12,12,12,12,
- 12,12,12,12,12,12,12,12,12,12,12, 0
-};
-
-/*
- * decodes the rune whose last byte is s[0] by walking backwards in memory.
- * bytes s[0], s[-1], ... are fed to the reverse automaton for as long as it
- * stays in a state above REJECT and fewer than UTFmax bytes have been read.
- *
- * stores the decoded rune in *r, or RuneErr if the automaton did not end in
- * ACCEPT. returns the number of bytes examined.
- */
-int
-utf8·decodeprev(byte *s, rune *r)
-{
- int n;
- rune v;
- uint8 b, t, d, x=ACCEPT;
-
- v=0, n=0, d=0;
-nextbyte:
- /* reads s[-n] and then increments n */
- b = ((uint8 *)s)[-n++];
- t = decode[b];
- x = decode[256+x+t];
-
- if(x > REJECT && n < UTFmax){
- /* byte accepted as a trailing byte: its low 6 bits land at bit offset d */
- v = v | ((b & TMask) << d);
- d += 6;
- goto nextbyte;
- }
-
- if(x != ACCEPT)
- *r = RuneErr;
- else{
- /* last byte read is the leading byte: its class is the shift that strips the header bits */
- v |= (((0xFFu >> t) & b) << d);
- *r = v;
- }
-
- return n;
-}
diff --git a/src/libutf/encode.c b/src/libutf/encode.c
deleted file mode 100644
index fa7c93e..0000000
--- a/src/libutf/encode.c
+++ /dev/null
@@ -1,69 +0,0 @@
-#include "internal.h"
-
-/*
- * encodes the rune *r as utf-8 into s and returns the number of bytes written.
- *
- * r: rune to encode (read only)
- * s: output buffer; must have room for the longest sequence (4 bytes)
- *
- * runes above RuneMax and utf-16 surrogates (0xD800 - 0xDFFF) are replaced by
- * RuneErr before the length is chosen, so the replacement is written with its
- * own natural length.
- *
- * the length limits are spelled out literally because they are fixed by the
- * encoding: 7 payload bits fit in 1 byte, 11 in 2, 16 in 3 and 21 in 4.
- */
-int
-utf8·encode(rune *r, byte *s)
-{
-	rune c;
-
-	c = *r;
-	if(c < 0x80u){ // 7 bits
-		s[0] = (uint8)c;
-		return 1;
-	}
-
-	if(c < 0x800u){ // 11 bits
-		s[0] = TByte1 | (c >> 6);
-		s[1] = Tx | (c & TMask);
-		return 2;
-	}
-
-	/* must happen before the 3 byte case: surrogates live below 0x10000 */
-	if(c > RuneMax || (0xD800u <= c && c <= 0xDFFFu))
-		c = RuneErr;
-
-	if(c < 0x10000u){ // 16 bits
-		s[0] = TByte2 | ((c >> 12));
-		s[1] = Tx | ((c >> 6) & TMask);
-		s[2] = Tx | ((c) & TMask);
-		return 3;
-	}
-
-	// 21 bits
-	s[0] = TByte3 | ((c >> 18));
-	s[1] = Tx | ((c >> 12) & TMask);
-	s[2] = Tx | ((c >> 6) & TMask);
-	s[3] = Tx | ((c) & TMask);
-
-	return 4;
-}
-
-#if 0
-/*
- * disabled alternative encoder: everything up to the matching #endif is
- * removed by the preprocessor.
- * it refers to names (Rune1, RuneX, Tbyte, Bitx, Maskx, SurrogateMin, ...)
- * that the internal.h shown in this patch does not define.
- */
-int
-utf8·encode(rune* r, byte* s)
-{
- int i, j;
- rune c;
-
- c = *r;
- if(c <= Rune1) {
- s[0] = c;
- return 1;
- }
-
- for(i = 2; i < UTFmax + 1; i++){
- if(i == 3){
- if(c > RuneMax)
- c = RuneErr;
- if(SurrogateMin <= c && c <= SurrogateMax)
- c = RuneErr;
- }
- if(c <= RuneX(i) || i == UTFmax) {
- s[0] = Tbyte(i) | (c >> (i - 1)*Bitx);
- for(j = 1; j < i; j++)
- s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
- return i;
- }
- }
-
- return UTFmax;
-}
-#endif
diff --git a/src/libutf/find.c b/src/libutf/find.c
deleted file mode 100644
index d75feb8..0000000
--- a/src/libutf/find.c
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "internal.h"
-
-/*
- * returns a pointer to the first occurrence of rune c in the nul terminated
- * string s, or nil if there is none.
- * runes below Tx are handed straight to strchr.
- */
-byte*
-utf8·find(byte* s, rune c)
-{
-	long lead;
-	rune got;
-	int width;
-
-	if(c < Tx)
-		return strchr(s, c);
-
-	while((lead = *(ubyte*)s) != 0){
-		if(lead < Tx){
-			/* single byte rune */
-			if(lead == c)
-				return s;
-			s++;
-		}else{
-			/* multi byte rune: decode to compare and to learn its width */
-			width = utf8·decode(s, &got);
-			if(got == c)
-				return s;
-			s += width;
-		}
-	}
-
-	return nil;
-}
diff --git a/src/libutf/findlast.c b/src/libutf/findlast.c
deleted file mode 100644
index ab25ab2..0000000
--- a/src/libutf/findlast.c
+++ /dev/null
@@ -1,32 +0,0 @@
-#include "internal.h"
-
-/*
- * returns a pointer to the last occurrence of rune c in the nul terminated
- * string s, or nil if there is none.
- * runes below Tx are handed straight to strrchr.
- */
-byte*
-utf8·findlast(byte* s, rune c)
-{
-	long lead;
-	rune got;
-	int width;
-	byte *last;
-
-	if(c < Tx)
-		return strrchr(s, c);
-
-	last = nil;
-	while((lead = *(ubyte*)s) != 0){
-		if(lead < Tx){
-			/* single byte rune */
-			if(lead == c)
-				last = s;
-			s++;
-		}else{
-			/* multi byte rune: remember the match and skip its full width */
-			width = utf8·decode(s, &got);
-			if(got == c)
-				last = s;
-			s += width;
-		}
-	}
-
-	return last;
-}
diff --git a/src/libutf/internal.h b/src/libutf/internal.h
deleted file mode 100644
index 9719977..0000000
--- a/src/libutf/internal.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#pragma once
-
-#include <u.h>
-#include <base.h>
-#include <libutf.h>
-
-/*
- * NOTE: we use the preprocessor to ensure we have unsigned constants.
- * UTF-8 code:
- * 1 byte:
- * 0xxxxxxx
- * 2 byte:
- * 110xxxxx 10xxxxxx
- * 3 byte:
- * 1110xxxx 10xxxxxx 10xxxxxx
- * 4 byte:
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- */
-
-#define Tx        0x80u // 0b10000000 transfer header
-#define TMask     0x3Fu // 0b00111111 transfer mask
-
-/*
- * headers of the leading byte.
- * note the numbering: TByteN is the header of a sequence of N+1 bytes.
- */
-#define TByte1    0xC0u // 0b11000000 (leads a 2 byte sequence)
-#define TByte2    0xE0u // 0b11100000 (leads a 3 byte sequence)
-#define TByte3    0xF0u // 0b11110000 (leads a 4 byte sequence)
-#define TByte4    0xF8u // 0b11111000 (first byte value that leads nothing)
-
-#define RuneMask  0x1FFFFFu
-
-/*
- * RuneNByte is the smallest rune that no longer fits in N bytes, i.e.
- * 1 << (payload bits of an N byte sequence): 7, 11, 16 and 21 bits.
- */
-#define Rune1Byte 0x000080u // 1 << 7  (1 byte)
-#define Rune2Byte 0x000800u // 1 << 11 (2 bytes)
-#define Rune3Byte 0x010000u // 1 << 16 (3 bytes)
-#define Rune4Byte 0x200000u // 1 << 21 (4 bytes)
-
-
-/* UTF-16 surrogate halves: not valid runes on their own */
-#define RuneSurrogateMin 0x00D800
-#define RuneSurrogateMax 0x00DFFF
diff --git a/src/libutf/len.c b/src/libutf/len.c
deleted file mode 100644
index 8fbd679..0000000
--- a/src/libutf/len.c
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "internal.h"
-
-/*
- * returns the number of runes in the nul terminated string s.
- * every byte below Tx counts as one rune; anything else advances by whatever
- * utf8·decode reports and counts as one rune as well.
- */
-int
-utf8·len(char *s)
-{
-	int lead;
-	long count;
-	rune scratch;
-
-	for(count = 0; (lead = *(uchar*)s) != 0; count++){
-		if(lead < Tx)
-			s++;
-		else
-			s += utf8·decode(s, &scratch);
-	}
-
-	return count;
-}
diff --git a/src/libutf/rules.mk b/src/libutf/rules.mk
deleted file mode 100644
index aeb86b2..0000000
--- a/src/libutf/rules.mk
+++ /dev/null
@@ -1,76 +0,0 @@
-include share/push.mk
-
-# version of the unicode data files fetched and processed below
-UNICODE=14.0.0
-
-# library sources; the two $(UNICODE) suffixed files are produced by the
-# table generation rules further down
-SRCS_$(d):=\
- $(d)/encode.c\
- $(d)/decode.c\
- $(d)/decodeprev.c\
- $(d)/find.c\
- $(d)/findlast.c\
- $(d)/canfit.c\
- $(d)/runelen.c\
- $(d)/len.c\
- $(d)/runetype-$(UNICODE).c\
- $(d)/runewidth-$(UNICODE).c
-
-LIBS_$(d):=$(d)/libutf.a
-
-include share/paths.mk
-
-# ========================================================================
-# table generation
-#
-# NOTE(review): every download rule below redirects curl's output straight
-# into the target, so a failed transfer may leave a file behind that make
-# treats as up to date - confirm this is acceptable
-
-# helpers shared by the generator programs
-$(d)/vendor/common.o: $(d)/vendor/common.c
- $(COMPILE)
-
-# rune categories
-$(d)/vendor/UnicodeData-$(UNICODE).txt:
- @echo "GET UnicodeData.txt";\
- curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
-
-$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/base/base.a
- $(COMPLINK)
-
-GENS += $(d)/vendor/mkrunetype
-
-# $< is the data file: the generator reads it and prints the table to stdout
-$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
- @$(dir $@)vendor/mkrunetype $< > $@
-
-# rune widths
-$(d)/vendor/EastAsianWidth-$(UNICODE).txt:
- @echo "GET EastAsianWidth.txt";\
- curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@
-
-$(d)/vendor/EmojiData-$(UNICODE).txt:
- @echo "GET EmojiData.txt";\
- curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@
-
-$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/base/base.a
- $(COMPLINK)
-
-GENS += $(d)/vendor/mkrunewidth
-
-# the generator is the first prerequisite; filter-out passes the remaining
-# prerequisites (the three data files) as its arguments
-$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
- @$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
-
-# grapheme boundaries
-$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
- @echo "GET GraphemeBreakProperty.txt";\
- curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
-
-$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/base/base.a
- $(COMPLINK)
-
-# $^ expands to "generator datafile", i.e. the command line itself.
-# this output is not listed in SRCS_$(d) above
-$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
- $^ > $@
-
-GENS += $(d)/vendor/mkgraphemedata
-
-# ========================================================================
-# normal operations
-
-$(LIBS_$(d)): $(OBJS_$(d))
- $(ARCHIVE)
-
-include share/pop.mk
diff --git a/src/libutf/runelen.c b/src/libutf/runelen.c
deleted file mode 100644
index dac7f15..0000000
--- a/src/libutf/runelen.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "internal.h"
-
-/*
- * returns the number of bytes utf8·encode writes for rune r.
- * the rune is encoded into a throwaway buffer and only the length is kept.
- */
-int
-utf8·runelen(rune r)
-{
-	int width;
-	byte scratch[10];
-
-	width = utf8·encode(&r, scratch);
-	return width;
-}
diff --git a/src/libutf/vendor/common.c b/src/libutf/vendor/common.c
deleted file mode 100644
index 5a03a50..0000000
--- a/src/libutf/vendor/common.c
+++ /dev/null
@@ -1,220 +0,0 @@
-#include "common.h"
-
-// -----------------------------------------------------------------------
-// input functions
-
-/*
- * reads one line from io into line (capacity len) and splits it in place
- * into ';' separated fields.
- *
- * io:     stream to read from
- * nfield: exact number of fields a data line must have
- * field:  out, array of nfield pointers into line, one per field
- * len:    capacity of line
- * line:   scratch buffer; holds the nul separated fields on return
- *
- * returns ParseEOF when nothing more could be read, ParseSkip for empty
- * lines and lines starting with '#', ParseOK otherwise.
- * panics on lines that fill the buffer, lack the trailing newline or do not
- * have exactly nfield fields.
- */
-int
-parse(io·Stream *io, int nfield, char **field, int len, char *line)
-{
-	int n, nsep;
-	char *p;
-
-	if((n=io·readln(io, len, line)) <= 0)
-		return ParseEOF;
-
-	if(n == len)
-		panicf("line too long");
-
-	if(line[n-1] != '\n')
-		panicf("invalid line: expected '\\n', found '%c'", line[n-1]);
-
-	line[n-1] = 0;
-
-	if(line[0] == '#' || line[0] == 0)
-		return ParseSkip;
-
-	/*
-	 * count the separators before touching anything: this keeps the line
-	 * intact for the error message and guarantees the loop below never
-	 * writes past field[nfield-1]
-	 */
-	nsep = 0;
-	for(p = line; *p; p++){
-		if(*p == ';')
-			nsep++;
-	}
-
-	if(nsep != nfield-1)
-		panicf("expected %d number of fields, got %d: %s", nfield, nsep+1, line);
-
-	/* tokenize line into fields */
-	n = 0;
-	field[n] = line;
-	for(p = line; *p; p++){
-		if(*p == ';'){
-			*p = 0;
-			field[++n] = p+1;
-		}
-	}
-
-	return ParseOK;
-}
-
-/*
- * converts the nul terminated hexadecimal string s to an integer.
- * only the digits 0-9 and upper case A-F are accepted; anything else panics.
- */
-int
-codepoint(char *s)
-{
-	int value, ch, digit;
-
-	value = 0;
-	for(; (ch = *s) != 0; s++){
-		digit = 0;
-		if('0' <= ch && ch <= '9')
-			digit = ch - '0';
-		else if('A' <= ch && ch <= 'F')
-			digit = ch - 'A' + 10;
-		else
-			panicf("bad codepoint char '%c'", ch);
-
-		value = (value << 4) + digit;
-	}
-
-	return value;
-}
-
-/*
- * works out the code point range described by an already parsed line.
- *
- * utf8:  stream the line came from; one more line is read from it when the
- *        name field marks the start of a range (", First>")
- * field: fields of the current line
- * start: out, first code point of the range
- * stop:  in, last code point of the previous range (used to check ordering);
- *        out, last code point of this range
- *
- * panics on out of order or too large code points and on malformed ranges.
- */
-void
-codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop)
-{
-	int prev, first, last;
-	char *next[NumFields], buf[1024];
-
-	prev  = *stop;
-	first = codepoint(field[Fcode]);
-
-	if(first >= NumRunes)
-		panicf("unexpected large codepoint %x", first);
-	if(first <= prev)
-		panicf("bad code sequence: %x then %x", prev, first);
-
-	/* a plain line describes a single code point */
-	last = first;
-
-	if(strstr(field[Fname], ", First>") != nil){
-		/* the end of the range is on the following line */
-		if(!parse(utf8, arrlen(next), next, arrlen(buf), buf))
-			panicf("range start at end of file");
-		if(strstr(next[Fname], ", Last>") == nil)
-			panicf("range start not followed by range end");
-
-		last = codepoint(next[Fcode]);
-
-		if(last <= first)
-			panicf("bad code sequence: %x then %x", first, last);
-		if(strcmp(field[Fcategory], next[Fcategory]) != 0)
-			panicf("range with mismatched category");
-	}
-
-	*start = first;
-	*stop  = last;
-}
-
-// -----------------------------------------------------------------------
-// output functions
-
-/*
- * prints the preamble of a generated table file to stdout: two includes and
- * a static helper rangesearch(c, t, n, ne).
- * the emitted helper bisects a table t of n records, each ne runes wide, and
- * returns the last record whose first rune is <= c, or 0 if there is none.
- */
-void
-putsearch(void)
-{
- puts(
- "#include <u.h>\n"
- "#include <libutf.h>\n"
- "\n"
- "static\n"
- "rune*\n"
- "rangesearch(rune c, rune *t, int n, int ne)\n"
- "{\n"
- " rune *p;\n"
- " int m;\n"
- " while(n > 1) {\n"
- " m = n >> 1;\n"
- " p = t + m*ne;\n"
- " if(c >= p[0]){\n"
- " t = p;\n"
- " n = n-m;\n"
- " }else\n"
- " n = m;\n"
- " }\n"
- " if(n && c >= t[0])\n"
- " return t;\n"
- " return 0;\n"
- "}\n"
- );
-
-}
-
-/*
- * prints the runs of consecutive set entries in prop as a static rune array
- * named ident, one "first, last" pair per run.
- * runs of length one are only printed when force is set.
- * every entry that was printed is cleared in prop, so later passes only see
- * what is left over. returns 1 if anything was printed, 0 otherwise.
- */
-int
-putrange(char *ident, char *prop, int force)
-{
-	int lo, hi, opened;
-
-	opened = 0;
-	lo = 0;
-	while(lo < NumRunes){
-		if(!prop[lo]){
-			lo++;
-			continue;
-		}
-
-		/* extend the run, clearing everything after its first entry */
-		hi = lo + 1;
-		while(hi < NumRunes && prop[hi]){
-			prop[hi] = 0;
-			hi++;
-		}
-
-		if(force || hi > lo + 1){
-			if(!opened){
-				printf("static rune %s[] = {\n", ident);
-				opened = 1;
-			}
-			prop[lo] = 0;
-			printf("\t0x%.4x, 0x%.4x,\n", lo, hi-1);
-		}
-
-		lo = hi;
-	}
-
-	if(opened)
-		printf("};\n\n");
-
-	return opened;
-}
-
-/*
- * prints runs of set entries that are two apart (l, l+2, l+4, ...) as a
- * static rune array named ident, one "first, last" pair per run.
- * a run needs at least two members to be printed; printed entries are
- * cleared in prop. returns 1 if anything was printed, 0 otherwise.
- */
-int
-putpair(char *ident, char *prop)
-{
-	int lo, hi, opened;
-
-	opened = 0;
-	lo = 0;
-	while(lo + 2 < NumRunes){
-		if(!prop[lo]){
-			lo++;
-			continue;
-		}
-
-		/* extend the run in steps of two, clearing all but the first entry */
-		hi = lo + 2;
-		while(hi < NumRunes && prop[hi]){
-			prop[hi] = 0;
-			hi += 2;
-		}
-
-		if(hi != lo + 2){
-			if(!opened){
-				printf("static rune %s[] = {\n", ident);
-				opened = 1;
-			}
-			prop[lo] = 0;
-			printf("\t0x%.4x, 0x%.4x,\n", lo, hi - 2);
-		}
-
-		/* the search resumes where the run stopped, also when nothing was printed */
-		lo = hi;
-	}
-
-	if(opened)
-		printf("};\n\n");
-
-	return opened;
-}
-
-/*
- * prints every set entry of prop as a static rune array named ident, one
- * code point per line, and clears it.
- * returns 1 if anything was printed, 0 otherwise.
- */
-int
-putsingle(char *ident, char *prop)
-{
-	int cp, opened;
-
-	opened = 0;
-	cp = 0;
-	while(cp < NumRunes){
-		if(prop[cp]){
-			if(!opened){
-				printf("static rune %s[] = {\n", ident);
-				opened = 1;
-			}
-			prop[cp] = 0;
-			printf("\t0x%.4x,\n", cp);
-		}
-		cp++;
-	}
-
-	if(opened)
-		printf("};\n\n");
-
-	return opened;
-}
diff --git a/src/libutf/vendor/common.h b/src/libutf/vendor/common.h
deleted file mode 100644
index 62f6c5b..0000000
--- a/src/libutf/vendor/common.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#pragma once
-
-#include <u.h>
-#include <base.h>
-#include <libutf.h>
-
-enum
-{
- // Fields inside UnicodeData.txt
- // (the values are used as indices into the field array filled by parse)
- Fcode,
- Fname,
- Fcategory,
- Fcombine,
- Fbidir,
- Fdecomp,
- Fdecimal,
- Fdigit,
- Fnumeric,
- Fmirror,
- Foldname,
- Fcomment,
- Fupper,
- Flower,
- Ftitle,
-
- // number of fields per line
- NumFields,
- // size of the per code point tables: one slot for every 21 bit value
- NumRunes = 1 << 21,
-};
-
-/* input functions */
-
-/* results of parse; ParseEOF is 0, so "while(parse(...))" stops at end of input */
-enum
-{
- ParseEOF,
- ParseOK,
- ParseSkip,
-};
-
-/* reads one line and splits it in place into nfield ';' separated fields */
-int parse(io·Stream *io, int nfield, char **field, int len, char *line);
-/* converts an upper case hexadecimal string to an integer */
-int codepoint(char *s);
-/* computes the [*start, *stop] range a UnicodeData.txt line stands for */
-void codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop);
-
-/* output functions */
-
-/* prints the includes and the rangesearch helper of a generated file */
-void putsearch(void);
-/* each of the following prints one table from prop, clears what it printed and returns 1 if it printed anything */
-int putrange(char *ident, char *prop, int force);
-int putpair(char *ident, char *prop);
-int putsingle(char *ident, char *prop);
diff --git a/src/libutf/vendor/mkgraphemedata.c b/src/libutf/vendor/mkgraphemedata.c
deleted file mode 100644
index ce5a952..0000000
--- a/src/libutf/vendor/mkgraphemedata.c
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <u.h>
-#include <base.h>
-#include <libutf.h>
-
-// -----------------------------------------------------------------------
-// main point of entry
-
-/* prints the command line synopsis to stderr and exits with status 1 */
-static
-void
-usage(void)
-{
-	fputs("usage: mkgraphemedata <GraphemeBreakProperty.txt>\n", stderr);
-	exit(1);
-}
-
-/*
- * placeholder entry point: runs the argument parsing macros and nothing else.
- * the locals below are declared but never used and usage() is never called.
- */
-int
-main(int argc, char *argv[])
-{
- io·Stream *utf8;
- char line[1024];
-
- ARGBEGIN{
- }ARGEND;
-}
diff --git a/src/libutf/vendor/mkrunetype.c b/src/libutf/vendor/mkrunetype.c
deleted file mode 100644
index b33df32..0000000
--- a/src/libutf/vendor/mkrunetype.c
+++ /dev/null
@@ -1,390 +0,0 @@
-#include "common.h"
-
-// -----------------------------------------------------------------------
-// globals
-
-/*
- * case mappings are stored as a biased difference so that the value stays
- * non-negative: DELTA adds OFFSET when the table is built and the generated
- * code subtracts OFFSET again when it applies the mapping.
- */
-#define OFFSET (1 << 20)
-#define DELTA(mapx, x) (OFFSET + (mapx) - (x))
-
-// TODO: use bitarrays. will reduce executable size 8x
-/* one slot per code point for every property and mapping collected from UnicodeData.txt */
-struct Table
-{
- /* properties */
- /* (flags: nonzero when the code point has the property) */
- char isspace[NumRunes];
- char isalpha[NumRunes];
- char ismark[NumRunes];
- char isdigit[NumRunes];
- char isupper[NumRunes];
- char islower[NumRunes];
- char istitle[NumRunes];
- char ispunct[NumRunes];
- char issymbl[NumRunes];
- char iscntrl[NumRunes];
-
- /* never written or read in this file */
- char combine[NumRunes];
-
- /* transformations */
- /* (target code point of each mapping; main initializes them to the identity) */
- int toupper[NumRunes];
- int tolower[NumRunes];
- int totitle[NumRunes];
-};
-
-static struct Table table;
-
-// -----------------------------------------------------------------------
-// internal functions
-
-/*
- * prints the range table of property prop under the name is<label>_range.
- * force is passed on to putrange (print runs of length one as well).
- * returns 1 if a table was printed, 0 otherwise.
- */
-static
-int
-isrange(char *label, char *prop, int force)
-{
-	int n;
-	char ident[128];
-
-	/* snprintf returns the untruncated length: anything >= the size did not fit */
-	n = snprintf(ident, arrlen(ident), "is%s_range", label);
-	if(n < 0 || n >= (int)arrlen(ident))
-		panicf("out of identifier space\n");
-
-	return putrange(ident, prop, force);
-}
-
-/*
- * prints the pair table of property prop under the name is<label>_pair.
- * returns 1 if a table was printed, 0 otherwise.
- */
-static
-int
-ispair(char *label, char *prop)
-{
-	int n;
-	char ident[128];
-
-	/* snprintf returns the untruncated length: anything >= the size did not fit */
-	n = snprintf(ident, arrlen(ident), "is%s_pair", label);
-	if(n < 0 || n >= (int)arrlen(ident))
-		panicf("out of identifier space\n");
-
-	return putpair(ident, prop);
-}
-
-/*
- * prints the table of single code points of property prop under the name
- * is<label>_single. returns 1 if a table was printed, 0 otherwise.
- */
-static
-int
-issingle(char *label, char *prop)
-{
-	int n;
-	char ident[128];
-
-	/* snprintf returns the untruncated length: anything >= the size did not fit */
-	n = snprintf(ident, arrlen(ident), "is%s_single", label);
-	if(n < 0 || n >= (int)arrlen(ident))
-		panicf("out of identifier space\n");
-
-	return putsingle(ident, prop);
-}
-
-/*
- * prints the tables of one property followed by the predicate
- * utf8·is<label>(rune) that searches them.
- *
- * label:      suffix of the generated names
- * table:      per code point flags; consumed (cleared) by the table printers
- * pairs:      also try a table of every-other code point runs
- * onlyranges: put everything in the range table and skip the other two
- *
- * the generated predicate only contains a lookup for the tables that were
- * actually printed.
- */
-static
-void
-makeis(char *label, char *table, int pairs, int onlyranges)
-{
- int hasr, hasp=0, hass=0;
-
- /* order matters: each printer removes what it printed from table */
- hasr = isrange(label, table, onlyranges);
- if(!onlyranges && pairs)
- hasp = ispair(label, table);
- if(!onlyranges)
- hass = issingle(label, table);
-
- printf(
- "int\n"
- "utf8·is%s(rune c)\n"
- "{\n"
- " rune *p;\n"
- "\n",
- label);
-
- if(hasr){
- printf(
- " p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n"
- " if(p && c >= p[0] && c <= p[1])\n"
- " return 1;\n",
- label, label);
- }
-
- if(hasp){
- /* the parity test keeps only every other code point of the run */
- printf(
- " p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n"
- " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
- " return 1;\n",
- label, label);
- }
-
- if(hass)
- printf(
- " p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n"
- " if(p && c == p[0])\n"
- " return 1;\n",
- label, label);
-
- printf(
- " return 0;\n"
- "}\n"
- "\n");
-}
-
-/*
- * prints the runs of consecutive code points that share the same mapping
- * delta as the static rune array to<label>_range, one "first, last, delta"
- * triple per run. runs of length one are only printed when force is set.
- * entries that were printed are reset to the identity mapping in index.
- * returns 1 if anything was printed, 0 otherwise.
- */
-static
-int
-torange(char *label, int *index, int force)
-{
-	int lo, hi, delta, opened;
-
-	opened = 0;
-	lo = 0;
-	while(lo < NumRunes){
-		if(index[lo] == lo){
-			lo++;
-			continue;
-		}
-
-		delta = DELTA(index[lo], lo);
-		if(delta != (rune)delta)
-			panicf("bad map delta %d", delta);
-
-		/* extend the run, resetting everything after its first entry */
-		hi = lo + 1;
-		while(hi < NumRunes && DELTA(index[hi], hi) == delta){
-			index[hi] = hi;
-			hi++;
-		}
-
-		if(force || hi != lo + 1){
-			if(!opened){
-				printf("static rune to%s_range[] = {\n", label);
-				opened = 1;
-			}
-			index[lo] = lo;
-			printf("\t0x%.4x, 0x%.4x, %d,\n", lo, hi-1, delta);
-		}
-
-		lo = hi;
-	}
-
-	if(opened)
-		printf("};\n\n");
-
-	return opened;
-}
-
-/*
- * prints runs of code points two apart (l, l+2, l+4, ...) that share the same
- * mapping delta as the static rune array to<label>_pair, one
- * "first, last, delta" triple per run. a run needs at least two members.
- * entries that were printed are reset to the identity mapping in index.
- * returns 1 if anything was printed, 0 otherwise.
- */
-static
-int
-topair(char *label, int *index)
-{
-	int lo, hi, delta, opened;
-
-	opened = 0;
-	lo = 0;
-	while(lo + 2 < NumRunes){
-		if(index[lo] == lo){
-			lo++;
-			continue;
-		}
-
-		delta = DELTA(index[lo], lo);
-		if(delta != (rune)delta)
-			panicf("bad delta %d", delta);
-
-		/* extend the run in steps of two, resetting all but the first entry */
-		hi = lo + 2;
-		while(hi < NumRunes && DELTA(index[hi], hi) == delta){
-			index[hi] = hi;
-			hi += 2;
-		}
-
-		if(hi > lo + 2){
-			if(!opened){
-				printf("static rune to%s_pair[] = {\n", label);
-				opened = 1;
-			}
-			index[lo] = lo;
-			printf("\t0x%.4x, 0x%.4x, %d,\n", lo, hi-2, delta);
-		}
-
-		/* the search resumes where the run stopped, also when nothing was printed */
-		lo = hi;
-	}
-
-	if(opened)
-		printf("};\n\n");
-
-	return opened;
-}
-
-/*
- * prints every remaining non-identity mapping as the static rune array
- * to<label>_single, one "code point, delta" pair per line, and resets the
- * entry to the identity mapping.
- * returns 1 if anything was printed, 0 otherwise.
- */
-static
-int
-tosingle(char *label, int *index)
-{
-	int cp, delta, opened;
-
-	opened = 0;
-	cp = 0;
-	while(cp < NumRunes){
-		if(index[cp] != cp){
-			delta = DELTA(index[cp], cp);
-			if(delta != (rune)delta)
-				panicf("bad map delta %d", delta);
-
-			if(!opened){
-				printf("static rune to%s_single[] = {\n", label);
-				opened = 1;
-			}
-			index[cp] = cp;
-			printf("\t0x%.4x, %d,\n", cp, delta);
-		}
-		cp++;
-	}
-
-	if(opened)
-		printf("};\n\n");
-
-	return opened;
-}
-
-/*
- * prints the tables of one case mapping followed by the function
- * utf8·to<label>(rune) that applies it.
- *
- * label:     suffix of the generated names
- * index:     per code point mapping target; consumed (reset to the identity)
- *            by the table printers
- * pairs:     also try a table of every-other code point runs
- * onlyrange: skip the pair and single tables
- *
- * the generated function only contains a lookup for the tables that were
- * actually printed and returns its argument when no table matches.
- */
-static
-void
-mkto(char *label, int *index, int pairs, int onlyrange)
-{
- int hasr, hasp=0, hass=0;
-
- /*
- * NOTE(review): the force argument here is !onlyrange while makeis passes
- * onlyranges itself. with onlyrange == 0 torange prints runs of length one
- * too, which leaves nothing for topair and tosingle - confirm intent
- */
- hasr = torange(label, index, !onlyrange);
- if(!onlyrange && pairs)
- hasp = topair(label, index);
- if(!onlyrange)
- hass = tosingle(label, index);
-
- printf(
- "rune\n"
- "utf8·to%s(rune c)\n"
- "{\n"
- " rune *p;\n"
- "\n",
- label);
-
- /* the stored deltas carry a bias of OFFSET, which the generated code subtracts again */
- if(hasr)
- printf(
- " p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n"
- " if(p && c >= p[0] && c <= p[1])\n"
- " return c + p[2] - %d;\n",
- label, label, OFFSET);
-
- if(hasp)
- printf(
- " p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n"
- " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
- " return c + p[2] - %d;\n",
- label, label, OFFSET);
-
- if(hass)
- printf(
- " p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n"
- " if(p && c == p[0])\n"
- " return c + p[1] - %d;\n",
- label, label, OFFSET);
-
-
- printf(
- " return c;\n"
- "}\n"
- "\n"
- );
-}
-
-// -----------------------------------------------------------------------
-// main point of entry
-
-/* prints the command line synopsis to stderr and exits with status 1 */
-static
-void
-usage(void)
-{
-	fputs("usage: mkrunetype <UnicodeData.txt>\n", stderr);
-	exit(1);
-}
-
-/*
- * generator for the rune type tables.
- * reads the UnicodeData.txt file named by the single argument, records the
- * general category and the case mappings of every code point, and prints the
- * resulting C source (search helper, is* predicates, to* mappings) to stdout.
- */
-int
-main(int argc, char *argv[])
-{
- int i, sc, c, ec;
- io·Stream *utf8;
- char *prop, *field[NumFields], line[1024];
-
- ARGBEGIN{
- }ARGEND;
-
- if(argc != 1)
- usage();
-
- if(!(utf8 = io·open(argv[0], "r")))
- panicf("can't open %s\n", argv[0]);
-
- /* by default each character maps to itself */
- for(i = 0; i < NumRunes; i++) {
- table.toupper[i] = i;
- table.tolower[i] = i;
- table.totitle[i] = i;
- }
-
- /* ensure all C locale white space characters pass */
- table.isspace['\t'] = 1;
- table.isspace['\n'] = 1;
- table.isspace['\r'] = 1;
- table.isspace['\f'] = 1;
- table.isspace['\v'] = 1;
- table.isspace[0x85] = 1;
-
- /* ec carries the end of the previous range into codepointrange; -1 means none yet */
- ec = -1;
- // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any
- // (a ParseSkip result is nonzero and would be handled like a data line)
- while(parse(utf8, arrlen(field), field, arrlen(line), line)){
- /* parse unicode range */
- codepointrange(utf8, field, &sc, &ec);
- prop = field[Fcategory];
-
- for(c = sc; c <= ec; c++){
- /* grab properties */
- /* (dispatch on the first letter of the general category) */
- switch(prop[0]){
- case 'L':
- table.isalpha[c] = 1;
- switch(prop[1]){
- case 'u': table.isupper[c] = 1; break;
- case 'l': table.islower[c] = 1; break;
- case 't': table.istitle[c] = 1; break;
- case 'm': break; // modifier letters
- case 'o': break; // ideograph letters
- default:
- goto badproperty;
- }
- break;
-
- case 'Z':
- table.isspace[c] = 1;
- break;
-
- case 'M':
- table.ismark[c] = 1;
- break;
-
- case 'N':
- table.isdigit[c] = 1;
- break;
-
- case 'P':
- table.ispunct[c] = 1;
- break;
-
- case 'S':
- table.issymbl[c] = 1;
- break;
-
- case 'C':
- table.iscntrl[c] = 1;
- break;
-
- default: badproperty:
- panicf("unrecognized category '%s'", prop);
- }
- /* grab transformations */
- /* (an empty field leaves the identity mapping in place) */
- if(*field[Fupper])
- table.toupper[c] = codepoint(field[Fupper]);
- if(*field[Flower])
- table.tolower[c] = codepoint(field[Flower]);
- if(*field[Ftitle])
- table.totitle[c] = codepoint(field[Ftitle]);
- }
- }
- io·close(utf8);
-
- putsearch();
-
- /* the ismark, issymbl and iscntrl flags collected above are not printed */
- makeis("space", table.isspace, 0, 1);
- makeis("digit", table.isdigit, 0, 1);
- makeis("alpha", table.isalpha, 0, 0);
- makeis("upper", table.isupper, 1, 0);
- makeis("lower", table.islower, 1, 0);
- makeis("title", table.istitle, 1, 0);
- makeis("punct", table.ispunct, 1, 0);
-
- mkto("upper", table.toupper, 1, 0);
- mkto("lower", table.tolower, 1, 0);
- mkto("title", table.totitle, 1, 0);
-
- return 0;
-}
diff --git a/src/libutf/vendor/mkrunewidth.c b/src/libutf/vendor/mkrunewidth.c
deleted file mode 100644
index 14e6973..0000000
--- a/src/libutf/vendor/mkrunewidth.c
+++ /dev/null
@@ -1,325 +0,0 @@
-#include "common.h"
-
-/*
- * inspired by design choices in utf8proc/charwidths.jl
- * all widths default to 1 unless they fall within the categories:
- * 1. Mn 2. Mc 3. Me 4. Zl
- * 5. Zp 6. Cc 7. Cf 8. Cs
- * these default to zero width
- */
-/*
- * east asian width classes.
- * NOTE(review): none of these constants is referenced in this file;
- * parse_eawidths matches the class letters directly
- */
-enum
-{
- /* width ? */
- WidthNeutral, /* (N) practically treated like narrow but unclear ... */
- WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
- /* width 1 */
- WidthHalf, /* (H) = to narrow (compatibility equivalent) */
- WidthNarrow, /* (Na) ASCII width */
- /* width 2 */
- WidthWide, /* (W) 2x width */
- WidthFull, /* (F) = to wide (compatibility equivalent) */
-};
-
-/*
- * width[w][c] is nonzero when code point c has display width w (0, 1 or 2).
- * main panics if a code point ends up flagged in more than one row.
- */
-struct Table
-{
- char width[3][NumRunes];
-};
-
-static struct Table table;
-
-// -----------------------------------------------------------------------
-// internal functions
-
-/*
- * seeds the width table from the general categories in the UnicodeData.txt
- * file at path: every listed code point gets width 1, except for the
- * categories Mn, Mc, Me, Zl, Zp, Cc, Cf and Cs, which get width 0.
- * panics if the file can not be opened.
- */
-static
-void
-parse_category(char *path)
-{
-	int sc, c, ec, w;
-	io·Stream *utf8;
-	char *prop, *field[NumFields], line[1024];
-
-	if(!(utf8 = io·open(path, "r")))
-		panicf("can't open %s\n", path);
-
-	// NOTE: we don't check for comments here
-	ec = -1;
-	while(parse(utf8, arrlen(field), field, arrlen(line), line)){
-		codepointrange(utf8, field, &sc, &ec);
-
-		prop = field[Fcategory];
-
-		switch(prop[0]){
-		case 'M':
-			switch(prop[1]){
-			case 'n': case 'c': case 'e':
-				w = 0;
-				break;
-			default:
-				w = 1;
-				break;
-			}
-			break;
-		case 'Z':
-			switch(prop[1]){
-			case 'l': case 'p':
-				w = 0;
-				break;
-			default:
-				w = 1;
-				break;
-			}
-			break;
-		case 'C':
-			switch(prop[1]){
-			case 'c': case 'f': case 's':
-				w = 0;
-				break;
-			default:
-				w = 1;
-				break;
-			}
-			/* without this break the default below would overwrite w with 1 */
-			break;
-		default:
-			w = 1;
-			break;
-		}
-
-		for(c = sc; c <= ec; c++)
-			table.width[w][c] = 1;
-	}
-
-	io·close(utf8);
-}
-
-/*
- * parses a code point field of the form "XXXX" or "XXXX..YYYY".
- * stores the first code point in *l and the last one in *r (both are the
- * same for the short form). the ".." separator is overwritten with nul bytes.
- */
-static
-void
-coderange(char *field, int *l, int *r)
-{
-	char *sep;
-
-	sep = strstr(field, "..");
-	if(sep == nil){
-		*l = *r = codepoint(field);
-		return;
-	}
-
-	/* cut the field in two at the separator */
-	sep[0] = 0;
-	sep[1] = 0;
-
-	*l = codepoint(field);
-	*r = codepoint(sep + 2);
-}
-
-/*
- * applies the east asian width classes from the file at path to the width
- * table: H and Na become width 1, W and F become width 2. classes A and N are
- * skipped, which leaves the width chosen earlier untouched.
- * panics if the file can not be opened or a class is not recognized.
- */
-static
-void
-parse_eawidths(char *path)
-{
-	int at, w;
-	int l, c, r;
-	io·Stream *utf8;
-	char *field[2], line[1024];
-
-	if(!(utf8 = io·open(path, "r")))
-		panicf("can't open %s\n", path);
-
-	while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
-		if(at == ParseSkip)
-			continue;
-
-		switch(field[1][0]){
-		case 'A': continue;
-		case 'N':
-			/* plain N is neutral; only Na (narrow) sets a width */
-			if(field[1][1] != 'a')
-				continue;
-			/* fallthrough */
-		case 'H': w = 1; break;
-
-		case 'W': /* fallthrough */
-		case 'F': w = 2; break;
-
-		default:
-			panicf("malformed east asian width class: %s\n", field[1]);
-		}
-
-		coderange(field[0], &l, &r);
-
-		for(c=l; c <= r; c++){
-			/* ensure it only exists in one table */
-			table.width[w][c] = 1;
-			table.width[(w+1)%3][c] = 0;
-			table.width[(w+2)%3][c] = 0;
-		}
-	}
-	io·close(utf8);
-}
-
-/*
- * forces width 2 for every code point that the emoji data file at path
- * lists with the property Emoji_Presentation; all other lines are ignored.
- * panics if the file can not be opened.
- */
-static
-void
-parse_emoji(char *path)
-{
-	int at;
-	int l, c, r;
-	io·Stream *utf8;
-	char *s, *field[2], line[1024];
-
-	if(!(utf8 = io·open(path, "r")))
-		panicf("can't open %s\n", path);
-
-	while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
-		if(at == ParseSkip)
-			continue;
-
-		/* only override emoji presentation */
-		if(!strstr(field[1], "Emoji_Presentation"))
-			continue;
-
-		/* trim trailing space */
-		for(s=field[0]; *s; s++){
-			if(*s == ' ')
-				*s = 0;
-		}
-
-		coderange(field[0], &l, &r);
-
-		for(c=l; c <= r; c++){
-			table.width[0][c] = 0;
-			table.width[1][c] = 0;
-			table.width[2][c] = 1;
-		}
-	}
-
-	io·close(utf8);
-}
-
-/* output functions */
-
-/*
- * prints the tables of one width class followed by a static predicate
- * is<label>(rune) that searches them.
- *
- * label:      name of the predicate (after "is") and prefix of the tables
- * table:      per code point flags; consumed (cleared) by the table printers
- * pairs:      also try a table of every-other code point runs
- * onlyranges: put everything in the range table and skip the other two
- *
- * panics when a generated identifier does not fit its buffer.
- */
-static
-void
-maketable(char *label, char *table, int pairs, int onlyranges)
-{
-	int n, r, p=0, s=0;
-	char ident[3][128];
-
-	enum
-	{
-		Irange,
-		Ipair,
-		Isingle,
-	};
-
-	/*
-	 * snprintf returns the untruncated length, so every result >= the buffer
-	 * size (not just == it) means the identifier was cut short
-	 */
-
-	/* ranges */
-	n = snprintf(ident[Irange], arrlen(ident[Irange]), "%s_range", label);
-	if(n < 0 || n >= (int)arrlen(ident[Irange]))
-		panicf("out of identifier space\n");
-	r = putrange(ident[Irange], table, onlyranges);
-
-	if(!onlyranges && pairs){
-		n = snprintf(ident[Ipair], arrlen(ident[Ipair]), "%s_pair", label);
-		if(n < 0 || n >= (int)arrlen(ident[Ipair]))
-			panicf("out of identifier space\n");
-		p = putpair(ident[Ipair], table);
-	}
-	if(!onlyranges){
-		n = snprintf(ident[Isingle], arrlen(ident[Isingle]), "%s_single", label);
-		if(n < 0 || n >= (int)arrlen(ident[Isingle]))
-			panicf("out of identifier space\n");
-
-		s = putsingle(ident[Isingle], table);
-	}
-
-	printf(
-		"static int\n"
-		"is%s(rune c)\n"
-		"{\n"
-		" rune *p;\n"
-		"\n",
-		label);
-
-	/* only tables that were actually printed get a lookup */
-	if(r){
-		printf(
-			" p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
-			" if(p && c >= p[0] && c <= p[1])\n"
-			" return 1;\n",
-			ident[Irange], ident[Irange]);
-	}
-
-	if(p){
-		printf(
-			" p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
-			" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
-			" return 1;\n",
-			ident[Ipair], ident[Ipair]);
-	}
-
-	if(s)
-		printf(
-			" p = rangesearch(c, %s, arrlen(%s), 1);\n"
-			" if(p && c == p[0])\n"
-			" return 1;\n",
-			ident[Isingle], ident[Isingle]);
-
-	printf(
-		" return 0;\n"
-		"}\n"
-		"\n");
-}
-
-// -----------------------------------------------------------------------
-// main point of entry
-
-/* prints the command line synopsis to stderr and exits with status 1 */
-static
-void
-usage(void)
-{
-	fputs("usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <EmojiData.txt>\n", stderr);
-	exit(1);
-}
-
-/*
- * SETWn(c) marks code point c as having width n and clears it from the other
- * two rows. the do/while wrapper makes each macro a single statement that
- * takes the caller's semicolon, so it is safe inside an unbraced if/else.
- */
-#define SETW0(c) \
-	do{ \
-		table.width[0][(c)] = 1; \
-		table.width[1][(c)] = 0; \
-		table.width[2][(c)] = 0; \
-	}while(0)
-
-#define SETW1(c) \
-	do{ \
-		table.width[0][(c)] = 0; \
-		table.width[1][(c)] = 1; \
-		table.width[2][(c)] = 0; \
-	}while(0)
-
-#define SETW2(c) \
-	do{ \
-		table.width[0][(c)] = 0; \
-		table.width[1][(c)] = 0; \
-		table.width[2][(c)] = 1; \
-	}while(0)
-
-
-/*
- * generator for the rune width tables.
- * takes three file names (UnicodeData.txt, EastAsianWidth.txt, emoji data),
- * builds the width table from them in that order, applies a few fixed
- * overrides and prints the resulting C source to stdout.
- */
-int
-main(int argc, char *argv[])
-{
- int c;
-
- ARGBEGIN{
- }ARGEND;
-
- if(argc != 3)
- usage();
-
- /* later passes override earlier ones */
- parse_category(*argv++);
- parse_eawidths(*argv++);
- parse_emoji(*argv);
-
- /* overrides */
- SETW0(0x2028);
- SETW0(0x2029);
-
- SETW1(0x00AD);
-
- /* simple checking */
- /* (a code point may be flagged in at most one of the three rows) */
- for(c=0; c<NumRunes; c++){
- if(table.width[0][c] + table.width[1][c] + table.width[2][c] > 1)
- panicf("improper table state");
- }
-
- putsearch();
-
- maketable("width0", table.width[0], 1, 0);
- maketable("width1", table.width[1], 1, 0);
- maketable("width2", table.width[2], 1, 0);
-
- /* the emitted utf8·runewidth only consults iswidth1 and iswidth2; anything else yields 0 */
- puts(
- "\n"
- "int\n"
- "utf8·runewidth(rune c)\n"
- "{\n"
- " if(iswidth1(c))\n"
- " return 1;\n"
- " if(iswidth2(c))\n"
- " return 2;\n"
- " return 0;\n"
- "}"
- );
-}