From 7ea1cdb7d31f00024f5a1d124b42cd19a03b959a Mon Sep 17 00:00:00 2001 From: Nicholas Date: Wed, 10 Nov 2021 20:12:45 -0800 Subject: chore: libunicode -> libutf --- include/base.h | 8 +- include/libunicode.h | 38 ---- include/libutf.h | 38 ++++ sys/base/mmap.c | 9 +- sys/cmd/dwm/dwm.h | 2 +- sys/cmd/dwm/rules.mk | 2 +- sys/cmd/ic/ic.c | 40 ++-- sys/cmd/ic/rules.mk | 2 +- sys/cmd/ic/strlcpy.c | 14 +- sys/cmd/menu/menu.h | 2 +- sys/cmd/rc/rc.h | 2 +- sys/cmd/rc/rules.mk | 2 +- sys/cmd/rules.mk | 3 + sys/cmd/term/rules.mk | 4 +- sys/cmd/term/term.h | 2 +- sys/libunicode/canfit.c | 23 -- sys/libunicode/decode.c | 98 --------- sys/libunicode/decodeprev.c | 60 ----- sys/libunicode/encode.c | 69 ------ sys/libunicode/find.c | 31 --- sys/libunicode/findlast.c | 32 --- sys/libunicode/internal.h | 38 ---- sys/libunicode/len.c | 21 -- sys/libunicode/rules.mk | 76 ------- sys/libunicode/runelen.c | 8 - sys/libunicode/vendor/common.c | 220 ------------------- sys/libunicode/vendor/common.h | 46 ---- sys/libunicode/vendor/mkgraphemedata.c | 24 -- sys/libunicode/vendor/mkrunetype.c | 388 --------------------------------- sys/libunicode/vendor/mkrunewidth.c | 325 --------------------------- sys/libutf/canfit.c | 23 ++ sys/libutf/decode.c | 98 +++++++++ sys/libutf/decodeprev.c | 60 +++++ sys/libutf/encode.c | 69 ++++++ sys/libutf/find.c | 31 +++ sys/libutf/findlast.c | 32 +++ sys/libutf/internal.h | 38 ++++ sys/libutf/len.c | 21 ++ sys/libutf/rules.mk | 76 +++++++ sys/libutf/runelen.c | 8 + sys/libutf/runetype-14.0.0.c | 111 ++++++++++ sys/libutf/runewidth-14.0.0.c | 71 ++++++ sys/rules.mk | 2 +- 43 files changed, 724 insertions(+), 1543 deletions(-) delete mode 100644 include/libunicode.h create mode 100644 include/libutf.h delete mode 100644 sys/libunicode/canfit.c delete mode 100644 sys/libunicode/decode.c delete mode 100644 sys/libunicode/decodeprev.c delete mode 100644 sys/libunicode/encode.c delete mode 100644 sys/libunicode/find.c delete mode 100644 sys/libunicode/findlast.c 
delete mode 100644 sys/libunicode/internal.h delete mode 100644 sys/libunicode/len.c delete mode 100644 sys/libunicode/rules.mk delete mode 100644 sys/libunicode/runelen.c delete mode 100644 sys/libunicode/vendor/common.c delete mode 100644 sys/libunicode/vendor/common.h delete mode 100644 sys/libunicode/vendor/mkgraphemedata.c delete mode 100644 sys/libunicode/vendor/mkrunetype.c delete mode 100644 sys/libunicode/vendor/mkrunewidth.c create mode 100644 sys/libutf/canfit.c create mode 100644 sys/libutf/decode.c create mode 100644 sys/libutf/decodeprev.c create mode 100644 sys/libutf/encode.c create mode 100644 sys/libutf/find.c create mode 100644 sys/libutf/findlast.c create mode 100644 sys/libutf/internal.h create mode 100644 sys/libutf/len.c create mode 100644 sys/libutf/rules.mk create mode 100644 sys/libutf/runelen.c create mode 100644 sys/libutf/runetype-14.0.0.c create mode 100644 sys/libutf/runewidth-14.0.0.c diff --git a/include/base.h b/include/base.h index a7813e6..b841040 100644 --- a/include/base.h +++ b/include/base.h @@ -261,12 +261,12 @@ int bufio·read(io·Buffer *buf, int sz, int n, void *out); // ----------------------------------------------------------------------------- // memory mapped files -typedef struct mmap·Reader +typedef struct mmap·Reader { vlong len; - union { - byte *buf; - ubyte *ubuf; + union{ + char *b; + uchar *u; }; } mmap·Reader; diff --git a/include/libunicode.h b/include/libunicode.h deleted file mode 100644 index 25d6dee..0000000 --- a/include/libunicode.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -typedef uint32 rune; - -#define UTFmax 4 -#define RuneErr 0xFFFDu -#define RuneMax 0x10FFFFu - -#define utf8·onebyte(c) (((c)&0x80u)==0x00u) -#define utf8·twobyte(c) (((c)&0xE0u)==0xC0u) -#define utf8·threebyte(c) (((c)&0xF0u)==0xE0u) -#define utf8·fourbyte(c) (((c)&0xF8u)==0xF0u) - -/* - * UTF-8 functions. 
- */ -int utf8·len(char *s); // returns number of runes for s -int utf8·runelen(rune r); // returns number of bytes for rune -int utf8·runewidth(rune r); // returns number of printed columns for rune - -int utf8·decode(char *, rune *); // decode 1 rune from char stream, store into rune, return number of bytes -int utf8·encode(rune *, char *); // encode 1 rune from rune stream, store into char, return number of bytes -int utf8·decodeprev(char *s, rune *r); // decode 1 rune from char stream, reading backwards, store into rune, return number of bytes - -char *utf8·find(char *s, rune); // find rune in char stream -char *utf8·findlast(char* s, rune); // find last rune in char stream - -int utf8·canfit(char *, int); // XXX: odd function... - -int utf8·isalpha(rune r); -int utf8·isdigit(rune r); -int utf8·isspace(rune r); -int utf8·istitle(rune r); -int utf8·ispunct(rune r); - -rune utf8·toupper(rune r); -rune utf8·tolower(rune r); -rune utf8·totitle(rune r); diff --git a/include/libutf.h b/include/libutf.h new file mode 100644 index 0000000..25d6dee --- /dev/null +++ b/include/libutf.h @@ -0,0 +1,38 @@ +#pragma once + +typedef uint32 rune; + +#define UTFmax 4 +#define RuneErr 0xFFFDu +#define RuneMax 0x10FFFFu + +#define utf8·onebyte(c) (((c)&0x80u)==0x00u) +#define utf8·twobyte(c) (((c)&0xE0u)==0xC0u) +#define utf8·threebyte(c) (((c)&0xF0u)==0xE0u) +#define utf8·fourbyte(c) (((c)&0xF8u)==0xF0u) + +/* + * UTF-8 functions. 
+ */ +int utf8·len(char *s); // returns number of runes for s +int utf8·runelen(rune r); // returns number of bytes for rune +int utf8·runewidth(rune r); // returns number of printed columns for rune + +int utf8·decode(char *, rune *); // decode 1 rune from char stream, store into rune, return number of bytes +int utf8·encode(rune *, char *); // encode 1 rune from rune stream, store into char, return number of bytes +int utf8·decodeprev(char *s, rune *r); // decode 1 rune from char stream, reading backwards, store into rune, return number of bytes + +char *utf8·find(char *s, rune); // find rune in char stream +char *utf8·findlast(char* s, rune); // find last rune in char stream + +int utf8·canfit(char *, int); // XXX: odd function... + +int utf8·isalpha(rune r); +int utf8·isdigit(rune r); +int utf8·isspace(rune r); +int utf8·istitle(rune r); +int utf8·ispunct(rune r); + +rune utf8·toupper(rune r); +rune utf8·tolower(rune r); +rune utf8·totitle(rune r); diff --git a/sys/base/mmap.c b/sys/base/mmap.c index 6edfe28..7f3062d 100644 --- a/sys/base/mmap.c +++ b/sys/base/mmap.c @@ -15,19 +15,19 @@ mmap·open(byte *filename) s = io·open(filename, "r"); fd = io·fd(s); err = io·stat(s, &st); - if (err) { + if(err){ errorf("file stat: error code %d", err); goto ERROR; } buf = mmap(nil, st.st_size, PROT_READ, MAP_SHARED, fd, 0); - if (!buf) { + if(!buf){ errorf("mmap: failed"); goto ERROR; } // NOTE: posix systems require that reference kept to mmap file after fd is closed io·close(s); - return (mmap·Reader){.len=st.st_size, .buf=buf}; + return (mmap·Reader){.len=st.st_size, .b=buf}; ERROR: io·close(s); @@ -37,7 +37,6 @@ ERROR: error mmap·close(mmap·Reader rdr) { - munmap(rdr.buf, rdr.len); - + munmap(rdr.b, rdr.len); return 0; } diff --git a/sys/cmd/dwm/dwm.h b/sys/cmd/dwm/dwm.h index 9b616d1..afec1f2 100644 --- a/sys/cmd/dwm/dwm.h +++ b/sys/cmd/dwm/dwm.h @@ -2,7 +2,7 @@ #pragma once #include #include -#include +#include #include #include diff --git a/sys/cmd/dwm/rules.mk 
b/sys/cmd/dwm/rules.mk index f7254cd..79c4548 100644 --- a/sys/cmd/dwm/rules.mk +++ b/sys/cmd/dwm/rules.mk @@ -22,7 +22,7 @@ $(BINS_$(d)): TCLIBS = \ `$(PKG) --libs freetype2` \ -lX11 -lXinerama -lXft -lX11-xcb -lxcb -lxcb-res -$(BINS_$(d)): $(OBJS_$(d)) $(OBJ_DIR)/sys/libunicode/libunicode.a $(OBJ_DIR)/sys/base/base.a +$(BINS_$(d)): $(OBJS_$(d)) $(OBJ_DIR)/sys/libutf/libutf.a $(OBJ_DIR)/sys/base/base.a $(COMPLINK) include share/pop.mk diff --git a/sys/cmd/ic/ic.c b/sys/cmd/ic/ic.c index 19b868d..7fc37d8 100644 --- a/sys/cmd/ic/ic.c +++ b/sys/cmd/ic/ic.c @@ -1,6 +1,6 @@ /* See LICENSE file for license details. */ #include -#include +#include #include #include @@ -308,7 +308,7 @@ channel_rm(Channel *c) free(c); } -static +static void channel_leave(Channel *c) { @@ -321,7 +321,7 @@ channel_leave(Channel *c) channel_rm(c); } -static +static void loginkey(int ircfd, const char *key) { @@ -347,13 +347,13 @@ udsopen(const char *uds) size_t len; int fd; - if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + if((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { fprintf(stderr, "%s: socket: %s\n", argv0, strerror(errno)); exit(1); } sun.sun_family = AF_UNIX; - if (strlcpy(sun.sun_path, uds, sizeof(sun.sun_path)) >= sizeof(sun.sun_path)) { + if(strlcpy(sun.sun_path, uds, sizeof(sun.sun_path)) >= sizeof(sun.sun_path)) { fprintf(stderr, "%s: UNIX domain socket path truncation\n", argv0); exit(1); } @@ -403,7 +403,7 @@ tcpopen(const char *host, const char *service) return fd; } -static +static int isnumeric(const char *s) { @@ -464,7 +464,7 @@ proc_channels_privmsg(int ircfd, Channel *c, char *buf) ewritestr(ircfd, msg); } -static +static void proc_channels_input(int ircfd, Channel *c, char *buf) { @@ -700,8 +700,8 @@ handle_channels_input(int ircfd, Channel *c) { char buf[IRC_MSG_MAX]; - if (read_line(c->fdin, buf, sizeof(buf)) == -1) { - if (channel_reopen(c) == -1) + if(read_line(c->fdin, buf, sizeof(buf)) == -1) { + if(channel_reopen(c) == -1) channel_rm(c); return; } @@ 
-755,7 +755,7 @@ run(int ircfd, const char *host) int r, maxfd; snprintf(ping_msg, sizeof(ping_msg), "PING %s\r\n", host); - while (isrunning) { + while(isrunning) { maxfd = ircfd; FD_ZERO(&rdset); FD_SET(ircfd, &rdset); @@ -767,12 +767,12 @@ run(int ircfd, const char *host) memset(&tv, 0, sizeof(tv)); tv.tv_sec = 120; r = select(maxfd + 1, &rdset, 0, 0, &tv); - if (r < 0) { + if(r < 0){ if (errno == EINTR) continue; fprintf(stderr, "%s: select: %s\n", argv0, strerror(errno)); exit(1); - } else if (r == 0) { + }else if(r == 0){ if (time(nil) - last_response >= PING_TIMEOUT) { channel_print(channelmaster, "-!- ii shutting down: ping timeout"); exit(2); /* status code 2 for timeout */ @@ -780,11 +780,11 @@ run(int ircfd, const char *host) ewritestr(ircfd, ping_msg); continue; } - if (FD_ISSET(ircfd, &rdset)) { + if(FD_ISSET(ircfd, &rdset)) { handle_server_output(ircfd); last_response = time(nil); } - for (c = channels; c; c = tmp) { + for(c = channels; c; c = tmp) { tmp = c->next; if (FD_ISSET(c->fdin, &rdset)) handle_channels_input(ircfd, c); @@ -803,7 +803,7 @@ main(int argc, char *argv[]) int ircfd, r; /* use nickname and home dir of user by default */ - if (!(spw = getpwuid(getuid()))) { + if(!(spw = getpwuid(getuid()))) { fprintf(stderr, "%s: getpwuid: %s\n", argv0, strerror(errno)); exit(1); } @@ -837,10 +837,10 @@ main(int argc, char *argv[]) break; } ARGEND - if (!*host) + if(!*host) usage(); - if (uds) + if(uds) ircfd = udsopen(uds); else ircfd = tcpopen(host, service); @@ -861,15 +861,15 @@ main(int argc, char *argv[]) create_dirtree(ircpath); channelmaster = channel_add(""); /* master channel */ - if (key) + if(key) loginkey(ircfd, key); loginuser(ircfd, host, fullname && *fullname ? 
fullname : nick); setup(); run(ircfd, host); - if (channelmaster) + if(channelmaster) channel_leave(channelmaster); - for (c = channels; c; c = tmp) { + for(c = channels; c; c = tmp) { tmp = c->next; channel_leave(c); } diff --git a/sys/cmd/ic/rules.mk b/sys/cmd/ic/rules.mk index c373dfb..649c9ac 100644 --- a/sys/cmd/ic/rules.mk +++ b/sys/cmd/ic/rules.mk @@ -8,7 +8,7 @@ BINS_$(d) := $(d)/ic include share/paths.mk # Local rules -$(BINS_$(d)): $(OBJS_$(d)) $(OBJ_DIR)/libn/libn.a +$(BINS_$(d)): $(OBJS_$(d)) $(OBJ_DIR)/sys/base/base.a $(COMPLINK) include share/pop.mk diff --git a/sys/cmd/ic/strlcpy.c b/sys/cmd/ic/strlcpy.c index db0e6f0..5af7906 100644 --- a/sys/cmd/ic/strlcpy.c +++ b/sys/cmd/ic/strlcpy.c @@ -15,18 +15,18 @@ strlcpy(char *dst, const char *src, size_t siz) size_t n = siz; /* Copy as many bytes as will fit */ - if (n != 0) { - while (--n != 0) { - if ((*d++ = *s++) == '\0') + if(n != 0) { + while(--n != 0) { + if((*d++ = *s++) == '\0') break; } } /* Not enough room in dst, add NUL and traverse rest of src */ - if (n == 0) { - if (siz != 0) + if(n == 0) { + if(siz != 0) *d = '\0'; /* NUL-terminate dst */ - while (*s++) + while(*s++) ; } - return(s - src - 1); /* count does not include NUL */ + return s - src - 1; /* count does not include NUL */ } diff --git a/sys/cmd/menu/menu.h b/sys/cmd/menu/menu.h index 8f1d6d7..f4345bb 100644 --- a/sys/cmd/menu/menu.h +++ b/sys/cmd/menu/menu.h @@ -1,7 +1,7 @@ /* See LICENSE file for copyright and license details. 
*/ #include #include -#include +#include #include #include diff --git a/sys/cmd/rc/rc.h b/sys/cmd/rc/rc.h index 83c39e9..9b415fc 100644 --- a/sys/cmd/rc/rc.h +++ b/sys/cmd/rc/rc.h @@ -1,7 +1,7 @@ #include #include -#include +#include // ----------------------------------------------------------------------- // types diff --git a/sys/cmd/rc/rules.mk b/sys/cmd/rc/rules.mk index ceec50b..a2fd058 100644 --- a/sys/cmd/rc/rules.mk +++ b/sys/cmd/rc/rules.mk @@ -25,7 +25,7 @@ $(d)/parse.h $(d)/parse.c: $(d)/syntax.y yacc --header=$( #include -#include +#include #include #include diff --git a/sys/libunicode/canfit.c b/sys/libunicode/canfit.c deleted file mode 100644 index 4579ab3..0000000 --- a/sys/libunicode/canfit.c +++ /dev/null @@ -1,23 +0,0 @@ -#include "internal.h" - -/* returns 1 if string of length n is long enough to be decoded */ -int -utf8·canfit(byte* s, int n) -{ - int i; - rune c; - - if(n <= 0) - return 0; - - c = *(ubyte*)s; - if(c < TByte1) - return 1; - - if(c < TByte3) - return n >= 2; - if(c < TByte4) - return n >= 3; - - return n >= UTFmax; -} diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c deleted file mode 100644 index 01797f1..0000000 --- a/sys/libunicode/decode.c +++ /dev/null @@ -1,98 +0,0 @@ -#include "internal.h" - -#define ACCEPT 0 -#define REJECT 12 - -static uint8 decode[] = { - /* - * the first part of the table maps bytes to character classes that - * to reduce the size of the transition table and create bitmasks - */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 
11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, - - /* - * the second part is a transition table that maps a combination - * of a state of the automaton and a character class to a state - */ - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, -}; - -int -utf8·decode(char *s, rune *r) -{ - int n; - rune v; - uint8 b, t, x=ACCEPT; - - b = ((uint8 *)s)[0]; - t = decode[b]; - v = (0xFF >> t) & b; - x = decode[256+x+t]; - - for(n=1; x > REJECT && n < UTFmax; n++){ - b = ((uint8 *)s)[n]; - t = decode[b]; - v = (v << 6) | (b & TMask); - x = decode[256+x+t]; - } - - if(x != ACCEPT){ - *r = RuneErr; - return 1; - } - - *r = v; - return n; -} - -#if 0 -int -utf8·decode(byte *s, rune *r) -{ - int c[UTFmax], i; - rune l; - - c[0] = *(ubyte*)(s); - if(c[0] < Tx){ - *r = c[0]; - return 1; - } - - l = c[0]; - for(i = 1; i < UTFmax; i++){ - c[i] = *(ubyte*)(s+i); - c[i] ^= Tx; - if(c[i] & Testx) goto bad; - - l = (l << Bitx) | c[i]; - if(c[0] < Tbyte(i + 2)){ - l &= RuneX(i + 1); - if(i == 1){ - if(c[0] < Tbyte(2) || l <= Rune1) - goto bad; - }else if(l <= RuneX(i) || l > RuneMax) - goto bad; - - if(i == 2 && SurrogateMin <= l && l <= SurrogateMax) - goto bad; - - *r = l; - return i + 1; - } - } -bad: - *r = RuneErr; - return 1; -} -#endif diff --git a/sys/libunicode/decodeprev.c b/sys/libunicode/decodeprev.c deleted file mode 100644 index 27dced6..0000000 --- a/sys/libunicode/decodeprev.c +++ /dev/null @@ -1,60 +0,0 @@ -#include "internal.h" - -#define ACCEPT 0 -#define REJECT 12 - -static uint8 decode[] = { - /* - * the first part of the table maps bytes to character classes that - * to reduce the size of the transition table and create bitmasks. 
- */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, - /* - * The second part is a transition table that maps a combination - * of a state of the automaton and a character class to a state. - */ - // 0 1 2 3 4 5 6 7 8 9 10 11 - 0,24,12,12,12,12,12,24,12,24,12,12, - 0,24,12,12,12,12,12,24,12,24,12,12, - 12,36, 0,12,12,12,12,48,12,36,12,12, - 12,60,12, 0, 0,12,12,72,12,72,12,12, - 12,60,12, 0,12,12,12,72,12,72, 0,12, - 12,12,12,12,12, 0, 0,12,12,12,12,12, - 12,12,12,12,12,12,12,12,12,12,12, 0 -}; - -int -utf8·decodeprev(byte *s, rune *r) -{ - int n; - rune v; - uint8 b, t, d, x=ACCEPT; - - v=0, n=0, d=0; -nextbyte: - b = ((uint8 *)s)[-n++]; - t = decode[b]; - x = decode[256+x+t]; - - if(x > REJECT && n < UTFmax){ - v = v | ((b & TMask) << d); - d += 6; - goto nextbyte; - } - - if(x != ACCEPT) - *r = RuneErr; - else{ - v |= (((0xFFu >> t) & b) << d); - *r = v; - } - - return n; -} diff --git a/sys/libunicode/encode.c b/sys/libunicode/encode.c deleted file mode 100644 index fa7c93e..0000000 --- a/sys/libunicode/encode.c +++ /dev/null @@ -1,69 +0,0 @@ -#include "internal.h" - -int -utf8·encode(rune *r, byte *s) -{ - rune c; - - c = *r; - if(c < Rune1Byte){ // 7 bits - s[0] = (uint8)c; - return 1; - } - - if(c < Rune2Byte){ // 11 bits - s[0] = TByte1 | (c >> 6); - s[1] = Tx | (c & TMask); - return 2; - } - - if(c < Rune3Byte){ // 16 bits - s[0] = TByte2 | ((c >> 12)); - s[1] = Tx | ((c >> 6) & TMask); - s[2] = Tx | ((c) & TMask); - return 3; - } - - // 22 bits - if(c > RuneMax || 
(RuneSurrogateMin <= c && c <= RuneSurrogateMax)) - c = RuneErr; - - s[0] = TByte3 | ((c >> 18)); - s[1] = Tx | ((c >> 12) & TMask); - s[2] = Tx | ((c >> 6) & TMask); - s[3] = Tx | ((c) & TMask); - - return 4; -} - -#if 0 -int -utf8·encode(rune* r, byte* s) -{ - int i, j; - rune c; - - c = *r; - if(c <= Rune1) { - s[0] = c; - return 1; - } - - for(i = 2; i < UTFmax + 1; i++){ - if(i == 3){ - if(c > RuneMax) - c = RuneErr; - if(SurrogateMin <= c && c <= SurrogateMax) - c = RuneErr; - } - if(c <= RuneX(i) || i == UTFmax) { - s[0] = Tbyte(i) | (c >> (i - 1)*Bitx); - for(j = 1; j < i; j++) - s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); - return i; - } - } - - return UTFmax; -} -#endif diff --git a/sys/libunicode/find.c b/sys/libunicode/find.c deleted file mode 100644 index d75feb8..0000000 --- a/sys/libunicode/find.c +++ /dev/null @@ -1,31 +0,0 @@ -#include "internal.h" - -byte* -utf8·find(byte* s, rune c) -{ - long c1; - rune r; - int n; - - if(c < Tx) - return strchr(s, c); - - for(;;){ - c1 = *(ubyte*)s; - if(c1 < Tx){ - if(c1 == 0) return nil; - if(c1 == c) return s; - s++; - continue; - } - - n = utf8·decode(s, &r); - - if(r == c) - return s; - - s += n; - } - - return nil; -} diff --git a/sys/libunicode/findlast.c b/sys/libunicode/findlast.c deleted file mode 100644 index ab25ab2..0000000 --- a/sys/libunicode/findlast.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "internal.h" - -byte* -utf8·findlast(byte* s, rune c) -{ - long c1; - rune r; - byte *l; - - if(c < Tx) - return strrchr(s, c); - - l = nil; - for(;;){ - c1 = *(ubyte*)s; - if(c1 < Tx){ - if(c1 == 0) return l; - if(c1 == c) l = s; - s++; - continue; - } - - c1 = utf8·decode(s, &r); - - if(r == c) - l = s; - - s += c1; - } - - return nil; -} diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h deleted file mode 100644 index df69310..0000000 --- a/sys/libunicode/internal.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include -#include -#include - -/* - * NOTE: we use the preprocessor to 
ensure we have unsigned constants. - * UTF-8 code: - * 1 byte: - * 0xxxxxxx - * 2 byte: - * 110xxxxx 10xxxxxx - * 3 byte: - * 1110xxxx 10xxxxxx 10xxxxxx - * 4 byte: - * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - */ - -#define Tx 0x80u // 0b10000000 transfer header -#define TMask 0x3Fu // 0b00111111 transfer mask - -#define TByte1 0xC0u // 0b11000000 -#define TByte2 0xE0u // 0b11100000 -#define TByte3 0xF0u // 0b11110000 -#define TByte4 0xF8u // 0b11111000 - -#define RuneMask 0x1FFFFFu - -#define Rune1Byte 0x000080u // 1 << 8 (1 byte) -#define Rune2Byte 0x001000u // 1 << 12 (2 bytes) -#define Rune3Byte 0x020000u // 1 << 17 (3 bytes) -#define Rune4Byte 0x400000u // 1 << 22 (4 bytes) - - -/* UTF-16 nonsense */ -#define RuneSurrogateMin 0x0D8000 -#define RuneSurrogateMax 0x0D8FFF diff --git a/sys/libunicode/len.c b/sys/libunicode/len.c deleted file mode 100644 index 8fbd679..0000000 --- a/sys/libunicode/len.c +++ /dev/null @@ -1,21 +0,0 @@ -#include "internal.h" - -int -utf8·len(char *s) -{ - int c; - long n; - rune r; - - n = 0; - for(;;){ - c = *(uchar*)s; - if(c < Tx){ - if(c == 0) - return n; - s++; - }else - s += utf8·decode(s, &r); - n++; - } -} diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk deleted file mode 100644 index 1d714fe..0000000 --- a/sys/libunicode/rules.mk +++ /dev/null @@ -1,76 +0,0 @@ -include share/push.mk - -UNICODE = 14.0.0 - -SRCS_$(d) := \ - $(d)/encode.c \ - $(d)/decode.c \ - $(d)/decodeprev.c \ - $(d)/find.c \ - $(d)/findlast.c \ - $(d)/canfit.c \ - $(d)/runelen.c \ - $(d)/len.c \ - $(d)/runetype-$(UNICODE).c \ - $(d)/runewidth-$(UNICODE).c - -LIBS_$(d) := $(d)/libunicode.a - -include share/paths.mk - -# ======================================================================== -# table generation - -$(d)/vendor/common.o: $(d)/vendor/common.c - $(COMPILE) - -# rune categories -$(d)/vendor/UnicodeData-$(UNICODE).txt: - @echo "GET UnicodeData.txt";\ - curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@ - 
-$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a - $(COMPLINK) - -GENS += $(d)/vendor/mkrunetype - -$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype - @$(dir $@)vendor/mkrunetype $< > $@ - -# rune widths -$(d)/vendor/EastAsianWidth-$(UNICODE).txt: - @echo "GET EastAsianWidth.txt";\ - curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@ - -$(d)/vendor/EmojiData-$(UNICODE).txt: - @echo "GET EmojiData.txt";\ - curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@ - -$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a - $(COMPLINK) - -GENS += $(d)/vendor/mkrunewidth - -$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt - @$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@ - -# grapheme boundaries -$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt: - @echo "GET GraphemeBreakProperty.txt";\ - curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@ - -$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a - $(COMPLINK) - -$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt - $^ > $@ - -GENS += $(d)/vendor/mkgraphemedata - -# ======================================================================== -# normal operations - -$(LIBS_$(d)): $(OBJS_$(d)) - $(ARCHIVE) - -include share/pop.mk diff --git a/sys/libunicode/runelen.c b/sys/libunicode/runelen.c deleted file mode 100644 index dac7f15..0000000 --- a/sys/libunicode/runelen.c +++ /dev/null @@ -1,8 +0,0 @@ -#include "internal.h" - -int -utf8·runelen(rune r) -{ - byte s[10]; - return utf8·encode(&r, s); -} diff --git a/sys/libunicode/vendor/common.c b/sys/libunicode/vendor/common.c 
deleted file mode 100644 index 6b5d1b3..0000000 --- a/sys/libunicode/vendor/common.c +++ /dev/null @@ -1,220 +0,0 @@ -#include "common.h" - -// ----------------------------------------------------------------------- -// input functions - -int -parse(io·Stream *io, int nfield, char **field, int len, char *line) -{ - int n; - if((n=io·readln(io, len, line)) <= 0) - return ParseEOF; - - if(n == len) - panicf("line too long"); - - if(line[n-1] != '\n') - panicf("invalid line: expected '\n', found '%c'", line[n]); - - line[n-1] = 0; - - if(line[0] == '#' || line[0] == 0) - return ParseSkip; - - /* tokenize line into fields */ - n = 0; - field[n] = line; - while(*line){ - if(*line == ';'){ - *line = 0; - field[++n] = line+1; - } - line++; - } - - if(n != nfield-1) - panicf("expected %d number of fields, got %d: %s", nfield, n, line); - - return ParseOK; -} - -int -codepoint(char *s) -{ - int c, b; - - c = 0; - while((b=*s++)){ - c <<= 4; - if(b >= '0' && b <= '9') - c += b - '0'; - else if(b >= 'A' && b <= 'F') - c += b - 'A' + 10; - else - panicf("bad codepoint char '%c'", b); - } - - return c; -} - -void -codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop) -{ - int e, c; - char *other[NumFields], line[1024]; - - // XXX: the stop variable passes in the previous stopping character - e = *stop; - c = codepoint(field[Fcode]); - - if(c >= NumRunes) - panicf("unexpected large codepoint %x", c); - if(c <= e) - panicf("bad code sequence: %x then %x", e, c); - e = c; - - if(strstr(field[Fname], ", First>") != nil){ - if(!parse(utf8, arrlen(other), other, arrlen(line), line)) - panicf("range start at end of file"); - if(strstr(other[Fname], ", Last>") == nil) - panicf("range start not followed by range end"); - - e = codepoint(other[Fcode]); - - if(e <= c) - panicf("bad code sequence: %x then %x", c, e); - if(strcmp(field[Fcategory], other[Fcategory]) != 0) - panicf("range with mismatched category"); - } - - *start = c; - *stop = e; -} - -// 
----------------------------------------------------------------------- -// output functions - -void -putsearch(void) -{ - puts( - "#include \n" - "#include \n" - "\n" - "static\n" - "rune*\n" - "rangesearch(rune c, rune *t, int n, int ne)\n" - "{\n" - " rune *p;\n" - " int m;\n" - " while(n > 1) {\n" - " m = n >> 1;\n" - " p = t + m*ne;\n" - " if(c >= p[0]){\n" - " t = p;\n" - " n = n-m;\n" - " }else\n" - " n = m;\n" - " }\n" - " if(n && c >= t[0])\n" - " return t;\n" - " return 0;\n" - "}\n" - ); - -} - -int -putrange(char *ident, char *prop, int force) -{ - int l, r, start; - - start = 0; - for(l = 0; l < NumRunes;) { - if(!prop[l]){ - l++; - continue; - } - - for(r = l+1; r < NumRunes; r++){ - if(!prop[r]) - break; - prop[r] = 0; - } - - if(force || r > l + 1){ - if(!start){ - printf("static rune %s[] = {\n", ident); - start = 1; - } - prop[l] = 0; - printf("\t0x%.4x, 0x%.4x,\n", l, r-1); - } - - l = r; - } - - if(start) - printf("};\n\n"); - - return start; -} - -int -putpair(char *ident, char *prop) -{ - int l, r, start; - - start = 0; - for(l=0; l+2 < NumRunes; ){ - if(!prop[l]){ - l++; - continue; - } - - for(r = l + 2; r < NumRunes; r += 2){ - if(!prop[r]) - break; - prop[r] = 0; - } - - if(r != l + 2){ - if(!start){ - printf("static rune %s[] = {\n", ident); - start = 1; - } - prop[l] = 0; - printf("\t0x%.4x, 0x%.4x,\n", l, r - 2); - } - - l = r; - } - - if(start) - printf("};\n\n"); - return start; -} - -int -putsingle(char *ident, char *prop) -{ - int i, start; - - start = 0; - for(i = 0; i < NumRunes; i++) { - if(!prop[i]) - continue; - - if(!start){ - printf("static rune %s[] = {\n", ident); - start = 1; - } - prop[i] = 0; - printf("\t0x%.4x,\n", i); - } - - if(start) - printf("};\n\n"); - - return start; -} diff --git a/sys/libunicode/vendor/common.h b/sys/libunicode/vendor/common.h deleted file mode 100644 index 17cb113..0000000 --- a/sys/libunicode/vendor/common.h +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include -#include -#include - -enum 
-{ - // Fields inside UnicodeData.txt - Fcode, - Fname, - Fcategory, - Fcombine, - Fbidir, - Fdecomp, - Fdecimal, - Fdigit, - Fnumeric, - Fmirror, - Foldname, - Fcomment, - Fupper, - Flower, - Ftitle, - - NumFields, - NumRunes = 1 << 21, -}; - -/* input functions */ -enum -{ - ParseEOF, - ParseOK, - ParseSkip, -}; - -int parse(io·Stream *io, int nfield, char **field, int len, char *line); -int codepoint(char *s); -void codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop); - -/* output functions */ -void putsearch(void); -int putrange(char *ident, char *prop, int force); -int putpair(char *ident, char *prop); -int putsingle(char *ident, char *prop); diff --git a/sys/libunicode/vendor/mkgraphemedata.c b/sys/libunicode/vendor/mkgraphemedata.c deleted file mode 100644 index f79d851..0000000 --- a/sys/libunicode/vendor/mkgraphemedata.c +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include -#include - -// ----------------------------------------------------------------------- -// main point of entry - -static -void -usage(void) -{ - fprintf(stderr, "usage: mkgraphemedata \n"); - exit(1); -} - -int -main(int argc, char *argv[]) -{ - io·Stream *utf8; - char line[1024]; - - ARGBEGIN{ - }ARGEND; -} diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c deleted file mode 100644 index 9f939f4..0000000 --- a/sys/libunicode/vendor/mkrunetype.c +++ /dev/null @@ -1,388 +0,0 @@ -#include "common.h" - -// ----------------------------------------------------------------------- -// globals - -#define OFFSET (1 << 20) -#define DELTA(mapx, x) ((1 << 20) + (mapx) - (x)) - -// TODO: use bitarrays. 
will reduce executable size 8x -struct Table -{ - /* properties */ - char isspace[NumRunes]; - char isalpha[NumRunes]; - char ismark[NumRunes]; - char isdigit[NumRunes]; - char isupper[NumRunes]; - char islower[NumRunes]; - char istitle[NumRunes]; - char ispunct[NumRunes]; - char issymbl[NumRunes]; - char iscntrl[NumRunes]; - - char combine[NumRunes]; - - /* transformations */ - int toupper[NumRunes]; - int tolower[NumRunes]; - int totitle[NumRunes]; -}; - -static struct Table table; - -// ----------------------------------------------------------------------- -// internal functions - -static -int -isrange(char *label, char *prop, int force) -{ - char ident[128]; - if(snprintf(ident, arrlen(ident), "is%s_range", label) == arrlen(ident)) - panicf("out of identifier space\n"); - - return putrange(ident, prop, force); -} - -static -int -ispair(char *label, char *prop) -{ - char ident[128]; - if(snprintf(ident, arrlen(ident), "is%s_pair", label) == arrlen(ident)) - panicf("out of identifier space\n"); - - return putpair(ident, prop); -} - -static -int -issingle(char *label, char *prop) -{ - char ident[128]; - if(snprintf(ident, arrlen(ident), "is%s_single", label) == arrlen(ident)) - panicf("out of identifier space\n"); - - return putsingle(ident, prop); -} - -static -void -makeis(char *label, char *table, int pairs, int onlyranges) -{ - int hasr, hasp=0, hass=0; - - hasr = isrange(label, table, onlyranges); - if(!onlyranges && pairs) - hasp = ispair(label, table); - if(!onlyranges) - hass = issingle(label, table); - - printf( - "int\n" - "utf8·is%s(rune c)\n" - "{\n" - " rune *p;\n" - "\n", - label); - - if(hasr){ - printf( - " p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n" - " if(p && c >= p[0] && c <= p[1])\n" - " return 1;\n", - label, label); - } - - if(hasp){ - printf( - " p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n" - " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" - " return 1;\n", - label, label); - } - - if(hass) - printf( 
- " p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n" - " if(p && c == p[0])\n" - " return 1;\n", - label, label); - - printf( - " return 0;\n" - "}\n" - "\n"); -} - -static -int -torange(char *label, int *index, int force) -{ - int l, r, d, start = 0; - - for(l = 0; l < NumRunes; ){ - if(index[l] == l){ - l++; - continue; - } - - d = DELTA(index[l], l); - if(d != (rune)d) - panicf("bad map delta %d", d); - - for(r = l+1; r < NumRunes; r++){ - if(DELTA(index[r], r) != d) - break; - index[r] = r; - } - - if(force || r != l + 1){ - if(!start){ - printf("static rune to%s_range[] = {\n", label); - start = 1; - } - index[l] = l; - printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-1, d); - } - l = r; - } - if(start) - printf("};\n\n"); - - return start; -} - -static -int -topair(char *label, int *index) -{ - int l, r, d, start = 0; - - for(l = 0; l + 2 < NumRunes; ){ - if(index[l] == l){ - l++; - continue; - } - - d = DELTA(index[l], l); - if(d != (rune)d) - panicf("bad delta %d", d); - - for(r = l+2; r < NumRunes; r += 2){ - if(DELTA(index[r], r) != d) - break; - index[r] = r; - } - - if(r > l+2){ - if(!start){ - printf("static rune to%s_pair[] = {\n", label); - start = 1; - } - index[l] = l; - printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-2, d); - } - - l = r; - } - if(start) - printf("};\n\n"); - - return start; -} - -static -int -tosingle(char *label, int *index) -{ - int i, d, start = 0; - - for(i=0; i < NumRunes; i++) { - if(index[i] == i) - continue; - - d = DELTA(index[i], i); - if(d != (rune)d) - panicf("bad map delta %d", d); - - if(!start){ - printf("static rune to%s_single[] = {\n", label); - start = 1; - } - index[i] = i; - printf("\t0x%.4x, %d,\n", i, d); - } - if(start) - printf("};\n\n"); - - return start; -} - -static -void -mkto(char *label, int *index, int pairs, int onlyrange) -{ - int hasr, hasp=0, hass=0; - - hasr = torange(label, index, !onlyrange); - if(!onlyrange && pairs) - hasp = topair(label, index); - if(!onlyrange) - hass = tosingle(label, index); - 
- printf( - "rune\n" - "utf8·to%s(rune c)\n" - "{\n" - " rune *p;\n" - "\n", - label); - - if(hasr) - printf( - " p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n" - " if(p && c >= p[0] && c <= p[1])\n" - " return c + p[2] - %d;\n", - label, label, OFFSET); - - if(hasp) - printf( - " p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n" - " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" - " return c + p[2] - %d;\n", - label, label, OFFSET); - - if(hass) - printf( - " p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n" - " if(p && c == p[0])\n" - " return c + p[1] - %d;\n", - label, label, OFFSET); - - - printf( - " return c;\n" - "}\n" - "\n" - ); -} - -// ----------------------------------------------------------------------- -// main point of entry - -static -void -usage(void) -{ - fprintf(stderr, "usage: mkrunetype \n"); - exit(1); -} - -int -main(int argc, char *argv[]) -{ - int i, sc, c, ec; - io·Stream *utf8; - char *prop, *field[NumFields], line[1024]; - - ARGBEGIN{ - }ARGEND; - - if(argc != 1) - usage(); - - if(!(utf8 = io·open(argv[0], "r"))) - panicf("can't open %s\n", argv[0]); - - /* by default each character maps to itself */ - for(i = 0; i < NumRunes; i++) { - table.toupper[i] = i; - table.tolower[i] = i; - table.totitle[i] = i; - } - - /* ensure all C local white space characters pass */ - table.isspace['\t'] = 1; - table.isspace['\n'] = 1; - table.isspace['\r'] = 1; - table.isspace['\f'] = 1; - table.isspace['\v'] = 1; - table.isspace[0x85] = 1; - - ec = -1; - // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any - while(parse(utf8, arrlen(field), field, arrlen(line), line)){ - /* parse unicode range */ - codepointrange(utf8, field, &sc, &ec); - prop = field[Fcategory]; - - for(c = sc; c <= ec; c++){ - /* grab properties */ - switch(prop[0]){ - case 'L': - table.isalpha[c] = 1; - switch(prop[1]){ - case 'u': table.isupper[c] = 1; break; - case 'l': table.islower[c] = 1; break; - case 't': 
table.istitle[c] = 1; break; - case 'm': break; // modifier letters - case 'o': break; // ideograph letters - default: - goto badproperty; - } - break; - - case 'Z': - table.isspace[c] = 1; - break; - - case 'M': - table.ismark[c] = 1; - break; - - case 'N': - table.isdigit[c] = 1; - break; - - case 'P': - table.ispunct[c] = 1; - break; - - case 'S': - table.issymbl[c] = 1; - break; - - case 'C': - table.iscntrl[c] = 1; - break; - - default: badproperty: - panicf("unrecognized category '%s'", prop); - } - /* grab transformations */ - if(*field[Fupper]) - table.toupper[c] = codepoint(field[Fupper]); - if(*field[Flower]) - table.tolower[c] = codepoint(field[Flower]); - if(*field[Ftitle]) - table.totitle[c] = codepoint(field[Ftitle]); - } - } - io·close(utf8); - - putsearch(); - - makeis("space", table.isspace, 0, 1); - makeis("digit", table.isdigit, 0, 1); - makeis("alpha", table.isalpha, 0, 0); - makeis("upper", table.isupper, 1, 0); - makeis("lower", table.islower, 1, 0); - makeis("title", table.istitle, 1, 0); - makeis("punct", table.ispunct, 1, 0); - - mkto("upper", table.toupper, 1, 0); - mkto("lower", table.tolower, 1, 0); - mkto("title", table.totitle, 1, 0); -} diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c deleted file mode 100644 index 14e6973..0000000 --- a/sys/libunicode/vendor/mkrunewidth.c +++ /dev/null @@ -1,325 +0,0 @@ -#include "common.h" - -/* - * inspired by design choices in utf8proc/charwidths.jl - * all widths default to 1 unless they fall within the categories: - * 1. Mn 2. Mc 3. Me 4. Zl - * 5. Zp 6. Cc 7. Cf 8. Cs - * these default to zero width - */ -enum -{ - /* width ? */ - WidthNeutral, /* (N) practially treated like narrow but unclear ... */ - WidthAmbiguous, /* (A) sometimes wide and sometimes not... 
*/ - /* width 1 */ - WidthHalf, /* (H) = to narrow (compatability equivalent) */ - WidthNarrow, /* (Na) ASCII width */ - /* width 2 */ - WidthWide, /* (W) 2x width */ - WidthFull, /* (F) = to wide (compatability equivalent) */ -}; - -struct Table -{ - char width[3][NumRunes]; -}; - -static struct Table table; - -// ----------------------------------------------------------------------- -// internal functions - -static -void -parse_category(char *path) -{ - int sc, c, ec, w; - io·Stream *utf8; - char *prop, *field[NumFields], line[1024]; - - if(!(utf8 = io·open(path, "r"))) - panicf("can't open %s\n", path); - - // NOTE: we don't check for comments here - ec = -1; - while(parse(utf8, arrlen(field), field, arrlen(line), line)){ - codepointrange(utf8, field, &sc, &ec); - - prop = field[Fcategory]; - - switch(prop[0]){ - case 'M': - switch(prop[1]){ - case 'n': case 'c': case 'e': - w = 0; - break; - default: - w = 1; - break; - } - break; - case 'Z': - switch(prop[1]){ - case 'l': case 'p': - w = 0; - break; - default: - w = 1; - break; - } - break; - case 'C': - switch(prop[1]){ - case 'c': case 'f': case 's': - w = 0; - break; - default: - w = 1; - break; - } - default: - w = 1; - } - - for(c = sc; c <= ec; c++) - table.width[w][c] = 1; - } - - io·close(utf8); -} - -static -void -coderange(char *field, int *l, int *r) -{ - char *s; - - if(!(s = strstr(field, ".."))) - *l=*r=codepoint(field); - else{ - *s++ = 0, *s++ = 0; - *l=codepoint(field); - *r=codepoint(s); - } -} - -static -void -parse_eawidths(char *path) -{ - int at, w; - int l, c, r; - io·Stream *utf8; - char *field[2], line[1024]; - - utf8 = io·open(path, "r"); - while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ - if(at == ParseSkip) - continue; - - switch(field[1][0]){ - case 'A': continue; - case 'N': - if(field[1][1] != 'a') - continue; - /* fallthrough */ - case 'H': w = 1; break; - - case 'W': /* fallthrough */ - case 'F': w = 2; break; - - default: - panicf("malformed 
east asian width class: %s\n", field[1]); - } - - coderange(field[0], &l, &r); - - for(c=l; c <= r; c++){ - /* ensure it only exists in one table */ - table.width[w][c] = 1; - table.width[(w+1)%3][c] = 0; - table.width[(w+2)%3][c] = 0; - } - } - io·close(utf8); -} - -static -void -parse_emoji(char *path) -{ - int at, w; - int l, c, r; - io·Stream *utf8; - char *s, *field[2], line[1024]; - - utf8 = io·open(path, "r"); - while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ - if(at == ParseSkip) - continue; - - /* only override emoji presentation */ - if(!strstr(field[1], "Emoji_Presentation")) - continue; - - /* trim trailing space */ - for(s=field[0]; *s; s++){ - if(*s == ' ') - *s = 0; - } - - coderange(field[0], &l, &r); - - for(c=l; c <= r; c++){ - table.width[0][c] = 0; - table.width[1][c] = 0; - table.width[2][c] = 1; - } - } - - io·close(utf8); -} - -/* output functions */ -static -void -maketable(char *label, char *table, int pairs, int onlyranges) -{ - int r, p=0, s=0; - char ident[3][128]; - - enum - { - Irange, - Ipair, - Isingle, - }; - - /* ranges */ - if(snprintf(ident[Irange], arrlen(ident[Irange]), "%s_range", label) == arrlen(ident[Irange])) - panicf("out of identifier space\n"); - r = putrange(ident[Irange], table, onlyranges); - - if(!onlyranges && pairs){ - if(snprintf(ident[Ipair], arrlen(ident[Ipair]), "%s_pair", label) == arrlen(ident[Ipair])) - panicf("out of identifier space\n"); - p = putpair(ident[Ipair], table); - } - if(!onlyranges){ - if(snprintf(ident[Isingle], arrlen(ident[Isingle]), "%s_single", label) == arrlen(ident[Isingle])) - panicf("out of identifier space\n"); - - s = putsingle(ident[Isingle], table); - } - - printf( - "static int\n" - "is%s(rune c)\n" - "{\n" - " rune *p;\n" - "\n", - label); - - if(r){ - printf( - " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n" - " if(p && c >= p[0] && c <= p[1])\n" - " return 1;\n", - ident[Irange], ident[Irange]); - } - - if(p){ - printf( - " p = rangesearch(c, %s, 
arrlen(%s)/2, 2);\n" - " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" - " return 1;\n", - ident[Ipair], ident[Ipair]); - } - - if(s) - printf( - " p = rangesearch(c, %s, arrlen(%s), 1);\n" - " if(p && c == p[0])\n" - " return 1;\n", - ident[Isingle], ident[Isingle]); - - printf( - " return 0;\n" - "}\n" - "\n"); -} - -// ----------------------------------------------------------------------- -// main point of entry - -static -void -usage(void) -{ - fprintf(stderr, "usage: mkrunewidth \n"); - exit(1); -} - -#define SETW0(c) \ - table.width[0][(c)] = 1, \ - table.width[1][(c)] = 0, \ - table.width[2][(c)] = 0; - -#define SETW1(c) \ - table.width[0][(c)] = 0, \ - table.width[1][(c)] = 1, \ - table.width[2][(c)] = 0; - -#define SETW2(c) \ - table.width[0][(c)] = 0, \ - table.width[1][(c)] = 0, \ - table.width[2][(c)] = 1; - - -int -main(int argc, char *argv[]) -{ - int c; - - ARGBEGIN{ - }ARGEND; - - if(argc != 3) - usage(); - - parse_category(*argv++); - parse_eawidths(*argv++); - parse_emoji(*argv); - - /* overrides */ - SETW0(0x2028); - SETW0(0x2029); - - SETW1(0x00AD); - - /* simple checking */ - for(c=0; c 1) - panicf("improper table state"); - } - - putsearch(); - - maketable("width0", table.width[0], 1, 0); - maketable("width1", table.width[1], 1, 0); - maketable("width2", table.width[2], 1, 0); - - puts( - "\n" - "int\n" - "utf8·runewidth(rune c)\n" - "{\n" - " if(iswidth1(c))\n" - " return 1;\n" - " if(iswidth2(c))\n" - " return 2;\n" - " return 0;\n" - "}" - ); -}
diff --git a/sys/libutf/canfit.c b/sys/libutf/canfit.c
new file mode 100644
index 0000000..4579ab3
--- /dev/null
+++ b/sys/libutf/canfit.c
@@ -0,0 +1,28 @@
+#include "internal.h"
+
+/*
+ * returns 1 if the n bytes at s are enough to hold the whole encoded rune
+ * that starts at s[0], judging by the lead byte alone: bytes below TByte1
+ * need 1 byte, lead bytes below TByte2 need 2, below TByte3 need 3 and
+ * anything else needs UTFmax. continuation bytes are not validated.
+ * returns 0 when n <= 0.
+ */
+int
+utf8·canfit(byte* s, int n)
+{
+    rune c;
+
+    if(n <= 0)
+        return 0;
+
+    c = *(ubyte*)s;
+    if(c < TByte1)
+        return 1;
+
+    if(c < TByte2)
+        return n >= 2;
+    if(c < TByte3)
+        return n >= 3;
+
+    return n >= UTFmax;
+}
diff --git 
a/sys/libutf/decode.c b/sys/libutf/decode.c
new file mode 100644
index 0000000..01797f1
--- /dev/null
+++ b/sys/libutf/decode.c
@@ -0,0 +1,106 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static const uint8 decode[] = {
+    /*
+     * the first part of the table maps bytes to character classes, which
+     * reduces the size of the transition table and creates bitmasks
+     */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+    /*
+     * the second part is a transition table that maps a combination
+     * of a state of the automaton and a character class to a state
+     */
+    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+    12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+/*
+ * decodes the utf-8 sequence that starts at s into *r with the table
+ * driven state machine above. the first byte is always read; following
+ * bytes are read only while the machine is in an intermediate state,
+ * and never more than UTFmax in total.
+ * returns the number of bytes consumed. on malformed or truncated input
+ * *r is set to RuneErr and exactly 1 byte is consumed.
+ */
+int
+utf8·decode(char *s, rune *r)
+{
+    int n;
+    rune v;
+    uint8 b, t, x=ACCEPT;
+
+    b = ((uint8 *)s)[0];
+    t = decode[b];
+    v = (0xFF >> t) & b;
+    x = decode[256+x+t];
+
+    for(n=1; x > REJECT && n < UTFmax; n++){
+        b = ((uint8 *)s)[n];
+        t = decode[b];
+        v = (v << 6) | (b & TMask);
+        x = decode[256+x+t];
+    }
+
+    if(x != ACCEPT){
+        *r = RuneErr;
+        return 1;
+    }
+
+    *r = v;
+    return n;
+}
+
+#if 0
+int
+utf8·decode(byte *s, rune *r)
+{
+    int c[UTFmax], i;
+    rune l;
+
+    c[0] = *(ubyte*)(s);
+    if(c[0] < Tx){
+        *r = c[0];
+        return 1;
+    }
+
+    l = c[0];
+    for(i = 1; i < UTFmax; i++){
+        c[i] = *(ubyte*)(s+i);
+        c[i] ^= Tx;
+        if(c[i] & Testx) goto bad;
+
+        l = (l << Bitx) | c[i];
+        if(c[0] < Tbyte(i + 2)){
+            l &= RuneX(i + 1);
+            if(i == 1){
+                if(c[0] < Tbyte(2) || l <= Rune1)
+                    goto bad;
+            }else if(l <= RuneX(i) || l > RuneMax)
+                goto bad;
+
+            if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+                goto bad;
+
+            *r = l;
+            return i + 1;
+        }
+    }
+bad:
+    *r = RuneErr;
+    return 1;
+}
+#endif
diff --git a/sys/libutf/decodeprev.c b/sys/libutf/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/sys/libutf/decodeprev.c
@@ -0,0 +1,67 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static const uint8 decode[] = {
+    /*
+     * the first part of the table maps bytes to character classes, which
+     * reduces the size of the transition table and creates bitmasks.
+     */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+    /*
+     * The second part is a transition table that maps a combination
+     * of a state of the automaton and a character class to a state.
+     */
+    // 0  1  2  3  4  5  6  7  8  9 10 11
+    0,24,12,12,12,12,12,24,12,24,12,12,
+    0,24,12,12,12,12,12,24,12,24,12,12,
+    12,36, 0,12,12,12,12,48,12,36,12,12,
+    12,60,12, 0, 0,12,12,72,12,72,12,12,
+    12,60,12, 0,12,12,12,72,12,72, 0,12,
+    12,12,12,12,12, 0, 0,12,12,12,12,12,
+    12,12,12,12,12,12,12,12,12,12,12, 0
+};
+
+/*
+ * decodes the rune that ends at s[0] by scanning backwards (s[0], s[-1],
+ * ...) for at most UTFmax bytes with a reversed state machine.
+ * stores the rune in *r, or RuneErr if the bytes scanned do not form a
+ * valid sequence, and returns the number of bytes scanned.
+ * no bounds are checked: the bytes before s must be readable.
+ */
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+    int n;
+    rune v;
+    uint8 b, t, d, x=ACCEPT;
+
+    v=0, n=0, d=0;
+nextbyte:
+    b = ((uint8 *)s)[-n++];
+    t = decode[b];
+    x = decode[256+x+t];
+
+    if(x > REJECT && n < UTFmax){
+        v = v | ((b & TMask) << d);
+        d += 6;
+        goto nextbyte;
+    }
+
+    if(x != ACCEPT)
+        *r = RuneErr;
+    else{
+        v |= (((0xFFu >> t) & b) << d);
+        *r = v;
+    }
+
+    return n;
+}
diff --git a/sys/libutf/encode.c b/sys/libutf/encode.c
new file mode 100644
index 0000000..fa7c93e
--- /dev/null
+++ b/sys/libutf/encode.c
@@ -0,0 +1,88 @@
+#include "internal.h"
+
+/*
+ * exclusive upper bounds of the runes that fit in 1, 2 and 3 bytes.
+ * a sequence of n bytes carries 7, 11, 16 or 21 payload bits.
+ */
+#define Limit1 0x000080u
+#define Limit2 0x000800u
+#define Limit3 0x010000u
+
+/* utf-16 surrogates are not valid runes */
+#define SurrogateLo 0x00D800u
+#define SurrogateHi 0x00DFFFu
+
+/*
+ * encodes *r as utf-8 into s and returns the number of bytes written,
+ * between 1 and 4. s must have room for 4 bytes.
+ * runes above RuneMax and surrogates cannot be encoded: RuneErr is
+ * written in their place.
+ */
+int
+utf8·encode(rune *r, byte *s)
+{
+    rune c;
+
+    c = *r;
+    if(c < Limit1){ // 7 bits
+        s[0] = (uint8)c;
+        return 1;
+    }
+
+    if(c < Limit2){ // 11 bits
+        s[0] = TByte1 | (c >> 6);
+        s[1] = Tx | (c & TMask);
+        return 2;
+    }
+
+    /* substitute before picking a length so RuneErr gets its own length */
+    if(c > RuneMax || (SurrogateLo <= c && c <= SurrogateHi))
+        c = RuneErr;
+
+    if(c < Limit3){ // 16 bits
+        s[0] = TByte2 | ((c >> 12));
+        s[1] = Tx | ((c >> 6) & TMask);
+        s[2] = Tx | ((c) & TMask);
+        return 3;
+    }
+
+    // 21 bits
+    s[0] = TByte3 | ((c >> 18));
+    s[1] = Tx | ((c >> 12) & TMask);
+    s[2] = Tx | ((c >> 6) & TMask);
+    s[3] = Tx | ((c) & TMask);
+
+    return 4;
+}
+
+#if 0
+int
+utf8·encode(rune* r, byte* s)
+{
+    int i, j;
+    rune c;
+
+    c = *r;
+    if(c <= Rune1) {
+        s[0] = c;
+        return 1;
+    }
+
+    for(i = 2; i < UTFmax + 1; i++){
+        if(i == 3){
+            if(c > RuneMax)
+                c = RuneErr;
+            if(SurrogateMin <= c && c <= SurrogateMax)
+                c = RuneErr;
+        }
+        if(c <= RuneX(i) || i == UTFmax) {
+            s[0] = Tbyte(i) | (c >> (i - 1)*Bitx);
+            for(j = 1; j < i; j++)
+                s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+            return i;
+        }
+    }
+
+    return UTFmax;
+}
+#endif
diff --git 
a/sys/libutf/find.c b/sys/libutf/find.c
new file mode 100644
index 0000000..d75feb8
--- /dev/null
+++ b/sys/libutf/find.c
@@ -0,0 +1,36 @@
+#include "internal.h"
+
+/*
+ * returns a pointer to the first occurrence of rune c in the
+ * nul-terminated utf-8 string s, or nil if there is none.
+ * runes below Tx are single bytes and are handed to strchr.
+ */
+byte*
+utf8·find(byte* s, rune c)
+{
+    long c1;
+    rune r;
+    int n;
+
+    if(c < Tx)
+        return strchr(s, c);
+
+    for(;;){
+        c1 = *(ubyte*)s;
+        if(c1 < Tx){
+            if(c1 == 0) return nil;
+            if(c1 == c) return s;
+            s++;
+            continue;
+        }
+
+        n = utf8·decode(s, &r);
+
+        if(r == c)
+            return s;
+
+        s += n;
+    }
+
+    return nil;
+}
diff --git a/sys/libutf/findlast.c b/sys/libutf/findlast.c
new file mode 100644
index 0000000..ab25ab2
--- /dev/null
+++ b/sys/libutf/findlast.c
@@ -0,0 +1,37 @@
+#include "internal.h"
+
+/*
+ * returns a pointer to the last occurrence of rune c in the
+ * nul-terminated utf-8 string s, or nil if there is none.
+ * runes below Tx are single bytes and are handed to strrchr.
+ */
+byte*
+utf8·findlast(byte* s, rune c)
+{
+    long c1;
+    rune r;
+    byte *l;
+
+    if(c < Tx)
+        return strrchr(s, c);
+
+    l = nil;
+    for(;;){
+        c1 = *(ubyte*)s;
+        if(c1 < Tx){
+            if(c1 == 0) return l;
+            if(c1 == c) l = s;
+            s++;
+            continue;
+        }
+
+        c1 = utf8·decode(s, &r);
+
+        if(r == c)
+            l = s;
+
+        s += c1;
+    }
+
+    return nil;
+}
diff --git a/sys/libutf/internal.h b/sys/libutf/internal.h
new file mode 100644
index 0000000..9719977
--- /dev/null
+++ b/sys/libutf/internal.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+#include
+#include
+
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
+ * 1 byte:
+ *     0xxxxxxx
+ * 2 byte:
+ *     110xxxxx 10xxxxxx
+ * 3 byte:
+ *     1110xxxx 10xxxxxx 10xxxxxx
+ * 4 byte:
+ *     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+#define Tx 0x80u // 0b10000000 transfer header
+#define TMask 0x3Fu // 0b00111111 transfer mask
+
+#define TByte1 0xC0u // 0b11000000
+#define TByte2 0xE0u // 0b11100000
+#define TByte3 0xF0u // 0b11110000
+#define TByte4 0xF8u // 0b11111000
+
+#define RuneMask 0x1FFFFFu
+
+/* exclusive upper bound of the runes that encode to n bytes */
+#define Rune1Byte 0x000080u // 1 << 7  (1 byte)
+#define Rune2Byte 0x000800u // 1 << 11 (2 bytes)
+#define Rune3Byte 0x010000u // 1 << 16 (3 bytes)
+#define Rune4Byte 0x200000u // 1 << 21 (4 bytes)
+
+
+/* UTF-16 nonsense: U+D800..U+DFFF are surrogates, never valid runes */
+#define RuneSurrogateMin 0x00D800u
+#define RuneSurrogateMax 0x00DFFFu
diff --git a/sys/libutf/len.c b/sys/libutf/len.c
new file mode 100644
index 0000000..8fbd679
--- /dev/null
+++ b/sys/libutf/len.c
@@ -0,0 +1,26 @@
+#include "internal.h"
+
+/*
+ * returns the number of runes in the nul-terminated utf-8 string s.
+ * every malformed byte counts as one rune, as utf8·decode consumes
+ * exactly one byte for it.
+ */
+int
+utf8·len(char *s)
+{
+    int c;
+    long n;
+    rune r;
+
+    n = 0;
+    for(;;){
+        c = *(ubyte*)s;
+        if(c < Tx){
+            if(c == 0)
+                return n;
+            s++;
+        }else
+            s += utf8·decode(s, &r);
+        n++;
+    }
+}
diff --git a/sys/libutf/rules.mk b/sys/libutf/rules.mk
new file mode 100644
index 0000000..53ff8cf
--- /dev/null
+++ b/sys/libutf/rules.mk
@@ -0,0 +1,78 @@
+include share/push.mk
+
+UNICODE = 14.0.0
+
+SRCS_$(d) := \
+	$(d)/encode.c \
+	$(d)/decode.c \
+	$(d)/decodeprev.c \
+	$(d)/find.c \
+	$(d)/findlast.c \
+	$(d)/canfit.c \
+	$(d)/runelen.c \
+	$(d)/len.c \
+	$(d)/runetype-$(UNICODE).c \
+	$(d)/runewidth-$(UNICODE).c
+
+LIBS_$(d) := $(d)/libutf.a
+
+include share/paths.mk
+
+# ========================================================================
+# table generation
+# NOTE: downloads use curl -f -o so that an http error fails the rule
+#       instead of silently producing an empty data file
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+	$(COMPILE)
+
+# rune categories
+$(d)/vendor/UnicodeData-$(UNICODE).txt:
+	@echo "GET UnicodeData.txt";\
+	curl -fsSL -o $@ https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt
+
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunetype
+
+$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
+	@$(dir $@)vendor/mkrunetype $< > $@
+
+# rune widths
+$(d)/vendor/EastAsianWidth-$(UNICODE).txt:
+	@echo "GET EastAsianWidth.txt";\
+	curl -fsSL -o $@ https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt
+
+$(d)/vendor/EmojiData-$(UNICODE).txt:
+	@echo "GET EmojiData.txt";\
+	curl -fsSL -o $@ https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt
+
+$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunewidth
+
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
+	@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
+
+# grapheme boundaries
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
+	@echo "GET GraphemeBreakProperty.txt";\
+	curl -fsSL -o $@ https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+	$^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
+
+# ========================================================================
+# normal operations
+
+$(LIBS_$(d)): $(OBJS_$(d))
+	$(ARCHIVE)
+
+include share/pop.mk
diff --git a/sys/libutf/runelen.c b/sys/libutf/runelen.c
new file mode 100644
index 0000000..dac7f15
--- /dev/null
+++ b/sys/libutf/runelen.c
@@ -0,0 +1,9 @@
+#include "internal.h"
+
+/* returns the number of bytes utf8·encode writes for rune r */
+int
+utf8·runelen(rune r)
+{
+    byte s[10];
+    return utf8·encode(&r, s);
+}
diff --git a/sys/libutf/runetype-14.0.0.c b/sys/libutf/runetype-14.0.0.c
new file mode 100644
index 0000000..6f4469d
--- /dev/null
+++ b/sys/libutf/runetype-14.0.0.c
@@ 
-0,0 +1,135 @@
+#include
+#include
+
+/*
+ * binary search over the sorted table t of n entries, each ne runes
+ * wide, keyed on the first rune of an entry. returns the last entry
+ * whose key is <= c, or 0 if there is none.
+ */
+static
+rune*
+rangesearch(rune c, rune *t, int n, int ne)
+{
+    rune *p;
+    int m;
+    while(n > 1) {
+        m = n >> 1;
+        p = t + m*ne;
+        if(c >= p[0]){
+            t = p;
+            n = n-m;
+        }else
+            n = m;
+    }
+    if(n && c >= t[0])
+        return t;
+    return 0;
+}
+
+/*
+ * inclusive [first, last] pairs: the C white space characters, U+0085
+ * and every rune in a unicode 14.0.0 separator category (Zs, Zl, Zp)
+ */
+static rune isspace_range[] = {
+    0x0009, 0x000d,
+    0x0020, 0x0020,
+    0x0085, 0x0085,
+    0x00a0, 0x00a0,
+    0x1680, 0x1680,
+    0x2000, 0x200a,
+    0x2028, 0x2029,
+    0x202f, 0x202f,
+    0x205f, 0x205f,
+    0x3000, 0x3000,
+};
+
+/* returns 1 if c lies in one of the isspace_range pairs, 0 otherwise */
+int
+utf8·isspace(rune c)
+{
+    rune *p;
+
+    p = rangesearch(c, isspace_range, arrlen(isspace_range)/2, 2);
+    if(p && c >= p[0] && c <= p[1])
+        return 1;
+    return 0;
+}
+
+/*
+ * NOTE(review): the remaining predicates and case maps were emitted
+ * without any table: the predicates return 0 and the maps return their
+ * argument for every rune. presumably the generator ran on an empty
+ * UnicodeData file; regenerate before relying on them.
+ */
+int
+utf8·isdigit(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+int
+utf8·isalpha(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+int
+utf8·isupper(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+int
+utf8·islower(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+int
+utf8·istitle(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+int
+utf8·ispunct(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+rune
+utf8·toupper(rune c)
+{
+    rune *p;
+
+    return c;
+}
+
+rune
+utf8·tolower(rune c)
+{
+    rune *p;
+
+    return c;
+}
+
+rune
+utf8·totitle(rune c)
+{
+    rune *p;
+
+    return c;
+}
+
diff --git a/sys/libutf/runewidth-14.0.0.c b/sys/libutf/runewidth-14.0.0.c
new file mode 100644
index 0000000..113c35e
--- /dev/null
+++ b/sys/libutf/runewidth-14.0.0.c
@@ -0,0 +1,83 @@
+#include
+#include
+
+/*
+ * binary search over the sorted table t of n entries, each ne runes
+ * wide, keyed on the first rune of an entry. returns the last entry
+ * whose key is <= c, or 0 if there is none.
+ */
+static
+rune*
+rangesearch(rune c, rune *t, int n, int ne)
+{
+    rune *p;
+    int m;
+    while(n > 1) {
+        m = n >> 1;
+        p = t + m*ne;
+        if(c >= p[0]){
+            t = p;
+            n = n-m;
+        }else
+            n = m;
+    }
+    if(n && c >= t[0])
+        return t;
+    return 0;
+}
+
+static rune width0_range[] = {
+    0x2028, 0x2029,
+};
+
+static int
+iswidth0(rune c)
+{
+    rune *p;
+
+    p = rangesearch(c, width0_range, arrlen(width0_range)/2, 2);
+    if(p && c >= p[0] && c <= p[1])
+        return 1;
+    return 0;
+}
+
+static rune width1_single[] = {
+    0x00ad,
+};
+
+static int
+iswidth1(rune c)
+{
+    rune *p;
+
+    p = rangesearch(c, width1_single, arrlen(width1_single), 1);
+    if(p && c == p[0])
+        return 1;
+    return 0;
+}
+
+static int
+iswidth2(rune c)
+{
+    rune *p;
+
+    return 0;
+}
+
+
+/*
+ * returns the column width of c: 1 if iswidth1 matches, 2 if iswidth2
+ * matches and 0 otherwise.
+ * NOTE(review): the tables look generated from empty data files (width1
+ * holds only U+00AD and iswidth2 matches nothing), so almost every rune
+ * reports 0; regenerate before relying on this. iswidth0 is never called.
+ */
+int
+utf8·runewidth(rune c)
+{
+    if(iswidth1(c))
+        return 1;
+    if(iswidth2(c))
+        return 2;
+    return 0;
+}
diff --git a/sys/rules.mk b/sys/rules.mk index c8c3dd3..cefa4a9 100644 --- a/sys/rules.mk +++ b/sys/rules.mk @@ -8,7 +8,7 @@ include $(DIR)/rules.mk DIR := $(d)/base include $(DIR)/rules.mk -DIR := $(d)/libunicode +DIR := $(d)/libutf include $(DIR)/rules.mk DIR := $(d)/libmath -- cgit v1.2.1