aboutsummaryrefslogtreecommitdiff
path: root/sys/libunicode
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2021-10-26 21:32:55 -0700
committerNicholas Noll <nbnoll@eml.cc>2021-10-26 21:51:49 -0700
commit189b9e23edfe60b7e82c4c7b6071a3f98799653a (patch)
tree77c1f32726446deb298a0d2e6389358bfe16bd32 /sys/libunicode
parent29b56ef4e4113bcd091b19d6926f18814162ca53 (diff)
fix(unicode): renamed functions to be easier to understand their functions
Diffstat (limited to 'sys/libunicode')
-rw-r--r--sys/libunicode/canfit.c20
-rw-r--r--sys/libunicode/decode.c (renamed from sys/libunicode/bytetorune.c)2
-rw-r--r--sys/libunicode/encode.c (renamed from sys/libunicode/runetobyte.c)2
-rw-r--r--sys/libunicode/find.c30
-rw-r--r--sys/libunicode/findlast.c (renamed from sys/libunicode/findlastrune.c)15
-rw-r--r--sys/libunicode/findrune.c26
-rw-r--r--sys/libunicode/fullrune.c18
-rw-r--r--sys/libunicode/internal.h12
-rw-r--r--sys/libunicode/len.c21
-rw-r--r--sys/libunicode/rules.mk14
-rw-r--r--sys/libunicode/runelen.c2
11 files changed, 101 insertions, 61 deletions
diff --git a/sys/libunicode/canfit.c b/sys/libunicode/canfit.c
new file mode 100644
index 0000000..d44c9e6
--- /dev/null
+++ b/sys/libunicode/canfit.c
@@ -0,0 +1,20 @@
+#include "internal.h"
+
+/* returns 1 if the n bytes at s are enough to hold the whole rune that begins at s[0], 0 otherwise */
+int
+utf8·canfit(byte* s, int n)
+{
+ int i;
+ rune c;
+
+ if(n <= 0) return 0;
+ c = *(ubyte*) s;
+ if(c < Tx) return 1;
+
+ for(i = 3; i < UTFmax + 1; i++){
+ if(c < Tbyte(i))
+ return n >= i - 1;
+ }
+
+ return n >= UTFmax;
+}
diff --git a/sys/libunicode/bytetorune.c b/sys/libunicode/decode.c
index fe41e7a..79271f2 100644
--- a/sys/libunicode/bytetorune.c
+++ b/sys/libunicode/decode.c
@@ -1,7 +1,7 @@
#include "internal.h"
int
-utf8·bytetorune(rune* r, byte* s)
+utf8·decode(byte *s, rune* r)
{
int c[UTFmax], i;
rune l;
diff --git a/sys/libunicode/runetobyte.c b/sys/libunicode/encode.c
index 27f252b..8f4d212 100644
--- a/sys/libunicode/runetobyte.c
+++ b/sys/libunicode/encode.c
@@ -1,7 +1,7 @@
#include "internal.h"
int
-utf8·runetobyte(byte* s, rune* r)
+utf8·encode(rune* r, byte* s)
{
int i, j;
rune c;
diff --git a/sys/libunicode/find.c b/sys/libunicode/find.c
new file mode 100644
index 0000000..659ab5b
--- /dev/null
+++ b/sys/libunicode/find.c
@@ -0,0 +1,30 @@
+#include "internal.h"
+
+byte*
+utf8·find(byte* s, rune c)
+{
+ long c1;
+ rune r;
+ int n;
+
+ if(c < RuneSync)
+ return strchr(s, c);
+
+ for(;;){
+ c1 = *(ubyte*)s;
+ if(c1 < RuneSelf) {
+ if(c1 == 0) return nil;
+ if(c1 == c) return s;
+ s++;
+ continue;
+ }
+ n = utf8·decode(s, &r);
+
+ if(r == c)
+ return s;
+
+ s += n;
+ }
+
+ return nil;
+}
diff --git a/sys/libunicode/findlastrune.c b/sys/libunicode/findlast.c
index 0dc5032..3a4ed38 100644
--- a/sys/libunicode/findlastrune.c
+++ b/sys/libunicode/findlast.c
@@ -1,27 +1,28 @@
#include "internal.h"
byte*
-utf8·findlastrune(byte* s, long c)
+utf8·findlast(byte* s, rune c)
{
long c1;
rune r;
byte *l;
- if (c < RuneSync)
+ if(c < RuneSync)
return strrchr(s, c);
l = nil;
for(;;){
c1 = *(ubyte*)s;
- if (c1 < RuneSelf) {
- if (c1 == 0) return l;
- if (c1 == c) l = s;
+ if(c1 < RuneSelf) {
+ if(c1 == 0) return l;
+ if(c1 == c) l = s;
s++;
continue;
}
- c1 = utf8·bytetorune(&r, s);
- if (r == c)
+ c1 = utf8·decode(s, &r);
+ if(r == c)
l = s;
+
s += c1;
}
diff --git a/sys/libunicode/findrune.c b/sys/libunicode/findrune.c
deleted file mode 100644
index 97edc3c..0000000
--- a/sys/libunicode/findrune.c
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "internal.h"
-
-byte*
-utf8·findrune(byte* s, long c)
-{
- long c1;
- rune r;
- int n;
-
- if (c < RuneSync) return strchr(s, c);
-
- for (;;) {
- c1 = *(ubyte*)s;
- if (c1 < RuneSelf) {
- if (c1 == 0) return nil;
- if (c1 == c) return s;
- s++;
- continue;
- }
- n = utf8·bytetorune(&r, s);
- if (r == c) return s;
- s += n;
- }
-
- return nil;
-}
diff --git a/sys/libunicode/fullrune.c b/sys/libunicode/fullrune.c
deleted file mode 100644
index e5cf314..0000000
--- a/sys/libunicode/fullrune.c
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "internal.h"
-
-int
-utf8·fullrune(byte* s, int n)
-{
- int i;
- rune c;
-
- if (n <= 0) return 0;
- c = *(ubyte*) s;
- if (c < Tx) return 1;
-
- for (i = 3; i < UTFmax + 1; i++) {
- if (c < Tbyte(i)) return n >= i - 1;
- }
-
- return n >= UTFmax;
-}
diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h
index bf376b2..31ed2ae 100644
--- a/sys/libunicode/internal.h
+++ b/sys/libunicode/internal.h
@@ -4,8 +4,18 @@
#include <base.h>
#include <libunicode.h>
+/* UTF-8 code
+ * 1 byte:
+ * 0xxxxxxx
+ * 2 byte:
+ * 110xxxxx 10xxxxxx
+ * 3 byte:
+ * 1110xxxx 10xxxxxx 10xxxxxx
+ * 4 byte:
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
#define Bit(i) (7-(i))
-/* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */
+/* i 1's followed by 0's, e.g. Tbyte(2) is 1100 0000 */
#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
/* 0000 0000 0000 0111 1111 1111 */
#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
diff --git a/sys/libunicode/len.c b/sys/libunicode/len.c
new file mode 100644
index 0000000..c461bf3
--- /dev/null
+++ b/sys/libunicode/len.c
@@ -0,0 +1,21 @@
+#include "internal.h"
+
+int
+utf8·len(char *s)
+{
+ int c;
+ long n;
+ rune r;
+
+ n = 0;
+ for(;;){
+ c = *(uchar*)s;
+ if(c < RuneSelf) {
+ if(c == 0)
+ return n;
+ s++;
+ } else
+ s += utf8·decode(s, &r);
+ n++;
+ }
+}
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index fb83819..100697b 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -3,12 +3,13 @@ include share/push.mk
UNICODE = 14.0.0
SRCS_$(d) := \
- $(d)/runetobyte.c \
- $(d)/bytetorune.c \
- $(d)/findrune.c \
- $(d)/findlastrune.c \
- $(d)/fullrune.c \
+ $(d)/encode.c \
+ $(d)/decode.c \
+ $(d)/find.c \
+ $(d)/findlast.c \
+ $(d)/canfit.c \
$(d)/runelen.c \
+ $(d)/len.c \
$(d)/runetype-$(UNICODE).c
LIBS_$(d) := $(d)/libunicode.a
@@ -16,13 +17,14 @@ LIBS_$(d) := $(d)/libunicode.a
include share/paths.mk
$(d)/vendor/UnicodeData-$(UNICODE).txt:
+ @echo "GET UnicodeData.txt";\
curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a
$(COMPLINK)
$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
- $(dir $@)/vendor/mkrunetype $< > $@
+ @$(dir $@)vendor/mkrunetype $< > $@
GENS += $(d)/vendor/mkrunetype
diff --git a/sys/libunicode/runelen.c b/sys/libunicode/runelen.c
index 682400c..dac7f15 100644
--- a/sys/libunicode/runelen.c
+++ b/sys/libunicode/runelen.c
@@ -4,5 +4,5 @@ int
utf8·runelen(rune r)
{
byte s[10];
- return utf8·runetobyte(s, &r);
+ return utf8·encode(&r, s);
}