aboutsummaryrefslogtreecommitdiff
path: root/sys/libunicode/decode.c
blob: 01797f118a0171b783567a82e13c95c6174ef9b6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include "internal.h"

/*
 * distinguished states of the decoding automaton; states are stored as
 * row offsets into the transition table (multiples of 12)
 *   ACCEPT: a complete sequence has been consumed
 *   REJECT: the input is malformed; its row maps every class back to
 *           REJECT, so the state is never left once entered
 */
#define ACCEPT 0
#define REJECT 12

/*
 * lookup table driving the UTF-8 decoding automaton, in two parts:
 * 256 byte -> character class entries followed by the state transitions
 *
 * the table is only ever read, so it is const
 */
static const uint8 decode[] = {
    /*
     * the first part of the table maps each byte to a character class;
     * classes keep the transition table small and double as the shift
     * count that masks the payload bits out of a leading byte
     */
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    /*
     * the second part is a transition table that maps a combination
     * of a state of the automaton and a character class to a state;
     * there are 12 classes and each state is stored as its row offset
     * (a multiple of 12), so the next state is decode[256 + state + class]
     */
     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};

/*
 * decode a single UTF-8 encoded rune from the start of s
 *
 * the bytes are fed through the table-driven automaton: the leading
 * byte selects the initial state and continuation bytes are consumed
 * until the automaton reaches ACCEPT or REJECT, or UTFmax bytes have
 * been read. a NUL byte rejects from every intermediate state, so a
 * truncated sequence stops at the string terminator
 *
 * on success the rune is stored in *r and the number of bytes it
 * occupies is returned; on malformed input RuneErr is stored in *r
 * and 1 is returned
 */
int
utf8·decode(char *s, rune *r)
{
    uint8 *p = (uint8 *)s;
    uint8  cls, state;
    rune   acc;
    int    len;

    /* leading byte: shifting 0xFF by the class strips the length prefix */
    cls   = decode[p[0]];
    acc   = p[0] & (0xFF >> cls);
    state = decode[256 + ACCEPT + cls];
    len   = 1;

    /* continuation bytes: each contributes its TMask bits below the previous ones */
    while(state > REJECT && len < UTFmax){
        cls   = decode[p[len]];
        acc   = (acc << 6) | (p[len] & TMask);
        state = decode[256 + state + cls];
        len++;
    }

    if(state == ACCEPT){
        *r = acc;
        return len;
    }

    /* rejected, or still mid-sequence after UTFmax bytes */
    *r = RuneErr;
    return 1;
}

/*
 * disabled alternative decoder, compiled out by the #if 0 below; it
 * has the same name as the active table-driven decoder, so the two
 * cannot be enabled together
 *
 * decodes one rune from s into *r with explicit bit arithmetic and
 * returns the number of bytes consumed; every failure path jumps to
 * bad, which stores RuneErr and returns 1
 *
 * NOTE(review): Tx, Testx, Bitx, Tbyte, RuneX, Rune1, RuneMax and the
 * Surrogate bounds are defined outside this file - confirm their
 * values before re-enabling
 */
#if 0
int
utf8·decode(byte *s, rune *r)
{
    int c[UTFmax], i;
    rune l;

    /* a first byte below Tx is stored as the rune unchanged */
    c[0] = *(ubyte*)(s);
    if(c[0] < Tx){
        *r = c[0];
        return 1;
    }

    l = c[0];
    for(i = 1; i < UTFmax; i++){
        /*
         * flip the Tx bits of the next byte and reject it if any Testx
         * bit is still set (presumably the continuation-byte check -
         * TODO confirm against the macro definitions)
         */
        c[i]  = *(ubyte*)(s+i);
        c[i] ^= Tx;
        if(c[i] & Testx) goto bad;

        l = (l << Bitx) | c[i];
        /*
         * the first byte decides whether the sequence ends here
         * (presumably Tbyte(k) is the smallest lead byte of a k-byte
         * sequence - TODO confirm)
         */
        if(c[0] < Tbyte(i + 2)){
            l &= RuneX(i + 1);
            /* reject values too small for this length or above RuneMax */
            if(i == 1){
                if(c[0] < Tbyte(2) || l <= Rune1)
                    goto bad;
            }else if(l <= RuneX(i) || l > RuneMax)
                goto bad;

            /* three-byte results inside the surrogate range are rejected */
            if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
                goto bad;

            *r = l;
            return i + 1;
        }
    }
    /* leaving the loop without returning falls through to the error path */
bad:
    *r = RuneErr;
    return 1;
}
#endif