1 | #ifndef NU_UTF8_INTERNAL_H |
2 | #define NU_UTF8_INTERNAL_H |
3 | |
4 | #include <sys/types.h> |
5 | |
/*
 * Determine the length of a UTF-8 sequence from its first byte.
 *
 * c: first (lead) byte of the sequence.
 *
 * Returns the number of bytes the lead byte announces (1 to 4), or 0 when
 * the byte matches none of the patterns 0xxxxxxx, 110xxxxx, 1110xxxx,
 * 11110xxx (e.g. a 10xxxxxx continuation byte or 11111xxx).
 *
 * Only the bit pattern of the lead byte is inspected; the bytes that
 * follow are not looked at.
 */
static inline
unsigned utf8_char_length(const char c) {
	/* go through unsigned char so that the shifts below are well defined
	 * regardless of the signedness of plain char */
	const unsigned char lead = (unsigned char)c;

	if ((lead >> 7) == 0x00) { /* 0xxxxxxx */
		return 1;
	}
	if ((lead >> 5) == 0x06) { /* 110xxxxx */
		return 2;
	}
	if ((lead >> 4) == 0x0E) { /* 1110xxxx */
		return 3;
	}
	if ((lead >> 3) == 0x1E) { /* 11110xxx */
		return 4;
	}

	return 0; /* not a recognized lead byte */
}
17 | |
/*
 * Decode a two-byte UTF-8 sequence.
 *
 * p:         points at the lead byte; p[0] and p[1] are read.
 * codepoint: receives the decoded value.
 *
 * Layout: 110aaaaa 10bbbbbb -> 00000aaa aabbbbbb
 *
 * The marker bits are masked off, not verified: no validation of either
 * byte is performed here.
 */
static inline
void utf8_2b(const char *p, uint32_t *codepoint) {
	const unsigned char *bytes = (const unsigned char *)p;

	const uint32_t lead = bytes[0] & 0x1Fu; /* 5 payload bits */
	const uint32_t tail = bytes[1] & 0x3Fu; /* 6 payload bits */

	*codepoint = (lead << 6) | tail;
}
32 | |
/*
 * Decode a three-byte UTF-8 sequence.
 *
 * p:         points at the lead byte; p[0], p[1] and p[2] are read.
 * codepoint: receives the decoded value.
 *
 * Layout: 1110aaaa 10bbbbbb 10cccccc -> aaaabbbb bbcccccc
 *
 * The marker bits are masked off, not verified: no validation of the
 * bytes is performed here.
 */
static inline
void utf8_3b(const char *p, uint32_t *codepoint) {
	const unsigned char *bytes = (const unsigned char *)p;

	const uint32_t lead = bytes[0] & 0x0Fu; /* 4 payload bits */
	const uint32_t mid  = bytes[1] & 0x3Fu; /* 6 payload bits */
	const uint32_t tail = bytes[2] & 0x3Fu; /* 6 payload bits */

	*codepoint = (lead << 12) | (mid << 6) | tail;
}
49 | |
/*
 * Decode a four-byte UTF-8 sequence.
 *
 * p:         points at the lead byte; p[0] through p[3] are read.
 * codepoint: receives the decoded value.
 *
 * Layout: 11110aaa 10bbbbbb 10cccccc 10dddddd
 *             -> 000aaabb bbbbcccc ccdddddd
 *
 * The marker bits are masked off, not verified: no validation of the
 * bytes is performed here.
 */
static inline
void utf8_4b(const char *p, uint32_t *codepoint) {
	const unsigned char *bytes = (const unsigned char *)p;

	const uint32_t lead  = bytes[0] & 0x07u; /* 3 payload bits */
	const uint32_t upper = bytes[1] & 0x3Fu; /* 6 payload bits */
	const uint32_t lower = bytes[2] & 0x3Fu; /* 6 payload bits */
	const uint32_t tail  = bytes[3] & 0x3Fu; /* 6 payload bits */

	*codepoint = (lead << 18) | (upper << 12) | (lower << 6) | tail;
}
70 | |
/*
 * Number of bytes used to encode a codepoint in UTF-8.
 *
 * codepoint: value to be encoded.
 *
 * Returns 1 for values below 0x80, 2 below 0x800, 3 below 0x10000 and
 * 4 for everything else. No upper bound is checked, so values that do
 * not fit a four-byte sequence still report 4.
 */
static inline
unsigned utf8_codepoint_length(uint32_t codepoint) {
	unsigned length = 4; /* de facto max length in UTF-8 */

	/* narrow down from the widest encoding to the shortest one */
	if (codepoint < 0x10000) {
		length = 3;
	}
	if (codepoint < 0x0800) {
		length = 2;
	}
	if (codepoint < 0x80) {
		length = 1;
	}

	return length;
}
79 | |
/*
 * Encode a codepoint as a two-byte UTF-8 sequence.
 *
 * codepoint: value to encode.
 * p:         destination; p[0] and p[1] are written.
 *
 * Layout: 00000aaa aabbbbbb -> 110aaaaa 10bbbbbb
 *
 * No range check is performed. The output is a well-formed two-byte
 * sequence only when codepoint < 0x800; higher bits would spill into the
 * 110 marker of the first octet.
 */
static inline
void b2_utf8(uint32_t codepoint, char *p) {
	unsigned char *out = (unsigned char *)p;

	/* bits 6 and up of the low 16 bits go into the lead octet; the store
	 * into unsigned char keeps only the low 8 bits of the expression */
	out[0] = (unsigned char)(0xC0 | ((codepoint & 0xFFFF) >> 6));

	/* the low 6 bits go into the continuation octet */
	out[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
}
95 | |
/*
 * Encode a codepoint as a three-byte UTF-8 sequence.
 *
 * codepoint: value to encode.
 * p:         destination; p[0], p[1] and p[2] are written.
 *
 * Layout: aaaabbbb bbcccccc -> 1110aaaa 10bbbbbb 10cccccc
 *
 * No range check is performed; only the low 16 bits of the codepoint
 * take part in the encoding.
 */
static inline
void b3_utf8(uint32_t codepoint, char *p) {
	unsigned char *out = (unsigned char *)p;

	out[0] = (unsigned char)(0xE0 | ((codepoint >> 12) & 0x0F)); /* bits 12..15 */
	out[1] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));  /* bits 6..11 */
	out[2] = (unsigned char)(0x80 | (codepoint & 0x3F));         /* bits 0..5 */
}
114 | |
/*
 * Encode a codepoint as a four-byte UTF-8 sequence.
 *
 * codepoint: value to encode.
 * p:         destination; p[0] through p[3] are written.
 *
 * Layout: 000aaabb bbbbcccc ccdddddd
 *             -> 11110aaa 10bbbbbb 10cccccc 10dddddd
 *
 *   1st octet: 11110 marker + codepoint bits 18..20
 *   2nd octet: 10 marker    + codepoint bits 12..17
 *   3rd octet: 10 marker    + codepoint bits  6..11
 *   4th octet: 10 marker    + codepoint bits  0..5
 *
 * No range check is performed; only the low 21 bits of the codepoint
 * take part in the encoding.
 */
static inline
void b4_utf8(uint32_t codepoint, char *p) {
	unsigned char *out = (unsigned char *)p;

	out[0] = (unsigned char)(0xF0 | ((codepoint >> 18) & 0x07));

	/* Every continuation octet carries exactly 6 payload bits. Bit 12 of
	 * the codepoint belongs to the 2nd octet; letting it leak into the
	 * 3rd octet would set bit 6 there and produce an invalid 11xxxxxx
	 * continuation byte. */
	out[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
	out[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
	out[3] = (unsigned char)(0x80 | (codepoint & 0x3F));
}
136 | |
/*
 * Structural check of a single UTF-8 sequence at the start of a buffer.
 *
 * p:       points at the first byte of the candidate sequence.
 * max_len: number of readable bytes starting at p.
 *
 * Returns the length of the sequence in bytes (1 to 4) when the lead byte
 * is one of 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx, the whole sequence
 * fits into max_len and every following byte has the form 10xxxxxx.
 * Returns 0 otherwise, including when max_len is 0.
 *
 * Only the bit patterns are checked: overlong encodings, surrogates and
 * values above U+10FFFF are not rejected here.
 */
static inline
int utf8_validread_basic(const char *p, size_t max_len) {
	const unsigned char *up = (const unsigned char *)(p);

	/* nothing is readable: do not touch *p at all */
	if (max_len == 0) {
		return 0;
	}

	const unsigned len = utf8_char_length(*p);

	/* len == 0: not a lead byte. Sequences longer than 4 bytes are not
	 * supported (their lead bytes are reported as length 0 as well).
	 * max_len < len: the sequence is truncated by the end of the buffer. */
	if (len == 0 || max_len < len) {
		return 0;
	}

	/* every byte after the lead one must be a 10xxxxxx continuation */
	for (unsigned i = 1; i < len; ++i) {
		if ((up[i] & 0xC0) != 0x80) {
			return 0;
		}
	}

	return (int)len;
}
167 | |
168 | #endif /* NU_UTF8_INTERNAL_H */ |
169 | |