1 | #ifndef NU_UTF8_H |
2 | #define NU_UTF8_H |
3 | |
4 | #include <stdint.h> |
5 | #include <sys/types.h> |
6 | |
7 | #include <libnu/config.h> |
8 | #include <libnu/defines.h> |
9 | #include <libnu/utf8_internal.h> |
10 | |
/** @defgroup utf8 UTF-8 support
 *
 * Note: there is no utf8_string[i] equivalent - indexing by codepoint
 * would be slow. Iterate with nu_utf8_read() and nu_utf8_revread() instead.
 *
 * @example utf8.c
 * @example revread.c
 */
19 | |
20 | #if defined (__cplusplus) || defined (c_plusplus) |
21 | extern "C" { |
22 | #endif |
23 | |
24 | #ifdef NU_WITH_UTF8_READER |
25 | |
/** Read codepoint from UTF-8 string
 *
 * The sequence length is chosen from the lead byte alone:
 * below 0x80 is 1 byte, below 0xE0 is 2 bytes, below 0xF0 is 3 bytes,
 * anything else is 4 bytes. No validation is done here - continuation
 * bytes are not checked, and a stray continuation byte (0x80..0xBF)
 * in lead position is treated as the start of a 2-byte sequence.
 *
 * Multi-byte decoding is delegated to utf8_2b(), utf8_3b() and utf8_4b(),
 * which are expected to come from libnu/utf8_internal.h.
 *
 * @ingroup utf8
 * @param utf8 pointer to UTF-8 encoded string
 * @param unicode output unicode codepoint, or 0 to only skip over
 * the sequence without decoding it
 * @return pointer to next codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_read(const char *utf8, uint32_t *unicode) {
	/* go through unsigned char so that bytes >= 0x80 are not
	 * sign-extended where plain char is signed */
	uint32_t c = *(const unsigned char *)(utf8);

	if (c >= 0x80) {
		if (c < 0xE0) {
			if (unicode != 0) {
				utf8_2b(utf8, unicode);
			}
			return utf8 + 2;
		}
		else if (c < 0xF0) {
			if (unicode != 0) {
				utf8_3b(utf8, unicode);
			}
			return utf8 + 3;
		}
		else {
			if (unicode != 0) {
				utf8_4b(utf8, unicode);
			}
			return utf8 + 4;
		}
	}
	else if (unicode != 0) {
		/* single byte: the codepoint is the byte value itself */
		*unicode = c;
	}

	return utf8 + 1;
}
63 | |
64 | #ifdef NU_WITH_REVERSE_READ |
65 | |
/** Read codepoint from UTF-8 string in backward direction
 *
 * Steps back from utf8 to the first byte that is not a continuation
 * byte (10xxxxxx) and decodes the sequence starting there.
 *
 * There is no lower bound on the backward scan: it is the caller's
 * responsibility to make sure this call does not go below the beginning
 * of the encoded string. A call like nu_utf8_revread(&u, "hello");
 * results in undefined behavior.
 *
 * @ingroup utf8
 * @param unicode output unicode codepoint, or 0 to only locate the
 * previous sequence without decoding it
 * @param utf8 pointer to UTF-8 encoded string
 * @return pointer to previous codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) {
	const char *lead = utf8;

	/* always move back at least one byte, then keep moving while
	 * the byte under the cursor is a continuation byte */
	do {
		--lead;
	} while (((unsigned char)(*lead) & 0xC0) == 0x80);

	if (unicode != 0) {
		nu_utf8_read(lead, unicode);
	}

	return lead;
}
93 | |
94 | #endif /* NU_WITH_REVERSE_READ */ |
95 | |
96 | #ifdef NU_WITH_VALIDATION |
97 | |
98 | /** Validate codepoint in string |
99 | * |
100 | * @ingroup utf8 |
101 | * @param encoded buffer with encoded string |
102 | * @param max_len buffer length |
103 | * @return codepoint length or 0 on error |
104 | */ |
105 | NU_EXPORT |
106 | int nu_utf8_validread(const char *encoded, size_t max_len); |
107 | |
108 | #endif /* NU_WITH_VALIDATION */ |
109 | #endif /* NU_WITH_UTF8_READER */ |
110 | |
111 | #ifdef NU_WITH_UTF8_WRITER |
112 | |
113 | /** Write unicode codepoints into UTF-8 encoded string |
114 | * |
115 | * @ingroup utf8 |
116 | * @param unicode unicode codepoint |
117 | * @param utf8 pointer to buffer to write UTF-8 encoded text to, |
118 | * should be large enough to hold encoded value |
119 | * @return pointer to byte after last written |
120 | */ |
121 | NU_EXPORT |
122 | char* nu_utf8_write(uint32_t unicode, char *utf8); |
123 | |
124 | #endif /* NU_WITH_UTF8_WRITER */ |
125 | |
126 | #if defined (__cplusplus) || defined (c_plusplus) |
127 | } |
128 | #endif |
129 | |
130 | #endif /* NU_UTF8_H */ |
131 | |