| 1 | #ifndef NU_UTF8_INTERNAL_H |
| 2 | #define NU_UTF8_INTERNAL_H |
| 3 | |
| 4 | #include <sys/types.h> |
| 5 | |
/**
 * Report how many bytes a UTF-8 sequence occupies, judging only by its
 * first byte.
 *
 * @param c first byte of the sequence
 * @return 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx;
 *         0 for anything else (a 10xxxxxx continuation byte or a
 *         11111xxx pattern)
 */
static inline
unsigned utf8_char_length(const char c) {
    /* work on the unsigned value: plain char may be signed */
    const unsigned char lead = (unsigned char)c;

    if (lead < 0x80) {
        return 1; /* 0x00..0x7F: single byte */
    }
    if (lead < 0xC0) {
        return 0; /* 0x80..0xBF: continuation byte, not a lead */
    }
    if (lead < 0xE0) {
        return 2; /* 0xC0..0xDF */
    }
    if (lead < 0xF0) {
        return 3; /* 0xE0..0xEF */
    }
    if (lead < 0xF8) {
        return 4; /* 0xF0..0xF7 */
    }

    return 0; /* 0xF8..0xFF: undefined */
}
| 17 | |
/**
 * Decode a two-byte UTF-8 sequence into a code point.
 *
 * Layout: 110xxxxx 10yyyyyy  ->  00000xxx xxyyyyyy
 *
 * The marker bits are masked off without being checked, and exactly two
 * bytes are read from p.
 *
 * @param p         first byte of the sequence
 * @param codepoint receives the decoded value
 */
static inline
void utf8_2b(const char *p, uint32_t *codepoint) {
    const unsigned char *bytes = (const unsigned char *)p;

    /* five payload bits from the lead byte, six from the continuation */
    const uint32_t high = (uint32_t)(bytes[0] & 0x1F) << 6;
    const uint32_t low = (uint32_t)(bytes[1] & 0x3F);

    *codepoint = high | low;
}
| 32 | |
/**
 * Decode a three-byte UTF-8 sequence into a code point.
 *
 * Layout: 1110xxxx 10yyyyyy 10zzzzzz  ->  xxxxyyyy yyzzzzzz
 *
 * The marker bits are masked off without being checked, and exactly three
 * bytes are read from p.
 *
 * @param p         first byte of the sequence
 * @param codepoint receives the decoded value
 */
static inline
void utf8_3b(const char *p, uint32_t *codepoint) {
    const unsigned char *bytes = (const unsigned char *)p;

    /* 4 + 6 + 6 payload bits, most significant first */
    const uint32_t top = (uint32_t)(bytes[0] & 0x0F) << 12;
    const uint32_t mid = (uint32_t)(bytes[1] & 0x3F) << 6;
    const uint32_t low = (uint32_t)(bytes[2] & 0x3F);

    *codepoint = top | mid | low;
}
| 49 | |
/**
 * Decode a four-byte UTF-8 sequence into a code point.
 *
 * Layout: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
 *     ->  000wwwxx xxxxyyyy yyzzzzzz
 *
 * The marker bits are masked off without being checked, and exactly four
 * bytes are read from p.
 *
 * @param p         first byte of the sequence
 * @param codepoint receives the decoded value
 */
static inline
void utf8_4b(const char *p, uint32_t *codepoint) {
    const unsigned char *bytes = (const unsigned char *)p;

    /* 3 + 6 + 6 + 6 payload bits, most significant first */
    uint32_t value = (uint32_t)(bytes[0] & 0x07) << 18;
    value |= (uint32_t)(bytes[1] & 0x3F) << 12;
    value |= (uint32_t)(bytes[2] & 0x3F) << 6;
    value |= (uint32_t)(bytes[3] & 0x3F);

    *codepoint = value;
}
| 70 | |
/**
 * Number of bytes needed to encode a code point in UTF-8.
 *
 * @param codepoint value to be encoded
 * @return 1 below 0x80, 2 below 0x800, 3 below 0x10000, otherwise 4
 *         (no upper bound is checked, so any larger value also yields 4)
 */
static inline
unsigned utf8_codepoint_length(uint32_t codepoint) {
    unsigned length = 1;

    /* each threshold crossed costs one more byte */
    if (codepoint >= 0x80) {
        ++length;
    }
    if (codepoint >= 0x0800) {
        ++length;
    }
    if (codepoint >= 0x10000) {
        ++length; /* 4 is the de facto max length in UTF-8 */
    }

    return length;
}
| 79 | |
/**
 * Encode a code point as a two-byte UTF-8 sequence.
 *
 * Layout: 00000xxx xxyyyyyy  ->  110xxxxx 10yyyyyy
 *
 * Writes exactly two bytes to p. The code point is not range-checked:
 * bits 11..15 are not masked out and would spill into the lead byte's
 * marker bits, so callers are presumably expected to pass values below
 * 0x800 only.
 *
 * @param codepoint value to encode
 * @param p         destination buffer
 */
static inline
void b2_utf8(uint32_t codepoint, char *p) {
    unsigned char *out = (unsigned char *)p;

    /* lead byte: marker 110 plus everything above the low six bits
     * (of the low 16 bits); the store keeps only the low eight bits */
    out[0] = (unsigned char)(0xC0 | ((codepoint & 0xFFFF) >> 6));

    /* continuation byte: marker 10 plus the low six bits */
    out[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
}
| 95 | |
/**
 * Encode a code point as a three-byte UTF-8 sequence.
 *
 * Layout: xxxxyyyy yyzzzzzz  ->  1110xxxx 10yyyyyy 10zzzzzz
 *
 * Writes exactly three bytes to p. Only the low 16 bits of the code point
 * contribute to the output; no range check is made.
 *
 * @param codepoint value to encode
 * @param p         destination buffer
 */
static inline
void b3_utf8(uint32_t codepoint, char *p) {
    unsigned char *out = (unsigned char *)p;

    const uint32_t top = (codepoint >> 12) & 0x0F; /* bits 12..15 */
    const uint32_t mid = (codepoint >> 6) & 0x3F;  /* bits 6..11 */
    const uint32_t low = codepoint & 0x3F;         /* bits 0..5 */

    out[0] = (unsigned char)(0xE0 | top);
    out[1] = (unsigned char)(0x80 | mid);
    out[2] = (unsigned char)(0x80 | low);
}
| 114 | |
/**
 * Encode a code point as a four-byte UTF-8 sequence.
 *
 * Layout: 000wwwxx xxxxyyyy yyzzzzzz
 *     ->  11110www 10xxxxxx 10yyyyyy 10zzzzzz
 *
 * Writes exactly four bytes to p. Only the low 21 bits of the code point
 * contribute to the output; no range check is made.
 *
 * @param codepoint value to encode
 * @param p         destination buffer
 */
static inline
void b4_utf8(uint32_t codepoint, char *p) {
    unsigned char *out = (unsigned char *)p;

    /* Each continuation byte carries exactly six payload bits. Bit 12 of
     * the code point belongs to the 2nd byte (bits 12..17); placing it in
     * the 3rd byte would overwrite that byte's 10xxxxxx marker. */
    out[0] = (unsigned char)(0xF0 | ((codepoint >> 18) & 0x07)); /* bits 18..20 */
    out[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F)); /* bits 12..17 */
    out[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));  /* bits 6..11 */
    out[3] = (unsigned char)(0x80 | (codepoint & 0x3F));         /* bits 0..5 */
}
| 136 | |
/**
 * Check that the bytes at p form a structurally valid UTF-8 sequence.
 *
 * The first byte must be 0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx, and a
 * multi-byte lead must be followed by the matching number of 10xxxxxx
 * continuation bytes. Only this byte-pattern check is made: overlong
 * encodings, surrogate values and values above U+10FFFF are not rejected.
 * Sequences longer than 4 bytes are not supported.
 *
 * @param p       start of the sequence
 * @param max_len number of bytes readable at p
 * @return length of the sequence in bytes (1..4), or 0 when the lead byte
 *         is not recognised, the sequence does not fit in max_len, or a
 *         continuation byte is malformed
 */
static inline
int utf8_validread_basic(const char *p, size_t max_len) {
    const unsigned char *up = (const unsigned char *)p;

    /* nothing is readable, so do not even look at the lead byte */
    if (max_len == 0) {
        return 0;
    }

    const unsigned len = utf8_char_length(*p);

    /* unrecognised lead byte, or sequence truncated by the buffer end */
    if (len == 0 || max_len < len) {
        return 0;
    }

    /* every byte after the lead must look like 10xxxxxx */
    for (unsigned i = 1; i < len; ++i) {
        if ((up[i] & 0xC0) != 0x80) {
            return 0;
        }
    }

    return (int)len;
}
| 167 | |
| 168 | #endif /* NU_UTF8_INTERNAL_H */ |
| 169 | |