| 1 | #ifndef NU_UTF8_H |
| 2 | #define NU_UTF8_H |
| 3 | |
| 4 | #include <stdint.h> |
| 5 | #include <sys/types.h> |
| 6 | |
| 7 | #include <libnu/config.h> |
| 8 | #include <libnu/defines.h> |
| 9 | #include <libnu/utf8_internal.h> |
| 10 | |
/** @defgroup utf8 UTF-8 support
 *
 * Note: there is deliberately no equivalent of utf8_string[i] - indexing
 * an encoded string by codepoint would be slow. Walk the string with
 * nu_utf8_read() and nu_utf8_revread() instead.
 *
 * @example utf8.c
 * @example revread.c
 */
| 19 | |
| 20 | #if defined (__cplusplus) || defined (c_plusplus) |
| 21 | extern "C" { |
| 22 | #endif |
| 23 | |
| 24 | #ifdef NU_WITH_UTF8_READER |
| 25 | |
/** Read codepoint from UTF-8 string
 *
 * The length of the sequence is chosen from the lead byte alone:
 * below 0x80 - 1 byte, below 0xE0 - 2 bytes, below 0xF0 - 3 bytes,
 * anything else - 4 bytes. This function itself does not check that
 * the lead byte is a legal one or that the following bytes exist,
 * so it must only be used on input already known to be valid UTF-8.
 *
 * @ingroup utf8
 * @param utf8 pointer to UTF-8 encoded string
 * @param unicode output unicode codepoint, or 0 to only skip over
 * the sequence without decoding it
 * @return pointer to next codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_read(const char *utf8, uint32_t *unicode) {
	/* read the lead byte as unsigned, keeping the const qualifier */
	uint32_t c = *(const unsigned char *)(utf8);

	/* single byte (ASCII): the byte is the codepoint */
	if (c < 0x80) {
		if (unicode != 0) {
			*unicode = c;
		}
		return utf8 + 1;
	}

	/* multibyte: decoding is delegated to the utf8_Nb() helpers */
	if (c < 0xE0) {
		if (unicode != 0) {
			utf8_2b(utf8, unicode);
		}
		return utf8 + 2;
	}

	if (c < 0xF0) {
		if (unicode != 0) {
			utf8_3b(utf8, unicode);
		}
		return utf8 + 3;
	}

	if (unicode != 0) {
		utf8_4b(utf8, unicode);
	}
	return utf8 + 4;
}
| 63 | |
| 64 | #ifdef NU_WITH_REVERSE_READ |
| 65 | |
/** Read codepoint from UTF-8 string in backward direction
 *
 * Steps back from utf8 to the first byte of the codepoint that ends
 * right before it, and optionally decodes that codepoint.
 *
 * The caller is responsible for making sure that there is a codepoint
 * before utf8: this function has no way to tell where the encoded
 * string begins. A call like nu_utf8_revread(&u, "hello") reads
 * before the start of the string, which is undefined behavior.
 *
 * @ingroup utf8
 * @param unicode output unicode codepoint, or 0 to only step back
 * @param utf8 pointer to UTF-8 encoded string
 * @return pointer to previous codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) {
	const char *lead = utf8;

	/* always step back at least one byte, then keep stepping while
	 * the byte under the cursor is a continuation byte (10xxxxxx);
	 * in valid UTF-8 anything else begins a byte sequence */
	do {
		--lead;
	} while (((unsigned char)(*lead) & 0xC0) == 0x80);

	if (unicode == 0) {
		return lead;
	}

	nu_utf8_read(lead, unicode);
	return lead;
}
| 93 | |
| 94 | #endif /* NU_WITH_REVERSE_READ */ |
| 95 | |
| 96 | #ifdef NU_WITH_VALIDATION |
| 97 | |
| 98 | /** Validate codepoint in string |
| 99 | * |
| 100 | * @ingroup utf8 |
| 101 | * @param encoded buffer with encoded string |
| 102 | * @param max_len buffer length |
| 103 | * @return codepoint length or 0 on error |
| 104 | */ |
| 105 | NU_EXPORT |
| 106 | int nu_utf8_validread(const char *encoded, size_t max_len); |
| 107 | |
| 108 | #endif /* NU_WITH_VALIDATION */ |
| 109 | #endif /* NU_WITH_UTF8_READER */ |
| 110 | |
| 111 | #ifdef NU_WITH_UTF8_WRITER |
| 112 | |
| 113 | /** Write unicode codepoints into UTF-8 encoded string |
| 114 | * |
| 115 | * @ingroup utf8 |
| 116 | * @param unicode unicode codepoint |
| 117 | * @param utf8 pointer to buffer to write UTF-8 encoded text to, |
| 118 | * should be large enough to hold encoded value |
| 119 | * @return pointer to byte after last written |
| 120 | */ |
| 121 | NU_EXPORT |
| 122 | char* nu_utf8_write(uint32_t unicode, char *utf8); |
| 123 | |
| 124 | #endif /* NU_WITH_UTF8_WRITER */ |
| 125 | |
| 126 | #if defined (__cplusplus) || defined (c_plusplus) |
| 127 | } |
| 128 | #endif |
| 129 | |
| 130 | #endif /* NU_UTF8_H */ |
| 131 | |