1 | //======================================================================== |
2 | // |
3 | // UTF.h |
4 | // |
5 | // This file is licensed under the GPLv2 or later |
6 | // |
7 | // Copyright (C) 2012, 2017, 2021, 2023 Adrian Johnson <ajohnson@redneon.com> |
8 | // Copyright (C) 2016 Jason Crain <jason@aquaticape.us> |
9 | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
10 | // Copyright (C) 2018 Nelson Benítez León <nbenitezl@gmail.com> |
11 | // Copyright (C) 2019-2022 Albert Astals Cid <aacid@kde.org> |
12 | // Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net. |
13 | // Copyright (C) 2023, 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk> |
14 | // Copyright (C) 2023 Even Rouault <even.rouault@spatialys.com> |
15 | // Copyright (C) 2023, 2024 Oliver Sander <oliver.sander@tu-dresden.de> |
16 | // |
17 | //======================================================================== |
18 | |
19 | #ifndef UTF_H |
20 | #define UTF_H |
21 | |
22 | #include <cstdint> |
23 | #include <climits> |
24 | #include <string> |
25 | #include <vector> |
26 | |
27 | #include "CharTypes.h" |
28 | #include "poppler_private_export.h" |
29 | |
// Magic bytes that mark the byte order in a UTF-16 unicode string (big-endian case).
// Declared inline so that every translation unit including this header refers
// to the same object; the inline helpers in this header use it.
inline constexpr std::string_view unicodeByteOrderMark = "\xFE\xFF";
32 | |
// Magic bytes that mark the byte order in a UTF-16 unicode string (little-endian case).
// Declared inline so that every translation unit including this header refers
// to the same object; the inline helpers in this header use it.
inline constexpr std::string_view unicodeByteOrderMarkLE = "\xFF\xFE";
35 | |
36 | // Convert a UTF-16 string to UCS-4 |
37 | // utf16 - UTF-16 input, passed as an array of Unicode elements |
38 | // utf16Len - number of UTF-16 characters in utf16 |
39 | // returns the converted UCS-4 characters |
40 | std::vector<Unicode> UTF16toUCS4(const Unicode *utf16, int utf16Len); |
41 | |
42 | // Convert a PDF text string to UCS-4 |
43 | // textStr - PDF text string |
44 | // returns the UCS-4 characters |
45 | // NOTE(review): the encodings accepted for textStr are decided by the definition, which is not visible in this header |
46 | std::vector<Unicode> POPPLER_PRIVATE_EXPORT TextStringToUCS4(const std::string &textStr); |
47 | |
48 | // check if UCS-4 character is valid |
49 | inline bool UnicodeIsValid(Unicode ucs4) |
50 | { |
51 | return (ucs4 < 0x110000) && ((ucs4 & 0xfffff800) != 0xd800) && (ucs4 < 0xfdd0 || ucs4 > 0xfdef) && ((ucs4 & 0xfffe) != 0xfffe); |
52 | } |
53 | |
54 | // check whether string starts with Big-Endian byte order mark |
55 | inline bool hasUnicodeByteOrderMark(const std::string &s) |
56 | { |
57 | return s.starts_with(x: unicodeByteOrderMark); |
58 | } |
59 | |
60 | // check whether string starts with Little-Endian byte order mark |
61 | inline bool hasUnicodeByteOrderMarkLE(const std::string &s) |
62 | { |
63 | return s.starts_with(x: unicodeByteOrderMarkLE); |
64 | } |
65 | |
66 | // put big-endian unicode byte order mark at the beginning of a string |
67 | inline void prependUnicodeByteOrderMark(std::string &s) |
68 | { |
69 | s.insert(pos: 0, svt: unicodeByteOrderMark); |
70 | } |
71 | |
72 | // Returns true if the UCS-4 character ucs4 is a unicode whitespace character |
73 | bool UnicodeIsWhitespace(Unicode ucs4); |
74 | |
75 | // Count the number of UCS-4 characters required to convert the UTF-8 string utf8 to |
76 | // UCS-4 (excluding terminating NULL). NOTE(review): no length is passed, so utf8 is presumably null terminated - confirm. |
77 | int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8); |
78 | |
79 | // Convert a UTF-8 string to UCS-4 |
80 | // utf8 - UTF-8 bytes to convert |
81 | // ucs4_out - if not NULL, receives a newly allocated UCS-4 string. Free with gfree. |
82 | // returns the number of UCS-4 characters |
83 | int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out); |
84 | |
85 | // Count the number of UTF-16 code units required to convert a UTF-8 string |
86 | // (excluding terminating NULL). Each invalid byte is counted as one |
87 | // code point, since the UTF-8 conversion functions will replace it with |
88 | // REPLACEMENT_CHAR. |
89 | int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8); |
90 | |
91 | // Convert UTF-8 to UTF-16 |
92 | // utf8 - UTF-8 string to convert. If not null terminated, set maxUtf8 to the number |
93 | //        of bytes to convert. |
94 | // utf16 - output buffer to write UTF-16 to. Output will always be null terminated. |
95 | // maxUtf16 - maximum size of the output buffer, including space for the null. |
96 | // maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when |
97 | //           either this count is reached or a null is encountered. |
98 | // Returns the number of UTF-16 code units written (excluding NULL). |
99 | int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8); |
100 | |
101 | // Allocate a UTF-16 string and convert utf8 into it. len is optional (defaults to nullptr). NOTE(review): len presumably receives the converted length and the caller frees the result - confirm against the definition. |
102 | uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr); |
103 | |
// Check whether a string starts with the UTF-8 byte order mark (the three
// bytes 0xEF 0xBB 0xBF).
// str - bytes to inspect
// returns true if str begins with the mark, including when str consists of
// the mark alone; false for anything shorter than the mark
inline bool isUtf8WithBom(std::string_view str)
{
    constexpr std::string_view utf8Bom = "\xEF\xBB\xBF";
    // substr() clamps to the available length, so a too-short input simply
    // compares unequal instead of being read out of bounds.
    return str.substr(0, utf8Bom.size()) == utf8Bom;
}
114 | |
115 | // Converts a UTF-8 string to a big endian UTF-16 string with BOM. |
116 | // The result is returned by value as a std::string, so there is no pointer for the caller to free. |
117 | // utf8 - UTF-8 string to convert. An empty string is acceptable. |
118 | // Returns a big endian UTF-16 string with BOM or an empty string without BOM. |
119 | std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(const std::string &utf8); |
120 | |
121 | // Count the number of UTF-8 bytes required to convert the UTF-16 string utf16 to |
122 | // UTF-8 (excluding terminating NULL). NOTE(review): no length is passed, so utf16 is presumably null terminated - confirm. |
123 | int POPPLER_PRIVATE_EXPORT utf16CountUtf8Bytes(const uint16_t *utf16); |
124 | |
125 | // Convert UTF-16 to UTF-8 |
126 | // utf16 - UTF-16 string to convert. If not null terminated, set maxUtf16 to the number |
127 | //         of code units to convert. |
128 | // utf8 - output buffer to write UTF-8 to. Output will always be null terminated. |
129 | // maxUtf8 - maximum size of the output buffer, including space for the null. Defaults to INT_MAX. |
130 | // maxUtf16 - maximum number of UTF-16 code units to convert (defaults to INT_MAX). Conversion stops when |
131 | //            either this count is reached or a null is encountered. |
132 | // Returns the number of UTF-8 bytes written (excluding NULL). |
133 | int POPPLER_PRIVATE_EXPORT utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX); |
134 | |
135 | // Allocate a UTF-8 string and convert utf16 into it. len is optional (defaults to nullptr). NOTE(review): len presumably receives the converted length and the caller frees the result - confirm against the definition. |
136 | char POPPLER_PRIVATE_EXPORT *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr); |
137 | |
138 | // Convert a UCS-4 string to pure ASCII (7bit) |
139 | // in - UCS-4 string to convert |
140 | // len - number of UCS-4 characters in the input |
141 | // ucs4_out - if not NULL, receives a newly allocated UCS-4 string. Free with gfree. |
142 | // out_len - number of UCS-4 characters in ucs4_out. |
143 | // in_idx - if not NULL, the int array returned through the fourth (out) parameter of the |
144 | // unicodeNormalizeNFKC() function. Optional, needed for the @indices out parameter. |
145 | // indices - if not NULL, @indices is assigned the location of a newly-allocated array |
146 | // of length @out_len + 1, giving for each character in the ascii string the index |
147 | // of the corresponding character in the text of the line (thanks to this info |
148 | // being passed in the @in_idx parameter). |
149 | void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices); |
150 | |
151 | // Convert a PDF text string to UTF-8 |
152 | // textStr - PDF text string to convert |
153 | // returns the UTF-8 string, by value |
154 | std::string POPPLER_PRIVATE_EXPORT TextStringToUtf8(const std::string &textStr); |
155 | |
156 | #endif |
157 | |