1//========================================================================
2//
3// UTF.h
4//
5// This file is licensed under the GPLv2 or later
6//
7// Copyright (C) 2012, 2017, 2021, 2023 Adrian Johnson <ajohnson@redneon.com>
8// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
9// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
10// Copyright (C) 2018 Nelson Benítez León <nbenitezl@gmail.com>
11// Copyright (C) 2019-2022 Albert Astals Cid <aacid@kde.org>
12// Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net.
13// Copyright (C) 2023, 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
14// Copyright (C) 2023 Even Rouault <even.rouault@spatialys.com>
15// Copyright (C) 2023, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
16//
17//========================================================================
18
19#ifndef UTF_H
20#define UTF_H
21
22#include <cstdint>
23#include <climits>
24#include <string>
25#include <vector>
26
27#include "CharTypes.h"
28#include "poppler_private_export.h"
29
// Magic bytes that mark the byte order in a UTF-16 unicode string (big-endian case).
// Declared `inline` so that every translation unit including this header refers
// to the same object; a plain namespace-scope `constexpr` has internal linkage,
// which is a problem when inline functions in a header pass it by value.
inline constexpr std::string_view unicodeByteOrderMark = "\xFE\xFF";
32
// Magic bytes that mark the byte order in a UTF-16 unicode string (little-endian case).
// Declared `inline` so that every translation unit including this header refers
// to the same object; a plain namespace-scope `constexpr` has internal linkage,
// which is a problem when inline functions in a header pass it by value.
inline constexpr std::string_view unicodeByteOrderMarkLE = "\xFF\xFE";
35
36// Convert a UTF-16 string to a UCS-4
37// utf16 - utf16 bytes
38// utf16_len - number of UTF-16 characters
39// returns number of UCS-4 characters
40std::vector<Unicode> UTF16toUCS4(const Unicode *utf16, int utf16Len);
41
42// Convert a PDF Text String to UCS-4
43// s - PDF text string
44// returns UCS-4 characters
45// Convert a PDF text string to UCS-4
46std::vector<Unicode> POPPLER_PRIVATE_EXPORT TextStringToUCS4(const std::string &textStr);
47
48// check if UCS-4 character is valid
49inline bool UnicodeIsValid(Unicode ucs4)
50{
51 return (ucs4 < 0x110000) && ((ucs4 & 0xfffff800) != 0xd800) && (ucs4 < 0xfdd0 || ucs4 > 0xfdef) && ((ucs4 & 0xfffe) != 0xfffe);
52}
53
54// check whether string starts with Big-Endian byte order mark
55inline bool hasUnicodeByteOrderMark(const std::string &s)
56{
57 return s.starts_with(x: unicodeByteOrderMark);
58}
59
60// check whether string starts with Little-Endian byte order mark
61inline bool hasUnicodeByteOrderMarkLE(const std::string &s)
62{
63 return s.starts_with(x: unicodeByteOrderMarkLE);
64}
65
66// put big-endian unicode byte order mark at the beginning of a string
67inline void prependUnicodeByteOrderMark(std::string &s)
68{
69 s.insert(pos: 0, svt: unicodeByteOrderMark);
70}
71
72// is a unicode whitespace character
73bool UnicodeIsWhitespace(Unicode ucs4);
74
75// Count number of UCS-4 characters required to convert a UTF-8 string to
76// UCS-4 (excluding terminating NULL).
77int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8);
78
79// Convert a UTF-8 string to a UCS-4
80// utf8 - utf8 bytes
81// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree.
82// returns number of UCS-4 characters
83int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char *utf8, Unicode **ucs4_out);
84
85// Count number of UTF-16 code units required to convert a UTF-8 string
86// (excluding terminating NULL). Each invalid byte is counted as a
87// code point since the UTF-8 conversion functions will replace it with
88// REPLACEMENT_CHAR.
89int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8);
90
91// Convert UTF-8 to UTF-16
92// utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num
93// bytes to convert
94// utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
95// maxUtf16 - maximum size of output buffer including space for null.
96// maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
97// either this count is reached or a null is encountered.
98// Returns number of UTF-16 code units written (excluding NULL).
99int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8);
100
101// Allocate utf16 string and convert utf8 into it.
102uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr);
103
// Check whether a string starts with the UTF-8 byte order mark (EF BB BF).
//   str - bytes to inspect
// Returns true when the first three bytes of str are the UTF-8 BOM. The mark
// is three bytes long, so a string consisting of nothing but the BOM also
// qualifies; shorter strings never do.
inline bool isUtf8WithBom(std::string_view str)
{
    constexpr std::string_view utf8Bom = "\xEF\xBB\xBF";
    // substr clamps the length, so inputs shorter than the BOM simply compare unequal.
    return str.substr(0, utf8Bom.size()) == utf8Bom;
}
114
115// Converts a UTF-8 string to a big endian UTF-16 string with BOM.
116// The caller owns the returned pointer.
117// utf8 - UTF-8 string to convert. An empty string is acceptable.
118// Returns a big endian UTF-16 string with BOM or an empty string without BOM.
119std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(const std::string &utf8);
120
121// Count number of UTF-8 bytes required to convert a UTF-16 string to
122// UTF-8 (excluding terminating NULL).
123int POPPLER_PRIVATE_EXPORT utf16CountUtf8Bytes(const uint16_t *utf16);
124
125// Convert UTF-16 to UTF-8
126// utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
127// code units to convert
128// utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
129// maxUtf8 - maximum size of output buffer including space for null.
130// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
131// either this count is reached or a null is encountered.
132// Returns number of UTF-8 bytes written (excluding NULL).
133int POPPLER_PRIVATE_EXPORT utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX);
134
135// Allocate utf8 string and convert utf16 into it.
136char POPPLER_PRIVATE_EXPORT *utf16ToUtf8(const uint16_t *utf16, int *len = nullptr);
137
138// Convert a UCS-4 string to pure ASCII (7bit)
139// in - UCS-4 string bytes
140// len - number of UCS-4 characters
141// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree.
142// out_len - number of UCS-4 characters in ucs4_out.
143// in_idx - if not NULL, the int array returned by the out fourth parameter of
144// unicodeNormalizeNFKC() function. Optional, needed for @indices out parameter.
145// indices - if not NULL, @indices is assigned the location of a newly-allocated array
146// of length @out_len + 1, for each character in the ascii string giving the index
147// of the corresponding character in the text of the line (thanks to this info
148// being passed in @in_idx parameter).
149void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices);
150
151// Convert a PDF Text String to UTF-8
152// textStr - PDF text string
153// returns UTF-8 string.
154std::string POPPLER_PRIVATE_EXPORT TextStringToUtf8(const std::string &textStr);
155
156#endif
157

// source code of poppler/poppler/UTF.h