UTF.h source code [poppler/poppler/UTF.h]

1	//========================================================================
2	//
3	// UTF.h
4	//
5	// This file is licensed under the GPLv2 or later
6	//
7	// Copyright (C) 2012, 2017, 2021, 2023 Adrian Johnson <ajohnson@redneon.com>
8	// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
9	// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
10	// Copyright (C) 2018 Nelson Benítez León <nbenitezl@gmail.com>
11	// Copyright (C) 2019-2022 Albert Astals Cid <aacid@kde.org>
12	// Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net.
13	// Copyright (C) 2023, 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
14	// Copyright (C) 2023 Even Rouault <even.rouault@spatialys.com>
15	// Copyright (C) 2023, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
16	//
17	//========================================================================
18
19	#ifndef UTF_H
20	#define UTF_H
21
22	#include <cstdint>
23	#include <climits>
24	#include <string>
25	#include <vector>
26
27	#include "CharTypes.h"
28	#include "poppler_private_export.h"
29
// Magic bytes that mark the byte order in a UTF-16 unicode string (big-endian
// case): the byte order mark U+FEFF serialized as 0xFE 0xFF.
// Declared `inline` so that all translation units including this header share
// one definition; the inline helpers in this header reference it.
inline constexpr std::string_view unicodeByteOrderMark = "\xFE\xFF";
32
// Magic bytes that mark the byte order in a UTF-16 unicode string
// (little-endian case): the byte order mark U+FEFF serialized as 0xFF 0xFE.
// Declared `inline` so that all translation units including this header share
// one definition; the inline helpers in this header reference it.
inline constexpr std::string_view unicodeByteOrderMarkLE = "\xFF\xFE";
35
36	// Convert a UTF-16 string to a UCS-4
37	// utf16 - utf16 bytes
38	// utf16_len - number of UTF-16 characters
39	// returns number of UCS-4 characters
40	std::vector<Unicode> UTF16toUCS4(const Unicode utf16, int* utf16Len);
41
42	// Convert a PDF Text String to UCS-4
43	// s - PDF text string
44	// returns UCS-4 characters
45	// Convert a PDF text string to UCS-4
46	std::vector<Unicode> POPPLER_PRIVATE_EXPORT TextStringToUCS4(const std::string &textStr);
47
48	// check if UCS-4 character is valid
49	inline bool UnicodeIsValid(Unicode ucs4)
50	{
51	return (ucs4 < `0x110000`) && ((ucs4 & `0xfffff800`) != `0xd800`) && (ucs4 < `0xfdd0` \|\| ucs4 > `0xfdef`) && ((ucs4 & `0xfffe`) != `0xfffe`);
52	}
53
54	// check whether string starts with Big-Endian byte order mark
55	inline bool hasUnicodeByteOrderMark(const std::string &s)
56	{
57	return s.starts_with(x: unicodeByteOrderMark);
58	}
59
60	// check whether string starts with Little-Endian byte order mark
61	inline bool hasUnicodeByteOrderMarkLE(const std::string &s)
62	{
63	return s.starts_with(x: unicodeByteOrderMarkLE);
64	}
65
66	// put big-endian unicode byte order mark at the beginning of a string
67	inline void prependUnicodeByteOrderMark(std::string &s)
68	{
69	s.insert(pos: `0`, svt: unicodeByteOrderMark);
70	}
71
72	// is a unicode whitespace character
73	bool UnicodeIsWhitespace(Unicode ucs4);
74
75	// Count number of UCS-4 characters required to convert a UTF-8 string to
76	// UCS-4 (excluding terminating NULL).
77	int POPPLER_PRIVATE_EXPORT utf8CountUCS4(const char *utf8);
78
79	// Convert a UTF-8 string to a UCS-4
80	// utf8 - utf8 bytes
81	// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree.
82	// returns number of UCS-4 characters
83	int POPPLER_PRIVATE_EXPORT utf8ToUCS4(const char utf8, Unicode *ucs4_out);
84
85	// Count number of UTF-16 code units required to convert a UTF-8 string
86	// (excluding terminating NULL). Each invalid byte is counted as a
87	// code point since the UTF-8 conversion functions will replace it with
88	// REPLACEMENT_CHAR.
89	int POPPLER_PRIVATE_EXPORT utf8CountUtf16CodeUnits(const char *utf8);
90
91	// Convert UTF-8 to UTF-16
92	// utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num
93	// bytes to convert
94	// utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
95	// maxUtf16 - maximum size of output buffer including space for null.
96	// maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
97	// either this count is reached or a null is encountered.
98	// Returns number of UTF-16 code units written (excluding NULL).
99	int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char utf8, uint16_t utf16, int maxUtf16, int maxUtf8);
100
101	// Allocate utf16 string and convert utf8 into it.
102	uint16_t POPPLER_PRIVATE_EXPORT utf8ToUtf16(const* char utf8, int* len = nullptr*);
103
// Check whether a string starts with the UTF-8 encoded byte order mark
// (the three bytes 0xEF 0xBB 0xBF).
// A string consisting of exactly the mark is accepted as well; anything
// shorter than the mark is rejected.
inline bool isUtf8WithBom(std::string_view str)
{
    constexpr std::string_view utf8ByteOrderMark = "\xEF\xBB\xBF";
    // substr() clamps to the available length, so inputs shorter than the
    // mark simply compare unequal instead of reading out of bounds.
    return str.substr(0, utf8ByteOrderMark.size()) == utf8ByteOrderMark;
}
114
115	// Converts a UTF-8 string to a big endian UTF-16 string with BOM.
116	// The caller owns the returned pointer.
117	// utf8 - UTF-8 string to convert. An empty string is acceptable.
118	// Returns a big endian UTF-16 string with BOM or an empty string without BOM.
119	std::string POPPLER_PRIVATE_EXPORT utf8ToUtf16WithBom(const std::string &utf8);
120
121	// Count number of UTF-8 bytes required to convert a UTF-16 string to
122	// UTF-8 (excluding terminating NULL).
123	int POPPLER_PRIVATE_EXPORT utf16CountUtf8Bytes(const uint16_t *utf16);
124
125	// Convert UTF-16 to UTF-8
126	// utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
127	// code units to convert
128	// utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
129	// maxUtf8 - maximum size of output buffer including space for null.
130	// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
131	// either this count is reached or a null is encountered.
132	// Returns number of UTF-8 bytes written (excluding NULL).
133	int POPPLER_PRIVATE_EXPORT utf16ToUtf8(const uint16_t utf16, char* utf8, int* maxUtf8 = INT_MAX, int maxUtf16 = INT_MAX);
134
135	// Allocate utf8 string and convert utf16 into it.
136	char POPPLER_PRIVATE_EXPORT utf16ToUtf8(const* uint16_t utf16, int* len = nullptr*);
137
138	// Convert a UCS-4 string to pure ASCII (7bit)
139	// in - UCS-4 string bytes
140	// len - number of UCS-4 characters
141	// ucs4_out - if not NULL, allocates and returns UCS-4 string. Free with gfree.
142	// out_len - number of UCS-4 characters in ucs4_out.
143	// in_idx - if not NULL, the int array returned by the out fourth parameter of
144	// unicodeNormalizeNFKC() function. Optional, needed for @indices out parameter.
145	// indices - if not NULL, @indices is assigned the location of a newly-allocated array
146	// of length @out_len + 1, for each character in the ascii string giving the index
147	// of the corresponding character in the text of the line (thanks to this info
148	// being passed in @in_idx parameter).
149	void POPPLER_PRIVATE_EXPORT unicodeToAscii7(const Unicode in, int* len, Unicode *ucs4_out, int* out_len, const* int in_idx, int* **indices);
150
151	// Convert a PDF Text String to UTF-8
152	// textStr - PDF text string
153	// returns UTF-8 string.
154	std::string POPPLER_PRIVATE_EXPORT TextStringToUtf8(const std::string &textStr);
155
156	#endif
157

source code of poppler/poppler/UTF.h