unicodeobject.h source code [include/python3.12/cpython/unicodeobject.h]

1	#ifndef Py_CPYTHON_UNICODEOBJECT_H
2	# error "this header file must not be included directly"
3	#endif
4
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
11
12	/ --- Internal Unicode Operations ---------------------------------------- /
13
14	// Static inline functions to work with surrogates
15	static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
16	return (`0xD800` <= ch && ch <= `0xDFFF`);
17	}
18	static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
19	return (`0xD800` <= ch && ch <= `0xDBFF`);
20	}
21	static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
22	return (`0xDC00` <= ch && ch <= `0xDFFF`);
23	}
24
25	// Join two surrogate characters and return a single Py_UCS4 value.
26	static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
27	assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
28	assert(Py_UNICODE_IS_LOW_SURROGATE(low));
29	return `0x10000` + (((high & `0x03FF`) << `10`) \| (low & `0x03FF`));
30	}
31
32	// High surrogate = top 10 bits added to 0xD800.
33	// The character must be in the range [U+10000; U+10ffff].
34	static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
35	assert(`0x10000` <= ch && ch <= `0x10ffff`);
36	return (`0xD800` - (`0x10000` >> `10`) + (ch >> `10`));
37	}
38
39	// Low surrogate = bottom 10 bits added to 0xDC00.
40	// The character must be in the range [U+10000; U+10ffff].
41	static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
42	assert(`0x10000` <= ch && ch <= `0x10ffff`);
43	return (`0xDC00` + (ch & `0x3FF`));
44	}
45
46	/ --- Unicode Type ------------------------------------------------------- /
47
48	/ ASCII-only strings created through PyUnicode_New use the PyASCIIObject*
49	structure. state.ascii and state.compact are set, and the data
50	immediately follow the structure. utf8_length can be found
51	in the length field; the utf8 pointer is equal to the data pointer. /*
52	typedef struct {
53	/ There are 4 forms of Unicode strings:*
54
55	- compact ascii:
56
57	* structure = PyASCIIObject
58	* test: PyUnicode_IS_COMPACT_ASCII(op)
59	* kind = PyUnicode_1BYTE_KIND
60	* compact = 1
61	* ascii = 1
62	* (length is the length of the utf8)
63	* (data starts just after the structure)
64	* (since ASCII is decoded from UTF-8, the utf8 string are the data)
65
66	- compact:
67
68	* structure = PyCompactUnicodeObject
69	* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
70	* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
71	PyUnicode_4BYTE_KIND
72	* compact = 1
73	* ascii = 0
74	* utf8 is not shared with data
75	* utf8_length = 0 if utf8 is NULL
76	* (data starts just after the structure)
77
78	- legacy string:
79
80	* structure = PyUnicodeObject structure
81	* test: !PyUnicode_IS_COMPACT(op)
82	* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
83	PyUnicode_4BYTE_KIND
84	* compact = 0
85	* data.any is not NULL
86	* utf8 is shared and utf8_length = length with data.any if ascii = 1
87	* utf8_length = 0 if utf8 is NULL
88
89	Compact strings use only one memory block (structure + characters),
90	whereas legacy strings use one block for the structure and one block
91	for characters.
92
93	Legacy strings are created by subclasses of Unicode.
94
95	See also _PyUnicode_CheckConsistency().
96	*/
97	PyObject_HEAD
98	Py_ssize_t length; / Number of code points in the string /
99	Py_hash_t hash; / Hash value; -1 if not set /
100	struct {
101	/ If interned is non-zero, the two references from the*
102	dictionary to this object are not* counted in ob_refcnt.*
103	The possible values here are:
104	0: Not Interned
105	1: Interned
106	2: Interned and Immortal
107	3: Interned, Immortal, and Static
108	This categorization allows the runtime to determine the right
109	cleanup mechanism at runtime shutdown. /*
110	unsigned int interned:`2`;
111	/ Character size:*
112
113	- PyUnicode_1BYTE_KIND (1):
114
115	* character type = Py_UCS1 (8 bits, unsigned)
116	* all characters are in the range U+0000-U+00FF (latin1)
117	* if ascii is set, all characters are in the range U+0000-U+007F
118	(ASCII), otherwise at least one character is in the range
119	U+0080-U+00FF
120
121	- PyUnicode_2BYTE_KIND (2):
122
123	* character type = Py_UCS2 (16 bits, unsigned)
124	* all characters are in the range U+0000-U+FFFF (BMP)
125	* at least one character is in the range U+0100-U+FFFF
126
127	- PyUnicode_4BYTE_KIND (4):
128
129	* character type = Py_UCS4 (32 bits, unsigned)
130	* all characters are in the range U+0000-U+10FFFF
131	* at least one character is in the range U+10000-U+10FFFF
132	*/
133	unsigned int kind:`3`;
134	/ Compact is with respect to the allocation scheme. Compact unicode*
135	objects only require one memory block while non-compact objects use
136	one block for the PyUnicodeObject struct and another for its data
137	buffer. /*
138	unsigned int compact:`1`;
139	/ The string only contains characters in the range U+0000-U+007F (ASCII)*
140	and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
141	set, use the PyASCIIObject structure. /*
142	unsigned int ascii:`1`;
143	/ The object is statically allocated. /
144	unsigned int statically_allocated:`1`;
145	/ Padding to ensure that PyUnicode_DATA() is always aligned to*
146	4 bytes (see issue #19537 on m68k). /*
147	unsigned int :`24`;
148	} state;
149	} PyASCIIObject;
150
151	/ Non-ASCII strings allocated through PyUnicode_New use the*
152	PyCompactUnicodeObject structure. state.compact is set, and the data
153	immediately follow the structure. /*
154	typedef struct {
155	PyASCIIObject _base;
156	Py_ssize_t utf8_length; / Number of bytes in utf8, excluding the*
157	* terminating \0. */
158	char utf8; /* UTF-8 representation (null-terminated) /
159	} PyCompactUnicodeObject;
160
161	/ Object format for Unicode subclasses. /
162	typedef struct {
163	PyCompactUnicodeObject _base;
164	union {
165	void *any;
166	Py_UCS1 *latin1;
167	Py_UCS2 *ucs2;
168	Py_UCS4 *ucs4;
169	} data; / Canonical, smallest-form Unicode buffer /
170	} PyUnicodeObject;
171
172	PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
173	PyObject *op,
174	int check_content);
175
176
/* Cast 'op' to one of the three Unicode object layouts.
   Each macro first asserts PyUnicode_Check(op), then yields the pointer
   converted with _Py_CAST (comma expression, so it is usable as a value). */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))
186
187
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define SSTATE_INTERNED_IMMORTAL_STATIC 3
197
198	/ Use only if you know it's a string /
199	static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
200	return _PyASCIIObject_CAST(op)->state.interned;
201	}
202	#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
203
204	/ For backward compatibility /
205	static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
206	return `1`;
207	}
208	#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
209
210	/ Return true if the string contains only ASCII characters, or 0 if not. The*
211	string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
212	ready. /*
213	static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
214	return _PyASCIIObject_CAST(op)->state.ascii;
215	}
216	#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
217
218	/ Return true if the string is compact or 0 if not.*
219	No type checks or Ready calls are performed. /*
220	static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
221	return _PyASCIIObject_CAST(op)->state.compact;
222	}
223	#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
224
225	/ Return true if the string is a compact ASCII string (use PyASCIIObject*
226	structure), or 0 if not. No type checks or Ready calls are performed. /*
227	static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
228	return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
229	}
230	#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
231
/* Each enumerator's value equals the number of bytes per character that
   its name states (1, 2 or 4). */
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
238
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
//
// gh-89653: Converting this macro to a static inline function would introduce
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
246
247	/ Return a void pointer to the raw unicode buffer. /
248	static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
249	if (PyUnicode_IS_ASCII(op)) {
250	return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + `1`));
251	}
252	return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + `1`));
253	}
254
255	static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
256	void *data;
257	assert(!PyUnicode_IS_COMPACT(op));
258	data = _PyUnicodeObject_CAST(op)->data.any;
259	assert(data != NULL);
260	return data;
261	}
262
263	static inline void* PyUnicode_DATA(PyObject *op) {
264	if (PyUnicode_IS_COMPACT(op)) {
265	return _PyUnicode_COMPACT_DATA(op);
266	}
267	return _PyUnicode_NONCOMPACT_DATA(op);
268	}
269	#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
270
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly.  */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
279
280	/ Returns the length of the unicode string. /
281	static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
282	return _PyASCIIObject_CAST(op)->length;
283	}
284	#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
285
286	/ Write into the canonical representation, this function does not do any sanity*
287	checks and is intended for usage in loops. The caller should cache the
288	kind and data pointers obtained from other function calls.
289	index is the index in the string (starts at 0) and value is the new
290	code point value which should be written to that location. /*
291	static inline void PyUnicode_WRITE(int kind, void *data,
292	Py_ssize_t index, Py_UCS4 value)
293	{
294	assert(index >= `0`);
295	if (kind == PyUnicode_1BYTE_KIND) {
296	assert(value <= `0xffU`);
297	_Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
298	}
299	else if (kind == PyUnicode_2BYTE_KIND) {
300	assert(value <= `0xffffU`);
301	_Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
302	}
303	else {
304	assert(kind == PyUnicode_4BYTE_KIND);
305	assert(value <= `0x10ffffU`);
306	_Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
307	}
308	}
309	#define PyUnicode_WRITE(kind, data, index, value) \
310	PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
311	(index), _Py_STATIC_CAST(Py_UCS4, value))
312
313	/ Read a code point from the string's canonical representation. No checks*
314	or ready calls are performed. /*
315	static inline Py_UCS4 PyUnicode_READ(int kind,
316	const void *data, Py_ssize_t index)
317	{
318	assert(index >= `0`);
319	if (kind == PyUnicode_1BYTE_KIND) {
320	return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
321	}
322	if (kind == PyUnicode_2BYTE_KIND) {
323	return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
324	}
325	assert(kind == PyUnicode_4BYTE_KIND);
326	return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
327	}
328	#define PyUnicode_READ(kind, data, index) \
329	PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
330	_Py_STATIC_CAST(const void*, data), \
331	(index))
332
333	/ PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it*
334	calls PyUnicode_KIND() and might call it twice. For single reads, use
335	PyUnicode_READ_CHAR, for multiple consecutive reads callers should
336	cache kind and use PyUnicode_READ instead. /*
337	static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
338	{
339	int kind;
340
341	assert(index >= `0`);
342	// Tolerate reading the NUL character at str[len(str)]
343	assert(index <= PyUnicode_GET_LENGTH(unicode));
344
345	kind = PyUnicode_KIND(unicode);
346	if (kind == PyUnicode_1BYTE_KIND) {
347	return PyUnicode_1BYTE_DATA(unicode)[index];
348	}
349	if (kind == PyUnicode_2BYTE_KIND) {
350	return PyUnicode_2BYTE_DATA(unicode)[index];
351	}
352	assert(kind == PyUnicode_4BYTE_KIND);
353	return PyUnicode_4BYTE_DATA(unicode)[index];
354	}
355	#define PyUnicode_READ_CHAR(unicode, index) \
356	PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
357
358	/ Return a maximum character value which is suitable for creating another*
359	string based on op. This is always an approximation but more efficient
360	than iterating over the string. /*
361	static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
362	{
363	int kind;
364
365	if (PyUnicode_IS_ASCII(op)) {
366	return `0x7fU`;
367	}
368
369	kind = PyUnicode_KIND(op);
370	if (kind == PyUnicode_1BYTE_KIND) {
371	return `0xffU`;
372	}
373	if (kind == PyUnicode_2BYTE_KIND) {
374	return `0xffffU`;
375	}
376	assert(kind == PyUnicode_4BYTE_KIND);
377	return `0x10ffffU`;
378	}
379	#define PyUnicode_MAX_CHAR_VALUE(op) \
380	PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
381
382	/ === Public API ========================================================= /
383
384	/ --- Plain Py_UNICODE --------------------------------------------------- /
385
386	/ With PEP 393, this is the recommended way to allocate a new unicode object.*
387	This function will allocate the object and its buffer in a single memory
388	block. Objects created using this function are not resizable. /*
389	PyAPI_FUNC(PyObject*) PyUnicode_New(
390	Py_ssize_t size, / Number of code points in the new string /
391	Py_UCS4 maxchar / maximum code point value in the string /
392	);
393
394	/ For backward compatibility /
395	static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
396	{
397	return `0`;
398	}
399	#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
400
401	/ Get a copy of a Unicode string. /
402	PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
403	PyObject *unicode
404	);
405
406	/ Copy character from one unicode object into another, this function performs*
407	character conversion when necessary and falls back to memcpy() if possible.
408
409	Fail if to is too small (smaller than how_many* or smaller than*
410	len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
411	kind(to), or if to* has more than 1 reference.*
412
413	Return the number of written character, or return -1 and raise an exception
414	on error.
415
416	Pseudo-code:
417
418	how_many = min(how_many, len(from) - from_start)
419	to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
420	return how_many
421
422	Note: The function doesn't write a terminating null character.
423	*/
424	PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
425	PyObject *to,
426	Py_ssize_t to_start,
427	PyObject *from,
428	Py_ssize_t from_start,
429	Py_ssize_t how_many
430	);
431
432	/ Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so*
433	may crash if parameters are invalid (e.g. if the output string
434	is too short). /*
435	PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
436	PyObject *to,
437	Py_ssize_t to_start,
438	PyObject *from,
439	Py_ssize_t from_start,
440	Py_ssize_t how_many
441	);
442
443	/ Fill a string with a character: write fill_char into*
444	unicode[start:start+length].
445
446	Fail if fill_char is bigger than the string maximum character, or if the
447	string has more than 1 reference.
448
449	Return the number of written character, or return -1 and raise an exception
450	on error. /*
451	PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
452	PyObject *unicode,
453	Py_ssize_t start,
454	Py_ssize_t length,
455	Py_UCS4 fill_char
456	);
457
458	/ Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash*
459	if parameters are invalid (e.g. if length is longer than the string). /*
460	PyAPI_FUNC(void) _PyUnicode_FastFill(
461	PyObject *unicode,
462	Py_ssize_t start,
463	Py_ssize_t length,
464	Py_UCS4 fill_char
465	);
466
467	/ Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.*
468	Scan the string to find the maximum character. /*
469	PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
470	int kind,
471	const void *buffer,
472	Py_ssize_t size);
473
474	/ Create a new string from a buffer of ASCII characters.*
475	WARNING: Don't check if the string contains any non-ASCII character. /*
476	PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
477	const char *buffer,
478	Py_ssize_t size);
479
480	/ Compute the maximum character of the substring unicode[start:end].*
481	Return 127 for an empty string. /*
482	PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
483	PyObject *unicode,
484	Py_ssize_t start,
485	Py_ssize_t end);
486
487	/ --- _PyUnicodeWriter API ----------------------------------------------- /
488
489	typedef struct {
490	PyObject *buffer;
491	void *data;
492	int kind;
493	Py_UCS4 maxchar;
494	Py_ssize_t size;
495	Py_ssize_t pos;
496
497	/ minimum number of allocated characters (default: 0) /
498	Py_ssize_t min_length;
499
500	/ minimum character (default: 127, ASCII) /
501	Py_UCS4 min_char;
502
503	/ If non-zero, overallocate the buffer (default: 0). /
504	unsigned char overallocate;
505
506	/ If readonly is 1, buffer is a shared string (cannot be modified)*
507	and size is set to 0. /*
508	unsigned char readonly;
509	} _PyUnicodeWriter ;
510
511	/ Initialize a Unicode writer.*
512	*
513	* By default, the minimum buffer size is 0 character and overallocation is
514	* disabled. Set min_length, min_char and overallocate attributes to control
515	* the allocation of the buffer. */
516	PyAPI_FUNC(void)
517	_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
518
519	/ Prepare the buffer to write 'length' characters*
520	with the specified maximum character.
521
522	Return 0 on success, raise an exception and return -1 on error. /*
523	#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \
524	(((MAXCHAR) <= (WRITER)->maxchar \
525	&& (LENGTH) <= (WRITER)->size - (WRITER)->pos) \
526	? 0 \
527	: (((LENGTH) == 0) \
528	? 0 \
529	: _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
530
531	/ Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro*
532	instead. /*
533	PyAPI_FUNC(int)
534	_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
535	Py_ssize_t length, Py_UCS4 maxchar);
536
537	/ Prepare the buffer to have at least the kind KIND.*
538	For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
539	support characters in range U+000-U+FFFF.
540
541	Return 0 on success, raise an exception and return -1 on error. /*
542	#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \
543	((KIND) <= (WRITER)->kind \
544	? 0 \
545	: _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
546
547	/ Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()*
548	macro instead. /*
549	PyAPI_FUNC(int)
550	_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
551	int kind);
552
553	/ Append a Unicode character.*
554	Return 0 on success, raise an exception and return -1 on error. /*
555	PyAPI_FUNC(int)
556	_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
557	Py_UCS4 ch
558	);
559
560	/ Append a Unicode string.*
561	Return 0 on success, raise an exception and return -1 on error. /*
562	PyAPI_FUNC(int)
563	_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
564	PyObject str /* Unicode string /
565	);
566
567	/ Append a substring of a Unicode string.*
568	Return 0 on success, raise an exception and return -1 on error. /*
569	PyAPI_FUNC(int)
570	_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
571	PyObject str, /* Unicode string /
572	Py_ssize_t start,
573	Py_ssize_t end
574	);
575
576	/ Append an ASCII-encoded byte string.*
577	Return 0 on success, raise an exception and return -1 on error. /*
578	PyAPI_FUNC(int)
579	_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
580	const char str, /* ASCII-encoded byte string /
581	Py_ssize_t len / number of bytes, or -1 if unknown /
582	);
583
584	/ Append a latin1-encoded byte string.*
585	Return 0 on success, raise an exception and return -1 on error. /*
586	PyAPI_FUNC(int)
587	_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
588	const char str, /* latin1-encoded byte string /
589	Py_ssize_t len / length in bytes /
590	);
591
592	/ Get the value of the writer as a Unicode string. Clear the*
593	buffer of the writer. Raise an exception and return NULL
594	on error. /*
595	PyAPI_FUNC(PyObject *)
596	_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
597
598	/ Deallocate memory of a writer (clear its internal buffer). /
599	PyAPI_FUNC(void)
600	_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
601
602
603	/ Format the object based on the format_spec, as defined in PEP 3101*
604	(Advanced String Formatting). /*
605	PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
606	_PyUnicodeWriter *writer,
607	PyObject *obj,
608	PyObject *format_spec,
609	Py_ssize_t start,
610	Py_ssize_t end);
611
612	/ --- Manage the default encoding ---------------------------------------- /
613
614	/ Returns a pointer to the default encoding (UTF-8) of the*
615	Unicode object unicode.
616
617	Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
618	in the unicodeobject.
619
620	_PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
621	support the previous internal function with the same behaviour.
622
623	Use of this API is DEPRECATED since no size information can be
624	extracted from the returned data.
625	*/
626
627	PyAPI_FUNC(const char ) PyUnicode_AsUTF8(PyObject unicode);
628
629	#define _PyUnicode_AsString PyUnicode_AsUTF8
630
631	/ --- UTF-7 Codecs ------------------------------------------------------- /
632
633	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
634	PyObject unicode, /* Unicode object /
635	int base64SetO, / Encode RFC2152 Set O characters in base64 /
636	int base64WhiteSpace, / Encode whitespace (sp, ht, nl, cr) in base64 /
637	const char errors /* error handling /
638	);
639
640	/ --- UTF-8 Codecs ------------------------------------------------------- /
641
642	PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
643	PyObject *unicode,
644	const char *errors);
645
646	/ --- UTF-32 Codecs ------------------------------------------------------ /
647
648	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
649	PyObject object, /* Unicode object /
650	const char errors, /* error handling /
651	int byteorder / byteorder to use 0=BOM+native;-1=LE,1=BE /
652	);
653
654	/ --- UTF-16 Codecs ------------------------------------------------------ /
655
656	/ Returns a Python string object holding the UTF-16 encoded value of*
657	the Unicode data.
658
659	If byteorder is not 0, output is written according to the following
660	byte order:
661
662	byteorder == -1: little endian
663	byteorder == 0: native byte order (writes a BOM mark)
664	byteorder == 1: big endian
665
666	If byteorder is 0, the output string will always start with the
667	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
668	prepended.
669	*/
670	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
671	PyObject* unicode, / Unicode object /
672	const char errors, /* error handling /
673	int byteorder / byteorder to use 0=BOM+native;-1=LE,1=BE /
674	);
675
676	/ --- Unicode-Escape Codecs ---------------------------------------------- /
677
678	/ Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. /
679	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
680	const char string, /* Unicode-Escape encoded string /
681	Py_ssize_t length, / size of string /
682	const char errors, /* error handling /
683	Py_ssize_t consumed /* bytes consumed /
684	);
685	/ Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape*
686	chars. /*
687	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
688	const char string, /* Unicode-Escape encoded string /
689	Py_ssize_t length, / size of string /
690	const char errors, /* error handling /
691	Py_ssize_t consumed, /* bytes consumed /
692	int first_invalid_escape_char, /* on return, if not -1, contain the first*
693	invalid escaped char (<= 0xff) or invalid
694	octal escape (> 0xff) in string. /*
695	const char *first_invalid_escape_ptr); /* on return, if not NULL, may*
696	point to the first invalid escaped
697	char in string.
698	May be NULL if errors is not NULL. /*
699	// Export for binary compatibility.
700	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
701	const char string, /* Unicode-Escape encoded string /
702	Py_ssize_t length, / size of string /
703	const char errors, /* error handling /
704	Py_ssize_t consumed, /* bytes consumed /
705	const char *first_invalid_escape /* on return, points to first*
706	invalid escaped char in
707	string. /*
708	);
709
710	/ --- Raw-Unicode-Escape Codecs ---------------------------------------------- /
711
712	/ Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. /
713	PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
714	const char string, /* Unicode-Escape encoded string /
715	Py_ssize_t length, / size of string /
716	const char errors, /* error handling /
717	Py_ssize_t consumed /* bytes consumed /
718	);
719
720	/ --- Latin-1 Codecs ----------------------------------------------------- /
721
722	PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
723	PyObject* unicode,
724	const char* errors);
725
726	/ --- ASCII Codecs ------------------------------------------------------- /
727
728	PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
729	PyObject* unicode,
730	const char* errors);
731
732	/ --- Character Map Codecs ----------------------------------------------- /
733
734	/ Translate an Unicode object by applying a character mapping table to*
735	it and return the resulting Unicode object.
736
737	The mapping table must map Unicode ordinal integers to Unicode strings,
738	Unicode ordinal integers or None (causing deletion of the character).
739
740	Mapping tables may be dictionaries or sequences. Unmapped character
741	ordinals (ones which cause a LookupError) are left untouched and
742	are copied as-is.
743	*/
744	PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
745	PyObject unicode, /* Unicode object /
746	PyObject mapping, /* encoding mapping /
747	const char errors /* error handling /
748	);
749
750	/ --- Decimal Encoder ---------------------------------------------------- /
751
752	/ Coverts a Unicode object holding a decimal value to an ASCII string*
753	for using in int, float and complex parsers.
754	Transforms code points that have decimal digit property to the
755	corresponding ASCII digit code points. Transforms spaces to ASCII.
756	Transforms code points starting from the first non-ASCII code point that
757	is neither a decimal digit nor a space to the end into '?'. /*
758
759	PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
760	PyObject unicode /* Unicode object /
761	);
762
763	/ --- Methods & Slots ---------------------------------------------------- /
764
765	PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
766	PyObject *separator,
767	PyObject *const *items,
768	Py_ssize_t seqlen
769	);
770
771	/ Test whether a unicode is equal to ASCII identifier. Return 1 if true,*
772	0 otherwise. The right argument must be ASCII identifier.
773	Any error occurs inside will be cleared before return. /*
774	PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
775	PyObject left, /* Left string /
776	_Py_Identifier right /* Right identifier /
777	);
778
779	/ Test whether a unicode is equal to ASCII string. Return 1 if true,*
780	0 otherwise. The right argument must be ASCII-encoded string.
781	Any error occurs inside will be cleared before return. /*
782	PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
783	PyObject *left,
784	const char right /* ASCII-encoded string /
785	);
786
787	/ Externally visible for str.strip(unicode) /
788	PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
789	PyObject *self,
790	int striptype,
791	PyObject *sepobj
792	);
793
794	/ Using explicit passed-in values, insert the thousands grouping*
795	into the string pointed to by buffer. For the argument descriptions,
796	see Objects/stringlib/localeutil.h /*
797	PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
798	_PyUnicodeWriter *writer,
799	Py_ssize_t n_buffer,
800	PyObject *digits,
801	Py_ssize_t d_pos,
802	Py_ssize_t n_digits,
803	Py_ssize_t min_width,
804	const char *grouping,
805	PyObject *thousands_sep,
806	Py_UCS4 *maxchar);
807
808	/ === Characters Type APIs =============================================== /
809
810	/ These should not be used directly. Use the Py_UNICODE_IS* and*
811	Py_UNICODE_TO macros instead.*
812
813	These APIs are implemented in Objects/unicodectype.c.
814
815	*/
816
817	PyAPI_FUNC(int) _PyUnicode_IsLowercase(
818	Py_UCS4 ch / Unicode character /
819	);
820
821	PyAPI_FUNC(int) _PyUnicode_IsUppercase(
822	Py_UCS4 ch / Unicode character /
823	);
824
825	PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
826	Py_UCS4 ch / Unicode character /
827	);
828
829	PyAPI_FUNC(int) _PyUnicode_IsXidStart(
830	Py_UCS4 ch / Unicode character /
831	);
832
833	PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
834	Py_UCS4 ch / Unicode character /
835	);
836
837	PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
838	const Py_UCS4 ch / Unicode character /
839	);
840
841	PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
842	const Py_UCS4 ch / Unicode character /
843	);
844
845	/ Py_DEPRECATED(3.3) / PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
846	Py_UCS4 ch / Unicode character /
847	);
848
849	/ Py_DEPRECATED(3.3) / PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
850	Py_UCS4 ch / Unicode character /
851	);
852
853	Py_DEPRECATED(`3.3`) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
854	Py_UCS4 ch / Unicode character /
855	);
856
857	PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
858	Py_UCS4 ch, / Unicode character /
859	Py_UCS4 *res
860	);
861
862	PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
863	Py_UCS4 ch, / Unicode character /
864	Py_UCS4 *res
865	);
866
867	PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
868	Py_UCS4 ch, / Unicode character /
869	Py_UCS4 *res
870	);
871
872	PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
873	Py_UCS4 ch, / Unicode character /
874	Py_UCS4 *res
875	);
876
877	PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
878	Py_UCS4 ch / Unicode character /
879	);
880
881	PyAPI_FUNC(int) _PyUnicode_IsCased(
882	Py_UCS4 ch / Unicode character /
883	);
884
885	PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
886	Py_UCS4 ch / Unicode character /
887	);
888
889	PyAPI_FUNC(int) _PyUnicode_ToDigit(
890	Py_UCS4 ch / Unicode character /
891	);
892
893	PyAPI_FUNC(double) _PyUnicode_ToNumeric(
894	Py_UCS4 ch / Unicode character /
895	);
896
897	PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
898	Py_UCS4 ch / Unicode character /
899	);
900
901	PyAPI_FUNC(int) _PyUnicode_IsDigit(
902	Py_UCS4 ch / Unicode character /
903	);
904
905	PyAPI_FUNC(int) _PyUnicode_IsNumeric(
906	Py_UCS4 ch / Unicode character /
907	);
908
909	PyAPI_FUNC(int) _PyUnicode_IsPrintable(
910	Py_UCS4 ch / Unicode character /
911	);
912
913	PyAPI_FUNC(int) _PyUnicode_IsAlpha(
914	Py_UCS4 ch / Unicode character /
915	);
916
917	// Helper array used by Py_UNICODE_ISSPACE().
918	PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
919
920	// Since splitting on whitespace is an important use case, and
921	// whitespace in most situations is solely ASCII whitespace, we
922	// optimize for the common case by using a quick look-up table
923	// _Py_ascii_whitespace (see below) with an inlined check.
924	static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
925	if (ch < `128`) {
926	return _Py_ascii_whitespace[ch];
927	}
928	return _PyUnicode_IsWhitespace(ch);
929	}
930
/* Character classification and conversion macros.  Each one expands to a
   direct call of the same-purpose _PyUnicode_* function with 'ch'. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
950
951	static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
952	return (Py_UNICODE_ISALPHA(ch)
953	\|\| Py_UNICODE_ISDECIMAL(ch)
954	\|\| Py_UNICODE_ISDIGIT(ch)
955	\|\| Py_UNICODE_ISNUMERIC(ch));
956	}
957
958
959	/ === Misc functions ===================================================== /
960
961	PyAPI_FUNC(PyObject) _PyUnicode_FormatLong(PyObject , int, int, int);
962
963	/ Return an interned Unicode object for an Identifier; may fail if there is no memory./
964	PyAPI_FUNC(PyObject) _PyUnicode_FromId(_Py_Identifier);
965
966	/ Fast equality check when the inputs are known to be exact unicode types*
967	and where the hash values are equal (i.e. a very probable match) /*
968	PyAPI_FUNC(int) _PyUnicode_EQ(PyObject , PyObject );
969
970	/ Equality check. /
971	PyAPI_FUNC(int) _PyUnicode_Equal(PyObject , PyObject );
972
973	PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject , void* *);
974	PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject , void* *);
975
976	PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
977

source code of include/python3.12/cpython/unicodeobject.h