unicodeobject.h source code [include/python3.12/unicodeobject.h]

1	#ifndef Py_UNICODEOBJECT_H
2	#define Py_UNICODEOBJECT_H
3
4	#include <stdarg.h> // va_list
5
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Unicode Integration Proposal. (See
http://www.egenix.com/files/python/unicode-proposal.txt).

Copyright (c) Corporation for National Research Initiatives.


 Original header:
 --------------------------------------------------------------------

 * Yet another Unicode string type for Python.  This type supports the
 * 16-bit Basic Multilingual Plane (BMP) only.
 *
 * Written by Fredrik Lundh, January 1999.
 *
 * Copyright (c) 1999 by Secret Labs AB.
 * Copyright (c) 1999 by Fredrik Lundh.
 *
 * fredrik@pythonware.com
 * http://www.pythonware.com
 *
 * --------------------------------------------------------------------
 * This Unicode String Type is
 *
 * Copyright (c) 1999 by Secret Labs AB
 * Copyright (c) 1999 by Fredrik Lundh
 *
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
 *
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
 *
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * -------------------------------------------------------------------- */

#include <ctype.h>

/* === Internal API ======================================================= */

/* --- Internal Unicode Format -------------------------------------------- */

/* Python 3.x requires unicode */
#define Py_USING_UNICODE

67	#ifndef SIZEOF_WCHAR_T
68	#error Must define SIZEOF_WCHAR_T
69	#endif
70
71	#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
   for UTF-16) */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif

/* Set these flags if the platform has "wchar.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

/* Pull in wchar_t support only when the platform advertises <wchar.h>. */
#ifdef HAVE_WCHAR_H
#  include <wchar.h>
#endif

/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations: fixed-width unsigned integers of 32, 16
   and 8 bits. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;

/* Give the declarations below C linkage when compiled as C++; the
   matching closing brace sits at the end of this header. */
#ifdef __cplusplus
extern "C" {
#endif


111	PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112	PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113
/* PyUnicode_Check(op): true if the type of op has the
   Py_TPFLAGS_UNICODE_SUBCLASS flag set, i.e. op is a str or an instance
   of a str subtype.
   PyUnicode_CheckExact(op): true only if the type of op is exactly
   PyUnicode_Type. */
#define PyUnicode_Check(op) \
    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type)

/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)

127	/ === Public API ========================================================= /
128
129	/ Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes /
130	PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131	const char u, /* UTF-8 encoded string /
132	Py_ssize_t size / size of buffer /
133	);
134
135	/ Similar to PyUnicode_FromUnicode(), but u points to null-terminated*
136	UTF-8 encoded bytes. The size is determined with strlen(). /*
137	PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138	const char u /* UTF-8 encoded string /
139	);
140
141	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
142	PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143	PyObject *str,
144	Py_ssize_t start,
145	Py_ssize_t end);
146	#endif
147
148	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
149	/ Copy the string into a UCS4 buffer including the null character if copy_null*
150	is set. Return NULL and raise an exception on error. Raise a SystemError if
151	the buffer is smaller than the string. Return buffer on success.
152
153	buflen is the length of the buffer in (Py_UCS4) characters. /*
154	PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155	PyObject *unicode,
156	Py_UCS4* buffer,
157	Py_ssize_t buflen,
158	int copy_null);
159
160	/ Copy the string into a UCS4 buffer. A new buffer is allocated using*
161	* PyMem_Malloc; if this fails, NULL is returned with a memory error
162	exception set. /*
163	PyAPI_FUNC(Py_UCS4) PyUnicode_AsUCS4Copy(PyObject unicode);
164	#endif
165
166	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
167	/ Get the length of the Unicode object. /
168
169	PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170	PyObject *unicode
171	);
172	#endif
173
174	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
175	/ Read a character from the string. /
176
177	PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
178	PyObject *unicode,
179	Py_ssize_t index
180	);
181
182	/ Write a character to the string. The string must have been created through*
183	PyUnicode_New, must not be shared, and must not have been hashed yet.
184
185	Return 0 on success, -1 on error. /*
186
187	PyAPI_FUNC(int) PyUnicode_WriteChar(
188	PyObject *unicode,
189	Py_ssize_t index,
190	Py_UCS4 character
191	);
192	#endif
193
194	/ Resize a Unicode object. The length is the number of codepoints.*
195
196	*unicode is modified to point to the new (resized) object and 0
197	returned on success.
198
199	Try to resize the string in place (which is usually faster than allocating
200	a new string and copy characters), or create a new string.
201
202	Error handling is implemented as follows: an exception is set, -1
203	is returned and unicode left untouched.*
204
205	WARNING: The function doesn't check string content, the result may not be a
206	string in canonical representation. /*
207
208	PyAPI_FUNC(int) PyUnicode_Resize(
209	PyObject *unicode, /* Pointer to the Unicode object /
210	Py_ssize_t length / New length /
211	);
212
213	/ Decode obj to a Unicode object.*
214
215	bytes, bytearray and other bytes-like objects are decoded according to the
216	given encoding and error handler. The encoding and error handler can be
217	NULL to have the interface use UTF-8 and "strict".
218
219	All other objects (including Unicode objects) raise an exception.
220
221	The API returns NULL in case of an error. The caller is responsible
222	for decref'ing the returned objects.
223
224	*/
225
226	PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
227	PyObject obj, /* Object /
228	const char encoding, /* encoding /
229	const char errors /* error handling /
230	);
231
232	/ Copy an instance of a Unicode subtype to a new true Unicode object if*
233	necessary. If obj is already a true Unicode object (not a subtype), return
234	the reference with incremented* refcount.*
235
236	The API returns NULL in case of an error. The caller is responsible
237	for decref'ing the returned objects.
238
239	*/
240
241	PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
242	PyObject obj /* Object /
243	);
244
245	PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
246	const char format, /* ASCII-encoded string /
247	va_list vargs
248	);
249	PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
250	const char format, /* ASCII-encoded string /
251	...
252	);
253
254	PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
255	PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
256	const char u /* UTF-8 encoded string /
257	);
258
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Convert the Unicode object to a wide character string. The output string
   always ends with a nul character. If size is not NULL, write the number of
   wide characters (excluding the null character) into *size.

   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
   on success. On error, returns NULL, *size is undefined and raises a
   MemoryError. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* number of characters of the result */
    );

#endif

306	/ --- Unicode ordinals --------------------------------------------------- /
307
308	/ Create a Unicode Object from the given Unicode code point ordinal.*
309
310	The ordinal must be in range(0x110000). A ValueError is
311	raised in case it is not.
312
313	*/
314
315	PyAPI_FUNC(PyObject) PyUnicode_FromOrdinal(int* ordinal);
316
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments encoding and errors. These
   parameters encoding and errors have the same semantics as the ones
   of the builtin str() API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/

335	/ --- Manage the default encoding ---------------------------------------- /
336
337	/ Returns "utf-8". /
338	PyAPI_FUNC(const char) PyUnicode_GetDefaultEncoding(void*);
339
340	/ --- Generic Codecs ----------------------------------------------------- /
341
342	/ Create a Unicode object by decoding the encoded string s of the*
343	given size. /*
344
345	PyAPI_FUNC(PyObject*) PyUnicode_Decode(
346	const char s, /* encoded string /
347	Py_ssize_t size, / size of buffer /
348	const char encoding, /* encoding /
349	const char errors /* error handling /
350	);
351
352	/ Decode a Unicode object unicode and return the result as Python*
353	object.
354
355	This API is DEPRECATED. The only supported standard encoding is rot13.
356	Use PyCodec_Decode() to decode with rot13 and non-standard codecs
357	that decode from str. /*
358
359	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
360	PyObject unicode, /* Unicode object /
361	const char encoding, /* encoding /
362	const char errors /* error handling /
363	);
364
365	/ Decode a Unicode object unicode and return the result as Unicode*
366	object.
367
368	This API is DEPRECATED. The only supported standard encoding is rot13.
369	Use PyCodec_Decode() to decode with rot13 and non-standard codecs
370	that decode from str to str. /*
371
372	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
373	PyObject unicode, /* Unicode object /
374	const char encoding, /* encoding /
375	const char errors /* error handling /
376	);
377
378	/ Encodes a Unicode object and returns the result as Python*
379	object.
380
381	This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
382	since all standard encodings (except rot13) encode str to bytes.
383	Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
384	that encode form str to non-bytes. /*
385
386	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
387	PyObject unicode, /* Unicode object /
388	const char encoding, /* encoding /
389	const char errors /* error handling /
390	);
391
392	/ Encodes a Unicode object and returns the result as Python string*
393	object. /*
394
395	PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
396	PyObject unicode, /* Unicode object /
397	const char encoding, /* encoding /
398	const char errors /* error handling /
399	);
400
401	/ Encodes a Unicode object and returns the result as Unicode*
402	object.
403
404	This API is DEPRECATED. The only supported standard encodings is rot13.
405	Use PyCodec_Encode() to encode with rot13 and non-standard codecs
406	that encode from str to str. /*
407
408	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
409	PyObject unicode, /* Unicode object /
410	const char encoding, /* encoding /
411	const char errors /* error handling /
412	);
413
414	/ Build an encoding map. /
415
416	PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
417	PyObject* string / 256 character map /
418	);
419
420	/ --- UTF-7 Codecs ------------------------------------------------------- /
421
422	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
423	const char string, /* UTF-7 encoded string /
424	Py_ssize_t length, / size of string /
425	const char errors /* error handling /
426	);
427
428	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
429	const char string, /* UTF-7 encoded string /
430	Py_ssize_t length, / size of string /
431	const char errors, /* error handling /
432	Py_ssize_t consumed /* bytes consumed /
433	);
434
435	/ --- UTF-8 Codecs ------------------------------------------------------- /
436
437	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
438	const char string, /* UTF-8 encoded string /
439	Py_ssize_t length, / size of string /
440	const char errors /* error handling /
441	);
442
443	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
444	const char string, /* UTF-8 encoded string /
445	Py_ssize_t length, / size of string /
446	const char errors, /* error handling /
447	Py_ssize_t consumed /* bytes consumed /
448	);
449
450	PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
451	PyObject unicode /* Unicode object /
452	);
453
454	/ Returns a pointer to the default encoding (UTF-8) of the*
455	Unicode object unicode and the size of the encoded representation
456	in bytes stored in size.*
457
458	In case of an error, no size is set.*
459
460	This function caches the UTF-8 encoded string in the unicodeobject
461	and subsequent calls will return the same string. The memory is released
462	when the unicodeobject is deallocated.
463	*/
464
465	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x030A0000
466	PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
467	PyObject *unicode,
468	Py_ssize_t *size);
469	#endif
470
471	/ --- UTF-32 Codecs ------------------------------------------------------ /
472
473	/ Decodes length bytes from a UTF-32 encoded buffer string and returns*
474	the corresponding Unicode object.
475
476	errors (if non-NULL) defines the error handling. It defaults
477	to "strict".
478
479	If byteorder is non-NULL, the decoder starts decoding using the
480	given byte order:
481
482	*byteorder == -1: little endian
483	*byteorder == 0: native order
484	*byteorder == 1: big endian
485
486	In native mode, the first four bytes of the stream are checked for a
487	BOM mark. If found, the BOM mark is analysed, the byte order
488	adjusted and the BOM skipped. In the other modes, no BOM mark
489	interpretation is done. After completion, byteorder is set to the*
490	current byte order at the end of input data.
491
492	If byteorder is NULL, the codec starts in native order mode.
493
494	*/
495
496	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
497	const char string, /* UTF-32 encoded string /
498	Py_ssize_t length, / size of string /
499	const char errors, /* error handling /
500	int byteorder /* pointer to byteorder to use*
501	0=native;-1=LE,1=BE; updated on
502	exit /*
503	);
504
505	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
506	const char string, /* UTF-32 encoded string /
507	Py_ssize_t length, / size of string /
508	const char errors, /* error handling /
509	int byteorder, /* pointer to byteorder to use*
510	0=native;-1=LE,1=BE; updated on
511	exit /*
512	Py_ssize_t consumed /* bytes consumed /
513	);
514
515	/ Returns a Python string using the UTF-32 encoding in native byte*
516	order. The string always starts with a BOM mark. /*
517
518	PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
519	PyObject unicode /* Unicode object /
520	);
521
/* Returns a Python string object holding the UTF-32 encoded value of
   the Unicode data.

   If byteorder is not 0, output is written according to the following
   byte order:

   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM mark)
   byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.

   NOTE(review): no declaration follows this comment in this header; it
   appears to describe an encoder declared elsewhere.

*/

538	/ --- UTF-16 Codecs ------------------------------------------------------ /
539
540	/ Decodes length bytes from a UTF-16 encoded buffer string and returns*
541	the corresponding Unicode object.
542
543	errors (if non-NULL) defines the error handling. It defaults
544	to "strict".
545
546	If byteorder is non-NULL, the decoder starts decoding using the
547	given byte order:
548
549	*byteorder == -1: little endian
550	*byteorder == 0: native order
551	*byteorder == 1: big endian
552
553	In native mode, the first two bytes of the stream are checked for a
554	BOM mark. If found, the BOM mark is analysed, the byte order
555	adjusted and the BOM skipped. In the other modes, no BOM mark
556	interpretation is done. After completion, byteorder is set to the*
557	current byte order at the end of input data.
558
559	If byteorder is NULL, the codec starts in native order mode.
560
561	*/
562
563	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
564	const char string, /* UTF-16 encoded string /
565	Py_ssize_t length, / size of string /
566	const char errors, /* error handling /
567	int byteorder /* pointer to byteorder to use*
568	0=native;-1=LE,1=BE; updated on
569	exit /*
570	);
571
572	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
573	const char string, /* UTF-16 encoded string /
574	Py_ssize_t length, / size of string /
575	const char errors, /* error handling /
576	int byteorder, /* pointer to byteorder to use*
577	0=native;-1=LE,1=BE; updated on
578	exit /*
579	Py_ssize_t consumed /* bytes consumed /
580	);
581
582	/ Returns a Python string using the UTF-16 encoding in native byte*
583	order. The string always starts with a BOM mark. /*
584
585	PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
586	PyObject unicode /* Unicode object /
587	);
588
589	/ --- Unicode-Escape Codecs ---------------------------------------------- /
590
591	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
592	const char string, /* Unicode-Escape encoded string /
593	Py_ssize_t length, / size of string /
594	const char errors /* error handling /
595	);
596
597	PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
598	PyObject unicode /* Unicode object /
599	);
600
601	/ --- Raw-Unicode-Escape Codecs ------------------------------------------ /
602
603	PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
604	const char string, /* Raw-Unicode-Escape encoded string /
605	Py_ssize_t length, / size of string /
606	const char errors /* error handling /
607	);
608
609	PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
610	PyObject unicode /* Unicode object /
611	);
612
613	/ --- Latin-1 Codecs -----------------------------------------------------*
614
615	Note: Latin-1 corresponds to the first 256 Unicode ordinals. /*
616
617	PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
618	const char string, /* Latin-1 encoded string /
619	Py_ssize_t length, / size of string /
620	const char errors /* error handling /
621	);
622
623	PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
624	PyObject unicode /* Unicode object /
625	);
626
627	/ --- ASCII Codecs -------------------------------------------------------*
628
629	Only 7-bit ASCII data is expected. All other codes generate errors.
630
631	*/
632
633	PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
634	const char string, /* ASCII encoded string /
635	Py_ssize_t length, / size of string /
636	const char errors /* error handling /
637	);
638
639	PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
640	PyObject unicode /* Unicode object /
641	);
642
643	/ --- Character Map Codecs -----------------------------------------------*
644
645	This codec uses mappings to encode and decode characters.
646
647	Decoding mappings must map byte ordinals (integers in the range from 0 to
648	255) to Unicode strings, integers (which are then interpreted as Unicode
649	ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
650	as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
651	mapping" and cause an error.
652
653	Encoding mappings must map Unicode ordinal integers to bytes objects,
654	integers in the range from 0 to 255 or None. Unmapped character
655	ordinals (ones which cause a LookupError) as well as mapped to
656	None are treated as "undefined mapping" and cause an error.
657
658	*/
659
660	PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
661	const char string, /* Encoded string /
662	Py_ssize_t length, / size of string /
663	PyObject mapping, /* decoding mapping /
664	const char errors /* error handling /
665	);
666
667	PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
668	PyObject unicode, /* Unicode object /
669	PyObject mapping /* encoding mapping /
670	);
671
/* --- MBCS codecs for Windows -------------------------------------------- */

#ifdef MS_WINDOWS
/* Create a Unicode object by decoding length bytes of the MBCS encoded
   buffer string. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
    );

/* Like PyUnicode_DecodeMBCS(), with an additional out-parameter that
   receives the number of bytes consumed. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Like PyUnicode_DecodeMBCSStateful(), but decodes with the code page
   given by code_page. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
    int code_page,              /* code page number */
    const char *string,         /* encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );
#endif

/* Encode a Unicode object using MBCS. */
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Encode a Unicode object using the code page given by code_page. */
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
    int code_page,              /* code page number */
    PyObject *unicode,          /* Unicode object */
    const char *errors          /* error handling */
    );
#endif

#endif /* MS_WINDOWS */

712	/ --- Locale encoding --------------------------------------------------- /
713
714	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
715	/ Decode a string from the current locale encoding. The decoder is strict if*
716	surrogateescape is equal to zero, otherwise it uses the 'surrogateescape'
717	error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
718	be decoded as a surrogate character and surrogateescape* is not equal to*
719	zero, the byte sequence is escaped using the 'surrogateescape' error handler
720	instead of being decoded. str* must end with a null character but cannot*
721	contain embedded null characters. /*
722
723	PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
724	const char *str,
725	Py_ssize_t len,
726	const char *errors);
727
728	/ Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string*
729	length using strlen(). /*
730
731	PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
732	const char *str,
733	const char *errors);
734
735	/ Encode a Unicode object to the current locale encoding. The encoder is*
736	strict is surrogateescape* is equal to zero, otherwise the*
737	"surrogateescape" error handler is used. Return a bytes object. The string
738	cannot contain embedded null characters. /*
739
740	PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
741	PyObject *unicode,
742	const char *errors
743	);
744	#endif
745
746	/ --- File system encoding ---------------------------------------------- /
747
748	/ ParseTuple converter: encode str objects to bytes using*
749	PyUnicode_EncodeFSDefault(); bytes objects are output as-is. /*
750
751	PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject, void**);
752
753	/ ParseTuple converter: decode bytes objects to unicode using*
754	PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. /*
755
756	PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject, void**);
757
758	/ Decode a null-terminated string from the Python filesystem encoding*
759	and error handler.
760
761	If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). /*
762	PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
763	const char s /* encoded string /
764	);
765
766	/ Decode a string from the Python filesystem encoding and error handler. /
767	PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
768	const char s, /* encoded string /
769	Py_ssize_t size / size /
770	);
771
772	/ Encode a Unicode object to the Python filesystem encoding and error handler.*
773	Return bytes. /*
774	PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
775	PyObject *unicode
776	);
777
/* --- Methods & Slots ----------------------------------------------------

   These are capable of handling Unicode objects and strings on input
   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */

784	/ Concat two strings giving a new Unicode string. /
785
786	PyAPI_FUNC(PyObject*) PyUnicode_Concat(
787	PyObject left, /* Left string /
788	PyObject right /* Right string /
789	);
790
791	/ Concat two strings and put the result in pleft
792	(sets pleft to NULL on error) /
793
794	PyAPI_FUNC(void) PyUnicode_Append(
795	PyObject *pleft, /* Pointer to left string /
796	PyObject right /* Right string /
797	);
798
799	/ Concat two strings, put the result in pleft and drop the right object
800	(sets pleft to NULL on error) /
801
802	PyAPI_FUNC(void) PyUnicode_AppendAndDel(
803	PyObject *pleft, /* Pointer to left string /
804	PyObject right /* Right string /
805	);
806
807	/ Split a string giving a list of Unicode strings.*
808
809	If sep is NULL, splitting will be done at all whitespace
810	substrings. Otherwise, splits occur at the given separator.
811
812	At most maxsplit splits will be done. If negative, no limit is set.
813
814	Separators are not included in the resulting list.
815
816	*/
817
818	PyAPI_FUNC(PyObject*) PyUnicode_Split(
819	PyObject s, /* String to split /
820	PyObject sep, /* String separator /
821	Py_ssize_t maxsplit / Maxsplit count /
822	);
823
824	/ Dito, but split at line breaks.*
825
826	CRLF is considered to be one line break. Line breaks are not
827	included in the resulting list. /*
828
829	PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
830	PyObject s, /* String to split /
831	int keepends / If true, line end markers are included /
832	);
833
834	/ Partition a string using a given separator. /
835
836	PyAPI_FUNC(PyObject*) PyUnicode_Partition(
837	PyObject s, /* String to partition /
838	PyObject sep /* String separator /
839	);
840
841	/ Partition a string using a given separator, searching from the end of the*
842	string. /*
843
844	PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
845	PyObject s, /* String to partition /
846	PyObject sep /* String separator /
847	);
848
849	/ Split a string giving a list of Unicode strings.*
850
851	If sep is NULL, splitting will be done at all whitespace
852	substrings. Otherwise, splits occur at the given separator.
853
854	At most maxsplit splits will be done. But unlike PyUnicode_Split
855	PyUnicode_RSplit splits from the end of the string. If negative,
856	no limit is set.
857
858	Separators are not included in the resulting list.
859
860	*/
861
862	PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
863	PyObject s, /* String to split /
864	PyObject sep, /* String separator /
865	Py_ssize_t maxsplit / Maxsplit count /
866	);
867
868	/ Translate a string by applying a character mapping table to it and*
869	return the resulting Unicode object.
870
871	The mapping table must map Unicode ordinal integers to Unicode strings,
872	Unicode ordinal integers or None (causing deletion of the character).
873
874	Mapping tables may be dictionaries or sequences. Unmapped character
875	ordinals (ones which cause a LookupError) are left untouched and
876	are copied as-is.
877
878	*/
879
880	PyAPI_FUNC(PyObject *) PyUnicode_Translate(
881	PyObject str, /* String /
882	PyObject table, /* Translate table /
883	const char errors /* error handling /
884	);
885
886	/ Join a sequence of strings using the given separator and return*
887	the resulting Unicode string. /*
888
889	PyAPI_FUNC(PyObject*) PyUnicode_Join(
890	PyObject separator, /* Separator string /
891	PyObject seq /* Sequence object /
892	);
893
894	/ Return 1 if substr matches str[start:end] at the given tail end, 0*
895	otherwise. /*
896
897	PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
898	PyObject str, /* String /
899	PyObject substr, /* Prefix or Suffix string /
900	Py_ssize_t start, / Start index /
901	Py_ssize_t end, / Stop index /
902	int direction / Tail end: -1 prefix, +1 suffix /
903	);
904
905	/ Return the first position of substr in str[start:end] using the*
906	given search direction or -1 if not found. -2 is returned in case
907	an error occurred and an exception is set. /*
908
909	PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
910	PyObject str, /* String /
911	PyObject substr, /* Substring to find /
912	Py_ssize_t start, / Start index /
913	Py_ssize_t end, / Stop index /
914	int direction / Find direction: +1 forward, -1 backward /
915	);
916
917	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
918	/ Like PyUnicode_Find, but search for single character only. /
919	PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
920	PyObject *str,
921	Py_UCS4 ch,
922	Py_ssize_t start,
923	Py_ssize_t end,
924	int direction
925	);
926	#endif
927
928	/ Count the number of occurrences of substr in str[start:end]. /
929
930	PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
931	PyObject str, /* String /
932	PyObject substr, /* Substring to count /
933	Py_ssize_t start, / Start index /
934	Py_ssize_t end / Stop index /
935	);
936
937	/ Replace at most maxcount occurrences of substr in str with replstr*
938	and return the resulting Unicode object. /*
939
940	PyAPI_FUNC(PyObject *) PyUnicode_Replace(
941	PyObject str, /* String /
942	PyObject substr, /* Substring to find /
943	PyObject replstr, /* Substring to replace /
944	Py_ssize_t maxcount / Max. number of replacements to apply;*
945	-1 = all /*
946	);
947
948	/ Compare two strings and return -1, 0, 1 for less than, equal,*
949	greater than resp.
950	Raise an exception and return -1 on error. /*
951
952	PyAPI_FUNC(int) PyUnicode_Compare(
953	PyObject left, /* Left string /
954	PyObject right /* Right string /
955	);
956
957	/ Compare a Unicode object with C string and return -1, 0, 1 for less than,*
958	equal, and greater than, respectively. It is best to pass only
959	ASCII-encoded strings, but the function interprets the input string as
960	ISO-8859-1 if it contains non-ASCII characters.
961	This function does not raise exceptions. /*
962
963	PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
964	PyObject *left,
965	const char right /* ASCII-encoded string /
966	);
967
968	/ Rich compare two strings and return one of the following:*
969
970	- NULL in case an exception was raised
971	- Py_True or Py_False for successful comparisons
972	- Py_NotImplemented in case the type combination is unknown
973
974	Possible values for op:
975
976	Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
977
978	*/
979
980	PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
981	PyObject left, /* Left string /
982	PyObject right, /* Right string /
983	int op / Operation: Py_EQ, Py_NE, Py_GT, etc. /
984	);
985
986	/ Apply an argument tuple or dictionary to a format string and return*
987	the resulting Unicode string. /*
988
989	PyAPI_FUNC(PyObject *) PyUnicode_Format(
990	PyObject format, /* Format string /
991	PyObject args /* Argument tuple or dictionary /
992	);
993
994	/ Checks whether element is contained in container and return 1/0*
995	accordingly.
996
997	element has to coerce to a one element Unicode string. -1 is
998	returned in case of an error. /*
999
1000	PyAPI_FUNC(int) PyUnicode_Contains(
1001	PyObject container, /* Container string /
1002	PyObject element /* Element string /
1003	);
1004
1005	/ Checks whether argument is a valid identifier. /
1006
1007	PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1008
1009	/ === Characters Type APIs =============================================== /
1010
1011	#ifndef Py_LIMITED_API
1012	# define Py_CPYTHON_UNICODEOBJECT_H
1013	# include "cpython/unicodeobject.h"
1014	# undef Py_CPYTHON_UNICODEOBJECT_H
1015	#endif
1016
1017	#ifdef __cplusplus
1018	}
1019	#endif
1020	#endif /* !Py_UNICODEOBJECT_H */
1021

source code of include/python3.12/unicodeobject.h