1#ifndef Py_CPYTHON_UNICODEOBJECT_H
2# error "this header file must not be included directly"
3#endif
4
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t.

   PY_UNICODE_TYPE spells the same underlying C type as a macro.  The
   Py_DEPRECATED(3.3) marker on the typedef is left commented out. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
11
12/* --- Internal Unicode Operations ---------------------------------------- */
13
14// Static inline functions to work with surrogates
15static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
16 return (0xD800 <= ch && ch <= 0xDFFF);
17}
18static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
19 return (0xD800 <= ch && ch <= 0xDBFF);
20}
21static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
22 return (0xDC00 <= ch && ch <= 0xDFFF);
23}
24
25// Join two surrogate characters and return a single Py_UCS4 value.
26static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
27 assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
28 assert(Py_UNICODE_IS_LOW_SURROGATE(low));
29 return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
30}
31
32// High surrogate = top 10 bits added to 0xD800.
33// The character must be in the range [U+10000; U+10ffff].
34static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
35 assert(0x10000 <= ch && ch <= 0x10ffff);
36 return (0xD800 - (0x10000 >> 10) + (ch >> 10));
37}
38
39// Low surrogate = bottom 10 bits added to 0xDC00.
40// The character must be in the range [U+10000; U+10ffff].
41static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
42 assert(0x10000 <= ch && ch <= 0x10ffff);
43 return (0xDC00 + (ch & 0x3FF));
44}
45
46/* --- Unicode Type ------------------------------------------------------- */
47
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follows the structure. utf8_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 3 forms of Unicode strings described below:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * (length is the length of the utf8)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * (data starts just after the structure)

       - legacy string:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * data.any is not NULL
         * utf8 is shared and utf8_length = length with data.any if ascii = 1
         * utf8_length = 0 if utf8 is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by subclasses of Unicode.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Packed flags; the bit-field widths below add up to 32 bits
       (2 + 3 + 1 + 1 + 1 + 24). */
    struct {
        /* If interned is non-zero, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
           The possible values here are:
               0: Not Interned
               1: Interned
               2: Interned and Immortal
               3: Interned, Immortal, and Static
           This categorization allows the runtime to determine the right
           cleanup mechanism at runtime shutdown. */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* The object is statically allocated. */
        unsigned int statically_allocated:1;
        /* Padding to ensure that PyUnicode_DATA() is always aligned to
           4 bytes (see issue #19537 on m68k). */
        unsigned int :24;
    } state;
} PyASCIIObject;
150
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follows the structure.

   The PyASCIIObject header is embedded as the first member, so a pointer
   to this structure also addresses a valid PyASCIIObject. */
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
} PyCompactUnicodeObject;
160
/* Object format for Unicode subclasses.

   Extends PyCompactUnicodeObject (embedded as the first member) with a
   pointer to a separately stored character buffer.  The union members are
   differently typed views of that one pointer. */
typedef struct {
    PyCompactUnicodeObject _base;
    union {
        void *any;
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
171
/* Consistency checker for Unicode objects; the implementation is not part
   of this header.
   NOTE(review): the meaning of check_content and of the return value is
   defined by the implementation -- confirm there before relying on it. */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
    PyObject *op,
    int check_content);
175
176
/* Cast helpers.  Each macro asserts PyUnicode_Check(op) and then converts
   op with _Py_CAST to a pointer to the named structure.  Only the type
   check is asserted; the macros do not verify that the object really uses
   the target layout (compact ASCII, compact or legacy).  op appears twice
   in each expansion. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))
186
187
188/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
189
/* Values for PyASCIIObject.state: */

/* Interning state.  These are the four values the 2-bit state.interned
   field can hold: not interned, interned, interned and immortal, and
   interned, immortal and static. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define SSTATE_INTERNED_IMMORTAL_STATIC 3
197
198/* Use only if you know it's a string */
199static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
200 return _PyASCIIObject_CAST(op)->state.interned;
201}
202#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
203
204/* For backward compatibility */
205static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
206 return 1;
207}
208#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
209
210/* Return true if the string contains only ASCII characters, or 0 if not. The
211 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
212 ready. */
213static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
214 return _PyASCIIObject_CAST(op)->state.ascii;
215}
216#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
217
218/* Return true if the string is compact or 0 if not.
219 No type checks or Ready calls are performed. */
220static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
221 return _PyASCIIObject_CAST(op)->state.compact;
222}
223#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
224
225/* Return true if the string is a compact ASCII string (use PyASCIIObject
226 structure), or 0 if not. No type checks or Ready calls are performed. */
227static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
228 return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
229}
230#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
231
/* Storage kinds of a string.  Each constant's numeric value equals the
   size in bytes of one character of that kind (Py_UCS1, Py_UCS2, Py_UCS4). */
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
238
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above,
// read from the state.kind bit-field of the string.
//
// gh-89653: Converting this macro to a static inline function would introduce
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
246
247/* Return a void pointer to the raw unicode buffer. */
248static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
249 if (PyUnicode_IS_ASCII(op)) {
250 return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
251 }
252 return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
253}
254
255static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
256 void *data;
257 assert(!PyUnicode_IS_COMPACT(op));
258 data = _PyUnicodeObject_CAST(op)->data.any;
259 assert(data != NULL);
260 return data;
261}
262
263static inline void* PyUnicode_DATA(PyObject *op) {
264 if (PyUnicode_IS_COMPACT(op)) {
265 return _PyUnicode_COMPACT_DATA(op);
266 }
267 return _PyUnicode_NONCOMPACT_DATA(op);
268}
269#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
270
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed; use PyUnicode_KIND() beforehand to pick the
   macro that matches the string's storage kind. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
279
280/* Returns the length of the unicode string. */
281static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
282 return _PyASCIIObject_CAST(op)->length;
283}
284#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
285
286/* Write into the canonical representation, this function does not do any sanity
287 checks and is intended for usage in loops. The caller should cache the
288 kind and data pointers obtained from other function calls.
289 index is the index in the string (starts at 0) and value is the new
290 code point value which should be written to that location. */
291static inline void PyUnicode_WRITE(int kind, void *data,
292 Py_ssize_t index, Py_UCS4 value)
293{
294 assert(index >= 0);
295 if (kind == PyUnicode_1BYTE_KIND) {
296 assert(value <= 0xffU);
297 _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
298 }
299 else if (kind == PyUnicode_2BYTE_KIND) {
300 assert(value <= 0xffffU);
301 _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
302 }
303 else {
304 assert(kind == PyUnicode_4BYTE_KIND);
305 assert(value <= 0x10ffffU);
306 _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
307 }
308}
309#define PyUnicode_WRITE(kind, data, index, value) \
310 PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
311 (index), _Py_STATIC_CAST(Py_UCS4, value))
312
313/* Read a code point from the string's canonical representation. No checks
314 or ready calls are performed. */
315static inline Py_UCS4 PyUnicode_READ(int kind,
316 const void *data, Py_ssize_t index)
317{
318 assert(index >= 0);
319 if (kind == PyUnicode_1BYTE_KIND) {
320 return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
321 }
322 if (kind == PyUnicode_2BYTE_KIND) {
323 return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
324 }
325 assert(kind == PyUnicode_4BYTE_KIND);
326 return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
327}
328#define PyUnicode_READ(kind, data, index) \
329 PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
330 _Py_STATIC_CAST(const void*, data), \
331 (index))
332
333/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
334 calls PyUnicode_KIND() and might call it twice. For single reads, use
335 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
336 cache kind and use PyUnicode_READ instead. */
337static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
338{
339 int kind;
340
341 assert(index >= 0);
342 // Tolerate reading the NUL character at str[len(str)]
343 assert(index <= PyUnicode_GET_LENGTH(unicode));
344
345 kind = PyUnicode_KIND(unicode);
346 if (kind == PyUnicode_1BYTE_KIND) {
347 return PyUnicode_1BYTE_DATA(unicode)[index];
348 }
349 if (kind == PyUnicode_2BYTE_KIND) {
350 return PyUnicode_2BYTE_DATA(unicode)[index];
351 }
352 assert(kind == PyUnicode_4BYTE_KIND);
353 return PyUnicode_4BYTE_DATA(unicode)[index];
354}
355#define PyUnicode_READ_CHAR(unicode, index) \
356 PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
357
358/* Return a maximum character value which is suitable for creating another
359 string based on op. This is always an approximation but more efficient
360 than iterating over the string. */
361static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
362{
363 int kind;
364
365 if (PyUnicode_IS_ASCII(op)) {
366 return 0x7fU;
367 }
368
369 kind = PyUnicode_KIND(op);
370 if (kind == PyUnicode_1BYTE_KIND) {
371 return 0xffU;
372 }
373 if (kind == PyUnicode_2BYTE_KIND) {
374 return 0xffffU;
375 }
376 assert(kind == PyUnicode_4BYTE_KIND);
377 return 0x10ffffU;
378}
379#define PyUnicode_MAX_CHAR_VALUE(op) \
380 PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
381
382/* === Public API ========================================================= */
383
384/* --- Plain Py_UNICODE --------------------------------------------------- */
385
/* With PEP 393, this is the recommended way to allocate a new unicode object.
   This function will allocate the object and its buffer in a single memory
   block.  Objects created using this function are not resizable.

   size is the number of code points of the new string and maxchar the
   maximum code point value it has to be able to store. */
PyAPI_FUNC(PyObject*) PyUnicode_New(
    Py_ssize_t size,            /* Number of code points in the new string */
    Py_UCS4 maxchar             /* maximum code point value in the string */
    );
393
394/* For backward compatibility */
395static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
396{
397 return 0;
398}
399#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
400
/* Get a copy of a Unicode string.
   NOTE(review): presumably returns a new reference, or NULL with an
   exception set on failure -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
    PyObject *unicode
    );
405
/* Copy characters from one unicode object into another.  This function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );

/* Unsafe version of PyUnicode_CopyCharacters(): it doesn't check its
   arguments and so may crash if parameters are invalid (e.g. if the output
   string is too short).  It returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
442
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );

/* Unsafe version of PyUnicode_Fill(): it doesn't check its arguments and so
   may crash if parameters are invalid (e.g. if length is longer than the
   string).  It returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastFill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
466
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
   kind selects which of the three element types the buffer holds and size
   is its length.  The buffer is scanned to find the maximum character. */
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
    int kind,
    const void *buffer,
    Py_ssize_t size);

/* Create a new string from a buffer of ASCII characters.
   WARNING: This does not check whether the buffer contains any non-ASCII
   character. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
    const char *buffer,
    Py_ssize_t size);

/* Compute the maximum character of the substring unicode[start:end].
   Return 127 for an empty string. */
PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t end);
486
487/* --- _PyUnicodeWriter API ----------------------------------------------- */
488
/* State of a Unicode writer.  The _PyUnicodeWriter_Prepare() and
   _PyUnicodeWriter_PrepareKind() macros read maxchar, size, pos and kind
   directly from this structure. */
typedef struct {
    PyObject *buffer;
    void *data;
    int kind;
    Py_UCS4 maxchar;
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter ;
510
/* Initialize a Unicode writer.
 *
 * By default, the minimum buffer size is 0 characters and overallocation is
 * disabled. Set the min_length, min_char and overallocate attributes to
 * control the allocation of the buffer. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
518
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   The macro evaluates to 0 without any call when MAXCHAR does not exceed
   the writer's maxchar and LENGTH fits in the remaining space (size - pos),
   or when LENGTH is 0; otherwise it calls
   _PyUnicodeWriter_PrepareInternal().  Its arguments can be evaluated more
   than once, so they should be free of side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))

/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
   instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar);
536
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   The macro evaluates to 0 without any call when KIND does not exceed the
   writer's current kind; otherwise it calls
   _PyUnicodeWriter_PrepareKindInternal().  Its arguments can be evaluated
   more than once.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))

/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
   macro instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind);
552
/* Append a single Unicode character to the writer.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
    Py_UCS4 ch
    );

/* Append a whole Unicode string to the writer.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
    PyObject *str               /* Unicode string */
    );

/* Append a substring of a Unicode string, delimited by start and end.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
    PyObject *str,              /* Unicode string */
    Py_ssize_t start,
    Py_ssize_t end
    );

/* Append an ASCII-encoded byte string.  len may be passed as -1 when the
   number of bytes is not known.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
    const char *str,           /* ASCII-encoded byte string */
    Py_ssize_t len             /* number of bytes, or -1 if unknown */
    );

/* Append a latin1-encoded byte string of len bytes.
   Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
    const char *str,           /* latin1-encoded byte string */
    Py_ssize_t len             /* length in bytes */
    );
591
/* Get the value of the writer as a Unicode string and clear the
   buffer of the writer.  Raise an exception and return NULL
   on error. */
PyAPI_FUNC(PyObject *)
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);

/* Deallocate the memory of a writer (clear its internal buffer). */
PyAPI_FUNC(void)
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
601
602
/* Format the object based on the format_spec, as defined in PEP 3101
   (Advanced String Formatting).  The writer parameter is the
   _PyUnicodeWriter involved in the operation.
   NOTE(review): start and end presumably delimit the part of format_spec
   to use, and the int result presumably follows the 0 / -1 convention of
   the writer API -- confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
    _PyUnicodeWriter *writer,
    PyObject *obj,
    PyObject *format_spec,
    Py_ssize_t start,
    Py_ssize_t end);
611
612/* --- Manage the default encoding ---------------------------------------- */
613
/* Returns a pointer to the default encoding (UTF-8) of the
   Unicode object unicode.

   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
   in the unicodeobject.

   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
   support the previous internal function with the same behaviour.

   Use of this API is DEPRECATED since no size information can be
   extracted from the returned data.
*/

PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);

/* Legacy alias: expands to the name PyUnicode_AsUTF8. */
#define _PyUnicode_AsString PyUnicode_AsUTF8
630
631/* --- UTF-7 Codecs ------------------------------------------------------- */
632
/* UTF-7 encoder entry point.  The two int flags select whether RFC 2152
   "Set O" characters and whitespace are encoded in base64. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
    PyObject *unicode,          /* Unicode object */
    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
    const char *errors          /* error handling */
    );

/* --- UTF-8 Codecs ------------------------------------------------------- */

/* UTF-8 encoder entry point taking a Unicode object and an error-handling
   name. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
    PyObject *unicode,
    const char *errors);

/* --- UTF-32 Codecs ------------------------------------------------------ */

/* UTF-32 encoder entry point.  byteorder selects the output byte order:
   0 = BOM + native order, -1 = little endian, 1 = big endian. */
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
    PyObject *object,           /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
653
654/* --- UTF-16 Codecs ------------------------------------------------------ */
655
/* Returns a Python string object holding the UTF-16 encoded value of
   the Unicode data.

   The output is written according to the following byte order:

   byteorder == -1: little endian
   byteorder == 0:  native byte order (writes a BOM mark)
   byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
    PyObject* unicode,          /* Unicode object */
    const char *errors,         /* error handling */
    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
    );
675
676/* --- Unicode-Escape Codecs ---------------------------------------------- */
677
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding:
   the number of bytes consumed is reported through 'consumed'. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
   chars and reports the first one through two output parameters. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
    const char *string,     /* Unicode-Escape encoded string */
    Py_ssize_t length,      /* size of string */
    const char *errors,     /* error handling */
    Py_ssize_t *consumed,   /* bytes consumed */
    int *first_invalid_escape_char, /* on return, if not -1, contain the first
                                       invalid escaped char (<= 0xff) or invalid
                                       octal escape (> 0xff) in string. */
    const char **first_invalid_escape_ptr); /* on return, if not NULL, may
                                        point to the first invalid escaped
                                        char in string.
                                        May be NULL if errors is not NULL. */
// Export for binary compatibility.
// Older helper that reports the first invalid escape through a single
// pointer output parameter.
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed,   /* bytes consumed */
        const char **first_invalid_escape  /* on return, points to first
                                              invalid escaped char in
                                              string. */
);
709
710/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
711
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding:
   the number of bytes consumed is reported through 'consumed'. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
        const char *string,     /* Raw-Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed */
);
719
720/* --- Latin-1 Codecs ----------------------------------------------------- */
721
/* Latin-1 encoder entry point taking a Unicode object and an
   error-handling name. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
    PyObject* unicode,
    const char* errors);

/* --- ASCII Codecs ------------------------------------------------------- */

/* ASCII encoder entry point taking a Unicode object and an error-handling
   name. */
PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
    PyObject* unicode,
    const char* errors);
731
732/* --- Character Map Codecs ----------------------------------------------- */
733
/* Translate a Unicode object by applying a character mapping table to
   it and return the resulting Unicode object.

   The mapping table must map Unicode ordinal integers to Unicode strings,
   Unicode ordinal integers or None (causing deletion of the character).

   Mapping tables may be dictionaries or sequences. Unmapped character
   ordinals (ones which cause a LookupError) are left untouched and
   are copied as-is.

   NOTE(review): the text above reads like the description of a *translate*
   operation, whereas the function declared below is a charmap *encoder*
   taking an encoding mapping -- confirm which behaviour applies.
*/
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
    PyObject *unicode,          /* Unicode object */
    PyObject *mapping,          /* encoding mapping */
    const char *errors          /* error handling */
    );
749
750/* --- Decimal Encoder ---------------------------------------------------- */
751
/* Converts a Unicode object holding a decimal value to an ASCII string
   for use in the int, float and complex parsers.
   Transforms code points that have the decimal digit property to the
   corresponding ASCII digit code points.  Transforms spaces to ASCII.
   Transforms code points starting from the first non-ASCII code point that
   is neither a decimal digit nor a space to the end into '?'. */

PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
    PyObject *unicode           /* Unicode object */
    );
762
763/* --- Methods & Slots ---------------------------------------------------- */
764
/* Join helper operating on a C array of objects: 'items' points to
   'seqlen' object pointers and 'separator' is the separator object.
   NOTE(review): presumably the array-based counterpart of str.join() --
   confirm against the implementation. */
PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
    PyObject *separator,
    PyObject *const *items,
    Py_ssize_t seqlen
    );

/* Test whether a unicode object is equal to an ASCII identifier. Return 1
   if true, 0 otherwise. The right argument must be an ASCII identifier.
   Any error that occurs inside is cleared before returning. */
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
    PyObject *left,             /* Left string */
    _Py_Identifier *right       /* Right identifier */
    );

/* Test whether a unicode object is equal to an ASCII string. Return 1 if
   true, 0 otherwise. The right argument must be an ASCII-encoded string.
   Any error that occurs inside is cleared before returning. */
PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
    PyObject *left,
    const char *right           /* ASCII-encoded string */
    );

/* Externally visible for str.strip(unicode) */
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
    PyObject *self,
    int striptype,
    PyObject *sepobj
    );
793
/* Using explicit passed-in values, insert the thousands grouping
   into the string pointed to by buffer.  For the argument descriptions,
   see Objects/stringlib/localeutil.h */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar);
807
808/* === Characters Type APIs =============================================== */
809
810/* These should not be used directly. Use the Py_UNICODE_IS* and
811 Py_UNICODE_TO* macros instead.
812
813 These APIs are implemented in Objects/unicodectype.c.
814
815*/
816
/* Character classification predicates.  Each takes one code point and
   returns an int. */

PyAPI_FUNC(int) _PyUnicode_IsLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsXidStart(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
    const Py_UCS4 ch         /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
    const Py_UCS4 ch         /* Unicode character */
    );
844
/* Simple case conversions: one code point in, one code point out.
   NOTE(review): the Py_DEPRECATED(3.3) marker is commented out on the
   lowercase and uppercase variants but active on the titlecase variant. */

/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );

/* "Full" conversions: the result is written through the res pointer.
   NOTE(review): the int return value is presumably the number of code
   points stored in res -- confirm against Objects/unicodectype.c. */

PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );

PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
    Py_UCS4 ch,       /* Unicode character */
    Py_UCS4 *res
    );
876
/* Further per-character queries.  Each takes one code point; all return an
   int except _PyUnicode_ToNumeric(), which returns a double. */

PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsCased(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_ToDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(double) _PyUnicode_ToNumeric(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsDigit(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsNumeric(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsPrintable(
    Py_UCS4 ch       /* Unicode character */
    );

PyAPI_FUNC(int) _PyUnicode_IsAlpha(
    Py_UCS4 ch       /* Unicode character */
    );
916
// Helper array used by Py_UNICODE_ISSPACE(): a lookup table indexed by code
// point, consulted only for code points below 128.
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
919
920// Since splitting on whitespace is an important use case, and
921// whitespace in most situations is solely ASCII whitespace, we
922// optimize for the common case by using a quick look-up table
923// _Py_ascii_whitespace (see below) with an inlined check.
924static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
925 if (ch < 128) {
926 return _Py_ascii_whitespace[ch];
927 }
928 return _PyUnicode_IsWhitespace(ch);
929}
930
/* Public spellings of the character-type functions.  Each macro forwards
   its argument unchanged to the _PyUnicode_* function named in its
   expansion. */

/* Case and line-break predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Simple case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
950
951static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
952 return (Py_UNICODE_ISALPHA(ch)
953 || Py_UNICODE_ISDECIMAL(ch)
954 || Py_UNICODE_ISDIGIT(ch)
955 || Py_UNICODE_ISNUMERIC(ch));
956}
957
958
959/* === Misc functions ===================================================== */
960
/* NOTE(review): the parameters are unnamed in this declaration; their
   meaning has to be taken from the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);

/* Return an interned Unicode object for an Identifier; may fail if there is
   no memory. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);

/* Fast equality check when the inputs are known to be exact unicode types
   and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);

/* Equality check. */
PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);

/* NOTE(review): the (PyObject *, void *) -> int signatures look like
   argument-parsing converter callbacks -- confirm against the
   implementation. */
PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);

/* NOTE(review): presumably scans the string as an identifier and returns a
   length or position -- confirm against the implementation. */
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
977

/* source code of include/python3.12/cpython/unicodeobject.h */