1 | #ifndef Py_UNICODEOBJECT_H |
2 | #define Py_UNICODEOBJECT_H |
3 | |
4 | #include <stdarg.h> |
5 | |
6 | /* |
7 | |
8 | Unicode implementation based on original code by Fredrik Lundh, |
9 | modified by Marc-Andre Lemburg (mal@lemburg.com) according to the |
10 | Unicode Integration Proposal. (See |
11 | http://www.egenix.com/files/python/unicode-proposal.txt). |
12 | |
13 | Copyright (c) Corporation for National Research Initiatives. |
14 | |
15 | |
16 | Original header: |
17 | -------------------------------------------------------------------- |
18 | |
19 | * Yet another Unicode string type for Python. This type supports the |
20 | * 16-bit Basic Multilingual Plane (BMP) only. |
21 | * |
22 | * Written by Fredrik Lundh, January 1999. |
23 | * |
24 | * Copyright (c) 1999 by Secret Labs AB. |
25 | * Copyright (c) 1999 by Fredrik Lundh. |
26 | * |
27 | * fredrik@pythonware.com |
28 | * http://www.pythonware.com |
29 | * |
30 | * -------------------------------------------------------------------- |
31 | * This Unicode String Type is |
32 | * |
33 | * Copyright (c) 1999 by Secret Labs AB |
34 | * Copyright (c) 1999 by Fredrik Lundh |
35 | * |
36 | * By obtaining, using, and/or copying this software and/or its |
37 | * associated documentation, you agree that you have read, understood, |
38 | * and will comply with the following terms and conditions: |
39 | * |
40 | * Permission to use, copy, modify, and distribute this software and its |
41 | * associated documentation for any purpose and without fee is hereby |
42 | * granted, provided that the above copyright notice appears in all |
43 | * copies, and that both that copyright notice and this permission notice |
44 | * appear in supporting documentation, and that the name of Secret Labs |
45 | * AB or the author not be used in advertising or publicity pertaining to |
46 | * distribution of the software without specific, written prior |
47 | * permission. |
48 | * |
49 | * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
50 | * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
51 | * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
52 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
53 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
54 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
55 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
56 | * -------------------------------------------------------------------- */ |
57 | |
58 | #include <ctype.h> |
59 | |
60 | /* === Internal API ======================================================= */ |
61 | |
62 | /* --- Internal Unicode Format -------------------------------------------- */ |
63 | |
64 | /* Python 3.x requires unicode */ |
65 | #define Py_USING_UNICODE |
66 | |
67 | #ifndef SIZEOF_WCHAR_T |
68 | #error Must define SIZEOF_WCHAR_T |
69 | #endif |
70 | |
71 | #define Py_UNICODE_SIZE SIZEOF_WCHAR_T |
72 | |
73 | /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. |
74 | Otherwise, Unicode strings are stored as UCS-2 (with limited support |
75 | for UTF-16) */ |
76 | |
77 | #if Py_UNICODE_SIZE >= 4 |
78 | #define Py_UNICODE_WIDE |
79 | #endif |
80 | |
81 | /* Set these flags if the platform has "wchar.h" and the |
82 | wchar_t type is a 16-bit unsigned type */ |
83 | /* #define HAVE_WCHAR_H */ |
84 | /* #define HAVE_USABLE_WCHAR_T */ |
85 | |
86 | /* If the compiler provides a wchar_t type we try to support it |
87 | through the interface functions PyUnicode_FromWideChar(), |
88 | PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ |
89 | |
90 | #ifdef HAVE_USABLE_WCHAR_T |
91 | # ifndef HAVE_WCHAR_H |
92 | # define HAVE_WCHAR_H |
93 | # endif |
94 | #endif |
95 | |
96 | #ifdef HAVE_WCHAR_H |
97 | # include <wchar.h> |
98 | #endif |
99 | |
100 | /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective |
101 | unicode representations (32-, 16- and 8-bit unsigned integers). */ |
102 | typedef uint32_t Py_UCS4; |
103 | typedef uint16_t Py_UCS2; |
104 | typedef uint8_t Py_UCS1; |
105 | |
106 | #ifdef __cplusplus |
107 | extern "C" { |
108 | #endif |
109 | |
110 | |
111 | PyAPI_DATA(PyTypeObject) PyUnicode_Type; |
112 | PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; |
113 | |
114 | #define PyUnicode_Check(op) \ |
115 | PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) |
116 | #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type) |
117 | |
118 | /* --- Constants ---------------------------------------------------------- */ |
119 | |
120 | /* This Unicode character will be used as replacement character during |
121 | decoding if the errors argument is set to "replace". Note: the |
122 | Unicode character U+FFFD is the official REPLACEMENT CHARACTER in |
123 | Unicode 3.0. */ |
124 | |
125 | #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) |
126 | |
127 | /* === Public API ========================================================= */ |
128 | |
129 | /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ |
130 | PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( |
131 | const char *u, /* UTF-8 encoded string */ |
132 | Py_ssize_t size /* size of buffer */ |
133 | ); |
134 | |
135 | /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated |
136 | UTF-8 encoded bytes. The size is determined with strlen(). */ |
137 | PyAPI_FUNC(PyObject*) PyUnicode_FromString( |
138 | const char *u /* UTF-8 encoded string */ |
139 | ); |
140 | |
141 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
142 | PyAPI_FUNC(PyObject*) PyUnicode_Substring( |
143 | PyObject *str, |
144 | Py_ssize_t start, |
145 | Py_ssize_t end); |
146 | #endif |
147 | |
148 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
149 | /* Copy the string into a UCS4 buffer including the null character if copy_null |
150 | is set. Return NULL and raise an exception on error. Raise a SystemError if |
151 | the buffer is smaller than the string. Return buffer on success. |
152 | |
153 | buflen is the length of the buffer in (Py_UCS4) characters. */ |
154 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( |
155 | PyObject *unicode, |
156 | Py_UCS4* buffer, |
157 | Py_ssize_t buflen, |
158 | int copy_null); |
159 | |
160 | /* Copy the string into a UCS4 buffer. A new buffer is allocated using |
161 | * PyMem_Malloc; if this fails, NULL is returned with a memory error |
162 | exception set. */ |
163 | PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); |
164 | #endif |
165 | |
166 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
167 | /* Get the length of the Unicode object. */ |
168 | |
169 | PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( |
170 | PyObject *unicode |
171 | ); |
172 | #endif |
173 | |
174 | /* Get the number of Py_UNICODE units in the |
175 | string representation. */ |
176 | |
177 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( |
178 | PyObject *unicode /* Unicode object */ |
179 | ); |
180 | |
181 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
182 | /* Read a character from the string. */ |
183 | |
184 | PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( |
185 | PyObject *unicode, |
186 | Py_ssize_t index |
187 | ); |
188 | |
189 | /* Write a character to the string. The string must have been created through |
190 | PyUnicode_New, must not be shared, and must not have been hashed yet. |
191 | |
192 | Return 0 on success, -1 on error. */ |
193 | |
194 | PyAPI_FUNC(int) PyUnicode_WriteChar( |
195 | PyObject *unicode, |
196 | Py_ssize_t index, |
197 | Py_UCS4 character |
198 | ); |
199 | #endif |
200 | |
201 | /* Resize a Unicode object. The length is the number of characters, except |
202 | if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length |
203 | is the number of Py_UNICODE characters. |
204 | |
205 | On success, *unicode is modified to point to the new (resized) object and |
206 | 0 is returned. |
207 | |
208 | Try to resize the string in place (which is usually faster than allocating |
209 | a new string and copying the characters), or create a new string. |
210 | |
211 | Error handling is implemented as follows: an exception is set, -1 |
212 | is returned and *unicode is left untouched. |
213 | |
214 | WARNING: The function doesn't check string content, so the result may not |
215 | be a string in canonical representation. */ |
216 | |
217 | PyAPI_FUNC(int) PyUnicode_Resize( |
218 | PyObject **unicode, /* Pointer to the Unicode object */ |
219 | Py_ssize_t length /* New length */ |
220 | ); |
221 | |
222 | /* Decode obj to a Unicode object. |
223 | |
224 | bytes, bytearray and other bytes-like objects are decoded according to the |
225 | given encoding and error handler. The encoding and error handler can be |
226 | NULL to have the interface use UTF-8 and "strict". |
227 | |
228 | All other objects (including Unicode objects) raise an exception. |
229 | |
230 | The API returns NULL in case of an error. The caller is responsible |
231 | for decref'ing the returned objects. |
232 | |
233 | */ |
234 | |
235 | PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( |
236 | PyObject *obj, /* Object */ |
237 | const char *encoding, /* encoding */ |
238 | const char *errors /* error handling */ |
239 | ); |
240 | |
241 | /* Copy an instance of a Unicode subtype to a new true Unicode object if |
242 | necessary. If obj is already a true Unicode object (not a subtype), return |
243 | the reference with *incremented* refcount. |
244 | |
245 | The API returns NULL in case of an error. The caller is responsible |
246 | for decref'ing the returned objects. |
247 | |
248 | */ |
249 | |
250 | PyAPI_FUNC(PyObject*) PyUnicode_FromObject( |
251 | PyObject *obj /* Object */ |
252 | ); |
253 | |
254 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( |
255 | const char *format, /* ASCII-encoded string */ |
256 | va_list vargs |
257 | ); |
258 | PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( |
259 | const char *format, /* ASCII-encoded string */ |
260 | ... |
261 | ); |
262 | |
263 | PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); |
264 | PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( |
265 | const char *u /* UTF-8 encoded string */ |
266 | ); |
267 | |
268 | // PyUnicode_InternImmortal() is deprecated since Python 3.10 |
269 | // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead. |
270 | Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); |
271 | |
272 | /* Evaluate to the state.interned field of op. op is cast to PyASCIIObject * without any type check, so use only if you know it's a string. */ |
273 | #define PyUnicode_CHECK_INTERNED(op) \ |
274 | (((PyASCIIObject *)(op))->state.interned) |
275 | |
276 | /* --- wchar_t support for platforms which support it --------------------- */ |
277 | |
278 | #ifdef HAVE_WCHAR_H |
279 | |
280 | /* Create a Unicode Object from the wchar_t buffer w of the given |
281 | size. |
282 | |
283 | The buffer is copied into the new object. */ |
284 | |
285 | PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( |
286 | const wchar_t *w, /* wchar_t buffer */ |
287 | Py_ssize_t size /* size of buffer */ |
288 | ); |
289 | |
290 | /* Copies the Unicode Object contents into the wchar_t buffer w. At |
291 | most size wchar_t characters are copied. |
292 | |
293 | Note that the resulting wchar_t string may or may not be |
294 | 0-terminated. It is the responsibility of the caller to make sure |
295 | that the wchar_t string is 0-terminated in case this is required by |
296 | the application. |
297 | |
298 | Returns the number of wchar_t characters copied (excluding a |
299 | possibly trailing 0-termination character) or -1 in case of an |
300 | error. */ |
301 | |
302 | PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( |
303 | PyObject *unicode, /* Unicode object */ |
304 | wchar_t *w, /* wchar_t buffer */ |
305 | Py_ssize_t size /* size of buffer */ |
306 | ); |
307 | |
308 | /* Convert the Unicode object to a wide character string. The output string |
309 | always ends with a nul character. If size is not NULL, write the number of |
310 | wide characters (excluding the null character) into *size. |
311 | |
312 | Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) |
313 | on success. On error, returns NULL, *size is undefined and raises a |
314 | MemoryError. */ |
315 | |
316 | PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( |
317 | PyObject *unicode, /* Unicode object */ |
318 | Py_ssize_t *size /* number of characters of the result */ |
319 | ); |
320 | |
321 | #endif |
322 | |
323 | /* --- Unicode ordinals --------------------------------------------------- */ |
324 | |
325 | /* Create a Unicode Object from the given Unicode code point ordinal. |
326 | |
327 | The ordinal must be in range(0x110000). A ValueError is |
328 | raised in case it is not. |
329 | |
330 | */ |
331 | |
332 | PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); |
333 | |
334 | /* === Builtin Codecs ===================================================== |
335 | |
336 | Many of these APIs take two arguments encoding and errors. These |
337 | parameters encoding and errors have the same semantics as the ones |
338 | of the builtin str() API. |
339 | |
340 | Setting encoding to NULL causes the default encoding (UTF-8) to be used. |
341 | |
342 | Error handling is set by errors which may also be set to NULL |
343 | meaning to use the default handling defined for the codec. Default |
344 | error handling for all builtin codecs is "strict" (ValueErrors are |
345 | raised). |
346 | |
347 | The codecs all use a similar interface. Only deviations from the |
348 | generic interface are documented. |
349 | |
350 | */ |
351 | |
352 | /* --- Manage the default encoding ---------------------------------------- */ |
353 | |
354 | /* Returns "utf-8". */ |
355 | PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); |
356 | |
357 | /* --- Generic Codecs ----------------------------------------------------- */ |
358 | |
359 | /* Create a Unicode object by decoding the encoded string s of the |
360 | given size. */ |
361 | |
362 | PyAPI_FUNC(PyObject*) PyUnicode_Decode( |
363 | const char *s, /* encoded string */ |
364 | Py_ssize_t size, /* size of buffer */ |
365 | const char *encoding, /* encoding */ |
366 | const char *errors /* error handling */ |
367 | ); |
368 | |
369 | /* Decode a Unicode object unicode and return the result as Python |
370 | object. |
371 | |
372 | This API is DEPRECATED. The only supported standard encoding is rot13. |
373 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
374 | that decode from str. */ |
375 | |
376 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( |
377 | PyObject *unicode, /* Unicode object */ |
378 | const char *encoding, /* encoding */ |
379 | const char *errors /* error handling */ |
380 | ); |
381 | |
382 | /* Decode a Unicode object unicode and return the result as Unicode |
383 | object. |
384 | |
385 | This API is DEPRECATED. The only supported standard encoding is rot13. |
386 | Use PyCodec_Decode() to decode with rot13 and non-standard codecs |
387 | that decode from str to str. */ |
388 | |
389 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( |
390 | PyObject *unicode, /* Unicode object */ |
391 | const char *encoding, /* encoding */ |
392 | const char *errors /* error handling */ |
393 | ); |
394 | |
395 | /* Encodes a Unicode object and returns the result as a Python |
396 | object. |
397 | |
398 | This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() |
399 | since all standard encodings (except rot13) encode str to bytes. |
400 | Use PyCodec_Encode() for encoding with rot13 and non-standard codecs |
401 | that encode from str to non-bytes. */ |
402 | |
403 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( |
404 | PyObject *unicode, /* Unicode object */ |
405 | const char *encoding, /* encoding */ |
406 | const char *errors /* error handling */ |
407 | ); |
408 | |
409 | /* Encodes a Unicode object and returns the result as Python string |
410 | object. */ |
411 | |
412 | PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( |
413 | PyObject *unicode, /* Unicode object */ |
414 | const char *encoding, /* encoding */ |
415 | const char *errors /* error handling */ |
416 | ); |
417 | |
418 | /* Encodes a Unicode object and returns the result as a Unicode |
419 | object. |
420 | |
421 | This API is DEPRECATED. The only supported standard encoding is rot13. |
422 | Use PyCodec_Encode() to encode with rot13 and non-standard codecs |
423 | that encode from str to str. */ |
424 | |
425 | Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( |
426 | PyObject *unicode, /* Unicode object */ |
427 | const char *encoding, /* encoding */ |
428 | const char *errors /* error handling */ |
429 | ); |
430 | |
431 | /* Build an encoding map. */ |
432 | |
433 | PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( |
434 | PyObject* string /* 256 character map */ |
435 | ); |
436 | |
437 | /* --- UTF-7 Codecs ------------------------------------------------------- */ |
438 | |
439 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( |
440 | const char *string, /* UTF-7 encoded string */ |
441 | Py_ssize_t length, /* size of string */ |
442 | const char *errors /* error handling */ |
443 | ); |
444 | |
445 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( |
446 | const char *string, /* UTF-7 encoded string */ |
447 | Py_ssize_t length, /* size of string */ |
448 | const char *errors, /* error handling */ |
449 | Py_ssize_t *consumed /* bytes consumed */ |
450 | ); |
451 | |
452 | /* --- UTF-8 Codecs ------------------------------------------------------- */ |
453 | |
454 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( |
455 | const char *string, /* UTF-8 encoded string */ |
456 | Py_ssize_t length, /* size of string */ |
457 | const char *errors /* error handling */ |
458 | ); |
459 | |
460 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( |
461 | const char *string, /* UTF-8 encoded string */ |
462 | Py_ssize_t length, /* size of string */ |
463 | const char *errors, /* error handling */ |
464 | Py_ssize_t *consumed /* bytes consumed */ |
465 | ); |
466 | |
467 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( |
468 | PyObject *unicode /* Unicode object */ |
469 | ); |
470 | |
471 | /* Returns a pointer to the default encoding (UTF-8) of the |
472 | Unicode object unicode and the size of the encoded representation |
473 | in bytes stored in *size. |
474 | |
475 | In case of an error, *size is not set. |
476 | |
477 | This function caches the UTF-8 encoded string in the unicodeobject |
478 | and subsequent calls will return the same string. The memory is released |
479 | when the unicodeobject is deallocated. |
480 | */ |
481 | |
482 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 |
483 | PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( |
484 | PyObject *unicode, |
485 | Py_ssize_t *size); |
486 | #endif |
487 | |
488 | /* --- UTF-32 Codecs ------------------------------------------------------ */ |
489 | |
490 | /* Decodes length bytes from a UTF-32 encoded buffer string and returns |
491 | the corresponding Unicode object. |
492 | |
493 | errors (if non-NULL) defines the error handling. It defaults |
494 | to "strict". |
495 | |
496 | If byteorder is non-NULL, the decoder starts decoding using the |
497 | given byte order: |
498 | |
499 | *byteorder == -1: little endian |
500 | *byteorder == 0: native order |
501 | *byteorder == 1: big endian |
502 | |
503 | In native mode, the first four bytes of the stream are checked for a |
504 | BOM mark. If found, the BOM mark is analysed, the byte order |
505 | adjusted and the BOM skipped. In the other modes, no BOM mark |
506 | interpretation is done. After completion, *byteorder is set to the |
507 | current byte order at the end of input data. |
508 | |
509 | If byteorder is NULL, the codec starts in native order mode. |
510 | |
511 | */ |
512 | |
513 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( |
514 | const char *string, /* UTF-32 encoded string */ |
515 | Py_ssize_t length, /* size of string */ |
516 | const char *errors, /* error handling */ |
517 | int *byteorder /* pointer to byteorder to use |
518 | 0=native;-1=LE,1=BE; updated on |
519 | exit */ |
520 | ); |
521 | |
522 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( |
523 | const char *string, /* UTF-32 encoded string */ |
524 | Py_ssize_t length, /* size of string */ |
525 | const char *errors, /* error handling */ |
526 | int *byteorder, /* pointer to byteorder to use |
527 | 0=native;-1=LE,1=BE; updated on |
528 | exit */ |
529 | Py_ssize_t *consumed /* bytes consumed */ |
530 | ); |
531 | |
532 | /* Returns a Python string using the UTF-32 encoding in native byte |
533 | order. The string always starts with a BOM mark. */ |
534 | |
535 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( |
536 | PyObject *unicode /* Unicode object */ |
537 | ); |
538 | |
539 | /* Returns a Python string object holding the UTF-32 encoded value of |
540 | the Unicode data. |
541 | |
542 | If byteorder is not 0, output is written according to the following |
543 | byte order: |
544 | |
545 | byteorder == -1: little endian |
546 | byteorder == 0: native byte order (writes a BOM mark) |
547 | byteorder == 1: big endian |
548 | |
549 | If byteorder is 0, the output string will always start with the |
550 | Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is |
551 | prepended. |
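552 | NOTE(review): no declaration follows this comment in this header; it appears to describe an encoder that is not declared here -- confirm whether the text is stale. |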
553 | */ |
554 | |
555 | /* --- UTF-16 Codecs ------------------------------------------------------ */ |
556 | |
557 | /* Decodes length bytes from a UTF-16 encoded buffer string and returns |
558 | the corresponding Unicode object. |
559 | |
560 | errors (if non-NULL) defines the error handling. It defaults |
561 | to "strict". |
562 | |
563 | If byteorder is non-NULL, the decoder starts decoding using the |
564 | given byte order: |
565 | |
566 | *byteorder == -1: little endian |
567 | *byteorder == 0: native order |
568 | *byteorder == 1: big endian |
569 | |
570 | In native mode, the first two bytes of the stream are checked for a |
571 | BOM mark. If found, the BOM mark is analysed, the byte order |
572 | adjusted and the BOM skipped. In the other modes, no BOM mark |
573 | interpretation is done. After completion, *byteorder is set to the |
574 | current byte order at the end of input data. |
575 | |
576 | If byteorder is NULL, the codec starts in native order mode. |
577 | |
578 | */ |
579 | |
580 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( |
581 | const char *string, /* UTF-16 encoded string */ |
582 | Py_ssize_t length, /* size of string */ |
583 | const char *errors, /* error handling */ |
584 | int *byteorder /* pointer to byteorder to use |
585 | 0=native;-1=LE,1=BE; updated on |
586 | exit */ |
587 | ); |
588 | |
589 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( |
590 | const char *string, /* UTF-16 encoded string */ |
591 | Py_ssize_t length, /* size of string */ |
592 | const char *errors, /* error handling */ |
593 | int *byteorder, /* pointer to byteorder to use |
594 | 0=native;-1=LE,1=BE; updated on |
595 | exit */ |
596 | Py_ssize_t *consumed /* bytes consumed */ |
597 | ); |
598 | |
599 | /* Returns a Python string using the UTF-16 encoding in native byte |
600 | order. The string always starts with a BOM mark. */ |
601 | |
602 | PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( |
603 | PyObject *unicode /* Unicode object */ |
604 | ); |
605 | |
606 | /* --- Unicode-Escape Codecs ---------------------------------------------- */ |
607 | |
608 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( |
609 | const char *string, /* Unicode-Escape encoded string */ |
610 | Py_ssize_t length, /* size of string */ |
611 | const char *errors /* error handling */ |
612 | ); |
613 | |
614 | PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( |
615 | PyObject *unicode /* Unicode object */ |
616 | ); |
617 | |
618 | /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ |
619 | |
620 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( |
621 | const char *string, /* Raw-Unicode-Escape encoded string */ |
622 | Py_ssize_t length, /* size of string */ |
623 | const char *errors /* error handling */ |
624 | ); |
625 | |
626 | PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( |
627 | PyObject *unicode /* Unicode object */ |
628 | ); |
629 | |
630 | /* --- Latin-1 Codecs ----------------------------------------------------- |
631 | |
632 | Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ |
633 | |
634 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( |
635 | const char *string, /* Latin-1 encoded string */ |
636 | Py_ssize_t length, /* size of string */ |
637 | const char *errors /* error handling */ |
638 | ); |
639 | |
640 | PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( |
641 | PyObject *unicode /* Unicode object */ |
642 | ); |
643 | |
644 | /* --- ASCII Codecs ------------------------------------------------------- |
645 | |
646 | Only 7-bit ASCII data is accepted. All other codes generate errors. |
647 | |
648 | */ |
649 | |
650 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( |
651 | const char *string, /* ASCII encoded string */ |
652 | Py_ssize_t length, /* size of string */ |
653 | const char *errors /* error handling */ |
654 | ); |
655 | |
656 | PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( |
657 | PyObject *unicode /* Unicode object */ |
658 | ); |
659 | |
660 | /* --- Character Map Codecs ----------------------------------------------- |
661 | |
662 | This codec uses mappings to encode and decode characters. |
663 | |
664 | Decoding mappings must map byte ordinals (integers in the range from 0 to |
665 | 255) to Unicode strings, integers (which are then interpreted as Unicode |
666 | ordinals) or None. Unmapped data bytes (ones which cause a LookupError) |
667 | as well as those mapped to None, 0xFFFE or '\ufffe' are treated as "undefined |
668 | mapping" and cause an error. |
669 | |
670 | Encoding mappings must map Unicode ordinal integers to bytes objects, |
671 | integers in the range from 0 to 255 or None. Unmapped character |
672 | ordinals (ones which cause a LookupError) as well as those mapped to |
673 | None are treated as "undefined mapping" and cause an error. |
674 | |
675 | */ |
676 | |
677 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( |
678 | const char *string, /* Encoded string */ |
679 | Py_ssize_t length, /* size of string */ |
680 | PyObject *mapping, /* decoding mapping */ |
681 | const char *errors /* error handling */ |
682 | ); |
683 | |
684 | PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( |
685 | PyObject *unicode, /* Unicode object */ |
686 | PyObject *mapping /* encoding mapping */ |
687 | ); |
688 | |
689 | /* --- MBCS codecs for Windows -------------------------------------------- */ |
690 | |
691 | #ifdef MS_WINDOWS |
692 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( |
693 | const char *string, /* MBCS encoded string */ |
694 | Py_ssize_t length, /* size of string */ |
695 | const char *errors /* error handling */ |
696 | ); |
697 | |
698 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( |
699 | const char *string, /* MBCS encoded string */ |
700 | Py_ssize_t length, /* size of string */ |
701 | const char *errors, /* error handling */ |
702 | Py_ssize_t *consumed /* bytes consumed */ |
703 | ); |
704 | |
705 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
706 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( |
707 | int code_page, /* code page number */ |
708 | const char *string, /* encoded string */ |
709 | Py_ssize_t length, /* size of string */ |
710 | const char *errors, /* error handling */ |
711 | Py_ssize_t *consumed /* bytes consumed */ |
712 | ); |
713 | #endif |
714 | |
715 | PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( |
716 | PyObject *unicode /* Unicode object */ |
717 | ); |
718 | |
719 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
720 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( |
721 | int code_page, /* code page number */ |
722 | PyObject *unicode, /* Unicode object */ |
723 | const char *errors /* error handling */ |
724 | ); |
725 | #endif |
726 | |
727 | #endif /* MS_WINDOWS */ |
728 | |
729 | /* --- Locale encoding --------------------------------------------------- */ |
730 | |
731 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
732 | /* Decode a string from the current locale encoding. The decoder is strict if |
733 | *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' |
734 | error handler (PEP 383) to escape undecodable bytes. If a byte sequence can |
735 | be decoded as a surrogate character and *surrogateescape* is not equal to |
736 | zero, the byte sequence is escaped using the 'surrogateescape' error handler |
737 | instead of being decoded. *str* must end with a null character but cannot |
738 | contain embedded null characters. NOTE(review): the prototype below takes an *errors* string, not a *surrogateescape* flag; this description looks out of date -- confirm against the implementation. */ |
739 | |
740 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( |
741 | const char *str, |
742 | Py_ssize_t len, |
743 | const char *errors); |
744 | |
745 | /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string |
746 | length using strlen(). */ |
747 | |
748 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( |
749 | const char *str, |
750 | const char *errors); |
751 | |
752 | /* Encode a Unicode object to the current locale encoding. The encoder is |
753 | strict if *surrogateescape* is equal to zero, otherwise the |
754 | "surrogateescape" error handler is used. Return a bytes object. The string |
755 | cannot contain embedded null characters. NOTE(review): the prototype below takes an *errors* string, not a *surrogateescape* flag; this description looks out of date -- confirm against the implementation. */ |
756 | |
757 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( |
758 | PyObject *unicode, |
759 | const char *errors |
760 | ); |
761 | #endif |
762 | |
763 | /* --- File system encoding ---------------------------------------------- */ |
764 | |
765 | /* ParseTuple converter: encode str objects to bytes using |
766 | PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ |
767 | |
768 | PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); |
769 | |
770 | /* ParseTuple converter: decode bytes objects to unicode using |
771 | PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ |
772 | |
773 | PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); |
774 | |
775 | /* Decode a null-terminated string using Py_FileSystemDefaultEncoding |
776 | and the "surrogateescape" error handler. |
777 | |
778 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
779 | encoding. |
780 | |
781 | Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. |
782 | */ |
783 | |
784 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( |
785 | const char *s /* encoded string */ |
786 | ); |
787 | |
788 | /* Decode a string using Py_FileSystemDefaultEncoding |
789 | and the "surrogateescape" error handler. |
790 | |
791 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
792 | encoding. |
793 | */ |
794 | |
795 | PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( |
796 | const char *s, /* encoded string */ |
797 | Py_ssize_t size /* size */ |
798 | ); |
799 | |
800 | /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the |
801 | "surrogateescape" error handler, and return bytes. |
802 | |
803 | If Py_FileSystemDefaultEncoding is not set, fall back to the locale |
804 | encoding. |
805 | */ |
806 | |
807 | PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( |
808 | PyObject *unicode |
809 | ); |
810 | |
811 | /* --- Methods & Slots ---------------------------------------------------- |
812 | |
813 | These are capable of handling Unicode objects and strings on input |
814 | (we refer to them as strings in the descriptions) and return |
815 | Unicode objects or integers as appropriate. */ |
816 | |
817 | /* Concatenate two strings, giving a new Unicode string. */ |
818 | |
819 | PyAPI_FUNC(PyObject*) PyUnicode_Concat( |
820 | PyObject *left, /* Left string */ |
821 | PyObject *right /* Right string */ |
822 | ); |
823 | |
824 | /* Concatenate two strings and store the result in *pleft |
825 | (sets *pleft to NULL on error). */ |
826 | |
827 | PyAPI_FUNC(void) PyUnicode_Append( |
828 | PyObject **pleft, /* Pointer to left string; receives the result */ |
829 | PyObject *right /* Right string */ |
830 | ); |
831 | |
832 | /* Concatenate two strings, store the result in *pleft and drop the right |
833 | object (sets *pleft to NULL on error). */ |
834 | |
835 | PyAPI_FUNC(void) PyUnicode_AppendAndDel( |
836 | PyObject **pleft, /* Pointer to left string; receives the result */ |
837 | PyObject *right /* Right string; dropped by this call */ |
838 | ); |
839 | |
840 | /* Split a string, giving a list of Unicode strings. |
841 | |
842 | If sep is NULL, splitting will be done at all whitespace |
843 | substrings. Otherwise, splits occur at the given separator. |
844 | |
845 | At most maxsplit splits will be done. If maxsplit is negative, no limit is set. |
846 | |
847 | Separators are not included in the resulting list. |
848 | |
849 | */ |
850 | |
851 | PyAPI_FUNC(PyObject*) PyUnicode_Split( |
852 | PyObject *s, /* String to split */ |
853 | PyObject *sep, /* String separator, or NULL for whitespace */ |
854 | Py_ssize_t maxsplit /* Maxsplit count; negative means no limit */ |
855 | ); |
856 | |
857 | /* Split a string at line breaks, giving a list of Unicode strings. |
858 | |
859 | CRLF is considered to be one line break. Line breaks are included |
860 | in the resulting strings only if keepends is true. */ |
861 | |
862 | PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( |
863 | PyObject *s, /* String to split */ |
864 | int keepends /* If true, line end markers are included */ |
865 | ); |
866 | |
867 | /* Partition the string s using the given separator sep. */ |
868 | |
869 | PyAPI_FUNC(PyObject*) PyUnicode_Partition( |
870 | PyObject *s, /* String to partition */ |
871 | PyObject *sep /* String separator */ |
872 | ); |
873 | |
874 | /* Partition the string s using the given separator sep, searching from |
875 | the end of the string. */ |
876 | |
877 | PyAPI_FUNC(PyObject*) PyUnicode_RPartition( |
878 | PyObject *s, /* String to partition */ |
879 | PyObject *sep /* String separator */ |
880 | ); |
881 | |
882 | /* Split a string, giving a list of Unicode strings. |
883 | |
884 | If sep is NULL, splitting will be done at all whitespace |
885 | substrings. Otherwise, splits occur at the given separator. |
886 | |
887 | At most maxsplit splits will be done; if maxsplit is negative, no |
888 | limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from |
889 | the end of the string. |
890 | |
891 | Separators are not included in the resulting list. |
892 | |
893 | */ |
894 | |
895 | PyAPI_FUNC(PyObject*) PyUnicode_RSplit( |
896 | PyObject *s, /* String to split */ |
897 | PyObject *sep, /* String separator, or NULL for whitespace */ |
898 | Py_ssize_t maxsplit /* Maxsplit count; negative means no limit */ |
899 | ); |
900 | |
901 | /* Translate a string by applying a character mapping table to it and |
902 | return the resulting Unicode object. |
903 | |
904 | The mapping table must map Unicode ordinal integers to Unicode strings, |
905 | Unicode ordinal integers or None (causing deletion of the character). |
906 | |
907 | Mapping tables may be dictionaries or sequences. Unmapped character |
908 | ordinals (ones which cause a LookupError) are left untouched and |
909 | are copied as-is. |
910 | |
911 | */ |
912 | |
913 | PyAPI_FUNC(PyObject *) PyUnicode_Translate( |
914 | PyObject *str, /* String to translate */ |
915 | PyObject *table, /* Translate table (dictionary or sequence) */ |
916 | const char *errors /* error handling */ |
917 | ); |
918 | |
919 | /* Join a sequence of strings using the given separator and return |
920 | the resulting Unicode string. */ |
921 | |
922 | PyAPI_FUNC(PyObject*) PyUnicode_Join( |
923 | PyObject *separator, /* Separator string */ |
924 | PyObject *seq /* Sequence of strings to join */ |
925 | ); |
926 | |
927 | /* Return 1 if substr matches str[start:end] at the given tail end |
928 | (direction -1: prefix match, +1: suffix match), 0 otherwise. */ |
929 | |
930 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( |
931 | PyObject *str, /* String */ |
932 | PyObject *substr, /* Prefix or Suffix string */ |
933 | Py_ssize_t start, /* Start index */ |
934 | Py_ssize_t end, /* Stop index */ |
935 | int direction /* Tail end: -1 prefix, +1 suffix */ |
936 | ); |
937 | |
938 | /* Return the first position of substr in str[start:end] using the |
939 | given search direction, or -1 if not found. -2 is returned if |
940 | an error occurred and an exception is set. */ |
941 | |
942 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( |
943 | PyObject *str, /* String */ |
944 | PyObject *substr, /* Substring to find */ |
945 | Py_ssize_t start, /* Start index */ |
946 | Py_ssize_t end, /* Stop index */ |
947 | int direction /* Find direction: +1 forward, -1 backward */ |
948 | ); |
949 | |
950 | #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 |
951 | /* Like PyUnicode_Find, but search for a single character only. */ |
952 | PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( |
953 | PyObject *str, /* String */ |
954 | Py_UCS4 ch, /* Character to find */ |
955 | Py_ssize_t start, /* Start index */ |
956 | Py_ssize_t end, /* Stop index */ |
957 | int direction /* Find direction, as for PyUnicode_Find */ |
958 | ); |
959 | #endif /* !Py_LIMITED_API || Py_LIMITED_API >= 0x03030000 */ |
960 | |
961 | /* Count the number of occurrences of substr within str[start:end]. */ |
962 | |
963 | PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( |
964 | PyObject *str, /* String */ |
965 | PyObject *substr, /* Substring to count */ |
966 | Py_ssize_t start, /* Start index */ |
967 | Py_ssize_t end /* Stop index */ |
968 | ); |
969 | |
970 | /* Replace at most maxcount occurrences of substr in str with replstr |
971 | and return the resulting Unicode object. */ |
972 | |
973 | PyAPI_FUNC(PyObject *) PyUnicode_Replace( |
974 | PyObject *str, /* String */ |
975 | PyObject *substr, /* Substring to find */ |
976 | PyObject *replstr, /* Replacement string */ |
977 | Py_ssize_t maxcount /* Max. number of replacements to apply; |
978 | -1 = all */ |
979 | ); |
980 | |
981 | /* Compare two strings and return -1, 0, 1 for less than, equal, |
982 | greater than, respectively. |
983 | Raise an exception and return -1 on error. */ |
984 | |
985 | PyAPI_FUNC(int) PyUnicode_Compare( |
986 | PyObject *left, /* Left string */ |
987 | PyObject *right /* Right string */ |
988 | ); |
989 | |
990 | /* Compare a Unicode object with a C string and return -1, 0, 1 for less than, |
991 | equal, and greater than, respectively. It is best to pass only |
992 | ASCII-encoded strings, but the function interprets the input string as |
993 | ISO-8859-1 if it contains non-ASCII characters. |
994 | This function does not raise exceptions. */ |
995 | |
996 | PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( |
997 | PyObject *left, /* Unicode object */ |
998 | const char *right /* C string, preferably ASCII-encoded */ |
999 | ); |
1000 | |
1001 | /* Rich-compare two strings and return one of the following: |
1002 | |
1003 | - NULL in case an exception was raised |
1004 | - Py_True or Py_False for successful comparisons |
1005 | - Py_NotImplemented in case the type combination is unknown |
1006 | |
1007 | Possible values for op: |
1008 | |
1009 | Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE |
1010 | |
1011 | */ |
1012 | |
1013 | PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( |
1014 | PyObject *left, /* Left string */ |
1015 | PyObject *right, /* Right string */ |
1016 | int op /* Operation: one of the Py_* values listed above */ |
1017 | ); |
1018 | |
1019 | /* Apply the argument tuple or dictionary args to the format string and |
1020 | return the resulting Unicode string. */ |
1021 | |
1022 | PyAPI_FUNC(PyObject *) PyUnicode_Format( |
1023 | PyObject *format, /* Format string */ |
1024 | PyObject *args /* Argument tuple or dictionary */ |
1025 | ); |
1026 | |
1027 | /* Check whether element is contained in container and return 1/0 |
1028 | accordingly. |
1029 | |
1030 | element has to coerce to a one element Unicode string. -1 is |
1031 | returned in case of an error. */ |
1032 | |
1033 | PyAPI_FUNC(int) PyUnicode_Contains( |
1034 | PyObject *container, /* Container string */ |
1035 | PyObject *element /* Element string */ |
1036 | ); |
1037 | |
1038 | /* Check whether the string s is a valid identifier. */ |
1039 | |
1040 | PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); |
1041 | |
1042 | /* === Characters Type APIs =============================================== */ |
1043 | |
1044 | #ifndef Py_LIMITED_API |
1045 | # define Py_CPYTHON_UNICODEOBJECT_H |
1046 | # include "cpython/unicodeobject.h" |
1047 | # undef Py_CPYTHON_UNICODEOBJECT_H |
1048 | #endif /* !Py_LIMITED_API */ |
1049 | |
1050 | #ifdef __cplusplus |
1051 | } |
1052 | #endif |
1053 | #endif /* !Py_UNICODEOBJECT_H */ |
1054 | |