1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::types::PyBytes;
4use crate::{ffi, PyAny, PyResult, Python};
5use std::borrow::Cow;
6use std::os::raw::c_char;
7use std::str;
8
/// Represents raw data backing a Python `str`.
///
/// Python internally stores strings in various representations. This enumeration
/// represents those variations.
///
/// Each variant borrows a slice of stored values; the element type of the slice
/// (`u8`, `u16` or `u32`) is the width of one stored value in that representation.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation: one byte per stored value.
    Ucs1(&'a [u8]),

    /// UCS2 representation: two bytes per stored value.
    Ucs2(&'a [u16]),

    /// UCS4 representation: four bytes per stored value.
    Ucs4(&'a [u32]),
}
25
26#[cfg(not(Py_LIMITED_API))]
27impl<'a> PyStringData<'a> {
28 /// Obtain the raw bytes backing this instance as a [u8] slice.
29 pub fn as_bytes(&self) -> &[u8] {
30 match self {
31 Self::Ucs1(s) => s,
32 Self::Ucs2(s) => unsafe {
33 std::slice::from_raw_parts(
34 s.as_ptr() as *const u8,
35 s.len() * self.value_width_bytes(),
36 )
37 },
38 Self::Ucs4(s) => unsafe {
39 std::slice::from_raw_parts(
40 s.as_ptr() as *const u8,
41 s.len() * self.value_width_bytes(),
42 )
43 },
44 }
45 }
46
47 /// Size in bytes of each value/item in the underlying slice.
48 #[inline]
49 pub fn value_width_bytes(&self) -> usize {
50 match self {
51 Self::Ucs1(_) => 1,
52 Self::Ucs2(_) => 2,
53 Self::Ucs4(_) => 4,
54 }
55 }
56
57 /// Convert the raw data to a Rust string.
58 ///
59 /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
60 /// returns an owned string.
61 ///
62 /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
63 /// storage format. This should only occur for strings that were created via Python
64 /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
65 /// never occur for strings that were created from Python code.
66 pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
67 use std::ffi::CStr;
68 match self {
69 Self::Ucs1(data) => match str::from_utf8(data) {
70 Ok(s) => Ok(Cow::Borrowed(s)),
71 Err(e) => Err(crate::PyErr::from_value(PyUnicodeDecodeError::new_utf8(
72 py, data, e,
73 )?)),
74 },
75 Self::Ucs2(data) => match String::from_utf16(data) {
76 Ok(s) => Ok(Cow::Owned(s)),
77 Err(e) => {
78 let mut message = e.to_string().as_bytes().to_vec();
79 message.push(0);
80
81 Err(crate::PyErr::from_value(PyUnicodeDecodeError::new(
82 py,
83 CStr::from_bytes_with_nul(b"utf-16\0").unwrap(),
84 self.as_bytes(),
85 0..self.as_bytes().len(),
86 CStr::from_bytes_with_nul(&message).unwrap(),
87 )?))
88 }
89 },
90 Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
91 Some(s) => Ok(Cow::Owned(s)),
92 None => Err(crate::PyErr::from_value(PyUnicodeDecodeError::new(
93 py,
94 CStr::from_bytes_with_nul(b"utf-32\0").unwrap(),
95 self.as_bytes(),
96 0..self.as_bytes().len(),
97 CStr::from_bytes_with_nul(b"error converting utf-32\0").unwrap(),
98 )?)),
99 },
100 }
101 }
102
103 /// Convert the raw data to a Rust string, possibly with data loss.
104 ///
105 /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
106 ///
107 /// Returns a borrow into original data, when possible, or owned data otherwise.
108 ///
109 /// The return value of this function should only disagree with [Self::to_string]
110 /// when that method would error.
111 pub fn to_string_lossy(self) -> Cow<'a, str> {
112 match self {
113 Self::Ucs1(data) => String::from_utf8_lossy(data),
114 Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
115 Self::Ucs4(data) => Cow::Owned(
116 data.iter()
117 .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
118 .collect(),
119 ),
120 }
121 }
122}
123
/// Represents a Python `str` (a Unicode string object).
///
/// This type is immutable.
#[repr(transparent)]
pub struct PyString(PyAny);

// Registers `PyString` as the wrapper for `ffi::PyUnicode_Type`, with
// `ffi::PyUnicode_Check` as its type-check function. The macro is defined
// elsewhere in the crate.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
131
132impl PyString {
133 /// Creates a new Python string object.
134 ///
135 /// Panics if out of memory.
136 pub fn new<'p>(py: Python<'p>, s: &str) -> &'p PyString {
137 let ptr = s.as_ptr() as *const c_char;
138 let len = s.len() as ffi::Py_ssize_t;
139 unsafe { py.from_owned_ptr(ffi::PyUnicode_FromStringAndSize(ptr, len)) }
140 }
141
142 /// Intern the given string
143 ///
144 /// This will return a reference to the same Python string object if called repeatedly with the same string.
145 ///
146 /// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a
147 /// temporary Python string object and is thereby slower than [`PyString::new`].
148 ///
149 /// Panics if out of memory.
150 pub fn intern<'p>(py: Python<'p>, s: &str) -> &'p PyString {
151 let ptr = s.as_ptr() as *const c_char;
152 let len = s.len() as ffi::Py_ssize_t;
153 unsafe {
154 let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
155 if !ob.is_null() {
156 ffi::PyUnicode_InternInPlace(&mut ob);
157 }
158 py.from_owned_ptr(ob)
159 }
160 }
161
162 /// Attempts to create a Python string from a Python [bytes-like object].
163 ///
164 /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
165 pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> {
166 unsafe {
167 src.py()
168 .from_owned_ptr_or_err::<PyString>(ffi::PyUnicode_FromEncodedObject(
169 src.as_ptr(),
170 encoding.as_ptr() as *const c_char,
171 errors.as_ptr() as *const c_char,
172 ))
173 }
174 }
175
176 /// Gets the Python string as a byte slice.
177 ///
178 /// Returns a `UnicodeEncodeError` if the input is not valid unicode
179 /// (containing unpaired surrogates).
180 #[inline]
181 pub fn to_str(&self) -> PyResult<&str> {
182 let utf8_slice = {
183 cfg_if::cfg_if! {
184 if #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] {
185 // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
186 let mut size: ffi::Py_ssize_t = 0;
187 let data: *const u8 = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
188 if data.is_null() {
189 return Err(crate::PyErr::fetch(self.py()));
190 } else {
191 unsafe { std::slice::from_raw_parts(data, size as usize) }
192 }
193 } else {
194 let bytes = unsafe {
195 self.py().from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))?
196 };
197 bytes.as_bytes()
198 }
199 }
200 };
201 Ok(unsafe { std::str::from_utf8_unchecked(utf8_slice) })
202 }
203
204 /// Converts the `PyString` into a Rust string.
205 ///
206 /// Unpaired surrogates invalid UTF-8 sequences are
207 /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
208 pub fn to_string_lossy(&self) -> Cow<'_, str> {
209 match self.to_str() {
210 Ok(s) => Cow::Borrowed(s),
211 Err(_) => {
212 let bytes = unsafe {
213 self.py()
214 .from_owned_ptr::<PyBytes>(ffi::PyUnicode_AsEncodedString(
215 self.as_ptr(),
216 b"utf-8\0" as *const _ as _,
217 b"surrogatepass\0" as *const _ as _,
218 ))
219 };
220 String::from_utf8_lossy(bytes.as_bytes())
221 }
222 }
223 }
224
225 /// Obtains the raw data backing the Python string.
226 ///
227 /// If the Python string object was created through legacy APIs, its internal storage format
228 /// will be canonicalized before data is returned.
229 ///
230 /// # Safety
231 ///
232 /// This function implementation relies on manually decoding a C bitfield. In practice, this
233 /// works well on common little-endian architectures such as x86_64, where the bitfield has a
234 /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
235 /// x86_64 platforms.
236 ///
237 /// By using this API, you accept responsibility for testing that PyStringData behaves as
238 /// expected on the targets where you plan to distribute your software.
239 #[cfg(not(Py_LIMITED_API))]
240 pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
241 let ptr = self.as_ptr();
242
243 #[cfg(not(Py_3_12))]
244 #[allow(deprecated)]
245 {
246 let ready = ffi::PyUnicode_READY(ptr);
247 if ready != 0 {
248 // Exception was created on failure.
249 return Err(crate::PyErr::fetch(self.py()));
250 }
251 }
252
253 // The string should be in its canonical form after calling `PyUnicode_READY()`.
254 // And non-canonical form not possible after Python 3.12. So it should be safe
255 // to call these APIs.
256 let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
257 let raw_data = ffi::PyUnicode_DATA(ptr);
258 let kind = ffi::PyUnicode_KIND(ptr);
259
260 match kind {
261 ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
262 raw_data as *const u8,
263 length,
264 ))),
265 ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
266 raw_data as *const u16,
267 length,
268 ))),
269 ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
270 raw_data as *const u32,
271 length,
272 ))),
273 _ => unreachable!(),
274 }
275 }
276}
277
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Python;
    use crate::{PyObject, ToPyObject};
    #[cfg(not(Py_LIMITED_API))]
    use std::borrow::Cow;

    /// A string created from Rust round-trips through `to_str`.
    #[test]
    fn test_to_str_ascii() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let obj: PyObject = PyString::new(py, s).into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(s, py_string.to_str().unwrap());
        })
    }

    /// A lone surrogate cannot be encoded as UTF-8, so `to_str` must fail.
    #[test]
    fn test_to_str_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py.eval(r"'\ud800'", None, None).unwrap().into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert!(py_string.to_str().is_err());
        })
    }

    /// Non-ASCII text round-trips through `to_str`.
    #[test]
    fn test_to_str_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj: PyObject = PyString::new(py, s).into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(s, py_string.to_str().unwrap());
        })
    }

    /// A lone surrogate is replaced by U+FFFD characters in the lossy conversion.
    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(r"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    /// `{:?}` formatting yields the quoted, escaped form of the string.
    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s: &PyString = v.downcast(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    /// `{}` formatting yields the string contents unchanged.
    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s: &PyString = v.downcast(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    /// ASCII text is exposed as UCS1 data and converts without copying.
    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    /// UCS1 data that is not valid UTF-8 is rejected by `to_string` and
    /// patched up by `to_string_lossy`.
    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
        });
    }

    /// A string containing a lone surrogate is exposed as UCS2 data.
    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval("'foo\\ud800'", None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo�".to_string())
            );
        })
    }

    /// UCS2 data holding a lone surrogate is rejected by `to_string` and
    /// patched up by `to_string_lossy`.
    ///
    /// Little-endian only: the raw buffer below spells the code units in
    /// little-endian byte order.
    #[test]
    #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // Spelled with escapes so the expected text is unambiguous: the
            // first character is U+FF22 (FULLWIDTH LATIN CAPITAL LETTER B),
            // not ASCII 'B', followed by U+FFFD for the lone surrogate.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{ff22}\u{fffd}".into())
            );
        });
    }

    /// Text outside the Basic Multilingual Plane is exposed as UCS4 data.
    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    /// UCS4 data holding a surrogate code point is rejected by `to_string`
    /// and patched up by `to_string_lossy`.
    ///
    /// Little-endian only: the raw buffer below spells the values in
    /// little-endian byte order.
    #[test]
    #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
        });
    }

    /// Interning the same text twice yields the same object; different text
    /// yields a different object.
    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1.to_str().unwrap(), "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2.to_str().unwrap(), "foo");

            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3.to_str().unwrap(), "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }
}
483