1 | #[cfg (not(Py_LIMITED_API))] |
2 | use crate::exceptions::PyUnicodeDecodeError; |
3 | use crate::types::PyBytes; |
4 | use crate::{ffi, PyAny, PyResult, Python}; |
5 | use std::borrow::Cow; |
6 | use std::os::raw::c_char; |
7 | use std::str; |
8 | |
/// Represents raw data backing a Python `str`.
///
/// Python internally stores strings in various representations. This enumeration
/// represents those variations.
///
/// Every variant only borrows its storage (for the lifetime `'a`); the enum itself is
/// `Copy`, so passing it around never copies the underlying string data.
///
/// This type is only compiled when the `Py_LIMITED_API` cfg is not set.
#[cfg (not(Py_LIMITED_API))]
#[derive (Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation: each stored value is a single `u8`.
    Ucs1(&'a [u8]),

    /// UCS2 representation: each stored value is a `u16`.
    Ucs2(&'a [u16]),

    /// UCS4 representation: each stored value is a `u32`.
    Ucs4(&'a [u32]),
}
25 | |
26 | #[cfg (not(Py_LIMITED_API))] |
27 | impl<'a> PyStringData<'a> { |
28 | /// Obtain the raw bytes backing this instance as a [u8] slice. |
29 | pub fn as_bytes(&self) -> &[u8] { |
30 | match self { |
31 | Self::Ucs1(s) => s, |
32 | Self::Ucs2(s) => unsafe { |
33 | std::slice::from_raw_parts( |
34 | s.as_ptr() as *const u8, |
35 | s.len() * self.value_width_bytes(), |
36 | ) |
37 | }, |
38 | Self::Ucs4(s) => unsafe { |
39 | std::slice::from_raw_parts( |
40 | s.as_ptr() as *const u8, |
41 | s.len() * self.value_width_bytes(), |
42 | ) |
43 | }, |
44 | } |
45 | } |
46 | |
47 | /// Size in bytes of each value/item in the underlying slice. |
48 | #[inline ] |
49 | pub fn value_width_bytes(&self) -> usize { |
50 | match self { |
51 | Self::Ucs1(_) => 1, |
52 | Self::Ucs2(_) => 2, |
53 | Self::Ucs4(_) => 4, |
54 | } |
55 | } |
56 | |
57 | /// Convert the raw data to a Rust string. |
58 | /// |
59 | /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4, |
60 | /// returns an owned string. |
61 | /// |
62 | /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported |
63 | /// storage format. This should only occur for strings that were created via Python |
64 | /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should |
65 | /// never occur for strings that were created from Python code. |
66 | pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> { |
67 | use std::ffi::CStr; |
68 | match self { |
69 | Self::Ucs1(data) => match str::from_utf8(data) { |
70 | Ok(s) => Ok(Cow::Borrowed(s)), |
71 | Err(e) => Err(crate::PyErr::from_value(PyUnicodeDecodeError::new_utf8( |
72 | py, data, e, |
73 | )?)), |
74 | }, |
75 | Self::Ucs2(data) => match String::from_utf16(data) { |
76 | Ok(s) => Ok(Cow::Owned(s)), |
77 | Err(e) => { |
78 | let mut message = e.to_string().as_bytes().to_vec(); |
79 | message.push(0); |
80 | |
81 | Err(crate::PyErr::from_value(PyUnicodeDecodeError::new( |
82 | py, |
83 | CStr::from_bytes_with_nul(b"utf-16 \0" ).unwrap(), |
84 | self.as_bytes(), |
85 | 0..self.as_bytes().len(), |
86 | CStr::from_bytes_with_nul(&message).unwrap(), |
87 | )?)) |
88 | } |
89 | }, |
90 | Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() { |
91 | Some(s) => Ok(Cow::Owned(s)), |
92 | None => Err(crate::PyErr::from_value(PyUnicodeDecodeError::new( |
93 | py, |
94 | CStr::from_bytes_with_nul(b"utf-32 \0" ).unwrap(), |
95 | self.as_bytes(), |
96 | 0..self.as_bytes().len(), |
97 | CStr::from_bytes_with_nul(b"error converting utf-32 \0" ).unwrap(), |
98 | )?)), |
99 | }, |
100 | } |
101 | } |
102 | |
103 | /// Convert the raw data to a Rust string, possibly with data loss. |
104 | /// |
105 | /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`. |
106 | /// |
107 | /// Returns a borrow into original data, when possible, or owned data otherwise. |
108 | /// |
109 | /// The return value of this function should only disagree with [Self::to_string] |
110 | /// when that method would error. |
111 | pub fn to_string_lossy(self) -> Cow<'a, str> { |
112 | match self { |
113 | Self::Ucs1(data) => String::from_utf8_lossy(data), |
114 | Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)), |
115 | Self::Ucs4(data) => Cow::Owned( |
116 | data.iter() |
117 | .map(|&c| std::char::from_u32(c).unwrap_or(' \u{FFFD}' )) |
118 | .collect(), |
119 | ), |
120 | } |
121 | } |
122 | } |
123 | |
/// Represents a Python `str` (a Unicode string object).
///
/// This type is immutable.
///
/// It is a `#[repr(transparent)]` newtype around [`PyAny`], so it has the same memory
/// layout as the wrapped `PyAny`.
#[repr (transparent)]
pub struct PyString(PyAny);

// Registers `PyString` as a native type backed by `ffi::PyUnicode_Type`, using
// `ffi::PyUnicode_Check` as its type-check function.
// NOTE(review): the exact set of items this macro generates is defined elsewhere in the
// crate — confirm against the macro definition.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
131 | |
132 | impl PyString { |
133 | /// Creates a new Python string object. |
134 | /// |
135 | /// Panics if out of memory. |
136 | pub fn new<'p>(py: Python<'p>, s: &str) -> &'p PyString { |
137 | let ptr = s.as_ptr() as *const c_char; |
138 | let len = s.len() as ffi::Py_ssize_t; |
139 | unsafe { py.from_owned_ptr(ffi::PyUnicode_FromStringAndSize(ptr, len)) } |
140 | } |
141 | |
142 | /// Intern the given string |
143 | /// |
144 | /// This will return a reference to the same Python string object if called repeatedly with the same string. |
145 | /// |
146 | /// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a |
147 | /// temporary Python string object and is thereby slower than [`PyString::new`]. |
148 | /// |
149 | /// Panics if out of memory. |
150 | pub fn intern<'p>(py: Python<'p>, s: &str) -> &'p PyString { |
151 | let ptr = s.as_ptr() as *const c_char; |
152 | let len = s.len() as ffi::Py_ssize_t; |
153 | unsafe { |
154 | let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len); |
155 | if !ob.is_null() { |
156 | ffi::PyUnicode_InternInPlace(&mut ob); |
157 | } |
158 | py.from_owned_ptr(ob) |
159 | } |
160 | } |
161 | |
162 | /// Attempts to create a Python string from a Python [bytes-like object]. |
163 | /// |
164 | /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object). |
165 | pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> { |
166 | unsafe { |
167 | src.py() |
168 | .from_owned_ptr_or_err::<PyString>(ffi::PyUnicode_FromEncodedObject( |
169 | src.as_ptr(), |
170 | encoding.as_ptr() as *const c_char, |
171 | errors.as_ptr() as *const c_char, |
172 | )) |
173 | } |
174 | } |
175 | |
176 | /// Gets the Python string as a byte slice. |
177 | /// |
178 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
179 | /// (containing unpaired surrogates). |
180 | #[inline ] |
181 | pub fn to_str(&self) -> PyResult<&str> { |
182 | let utf8_slice = { |
183 | cfg_if::cfg_if! { |
184 | if #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] { |
185 | // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10. |
186 | let mut size: ffi::Py_ssize_t = 0; |
187 | let data: *const u8 = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() }; |
188 | if data.is_null() { |
189 | return Err(crate::PyErr::fetch(self.py())); |
190 | } else { |
191 | unsafe { std::slice::from_raw_parts(data, size as usize) } |
192 | } |
193 | } else { |
194 | let bytes = unsafe { |
195 | self.py().from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))? |
196 | }; |
197 | bytes.as_bytes() |
198 | } |
199 | } |
200 | }; |
201 | Ok(unsafe { std::str::from_utf8_unchecked(utf8_slice) }) |
202 | } |
203 | |
204 | /// Converts the `PyString` into a Rust string. |
205 | /// |
206 | /// Unpaired surrogates invalid UTF-8 sequences are |
207 | /// replaced with `U+FFFD REPLACEMENT CHARACTER`. |
208 | pub fn to_string_lossy(&self) -> Cow<'_, str> { |
209 | match self.to_str() { |
210 | Ok(s) => Cow::Borrowed(s), |
211 | Err(_) => { |
212 | let bytes = unsafe { |
213 | self.py() |
214 | .from_owned_ptr::<PyBytes>(ffi::PyUnicode_AsEncodedString( |
215 | self.as_ptr(), |
216 | b"utf-8 \0" as *const _ as _, |
217 | b"surrogatepass \0" as *const _ as _, |
218 | )) |
219 | }; |
220 | String::from_utf8_lossy(bytes.as_bytes()) |
221 | } |
222 | } |
223 | } |
224 | |
225 | /// Obtains the raw data backing the Python string. |
226 | /// |
227 | /// If the Python string object was created through legacy APIs, its internal storage format |
228 | /// will be canonicalized before data is returned. |
229 | /// |
230 | /// # Safety |
231 | /// |
232 | /// This function implementation relies on manually decoding a C bitfield. In practice, this |
233 | /// works well on common little-endian architectures such as x86_64, where the bitfield has a |
234 | /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on |
235 | /// x86_64 platforms. |
236 | /// |
237 | /// By using this API, you accept responsibility for testing that PyStringData behaves as |
238 | /// expected on the targets where you plan to distribute your software. |
239 | #[cfg (not(Py_LIMITED_API))] |
240 | pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> { |
241 | let ptr = self.as_ptr(); |
242 | |
243 | #[cfg (not(Py_3_12))] |
244 | #[allow (deprecated)] |
245 | { |
246 | let ready = ffi::PyUnicode_READY(ptr); |
247 | if ready != 0 { |
248 | // Exception was created on failure. |
249 | return Err(crate::PyErr::fetch(self.py())); |
250 | } |
251 | } |
252 | |
253 | // The string should be in its canonical form after calling `PyUnicode_READY()`. |
254 | // And non-canonical form not possible after Python 3.12. So it should be safe |
255 | // to call these APIs. |
256 | let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize; |
257 | let raw_data = ffi::PyUnicode_DATA(ptr); |
258 | let kind = ffi::PyUnicode_KIND(ptr); |
259 | |
260 | match kind { |
261 | ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts( |
262 | raw_data as *const u8, |
263 | length, |
264 | ))), |
265 | ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts( |
266 | raw_data as *const u16, |
267 | length, |
268 | ))), |
269 | ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts( |
270 | raw_data as *const u32, |
271 | length, |
272 | ))), |
273 | _ => unreachable!(), |
274 | } |
275 | } |
276 | } |
277 | |
/// Unit tests for `PyString` and `PyStringData`. All tests need an embedded Python
/// interpreter and acquire the GIL via `Python::with_gil`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Python;
    use crate::{PyObject, ToPyObject};
    #[cfg(not(Py_LIMITED_API))]
    use std::borrow::Cow;

    /// Round-trips a mostly-ASCII string (with one emoji) through `to_str`.
    #[test]
    fn test_to_str_ascii() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let obj: PyObject = PyString::new(py, s).into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(s, py_string.to_str().unwrap());
        })
    }

    /// A lone surrogate cannot be represented as UTF-8, so `to_str` must fail.
    #[test]
    fn test_to_str_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py.eval(r"'\ud800'", None, None).unwrap().into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert!(py_string.to_str().is_err());
        })
    }

    /// Round-trips a non-ASCII string through `to_str`.
    #[test]
    fn test_to_str_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj: PyObject = PyString::new(py, s).into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(s, py_string.to_str().unwrap());
        })
    }

    /// The lone surrogate is encoded as three bytes under `surrogatepass`; each invalid
    /// byte becomes one U+FFFD in the lossy result.
    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(r"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(
                py_string.to_string_lossy(),
                "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
            );
        })
    }

    /// `Debug` formatting shows the Python `repr` of the string.
    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s: &PyString = v.downcast(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    /// `Display` formatting shows the string contents unescaped.
    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s: &PyString = v.downcast(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f\u{FFFD}"));
        });
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval("'foo\\ud800'", None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo\u{FFFD}".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // U+FF22 is kept; the lone surrogate becomes U+FFFD.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{FF22}\u{FFFD}".into())
            );
        });
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            // U+20000 is kept; the surrogate value becomes U+FFFD.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{20000}\u{FFFD}".into())
            );
        });
    }

    /// Interning the same text twice yields the same object; different text does not.
    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1.to_str().unwrap(), "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2.to_str().unwrap(), "foo");

            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3.to_str().unwrap(), "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }
}
483 | |