1 | #[cfg (not(Py_LIMITED_API))] |
2 | use crate::exceptions::PyUnicodeDecodeError; |
3 | use crate::ffi_ptr_ext::FfiPtrExt; |
4 | use crate::instance::Borrowed; |
5 | use crate::py_result_ext::PyResultExt; |
6 | use crate::types::any::PyAnyMethods; |
7 | use crate::types::bytes::PyBytesMethods; |
8 | use crate::types::PyBytes; |
9 | #[allow (deprecated)] |
10 | use crate::IntoPy; |
11 | use crate::{ffi, Bound, Py, PyAny, PyResult, Python}; |
12 | use std::borrow::Cow; |
13 | use std::ffi::CString; |
14 | use std::str; |
15 | |
/// Deprecated alias for [`PyString`].
///
/// New code should name [`PyString`] directly, as stated in the deprecation note.
#[deprecated(since = "0.23.0", note = "use `PyString` instead")]
pub type PyUnicode = PyString;
19 | |
/// Represents raw data backing a Python `str`.
///
/// Python internally stores strings in various representations. This enumeration
/// represents those variations.
///
/// Every variant borrows its code units for the lifetime `'a`; the enum itself is `Copy`.
/// Only available when the limited API is not in use.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation (one `u8` per item).
    Ucs1(&'a [u8]),

    /// UCS2 representation (one `u16` per item).
    Ucs2(&'a [u16]),

    /// UCS4 representation (one `u32` per item).
    Ucs4(&'a [u32]),
}
36 | |
37 | #[cfg (not(Py_LIMITED_API))] |
38 | impl<'a> PyStringData<'a> { |
39 | /// Obtain the raw bytes backing this instance as a [u8] slice. |
40 | pub fn as_bytes(&self) -> &[u8] { |
41 | match self { |
42 | Self::Ucs1(s) => s, |
43 | Self::Ucs2(s) => unsafe { |
44 | std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes()) |
45 | }, |
46 | Self::Ucs4(s) => unsafe { |
47 | std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes()) |
48 | }, |
49 | } |
50 | } |
51 | |
52 | /// Size in bytes of each value/item in the underlying slice. |
53 | #[inline ] |
54 | pub fn value_width_bytes(&self) -> usize { |
55 | match self { |
56 | Self::Ucs1(_) => 1, |
57 | Self::Ucs2(_) => 2, |
58 | Self::Ucs4(_) => 4, |
59 | } |
60 | } |
61 | |
62 | /// Convert the raw data to a Rust string. |
63 | /// |
64 | /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4, |
65 | /// returns an owned string. |
66 | /// |
67 | /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported |
68 | /// storage format. This should only occur for strings that were created via Python |
69 | /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should |
70 | /// never occur for strings that were created from Python code. |
71 | pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> { |
72 | use std::ffi::CStr; |
73 | match self { |
74 | Self::Ucs1(data) => match str::from_utf8(data) { |
75 | Ok(s) => Ok(Cow::Borrowed(s)), |
76 | Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()), |
77 | }, |
78 | Self::Ucs2(data) => match String::from_utf16(data) { |
79 | Ok(s) => Ok(Cow::Owned(s)), |
80 | Err(e) => { |
81 | let mut message = e.to_string().as_bytes().to_vec(); |
82 | message.push(0); |
83 | |
84 | Err(PyUnicodeDecodeError::new( |
85 | py, |
86 | ffi::c_str!("utf-16" ), |
87 | self.as_bytes(), |
88 | 0..self.as_bytes().len(), |
89 | CStr::from_bytes_with_nul(&message).unwrap(), |
90 | )? |
91 | .into()) |
92 | } |
93 | }, |
94 | Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() { |
95 | Some(s) => Ok(Cow::Owned(s)), |
96 | None => Err(PyUnicodeDecodeError::new( |
97 | py, |
98 | ffi::c_str!("utf-32" ), |
99 | self.as_bytes(), |
100 | 0..self.as_bytes().len(), |
101 | ffi::c_str!("error converting utf-32" ), |
102 | )? |
103 | .into()), |
104 | }, |
105 | } |
106 | } |
107 | |
108 | /// Convert the raw data to a Rust string, possibly with data loss. |
109 | /// |
110 | /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`. |
111 | /// |
112 | /// Returns a borrow into original data, when possible, or owned data otherwise. |
113 | /// |
114 | /// The return value of this function should only disagree with [Self::to_string] |
115 | /// when that method would error. |
116 | pub fn to_string_lossy(self) -> Cow<'a, str> { |
117 | match self { |
118 | Self::Ucs1(data) => String::from_utf8_lossy(data), |
119 | Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)), |
120 | Self::Ucs4(data) => Cow::Owned( |
121 | data.iter() |
122 | .map(|&c| std::char::from_u32(c).unwrap_or(' \u{FFFD}' )) |
123 | .collect(), |
124 | ), |
125 | } |
126 | } |
127 | } |
128 | |
/// Represents a Python `str` (a Unicode string object).
///
/// Values of this type are accessed via PyO3's smart pointers, e.g. as
/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
///
/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
/// [`Bound<'py, PyString>`][Bound].
///
/// # Equality
///
/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
/// data in the Python string to a Rust UTF-8 string slice.
///
/// This is not always the most appropriate way to compare Python strings, as Python string subclasses
/// may have different equality semantics. In situations where subclasses overriding equality might be
/// relevant, use [`PyAnyMethods::eq`], at the cost of the additional overhead of a Python method call.
///
/// ```rust
/// # use pyo3::prelude::*;
/// use pyo3::types::PyString;
///
/// # Python::with_gil(|py| {
/// let py_string = PyString::new(py, "foo");
/// // via PartialEq<str>
/// assert_eq!(py_string, "foo");
///
/// // via Python equality
/// assert!(py_string.as_any().eq("foo").unwrap());
/// # });
/// ```
#[repr(transparent)]
pub struct PyString(PyAny);

// Registers `PyString` as the native type backed by `PyUnicode_Type`, type-checked with
// `PyUnicode_Check`.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
163 | |
164 | impl PyString { |
165 | /// Creates a new Python string object. |
166 | /// |
167 | /// Panics if out of memory. |
168 | pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
169 | let ptr = s.as_ptr().cast(); |
170 | let len = s.len() as ffi::Py_ssize_t; |
171 | unsafe { |
172 | ffi::PyUnicode_FromStringAndSize(ptr, len) |
173 | .assume_owned(py) |
174 | .downcast_into_unchecked() |
175 | } |
176 | } |
177 | |
178 | /// Deprecated name for [`PyString::new`]. |
179 | #[deprecated (since = "0.23.0" , note = "renamed to `PyString::new`" )] |
180 | #[inline ] |
181 | pub fn new_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
182 | Self::new(py, s) |
183 | } |
184 | |
185 | /// Intern the given string |
186 | /// |
187 | /// This will return a reference to the same Python string object if called repeatedly with the same string. |
188 | /// |
189 | /// Note that while this is more memory efficient than [`PyString::new_bound`], it unconditionally allocates a |
190 | /// temporary Python string object and is thereby slower than [`PyString::new_bound`]. |
191 | /// |
192 | /// Panics if out of memory. |
193 | pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
194 | let ptr = s.as_ptr().cast(); |
195 | let len = s.len() as ffi::Py_ssize_t; |
196 | unsafe { |
197 | let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len); |
198 | if !ob.is_null() { |
199 | ffi::PyUnicode_InternInPlace(&mut ob); |
200 | } |
201 | ob.assume_owned(py).downcast_into_unchecked() |
202 | } |
203 | } |
204 | |
205 | /// Deprecated name for [`PyString::intern`]. |
206 | #[deprecated (since = "0.23.0" , note = "renamed to `PyString::intern`" )] |
207 | #[inline ] |
208 | pub fn intern_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
209 | Self::intern(py, s) |
210 | } |
211 | |
212 | /// Attempts to create a Python string from a Python [bytes-like object]. |
213 | /// |
214 | /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object). |
215 | pub fn from_object<'py>( |
216 | src: &Bound<'py, PyAny>, |
217 | encoding: &str, |
218 | errors: &str, |
219 | ) -> PyResult<Bound<'py, PyString>> { |
220 | let encoding = CString::new(encoding)?; |
221 | let errors = CString::new(errors)?; |
222 | unsafe { |
223 | ffi::PyUnicode_FromEncodedObject( |
224 | src.as_ptr(), |
225 | encoding.as_ptr().cast(), |
226 | errors.as_ptr().cast(), |
227 | ) |
228 | .assume_owned_or_err(src.py()) |
229 | .downcast_into_unchecked() |
230 | } |
231 | } |
232 | |
233 | /// Deprecated name for [`PyString::from_object`]. |
234 | #[deprecated (since = "0.23.0" , note = "renamed to `PyString::from_object`" )] |
235 | #[inline ] |
236 | pub fn from_object_bound<'py>( |
237 | src: &Bound<'py, PyAny>, |
238 | encoding: &str, |
239 | errors: &str, |
240 | ) -> PyResult<Bound<'py, PyString>> { |
241 | Self::from_object(src, encoding, errors) |
242 | } |
243 | } |
244 | |
/// Implementation of functionality for [`PyString`].
///
/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
/// syntax these methods are separated into a trait, because stable Rust does not yet support
/// `arbitrary_self_types`.
#[doc(alias = "PyString")]
pub trait PyStringMethods<'py>: crate::sealed::Sealed {
    /// Gets the Python string as a Rust UTF-8 string slice.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    ///
    /// Only compiled when `Py_3_10` is set or the limited API is not in use.
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    fn to_str(&self) -> PyResult<&str>;

    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    fn to_cow(&self) -> PyResult<Cow<'_, str>>;

    /// Converts the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates and invalid UTF-8 sequences are
    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;

    /// Obtains the raw data backing the Python string.
    ///
    /// If the Python string object was created through legacy APIs, its internal storage format
    /// will be canonicalized before data is returned.
    ///
    /// Not compiled for the limited API, GraalPy, or PyPy.
    ///
    /// # Safety
    ///
    /// This function implementation relies on manually decoding a C bitfield. In practice, this
    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
    /// x86_64 platforms.
    ///
    /// By using this API, you accept responsibility for testing that PyStringData behaves as
    /// expected on the targets where you plan to distribute your software.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}
291 | |
292 | impl<'py> PyStringMethods<'py> for Bound<'py, PyString> { |
293 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
294 | fn to_str(&self) -> PyResult<&str> { |
295 | self.as_borrowed().to_str() |
296 | } |
297 | |
298 | fn to_cow(&self) -> PyResult<Cow<'_, str>> { |
299 | self.as_borrowed().to_cow() |
300 | } |
301 | |
302 | fn to_string_lossy(&self) -> Cow<'_, str> { |
303 | self.as_borrowed().to_string_lossy() |
304 | } |
305 | |
306 | fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> { |
307 | unsafe { |
308 | ffi::PyUnicode_AsUTF8String(self.as_ptr()) |
309 | .assume_owned_or_err(self.py()) |
310 | .downcast_into_unchecked::<PyBytes>() |
311 | } |
312 | } |
313 | |
314 | #[cfg (not(any(Py_LIMITED_API, GraalPy, PyPy)))] |
315 | unsafe fn data(&self) -> PyResult<PyStringData<'_>> { |
316 | unsafe { self.as_borrowed().data() } |
317 | } |
318 | } |
319 | |
320 | impl<'a> Borrowed<'a, '_, PyString> { |
321 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
322 | #[allow (clippy::wrong_self_convention)] |
323 | pub(crate) fn to_str(self) -> PyResult<&'a str> { |
324 | // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10. |
325 | let mut size: ffi::Py_ssize_t = 0; |
326 | let data: *const u8 = |
327 | unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() }; |
328 | if data.is_null() { |
329 | Err(crate::PyErr::fetch(self.py())) |
330 | } else { |
331 | Ok(unsafe { |
332 | std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize)) |
333 | }) |
334 | } |
335 | } |
336 | |
337 | #[allow (clippy::wrong_self_convention)] |
338 | pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> { |
339 | // TODO: this method can probably be deprecated once Python 3.9 support is dropped, |
340 | // because all versions then support the more efficient `to_str`. |
341 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
342 | { |
343 | self.to_str().map(Cow::Borrowed) |
344 | } |
345 | |
346 | #[cfg (not(any(Py_3_10, not(Py_LIMITED_API))))] |
347 | { |
348 | let bytes = self.encode_utf8()?; |
349 | Ok(Cow::Owned( |
350 | unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(), |
351 | )) |
352 | } |
353 | } |
354 | |
355 | #[allow (clippy::wrong_self_convention)] |
356 | fn to_string_lossy(self) -> Cow<'a, str> { |
357 | let ptr = self.as_ptr(); |
358 | let py = self.py(); |
359 | |
360 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
361 | if let Ok(s) = self.to_str() { |
362 | return Cow::Borrowed(s); |
363 | } |
364 | |
365 | let bytes = unsafe { |
366 | ffi::PyUnicode_AsEncodedString( |
367 | ptr, |
368 | ffi::c_str!("utf-8" ).as_ptr(), |
369 | ffi::c_str!("surrogatepass" ).as_ptr(), |
370 | ) |
371 | .assume_owned(py) |
372 | .downcast_into_unchecked::<PyBytes>() |
373 | }; |
374 | Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned()) |
375 | } |
376 | |
377 | #[cfg (not(any(Py_LIMITED_API, GraalPy, PyPy)))] |
378 | unsafe fn data(self) -> PyResult<PyStringData<'a>> { |
379 | unsafe { |
380 | let ptr = self.as_ptr(); |
381 | |
382 | #[cfg (not(Py_3_12))] |
383 | #[allow (deprecated)] |
384 | { |
385 | let ready = ffi::PyUnicode_READY(ptr); |
386 | if ready != 0 { |
387 | // Exception was created on failure. |
388 | return Err(crate::PyErr::fetch(self.py())); |
389 | } |
390 | } |
391 | |
392 | // The string should be in its canonical form after calling `PyUnicode_READY()`. |
393 | // And non-canonical form not possible after Python 3.12. So it should be safe |
394 | // to call these APIs. |
395 | let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize; |
396 | let raw_data = ffi::PyUnicode_DATA(ptr); |
397 | let kind = ffi::PyUnicode_KIND(ptr); |
398 | |
399 | match kind { |
400 | ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts( |
401 | raw_data as *const u8, |
402 | length, |
403 | ))), |
404 | ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts( |
405 | raw_data as *const u16, |
406 | length, |
407 | ))), |
408 | ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts( |
409 | raw_data as *const u32, |
410 | length, |
411 | ))), |
412 | _ => unreachable!(), |
413 | } |
414 | } |
415 | } |
416 | } |
417 | |
418 | impl Py<PyString> { |
419 | /// Gets the Python string as a Rust UTF-8 string slice. |
420 | /// |
421 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
422 | /// (containing unpaired surrogates). |
423 | /// |
424 | /// Because `str` objects are immutable, the returned slice is independent of |
425 | /// the GIL lifetime. |
426 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
427 | pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> { |
428 | self.bind_borrowed(py).to_str() |
429 | } |
430 | |
431 | /// Converts the `PyString` into a Rust string, avoiding copying when possible. |
432 | /// |
433 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
434 | /// (containing unpaired surrogates). |
435 | /// |
436 | /// Because `str` objects are immutable, the returned slice is independent of |
437 | /// the GIL lifetime. |
438 | pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> { |
439 | self.bind_borrowed(py).to_cow() |
440 | } |
441 | |
442 | /// Converts the `PyString` into a Rust string. |
443 | /// |
444 | /// Unpaired surrogates invalid UTF-8 sequences are |
445 | /// replaced with `U+FFFD REPLACEMENT CHARACTER`. |
446 | /// |
447 | /// Because `str` objects are immutable, the returned slice is independent of |
448 | /// the GIL lifetime. |
449 | pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> { |
450 | self.bind_borrowed(py).to_string_lossy() |
451 | } |
452 | } |
453 | |
454 | #[allow (deprecated)] |
455 | impl IntoPy<Py<PyString>> for Bound<'_, PyString> { |
456 | fn into_py(self, _py: Python<'_>) -> Py<PyString> { |
457 | self.unbind() |
458 | } |
459 | } |
460 | |
461 | #[allow (deprecated)] |
462 | impl IntoPy<Py<PyString>> for &Bound<'_, PyString> { |
463 | fn into_py(self, _py: Python<'_>) -> Py<PyString> { |
464 | self.clone().unbind() |
465 | } |
466 | } |
467 | |
468 | #[allow (deprecated)] |
469 | impl IntoPy<Py<PyString>> for &'_ Py<PyString> { |
470 | fn into_py(self, py: Python<'_>) -> Py<PyString> { |
471 | self.clone_ref(py) |
472 | } |
473 | } |
474 | |
475 | /// Compares whether the data in the Python string is equal to the given UTF8. |
476 | /// |
477 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
478 | impl PartialEq<str> for Bound<'_, PyString> { |
479 | #[inline ] |
480 | fn eq(&self, other: &str) -> bool { |
481 | self.as_borrowed() == *other |
482 | } |
483 | } |
484 | |
485 | /// Compares whether the data in the Python string is equal to the given UTF8. |
486 | /// |
487 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
488 | impl PartialEq<&'_ str> for Bound<'_, PyString> { |
489 | #[inline ] |
490 | fn eq(&self, other: &&str) -> bool { |
491 | self.as_borrowed() == **other |
492 | } |
493 | } |
494 | |
495 | /// Compares whether the data in the Python string is equal to the given UTF8. |
496 | /// |
497 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
498 | impl PartialEq<Bound<'_, PyString>> for str { |
499 | #[inline ] |
500 | fn eq(&self, other: &Bound<'_, PyString>) -> bool { |
501 | *self == other.as_borrowed() |
502 | } |
503 | } |
504 | |
505 | /// Compares whether the data in the Python string is equal to the given UTF8. |
506 | /// |
507 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
508 | impl PartialEq<&'_ Bound<'_, PyString>> for str { |
509 | #[inline ] |
510 | fn eq(&self, other: &&Bound<'_, PyString>) -> bool { |
511 | *self == other.as_borrowed() |
512 | } |
513 | } |
514 | |
515 | /// Compares whether the data in the Python string is equal to the given UTF8. |
516 | /// |
517 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
518 | impl PartialEq<Bound<'_, PyString>> for &'_ str { |
519 | #[inline ] |
520 | fn eq(&self, other: &Bound<'_, PyString>) -> bool { |
521 | **self == other.as_borrowed() |
522 | } |
523 | } |
524 | |
525 | /// Compares whether the data in the Python string is equal to the given UTF8. |
526 | /// |
527 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
528 | impl PartialEq<str> for &'_ Bound<'_, PyString> { |
529 | #[inline ] |
530 | fn eq(&self, other: &str) -> bool { |
531 | self.as_borrowed() == other |
532 | } |
533 | } |
534 | |
535 | /// Compares whether the data in the Python string is equal to the given UTF8. |
536 | /// |
537 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
538 | impl PartialEq<str> for Borrowed<'_, '_, PyString> { |
539 | #[inline ] |
540 | fn eq(&self, other: &str) -> bool { |
541 | #[cfg (not(Py_3_13))] |
542 | { |
543 | self.to_cow().map_or(default:false, |s: Cow<'_, str>| s == other) |
544 | } |
545 | |
546 | #[cfg (Py_3_13)] |
547 | unsafe { |
548 | ffi::PyUnicode_EqualToUTF8AndSize( |
549 | self.as_ptr(), |
550 | other.as_ptr().cast(), |
551 | other.len() as _, |
552 | ) == 1 |
553 | } |
554 | } |
555 | } |
556 | |
557 | /// Compares whether the data in the Python string is equal to the given UTF8. |
558 | /// |
559 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
560 | impl PartialEq<&str> for Borrowed<'_, '_, PyString> { |
561 | #[inline ] |
562 | fn eq(&self, other: &&str) -> bool { |
563 | *self == **other |
564 | } |
565 | } |
566 | |
567 | /// Compares whether the data in the Python string is equal to the given UTF8. |
568 | /// |
569 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
570 | impl PartialEq<Borrowed<'_, '_, PyString>> for str { |
571 | #[inline ] |
572 | fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool { |
573 | other == self |
574 | } |
575 | } |
576 | |
577 | /// Compares whether the data in the Python string is equal to the given UTF8. |
578 | /// |
579 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
580 | impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str { |
581 | #[inline ] |
582 | fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool { |
583 | other == self |
584 | } |
585 | } |
586 | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{IntoPyObject, PyObject};

    // `to_cow` round-trips text that mixes ASCII with a non-BMP character.
    #[test]
    fn test_to_cow_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    // A lone surrogate cannot be represented as a Rust string, so `to_cow` must fail.
    #[test]
    fn test_to_cow_surrogate() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    #[test]
    fn test_to_cow_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_encode_utf8_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    #[test]
    fn test_encode_utf8_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .into();
            assert!(obj
                .bind(py)
                .downcast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    // The lone surrogate is emitted as three bytes by `surrogatepass`, each of which becomes
    // one replacement character in the lossy conversion.
    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();

            assert_eq!(
                py_string.to_string_lossy(),
                "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
            );
        })
    }

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    // With the "ignore" error handler the invalid byte 0xFF is dropped while decoding.
    #[test]
    fn test_string_from_object() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");
        });
    }

    // Interior NUL bytes in either argument must surface as an error.
    #[test]
    fn test_string_from_object_with_invalid_encoding_errors() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore");
            assert!(result.is_err());

            let result = PyString::from_object(&py_bytes, "utf-8", "ign\0ore");
            assert!(result.is_err());
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f\u{FFFD}"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval(ffi::c_str!("'foo\\ud800'"), None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo\u{FFFD}".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(
        not(any(Py_LIMITED_API, GraalPy, PyPy)),
        target_endian = "little"
    ))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid), little-endian, plus a terminator.
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{FF22}\u{FFFD}".into())
            );
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(
        not(any(Py_LIMITED_API, GraalPy, PyPy)),
        target_endian = "little"
    ))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid), little-endian, plus a terminator.
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{20000}\u{FFFD}".into())
            );
        });
    }

    // Interning the same text twice must yield the same object; different text must not.
    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    #[test]
    fn test_py_to_str_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(
                py_string.to_string_lossy(py),
                "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
            );
        })
    }

    // Exercises every `PartialEq` combination between `str`/`&str` and the smart pointers.
    #[test]
    fn test_comparisons() {
        Python::with_gil(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}
910 | |