1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::any::PyAnyMethods;
7use crate::types::bytes::PyBytesMethods;
8use crate::types::PyBytes;
9#[allow(deprecated)]
10use crate::IntoPy;
11use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
12use std::borrow::Cow;
13use std::ffi::CString;
14use std::str;
15
/// Deprecated alias for [`PyString`].
///
/// The alias resolves to exactly the same type; as the deprecation note says, use
/// [`PyString`] instead.
#[deprecated(since = "0.23.0", note = "use `PyString` instead")]
pub type PyUnicode = PyString;
19
/// Represents raw data backing a Python `str`.
///
/// Python internally stores strings in various representations. This enumeration
/// represents those variations.
///
/// Every variant borrows a slice of fixed-width items for the lifetime `'a`; the item
/// width (1, 2 or 4 bytes) is what distinguishes the variants.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation: each item is a single byte (`u8`).
    Ucs1(&'a [u8]),

    /// UCS2 representation: each item is a 16-bit value (`u16`).
    Ucs2(&'a [u16]),

    /// UCS4 representation: each item is a 32-bit value (`u32`).
    Ucs4(&'a [u32]),
}
36
37#[cfg(not(Py_LIMITED_API))]
38impl<'a> PyStringData<'a> {
39 /// Obtain the raw bytes backing this instance as a [u8] slice.
40 pub fn as_bytes(&self) -> &[u8] {
41 match self {
42 Self::Ucs1(s) => s,
43 Self::Ucs2(s) => unsafe {
44 std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
45 },
46 Self::Ucs4(s) => unsafe {
47 std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
48 },
49 }
50 }
51
52 /// Size in bytes of each value/item in the underlying slice.
53 #[inline]
54 pub fn value_width_bytes(&self) -> usize {
55 match self {
56 Self::Ucs1(_) => 1,
57 Self::Ucs2(_) => 2,
58 Self::Ucs4(_) => 4,
59 }
60 }
61
62 /// Convert the raw data to a Rust string.
63 ///
64 /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
65 /// returns an owned string.
66 ///
67 /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
68 /// storage format. This should only occur for strings that were created via Python
69 /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
70 /// never occur for strings that were created from Python code.
71 pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
72 use std::ffi::CStr;
73 match self {
74 Self::Ucs1(data) => match str::from_utf8(data) {
75 Ok(s) => Ok(Cow::Borrowed(s)),
76 Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()),
77 },
78 Self::Ucs2(data) => match String::from_utf16(data) {
79 Ok(s) => Ok(Cow::Owned(s)),
80 Err(e) => {
81 let mut message = e.to_string().as_bytes().to_vec();
82 message.push(0);
83
84 Err(PyUnicodeDecodeError::new(
85 py,
86 ffi::c_str!("utf-16"),
87 self.as_bytes(),
88 0..self.as_bytes().len(),
89 CStr::from_bytes_with_nul(&message).unwrap(),
90 )?
91 .into())
92 }
93 },
94 Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
95 Some(s) => Ok(Cow::Owned(s)),
96 None => Err(PyUnicodeDecodeError::new(
97 py,
98 ffi::c_str!("utf-32"),
99 self.as_bytes(),
100 0..self.as_bytes().len(),
101 ffi::c_str!("error converting utf-32"),
102 )?
103 .into()),
104 },
105 }
106 }
107
108 /// Convert the raw data to a Rust string, possibly with data loss.
109 ///
110 /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
111 ///
112 /// Returns a borrow into original data, when possible, or owned data otherwise.
113 ///
114 /// The return value of this function should only disagree with [Self::to_string]
115 /// when that method would error.
116 pub fn to_string_lossy(self) -> Cow<'a, str> {
117 match self {
118 Self::Ucs1(data) => String::from_utf8_lossy(data),
119 Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
120 Self::Ucs4(data) => Cow::Owned(
121 data.iter()
122 .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
123 .collect(),
124 ),
125 }
126 }
127}
128
/// Represents a Python `str` (a Unicode string object).
///
/// Values of this type are accessed via PyO3's smart pointers, e.g. as
/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
///
/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
/// [`Bound<'py, PyString>`][Bound].
///
/// # Equality
///
/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
/// data in the Python string to a Rust UTF-8 string slice.
///
/// This is not always the most appropriate way to compare Python strings, as Python string subclasses
/// may have different equality semantics. In situations where subclasses overriding equality might be
/// relevant, use [`PyAnyMethods::eq`], at cost of the additional overhead of a Python method call.
///
/// ```rust
/// # use pyo3::prelude::*;
/// use pyo3::types::PyString;
///
/// # Python::with_gil(|py| {
/// let py_string = PyString::new(py, "foo");
/// // via PartialEq<str>
/// assert_eq!(py_string, "foo");
///
/// // via Python equality
/// assert!(py_string.as_any().eq("foo").unwrap());
/// # });
/// ```
#[repr(transparent)]
pub struct PyString(PyAny);

// Ties `PyString` to the `ffi::PyUnicode_Type` type object, with `ffi::PyUnicode_Check` supplied
// as the check function. The macros themselves are defined elsewhere in the crate.
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
163
164impl PyString {
165 /// Creates a new Python string object.
166 ///
167 /// Panics if out of memory.
168 pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
169 let ptr = s.as_ptr().cast();
170 let len = s.len() as ffi::Py_ssize_t;
171 unsafe {
172 ffi::PyUnicode_FromStringAndSize(ptr, len)
173 .assume_owned(py)
174 .downcast_into_unchecked()
175 }
176 }
177
178 /// Deprecated name for [`PyString::new`].
179 #[deprecated(since = "0.23.0", note = "renamed to `PyString::new`")]
180 #[inline]
181 pub fn new_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
182 Self::new(py, s)
183 }
184
185 /// Intern the given string
186 ///
187 /// This will return a reference to the same Python string object if called repeatedly with the same string.
188 ///
189 /// Note that while this is more memory efficient than [`PyString::new_bound`], it unconditionally allocates a
190 /// temporary Python string object and is thereby slower than [`PyString::new_bound`].
191 ///
192 /// Panics if out of memory.
193 pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
194 let ptr = s.as_ptr().cast();
195 let len = s.len() as ffi::Py_ssize_t;
196 unsafe {
197 let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
198 if !ob.is_null() {
199 ffi::PyUnicode_InternInPlace(&mut ob);
200 }
201 ob.assume_owned(py).downcast_into_unchecked()
202 }
203 }
204
205 /// Deprecated name for [`PyString::intern`].
206 #[deprecated(since = "0.23.0", note = "renamed to `PyString::intern`")]
207 #[inline]
208 pub fn intern_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
209 Self::intern(py, s)
210 }
211
212 /// Attempts to create a Python string from a Python [bytes-like object].
213 ///
214 /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
215 pub fn from_object<'py>(
216 src: &Bound<'py, PyAny>,
217 encoding: &str,
218 errors: &str,
219 ) -> PyResult<Bound<'py, PyString>> {
220 let encoding = CString::new(encoding)?;
221 let errors = CString::new(errors)?;
222 unsafe {
223 ffi::PyUnicode_FromEncodedObject(
224 src.as_ptr(),
225 encoding.as_ptr().cast(),
226 errors.as_ptr().cast(),
227 )
228 .assume_owned_or_err(src.py())
229 .downcast_into_unchecked()
230 }
231 }
232
233 /// Deprecated name for [`PyString::from_object`].
234 #[deprecated(since = "0.23.0", note = "renamed to `PyString::from_object`")]
235 #[inline]
236 pub fn from_object_bound<'py>(
237 src: &Bound<'py, PyAny>,
238 encoding: &str,
239 errors: &str,
240 ) -> PyResult<Bound<'py, PyString>> {
241 Self::from_object(src, encoding, errors)
242 }
243}
244
/// Implementation of functionality for [`PyString`].
///
/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
/// syntax these methods are separated into a trait, because stable Rust does not yet support
/// `arbitrary_self_types`.
///
/// The trait is sealed (it requires `crate::sealed::Sealed`), so it cannot be implemented
/// outside this crate.
#[doc(alias = "PyString")]
pub trait PyStringMethods<'py>: crate::sealed::Sealed {
    /// Gets the Python string as a Rust UTF-8 string slice.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    ///
    /// Only available when not building against the limited API, or when targeting
    /// Python 3.10 or newer.
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    fn to_str(&self) -> PyResult<&str>;

    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
    ///
    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    fn to_cow(&self) -> PyResult<Cow<'_, str>>;

    /// Converts the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates, which cannot be represented as valid UTF-8, are
    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;

    /// Obtains the raw data backing the Python string.
    ///
    /// If the Python string object was created through legacy APIs, its internal storage format
    /// will be canonicalized before data is returned.
    ///
    /// # Safety
    ///
    /// This function implementation relies on manually decoding a C bitfield. In practice, this
    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
    /// x86_64 platforms.
    ///
    /// By using this API, you accept responsibility for testing that PyStringData behaves as
    /// expected on the targets where you plan to distribute your software.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}
291
292impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
293 #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
294 fn to_str(&self) -> PyResult<&str> {
295 self.as_borrowed().to_str()
296 }
297
298 fn to_cow(&self) -> PyResult<Cow<'_, str>> {
299 self.as_borrowed().to_cow()
300 }
301
302 fn to_string_lossy(&self) -> Cow<'_, str> {
303 self.as_borrowed().to_string_lossy()
304 }
305
306 fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
307 unsafe {
308 ffi::PyUnicode_AsUTF8String(self.as_ptr())
309 .assume_owned_or_err(self.py())
310 .downcast_into_unchecked::<PyBytes>()
311 }
312 }
313
314 #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
315 unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
316 unsafe { self.as_borrowed().data() }
317 }
318}
319
320impl<'a> Borrowed<'a, '_, PyString> {
321 #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
322 #[allow(clippy::wrong_self_convention)]
323 pub(crate) fn to_str(self) -> PyResult<&'a str> {
324 // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
325 let mut size: ffi::Py_ssize_t = 0;
326 let data: *const u8 =
327 unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
328 if data.is_null() {
329 Err(crate::PyErr::fetch(self.py()))
330 } else {
331 Ok(unsafe {
332 std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
333 })
334 }
335 }
336
337 #[allow(clippy::wrong_self_convention)]
338 pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
339 // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
340 // because all versions then support the more efficient `to_str`.
341 #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
342 {
343 self.to_str().map(Cow::Borrowed)
344 }
345
346 #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
347 {
348 let bytes = self.encode_utf8()?;
349 Ok(Cow::Owned(
350 unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
351 ))
352 }
353 }
354
355 #[allow(clippy::wrong_self_convention)]
356 fn to_string_lossy(self) -> Cow<'a, str> {
357 let ptr = self.as_ptr();
358 let py = self.py();
359
360 #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
361 if let Ok(s) = self.to_str() {
362 return Cow::Borrowed(s);
363 }
364
365 let bytes = unsafe {
366 ffi::PyUnicode_AsEncodedString(
367 ptr,
368 ffi::c_str!("utf-8").as_ptr(),
369 ffi::c_str!("surrogatepass").as_ptr(),
370 )
371 .assume_owned(py)
372 .downcast_into_unchecked::<PyBytes>()
373 };
374 Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
375 }
376
377 #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
378 unsafe fn data(self) -> PyResult<PyStringData<'a>> {
379 unsafe {
380 let ptr = self.as_ptr();
381
382 #[cfg(not(Py_3_12))]
383 #[allow(deprecated)]
384 {
385 let ready = ffi::PyUnicode_READY(ptr);
386 if ready != 0 {
387 // Exception was created on failure.
388 return Err(crate::PyErr::fetch(self.py()));
389 }
390 }
391
392 // The string should be in its canonical form after calling `PyUnicode_READY()`.
393 // And non-canonical form not possible after Python 3.12. So it should be safe
394 // to call these APIs.
395 let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
396 let raw_data = ffi::PyUnicode_DATA(ptr);
397 let kind = ffi::PyUnicode_KIND(ptr);
398
399 match kind {
400 ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
401 raw_data as *const u8,
402 length,
403 ))),
404 ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
405 raw_data as *const u16,
406 length,
407 ))),
408 ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
409 raw_data as *const u32,
410 length,
411 ))),
412 _ => unreachable!(),
413 }
414 }
415 }
416}
417
418impl Py<PyString> {
419 /// Gets the Python string as a Rust UTF-8 string slice.
420 ///
421 /// Returns a `UnicodeEncodeError` if the input is not valid unicode
422 /// (containing unpaired surrogates).
423 ///
424 /// Because `str` objects are immutable, the returned slice is independent of
425 /// the GIL lifetime.
426 #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
427 pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
428 self.bind_borrowed(py).to_str()
429 }
430
431 /// Converts the `PyString` into a Rust string, avoiding copying when possible.
432 ///
433 /// Returns a `UnicodeEncodeError` if the input is not valid unicode
434 /// (containing unpaired surrogates).
435 ///
436 /// Because `str` objects are immutable, the returned slice is independent of
437 /// the GIL lifetime.
438 pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
439 self.bind_borrowed(py).to_cow()
440 }
441
442 /// Converts the `PyString` into a Rust string.
443 ///
444 /// Unpaired surrogates invalid UTF-8 sequences are
445 /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
446 ///
447 /// Because `str` objects are immutable, the returned slice is independent of
448 /// the GIL lifetime.
449 pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
450 self.bind_borrowed(py).to_string_lossy()
451 }
452}
453
454#[allow(deprecated)]
455impl IntoPy<Py<PyString>> for Bound<'_, PyString> {
456 fn into_py(self, _py: Python<'_>) -> Py<PyString> {
457 self.unbind()
458 }
459}
460
461#[allow(deprecated)]
462impl IntoPy<Py<PyString>> for &Bound<'_, PyString> {
463 fn into_py(self, _py: Python<'_>) -> Py<PyString> {
464 self.clone().unbind()
465 }
466}
467
468#[allow(deprecated)]
469impl IntoPy<Py<PyString>> for &'_ Py<PyString> {
470 fn into_py(self, py: Python<'_>) -> Py<PyString> {
471 self.clone_ref(py)
472 }
473}
474
475/// Compares whether the data in the Python string is equal to the given UTF8.
476///
477/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
478impl PartialEq<str> for Bound<'_, PyString> {
479 #[inline]
480 fn eq(&self, other: &str) -> bool {
481 self.as_borrowed() == *other
482 }
483}
484
485/// Compares whether the data in the Python string is equal to the given UTF8.
486///
487/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
488impl PartialEq<&'_ str> for Bound<'_, PyString> {
489 #[inline]
490 fn eq(&self, other: &&str) -> bool {
491 self.as_borrowed() == **other
492 }
493}
494
495/// Compares whether the data in the Python string is equal to the given UTF8.
496///
497/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
498impl PartialEq<Bound<'_, PyString>> for str {
499 #[inline]
500 fn eq(&self, other: &Bound<'_, PyString>) -> bool {
501 *self == other.as_borrowed()
502 }
503}
504
505/// Compares whether the data in the Python string is equal to the given UTF8.
506///
507/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
508impl PartialEq<&'_ Bound<'_, PyString>> for str {
509 #[inline]
510 fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
511 *self == other.as_borrowed()
512 }
513}
514
515/// Compares whether the data in the Python string is equal to the given UTF8.
516///
517/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
518impl PartialEq<Bound<'_, PyString>> for &'_ str {
519 #[inline]
520 fn eq(&self, other: &Bound<'_, PyString>) -> bool {
521 **self == other.as_borrowed()
522 }
523}
524
525/// Compares whether the data in the Python string is equal to the given UTF8.
526///
527/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
528impl PartialEq<str> for &'_ Bound<'_, PyString> {
529 #[inline]
530 fn eq(&self, other: &str) -> bool {
531 self.as_borrowed() == other
532 }
533}
534
535/// Compares whether the data in the Python string is equal to the given UTF8.
536///
537/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
538impl PartialEq<str> for Borrowed<'_, '_, PyString> {
539 #[inline]
540 fn eq(&self, other: &str) -> bool {
541 #[cfg(not(Py_3_13))]
542 {
543 self.to_cow().map_or(default:false, |s: Cow<'_, str>| s == other)
544 }
545
546 #[cfg(Py_3_13)]
547 unsafe {
548 ffi::PyUnicode_EqualToUTF8AndSize(
549 self.as_ptr(),
550 other.as_ptr().cast(),
551 other.len() as _,
552 ) == 1
553 }
554 }
555}
556
557/// Compares whether the data in the Python string is equal to the given UTF8.
558///
559/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
560impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
561 #[inline]
562 fn eq(&self, other: &&str) -> bool {
563 *self == **other
564 }
565}
566
567/// Compares whether the data in the Python string is equal to the given UTF8.
568///
569/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
570impl PartialEq<Borrowed<'_, '_, PyString>> for str {
571 #[inline]
572 fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
573 other == self
574 }
575}
576
577/// Compares whether the data in the Python string is equal to the given UTF8.
578///
579/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
580impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
581 #[inline]
582 fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
583 other == self
584 }
585}
586
#[cfg(test)]
mod tests {
    //! Unit tests for `PyString`, `PyStringData` and the string comparison impls.
    //!
    //! Tests that call `data()` carry the same platform gate as `data()` itself
    //! (no limited API, no PyPy, no GraalPy) so the module compiles on every target.

    use super::*;
    use crate::{IntoPyObject, PyObject};

    #[test]
    fn test_to_cow_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_to_cow_surrogate() {
        Python::with_gil(|py| {
            // A lone surrogate cannot be encoded as UTF-8, so the conversion must fail.
            let py_string = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    #[test]
    fn test_to_cow_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_encode_utf8_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    #[test]
    fn test_encode_utf8_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .into();
            assert!(obj
                .bind(py)
                .downcast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();

            // The surrogate is encoded as three bytes, each replaced individually.
            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    #[test]
    fn test_string_from_object() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");
        });
    }

    #[test]
    fn test_string_from_object_with_invalid_encoding_errors() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            // Interior NUL bytes cannot be turned into C strings, so both calls must fail.
            let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore");
            assert!(result.is_err());

            let result = PyString::from_object(&py_bytes, "utf-8", "ign\0ore");
            assert!(result.is_err());
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f\u{FFFD}"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval(ffi::c_str!("'foo\\ud800'"), None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo\u{FFFD}".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, GraalPy, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // Spelled with escapes so the expected first character is unambiguously U+FF22
            // (fullwidth "B") rather than a look-alike ASCII letter.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{FF22}\u{FFFD}".into())
            );
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, GraalPy, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{20000}\u{FFFD}".into())
            );
        });
    }

    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            // Interning the same text twice must yield the very same object.
            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    #[test]
    fn test_py_to_str_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
        })
    }

    #[test]
    fn test_comparisons() {
        Python::with_gil(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            // `Bound` against `&str`, in both operand orders and by reference.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Bound` against `str`.
            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            // `Borrowed` against `&str`.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Borrowed` against `str`.
            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}
910