| 1 | #[cfg (not(Py_LIMITED_API))] |
| 2 | use crate::exceptions::PyUnicodeDecodeError; |
| 3 | use crate::ffi_ptr_ext::FfiPtrExt; |
| 4 | use crate::instance::Borrowed; |
| 5 | use crate::py_result_ext::PyResultExt; |
| 6 | use crate::types::any::PyAnyMethods; |
| 7 | use crate::types::bytes::PyBytesMethods; |
| 8 | use crate::types::PyBytes; |
| 9 | #[allow (deprecated)] |
| 10 | use crate::IntoPy; |
| 11 | use crate::{ffi, Bound, Py, PyAny, PyResult, Python}; |
| 12 | use std::borrow::Cow; |
| 13 | use std::ffi::CString; |
| 14 | use std::str; |
| 15 | |
| 16 | /// Deprecated alias for [`PyString`]. |
| 17 | #[deprecated (since = "0.23.0" , note = "use `PyString` instead" )] |
| 18 | pub type PyUnicode = PyString; |
| 19 | |
/// Borrowed view of the raw data backing a Python `str`.
///
/// Python internally stores strings in various representations; each variant of this
/// enumeration corresponds to one of them and borrows a slice of the matching element type.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 representation (a slice of `u8` values).
    Ucs1(&'a [u8]),

    /// UCS2 representation (a slice of `u16` values).
    Ucs2(&'a [u16]),

    /// UCS4 representation (a slice of `u32` values).
    Ucs4(&'a [u32]),
}
| 36 | |
| 37 | #[cfg (not(Py_LIMITED_API))] |
| 38 | impl<'a> PyStringData<'a> { |
| 39 | /// Obtain the raw bytes backing this instance as a [u8] slice. |
| 40 | pub fn as_bytes(&self) -> &[u8] { |
| 41 | match self { |
| 42 | Self::Ucs1(s) => s, |
| 43 | Self::Ucs2(s) => unsafe { |
| 44 | std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes()) |
| 45 | }, |
| 46 | Self::Ucs4(s) => unsafe { |
| 47 | std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes()) |
| 48 | }, |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | /// Size in bytes of each value/item in the underlying slice. |
| 53 | #[inline ] |
| 54 | pub fn value_width_bytes(&self) -> usize { |
| 55 | match self { |
| 56 | Self::Ucs1(_) => 1, |
| 57 | Self::Ucs2(_) => 2, |
| 58 | Self::Ucs4(_) => 4, |
| 59 | } |
| 60 | } |
| 61 | |
| 62 | /// Convert the raw data to a Rust string. |
| 63 | /// |
| 64 | /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4, |
| 65 | /// returns an owned string. |
| 66 | /// |
| 67 | /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported |
| 68 | /// storage format. This should only occur for strings that were created via Python |
| 69 | /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should |
| 70 | /// never occur for strings that were created from Python code. |
| 71 | pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> { |
| 72 | use std::ffi::CStr; |
| 73 | match self { |
| 74 | Self::Ucs1(data) => match str::from_utf8(data) { |
| 75 | Ok(s) => Ok(Cow::Borrowed(s)), |
| 76 | Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()), |
| 77 | }, |
| 78 | Self::Ucs2(data) => match String::from_utf16(data) { |
| 79 | Ok(s) => Ok(Cow::Owned(s)), |
| 80 | Err(e) => { |
| 81 | let mut message = e.to_string().as_bytes().to_vec(); |
| 82 | message.push(0); |
| 83 | |
| 84 | Err(PyUnicodeDecodeError::new( |
| 85 | py, |
| 86 | ffi::c_str!("utf-16" ), |
| 87 | self.as_bytes(), |
| 88 | 0..self.as_bytes().len(), |
| 89 | CStr::from_bytes_with_nul(&message).unwrap(), |
| 90 | )? |
| 91 | .into()) |
| 92 | } |
| 93 | }, |
| 94 | Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() { |
| 95 | Some(s) => Ok(Cow::Owned(s)), |
| 96 | None => Err(PyUnicodeDecodeError::new( |
| 97 | py, |
| 98 | ffi::c_str!("utf-32" ), |
| 99 | self.as_bytes(), |
| 100 | 0..self.as_bytes().len(), |
| 101 | ffi::c_str!("error converting utf-32" ), |
| 102 | )? |
| 103 | .into()), |
| 104 | }, |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | /// Convert the raw data to a Rust string, possibly with data loss. |
| 109 | /// |
| 110 | /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`. |
| 111 | /// |
| 112 | /// Returns a borrow into original data, when possible, or owned data otherwise. |
| 113 | /// |
| 114 | /// The return value of this function should only disagree with [Self::to_string] |
| 115 | /// when that method would error. |
| 116 | pub fn to_string_lossy(self) -> Cow<'a, str> { |
| 117 | match self { |
| 118 | Self::Ucs1(data) => String::from_utf8_lossy(data), |
| 119 | Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)), |
| 120 | Self::Ucs4(data) => Cow::Owned( |
| 121 | data.iter() |
| 122 | .map(|&c| std::char::from_u32(c).unwrap_or(' \u{FFFD}' )) |
| 123 | .collect(), |
| 124 | ), |
| 125 | } |
| 126 | } |
| 127 | } |
| 128 | |
| 129 | /// Represents a Python `string` (a Unicode string object). |
| 130 | /// |
| 131 | /// Values of this type are accessed via PyO3's smart pointers, e.g. as |
| 132 | /// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound]. |
| 133 | /// |
| 134 | /// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for |
| 135 | /// [`Bound<'py, PyString>`][Bound]. |
| 136 | /// |
| 137 | /// # Equality |
| 138 | /// |
| 139 | /// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the |
| 140 | /// data in the Python string to a Rust UTF-8 string slice. |
| 141 | /// |
| 142 | /// This is not always the most appropriate way to compare Python strings, as Python string subclasses |
| 143 | /// may have different equality semantics. In situations where subclasses overriding equality might be |
| 144 | /// relevant, use [`PyAnyMethods::eq`], at cost of the additional overhead of a Python method call. |
| 145 | /// |
| 146 | /// ```rust |
| 147 | /// # use pyo3::prelude::*; |
| 148 | /// use pyo3::types::PyString; |
| 149 | /// |
| 150 | /// # Python::with_gil(|py| { |
| 151 | /// let py_string = PyString::new(py, "foo" ); |
| 152 | /// // via PartialEq<str> |
| 153 | /// assert_eq!(py_string, "foo" ); |
| 154 | /// |
| 155 | /// // via Python equality |
| 156 | /// assert!(py_string.as_any().eq("foo" ).unwrap()); |
| 157 | /// # }); |
| 158 | /// ``` |
| 159 | #[repr (transparent)] |
| 160 | pub struct PyString(PyAny); |
| 161 | |
| 162 | pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check); |
| 163 | |
| 164 | impl PyString { |
| 165 | /// Creates a new Python string object. |
| 166 | /// |
| 167 | /// Panics if out of memory. |
| 168 | pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
| 169 | let ptr = s.as_ptr().cast(); |
| 170 | let len = s.len() as ffi::Py_ssize_t; |
| 171 | unsafe { |
| 172 | ffi::PyUnicode_FromStringAndSize(ptr, len) |
| 173 | .assume_owned(py) |
| 174 | .downcast_into_unchecked() |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | /// Deprecated name for [`PyString::new`]. |
| 179 | #[deprecated (since = "0.23.0" , note = "renamed to `PyString::new`" )] |
| 180 | #[inline ] |
| 181 | pub fn new_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
| 182 | Self::new(py, s) |
| 183 | } |
| 184 | |
| 185 | /// Intern the given string |
| 186 | /// |
| 187 | /// This will return a reference to the same Python string object if called repeatedly with the same string. |
| 188 | /// |
| 189 | /// Note that while this is more memory efficient than [`PyString::new_bound`], it unconditionally allocates a |
| 190 | /// temporary Python string object and is thereby slower than [`PyString::new_bound`]. |
| 191 | /// |
| 192 | /// Panics if out of memory. |
| 193 | pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
| 194 | let ptr = s.as_ptr().cast(); |
| 195 | let len = s.len() as ffi::Py_ssize_t; |
| 196 | unsafe { |
| 197 | let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len); |
| 198 | if !ob.is_null() { |
| 199 | ffi::PyUnicode_InternInPlace(&mut ob); |
| 200 | } |
| 201 | ob.assume_owned(py).downcast_into_unchecked() |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | /// Deprecated name for [`PyString::intern`]. |
| 206 | #[deprecated (since = "0.23.0" , note = "renamed to `PyString::intern`" )] |
| 207 | #[inline ] |
| 208 | pub fn intern_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { |
| 209 | Self::intern(py, s) |
| 210 | } |
| 211 | |
| 212 | /// Attempts to create a Python string from a Python [bytes-like object]. |
| 213 | /// |
| 214 | /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object). |
| 215 | pub fn from_object<'py>( |
| 216 | src: &Bound<'py, PyAny>, |
| 217 | encoding: &str, |
| 218 | errors: &str, |
| 219 | ) -> PyResult<Bound<'py, PyString>> { |
| 220 | let encoding = CString::new(encoding)?; |
| 221 | let errors = CString::new(errors)?; |
| 222 | unsafe { |
| 223 | ffi::PyUnicode_FromEncodedObject( |
| 224 | src.as_ptr(), |
| 225 | encoding.as_ptr().cast(), |
| 226 | errors.as_ptr().cast(), |
| 227 | ) |
| 228 | .assume_owned_or_err(src.py()) |
| 229 | .downcast_into_unchecked() |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | /// Deprecated name for [`PyString::from_object`]. |
| 234 | #[deprecated (since = "0.23.0" , note = "renamed to `PyString::from_object`" )] |
| 235 | #[inline ] |
| 236 | pub fn from_object_bound<'py>( |
| 237 | src: &Bound<'py, PyAny>, |
| 238 | encoding: &str, |
| 239 | errors: &str, |
| 240 | ) -> PyResult<Bound<'py, PyString>> { |
| 241 | Self::from_object(src, encoding, errors) |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | /// Implementation of functionality for [`PyString`]. |
| 246 | /// |
| 247 | /// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call |
| 248 | /// syntax these methods are separated into a trait, because stable Rust does not yet support |
| 249 | /// `arbitrary_self_types`. |
| 250 | #[doc (alias = "PyString" )] |
| 251 | pub trait PyStringMethods<'py>: crate::sealed::Sealed { |
| 252 | /// Gets the Python string as a Rust UTF-8 string slice. |
| 253 | /// |
| 254 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
| 255 | /// (containing unpaired surrogates). |
| 256 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
| 257 | fn to_str(&self) -> PyResult<&str>; |
| 258 | |
| 259 | /// Converts the `PyString` into a Rust string, avoiding copying when possible. |
| 260 | /// |
| 261 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
| 262 | /// (containing unpaired surrogates). |
| 263 | fn to_cow(&self) -> PyResult<Cow<'_, str>>; |
| 264 | |
| 265 | /// Converts the `PyString` into a Rust string. |
| 266 | /// |
| 267 | /// Unpaired surrogates invalid UTF-8 sequences are |
| 268 | /// replaced with `U+FFFD REPLACEMENT CHARACTER`. |
| 269 | fn to_string_lossy(&self) -> Cow<'_, str>; |
| 270 | |
| 271 | /// Encodes this string as a Python `bytes` object, using UTF-8 encoding. |
| 272 | fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>; |
| 273 | |
| 274 | /// Obtains the raw data backing the Python string. |
| 275 | /// |
| 276 | /// If the Python string object was created through legacy APIs, its internal storage format |
| 277 | /// will be canonicalized before data is returned. |
| 278 | /// |
| 279 | /// # Safety |
| 280 | /// |
| 281 | /// This function implementation relies on manually decoding a C bitfield. In practice, this |
| 282 | /// works well on common little-endian architectures such as x86_64, where the bitfield has a |
| 283 | /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on |
| 284 | /// x86_64 platforms. |
| 285 | /// |
| 286 | /// By using this API, you accept responsibility for testing that PyStringData behaves as |
| 287 | /// expected on the targets where you plan to distribute your software. |
| 288 | #[cfg (not(any(Py_LIMITED_API, GraalPy, PyPy)))] |
| 289 | unsafe fn data(&self) -> PyResult<PyStringData<'_>>; |
| 290 | } |
| 291 | |
| 292 | impl<'py> PyStringMethods<'py> for Bound<'py, PyString> { |
| 293 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
| 294 | fn to_str(&self) -> PyResult<&str> { |
| 295 | self.as_borrowed().to_str() |
| 296 | } |
| 297 | |
| 298 | fn to_cow(&self) -> PyResult<Cow<'_, str>> { |
| 299 | self.as_borrowed().to_cow() |
| 300 | } |
| 301 | |
| 302 | fn to_string_lossy(&self) -> Cow<'_, str> { |
| 303 | self.as_borrowed().to_string_lossy() |
| 304 | } |
| 305 | |
| 306 | fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> { |
| 307 | unsafe { |
| 308 | ffi::PyUnicode_AsUTF8String(self.as_ptr()) |
| 309 | .assume_owned_or_err(self.py()) |
| 310 | .downcast_into_unchecked::<PyBytes>() |
| 311 | } |
| 312 | } |
| 313 | |
| 314 | #[cfg (not(any(Py_LIMITED_API, GraalPy, PyPy)))] |
| 315 | unsafe fn data(&self) -> PyResult<PyStringData<'_>> { |
| 316 | unsafe { self.as_borrowed().data() } |
| 317 | } |
| 318 | } |
| 319 | |
| 320 | impl<'a> Borrowed<'a, '_, PyString> { |
| 321 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
| 322 | #[allow (clippy::wrong_self_convention)] |
| 323 | pub(crate) fn to_str(self) -> PyResult<&'a str> { |
| 324 | // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10. |
| 325 | let mut size: ffi::Py_ssize_t = 0; |
| 326 | let data: *const u8 = |
| 327 | unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() }; |
| 328 | if data.is_null() { |
| 329 | Err(crate::PyErr::fetch(self.py())) |
| 330 | } else { |
| 331 | Ok(unsafe { |
| 332 | std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize)) |
| 333 | }) |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | #[allow (clippy::wrong_self_convention)] |
| 338 | pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> { |
| 339 | // TODO: this method can probably be deprecated once Python 3.9 support is dropped, |
| 340 | // because all versions then support the more efficient `to_str`. |
| 341 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
| 342 | { |
| 343 | self.to_str().map(Cow::Borrowed) |
| 344 | } |
| 345 | |
| 346 | #[cfg (not(any(Py_3_10, not(Py_LIMITED_API))))] |
| 347 | { |
| 348 | let bytes = self.encode_utf8()?; |
| 349 | Ok(Cow::Owned( |
| 350 | unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(), |
| 351 | )) |
| 352 | } |
| 353 | } |
| 354 | |
| 355 | #[allow (clippy::wrong_self_convention)] |
| 356 | fn to_string_lossy(self) -> Cow<'a, str> { |
| 357 | let ptr = self.as_ptr(); |
| 358 | let py = self.py(); |
| 359 | |
| 360 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
| 361 | if let Ok(s) = self.to_str() { |
| 362 | return Cow::Borrowed(s); |
| 363 | } |
| 364 | |
| 365 | let bytes = unsafe { |
| 366 | ffi::PyUnicode_AsEncodedString( |
| 367 | ptr, |
| 368 | ffi::c_str!("utf-8" ).as_ptr(), |
| 369 | ffi::c_str!("surrogatepass" ).as_ptr(), |
| 370 | ) |
| 371 | .assume_owned(py) |
| 372 | .downcast_into_unchecked::<PyBytes>() |
| 373 | }; |
| 374 | Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned()) |
| 375 | } |
| 376 | |
| 377 | #[cfg (not(any(Py_LIMITED_API, GraalPy, PyPy)))] |
| 378 | unsafe fn data(self) -> PyResult<PyStringData<'a>> { |
| 379 | unsafe { |
| 380 | let ptr = self.as_ptr(); |
| 381 | |
| 382 | #[cfg (not(Py_3_12))] |
| 383 | #[allow (deprecated)] |
| 384 | { |
| 385 | let ready = ffi::PyUnicode_READY(ptr); |
| 386 | if ready != 0 { |
| 387 | // Exception was created on failure. |
| 388 | return Err(crate::PyErr::fetch(self.py())); |
| 389 | } |
| 390 | } |
| 391 | |
| 392 | // The string should be in its canonical form after calling `PyUnicode_READY()`. |
| 393 | // And non-canonical form not possible after Python 3.12. So it should be safe |
| 394 | // to call these APIs. |
| 395 | let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize; |
| 396 | let raw_data = ffi::PyUnicode_DATA(ptr); |
| 397 | let kind = ffi::PyUnicode_KIND(ptr); |
| 398 | |
| 399 | match kind { |
| 400 | ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts( |
| 401 | raw_data as *const u8, |
| 402 | length, |
| 403 | ))), |
| 404 | ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts( |
| 405 | raw_data as *const u16, |
| 406 | length, |
| 407 | ))), |
| 408 | ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts( |
| 409 | raw_data as *const u32, |
| 410 | length, |
| 411 | ))), |
| 412 | _ => unreachable!(), |
| 413 | } |
| 414 | } |
| 415 | } |
| 416 | } |
| 417 | |
| 418 | impl Py<PyString> { |
| 419 | /// Gets the Python string as a Rust UTF-8 string slice. |
| 420 | /// |
| 421 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
| 422 | /// (containing unpaired surrogates). |
| 423 | /// |
| 424 | /// Because `str` objects are immutable, the returned slice is independent of |
| 425 | /// the GIL lifetime. |
| 426 | #[cfg (any(Py_3_10, not(Py_LIMITED_API)))] |
| 427 | pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> { |
| 428 | self.bind_borrowed(py).to_str() |
| 429 | } |
| 430 | |
| 431 | /// Converts the `PyString` into a Rust string, avoiding copying when possible. |
| 432 | /// |
| 433 | /// Returns a `UnicodeEncodeError` if the input is not valid unicode |
| 434 | /// (containing unpaired surrogates). |
| 435 | /// |
| 436 | /// Because `str` objects are immutable, the returned slice is independent of |
| 437 | /// the GIL lifetime. |
| 438 | pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> { |
| 439 | self.bind_borrowed(py).to_cow() |
| 440 | } |
| 441 | |
| 442 | /// Converts the `PyString` into a Rust string. |
| 443 | /// |
| 444 | /// Unpaired surrogates invalid UTF-8 sequences are |
| 445 | /// replaced with `U+FFFD REPLACEMENT CHARACTER`. |
| 446 | /// |
| 447 | /// Because `str` objects are immutable, the returned slice is independent of |
| 448 | /// the GIL lifetime. |
| 449 | pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> { |
| 450 | self.bind_borrowed(py).to_string_lossy() |
| 451 | } |
| 452 | } |
| 453 | |
| 454 | #[allow (deprecated)] |
| 455 | impl IntoPy<Py<PyString>> for Bound<'_, PyString> { |
| 456 | fn into_py(self, _py: Python<'_>) -> Py<PyString> { |
| 457 | self.unbind() |
| 458 | } |
| 459 | } |
| 460 | |
| 461 | #[allow (deprecated)] |
| 462 | impl IntoPy<Py<PyString>> for &Bound<'_, PyString> { |
| 463 | fn into_py(self, _py: Python<'_>) -> Py<PyString> { |
| 464 | self.clone().unbind() |
| 465 | } |
| 466 | } |
| 467 | |
| 468 | #[allow (deprecated)] |
| 469 | impl IntoPy<Py<PyString>> for &'_ Py<PyString> { |
| 470 | fn into_py(self, py: Python<'_>) -> Py<PyString> { |
| 471 | self.clone_ref(py) |
| 472 | } |
| 473 | } |
| 474 | |
| 475 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 476 | /// |
| 477 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 478 | impl PartialEq<str> for Bound<'_, PyString> { |
| 479 | #[inline ] |
| 480 | fn eq(&self, other: &str) -> bool { |
| 481 | self.as_borrowed() == *other |
| 482 | } |
| 483 | } |
| 484 | |
| 485 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 486 | /// |
| 487 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 488 | impl PartialEq<&'_ str> for Bound<'_, PyString> { |
| 489 | #[inline ] |
| 490 | fn eq(&self, other: &&str) -> bool { |
| 491 | self.as_borrowed() == **other |
| 492 | } |
| 493 | } |
| 494 | |
| 495 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 496 | /// |
| 497 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 498 | impl PartialEq<Bound<'_, PyString>> for str { |
| 499 | #[inline ] |
| 500 | fn eq(&self, other: &Bound<'_, PyString>) -> bool { |
| 501 | *self == other.as_borrowed() |
| 502 | } |
| 503 | } |
| 504 | |
| 505 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 506 | /// |
| 507 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 508 | impl PartialEq<&'_ Bound<'_, PyString>> for str { |
| 509 | #[inline ] |
| 510 | fn eq(&self, other: &&Bound<'_, PyString>) -> bool { |
| 511 | *self == other.as_borrowed() |
| 512 | } |
| 513 | } |
| 514 | |
| 515 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 516 | /// |
| 517 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 518 | impl PartialEq<Bound<'_, PyString>> for &'_ str { |
| 519 | #[inline ] |
| 520 | fn eq(&self, other: &Bound<'_, PyString>) -> bool { |
| 521 | **self == other.as_borrowed() |
| 522 | } |
| 523 | } |
| 524 | |
| 525 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 526 | /// |
| 527 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 528 | impl PartialEq<str> for &'_ Bound<'_, PyString> { |
| 529 | #[inline ] |
| 530 | fn eq(&self, other: &str) -> bool { |
| 531 | self.as_borrowed() == other |
| 532 | } |
| 533 | } |
| 534 | |
| 535 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 536 | /// |
| 537 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 538 | impl PartialEq<str> for Borrowed<'_, '_, PyString> { |
| 539 | #[inline ] |
| 540 | fn eq(&self, other: &str) -> bool { |
| 541 | #[cfg (not(Py_3_13))] |
| 542 | { |
| 543 | self.to_cow().map_or(default:false, |s: Cow<'_, str>| s == other) |
| 544 | } |
| 545 | |
| 546 | #[cfg (Py_3_13)] |
| 547 | unsafe { |
| 548 | ffi::PyUnicode_EqualToUTF8AndSize( |
| 549 | self.as_ptr(), |
| 550 | other.as_ptr().cast(), |
| 551 | other.len() as _, |
| 552 | ) == 1 |
| 553 | } |
| 554 | } |
| 555 | } |
| 556 | |
| 557 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 558 | /// |
| 559 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 560 | impl PartialEq<&str> for Borrowed<'_, '_, PyString> { |
| 561 | #[inline ] |
| 562 | fn eq(&self, other: &&str) -> bool { |
| 563 | *self == **other |
| 564 | } |
| 565 | } |
| 566 | |
| 567 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 568 | /// |
| 569 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 570 | impl PartialEq<Borrowed<'_, '_, PyString>> for str { |
| 571 | #[inline ] |
| 572 | fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool { |
| 573 | other == self |
| 574 | } |
| 575 | } |
| 576 | |
| 577 | /// Compares whether the data in the Python string is equal to the given UTF8. |
| 578 | /// |
| 579 | /// In some cases Python equality might be more appropriate; see the note on [`PyString`]. |
| 580 | impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str { |
| 581 | #[inline ] |
| 582 | fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool { |
| 583 | other == self |
| 584 | } |
| 585 | } |
| 586 | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{IntoPyObject, PyObject};

    // ---- to_cow / encode_utf8 ------------------------------------------------------------

    #[test]
    fn test_to_cow_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_to_cow_surrogate() {
        Python::with_gil(|py| {
            // A lone surrogate cannot be represented in UTF-8, so conversion must fail.
            let py_string = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    #[test]
    fn test_to_cow_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_encode_utf8_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    #[test]
    fn test_encode_utf8_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .into();
            assert!(obj
                .bind(py)
                .downcast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();

            // The lone surrogate is encoded as three bytes with `surrogatepass`; each of
            // them is replaced separately by the lossy decoder.
            assert_eq!(
                py_string.to_string_lossy(),
                "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
            );
        })
    }

    // ---- Debug / Display -----------------------------------------------------------------

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let s = "Hello \n".into_pyobject(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello \\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let s = "Hello \n".into_pyobject(py).unwrap();
            assert_eq!(format!("{}", s), "Hello \n");
        })
    }

    // ---- from_object ---------------------------------------------------------------------

    #[test]
    fn test_string_from_object() {
        Python::with_gil(|py| {
            // 0xFF is invalid UTF-8 and is dropped by the "ignore" error handler.
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");
        });
    }

    #[test]
    fn test_string_from_obect_with_invalid_encoding_errors() {
        Python::with_gil(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            // Interior NUL bytes cannot be passed on as C strings.
            let result = PyString::from_object(&py_bytes, "utf \0-8", "ignore");
            assert!(result.is_err());

            let result = PyString::from_object(&py_bytes, "utf-8", "ign \0ore");
            assert!(result.is_err());
        });
    }

    // ---- PyStringData --------------------------------------------------------------------

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f\u{FFFD}"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval(ffi::c_str!("'foo\\ud800'"), None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo\u{FFFD}".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid), little-endian, plus a terminator.
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{FF22}\u{FFFD}".into())
            );
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy)))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid), little-endian, plus a terminator.
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{20000}\u{FFFD}".into())
            );
        });
    }

    // ---- interning -----------------------------------------------------------------------

    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            // Interning the same text twice yields the very same object.
            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    // ---- Py<PyString> --------------------------------------------------------------------

    #[test]
    fn test_py_to_str_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(
                py_string.to_string_lossy(py),
                "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
            );
        })
    }

    // ---- PartialEq -----------------------------------------------------------------------

    #[test]
    fn test_comparisons() {
        Python::with_gil(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            // `Bound` against `&str`, in both operand orders.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Bound` against `str`, in both operand orders.
            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            // `Borrowed` against `&str` and `str`.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}
| 910 | |