string.rs source code [crates/pyo3-0.20.3/src/types/string.rs]

1	#[cfg(not(Py_LIMITED_API))]
2	use crate::exceptions::PyUnicodeDecodeError;
3	use crate::types::PyBytes;
4	use crate::{ffi, PyAny, PyResult, Python};
5	use std::borrow::Cow;
6	use std::os::raw::c_char;
7	use std::str;
8
/// Borrowed view of the raw storage behind a Python `str`.
///
/// Python keeps string contents in one of several internal representations;
/// each variant of this enumeration wraps a slice in one of them.
#[cfg(not(Py_LIMITED_API))]
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum PyStringData<'a> {
    /// Data stored in the UCS1 representation (one `u8` per element).
    Ucs1(&'a [u8]),

    /// Data stored in the UCS2 representation (one `u16` per element).
    Ucs2(&'a [u16]),

    /// Data stored in the UCS4 representation (one `u32` per element).
    Ucs4(&'a [u32]),
}
25
26	#[cfg(not(Py_LIMITED_API))]
27	impl<'a> PyStringData<'a> {
28	/// Obtain the raw bytes backing this instance as a [u8] slice.
29	pub fn as_bytes(&self) -> &[u8] {
30	match self {
31	Self::Ucs1(s) => s,
32	Self::Ucs2(s) => unsafe {
33	std::slice::from_raw_parts(
34	s.as_ptr() as *const u8,
35	s.len() * self.value_width_bytes(),
36	)
37	},
38	Self::Ucs4(s) => unsafe {
39	std::slice::from_raw_parts(
40	s.as_ptr() as *const u8,
41	s.len() * self.value_width_bytes(),
42	)
43	},
44	}
45	}
46
47	/// Size in bytes of each value/item in the underlying slice.
48	#[inline]
49	pub fn value_width_bytes(&self) -> usize {
50	match self {
51	Self::Ucs1(_) => `1`,
52	Self::Ucs2(_) => `2`,
53	Self::Ucs4(_) => `4`,
54	}
55	}
56
57	/// Convert the raw data to a Rust string.
58	///
59	/// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
60	/// returns an owned string.
61	///
62	/// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
63	/// storage format. This should only occur for strings that were created via Python
64	/// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
65	/// never occur for strings that were created from Python code.
66	pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
67	use std::ffi::CStr;
68	match self {
69	Self::Ucs1(data) => match str::from_utf8(data) {
70	Ok(s) => Ok(Cow::Borrowed(s)),
71	Err(e) => Err(crate::PyErr::from_value(PyUnicodeDecodeError::new_utf8(
72	py, data, e,
73	)?)),
74	},
75	Self::Ucs2(data) => match String::from_utf16(data) {
76	Ok(s) => Ok(Cow::Owned(s)),
77	Err(e) => {
78	let mut message = e.to_string().as_bytes().to_vec();
79	message.push(`0`);
80
81	Err(crate::PyErr::from_value(PyUnicodeDecodeError::new(
82	py,
83	CStr::from_bytes_with_nul(b"utf-16`\0`").unwrap(),
84	self.as_bytes(),
85	`0`..self.as_bytes().len(),
86	CStr::from_bytes_with_nul(&message).unwrap(),
87	)?))
88	}
89	},
90	Self::Ucs4(data) => match data.iter().map(\|&c\| std::char::from_u32(c)).collect() {
91	Some(s) => Ok(Cow::Owned(s)),
92	None => Err(crate::PyErr::from_value(PyUnicodeDecodeError::new(
93	py,
94	CStr::from_bytes_with_nul(b"utf-32`\0`").unwrap(),
95	self.as_bytes(),
96	`0`..self.as_bytes().len(),
97	CStr::from_bytes_with_nul(b"error converting utf-32`\0`").unwrap(),
98	)?)),
99	},
100	}
101	}
102
103	/// Convert the raw data to a Rust string, possibly with data loss.
104	///
105	/// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
106	///
107	/// Returns a borrow into original data, when possible, or owned data otherwise.
108	///
109	/// The return value of this function should only disagree with [Self::to_string]
110	/// when that method would error.
111	pub fn to_string_lossy(self) -> Cow<'a, str> {
112	match self {
113	Self::Ucs1(data) => String::from_utf8_lossy(data),
114	Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
115	Self::Ucs4(data) => Cow::Owned(
116	data.iter()
117	.map(\|&c\| std::char::from_u32(c).unwrap_or('`\u{FFFD}`'))
118	.collect(),
119	),
120	}
121	}
122	}
123
124	/// Represents a Python `string` (a Unicode string object).
125	///
126	/// This type is immutable.
127	#[repr(transparent)]
128	pub struct PyString(PyAny);
129
130	pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
131
132	impl PyString {
133	/// Creates a new Python string object.
134	///
135	/// Panics if out of memory.
136	pub fn new<'p>(py: Python<'p>, s: &str) -> &'p PyString {
137	let ptr = s.as_ptr() as *const c_char;
138	let len = s.len() as ffi::Py_ssize_t;
139	unsafe { py.from_owned_ptr(ffi::PyUnicode_FromStringAndSize(ptr, len)) }
140	}
141
142	/// Intern the given string
143	///
144	/// This will return a reference to the same Python string object if called repeatedly with the same string.
145	///
146	/// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a
147	/// temporary Python string object and is thereby slower than [`PyString::new`].
148	///
149	/// Panics if out of memory.
150	pub fn intern<'p>(py: Python<'p>, s: &str) -> &'p PyString {
151	let ptr = s.as_ptr() as *const c_char;
152	let len = s.len() as ffi::Py_ssize_t;
153	unsafe {
154	let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
155	if !ob.is_null() {
156	ffi::PyUnicode_InternInPlace(&mut ob);
157	}
158	py.from_owned_ptr(ob)
159	}
160	}
161
162	/// Attempts to create a Python string from a Python [bytes-like object].
163	///
164	/// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
165	pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> {
166	unsafe {
167	src.py()
168	.from_owned_ptr_or_err::<PyString>(ffi::PyUnicode_FromEncodedObject(
169	src.as_ptr(),
170	encoding.as_ptr() as *const c_char,
171	errors.as_ptr() as *const c_char,
172	))
173	}
174	}
175
176	/// Gets the Python string as a byte slice.
177	///
178	/// Returns a `UnicodeEncodeError` if the input is not valid unicode
179	/// (containing unpaired surrogates).
180	#[inline]
181	pub fn to_str(&self) -> PyResult<&str> {
182	let utf8_slice = {
183	cfg_if::cfg_if! {
184	if #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] {
185	// PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
186	let mut size: ffi::Py_ssize_t = `0`;
187	let data: *const u8 = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
188	if data.is_null() {
189	return Err(crate::PyErr::fetch(self.py()));
190	} else {
191	unsafe { std::slice::from_raw_parts(data, size as usize) }
192	}
193	} else {
194	let bytes = unsafe {
195	self.py().from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))?
196	};
197	bytes.as_bytes()
198	}
199	}
200	};
201	Ok(unsafe { std::str::from_utf8_unchecked(utf8_slice) })
202	}
203
204	/// Converts the `PyString` into a Rust string.
205	///
206	/// Unpaired surrogates invalid UTF-8 sequences are
207	/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
208	pub fn to_string_lossy(&self) -> Cow<'_, str> {
209	match self.to_str() {
210	Ok(s) => Cow::Borrowed(s),
211	Err(_) => {
212	let bytes = unsafe {
213	self.py()
214	.from_owned_ptr::<PyBytes>(ffi::PyUnicode_AsEncodedString(
215	self.as_ptr(),
216	b"utf-8`\0`" as *const _ as _,
217	b"surrogatepass`\0`" as *const _ as _,
218	))
219	};
220	String::from_utf8_lossy(bytes.as_bytes())
221	}
222	}
223	}
224
225	/// Obtains the raw data backing the Python string.
226	///
227	/// If the Python string object was created through legacy APIs, its internal storage format
228	/// will be canonicalized before data is returned.
229	///
230	/// # Safety
231	///
232	/// This function implementation relies on manually decoding a C bitfield. In practice, this
233	/// works well on common little-endian architectures such as x86_64, where the bitfield has a
234	/// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
235	/// x86_64 platforms.
236	///
237	/// By using this API, you accept responsibility for testing that PyStringData behaves as
238	/// expected on the targets where you plan to distribute your software.
239	#[cfg(not(Py_LIMITED_API))]
240	pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
241	let ptr = self.as_ptr();
242
243	#[cfg(not(Py_3_12))]
244	#[allow(deprecated)]
245	{
246	let ready = ffi::PyUnicode_READY(ptr);
247	if ready != `0` {
248	// Exception was created on failure.
249	return Err(crate::PyErr::fetch(self.py()));
250	}
251	}
252
253	// The string should be in its canonical form after calling `PyUnicode_READY()`.
254	// And non-canonical form not possible after Python 3.12. So it should be safe
255	// to call these APIs.
256	let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
257	let raw_data = ffi::PyUnicode_DATA(ptr);
258	let kind = ffi::PyUnicode_KIND(ptr);
259
260	match kind {
261	ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
262	raw_data as *const u8,
263	length,
264	))),
265	ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
266	raw_data as *const u16,
267	length,
268	))),
269	ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
270	raw_data as *const u32,
271	length,
272	))),
273	_ => unreachable!(),
274	}
275	}
276	}
277
#[cfg(test)]
mod tests {
    //! Tests for `PyString` conversions and, outside the limited API, for
    //! `PyStringData`.

    use super::*;
    use crate::Python;
    use crate::{PyObject, ToPyObject};
    #[cfg(not(Py_LIMITED_API))]
    use std::borrow::Cow;

    #[test]
    fn test_to_str_ascii() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let obj: PyObject = PyString::new(py, s).into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(s, py_string.to_str().unwrap());
        })
    }

    #[test]
    fn test_to_str_surrogate() {
        Python::with_gil(|py| {
            // A lone surrogate cannot be encoded as UTF-8, so `to_str` must fail.
            let obj: PyObject = py.eval(r"'\ud800'", None, None).unwrap().into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert!(py_string.to_str().is_err());
        })
    }

    #[test]
    fn test_to_str_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj: PyObject = PyString::new(py, s).into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            assert_eq!(s, py_string.to_str().unwrap());
        })
    }

    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(r"'🐈 Hello \ud800World'", None, None)
                .unwrap()
                .into();
            let py_string: &PyString = obj.downcast(py).unwrap();
            // `surrogatepass` encodes the lone surrogate U+D800 as the three bytes
            // ED A0 80. None of them starts a valid UTF-8 sequence here, so the lossy
            // conversion emits one replacement character per byte.
            assert_eq!(
                py_string.to_string_lossy(),
                "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
            );
        })
    }

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s: &PyString = v.downcast(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let v = "Hello\n".to_object(py);
            let s: &PyString = v.downcast(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f\u{FFFD}"));
        });
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval("'foo\\ud800'", None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo\u{FFFD}".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // U+FF22 followed by the replacement character.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{FF22}\u{FFFD}".into())
            );
        });
    }

    #[test]
    #[cfg(not(Py_LIMITED_API))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(Py_LIMITED_API), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr() as *const _,
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s: &PyString = unsafe { py.from_owned_ptr(ptr) };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            // U+20000 followed by the replacement character.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{20000}\u{FFFD}".into())
            );
        });
    }

    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1.to_str().unwrap(), "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2.to_str().unwrap(), "foo");

            // Interning the same text twice must yield the very same object.
            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3.to_str().unwrap(), "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }
}
483