io.rs source code [crates/bstr-1.5.0/src/io.rs]

/*!
Utilities for working with I/O using byte strings.

This module currently only exports a single trait, `BufReadExt`, which provides
facilities for conveniently and efficiently working with lines as byte strings.

More APIs may be added in the future.
*/
9
10	use alloc::{vec, vec::Vec};
11
12	use std::io;
13
14	use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
15
16	/// An extension trait for
17	/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
18	/// which provides convenience APIs for dealing with byte strings.
19	pub trait BufReadExt: io::BufRead {
20	/// Returns an iterator over the lines of this reader, where each line
21	/// is represented as a byte string.
22	///
23	/// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
24	/// an error is yielded if there was a problem reading from the underlying
25	/// reader.
26	///
27	/// On success, the next line in the iterator is returned. The line does
28	/// not* contain a trailing `\n` or `\r\n`.*
29	///
30	/// # Examples
31	///
32	/// Basic usage:
33	///
34	/// ```
35	/// use std::io;
36	///
37	/// use bstr::io::BufReadExt;
38	///
39	/// # fn example() -> Result<(), io::Error> {
40	/// let mut cursor = io::Cursor::new(b"lorem`\n`ipsum`\r\n`dolor");
41	///
42	/// let mut lines = vec![];
43	/// for result in cursor.byte_lines() {
44	/// let line = result?;
45	/// lines.push(line);
46	/// }
47	/// assert_eq!(lines.len(), `3`);
48	/// assert_eq!(lines[`0`], "lorem".as_bytes());
49	/// assert_eq!(lines[`1`], "ipsum".as_bytes());
50	/// assert_eq!(lines[`2`], "dolor".as_bytes());
51	/// # Ok(()) }; example().unwrap()
52	/// ```
53	fn byte_lines(self) -> ByteLines<Self>
54	where
55	Self: Sized,
56	{
57	ByteLines { buf: self }
58	}
59
60	/// Returns an iterator over byte-terminated records of this reader, where
61	/// each record is represented as a byte string.
62	///
63	/// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
64	/// an error is yielded if there was a problem reading from the underlying
65	/// reader.
66	///
67	/// On success, the next record in the iterator is returned. The record
68	/// does not* contain its trailing terminator.*
69	///
70	/// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
71	/// that it has no special handling for `\r`.
72	///
73	/// # Examples
74	///
75	/// Basic usage:
76	///
77	/// ```
78	/// use std::io;
79	///
80	/// use bstr::io::BufReadExt;
81	///
82	/// # fn example() -> Result<(), io::Error> {
83	/// let mut cursor = io::Cursor::new(b"lorem`\x00`ipsum`\x00`dolor");
84	///
85	/// let mut records = vec![];
86	/// for result in cursor.byte_records(b'`\x00`') {
87	/// let record = result?;
88	/// records.push(record);
89	/// }
90	/// assert_eq!(records.len(), `3`);
91	/// assert_eq!(records[`0`], "lorem".as_bytes());
92	/// assert_eq!(records[`1`], "ipsum".as_bytes());
93	/// assert_eq!(records[`2`], "dolor".as_bytes());
94	/// # Ok(()) }; example().unwrap()
95	/// ```
96	fn byte_records(self, terminator: u8) -> ByteRecords<Self>
97	where
98	Self: Sized,
99	{
100	ByteRecords { terminator, buf: self }
101	}
102
103	/// Executes the given closure on each line in the underlying reader.
104	///
105	/// If the closure returns an error (or if the underlying reader returns an
106	/// error), then iteration is stopped and the error is returned. If false
107	/// is returned, then iteration is stopped and no error is returned.
108	///
109	/// The closure given is called on exactly the same values as yielded by
110	/// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
111	/// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
112	///
113	/// This routine is useful for iterating over lines as quickly as
114	/// possible. Namely, a single allocation is reused for each line.
115	///
116	/// # Examples
117	///
118	/// Basic usage:
119	///
120	/// ```
121	/// use std::io;
122	///
123	/// use bstr::io::BufReadExt;
124	///
125	/// # fn example() -> Result<(), io::Error> {
126	/// let mut cursor = io::Cursor::new(b"lorem`\n`ipsum`\r\n`dolor");
127	///
128	/// let mut lines = vec![];
129	/// cursor.for_byte_line(\|line\| {
130	/// lines.push(line.to_vec());
131	/// Ok(`true`)
132	/// })?;
133	/// assert_eq!(lines.len(), `3`);
134	/// assert_eq!(lines[`0`], "lorem".as_bytes());
135	/// assert_eq!(lines[`1`], "ipsum".as_bytes());
136	/// assert_eq!(lines[`2`], "dolor".as_bytes());
137	/// # Ok(()) }; example().unwrap()
138	/// ```
139	fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
140	where
141	Self: Sized,
142	F: FnMut(&[u8]) -> io::Result<bool>,
143	{
144	self.for_byte_line_with_terminator(\|line\| {
145	for_each_line(&trim_line_slice(&line))
146	})
147	}
148
149	/// Executes the given closure on each byte-terminated record in the
150	/// underlying reader.
151	///
152	/// If the closure returns an error (or if the underlying reader returns an
153	/// error), then iteration is stopped and the error is returned. If false
154	/// is returned, then iteration is stopped and no error is returned.
155	///
156	/// The closure given is called on exactly the same values as yielded by
157	/// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
158	/// iterator. Namely, records do _not_ contain a trailing terminator byte.
159	///
160	/// This routine is useful for iterating over records as quickly as
161	/// possible. Namely, a single allocation is reused for each record.
162	///
163	/// # Examples
164	///
165	/// Basic usage:
166	///
167	/// ```
168	/// use std::io;
169	///
170	/// use bstr::io::BufReadExt;
171	///
172	/// # fn example() -> Result<(), io::Error> {
173	/// let mut cursor = io::Cursor::new(b"lorem`\x00`ipsum`\x00`dolor");
174	///
175	/// let mut records = vec![];
176	/// cursor.for_byte_record(b'`\x00`', \|record\| {
177	/// records.push(record.to_vec());
178	/// Ok(`true`)
179	/// })?;
180	/// assert_eq!(records.len(), `3`);
181	/// assert_eq!(records[`0`], "lorem".as_bytes());
182	/// assert_eq!(records[`1`], "ipsum".as_bytes());
183	/// assert_eq!(records[`2`], "dolor".as_bytes());
184	/// # Ok(()) }; example().unwrap()
185	/// ```
186	fn for_byte_record<F>(
187	&mut self,
188	terminator: u8,
189	mut for_each_record: F,
190	) -> io::Result<()>
191	where
192	Self: Sized,
193	F: FnMut(&[u8]) -> io::Result<bool>,
194	{
195	self.for_byte_record_with_terminator(terminator, \|chunk\| {
196	for_each_record(&trim_record_slice(&chunk, terminator))
197	})
198	}
199
200	/// Executes the given closure on each line in the underlying reader.
201	///
202	/// If the closure returns an error (or if the underlying reader returns an
203	/// error), then iteration is stopped and the error is returned. If false
204	/// is returned, then iteration is stopped and no error is returned.
205	///
206	/// Unlike
207	/// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
208	/// the lines given to the closure do* include the line terminator, if one*
209	/// exists.
210	///
211	/// This routine is useful for iterating over lines as quickly as
212	/// possible. Namely, a single allocation is reused for each line.
213	///
214	/// This is identical to `for_byte_record_with_terminator` with a
215	/// terminator of `\n`.
216	///
217	/// # Examples
218	///
219	/// Basic usage:
220	///
221	/// ```
222	/// use std::io;
223	///
224	/// use bstr::io::BufReadExt;
225	///
226	/// # fn example() -> Result<(), io::Error> {
227	/// let mut cursor = io::Cursor::new(b"lorem`\n`ipsum`\r\n`dolor");
228	///
229	/// let mut lines = vec![];
230	/// cursor.for_byte_line_with_terminator(\|line\| {
231	/// lines.push(line.to_vec());
232	/// Ok(`true`)
233	/// })?;
234	/// assert_eq!(lines.len(), `3`);
235	/// assert_eq!(lines[`0`], "lorem`\n`".as_bytes());
236	/// assert_eq!(lines[`1`], "ipsum`\r\n`".as_bytes());
237	/// assert_eq!(lines[`2`], "dolor".as_bytes());
238	/// # Ok(()) }; example().unwrap()
239	/// ```
240	fn for_byte_line_with_terminator<F>(
241	&mut self,
242	for_each_line: F,
243	) -> io::Result<()>
244	where
245	Self: Sized,
246	F: FnMut(&[u8]) -> io::Result<bool>,
247	{
248	self.for_byte_record_with_terminator(b'`\n`', for_each_line)
249	}
250
251	/// Executes the given closure on each byte-terminated record in the
252	/// underlying reader.
253	///
254	/// If the closure returns an error (or if the underlying reader returns an
255	/// error), then iteration is stopped and the error is returned. If false
256	/// is returned, then iteration is stopped and no error is returned.
257	///
258	/// Unlike
259	/// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
260	/// the lines given to the closure do* include the record terminator, if*
261	/// one exists.
262	///
263	/// This routine is useful for iterating over records as quickly as
264	/// possible. Namely, a single allocation is reused for each record.
265	///
266	/// # Examples
267	///
268	/// Basic usage:
269	///
270	/// ```
271	/// use std::io;
272	///
273	/// use bstr::{io::BufReadExt, B};
274	///
275	/// # fn example() -> Result<(), io::Error> {
276	/// let mut cursor = io::Cursor::new(b"lorem`\x00`ipsum`\x00`dolor");
277	///
278	/// let mut records = vec![];
279	/// cursor.for_byte_record_with_terminator(b'`\x00`', \|record\| {
280	/// records.push(record.to_vec());
281	/// Ok(`true`)
282	/// })?;
283	/// assert_eq!(records.len(), `3`);
284	/// assert_eq!(records[`0`], B(b"lorem`\x00`"));
285	/// assert_eq!(records[`1`], B("ipsum`\x00`"));
286	/// assert_eq!(records[`2`], B("dolor"));
287	/// # Ok(()) }; example().unwrap()
288	/// ```
289	fn for_byte_record_with_terminator<F>(
290	&mut self,
291	terminator: u8,
292	mut for_each_record: F,
293	) -> io::Result<()>
294	where
295	Self: Sized,
296	F: FnMut(&[u8]) -> io::Result<bool>,
297	{
298	let mut bytes = vec![];
299	let mut res = Ok(());
300	let mut consumed = `0`;
301	'outer: loop {
302	// Lend out complete record slices from our buffer
303	{
304	let mut buf = self.fill_buf()?;
305	while let Some(index) = buf.find_byte(terminator) {
306	let (record, rest) = buf.split_at(index + `1`);
307	buf = rest;
308	consumed += record.len();
309	match for_each_record(&record) {
310	Ok(`false`) => break 'outer,
311	Err(err) => {
312	res = Err(err);
313	break 'outer;
314	}
315	_ => (),
316	}
317	}
318
319	// Copy the final record fragment to our local buffer. This
320	// saves read_until() from re-scanning a buffer we know
321	// contains no remaining terminators.
322	bytes.extend_from_slice(&buf);
323	consumed += buf.len();
324	}
325
326	self.consume(consumed);
327	consumed = `0`;
328
329	// N.B. read_until uses a different version of memchr that may
330	// be slower than the memchr crate that bstr uses. However, this
331	// should only run for a fairly small number of records, assuming a
332	// decent buffer size.
333	self.read_until(terminator, &mut bytes)?;
334	if bytes.is_empty() \|\| !for_each_record(&bytes)? {
335	break;
336	}
337	bytes.clear();
338	}
339	self.consume(consumed);
340	res
341	}
342	}
343
344	impl<B: io::BufRead> BufReadExt for B {}
345
/// Iterator adapter that yields the lines of a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// as byte strings.
///
/// Values of this type are normally obtained from the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    // The wrapped reader that lines are pulled from.
    buf: B,
}
358
/// Iterator adapter that yields the records of a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// as byte strings.
///
/// A byte record is a run of bytes ending in a terminator byte picked by the
/// caller. NUL separated byte strings, for instance, are NUL-terminated byte
/// records.
///
/// Values of this type are normally obtained from the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    // The wrapped reader that records are pulled from.
    buf: B,
    // The byte that marks the end of each record.
    terminator: u8,
}
376
377	impl<B: io::BufRead> Iterator for ByteLines<B> {
378	type Item = io::Result<Vec<u8>>;
379
380	fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
381	let mut bytes: Vec = vec![];
382	match self.buf.read_until(byte:b'`\n`', &mut bytes) {
383	Err(e: Error) => Some(Err(e)),
384	Ok(`0`) => None,
385	Ok(_) => {
386	trim_line(&mut bytes);
387	Some(Ok(bytes))
388	}
389	}
390	}
391	}
392
393	impl<B: io::BufRead> Iterator for ByteRecords<B> {
394	type Item = io::Result<Vec<u8>>;
395
396	fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
397	let mut bytes: Vec = vec![];
398	match self.buf.read_until(self.terminator, &mut bytes) {
399	Err(e: Error) => Some(Err(e)),
400	Ok(`0`) => None,
401	Ok(_) => {
402	trim_record(&mut bytes, self.terminator);
403	Some(Ok(bytes))
404	}
405	}
406	}
407	}
408
/// Removes one trailing line terminator from `line` in place.
///
/// A final `\n` is dropped, and if the byte before it is `\r` that is dropped
/// as well. A `\r` that is not followed by `\n` is left untouched.
fn trim_line(line: &mut Vec<u8>) {
    if line.last() == Some(&b'\n') {
        line.pop();
        // Only treat `\r` as part of the terminator when it preceded `\n`.
        if line.last() == Some(&b'\r') {
            line.pop();
        }
    }
}
417
/// Returns `line` without its trailing line terminator.
///
/// A final `\n` is dropped, and if the byte before it is `\r` that is dropped
/// as well. A `\r` that is not followed by `\n` is left untouched.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.strip_suffix(b"\n") {
        // No `\n` at the end: nothing to trim, even if the slice ends in `\r`.
        None => line,
        Some(rest) => rest.strip_suffix(b"\r").unwrap_or(rest),
    }
}
427
428	fn trim_record(record: &mut Vec<u8>, terminator: u8) {
429	if record.last_byte() == Some(terminator) {
430	record.pop_byte();
431	}
432	}
433
/// Returns `record` without its final byte when that byte equals
/// `terminator`; otherwise returns `record` unchanged.
///
/// At most one terminator byte is removed.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    record.strip_suffix(&[terminator]).unwrap_or(record)
}
440
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::bstring::BString;

    use super::BufReadExt;

    /// Collects every line of `slice` via `for_byte_line`, i.e. with line
    /// terminators stripped.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Collects every line of `slice` via `for_byte_line_with_terminator`,
    /// i.e. with line terminators kept.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line_with_terminator(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    #[test]
    fn lines_without_terminator() {
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A lone `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A lone `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}
516