io.rs source code [crates/bstr/src/io.rs]

/*!
Utilities for working with I/O using byte strings.

This module currently only exports a single trait, `BufReadExt`, which provides
facilities for conveniently and efficiently working with lines as byte strings.

More APIs may be added in the future.
*/
9
10	use alloc::{vec, vec::Vec};
11
12	use std::io;
13
14	use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
15
16	/// An extension trait for
17	/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
18	/// which provides convenience APIs for dealing with byte strings.
19	pub trait BufReadExt: io::BufRead {
20	/// Returns an iterator over the lines of this reader, where each line
21	/// is represented as a byte string.
22	///
23	/// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
24	/// an error is yielded if there was a problem reading from the underlying
25	/// reader.
26	///
27	/// On success, the next line in the iterator is returned. The line does
28	/// not* contain a trailing `\n` or `\r\n`.*
29	///
30	/// # Examples
31	///
32	/// Basic usage:
33	///
34	/// ```
35	/// use std::io;
36	///
37	/// use bstr::io::BufReadExt;
38	///
39	/// # fn example() -> Result<(), io::Error> {
40	/// let mut cursor = io::Cursor::new(b"lorem`\n`ipsum`\r\n`dolor");
41	///
42	/// let mut lines = vec![];
43	/// for result in cursor.byte_lines() {
44	/// let line = result?;
45	/// lines.push(line);
46	/// }
47	/// assert_eq!(lines.len(), `3`);
48	/// assert_eq!(lines[`0`], "lorem".as_bytes());
49	/// assert_eq!(lines[`1`], "ipsum".as_bytes());
50	/// assert_eq!(lines[`2`], "dolor".as_bytes());
51	/// # Ok(()) }; example().unwrap()
52	/// ```
53	fn byte_lines(self) -> ByteLines<Self>
54	where
55	Self: Sized,
56	{
57	ByteLines { buf: self }
58	}
59
60	/// Returns an iterator over byte-terminated records of this reader, where
61	/// each record is represented as a byte string.
62	///
63	/// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
64	/// an error is yielded if there was a problem reading from the underlying
65	/// reader.
66	///
67	/// On success, the next record in the iterator is returned. The record
68	/// does not* contain its trailing terminator.*
69	///
70	/// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
71	/// that it has no special handling for `\r`.
72	///
73	/// # Examples
74	///
75	/// Basic usage:
76	///
77	/// ```
78	/// use std::io;
79	///
80	/// use bstr::io::BufReadExt;
81	///
82	/// # fn example() -> Result<(), io::Error> {
83	/// let mut cursor = io::Cursor::new(b"lorem`\x00`ipsum`\x00`dolor");
84	///
85	/// let mut records = vec![];
86	/// for result in cursor.byte_records(b'`\x00`') {
87	/// let record = result?;
88	/// records.push(record);
89	/// }
90	/// assert_eq!(records.len(), `3`);
91	/// assert_eq!(records[`0`], "lorem".as_bytes());
92	/// assert_eq!(records[`1`], "ipsum".as_bytes());
93	/// assert_eq!(records[`2`], "dolor".as_bytes());
94	/// # Ok(()) }; example().unwrap()
95	/// ```
96	fn byte_records(self, terminator: u8) -> ByteRecords<Self>
97	where
98	Self: Sized,
99	{
100	ByteRecords { terminator, buf: self }
101	}
102
103	/// Executes the given closure on each line in the underlying reader.
104	///
105	/// If the closure returns an error (or if the underlying reader returns an
106	/// error), then iteration is stopped and the error is returned. If false
107	/// is returned, then iteration is stopped and no error is returned.
108	///
109	/// The closure given is called on exactly the same values as yielded by
110	/// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
111	/// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
112	///
113	/// This routine is useful for iterating over lines as quickly as
114	/// possible. Namely, a single allocation is reused for each line.
115	///
116	/// # Examples
117	///
118	/// Basic usage:
119	///
120	/// ```
121	/// use std::io;
122	///
123	/// use bstr::io::BufReadExt;
124	///
125	/// # fn example() -> Result<(), io::Error> {
126	/// let mut cursor = io::Cursor::new(b"lorem`\n`ipsum`\r\n`dolor");
127	///
128	/// let mut lines = vec![];
129	/// cursor.for_byte_line(\|line\| {
130	/// lines.push(line.to_vec());
131	/// Ok(`true`)
132	/// })?;
133	/// assert_eq!(lines.len(), `3`);
134	/// assert_eq!(lines[`0`], "lorem".as_bytes());
135	/// assert_eq!(lines[`1`], "ipsum".as_bytes());
136	/// assert_eq!(lines[`2`], "dolor".as_bytes());
137	/// # Ok(()) }; example().unwrap()
138	/// ```
139	fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
140	where
141	Self: Sized,
142	F: FnMut(&[u8]) -> io::Result<bool>,
143	{
144	self.for_byte_line_with_terminator(\|line\| {
145	for_each_line(trim_line_slice(line))
146	})
147	}
148
149	/// Executes the given closure on each byte-terminated record in the
150	/// underlying reader.
151	///
152	/// If the closure returns an error (or if the underlying reader returns an
153	/// error), then iteration is stopped and the error is returned. If false
154	/// is returned, then iteration is stopped and no error is returned.
155	///
156	/// The closure given is called on exactly the same values as yielded by
157	/// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
158	/// iterator. Namely, records do _not_ contain a trailing terminator byte.
159	///
160	/// This routine is useful for iterating over records as quickly as
161	/// possible. Namely, a single allocation is reused for each record.
162	///
163	/// # Examples
164	///
165	/// Basic usage:
166	///
167	/// ```
168	/// use std::io;
169	///
170	/// use bstr::io::BufReadExt;
171	///
172	/// # fn example() -> Result<(), io::Error> {
173	/// let mut cursor = io::Cursor::new(b"lorem`\x00`ipsum`\x00`dolor");
174	///
175	/// let mut records = vec![];
176	/// cursor.for_byte_record(b'`\x00`', \|record\| {
177	/// records.push(record.to_vec());
178	/// Ok(`true`)
179	/// })?;
180	/// assert_eq!(records.len(), `3`);
181	/// assert_eq!(records[`0`], "lorem".as_bytes());
182	/// assert_eq!(records[`1`], "ipsum".as_bytes());
183	/// assert_eq!(records[`2`], "dolor".as_bytes());
184	/// # Ok(()) }; example().unwrap()
185	/// ```
186	fn for_byte_record<F>(
187	&mut self,
188	terminator: u8,
189	mut for_each_record: F,
190	) -> io::Result<()>
191	where
192	Self: Sized,
193	F: FnMut(&[u8]) -> io::Result<bool>,
194	{
195	self.for_byte_record_with_terminator(terminator, \|chunk\| {
196	for_each_record(trim_record_slice(chunk, terminator))
197	})
198	}
199
200	/// Executes the given closure on each line in the underlying reader.
201	///
202	/// If the closure returns an error (or if the underlying reader returns an
203	/// error), then iteration is stopped and the error is returned. If false
204	/// is returned, then iteration is stopped and no error is returned.
205	///
206	/// Unlike
207	/// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
208	/// the lines given to the closure do* include the line terminator, if one*
209	/// exists.
210	///
211	/// This routine is useful for iterating over lines as quickly as
212	/// possible. Namely, a single allocation is reused for each line.
213	///
214	/// This is identical to `for_byte_record_with_terminator` with a
215	/// terminator of `\n`.
216	///
217	/// # Examples
218	///
219	/// Basic usage:
220	///
221	/// ```
222	/// use std::io;
223	///
224	/// use bstr::io::BufReadExt;
225	///
226	/// # fn example() -> Result<(), io::Error> {
227	/// let mut cursor = io::Cursor::new(b"lorem`\n`ipsum`\r\n`dolor");
228	///
229	/// let mut lines = vec![];
230	/// cursor.for_byte_line_with_terminator(\|line\| {
231	/// lines.push(line.to_vec());
232	/// Ok(`true`)
233	/// })?;
234	/// assert_eq!(lines.len(), `3`);
235	/// assert_eq!(lines[`0`], "lorem`\n`".as_bytes());
236	/// assert_eq!(lines[`1`], "ipsum`\r\n`".as_bytes());
237	/// assert_eq!(lines[`2`], "dolor".as_bytes());
238	/// # Ok(()) }; example().unwrap()
239	/// ```
240	fn for_byte_line_with_terminator<F>(
241	&mut self,
242	for_each_line: F,
243	) -> io::Result<()>
244	where
245	Self: Sized,
246	F: FnMut(&[u8]) -> io::Result<bool>,
247	{
248	self.for_byte_record_with_terminator(b'`\n`', for_each_line)
249	}
250
251	/// Executes the given closure on each byte-terminated record in the
252	/// underlying reader.
253	///
254	/// If the closure returns an error (or if the underlying reader returns an
255	/// error), then iteration is stopped and the error is returned. If false
256	/// is returned, then iteration is stopped and no error is returned.
257	///
258	/// Unlike
259	/// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
260	/// the lines given to the closure do* include the record terminator, if*
261	/// one exists.
262	///
263	/// This routine is useful for iterating over records as quickly as
264	/// possible. Namely, a single allocation is reused for each record.
265	///
266	/// # Examples
267	///
268	/// Basic usage:
269	///
270	/// ```
271	/// use std::io;
272	///
273	/// use bstr::{io::BufReadExt, B};
274	///
275	/// # fn example() -> Result<(), io::Error> {
276	/// let mut cursor = io::Cursor::new(b"lorem`\x00`ipsum`\x00`dolor");
277	///
278	/// let mut records = vec![];
279	/// cursor.for_byte_record_with_terminator(b'`\x00`', \|record\| {
280	/// records.push(record.to_vec());
281	/// Ok(`true`)
282	/// })?;
283	/// assert_eq!(records.len(), `3`);
284	/// assert_eq!(records[`0`], B(b"lorem`\x00`"));
285	/// assert_eq!(records[`1`], B("ipsum`\x00`"));
286	/// assert_eq!(records[`2`], B("dolor"));
287	/// # Ok(()) }; example().unwrap()
288	/// ```
289	fn for_byte_record_with_terminator<F>(
290	&mut self,
291	terminator: u8,
292	mut for_each_record: F,
293	) -> io::Result<()>
294	where
295	Self: Sized,
296	F: FnMut(&[u8]) -> io::Result<bool>,
297	{
298	let mut bytes = vec![];
299	let mut res = Ok(());
300	let mut consumed = `0`;
301	'outer: loop {
302	// Lend out complete record slices from our buffer
303	{
304	let mut buf = self.fill_buf()?;
305	if buf.is_empty() {
306	break;
307	}
308	while let Some(index) = buf.find_byte(terminator) {
309	let (record, rest) = buf.split_at(index + `1`);
310	buf = rest;
311	consumed += record.len();
312	match for_each_record(record) {
313	Ok(`false`) => break 'outer,
314	Err(err) => {
315	res = Err(err);
316	break 'outer;
317	}
318	_ => (),
319	}
320	}
321
322	// Copy the final record fragment to our local buffer. This
323	// saves read_until() from re-scanning a buffer we know
324	// contains no remaining terminators.
325	bytes.extend_from_slice(buf);
326	consumed += buf.len();
327	}
328
329	self.consume(consumed);
330	consumed = `0`;
331
332	// N.B. read_until uses a different version of memchr that may
333	// be slower than the memchr crate that bstr uses. However, this
334	// should only run for a fairly small number of records, assuming a
335	// decent buffer size.
336	self.read_until(terminator, &mut bytes)?;
337	if bytes.is_empty() \|\| !for_each_record(&bytes)? {
338	break;
339	}
340	bytes.clear();
341	}
342	self.consume(consumed);
343	res
344	}
345	}
346
347	impl<B: io::BufRead> BufReadExt for B {}
348
/// An iterator that yields the lines of a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// as byte strings.
///
/// Values of this type are normally obtained from the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    // The wrapped buffered reader that lines are pulled from.
    buf: B,
}
361
/// An iterator that yields byte-terminated records of a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// as byte strings.
///
/// A byte record is any run of bytes that ends with a terminator byte chosen
/// by the caller. NUL separated byte strings, for instance, are NUL-terminated
/// byte records.
///
/// Values of this type are normally obtained from the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    // The wrapped buffered reader that records are pulled from.
    buf: B,
    // The byte that marks the end of each record.
    terminator: u8,
}
379
380	impl<B: io::BufRead> Iterator for ByteLines<B> {
381	type Item = io::Result<Vec<u8>>;
382
383	fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
384	let mut bytes: Vec = vec![];
385	match self.buf.read_until(byte:b'`\n`', &mut bytes) {
386	Err(e: Error) => Some(Err(e)),
387	Ok(`0`) => None,
388	Ok(_) => {
389	trim_line(&mut bytes);
390	Some(Ok(bytes))
391	}
392	}
393	}
394	}
395
396	impl<B: io::BufRead> Iterator for ByteRecords<B> {
397	type Item = io::Result<Vec<u8>>;
398
399	fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
400	let mut bytes: Vec = vec![];
401	match self.buf.read_until(self.terminator, &mut bytes) {
402	Err(e: Error) => Some(Err(e)),
403	Ok(`0`) => None,
404	Ok(_) => {
405	trim_record(&mut bytes, self.terminator);
406	Some(Ok(bytes))
407	}
408	}
409	}
410	}
411
/// Removes one trailing line terminator (`\n` or `\r\n`) from `line` in place.
///
/// A `\r` is only removed when it directly precedes a removed `\n`; a line
/// that ends in a bare `\r` is left untouched.
fn trim_line(line: &mut Vec<u8>) {
    if line.ends_with(b"\n") {
        line.pop();
        if line.ends_with(b"\r") {
            line.pop();
        }
    }
}
420
/// Returns `line` without one trailing line terminator (`\n` or `\r\n`).
///
/// A `\r` is only stripped when it directly precedes a stripped `\n`; a line
/// that ends in a bare `\r` is returned unchanged.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.strip_suffix(b"\n") {
        None => line,
        Some(head) => head.strip_suffix(b"\r").unwrap_or(head),
    }
}
430
431	fn trim_record(record: &mut Vec<u8>, terminator: u8) {
432	if record.last_byte() == Some(terminator) {
433	record.pop_byte();
434	}
435	}
436
/// Returns `record` without its final byte when that byte equals
/// `terminator`; otherwise returns `record` unchanged.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    match record.split_last() {
        Some((&last, head)) if last == terminator => head,
        _ => record,
    }
}
443
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    use crate::bstring::BString;

    use super::BufReadExt;

    /// Collects every line passed to the closure by `for_byte_line`
    /// (terminators stripped).
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Collects every line passed to the closure by
    /// `for_byte_line_with_terminator` (terminators retained).
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line_with_terminator(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    #[test]
    fn lines_without_terminator() {
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}
521