/*!
Utilities for working with I/O using byte strings.

This module currently exports a single trait, `BufReadExt` (along with the
iterator types its methods return), which provides facilities for conveniently
and efficiently working with lines and records as byte strings.

More APIs may be added in the future.
*/
9 | |
10 | use alloc::{vec, vec::Vec}; |
11 | |
12 | use std::io; |
13 | |
14 | use crate::{ext_slice::ByteSlice, ext_vec::ByteVec}; |
15 | |
16 | /// An extension trait for |
17 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) |
18 | /// which provides convenience APIs for dealing with byte strings. |
19 | pub trait BufReadExt: io::BufRead { |
20 | /// Returns an iterator over the lines of this reader, where each line |
21 | /// is represented as a byte string. |
22 | /// |
23 | /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where |
24 | /// an error is yielded if there was a problem reading from the underlying |
25 | /// reader. |
26 | /// |
27 | /// On success, the next line in the iterator is returned. The line does |
28 | /// *not* contain a trailing `\n` or `\r\n`. |
29 | /// |
30 | /// # Examples |
31 | /// |
32 | /// Basic usage: |
33 | /// |
34 | /// ``` |
35 | /// use std::io; |
36 | /// |
37 | /// use bstr::io::BufReadExt; |
38 | /// |
39 | /// # fn example() -> Result<(), io::Error> { |
40 | /// let mut cursor = io::Cursor::new(b"lorem \nipsum \r\ndolor" ); |
41 | /// |
42 | /// let mut lines = vec![]; |
43 | /// for result in cursor.byte_lines() { |
44 | /// let line = result?; |
45 | /// lines.push(line); |
46 | /// } |
47 | /// assert_eq!(lines.len(), 3); |
48 | /// assert_eq!(lines[0], "lorem" .as_bytes()); |
49 | /// assert_eq!(lines[1], "ipsum" .as_bytes()); |
50 | /// assert_eq!(lines[2], "dolor" .as_bytes()); |
51 | /// # Ok(()) }; example().unwrap() |
52 | /// ``` |
53 | fn byte_lines(self) -> ByteLines<Self> |
54 | where |
55 | Self: Sized, |
56 | { |
57 | ByteLines { buf: self } |
58 | } |
59 | |
60 | /// Returns an iterator over byte-terminated records of this reader, where |
61 | /// each record is represented as a byte string. |
62 | /// |
63 | /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where |
64 | /// an error is yielded if there was a problem reading from the underlying |
65 | /// reader. |
66 | /// |
67 | /// On success, the next record in the iterator is returned. The record |
68 | /// does *not* contain its trailing terminator. |
69 | /// |
70 | /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in |
71 | /// that it has no special handling for `\r`. |
72 | /// |
73 | /// # Examples |
74 | /// |
75 | /// Basic usage: |
76 | /// |
77 | /// ``` |
78 | /// use std::io; |
79 | /// |
80 | /// use bstr::io::BufReadExt; |
81 | /// |
82 | /// # fn example() -> Result<(), io::Error> { |
83 | /// let mut cursor = io::Cursor::new(b"lorem \x00ipsum \x00dolor" ); |
84 | /// |
85 | /// let mut records = vec![]; |
86 | /// for result in cursor.byte_records(b' \x00' ) { |
87 | /// let record = result?; |
88 | /// records.push(record); |
89 | /// } |
90 | /// assert_eq!(records.len(), 3); |
91 | /// assert_eq!(records[0], "lorem" .as_bytes()); |
92 | /// assert_eq!(records[1], "ipsum" .as_bytes()); |
93 | /// assert_eq!(records[2], "dolor" .as_bytes()); |
94 | /// # Ok(()) }; example().unwrap() |
95 | /// ``` |
96 | fn byte_records(self, terminator: u8) -> ByteRecords<Self> |
97 | where |
98 | Self: Sized, |
99 | { |
100 | ByteRecords { terminator, buf: self } |
101 | } |
102 | |
103 | /// Executes the given closure on each line in the underlying reader. |
104 | /// |
105 | /// If the closure returns an error (or if the underlying reader returns an |
106 | /// error), then iteration is stopped and the error is returned. If false |
107 | /// is returned, then iteration is stopped and no error is returned. |
108 | /// |
109 | /// The closure given is called on exactly the same values as yielded by |
110 | /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines) |
111 | /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes. |
112 | /// |
113 | /// This routine is useful for iterating over lines as quickly as |
114 | /// possible. Namely, a single allocation is reused for each line. |
115 | /// |
116 | /// # Examples |
117 | /// |
118 | /// Basic usage: |
119 | /// |
120 | /// ``` |
121 | /// use std::io; |
122 | /// |
123 | /// use bstr::io::BufReadExt; |
124 | /// |
125 | /// # fn example() -> Result<(), io::Error> { |
126 | /// let mut cursor = io::Cursor::new(b"lorem \nipsum \r\ndolor" ); |
127 | /// |
128 | /// let mut lines = vec![]; |
129 | /// cursor.for_byte_line(|line| { |
130 | /// lines.push(line.to_vec()); |
131 | /// Ok(true) |
132 | /// })?; |
133 | /// assert_eq!(lines.len(), 3); |
134 | /// assert_eq!(lines[0], "lorem" .as_bytes()); |
135 | /// assert_eq!(lines[1], "ipsum" .as_bytes()); |
136 | /// assert_eq!(lines[2], "dolor" .as_bytes()); |
137 | /// # Ok(()) }; example().unwrap() |
138 | /// ``` |
139 | fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()> |
140 | where |
141 | Self: Sized, |
142 | F: FnMut(&[u8]) -> io::Result<bool>, |
143 | { |
144 | self.for_byte_line_with_terminator(|line| { |
145 | for_each_line(trim_line_slice(line)) |
146 | }) |
147 | } |
148 | |
149 | /// Executes the given closure on each byte-terminated record in the |
150 | /// underlying reader. |
151 | /// |
152 | /// If the closure returns an error (or if the underlying reader returns an |
153 | /// error), then iteration is stopped and the error is returned. If false |
154 | /// is returned, then iteration is stopped and no error is returned. |
155 | /// |
156 | /// The closure given is called on exactly the same values as yielded by |
157 | /// the [`byte_records`](trait.BufReadExt.html#method.byte_records) |
158 | /// iterator. Namely, records do _not_ contain a trailing terminator byte. |
159 | /// |
160 | /// This routine is useful for iterating over records as quickly as |
161 | /// possible. Namely, a single allocation is reused for each record. |
162 | /// |
163 | /// # Examples |
164 | /// |
165 | /// Basic usage: |
166 | /// |
167 | /// ``` |
168 | /// use std::io; |
169 | /// |
170 | /// use bstr::io::BufReadExt; |
171 | /// |
172 | /// # fn example() -> Result<(), io::Error> { |
173 | /// let mut cursor = io::Cursor::new(b"lorem \x00ipsum \x00dolor" ); |
174 | /// |
175 | /// let mut records = vec![]; |
176 | /// cursor.for_byte_record(b' \x00' , |record| { |
177 | /// records.push(record.to_vec()); |
178 | /// Ok(true) |
179 | /// })?; |
180 | /// assert_eq!(records.len(), 3); |
181 | /// assert_eq!(records[0], "lorem" .as_bytes()); |
182 | /// assert_eq!(records[1], "ipsum" .as_bytes()); |
183 | /// assert_eq!(records[2], "dolor" .as_bytes()); |
184 | /// # Ok(()) }; example().unwrap() |
185 | /// ``` |
186 | fn for_byte_record<F>( |
187 | &mut self, |
188 | terminator: u8, |
189 | mut for_each_record: F, |
190 | ) -> io::Result<()> |
191 | where |
192 | Self: Sized, |
193 | F: FnMut(&[u8]) -> io::Result<bool>, |
194 | { |
195 | self.for_byte_record_with_terminator(terminator, |chunk| { |
196 | for_each_record(trim_record_slice(chunk, terminator)) |
197 | }) |
198 | } |
199 | |
200 | /// Executes the given closure on each line in the underlying reader. |
201 | /// |
202 | /// If the closure returns an error (or if the underlying reader returns an |
203 | /// error), then iteration is stopped and the error is returned. If false |
204 | /// is returned, then iteration is stopped and no error is returned. |
205 | /// |
206 | /// Unlike |
207 | /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line), |
208 | /// the lines given to the closure *do* include the line terminator, if one |
209 | /// exists. |
210 | /// |
211 | /// This routine is useful for iterating over lines as quickly as |
212 | /// possible. Namely, a single allocation is reused for each line. |
213 | /// |
214 | /// This is identical to `for_byte_record_with_terminator` with a |
215 | /// terminator of `\n`. |
216 | /// |
217 | /// # Examples |
218 | /// |
219 | /// Basic usage: |
220 | /// |
221 | /// ``` |
222 | /// use std::io; |
223 | /// |
224 | /// use bstr::io::BufReadExt; |
225 | /// |
226 | /// # fn example() -> Result<(), io::Error> { |
227 | /// let mut cursor = io::Cursor::new(b"lorem \nipsum \r\ndolor" ); |
228 | /// |
229 | /// let mut lines = vec![]; |
230 | /// cursor.for_byte_line_with_terminator(|line| { |
231 | /// lines.push(line.to_vec()); |
232 | /// Ok(true) |
233 | /// })?; |
234 | /// assert_eq!(lines.len(), 3); |
235 | /// assert_eq!(lines[0], "lorem \n" .as_bytes()); |
236 | /// assert_eq!(lines[1], "ipsum \r\n" .as_bytes()); |
237 | /// assert_eq!(lines[2], "dolor" .as_bytes()); |
238 | /// # Ok(()) }; example().unwrap() |
239 | /// ``` |
240 | fn for_byte_line_with_terminator<F>( |
241 | &mut self, |
242 | for_each_line: F, |
243 | ) -> io::Result<()> |
244 | where |
245 | Self: Sized, |
246 | F: FnMut(&[u8]) -> io::Result<bool>, |
247 | { |
248 | self.for_byte_record_with_terminator(b' \n' , for_each_line) |
249 | } |
250 | |
251 | /// Executes the given closure on each byte-terminated record in the |
252 | /// underlying reader. |
253 | /// |
254 | /// If the closure returns an error (or if the underlying reader returns an |
255 | /// error), then iteration is stopped and the error is returned. If false |
256 | /// is returned, then iteration is stopped and no error is returned. |
257 | /// |
258 | /// Unlike |
259 | /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record), |
260 | /// the lines given to the closure *do* include the record terminator, if |
261 | /// one exists. |
262 | /// |
263 | /// This routine is useful for iterating over records as quickly as |
264 | /// possible. Namely, a single allocation is reused for each record. |
265 | /// |
266 | /// # Examples |
267 | /// |
268 | /// Basic usage: |
269 | /// |
270 | /// ``` |
271 | /// use std::io; |
272 | /// |
273 | /// use bstr::{io::BufReadExt, B}; |
274 | /// |
275 | /// # fn example() -> Result<(), io::Error> { |
276 | /// let mut cursor = io::Cursor::new(b"lorem \x00ipsum \x00dolor" ); |
277 | /// |
278 | /// let mut records = vec![]; |
279 | /// cursor.for_byte_record_with_terminator(b' \x00' , |record| { |
280 | /// records.push(record.to_vec()); |
281 | /// Ok(true) |
282 | /// })?; |
283 | /// assert_eq!(records.len(), 3); |
284 | /// assert_eq!(records[0], B(b"lorem \x00" )); |
285 | /// assert_eq!(records[1], B("ipsum \x00" )); |
286 | /// assert_eq!(records[2], B("dolor" )); |
287 | /// # Ok(()) }; example().unwrap() |
288 | /// ``` |
289 | fn for_byte_record_with_terminator<F>( |
290 | &mut self, |
291 | terminator: u8, |
292 | mut for_each_record: F, |
293 | ) -> io::Result<()> |
294 | where |
295 | Self: Sized, |
296 | F: FnMut(&[u8]) -> io::Result<bool>, |
297 | { |
298 | let mut bytes = vec![]; |
299 | let mut res = Ok(()); |
300 | let mut consumed = 0; |
301 | 'outer: loop { |
302 | // Lend out complete record slices from our buffer |
303 | { |
304 | let mut buf = self.fill_buf()?; |
305 | if buf.is_empty() { |
306 | break; |
307 | } |
308 | while let Some(index) = buf.find_byte(terminator) { |
309 | let (record, rest) = buf.split_at(index + 1); |
310 | buf = rest; |
311 | consumed += record.len(); |
312 | match for_each_record(record) { |
313 | Ok(false) => break 'outer, |
314 | Err(err) => { |
315 | res = Err(err); |
316 | break 'outer; |
317 | } |
318 | _ => (), |
319 | } |
320 | } |
321 | |
322 | // Copy the final record fragment to our local buffer. This |
323 | // saves read_until() from re-scanning a buffer we know |
324 | // contains no remaining terminators. |
325 | bytes.extend_from_slice(buf); |
326 | consumed += buf.len(); |
327 | } |
328 | |
329 | self.consume(consumed); |
330 | consumed = 0; |
331 | |
332 | // N.B. read_until uses a different version of memchr that may |
333 | // be slower than the memchr crate that bstr uses. However, this |
334 | // should only run for a fairly small number of records, assuming a |
335 | // decent buffer size. |
336 | self.read_until(terminator, &mut bytes)?; |
337 | if bytes.is_empty() || !for_each_record(&bytes)? { |
338 | break; |
339 | } |
340 | bytes.clear(); |
341 | } |
342 | self.consume(consumed); |
343 | res |
344 | } |
345 | } |
346 | |
347 | impl<B: io::BufRead> BufReadExt for B {} |
348 | |
/// A line-by-line iterator over a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// implementation.
///
/// Values of this type are normally obtained through
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines), a method of the
/// [`BufReadExt`](trait.BufReadExt.html) trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    /// The wrapped reader that lines are pulled from.
    buf: B,
}
361 | |
/// A record-by-record iterator over a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// implementation.
///
/// A byte record is a run of bytes that ends with a terminator byte picked by
/// the caller. NUL separated byte strings, for instance, are NUL-terminated
/// byte records.
///
/// Values of this type are normally obtained through
/// [`byte_records`](trait.BufReadExt.html#method.byte_records), a method of
/// the [`BufReadExt`](trait.BufReadExt.html) trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    /// The wrapped reader that records are pulled from.
    buf: B,
    /// The byte that marks the end of each record.
    terminator: u8,
}
379 | |
380 | impl<B: io::BufRead> Iterator for ByteLines<B> { |
381 | type Item = io::Result<Vec<u8>>; |
382 | |
383 | fn next(&mut self) -> Option<io::Result<Vec<u8>>> { |
384 | let mut bytes: Vec = vec![]; |
385 | match self.buf.read_until(byte:b' \n' , &mut bytes) { |
386 | Err(e: Error) => Some(Err(e)), |
387 | Ok(0) => None, |
388 | Ok(_) => { |
389 | trim_line(&mut bytes); |
390 | Some(Ok(bytes)) |
391 | } |
392 | } |
393 | } |
394 | } |
395 | |
396 | impl<B: io::BufRead> Iterator for ByteRecords<B> { |
397 | type Item = io::Result<Vec<u8>>; |
398 | |
399 | fn next(&mut self) -> Option<io::Result<Vec<u8>>> { |
400 | let mut bytes: Vec = vec![]; |
401 | match self.buf.read_until(self.terminator, &mut bytes) { |
402 | Err(e: Error) => Some(Err(e)), |
403 | Ok(0) => None, |
404 | Ok(_) => { |
405 | trim_record(&mut bytes, self.terminator); |
406 | Some(Ok(bytes)) |
407 | } |
408 | } |
409 | } |
410 | } |
411 | |
/// Removes one trailing line terminator (`\n` or `\r\n`) from `line` in
/// place.
///
/// A `\r` is only removed when it immediately precedes a trailing `\n`; a
/// lone trailing `\r` is left untouched.
fn trim_line(line: &mut Vec<u8>) {
    if line.ends_with(b"\n") {
        line.pop();
        if line.ends_with(b"\r") {
            line.pop();
        }
    }
}
420 | |
/// Returns `line` without one trailing line terminator (`\n` or `\r\n`).
///
/// A `\r` is only removed when it immediately precedes a trailing `\n`; a
/// lone trailing `\r` is kept. If there is no trailing `\n`, the input is
/// returned unchanged.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.strip_suffix(b"\n") {
        Some(rest) => rest.strip_suffix(b"\r").unwrap_or(rest),
        None => line,
    }
}
430 | |
431 | fn trim_record(record: &mut Vec<u8>, terminator: u8) { |
432 | if record.last_byte() == Some(terminator) { |
433 | record.pop_byte(); |
434 | } |
435 | } |
436 | |
/// Returns `record` without its final byte when that byte equals
/// `terminator`; otherwise returns `record` unchanged.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    record.strip_suffix(&[terminator]).unwrap_or(record)
}
443 | |
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    use crate::bstring::BString;

    use super::BufReadExt;

    /// Collects every line of `slice` via `for_byte_line`, i.e. with line
    /// terminators stripped.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Collects every line of `slice` via `for_byte_line_with_terminator`,
    /// i.e. with line terminators kept.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line_with_terminator(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    #[test]
    fn lines_without_terminator() {
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}
521 | |