1 | /*! |
2 | Utilities for working with I/O using byte strings. |
3 | |
4 | This module currently only exports a single trait, `BufReadExt`, which provides |
5 | facilities for conveniently and efficiently working with lines as byte strings. |
6 | |
7 | More APIs may be added in the future. |
8 | */ |
9 | |
10 | use alloc::{vec, vec::Vec}; |
11 | |
12 | use std::io; |
13 | |
14 | use crate::{ext_slice::ByteSlice, ext_vec::ByteVec}; |
15 | |
16 | /// An extension trait for |
17 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) |
18 | /// which provides convenience APIs for dealing with byte strings. |
19 | pub trait BufReadExt: io::BufRead { |
20 | /// Returns an iterator over the lines of this reader, where each line |
21 | /// is represented as a byte string. |
22 | /// |
23 | /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where |
24 | /// an error is yielded if there was a problem reading from the underlying |
25 | /// reader. |
26 | /// |
27 | /// On success, the next line in the iterator is returned. The line does |
28 | /// *not* contain a trailing `\n` or `\r\n`. |
29 | /// |
30 | /// # Examples |
31 | /// |
32 | /// Basic usage: |
33 | /// |
34 | /// ``` |
35 | /// use std::io; |
36 | /// |
37 | /// use bstr::io::BufReadExt; |
38 | /// |
39 | /// # fn example() -> Result<(), io::Error> { |
40 | /// let mut cursor = io::Cursor::new(b"lorem \nipsum \r\ndolor" ); |
41 | /// |
42 | /// let mut lines = vec![]; |
43 | /// for result in cursor.byte_lines() { |
44 | /// let line = result?; |
45 | /// lines.push(line); |
46 | /// } |
47 | /// assert_eq!(lines.len(), 3); |
48 | /// assert_eq!(lines[0], "lorem" .as_bytes()); |
49 | /// assert_eq!(lines[1], "ipsum" .as_bytes()); |
50 | /// assert_eq!(lines[2], "dolor" .as_bytes()); |
51 | /// # Ok(()) }; example().unwrap() |
52 | /// ``` |
53 | fn byte_lines(self) -> ByteLines<Self> |
54 | where |
55 | Self: Sized, |
56 | { |
57 | ByteLines { buf: self } |
58 | } |
59 | |
60 | /// Returns an iterator over byte-terminated records of this reader, where |
61 | /// each record is represented as a byte string. |
62 | /// |
63 | /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where |
64 | /// an error is yielded if there was a problem reading from the underlying |
65 | /// reader. |
66 | /// |
67 | /// On success, the next record in the iterator is returned. The record |
68 | /// does *not* contain its trailing terminator. |
69 | /// |
70 | /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in |
71 | /// that it has no special handling for `\r`. |
72 | /// |
73 | /// # Examples |
74 | /// |
75 | /// Basic usage: |
76 | /// |
77 | /// ``` |
78 | /// use std::io; |
79 | /// |
80 | /// use bstr::io::BufReadExt; |
81 | /// |
82 | /// # fn example() -> Result<(), io::Error> { |
83 | /// let mut cursor = io::Cursor::new(b"lorem \x00ipsum \x00dolor" ); |
84 | /// |
85 | /// let mut records = vec![]; |
86 | /// for result in cursor.byte_records(b' \x00' ) { |
87 | /// let record = result?; |
88 | /// records.push(record); |
89 | /// } |
90 | /// assert_eq!(records.len(), 3); |
91 | /// assert_eq!(records[0], "lorem" .as_bytes()); |
92 | /// assert_eq!(records[1], "ipsum" .as_bytes()); |
93 | /// assert_eq!(records[2], "dolor" .as_bytes()); |
94 | /// # Ok(()) }; example().unwrap() |
95 | /// ``` |
96 | fn byte_records(self, terminator: u8) -> ByteRecords<Self> |
97 | where |
98 | Self: Sized, |
99 | { |
100 | ByteRecords { terminator, buf: self } |
101 | } |
102 | |
103 | /// Executes the given closure on each line in the underlying reader. |
104 | /// |
105 | /// If the closure returns an error (or if the underlying reader returns an |
106 | /// error), then iteration is stopped and the error is returned. If false |
107 | /// is returned, then iteration is stopped and no error is returned. |
108 | /// |
109 | /// The closure given is called on exactly the same values as yielded by |
110 | /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines) |
111 | /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes. |
112 | /// |
113 | /// This routine is useful for iterating over lines as quickly as |
114 | /// possible. Namely, a single allocation is reused for each line. |
115 | /// |
116 | /// # Examples |
117 | /// |
118 | /// Basic usage: |
119 | /// |
120 | /// ``` |
121 | /// use std::io; |
122 | /// |
123 | /// use bstr::io::BufReadExt; |
124 | /// |
125 | /// # fn example() -> Result<(), io::Error> { |
126 | /// let mut cursor = io::Cursor::new(b"lorem \nipsum \r\ndolor" ); |
127 | /// |
128 | /// let mut lines = vec![]; |
129 | /// cursor.for_byte_line(|line| { |
130 | /// lines.push(line.to_vec()); |
131 | /// Ok(true) |
132 | /// })?; |
133 | /// assert_eq!(lines.len(), 3); |
134 | /// assert_eq!(lines[0], "lorem" .as_bytes()); |
135 | /// assert_eq!(lines[1], "ipsum" .as_bytes()); |
136 | /// assert_eq!(lines[2], "dolor" .as_bytes()); |
137 | /// # Ok(()) }; example().unwrap() |
138 | /// ``` |
139 | fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()> |
140 | where |
141 | Self: Sized, |
142 | F: FnMut(&[u8]) -> io::Result<bool>, |
143 | { |
144 | self.for_byte_line_with_terminator(|line| { |
145 | for_each_line(&trim_line_slice(&line)) |
146 | }) |
147 | } |
148 | |
149 | /// Executes the given closure on each byte-terminated record in the |
150 | /// underlying reader. |
151 | /// |
152 | /// If the closure returns an error (or if the underlying reader returns an |
153 | /// error), then iteration is stopped and the error is returned. If false |
154 | /// is returned, then iteration is stopped and no error is returned. |
155 | /// |
156 | /// The closure given is called on exactly the same values as yielded by |
157 | /// the [`byte_records`](trait.BufReadExt.html#method.byte_records) |
158 | /// iterator. Namely, records do _not_ contain a trailing terminator byte. |
159 | /// |
160 | /// This routine is useful for iterating over records as quickly as |
161 | /// possible. Namely, a single allocation is reused for each record. |
162 | /// |
163 | /// # Examples |
164 | /// |
165 | /// Basic usage: |
166 | /// |
167 | /// ``` |
168 | /// use std::io; |
169 | /// |
170 | /// use bstr::io::BufReadExt; |
171 | /// |
172 | /// # fn example() -> Result<(), io::Error> { |
173 | /// let mut cursor = io::Cursor::new(b"lorem \x00ipsum \x00dolor" ); |
174 | /// |
175 | /// let mut records = vec![]; |
176 | /// cursor.for_byte_record(b' \x00' , |record| { |
177 | /// records.push(record.to_vec()); |
178 | /// Ok(true) |
179 | /// })?; |
180 | /// assert_eq!(records.len(), 3); |
181 | /// assert_eq!(records[0], "lorem" .as_bytes()); |
182 | /// assert_eq!(records[1], "ipsum" .as_bytes()); |
183 | /// assert_eq!(records[2], "dolor" .as_bytes()); |
184 | /// # Ok(()) }; example().unwrap() |
185 | /// ``` |
186 | fn for_byte_record<F>( |
187 | &mut self, |
188 | terminator: u8, |
189 | mut for_each_record: F, |
190 | ) -> io::Result<()> |
191 | where |
192 | Self: Sized, |
193 | F: FnMut(&[u8]) -> io::Result<bool>, |
194 | { |
195 | self.for_byte_record_with_terminator(terminator, |chunk| { |
196 | for_each_record(&trim_record_slice(&chunk, terminator)) |
197 | }) |
198 | } |
199 | |
200 | /// Executes the given closure on each line in the underlying reader. |
201 | /// |
202 | /// If the closure returns an error (or if the underlying reader returns an |
203 | /// error), then iteration is stopped and the error is returned. If false |
204 | /// is returned, then iteration is stopped and no error is returned. |
205 | /// |
206 | /// Unlike |
207 | /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line), |
208 | /// the lines given to the closure *do* include the line terminator, if one |
209 | /// exists. |
210 | /// |
211 | /// This routine is useful for iterating over lines as quickly as |
212 | /// possible. Namely, a single allocation is reused for each line. |
213 | /// |
214 | /// This is identical to `for_byte_record_with_terminator` with a |
215 | /// terminator of `\n`. |
216 | /// |
217 | /// # Examples |
218 | /// |
219 | /// Basic usage: |
220 | /// |
221 | /// ``` |
222 | /// use std::io; |
223 | /// |
224 | /// use bstr::io::BufReadExt; |
225 | /// |
226 | /// # fn example() -> Result<(), io::Error> { |
227 | /// let mut cursor = io::Cursor::new(b"lorem \nipsum \r\ndolor" ); |
228 | /// |
229 | /// let mut lines = vec![]; |
230 | /// cursor.for_byte_line_with_terminator(|line| { |
231 | /// lines.push(line.to_vec()); |
232 | /// Ok(true) |
233 | /// })?; |
234 | /// assert_eq!(lines.len(), 3); |
235 | /// assert_eq!(lines[0], "lorem \n" .as_bytes()); |
236 | /// assert_eq!(lines[1], "ipsum \r\n" .as_bytes()); |
237 | /// assert_eq!(lines[2], "dolor" .as_bytes()); |
238 | /// # Ok(()) }; example().unwrap() |
239 | /// ``` |
240 | fn for_byte_line_with_terminator<F>( |
241 | &mut self, |
242 | for_each_line: F, |
243 | ) -> io::Result<()> |
244 | where |
245 | Self: Sized, |
246 | F: FnMut(&[u8]) -> io::Result<bool>, |
247 | { |
248 | self.for_byte_record_with_terminator(b' \n' , for_each_line) |
249 | } |
250 | |
251 | /// Executes the given closure on each byte-terminated record in the |
252 | /// underlying reader. |
253 | /// |
254 | /// If the closure returns an error (or if the underlying reader returns an |
255 | /// error), then iteration is stopped and the error is returned. If false |
256 | /// is returned, then iteration is stopped and no error is returned. |
257 | /// |
258 | /// Unlike |
259 | /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record), |
260 | /// the lines given to the closure *do* include the record terminator, if |
261 | /// one exists. |
262 | /// |
263 | /// This routine is useful for iterating over records as quickly as |
264 | /// possible. Namely, a single allocation is reused for each record. |
265 | /// |
266 | /// # Examples |
267 | /// |
268 | /// Basic usage: |
269 | /// |
270 | /// ``` |
271 | /// use std::io; |
272 | /// |
273 | /// use bstr::{io::BufReadExt, B}; |
274 | /// |
275 | /// # fn example() -> Result<(), io::Error> { |
276 | /// let mut cursor = io::Cursor::new(b"lorem \x00ipsum \x00dolor" ); |
277 | /// |
278 | /// let mut records = vec![]; |
279 | /// cursor.for_byte_record_with_terminator(b' \x00' , |record| { |
280 | /// records.push(record.to_vec()); |
281 | /// Ok(true) |
282 | /// })?; |
283 | /// assert_eq!(records.len(), 3); |
284 | /// assert_eq!(records[0], B(b"lorem \x00" )); |
285 | /// assert_eq!(records[1], B("ipsum \x00" )); |
286 | /// assert_eq!(records[2], B("dolor" )); |
287 | /// # Ok(()) }; example().unwrap() |
288 | /// ``` |
289 | fn for_byte_record_with_terminator<F>( |
290 | &mut self, |
291 | terminator: u8, |
292 | mut for_each_record: F, |
293 | ) -> io::Result<()> |
294 | where |
295 | Self: Sized, |
296 | F: FnMut(&[u8]) -> io::Result<bool>, |
297 | { |
298 | let mut bytes = vec![]; |
299 | let mut res = Ok(()); |
300 | let mut consumed = 0; |
301 | 'outer: loop { |
302 | // Lend out complete record slices from our buffer |
303 | { |
304 | let mut buf = self.fill_buf()?; |
305 | while let Some(index) = buf.find_byte(terminator) { |
306 | let (record, rest) = buf.split_at(index + 1); |
307 | buf = rest; |
308 | consumed += record.len(); |
309 | match for_each_record(&record) { |
310 | Ok(false) => break 'outer, |
311 | Err(err) => { |
312 | res = Err(err); |
313 | break 'outer; |
314 | } |
315 | _ => (), |
316 | } |
317 | } |
318 | |
319 | // Copy the final record fragment to our local buffer. This |
320 | // saves read_until() from re-scanning a buffer we know |
321 | // contains no remaining terminators. |
322 | bytes.extend_from_slice(&buf); |
323 | consumed += buf.len(); |
324 | } |
325 | |
326 | self.consume(consumed); |
327 | consumed = 0; |
328 | |
329 | // N.B. read_until uses a different version of memchr that may |
330 | // be slower than the memchr crate that bstr uses. However, this |
331 | // should only run for a fairly small number of records, assuming a |
332 | // decent buffer size. |
333 | self.read_until(terminator, &mut bytes)?; |
334 | if bytes.is_empty() || !for_each_record(&bytes)? { |
335 | break; |
336 | } |
337 | bytes.clear(); |
338 | } |
339 | self.consume(consumed); |
340 | res |
341 | } |
342 | } |
343 | |
344 | impl<B: io::BufRead> BufReadExt for B {} |
345 | |
/// A line-by-line iterator over a wrapped
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// reader.
///
/// Values of this type are normally obtained from the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait rather than being built directly.
#[derive(Debug)]
pub struct ByteLines<B> {
    // The underlying buffered reader that lines are pulled from.
    buf: B,
}
358 | |
/// A record-by-record iterator over a wrapped
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// reader.
///
/// A byte record is a run of bytes that ends with one specific terminator
/// byte picked by the caller. NUL separated byte strings, for instance, are
/// NUL-terminated byte records.
///
/// Values of this type are normally obtained from the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait rather than being built directly.
#[derive(Debug)]
pub struct ByteRecords<B> {
    // The underlying buffered reader that records are pulled from.
    buf: B,
    // The byte that marks the end of each record.
    terminator: u8,
}
376 | |
377 | impl<B: io::BufRead> Iterator for ByteLines<B> { |
378 | type Item = io::Result<Vec<u8>>; |
379 | |
380 | fn next(&mut self) -> Option<io::Result<Vec<u8>>> { |
381 | let mut bytes: Vec = vec![]; |
382 | match self.buf.read_until(byte:b' \n' , &mut bytes) { |
383 | Err(e: Error) => Some(Err(e)), |
384 | Ok(0) => None, |
385 | Ok(_) => { |
386 | trim_line(&mut bytes); |
387 | Some(Ok(bytes)) |
388 | } |
389 | } |
390 | } |
391 | } |
392 | |
393 | impl<B: io::BufRead> Iterator for ByteRecords<B> { |
394 | type Item = io::Result<Vec<u8>>; |
395 | |
396 | fn next(&mut self) -> Option<io::Result<Vec<u8>>> { |
397 | let mut bytes: Vec = vec![]; |
398 | match self.buf.read_until(self.terminator, &mut bytes) { |
399 | Err(e: Error) => Some(Err(e)), |
400 | Ok(0) => None, |
401 | Ok(_) => { |
402 | trim_record(&mut bytes, self.terminator); |
403 | Some(Ok(bytes)) |
404 | } |
405 | } |
406 | } |
407 | } |
408 | |
/// Removes one trailing line terminator (`\n` or `\r\n`) from `line`, in
/// place.
///
/// At most one terminator is removed. A trailing `\r` that is not followed
/// by `\n` is left untouched.
fn trim_line(line: &mut Vec<u8>) {
    if line.last() == Some(&b'\n') {
        line.pop();
        // Only treat `\r` as part of the terminator when it directly
        // preceded the `\n` that was just removed.
        if line.last() == Some(&b'\r') {
            line.pop();
        }
    }
}
417 | |
/// Returns `line` without one trailing line terminator (`\n` or `\r\n`).
///
/// At most one terminator is removed. A trailing `\r` that is not followed
/// by `\n` is kept, and input without a trailing `\n` is returned unchanged.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.strip_suffix(b"\n") {
        // Only treat `\r` as part of the terminator when it directly
        // preceded the `\n` that was just removed.
        Some(rest) => rest.strip_suffix(b"\r").unwrap_or(rest),
        None => line,
    }
}
427 | |
428 | fn trim_record(record: &mut Vec<u8>, terminator: u8) { |
429 | if record.last_byte() == Some(terminator) { |
430 | record.pop_byte(); |
431 | } |
432 | } |
433 | |
/// Returns `record` without its final byte when that byte equals
/// `terminator`. At most one byte is removed; otherwise `record` is returned
/// unchanged.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    match record.split_last() {
        Some((&last, head)) if last == terminator => head,
        _ => record,
    }
}
440 | |
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::bstring::BString;

    use super::BufReadExt;

    /// Collects every line of `slice` via `for_byte_line`, i.e. with the
    /// line terminators stripped.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Collects every line of `slice` via `for_byte_line_with_terminator`,
    /// i.e. with the line terminators kept.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line_with_terminator(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    #[test]
    fn lines_without_terminator() {
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        // `\n` terminated input.
        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        // `\r\n` terminated input.
        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A lone `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        // `\n` terminated input.
        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        // `\r\n` terminated input.
        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A lone `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}
516 | |