lib.rs source code [crates/blake3/src/lib.rs]

1	//! The official Rust implementation of the [BLAKE3] cryptographic hash
2	//! function.
3	//!
4	//! # Examples
5	//!
6	//! ```
7	//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
8	//! // Hash an input all at once.
9	//! let hash1 = blake3::hash(b"foobarbaz");
10	//!
11	//! // Hash an input incrementally.
12	//! let mut hasher = blake3::Hasher::new();
13	//! hasher.update(b"foo");
14	//! hasher.update(b"bar");
15	//! hasher.update(b"baz");
16	//! let hash2 = hasher.finalize();
17	//! assert_eq!(hash1, hash2);
18	//!
19	//! // Extended output. OutputReader also implements Read and Seek.
20	//! # #[cfg(feature = "std")] {
//! let mut output = [0; 1000];
//! let mut output_reader = hasher.finalize_xof();
//! output_reader.fill(&mut output);
//! assert_eq!(hash1, output[..32]);
25	//! # }
26	//!
27	//! // Print a hash as hex.
28	//! println!("{}", hash1);
29	//! # Ok(())
30	//! # }
31	//! ```
32	//!
33	//! # Cargo Features
34	//!
35	//! The `std` feature (the only feature enabled by default) is required for
36	//! implementations of the [`Write`] and [`Seek`] traits, the
37	//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU
38	//! feature detection on x86. If this feature is disabled, the only way to use
39	//! the x86 SIMD implementations is to enable the corresponding instruction sets
40	//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary
41	//! will not be portable to other machines.
42	//!
43	//! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds
44	//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap`
45	//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for
46	//! multithreaded hashing. However, even if this feature is enabled, all other
47	//! APIs remain single-threaded.
48	//!
49	//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the
50	//! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above)
51	//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for
52	//! memory-mapped IO.
53	//!
54	//! The `zeroize` feature (disabled by default, but enabled for [docs.rs])
55	//! implements
56	//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for
57	//! this crate's types.
58	//!
59	//! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements
60	//! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and
61	//! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html)
62	//! for [`Hash`](struct@Hash).
63	//!
64	//! The NEON implementation is enabled by default for AArch64 but requires the
65	//! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and
66	//! enabling this feature will produce a binary that's not portable to CPUs
67	//! without NEON support.
68	//!
69	//! The `traits-preview` feature enables implementations of traits from the
70	//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
71	//! However, the traits aren't stable, and they're expected to change in
72	//! incompatible ways before that crate reaches 1.0. For that reason, this crate
73	//! makes no SemVer guarantees for this feature, and callers who use it should
74	//! expect breaking changes between patch versions. (The "-preview" feature name
75	//! follows the conventions of the RustCrypto [`signature`] crate.)
76	//!
77	//! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon
78	//! [BLAKE3]: https://blake3.io
79	//! [Rayon]: https://github.com/rayon-rs/rayon
80	//! [docs.rs]: https://docs.rs/
81	//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html
82	//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html
83	//! [`digest`]: https://crates.io/crates/digest
84	//! [`signature`]: https://crates.io/crates/signature
85
86	#![cfg_attr(not(feature = "std"), no_std)]
87
88	#[cfg(test)]
89	mod test;
90
91	// The guts module is for incremental use cases like the `bao` crate that need
92	// to explicitly compute chunk and parent chaining values. It is semi-stable
93	// and likely to keep working, but largely undocumented and not intended for
94	// widespread use.
95	#[doc(hidden)]
96	pub mod guts;
97
98	/// Undocumented and unstable, for benchmarks only.
99	#[doc(hidden)]
100	pub mod platform;
101
102	// Platform-specific implementations of the compression function. These
103	// BLAKE3-specific cfg flags are set in build.rs.
104	#[cfg(blake3_avx2_rust)]
105	#[path = "rust_avx2.rs"]
106	mod avx2;
107	#[cfg(blake3_avx2_ffi)]
108	#[path = "ffi_avx2.rs"]
109	mod avx2;
110	#[cfg(blake3_avx512_ffi)]
111	#[path = "ffi_avx512.rs"]
112	mod avx512;
113	#[cfg(blake3_neon)]
114	#[path = "ffi_neon.rs"]
115	mod neon;
116	mod portable;
117	#[cfg(blake3_sse2_rust)]
118	#[path = "rust_sse2.rs"]
119	mod sse2;
120	#[cfg(blake3_sse2_ffi)]
121	#[path = "ffi_sse2.rs"]
122	mod sse2;
123	#[cfg(blake3_sse41_rust)]
124	#[path = "rust_sse41.rs"]
125	mod sse41;
126	#[cfg(blake3_sse41_ffi)]
127	#[path = "ffi_sse41.rs"]
128	mod sse41;
129
130	#[cfg(feature = "traits-preview")]
131	pub mod traits;
132
133	mod io;
134	mod join;
135
136	use arrayref::{array_mut_ref, array_ref};
137	use arrayvec::{ArrayString, ArrayVec};
138	use core::cmp;
139	use core::fmt;
140	use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};
141	#[cfg(feature = "zeroize")]
142	use zeroize::Zeroize;
143
/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
pub const OUT_LEN: usize = 32;

/// The number of bytes in a key, 32.
pub const KEY_LEN: usize = 32;

// Upper bound on the tree depth: 2^54 chunks of CHUNK_LEN bytes cover the
// full 2^64-byte input space.
const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
151	use guts::{BLOCK_LEN, CHUNK_LEN};
152
// While iterating the compression function within a chunk, the CV is
// represented as words, to avoid doing two extra endianness conversions for
// each compression in the portable implementation. But the hash_many interface
// needs to hash both input bytes and parent nodes, so it's better for its
// output CVs to be represented as bytes.
type CVWords = [u32; 8];
type CVBytes = [u8; 32]; // little-endian

// The default key words. The unkeyed `hash()` entry point passes these as the
// key; `keyed_hash()` substitutes the caller's key instead.
const IV: &CVWords = &[
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];
164
// Seven rows of message word indices. Every row is a permutation of 0..16:
// row 0 is the identity, and each later row is the previous row with the
// row-1 permutation applied again.
// NOTE(review): presumably one row per compression round, consumed by the
// portable implementation -- confirm against portable.rs.
const MSG_SCHEDULE: [[usize; 16]; 7] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
    [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
    [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
    [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
    [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
    [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
];
174
// These are the internal flags that we use to domain separate root/non-root,
// chunk/parent, and chunk beginning/middle/end. Each flag is a distinct bit
// of the `u8` flags value handed to the compression function; bits are
// allocated upward starting from bit 0.
const CHUNK_START: u8 = 1 << 0;
const CHUNK_END: u8 = 1 << 1;
const PARENT: u8 = 1 << 2;
const ROOT: u8 = 1 << 3;
const KEYED_HASH: u8 = 1 << 4;
const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
186
// The low 32 bits of a 64-bit counter.
#[inline]
fn counter_low(counter: u64) -> u32 {
    // Mask first so the narrowing cast below is visibly lossless.
    let low_bits = counter & 0xFFFF_FFFF;
    low_bits as u32
}
191
// The high 32 bits of a 64-bit counter.
#[inline]
fn counter_high(counter: u64) -> u32 {
    // After the shift only 32 significant bits remain, so the cast is lossless.
    (counter >> 32) as u32
}
196
/// An output of the default size, 32 bytes, which provides constant-time
/// equality checking.
///
/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides
/// [`from_bytes`] and [`as_bytes`] for explicit conversions between itself and
/// `[u8; 32]`. However, byte arrays and slices don't provide constant-time
/// equality checking, which is often a security requirement in software that
/// handles private data. `Hash` doesn't implement [`Deref`] or [`AsRef`], to
/// avoid situations where a type conversion happens implicitly and the
/// constant-time property is accidentally lost.
///
/// `Hash` provides the [`to_hex`] and [`from_hex`] methods for converting to
/// and from hexadecimal. It also implements [`Display`] and [`FromStr`].
///
/// When the `serde` Cargo feature is enabled, `Hash` additionally derives
/// `serde::Serialize` and `serde::Deserialize`.
///
/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html
/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html
/// [`as_bytes`]: #method.as_bytes
/// [`from_bytes`]: #method.from_bytes
/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html
/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html
/// [`to_hex`]: #method.to_hex
/// [`from_hex`]: #method.from_hex
/// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html
/// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
// `PartialEq`/`Eq` are deliberately not derived: they are implemented by hand
// elsewhere in this file so that comparisons go through `constant_time_eq`.
#[derive(Clone, Copy, Hash)]
pub struct Hash([u8; OUT_LEN]);
224
225	impl Hash {
226	/// The raw bytes of the `Hash`. Note that byte arrays don't provide
227	/// constant-time equality checking, so if you need to compare hashes,
228	/// prefer the `Hash` type.
229	#[inline]
230	pub const fn as_bytes(&self) -> &[u8; OUT_LEN] {
231	&self.0
232	}
233
234	/// Create a `Hash` from its raw bytes representation.
235	pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self {
236	Self(bytes)
237	}
238
239	/// Encode a `Hash` in lowercase hexadecimal.
240	///
241	/// The returned [`ArrayString`] is a fixed size and doesn't allocate memory
242	/// on the heap. Note that [`ArrayString`] doesn't provide constant-time
243	/// equality checking, so if you need to compare hashes, prefer the `Hash`
244	/// type.
245	///
246	/// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html
247	pub fn to_hex(&self) -> ArrayString<{ `2` * OUT_LEN }> {
248	let mut s = ArrayString::new();
249	let table = b"0123456789abcdef";
250	for &b in self.0.iter() {
251	s.push(table[(b >> `4`) as usize] as char);
252	s.push(table[(b & `0xf`) as usize] as char);
253	}
254	s
255	}
256
257	/// Decode a `Hash` from hexadecimal. Both uppercase and lowercase ASCII
258	/// bytes are supported.
259	///
260	/// Any byte outside the ranges `'0'...'9'`, `'a'...'f'`, and `'A'...'F'`
261	/// results in an error. An input length other than 64 also results in an
262	/// error.
263	///
264	/// Note that `Hash` also implements `FromStr`, so `Hash::from_hex("...")`
265	/// is equivalent to `"...".parse()`.
266	pub fn from_hex(hex: impl AsRef<[u8]>) -> Result<Self, HexError> {
267	fn hex_val(byte: u8) -> Result<u8, HexError> {
268	match byte {
269	b'A'..=b'F' => Ok(byte - b'A' + `10`),
270	b'a'..=b'f' => Ok(byte - b'a' + `10`),
271	b'0'..=b'9' => Ok(byte - b'0'),
272	_ => Err(HexError(HexErrorInner::InvalidByte(byte))),
273	}
274	}
275	let hex_bytes: &[u8] = hex.as_ref();
276	if hex_bytes.len() != OUT_LEN * `2` {
277	return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len())));
278	}
279	let mut hash_bytes: [u8; OUT_LEN] = [`0`; OUT_LEN];
280	for i in `0`..OUT_LEN {
281	hash_bytes[i] = `16` * hex_val(hex_bytes[`2` * i])? + hex_val(hex_bytes[`2` * i + `1`])?;
282	}
283	Ok(Hash::from(hash_bytes))
284	}
285	}
286
287	impl From<[u8; OUT_LEN]> for Hash {
288	#[inline]
289	fn from(bytes: [u8; OUT_LEN]) -> Self {
290	Self::from_bytes(bytes)
291	}
292	}
293
294	impl From<Hash> for [u8; OUT_LEN] {
295	#[inline]
296	fn from(hash: Hash) -> Self {
297	hash.0
298	}
299	}
300
301	impl core::str::FromStr for Hash {
302	type Err = HexError;
303
304	fn from_str(s: &str) -> Result<Self, Self::Err> {
305	Hash::from_hex(s)
306	}
307	}
308
#[cfg(feature = "zeroize")]
impl Zeroize for Hash {
    fn zeroize(&mut self) {
        // Destructure rather than using `.0`, so that a change to the shape
        // of `Hash` fails to compile here and prompts an update to this impl.
        let Hash(raw) = self;
        raw.zeroize();
    }
}
317
318	/// This implementation is constant-time.
319	impl PartialEq for Hash {
320	#[inline]
321	fn eq(&self, other: &Hash) -> bool {
322	constant_time_eq::constant_time_eq_32(&self.0, &other.0)
323	}
324	}
325
326	/// This implementation is constant-time.
327	impl PartialEq<[u8; OUT_LEN]> for Hash {
328	#[inline]
329	fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
330	constant_time_eq::constant_time_eq_32(&self.0, b:other)
331	}
332	}
333
334	/// This implementation is constant-time if the target is 32 bytes long.
335	impl PartialEq<[u8]> for Hash {
336	#[inline]
337	fn eq(&self, other: &[u8]) -> bool {
338	constant_time_eq::constant_time_eq(&self.0, b:other)
339	}
340	}
341
342	impl Eq for Hash {}
343
344	impl fmt::Display for Hash {
345	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
346	// Formatting field as `&str` to reduce code size since the `Debug`
347	// dynamic dispatch table for `&str` is likely needed elsewhere already,
348	// but that for `ArrayString<[u8; 64]>` is not.
349	let hex: ArrayString<_> = self.to_hex();
350	let hex: &str = hex.as_str();
351
352	f.write_str(data:hex)
353	}
354	}
355
356	impl fmt::Debug for Hash {
357	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
358	// Formatting field as `&str` to reduce code size since the `Debug`
359	// dynamic dispatch table for `&str` is likely needed elsewhere already,
360	// but that for `ArrayString<[u8; 64]>` is not.
361	let hex: ArrayString<_> = self.to_hex();
362	let hex: &str = hex.as_str();
363
364	f.debug_tuple(name:"Hash").field(&hex).finish()
365	}
366	}
367
/// The error type for [`Hash::from_hex`].
///
/// The `.to_string()` representation of this error currently distinguishes between bad length
/// errors and bad character errors. This is to help with logging and debugging, but it isn't a
/// stable API detail, and it may change at any time.
#[derive(Clone, Debug)]
pub struct HexError(HexErrorInner);

// Kept private so the specific failure cases are not part of the public API.
#[derive(Clone, Debug)]
enum HexErrorInner {
    // A byte that is not an ASCII hex digit.
    InvalidByte(u8),
    // An input whose length is not the expected number of hex digits.
    InvalidLen(usize),
}

impl fmt::Display for HexError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.0 {
            HexErrorInner::InvalidByte(byte) => {
                // ASCII bytes are shown as a quoted/escaped character;
                // anything else is shown as a raw hex value.
                if byte < 128 {
                    write!(f, "invalid hex character: {:?}", byte as char)
                } else {
                    write!(f, "invalid hex character: 0x{:x}", byte)
                }
            }
            HexErrorInner::InvalidLen(len) => {
                write!(f, "expected 64 hex bytes, received {}", len)
            }
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for HexError {}
401
402	// Each chunk or parent node can produce either a 32-byte chaining value or, by
403	// setting the ROOT flag, any number of final output bytes. The Output struct
404	// captures the state just prior to choosing between those two possibilities.
405	#[derive(Clone)]
406	struct Output {
407	input_chaining_value: CVWords,
408	block: [u8; `64`],
409	block_len: u8,
410	counter: u64,
411	flags: u8,
412	platform: Platform,
413	}
414
415	impl Output {
416	fn chaining_value(&self) -> CVBytes {
417	let mut cv = self.input_chaining_value;
418	self.platform.compress_in_place(
419	&mut cv,
420	&self.block,
421	self.block_len,
422	self.counter,
423	self.flags,
424	);
425	platform::le_bytes_from_words_32(&cv)
426	}
427
428	fn root_hash(&self) -> Hash {
429	debug_assert_eq!(self.counter, `0`);
430	let mut cv = self.input_chaining_value;
431	self.platform
432	.compress_in_place(&mut cv, &self.block, self.block_len, `0`, self.flags \| ROOT);
433	Hash(platform::le_bytes_from_words_32(&cv))
434	}
435
436	fn root_output_block(&self) -> [u8; `2` * OUT_LEN] {
437	self.platform.compress_xof(
438	&self.input_chaining_value,
439	&self.block,
440	self.block_len,
441	self.counter,
442	self.flags \| ROOT,
443	)
444	}
445	}
446
#[cfg(feature = "zeroize")]
impl Zeroize for Output {
    fn zeroize(&mut self) {
        // Exhaustive destructuring: adding a field to `Output` breaks the
        // build here, as a reminder to decide whether it must be wiped too.
        let Self {
            input_chaining_value: cv,
            block: last_block,
            block_len: last_block_len,
            counter: ctr,
            flags: flag_bits,
            platform: _,
        } = self;

        cv.zeroize();
        last_block.zeroize();
        last_block_len.zeroize();
        ctr.zeroize();
        flag_bits.zeroize();
    }
}
467
// Incremental hashing state for a single chunk.
#[derive(Clone)]
struct ChunkState {
    // Running chaining value; starts as the key and is updated in place by
    // every block compression.
    cv: CVWords,
    // Counter value passed to every compression of this chunk.
    chunk_counter: u64,
    // Input bytes that have not been compressed yet.
    buf: [u8; BLOCK_LEN],
    // Number of valid bytes at the front of `buf`.
    buf_len: u8,
    // Number of full blocks compressed so far; zero means the next
    // compression still gets the CHUNK_START flag.
    blocks_compressed: u8,
    // Base flags OR-ed into every compression of this chunk.
    flags: u8,
    platform: Platform,
}
478
479	impl ChunkState {
480	fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
481	Self {
482	cv: *key,
483	chunk_counter,
484	buf: [`0`; BLOCK_LEN],
485	buf_len: `0`,
486	blocks_compressed: `0`,
487	flags,
488	platform,
489	}
490	}
491
492	fn len(&self) -> usize {
493	BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
494	}
495
496	fn fill_buf(&mut self, input: &mut &[u8]) {
497	let want = BLOCK_LEN - self.buf_len as usize;
498	let take = cmp::min(want, input.len());
499	self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]);
500	self.buf_len += take as u8;
501	*input = &input[take..];
502	}
503
504	fn start_flag(&self) -> u8 {
505	if self.blocks_compressed == `0` {
506	CHUNK_START
507	} else {
508	`0`
509	}
510	}
511
512	// Try to avoid buffering as much as possible, by compressing directly from
513	// the input slice when full blocks are available.
514	fn update(&mut self, mut input: &[u8]) -> &mut Self {
515	if self.buf_len > `0` {
516	self.fill_buf(&mut input);
517	if !input.is_empty() {
518	debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
519	let block_flags = self.flags \| self.start_flag(); // borrowck
520	self.platform.compress_in_place(
521	&mut self.cv,
522	&self.buf,
523	BLOCK_LEN as u8,
524	self.chunk_counter,
525	block_flags,
526	);
527	self.buf_len = `0`;
528	self.buf = [`0`; BLOCK_LEN];
529	self.blocks_compressed += `1`;
530	}
531	}
532
533	while input.len() > BLOCK_LEN {
534	debug_assert_eq!(self.buf_len, `0`);
535	let block_flags = self.flags \| self.start_flag(); // borrowck
536	self.platform.compress_in_place(
537	&mut self.cv,
538	array_ref!(input, `0`, BLOCK_LEN),
539	BLOCK_LEN as u8,
540	self.chunk_counter,
541	block_flags,
542	);
543	self.blocks_compressed += `1`;
544	input = &input[BLOCK_LEN..];
545	}
546
547	self.fill_buf(&mut input);
548	debug_assert!(input.is_empty());
549	debug_assert!(self.len() <= CHUNK_LEN);
550	self
551	}
552
553	fn output(&self) -> Output {
554	let block_flags = self.flags \| self.start_flag() \| CHUNK_END;
555	Output {
556	input_chaining_value: self.cv,
557	block: self.buf,
558	block_len: self.buf_len,
559	counter: self.chunk_counter,
560	flags: block_flags,
561	platform: self.platform,
562	}
563	}
564	}
565
566	// Don't derive(Debug), because the state may be secret.
567	impl fmt::Debug for ChunkState {
568	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
569	f&mut DebugStruct<'_, '_>.debug_struct("ChunkState")
570	.field("len", &self.len())
571	.field("chunk_counter", &self.chunk_counter)
572	.field("flags", &self.flags)
573	.field(name:"platform", &self.platform)
574	.finish()
575	}
576	}
577
#[cfg(feature = "zeroize")]
impl Zeroize for ChunkState {
    fn zeroize(&mut self) {
        // Exhaustive destructuring: adding a field to `ChunkState` breaks the
        // build here, as a reminder to decide whether it must be wiped too.
        let Self {
            cv: chaining_value,
            chunk_counter: ctr,
            buf: pending,
            buf_len: pending_len,
            blocks_compressed: block_count,
            flags: flag_bits,
            platform: _,
        } = self;

        chaining_value.zeroize();
        ctr.zeroize();
        pending.zeroize();
        pending_len.zeroize();
        block_count.zeroize();
        flag_bits.zeroize();
    }
}
600
// IMPLEMENTATION NOTE
// ===================
// The recursive function compress_subtree_wide(), implemented below, is the
// basis of high-performance BLAKE3. We use it both for all-at-once hashing,
// and for the incremental input with Hasher (though we have to be careful with
// subtree boundaries in the incremental case). compress_subtree_wide() applies
// several optimizations at the same time:
// - Multithreading with Rayon.
// - Parallel chunk hashing with SIMD.
// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing
//   maxes out at MAX_SIMD_DEGREE * CHUNK_LEN, parallel parent hashing continues
//   to benefit from larger inputs, because more levels of the tree can use
//   full-width SIMD vectors for parent hashing. Without parallel parent
//   hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
615
/// Undocumented and unstable, for benchmarks only.
#[doc(hidden)]
#[derive(Clone, Copy)]
pub enum IncrementCounter {
    Yes,
    No,
}

impl IncrementCounter {
    // Convert to a plain bool: true for `Yes`, false for `No`. The match is
    // kept exhaustive so that adding a variant forces a decision here.
    #[inline]
    fn yes(&self) -> bool {
        match self {
            IncrementCounter::Yes => true,
            IncrementCounter::No => false,
        }
    }
}
633
// The largest power of two less than or equal to `n`, used for left_len()
// immediately below, and also directly in Hasher::update().
//
// Halving first and then rounding up to a power of two cannot overflow, even
// for usize::MAX. Note that n = 0 yields 1, since there is no power of two
// less than or equal to zero.
fn largest_power_of_two_leq(n: usize) -> usize {
    ((n / 2) + 1).next_power_of_two()
}
639
640	// Given some input larger than one chunk, return the number of bytes that
641	// should go in the left subtree. This is the largest power-of-2 number of
642	// chunks that leaves at least 1 byte for the right subtree.
643	fn left_len(content_len: usize) -> usize {
644	debug_assert!(content_len > CHUNK_LEN);
645	// Subtract 1 to reserve at least one byte for the right side.
646	let full_chunks: usize = (content_len - `1`) / CHUNK_LEN;
647	largest_power_of_two_leq(full_chunks) * CHUNK_LEN
648	}
649
650	// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
651	// on a single thread. Write out the chunk chaining values and return the
652	// number of chunks hashed. These chunks are never the root and never empty;
653	// those cases use a different codepath.
654	fn compress_chunks_parallel(
655	input: &[u8],
656	key: &CVWords,
657	chunk_counter: u64,
658	flags: u8,
659	platform: Platform,
660	out: &mut [u8],
661	) -> usize {
662	debug_assert!(!input.is_empty(), "empty chunks below the root");
663	debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
664
665	let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
666	let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new();
667	for chunk in &mut chunks_exact {
668	chunks_array.push(array_ref!(chunk, `0`, CHUNK_LEN));
669	}
670	platform.hash_many(
671	&chunks_array,
672	key,
673	chunk_counter,
674	IncrementCounter::Yes,
675	flags,
676	CHUNK_START,
677	CHUNK_END,
678	out,
679	);
680
681	// Hash the remaining partial chunk, if there is one. Note that the empty
682	// chunk (meaning the empty message) is a different codepath.
683	let chunks_so_far = chunks_array.len();
684	if !chunks_exact.remainder().is_empty() {
685	let counter = chunk_counter + chunks_so_far as u64;
686	let mut chunk_state = ChunkState::new(key, counter, flags, platform);
687	chunk_state.update(chunks_exact.remainder());
688	array_mut_ref!(out, chunks_so_far OUT_LEN, OUT_LEN) =
689	chunk_state.output().chaining_value();
690	chunks_so_far + `1`
691	} else {
692	chunks_so_far
693	}
694	}
695
696	// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
697	// on a single thread. Write out the parent chaining values and return the
698	// number of parents hashed. (If there's an odd input chaining value left over,
699	// return it as an additional output.) These parents are never the root and
700	// never empty; those cases use a different codepath.
701	fn compress_parents_parallel(
702	child_chaining_values: &[u8],
703	key: &CVWords,
704	flags: u8,
705	platform: Platform,
706	out: &mut [u8],
707	) -> usize {
708	debug_assert_eq!(child_chaining_values.len() % OUT_LEN, `0`, "wacky hash bytes");
709	let num_children = child_chaining_values.len() / OUT_LEN;
710	debug_assert!(num_children >= `2`, "not enough children");
711	debug_assert!(num_children <= `2` * MAX_SIMD_DEGREE_OR_2, "too many");
712
713	let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN);
714	// Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of
715	// the requirements of compress_subtree_wide().
716	let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new();
717	for parent in &mut parents_exact {
718	parents_array.push(array_ref!(parent, `0`, BLOCK_LEN));
719	}
720	platform.hash_many(
721	&parents_array,
722	key,
723	`0`, // Parents always use counter 0.
724	IncrementCounter::No,
725	flags \| PARENT,
726	`0`, // Parents have no start flags.
727	`0`, // Parents have no end flags.
728	out,
729	);
730
731	// If there's an odd child left over, it becomes an output.
732	let parents_so_far = parents_array.len();
733	if !parents_exact.remainder().is_empty() {
734	out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder());
735	parents_so_far + `1`
736	} else {
737	parents_so_far
738	}
739	}
740
741	// The wide helper function returns (writes out) an array of chaining values
742	// and returns the length of that array. The number of chaining values returned
743	// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
744	// if the input is shorter than that many chunks. The reason for maintaining a
745	// wide array of chaining values going back up the tree, is to allow the
746	// implementation to hash as many parents in parallel as possible.
747	//
748	// As a special case when the SIMD degree is 1, this function will still return
749	// at least 2 outputs. This guarantees that this function doesn't perform the
750	// root compression. (If it did, it would use the wrong flags, and also we
751	// wouldn't be able to implement extendable output.) Note that this function is
752	// not used when the whole input is only 1 chunk long; that's a different
753	// codepath.
754	//
755	// Why not just have the caller split the input on the first update(), instead
756	// of implementing this special rule? Because we don't want to limit SIMD or
757	// multithreading parallelism for that update().
758	fn compress_subtree_wide<J: join::Join>(
759	input: &[u8],
760	key: &CVWords,
761	chunk_counter: u64,
762	flags: u8,
763	platform: Platform,
764	out: &mut [u8],
765	) -> usize {
766	// Note that the single chunk case does not* bump the SIMD degree up to 2*
767	// when it is 1. This allows Rayon the option of multithreading even the
768	// 2-chunk case, which can help performance on smaller platforms.
769	if input.len() <= platform.simd_degree() * CHUNK_LEN {
770	return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
771	}
772
773	// With more than simd_degree chunks, we need to recurse. Start by dividing
774	// the input into left and right subtrees. (Note that this is only optimal
775	// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
776	// of 3 or something, we'll need a more complicated strategy.)
777	debug_assert_eq!(platform.simd_degree().count_ones(), `1`, "power of 2");
778	let (left, right) = input.split_at(left_len(input.len()));
779	let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
780
781	// Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
782	// account for the special case of returning 2 outputs when the SIMD degree
783	// is 1.
784	let mut cv_array = [`0`; `2` * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
785	let degree = if left.len() == CHUNK_LEN {
786	// The "simd_degree=1 and we're at the leaf nodes" case.
787	debug_assert_eq!(platform.simd_degree(), `1`);
788	`1`
789	} else {
790	cmp::max(platform.simd_degree(), `2`)
791	};
792	let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN);
793
794	// Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple
795	// threads.
796	let (left_n, right_n) = J::join(
797	\|\| compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out),
798	\|\| compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out),
799	);
800
801	// The special case again. If simd_degree=1, then we'll have left_n=1 and
802	// right_n=1. Rather than compressing them into a single output, return
803	// them directly, to make sure we always have at least two outputs.
804	debug_assert_eq!(left_n, degree);
805	debug_assert!(right_n >= `1` && right_n <= left_n);
806	if left_n == `1` {
807	out[..`2` * OUT_LEN].copy_from_slice(&cv_array[..`2` * OUT_LEN]);
808	return `2`;
809	}
810
811	// Otherwise, do one layer of parent node compression.
812	let num_children = left_n + right_n;
813	compress_parents_parallel(
814	&cv_array[..num_children * OUT_LEN],
815	key,
816	flags,
817	platform,
818	out,
819	)
820	}
821
822	// Hash a subtree with compress_subtree_wide(), and then condense the resulting
823	// list of chaining values down to a single parent node. Don't compress that
824	// last parent node, however. Instead, return its message bytes (the
825	// concatenated chaining values of its children). This is necessary when the
826	// first call to update() supplies a complete subtree, because the topmost
827	// parent node of that subtree could end up being the root. It's also necessary
828	// for extended output in the general case.
829	//
830	// As with compress_subtree_wide(), this function is not used on inputs of 1
831	// chunk or less. That's a different codepath.
832	fn compress_subtree_to_parent_node<J: join::Join>(
833	input: &[u8],
834	key: &CVWords,
835	chunk_counter: u64,
836	flags: u8,
837	platform: Platform,
838	) -> [u8; BLOCK_LEN] {
839	debug_assert!(input.len() > CHUNK_LEN);
840	let mut cv_array: [u8; 512] = [`0`; MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
841	let mut num_cvs: usize =
842	compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array);
843	debug_assert!(num_cvs >= `2`);
844
845	// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
846	// compress_subtree_wide() returns more than 2 chaining values. Condense
847	// them into 2 by forming parent nodes repeatedly.
848	let mut out_array: [u8; 256] = [`0`; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / `2`];
849	while num_cvs > `2` {
850	let cv_slice: &[u8] = &cv_array[..num_cvs * OUT_LEN];
851	num_cvs = compress_parents_parallel(child_chaining_values:cv_slice, key, flags, platform, &mut out_array);
852	cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]);
853	}
854	array_ref!(cv_array, `0`, `2` OUT_LEN)
855	}
856
857	// Hash a complete input all at once. Unlike compress_subtree_wide() and
858	// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
859	fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
860	let platform: Platform = Platform::detect();
861
862	// If the whole subtree is one chunk, hash it directly with a ChunkState.
863	if input.len() <= CHUNK_LEN {
864	return ChunkState&mut ChunkState::new(key, chunk_counter:`0`, flags, platform)
865	.update(input)
866	.output();
867	}
868
869	// Otherwise construct an Output object from the parent node returned by
870	// compress_subtree_to_parent_node().
871	Output {
872	input_chaining_value: *key,
873	block: compress_subtree_to_parent_node::<J>(input, key, chunk_counter:`0`, flags, platform),
874	block_len: BLOCK_LEN as u8,
875	counter: `0`,
876	flags: flags \| PARENT,
877	platform,
878	}
879	}
880
881	/// The default hash function.
882	///
883	/// For an incremental version that accepts multiple writes, see [`Hasher::new`],
884	/// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent:
885	///
886	/// ```
887	/// let hash = blake3::hash(b"foo");
888	/// # let hash1 = hash;
889	///
890	/// let hash = blake3::Hasher::new().update(b"foo").finalize();
891	/// # let hash2 = hash;
892	/// # assert_eq!(hash1, hash2);
893	/// ```
894	///
895	/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and
896	/// [`OutputReader`].
897	///
898	/// This function is always single-threaded. For multithreading support, see
899	/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
900	pub fn hash(input: &[u8]) -> Hash {
901	hash_all_at_once::<join::SerialJoin>(input, IV, flags:`0`).root_hash()
902	}
903
904	/// The keyed hash function.
905	///
906	/// This is suitable for use as a message authentication code, for example to
907	/// replace an HMAC instance. In that use case, the constant-time equality
908	/// checking provided by [`Hash`](struct.Hash.html) is almost always a security
909	/// requirement, and callers need to be careful not to compare MACs as raw
910	/// bytes.
911	///
912	/// For an incremental version that accepts multiple writes, see [`Hasher::new_keyed`],
913	/// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent:
914	///
915	/// ```
916	/// # const KEY: &[u8; `32`] = &[`0`; `32`];
917	/// let mac = blake3::keyed_hash(KEY, b"foo");
918	/// # let mac1 = mac;
919	///
920	/// let mac = blake3::Hasher::new_keyed(KEY).update(b"foo").finalize();
921	/// # let mac2 = mac;
922	/// # assert_eq!(mac1, mac2);
923	/// ```
924	///
925	/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`].
926	///
927	/// This function is always single-threaded. For multithreading support, see
928	/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
929	pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
930	let key_words: [u32; 8] = platform::words_from_le_bytes_32(bytes:key);
931	hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
932	}
933
934	/// The key derivation function.
935	///
936	/// Given cryptographic key material of any length and a context string of any
937	/// length, this function outputs a 32-byte derived subkey. The context string
938	/// should be hardcoded, globally unique, and application-specific.* A good*
939	/// default format for such strings is `"[application] [commit timestamp]
940	/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`.
941	///
942	/// Key derivation is important when you want to use the same key in multiple
943	/// algorithms or use cases. Using the same key with different cryptographic
944	/// algorithms is generally forbidden, and deriving a separate subkey for each
945	/// use case protects you from bad interactions. Derived keys also mitigate the
946	/// damage from one part of your application accidentally leaking its key.
947	///
948	/// As a rare exception to that general rule, however, it is possible to use
949	/// `derive_key` itself with key material that you are already using with
950	/// another algorithm. You might need to do this if you're adding features to
951	/// an existing application, which does not yet use key derivation internally.
952	/// However, you still must not share key material with algorithms that forbid
953	/// key reuse entirely, like a one-time pad. For more on this, see sections 6.2
954	/// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
955	///
956	/// Note that BLAKE3 is not a password hash, and `derive_key` should never be
957	/// used with passwords.* Instead, use a dedicated password hash like*
958	/// [Argon2]. Password hashes are entirely different from generic hash
959	/// functions, with opposite design requirements.
960	///
961	/// For an incremental version that accepts multiple writes, see [`Hasher::new_derive_key`],
962	/// [`Hasher::update`], and [`Hasher::finalize`]. These two statements are equivalent:
963	///
964	/// ```
965	/// # const CONTEXT: &str = "example.com 2019-12-25 16:18:03 session tokens v1";
966	/// let key = blake3::derive_key(CONTEXT, b"key material, not a password");
967	/// # let key1 = key;
968	///
969	/// let key: [u8; `32`] = blake3::Hasher::new_derive_key(CONTEXT)
970	/// .update(b"key material, not a password")
971	/// .finalize()
972	/// .into();
973	/// # let key2 = key;
974	/// # assert_eq!(key1, key2);
975	/// ```
976	///
977	/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`].
978	///
979	/// This function is always single-threaded. For multithreading support, see
980	/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
981	///
982	/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
983	pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
984	let context_key: Hash =
985	hash_all_at_onceOutput::<join::SerialJoin>(input:context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
986	.root_hash();
987	let context_key_words: [u32; 8] = platform::words_from_le_bytes_32(context_key.as_bytes());
988	hash_all_at_onceHash::<join::SerialJoin>(input:key_material, &context_key_words, DERIVE_KEY_MATERIAL)
989	.root_hash()
990	.0
991	}
992
993	fn parent_node_output(
994	left_child: &CVBytes,
995	right_child: &CVBytes,
996	key: &CVWords,
997	flags: u8,
998	platform: Platform,
999	) -> Output {
1000	let mut block: [u8; 64] = [`0`; BLOCK_LEN];
1001	block[..`32`].copy_from_slice(src:left_child);
1002	block[`32`..].copy_from_slice(src:right_child);
1003	Output {
1004	input_chaining_value: *key,
1005	block,
1006	block_len: BLOCK_LEN as u8,
1007	counter: `0`,
1008	flags: flags \| PARENT,
1009	platform,
1010	}
1011	}
1012
1013	/// An incremental hash state that can accept any number of writes.
1014	///
1015	/// The `rayon` and `mmap` Cargo features enable additional methods on this
1016	/// type related to multithreading and memory-mapped IO.
1017	///
1018	/// When the `traits-preview` Cargo feature is enabled, this type implements
1019	/// several commonly used traits from the
1020	/// [`digest`](https://crates.io/crates/digest) crate. However, those
1021	/// traits aren't stable, and they're expected to change in incompatible ways
1022	/// before that crate reaches 1.0. For that reason, this crate makes no SemVer
1023	/// guarantees for this feature, and callers who use it should expect breaking
1024	/// changes between patch versions.
1025	///
1026	/// # Examples
1027	///
1028	/// ```
1029	/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
1030	/// // Hash an input incrementally.
1031	/// let mut hasher = blake3::Hasher::new();
1032	/// hasher.update(b"foo");
1033	/// hasher.update(b"bar");
1034	/// hasher.update(b"baz");
1035	/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz"));
1036	///
1037	/// // Extended output. OutputReader also implements Read and Seek.
1038	/// # #[cfg(feature = "std")] {
1039	/// let mut output = [`0`; `1000`];
1040	/// let mut output_reader = hasher.finalize_xof();
1041	/// output_reader.fill(&mut output);
1042	/// assert_eq!(&output[..`32`], blake3::hash(b"foobarbaz").as_bytes());
1043	/// # }
1044	/// # Ok(())
1045	/// # }
1046	/// ```
1047	#[derive(Clone)]
1048	pub struct Hasher {
1049	key: CVWords,
1050	chunk_state: ChunkState,
1051	// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
1052	// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
1053	// requires a 4th entry, rather than merging everything down to 1, because
1054	// we don't know whether more input is coming. This is different from how
1055	// the reference implementation does things.
1056	cv_stack: ArrayVec<CVBytes, { MAX_DEPTH + `1` }>,
1057	}
1058
1059	impl Hasher {
1060	fn new_internal(key: &CVWords, flags: u8) -> Self {
1061	Self {
1062	key: *key,
1063	chunk_state: ChunkState::new(key, `0`, flags, Platform::detect()),
1064	cv_stack: ArrayVec::new(),
1065	}
1066	}
1067
1068	/// Construct a new `Hasher` for the regular hash function.
1069	pub fn new() -> Self {
1070	Self::new_internal(IV, `0`)
1071	}
1072
1073	/// Construct a new `Hasher` for the keyed hash function. See
1074	/// [`keyed_hash`].
1075	///
1076	/// [`keyed_hash`]: fn.keyed_hash.html
1077	pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
1078	let key_words = platform::words_from_le_bytes_32(key);
1079	Self::new_internal(&key_words, KEYED_HASH)
1080	}
1081
1082	/// Construct a new `Hasher` for the key derivation function. See
1083	/// [`derive_key`]. The context string should be hardcoded, globally
1084	/// unique, and application-specific.
1085	///
1086	/// [`derive_key`]: fn.derive_key.html
1087	pub fn new_derive_key(context: &str) -> Self {
1088	let context_key =
1089	hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
1090	.root_hash();
1091	let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
1092	Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
1093	}
1094
1095	/// Reset the `Hasher` to its initial state.
1096	///
1097	/// This is functionally the same as overwriting the `Hasher` with a new
1098	/// one, using the same key or context string if any.
1099	pub fn reset(&mut self) -> &mut Self {
1100	self.chunk_state = ChunkState::new(
1101	&self.key,
1102	`0`,
1103	self.chunk_state.flags,
1104	self.chunk_state.platform,
1105	);
1106	self.cv_stack.clear();
1107	self
1108	}
1109
1110	// As described in push_cv() below, we do "lazy merging", delaying merges
1111	// until right before the next CV is about to be added. This is different
1112	// from the reference implementation. Another difference is that we aren't
1113	// always merging 1 chunk at a time. Instead, each CV might represent any
1114	// power-of-two number of chunks, as long as the smaller-above-larger stack
1115	// order is maintained. Instead of the "count the trailing 0-bits"
1116	// algorithm described in the spec, we use a "count the total number of
1117	// 1-bits" variant that doesn't require us to retain the subtree size of
1118	// the CV on top of the stack. The principle is the same: each CV that
1119	// should remain in the stack is represented by a 1-bit in the total number
1120	// of chunks (or bytes) so far.
1121	fn merge_cv_stack(&mut self, total_len: u64) {
1122	let post_merge_stack_len = total_len.count_ones() as usize;
1123	while self.cv_stack.len() > post_merge_stack_len {
1124	let right_child = self.cv_stack.pop().unwrap();
1125	let left_child = self.cv_stack.pop().unwrap();
1126	let parent_output = parent_node_output(
1127	&left_child,
1128	&right_child,
1129	&self.key,
1130	self.chunk_state.flags,
1131	self.chunk_state.platform,
1132	);
1133	self.cv_stack.push(parent_output.chaining_value());
1134	}
1135	}
1136
1137	// In reference_impl.rs, we merge the new CV with existing CVs from the
1138	// stack before pushing it. We can do that because we know more input is
1139	// coming, so we know none of the merges are root.
1140	//
1141	// This setting is different. We want to feed as much input as possible to
1142	// compress_subtree_wide(), without setting aside anything for the
1143	// chunk_state. If the user gives us 64 KiB, we want to parallelize over
1144	// all 64 KiB at once as a single subtree, if at all possible.
1145	//
1146	// This leads to two problems:
1147	// 1) This 64 KiB input might be the only call that ever gets made to
1148	// update. In this case, the root node of the 64 KiB subtree would be
1149	// the root node of the whole tree, and it would need to be ROOT
1150	// finalized. We can't compress it until we know.
1151	// 2) This 64 KiB input might complete a larger tree, whose root node is
1152	// similarly going to be the root of the whole tree. For example,
1153	// maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't
1154	// compress the node at the root of the 256 KiB subtree until we know
1155	// how to finalize it.
1156	//
1157	// The second problem is solved with "lazy merging". That is, when we're
1158	// about to add a CV to the stack, we don't merge it with anything first,
1159	// as the reference impl does. Instead we do merges using the previous* CV*
1160	// that was added, which is sitting on top of the stack, and we put the new
1161	// CV (unmerged) on top of the stack afterwards. This guarantees that we
1162	// never merge the root node until finalize().
1163	//
1164	// Solving the first problem requires an additional tool,
1165	// compress_subtree_to_parent_node(). That function always returns the top
1166	// two* chaining values of the subtree it's compressing. We then do lazy*
1167	// merging with each of them separately, so that the second CV will always
1168	// remain unmerged. (That also helps us support extendable output when
1169	// we're hashing an input all-at-once.)
1170	fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
1171	self.merge_cv_stack(chunk_counter);
1172	self.cv_stack.push(*new_cv);
1173	}
1174
1175	/// Add input bytes to the hash state. You can call this any number of times.
1176	///
1177	/// This method is always single-threaded. For multithreading support, see
1178	/// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature).
1179	///
1180	/// Note that the degree of SIMD parallelism that `update` can use is limited by the size of
1181	/// this input buffer. See [`update_reader`](#method.update_reader).
1182	pub fn update(&mut self, input: &[u8]) -> &mut Self {
1183	self.update_with_join::<join::SerialJoin>(input)
1184	}
1185
1186	fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
1187	// If we have some partial chunk bytes in the internal chunk_state, we
1188	// need to finish that chunk first.
1189	if self.chunk_state.len() > `0` {
1190	let want = CHUNK_LEN - self.chunk_state.len();
1191	let take = cmp::min(want, input.len());
1192	self.chunk_state.update(&input[..take]);
1193	input = &input[take..];
1194	if !input.is_empty() {
1195	// We've filled the current chunk, and there's more input
1196	// coming, so we know it's not the root and we can finalize it.
1197	// Then we'll proceed to hashing whole chunks below.
1198	debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
1199	let chunk_cv = self.chunk_state.output().chaining_value();
1200	self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
1201	self.chunk_state = ChunkState::new(
1202	&self.key,
1203	self.chunk_state.chunk_counter + `1`,
1204	self.chunk_state.flags,
1205	self.chunk_state.platform,
1206	);
1207	} else {
1208	return self;
1209	}
1210	}
1211
1212	// Now the chunk_state is clear, and we have more input. If there's
1213	// more than a single chunk (so, definitely not the root chunk), hash
1214	// the largest whole subtree we can, with the full benefits of SIMD and
1215	// multithreading parallelism. Two restrictions:
1216	// - The subtree has to be a power-of-2 number of chunks. Only subtrees
1217	// along the right edge can be incomplete, and we don't know where
1218	// the right edge is going to be until we get to finalize().
1219	// - The subtree must evenly divide the total number of chunks up until
1220	// this point (if total is not 0). If the current incomplete subtree
1221	// is only waiting for 1 more chunk, we can't hash a subtree of 4
1222	// chunks. We have to complete the current subtree first.
1223	// Because we might need to break up the input to form powers of 2, or
1224	// to evenly divide what we already have, this part runs in a loop.
1225	while input.len() > CHUNK_LEN {
1226	debug_assert_eq!(self.chunk_state.len(), `0`, "no partial chunk data");
1227	debug_assert_eq!(CHUNK_LEN.count_ones(), `1`, "power of 2 chunk len");
1228	let mut subtree_len = largest_power_of_two_leq(input.len());
1229	let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
1230	// Shrink the subtree_len until it evenly divides the count so far.
1231	// We know that subtree_len itself is a power of 2, so we can use a
1232	// bitmasking trick instead of an actual remainder operation. (Note
1233	// that if the caller consistently passes power-of-2 inputs of the
1234	// same size, as is hopefully typical, this loop condition will
1235	// always fail, and subtree_len will always be the full length of
1236	// the input.)
1237	//
1238	// An aside: We don't have to shrink subtree_len quite this much.
1239	// For example, if count_so_far is 1, we could pass 2 chunks to
1240	// compress_subtree_to_parent_node. Since we'll get 2 CVs back,
1241	// we'll still get the right answer in the end, and we might get to
1242	// use 2-way SIMD parallelism. The problem with this optimization,
1243	// is that it gets us stuck always hashing 2 chunks. The total
1244	// number of chunks will remain odd, and we'll never graduate to
1245	// higher degrees of parallelism. See
1246	// https://github.com/BLAKE3-team/BLAKE3/issues/69.
1247	while (subtree_len - `1`) as u64 & count_so_far != `0` {
1248	subtree_len /= `2`;
1249	}
1250	// The shrunken subtree_len might now be 1 chunk long. If so, hash
1251	// that one chunk by itself. Otherwise, compress the subtree into a
1252	// pair of CVs.
1253	let subtree_chunks = (subtree_len / CHUNK_LEN) as u64;
1254	if subtree_len <= CHUNK_LEN {
1255	debug_assert_eq!(subtree_len, CHUNK_LEN);
1256	self.push_cv(
1257	&ChunkState::new(
1258	&self.key,
1259	self.chunk_state.chunk_counter,
1260	self.chunk_state.flags,
1261	self.chunk_state.platform,
1262	)
1263	.update(&input[..subtree_len])
1264	.output()
1265	.chaining_value(),
1266	self.chunk_state.chunk_counter,
1267	);
1268	} else {
1269	// This is the high-performance happy path, though getting here
1270	// depends on the caller giving us a long enough input.
1271	let cv_pair = compress_subtree_to_parent_node::<J>(
1272	&input[..subtree_len],
1273	&self.key,
1274	self.chunk_state.chunk_counter,
1275	self.chunk_state.flags,
1276	self.chunk_state.platform,
1277	);
1278	let left_cv = array_ref!(cv_pair, `0`, `32`);
1279	let right_cv = array_ref!(cv_pair, `32`, `32`);
1280	// Push the two CVs we received into the CV stack in order. Because
1281	// the stack merges lazily, this guarantees we aren't merging the
1282	// root.
1283	self.push_cv(left_cv, self.chunk_state.chunk_counter);
1284	self.push_cv(
1285	right_cv,
1286	self.chunk_state.chunk_counter + (subtree_chunks / `2`),
1287	);
1288	}
1289	self.chunk_state.chunk_counter += subtree_chunks;
1290	input = &input[subtree_len..];
1291	}
1292
1293	// What remains is 1 chunk or less. Add it to the chunk state.
1294	debug_assert!(input.len() <= CHUNK_LEN);
1295	if !input.is_empty() {
1296	self.chunk_state.update(input);
1297	// Having added some input to the chunk_state, we know what's in
1298	// the CV stack won't become the root node, and we can do an extra
1299	// merge. This simplifies finalize().
1300	self.merge_cv_stack(self.chunk_state.chunk_counter);
1301	}
1302
1303	self
1304	}
1305
1306	fn final_output(&self) -> Output {
1307	// If the current chunk is the only chunk, that makes it the root node
1308	// also. Convert it directly into an Output. Otherwise, we need to
1309	// merge subtrees below.
1310	if self.cv_stack.is_empty() {
1311	debug_assert_eq!(self.chunk_state.chunk_counter, `0`);
1312	return self.chunk_state.output();
1313	}
1314
1315	// If there are any bytes in the ChunkState, finalize that chunk and
1316	// merge its CV with everything in the CV stack. In that case, the work
1317	// we did at the end of update() above guarantees that the stack
1318	// doesn't contain any unmerged subtrees that need to be merged first.
1319	// (This is important, because if there were two chunk hashes sitting
1320	// on top of the stack, they would need to merge with each other, and
1321	// merging a new chunk hash into them would be incorrect.)
1322	//
1323	// If there are no bytes in the ChunkState, we'll merge what's already
1324	// in the stack. In this case it's fine if there are unmerged chunks on
1325	// top, because we'll merge them with each other. Note that the case of
1326	// the empty chunk is taken care of above.
1327	let mut output: Output;
1328	let mut num_cvs_remaining = self.cv_stack.len();
1329	if self.chunk_state.len() > `0` {
1330	debug_assert_eq!(
1331	self.cv_stack.len(),
1332	self.chunk_state.chunk_counter.count_ones() as usize,
1333	"cv stack does not need a merge"
1334	);
1335	output = self.chunk_state.output();
1336	} else {
1337	debug_assert!(self.cv_stack.len() >= `2`);
1338	output = parent_node_output(
1339	&self.cv_stack[num_cvs_remaining - `2`],
1340	&self.cv_stack[num_cvs_remaining - `1`],
1341	&self.key,
1342	self.chunk_state.flags,
1343	self.chunk_state.platform,
1344	);
1345	num_cvs_remaining -= `2`;
1346	}
1347	while num_cvs_remaining > `0` {
1348	output = parent_node_output(
1349	&self.cv_stack[num_cvs_remaining - `1`],
1350	&output.chaining_value(),
1351	&self.key,
1352	self.chunk_state.flags,
1353	self.chunk_state.platform,
1354	);
1355	num_cvs_remaining -= `1`;
1356	}
1357	output
1358	}
1359
1360	/// Finalize the hash state and return the [`Hash`](struct.Hash.html) of
1361	/// the input.
1362	///
1363	/// This method is idempotent. Calling it twice will give the same result.
1364	/// You can also add more input and finalize again.
1365	pub fn finalize(&self) -> Hash {
1366	self.final_output().root_hash()
1367	}
1368
1369	/// Finalize the hash state and return an [`OutputReader`], which can
1370	/// supply any number of output bytes.
1371	///
1372	/// This method is idempotent. Calling it twice will give the same result.
1373	/// You can also add more input and finalize again.
1374	///
1375	/// [`OutputReader`]: struct.OutputReader.html
1376	pub fn finalize_xof(&self) -> OutputReader {
1377	OutputReader::new(self.final_output())
1378	}
1379
1380	/// Return the total number of bytes hashed so far.
1381	pub fn count(&self) -> u64 {
1382	self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
1383	}
1384
1385	/// As [`update`](Hasher::update), but reading from a
1386	/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
1387	///
1388	/// [`Hasher`] implements
1389	/// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
1390	/// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
1391	/// from any reader. Unfortunately, this standard approach can limit performance, because
1392	/// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of
1393	/// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512)
1394	/// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly
1395	/// more convenient.
1396	///
1397	/// The internal buffer size this method uses may change at any time, and it may be different
1398	/// for different targets. The only guarantee is that it will be large enough for all of this
1399	/// crate's SIMD implementations on the current platform.
1400	///
1401	/// The most common implementer of
1402	/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
1403	/// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
1404	/// mapping can be faster than this method for hashing large files. See
1405	/// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
1406	/// which require the `mmap` and (for the latter) `rayon` Cargo features.
1407	///
1408	/// This method requires the `std` Cargo feature, which is enabled by default.
1409	///
1410	/// # Example
1411	///
1412	/// ```no_run
1413	/// # use std::fs::File;
1414	/// # use std::io;
1415	/// # fn main() -> io::Result<()> {
1416	/// // Hash standard input.
1417	/// let mut hasher = blake3::Hasher::new();
1418	/// hasher.update_reader(std::io::stdin().lock())?;
1419	/// println!("{}", hasher.finalize());
1420	/// # Ok(())
1421	/// # }
1422	/// ```
1423	#[cfg(feature = "std")]
1424	pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> {
1425	io::copy_wide(reader, self)?;
1426	Ok(self)
1427	}
1428
1429	/// As [`update`](Hasher::update), but using Rayon-based multithreading
1430	/// internally.
1431	///
1432	/// This method is gated by the `rayon` Cargo feature, which is disabled by
1433	/// default but enabled on [docs.rs](https://docs.rs).
1434	///
1435	/// To get any performance benefit from multithreading, the input buffer
1436	/// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
1437	/// _slower_ than `update` for inputs under 128 KiB. That threshold varies
1438	/// quite a lot across different processors, and it's important to benchmark
1439	/// your specific use case. See also the performance warning associated with
1440	/// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
1441	///
1442	/// If you already have a large buffer in memory, and you want to hash it
1443	/// with multiple threads, this method is a good option. However, reading a
1444	/// file into memory just to call this method can be a performance mistake,
1445	/// both because it requires lots of memory and because single-threaded
1446	/// reads can be slow. For hashing whole files, see
1447	/// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both
1448	/// the `rayon` and `mmap` Cargo features.
1449	#[cfg(feature = "rayon")]
1450	pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
1451	self.update_with_join::<join::RayonJoin>(input)
1452	}
1453
1454	/// As [`update`](Hasher::update), but reading the contents of a file using memory mapping.
1455	///
1456	/// Not all files can be memory mapped, and memory mapping small files can be slower than
1457	/// reading them the usual way. In those cases, this method will fall back to standard file IO.
1458	/// The heuristic for whether to use memory mapping is currently very simple (file size >=
1459	/// 16 KiB), and it might change at any time.
1460	///
1461	/// Like [`update`](Hasher::update), this method is single-threaded. In this author's
1462	/// experience, memory mapping improves single-threaded performance by ~10% for large files
1463	/// that are already in cache. This probably varies between platforms, and as always it's a
1464	/// good idea to benchmark your own use case. In comparison, the multithreaded
1465	/// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on
1466	/// performance.
1467	///
1468	/// There's a correctness reason that this method takes
1469	/// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of
1470	/// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped
1471	/// file ignores the seek position of the original file handle (it neither respects the current
1472	/// position nor updates the position). This difference in behavior would've caused
1473	/// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and
1474	/// have different side effects in some cases. Taking a
1475	/// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by
1476	/// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is
1477	/// opened internally.
1478	///
1479	/// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on
1480	/// [docs.rs](https://docs.rs).
1481	///
1482	/// # Example
1483	///
1484	/// ```no_run
1485	/// # use std::io;
1486	/// # use std::path::Path;
1487	/// # fn main() -> io::Result<()> {
1488	/// let path = Path::new("file.dat");
1489	/// let mut hasher = blake3::Hasher::new();
1490	/// hasher.update_mmap(path)?;
1491	/// println!("{}", hasher.finalize());
1492	/// # Ok(())
1493	/// # }
1494	/// ```
1495	#[cfg(feature = "mmap")]
1496	pub fn update_mmap(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<&mut Self> {
1497	let file = std::fs::File::open(path.as_ref())?;
1498	if let Some(mmap) = io::maybe_mmap_file(&file)? {
1499	self.update(&mmap);
1500	} else {
1501	io::copy_wide(&file, self)?;
1502	}
1503	Ok(self)
1504	}
1505
1506	/// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using
1507	/// memory mapping. This is the default behavior of `b3sum`.
1508	///
1509	/// For large files that are likely to be in cache, this can be much faster than
1510	/// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other
1511	/// cryptographic hashes, this is usually what they're measuring. However...
1512	///
1513	/// Performance Warning:* There are cases where multithreading hurts performance. The worst*
1514	/// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31),
1515	/// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends
1516	/// more time seeking around than reading data). Windows tends to be somewhat worse about this,
1517	/// in part because it's less likely than Linux to keep very large files in cache. More
1518	/// generally, if your CPU cores are already busy, then multithreading will add overhead
1519	/// without improving performance. If your code runs in different environments that you don't
1520	/// control and can't measure, then unfortunately there's no one-size-fits-all answer for
1521	/// whether multithreading is a good idea.
1522	///
1523	/// The memory mapping behavior of this function is the same as
1524	/// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard
1525	/// file IO might change at any time.
1526	///
1527	/// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by
1528	/// default but enabled on [docs.rs](https://docs.rs).
1529	///
1530	/// # Example
1531	///
1532	/// ```no_run
1533	/// # use std::io;
1534	/// # use std::path::Path;
1535	/// # fn main() -> io::Result<()> {
1536	/// # #[cfg(feature = "rayon")]
1537	/// # {
1538	/// let path = Path::new("big_file.dat");
1539	/// let mut hasher = blake3::Hasher::new();
1540	/// hasher.update_mmap_rayon(path)?;
1541	/// println!("{}", hasher.finalize());
1542	/// # }
1543	/// # Ok(())
1544	/// # }
1545	/// ```
1546	#[cfg(feature = "mmap")]
1547	#[cfg(feature = "rayon")]
1548	pub fn update_mmap_rayon(
1549	&mut self,
1550	path: impl AsRef<std::path::Path>,
1551	) -> std::io::Result<&mut Self> {
1552	let file = std::fs::File::open(path.as_ref())?;
1553	if let Some(mmap) = io::maybe_mmap_file(&file)? {
1554	self.update_rayon(&mmap);
1555	} else {
1556	io::copy_wide(&file, self)?;
1557	}
1558	Ok(self)
1559	}
1560	}
1561
1562	// Don't derive(Debug), because the state may be secret.
1563	impl fmt::Debug for Hasher {
1564	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1565	f&mut DebugStruct<'_, '_>.debug_struct("Hasher")
1566	.field("flags", &self.chunk_state.flags)
1567	.field(name:"platform", &self.chunk_state.platform)
1568	.finish()
1569	}
1570	}
1571
1572	impl Default for Hasher {
1573	#[inline]
1574	fn default() -> Self {
1575	Self::new()
1576	}
1577	}
1578
#[cfg(feature = "std")]
impl std::io::Write for Hasher {
    /// This is equivalent to [`update`](#method.update). The whole buffer is
    /// always consumed, so the returned count is `input.len()`.
    #[inline]
    fn write(&mut self, input: &[u8]) -> std::io::Result<usize> {
        let consumed = input.len();
        self.update(input);
        Ok(consumed)
    }

    /// Flushing does nothing and always succeeds.
    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}
1593
#[cfg(feature = "zeroize")]
impl Zeroize for Hasher {
    fn zeroize(&mut self) {
        // The pattern below names every field and has no `..` on purpose: if a
        // field is ever added to `Hasher`, this stops compiling, which is the
        // reminder to zeroize the new field here too.
        let Self {
            key: key_words,
            chunk_state: partial_chunk,
            cv_stack: subtree_cvs,
        } = self;

        key_words.zeroize();
        partial_chunk.zeroize();
        subtree_cvs.zeroize();
    }
}
1609
/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///
/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is
/// equivalent to truncating the default-length output. Note that this is a difference between
/// BLAKE2 and BLAKE3.
///
/// # Security notes
///
/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit
/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2
/// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional
/// security.
///
/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or
/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or
/// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree
/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows
/// both the message and the key (if any) can easily determine the offset of an extended output.
/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block
/// from an unknown position in the output stream to recover its block index. Callers with strong
/// secret keys aren't affected in practice, but secret offsets are a [design
/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
#[derive(Clone)]
pub struct OutputReader {
    // The root Output that output blocks are generated from (see the
    // root_output_block() call in fill_one_block()).
    inner: Output,
    // Starts at 0 in new(). NOTE(review): presumably the byte offset into the
    // current BLOCK_LEN-sized output block -- confirm against fill_one_block().
    position_within_block: u8,
}
1638
1639	impl OutputReader {
1640	fn new(inner: Output) -> Self {
1641	Self {
1642	inner,
1643	position_within_block: `0`,
1644	}
1645	}
1646
1647	// This helper function handles both the case where the output buffer is
1648	// shorter than one block, and the case where our position_within_block is
1649	// non-zero.
1650	fn fill_one_block(&mut self, buf: &mut &mut [u8]) {
1651	let output_block: [u8; BLOCK_LEN] = self.inner.root_output_block();
1652	let output_bytes = &output_block[self.position_within_block as usize..];
1653	let take = cmp::min(buf.len(), output_bytes.len());
1654	buf[..take].copy_from_slice(&output_bytes[..take]);
1655	self.position_within_block += take as u8;
1656	if self.position_within_block == BLOCK_LEN as u8 {
1657	self.inner.counter += `1`;
1658	self.position_within_block = `0`;
1659	}
1660	// Advance the dest buffer. mem::take() is a borrowck workaround.
1661	buf = &mut* core::mem::take(buf)[take..];
1662	}
1663
1664	/// Fill a buffer with output bytes and advance the position of the
1665	/// `OutputReader`. This is equivalent to [`Read::read`], except that it
1666	/// doesn't return a `Result`. Both methods always fill the entire buffer.
1667	///
1668	/// Note that `OutputReader` doesn't buffer output bytes internally, so
1669	/// calling `fill` repeatedly with a short-length or odd-length slice will
1670	/// end up performing the same compression multiple times. If you're
1671	/// reading output in a loop, prefer a slice length that's a multiple of
1672	/// 64.
1673	///
1674	/// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
1675	/// to extract more than that, for example by seeking near the end and
1676	/// reading further, the behavior is unspecified.
1677	///
1678	/// [`Read::read`]: #method.read
1679	pub fn fill(&mut self, mut buf: &mut [u8]) {
1680	if buf.is_empty() {
1681	return;
1682	}
1683
1684	// If we're partway through a block, try to get to a block boundary.
1685	if self.position_within_block != `0` {
1686	self.fill_one_block(&mut buf);
1687	}
1688
1689	let full_blocks = buf.len() / BLOCK_LEN;
1690	let full_blocks_len = full_blocks * BLOCK_LEN;
1691	if full_blocks > `0` {
1692	debug_assert_eq!(`0`, self.position_within_block);
1693	self.inner.platform.xof_many(
1694	&self.inner.input_chaining_value,
1695	&self.inner.block,
1696	self.inner.block_len,
1697	self.inner.counter,
1698	self.inner.flags \| ROOT,
1699	&mut buf[..full_blocks_len],
1700	);
1701	self.inner.counter += full_blocks as u64;
1702	buf = &mut buf[full_blocks * BLOCK_LEN..];
1703	}
1704
1705	if !buf.is_empty() {
1706	debug_assert!(buf.len() < BLOCK_LEN);
1707	self.fill_one_block(&mut buf);
1708	debug_assert!(buf.is_empty());
1709	}
1710	}
1711
1712	/// Return the current read position in the output stream. This is
1713	/// equivalent to [`Seek::stream_position`], except that it doesn't return
1714	/// a `Result`. The position of a new `OutputReader` starts at 0, and each
1715	/// call to [`fill`] or [`Read::read`] moves the position forward by the
1716	/// number of bytes read.
1717	///
1718	/// [`Seek::stream_position`]: #method.stream_position
1719	/// [`fill`]: #method.fill
1720	/// [`Read::read`]: #method.read
1721	pub fn position(&self) -> u64 {
1722	self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
1723	}
1724
1725	/// Seek to a new read position in the output stream. This is equivalent to
1726	/// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
1727	/// return a `Result`.
1728	///
1729	/// [`Seek::seek`]: #method.seek
1730	/// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html
1731	pub fn set_position(&mut self, position: u64) {
1732	self.position_within_block = (position % BLOCK_LEN as u64) as u8;
1733	self.inner.counter = position / BLOCK_LEN as u64;
1734	}
1735	}
1736
1737	// Don't derive(Debug), because the state may be secret.
1738	impl fmt::Debug for OutputReader {
1739	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1740	f&mut DebugStruct<'_, '_>.debug_struct("OutputReader")
1741	.field(name:"position", &self.position())
1742	.finish()
1743	}
1744	}
1745
#[cfg(feature = "std")]
impl std::io::Read for OutputReader {
    /// Fills all of `buf` by calling `fill`, and reports the full buffer
    /// length as the number of bytes read.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let filled = buf.len();
        self.fill(buf);
        Ok(filled)
    }
}
1754
#[cfg(feature = "std")]
impl std::io::Seek for OutputReader {
    /// Move the read position and return the new position.
    ///
    /// `SeekFrom::Start` and `SeekFrom::Current` are supported. A target
    /// beyond `u64::max_value()` is clamped to that value.
    ///
    /// # Errors
    ///
    /// Returns `ErrorKind::InvalidInput` for `SeekFrom::End`, and for any
    /// seek whose target position would be negative.
    fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
        // Work in i128 so that a u64 position plus an i64 offset cannot overflow.
        let max_position = u64::max_value() as i128;
        let target_position: i128 = match pos {
            std::io::SeekFrom::Start(x) => x as i128,
            std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128,
            std::io::SeekFrom::End(_) => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "seek from end not supported",
                ));
            }
        };
        if target_position < 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "seek before start",
            ));
        }
        self.set_position(cmp::min(target_position, max_position) as u64);
        Ok(self.position())
    }
}
1779
#[cfg(feature = "zeroize")]
impl Zeroize for OutputReader {
    /// Zeroizes both fields of the reader: `inner`, then
    /// `position_within_block`.
    fn zeroize(&mut self) {
        // The exhaustive pattern is intentional: adding a field to
        // `OutputReader` without listing it here stops this impl from compiling.
        let Self {
            inner: output,
            position_within_block: offset,
        } = self;

        output.zeroize();
        offset.zeroize();
    }
}
1793