1 | // Copyright 2019 The CryptoCorrosion Contributors |
2 | // Copyright 2020 Developers of the Rand project. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! The ChaCha random number generator. |
11 | |
12 | use ppv_lite86::{dispatch, dispatch_light128}; |
13 | |
14 | pub use ppv_lite86::Machine; |
15 | use ppv_lite86::{ |
16 | vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, Vector, |
17 | }; |
18 | |
/// Base-2 logarithm of the number of blocks produced by one wide refill.
const LOG2_BUFBLOCKS: u64 = 2;
/// Number of blocks produced by one wide refill.
const BUFBLOCKS: u64 = 1 << LOG2_BUFBLOCKS;

/// Number of 32-bit words in a single block.
pub(crate) const BLOCK: usize = 16;
/// [`BLOCK`] as a `u64`.
pub(crate) const BLOCK64: u64 = BLOCK as u64;
/// Number of 32-bit words in the output buffer of one wide refill, as a `u64`.
pub(crate) const BUFSZ64: u64 = BUFBLOCKS * BLOCK64;
/// Number of 32-bit words in the output buffer of one wide refill.
pub(crate) const BUFSZ: usize = BUFSZ64 as usize;

/// Stream-parameter selector for the 64-bit block position (words 0 and 1 of row `d`).
const STREAM_PARAM_BLOCK: u32 = 0;
/// Stream-parameter selector for the 64-bit nonce (words 2 and 3 of row `d`).
const STREAM_PARAM_NONCE: u32 = 1;
28 | |
/// Stored ChaCha state: rows `b`, `c` and `d` of the 4x4 word matrix.
///
/// The constant first row is not stored; the refill functions rebuild it on
/// every call.
#[derive (Clone, PartialEq, Eq)]
pub struct ChaCha {
    /// Second state row; `init_chacha` loads it from key bytes 0..16.
    pub(crate) b: vec128_storage,
    /// Third state row; `init_chacha` loads it from key bytes 16..32.
    pub(crate) c: vec128_storage,
    /// Fourth state row: words 0-1 hold the 64-bit block position (low word
    /// first) and words 2-3 hold the 64-bit nonce, as read and written by
    /// `get_stream_param` / `set_stream_param`.
    pub(crate) d: vec128_storage,
}
35 | |
/// Working state of the permutation: the four rows `a`, `b`, `c`, `d`.
///
/// `V` is the per-row vector type. In this file it is a single 128-bit vector
/// for the one-block path and a four-lane wide vector (one lane per block)
/// for the four-block path.
#[derive (Clone)]
pub struct State<V> {
    /// First row (initialised to the constant words by the refill functions).
    pub(crate) a: V,
    /// Second row.
    pub(crate) b: V,
    /// Third row.
    pub(crate) c: V,
    /// Fourth row (block position and nonce words).
    pub(crate) d: V,
}
43 | |
44 | #[inline (always)] |
45 | pub(crate) fn round<V: ArithOps + BitOps32>(mut x: State<V>) -> State<V> { |
46 | x.a += x.b; |
47 | x.d = (x.d ^ x.a).rotate_each_word_right16(); |
48 | x.c += x.d; |
49 | x.b = (x.b ^ x.c).rotate_each_word_right20(); |
50 | x.a += x.b; |
51 | x.d = (x.d ^ x.a).rotate_each_word_right24(); |
52 | x.c += x.d; |
53 | x.b = (x.b ^ x.c).rotate_each_word_right25(); |
54 | x |
55 | } |
56 | |
57 | #[inline (always)] |
58 | pub(crate) fn diagonalize<V: LaneWords4>(mut x: State<V>) -> State<V> { |
59 | x.b = x.b.shuffle_lane_words3012(); |
60 | x.c = x.c.shuffle_lane_words2301(); |
61 | x.d = x.d.shuffle_lane_words1230(); |
62 | x |
63 | } |
64 | #[inline (always)] |
65 | pub(crate) fn undiagonalize<V: LaneWords4>(mut x: State<V>) -> State<V> { |
66 | x.b = x.b.shuffle_lane_words1230(); |
67 | x.c = x.c.shuffle_lane_words2301(); |
68 | x.d = x.d.shuffle_lane_words3012(); |
69 | x |
70 | } |
71 | |
impl ChaCha {
    /// Creates a state from a 32-byte key and a nonce.
    ///
    /// The last 8 bytes of `nonce` become state words 14 and 15 (row `d`,
    /// words 2-3). When `nonce` is exactly 12 bytes long its first 4 bytes
    /// are additionally stored in word 1 of row `d`. The block position
    /// starts at zero in its low word.
    ///
    /// # Panics
    ///
    /// Panics if `nonce` is shorter than 8 bytes.
    #[inline (always)]
    pub fn new(key: &[u8; 32], nonce: &[u8]) -> Self {
        init_chacha(key, nonce)
    }

    /// Produce 4 blocks of output, advancing the state
    ///
    /// `drounds` is the number of double rounds to run; `out` receives the
    /// four consecutive blocks as `BUFSZ` 32-bit words, and the stored block
    /// position is advanced by 4.
    #[inline (always)]
    pub fn refill4(&mut self, drounds: u32, out: &mut [u32; BUFSZ]) {
        refill_wide(self, drounds, out)
    }

    /// Sets the 64-bit block position (words 0 and 1 of row `d`).
    #[inline (always)]
    pub fn set_block_pos(&mut self, value: u64) {
        set_stream_param(self, STREAM_PARAM_BLOCK, value)
    }

    /// Returns the 64-bit block position (words 0 and 1 of row `d`).
    #[inline (always)]
    pub fn get_block_pos(&self) -> u64 {
        get_stream_param(self, STREAM_PARAM_BLOCK)
    }

    /// Sets the 64-bit nonce (words 2 and 3 of row `d`).
    #[inline (always)]
    pub fn set_nonce(&mut self, value: u64) {
        set_stream_param(self, STREAM_PARAM_NONCE, value)
    }

    /// Returns the 64-bit nonce (words 2 and 3 of row `d`).
    #[inline (always)]
    pub fn get_nonce(&self) -> u64 {
        get_stream_param(self, STREAM_PARAM_NONCE)
    }

    /// Returns the 32 bytes held in rows `b` and `c`, serialised
    /// little-endian (the key as passed to [`ChaCha::new`]).
    #[inline (always)]
    pub fn get_seed(&self) -> [u8; 32] {
        get_seed(self)
    }
}
109 | |
// This implementation is platform-independent; it is only compiled on
// big-endian targets, because little-endian targets use the vector versions below.
/// Returns `d0` with its 64-bit position (word 1 = high half, word 0 = low
/// half) advanced by `i`, wrapping on overflow. Words 2 and 3 are unchanged.
#[inline(always)]
#[cfg(target_endian = "big")]
fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 {
    let low = u64::from(d0.extract(0));
    let high = u64::from(d0.extract(1));
    let advanced = ((high << 32) | low).wrapping_add(i);
    d0.insert((advanced >> 32) as u32, 1)
        .insert(advanced as u32, 0)
}
/// Builds the wide `d` row for four consecutive blocks: lane `n` is `d` with
/// its 64-bit position (words 0-1) advanced by `n`, wrapping on overflow.
#[inline(always)]
#[cfg(target_endian = "big")]
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
    let base: Mach::u32x4 = m.unpack(d);
    let start = (u64::from(base.extract(1)) << 32) | u64::from(base.extract(0));
    // Lane for the block `offset` positions after the stored one.
    let lane = |offset: u64| {
        let pos = start.wrapping_add(offset);
        base.insert((pos >> 32) as u32, 1).insert(pos as u32, 0)
    };
    Mach::u32x4x4::from_lanes([base, lane(1), lane(2), lane(3)])
}
131 | |
132 | // Pos is packed into the state vectors as a little-endian u64, |
133 | // so on LE platforms we can use native vector ops to increment it. |
134 | #[inline (always)] |
135 | #[cfg (target_endian = "little" )] |
136 | fn add_pos<Mach: Machine>(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 { |
137 | let d0: Mach::u64x2 = m.unpack(d.into()); |
138 | let incr: ::u64x2 = m.vec([i, 0]); |
139 | m.unpack((d0 + incr).into()) |
140 | } |
141 | #[inline (always)] |
142 | #[cfg (target_endian = "little" )] |
143 | fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { |
144 | let d0: Mach::u64x2 = m.unpack(d); |
145 | let incr: ::u64x2x4 = |
146 | Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]); |
147 | m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into()) |
148 | } |
149 | |
150 | #[allow (clippy::many_single_char_names)] |
151 | #[inline (always)] |
152 | fn refill_wide_impl<Mach: Machine>( |
153 | m: Mach, |
154 | state: &mut ChaCha, |
155 | drounds: u32, |
156 | out: &mut [u32; BUFSZ], |
157 | ) { |
158 | let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); |
159 | let b = m.unpack(state.b); |
160 | let c = m.unpack(state.c); |
161 | let mut x = State { |
162 | a: Mach::u32x4x4::from_lanes([k, k, k, k]), |
163 | b: Mach::u32x4x4::from_lanes([b, b, b, b]), |
164 | c: Mach::u32x4x4::from_lanes([c, c, c, c]), |
165 | d: d0123(m, state.d), |
166 | }; |
167 | for _ in 0..drounds { |
168 | x = round(x); |
169 | x = undiagonalize(round(diagonalize(x))); |
170 | } |
171 | let kk = Mach::u32x4x4::from_lanes([k, k, k, k]); |
172 | let sb = m.unpack(state.b); |
173 | let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]); |
174 | let sc = m.unpack(state.c); |
175 | let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]); |
176 | let sd = d0123(m, state.d); |
177 | let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd); |
178 | out[0..16].copy_from_slice(&results.0.to_scalars()); |
179 | out[16..32].copy_from_slice(&results.1.to_scalars()); |
180 | out[32..48].copy_from_slice(&results.2.to_scalars()); |
181 | out[48..64].copy_from_slice(&results.3.to_scalars()); |
182 | state.d = add_pos(m, sd.to_lanes()[0], 4).into(); |
183 | } |
184 | |
// Entry point for the four-block refill: forwards to `refill_wide_impl` with
// the machine token `m` supplied by `dispatch!`.
// NOTE(review): `dispatch!` comes from ppv_lite86 and presumably selects the
// vector backend for the running CPU; confirm against that crate's docs.
dispatch!(m, Mach, {
    fn refill_wide(state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ]) {
        refill_wide_impl(m, state, drounds, out);
    }
});
190 | |
191 | // Single-block, rounds-only; shared by try_apply_keystream for tails shorter than BUFSZ |
192 | // and XChaCha's setup step. |
193 | dispatch!(m, Mach, { |
194 | fn refill_narrow_rounds(state: &mut ChaCha, drounds: u32) -> State<vec128_storage> { |
195 | let k: Mach::u32x4 = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); |
196 | let mut x = State { |
197 | a: k, |
198 | b: m.unpack(state.b), |
199 | c: m.unpack(state.c), |
200 | d: m.unpack(state.d), |
201 | }; |
202 | for _ in 0..drounds { |
203 | x = round(x); |
204 | x = undiagonalize(round(diagonalize(x))); |
205 | } |
206 | State { |
207 | a: x.a.into(), |
208 | b: x.b.into(), |
209 | c: x.c.into(), |
210 | d: x.d.into(), |
211 | } |
212 | } |
213 | }); |
214 | |
215 | dispatch_light128!(m, Mach, { |
216 | fn set_stream_param(state: &mut ChaCha, param: u32, value: u64) { |
217 | let d: Mach::u32x4 = m.unpack(state.d); |
218 | state.d = d |
219 | .insert((value >> 32) as u32, (param << 1) | 1) |
220 | .insert(value as u32, param << 1) |
221 | .into(); |
222 | } |
223 | }); |
224 | |
225 | dispatch_light128!(m, Mach, { |
226 | fn get_stream_param(state: &ChaCha, param: u32) -> u64 { |
227 | let d: Mach::u32x4 = m.unpack(state.d); |
228 | ((d.extract((param << 1) | 1) as u64) << 32) | d.extract(param << 1) as u64 |
229 | } |
230 | }); |
231 | |
232 | dispatch_light128!(m, Mach, { |
233 | fn get_seed(state: &ChaCha) -> [u8; 32] { |
234 | let b: Mach::u32x4 = m.unpack(state.b); |
235 | let c: Mach::u32x4 = m.unpack(state.c); |
236 | let mut key = [0u8; 32]; |
237 | b.write_le(&mut key[..16]); |
238 | c.write_le(&mut key[16..]); |
239 | key |
240 | } |
241 | }); |
242 | |
/// Decodes exactly four bytes as a little-endian `u32`.
///
/// # Panics
///
/// Panics if `xs` is not exactly 4 bytes long.
fn read_u32le(xs: &[u8]) -> u32 {
    assert_eq!(xs.len(), 4);
    u32::from_le_bytes([xs[0], xs[1], xs[2], xs[3]])
}
247 | |
248 | dispatch_light128!(m, Mach, { |
249 | fn init_chacha(key: &[u8; 32], nonce: &[u8]) -> ChaCha { |
250 | let ctr_nonce = [ |
251 | 0, |
252 | if nonce.len() == 12 { |
253 | read_u32le(&nonce[0..4]) |
254 | } else { |
255 | 0 |
256 | }, |
257 | read_u32le(&nonce[nonce.len() - 8..nonce.len() - 4]), |
258 | read_u32le(&nonce[nonce.len() - 4..]), |
259 | ]; |
260 | let key0: Mach::u32x4 = m.read_le(&key[..16]); |
261 | let key1: Mach::u32x4 = m.read_le(&key[16..]); |
262 | ChaCha { |
263 | b: key0.into(), |
264 | c: key1.into(), |
265 | d: ctr_nonce.into(), |
266 | } |
267 | } |
268 | }); |
269 | |
270 | dispatch_light128!(m, Mach, { |
271 | fn init_chacha_x(key: &[u8; 32], nonce: &[u8; 24], rounds: u32) -> ChaCha { |
272 | let key0: Mach::u32x4 = m.read_le(&key[..16]); |
273 | let key1: Mach::u32x4 = m.read_le(&key[16..]); |
274 | let nonce0: Mach::u32x4 = m.read_le(&nonce[..16]); |
275 | let mut state = ChaCha { |
276 | b: key0.into(), |
277 | c: key1.into(), |
278 | d: nonce0.into(), |
279 | }; |
280 | let x = refill_narrow_rounds(&mut state, rounds); |
281 | let ctr_nonce1 = [0, 0, read_u32le(&nonce[16..20]), read_u32le(&nonce[20..24])]; |
282 | state.b = x.a; |
283 | state.c = x.d; |
284 | state.d = ctr_nonce1.into(); |
285 | state |
286 | } |
287 | }); |
288 | |