// Copyright 2014 The Servo Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9 | |
10 | use crate::dynamic_set::{dynamic_set, Entry}; |
11 | use crate::static_sets::StaticAtomSet; |
12 | use debug_unreachable::debug_unreachable; |
13 | |
14 | use std::borrow::Cow; |
15 | use std::cmp::Ordering::{self, Equal}; |
16 | use std::fmt; |
17 | use std::hash::{Hash, Hasher}; |
18 | use std::marker::PhantomData; |
19 | use std::mem; |
20 | use std::num::NonZeroU64; |
21 | use std::ops; |
22 | use std::slice; |
23 | use std::str; |
24 | use std::sync::atomic::Ordering::SeqCst; |
25 | |
// Tag values stored in the low two bits of `Atom::unsafe_data`, selecting one
// of the three atom representations.
const DYNAMIC_TAG: u8 = 0b_00;
const INLINE_TAG: u8 = 0b_01; // len in upper nybble
const STATIC_TAG: u8 = 0b_10;
const TAG_MASK: u64 = 0b_11;
// For inline atoms, the string length is stored in bits 4..8 of the low byte.
const LEN_OFFSET: u64 = 4;
const LEN_MASK: u64 = 0xF0;

// An inline atom stores its bytes in the 7 bytes above the tag/length byte.
const MAX_INLINE_LEN: usize = 7;
// For static atoms, the index into the static set lives in the upper 32 bits.
const STATIC_SHIFT_BITS: usize = 32;
35 | |
/// Represents a string that has been interned.
///
/// While the type definition for `Atom` indicates that it is generic on a particular
/// implementation of an atom set, you don't need to worry about this. Atoms can be static
/// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they
/// can be dynamic and created by you on an `EmptyStaticAtomSet`.
///
/// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted;
/// this means that you may need to `.clone()` an atom to keep copies to it in different
/// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`.
///
/// ## Creating an atom at runtime
///
/// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code
/// may then do something like read data from somewhere and extract tokens that need to be
/// compared to the atoms. In this case, you can use `Atom::from(&str)` or
/// `Atom::from(String)`. These create a reference-counted atom which will be
/// automatically freed when all references to it are dropped.
///
/// This means that your application can safely have a loop which tokenizes data, creates
/// atoms from the tokens, and compares the atoms to a predefined set of keywords, without
/// running the risk of arbitrary memory consumption from creating large numbers of atoms —
/// as long as your application does not store clones of the atoms it creates along the
/// way.
///
/// For example, the following is safe and will not consume arbitrary amounts of memory:
///
/// ```ignore
/// let untrusted_data = "large amounts of text ...";
///
/// for token in untrusted_data.split_whitespace() {
///     let atom = Atom::from(token); // interns the string
///
///     if atom == Atom::from("keyword") {
///         // handle that keyword
///     } else if atom == Atom::from("another_keyword") {
///         // handle that keyword
///     } else {
///         println!("unknown keyword");
///     }
/// } // atom is dropped here, so it is not kept around in memory
/// ```
#[derive(PartialEq, Eq)]
// NOTE: Deriving PartialEq requires that a given string must always be interned the same way.
pub struct Atom<Static> {
    // Tagged representation: the low two bits select dynamic / inline / static
    // (see the *_TAG constants); the remaining bits hold an `Entry` pointer,
    // inline bytes plus length, or a static-set index, respectively.
    // Never zero, so `Option<Atom>` costs no extra space.
    unsafe_data: NonZeroU64,
    // Ties the atom to its static set without storing any data for it.
    phantom: PhantomData<Static>,
}
84 | |
// FIXME: bound removed from the struct definition because of this error for pack_static:
// "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable"
// https://github.com/rust-lang/rust/issues/57563
impl<Static> Atom<Static> {
    /// For the atom!() macros
    ///
    /// Packs the index `n` of a string in the static set into an atom:
    /// index in the upper 32 bits, `STATIC_TAG` in the low bits.
    #[inline(always)]
    #[doc(hidden)]
    pub const fn pack_static(n: u32) -> Self {
        Self {
            unsafe_data: unsafe {
                // STATIC_TAG ensures this is non-zero
                NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS))
            },
            phantom: PhantomData,
        }
    }

    /// For the atom!() macros
    ///
    /// Packs up to 7 string bytes (`n`, whose lowest byte must be zero) and
    /// their length `len` into an inline atom.
    #[inline(always)]
    #[doc(hidden)]
    pub const fn pack_inline(mut n: u64, len: u8) -> Self {
        if cfg!(target_endian = "big") {
            // Reverse order of top 7 bytes.
            // Bottom 8 bits of `n` are zero, and we need that to remain so.
            // String data is stored in top 7 bytes, tag and length in bottom byte.
            n = n.to_le() << 8;
        }

        // Low byte holds the tag (bits 0..2) and length (bits 4..8).
        let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n;
        Self {
            // INLINE_TAG ensures this is never zero
            unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
            phantom: PhantomData,
        }
    }

    /// Extract the representation tag (low two bits) of this atom.
    fn tag(&self) -> u8 {
        (self.unsafe_data.get() & TAG_MASK) as u8
    }
}
125 | |
impl<Static: StaticAtomSet> Atom<Static> {
    /// Return the internal representation. For testing.
    #[doc(hidden)]
    pub fn unsafe_data(&self) -> u64 {
        self.unsafe_data.get()
    }

    /// Return true if this is a static Atom. For testing.
    #[doc(hidden)]
    pub fn is_static(&self) -> bool {
        self.tag() == STATIC_TAG
    }

    /// Return true if this is a dynamic Atom. For testing.
    #[doc(hidden)]
    pub fn is_dynamic(&self) -> bool {
        self.tag() == DYNAMIC_TAG
    }

    /// Return true if this is an inline Atom. For testing.
    #[doc(hidden)]
    pub fn is_inline(&self) -> bool {
        self.tag() == INLINE_TAG
    }

    /// Index of this (static) atom's string in the static set's tables.
    /// Only meaningful when `tag() == STATIC_TAG`.
    fn static_index(&self) -> u64 {
        self.unsafe_data.get() >> STATIC_SHIFT_BITS
    }

    /// Get the hash of the string as it is stored in the set.
    pub fn get_hash(&self) -> u32 {
        match self.tag() {
            DYNAMIC_TAG => {
                // Dynamic atoms cache their hash in the interned `Entry`.
                let entry = self.unsafe_data.get() as *const Entry;
                unsafe { (*entry).hash }
            }
            // Static atoms have precomputed hashes in the static set.
            STATIC_TAG => Static::get().hashes[self.static_index() as usize],
            INLINE_TAG => {
                // Derive a 32-bit hash by folding the halves of the word.
                let data = self.unsafe_data.get();
                // This may or may not be great...
                ((data >> 32) ^ data) as u32
            }
            _ => unsafe { debug_unreachable!() },
        }
    }

    /// Return the static atom for `string_to_add`, if it is in the static set.
    pub fn try_static(string_to_add: &str) -> Option<Self> {
        Self::try_static_internal(string_to_add).ok()
    }

    /// Look `string_to_add` up in the static PHF table. On a miss, return the
    /// computed hashes so the caller can reuse them for a dynamic insert.
    fn try_static_internal(string_to_add: &str) -> Result<Self, phf_shared::Hashes> {
        let static_set = Static::get();
        let hash = phf_shared::hash(&*string_to_add, &static_set.key);
        let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len());

        // PHF only says where the string *would* be; confirm it is actually there.
        if static_set.atoms[index as usize] == string_to_add {
            Ok(Self::pack_static(index))
        } else {
            Err(hash)
        }
    }
}
188 | |
189 | impl<Static: StaticAtomSet> Default for Atom<Static> { |
190 | #[inline] |
191 | fn default() -> Self { |
192 | Atom::pack_static(Static::empty_string_index()) |
193 | } |
194 | } |
195 | |
196 | impl<Static: StaticAtomSet> Hash for Atom<Static> { |
197 | #[inline] |
198 | fn hash<H>(&self, state: &mut H) |
199 | where |
200 | H: Hasher, |
201 | { |
202 | state.write_u32(self.get_hash()) |
203 | } |
204 | } |
205 | |
impl<'a, Static: StaticAtomSet> From<Cow<'a, str>> for Atom<Static> {
    /// Intern a string, picking the cheapest representation:
    /// empty string -> static empty atom; up to 7 bytes -> inline;
    /// otherwise a static-set hit, or a refcounted dynamic entry.
    fn from(string_to_add: Cow<'a, str>) -> Self {
        let len = string_to_add.len();
        if len == 0 {
            Self::pack_static(Static::empty_string_index())
        } else if len <= MAX_INLINE_LEN {
            // Start with the tag/length byte, then copy the bytes into the
            // upper 7 bytes of the word.
            let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET);
            {
                let dest = inline_atom_slice_mut(&mut data);
                dest[..len].copy_from_slice(string_to_add.as_bytes());
            }
            Atom {
                // INLINE_TAG ensures this is never zero
                unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
                phantom: PhantomData,
            }
        } else {
            // Try the static set first; on a miss, reuse its hash computation
            // for the dynamic-set insert.
            Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| {
                let ptr: std::ptr::NonNull<Entry> = dynamic_set().insert(string_to_add, hash.g);
                let data = ptr.as_ptr() as u64;
                // Entries must be aligned so the tag bits are free (DYNAMIC_TAG == 0).
                debug_assert!(0 == data & TAG_MASK);
                Atom {
                    // The address of a ptr::NonNull is non-zero
                    unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
                    phantom: PhantomData,
                }
            })
        }
    }
}
236 | |
237 | impl<Static: StaticAtomSet> Clone for Atom<Static> { |
238 | #[inline(always)] |
239 | fn clone(&self) -> Self { |
240 | if self.tag() == DYNAMIC_TAG { |
241 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
242 | unsafe { &*entry }.ref_count.fetch_add(val:1, order:SeqCst); |
243 | } |
244 | Atom { ..*self } |
245 | } |
246 | } |
247 | |
248 | impl<Static> Drop for Atom<Static> { |
249 | #[inline] |
250 | fn drop(&mut self) { |
251 | if self.tag() == DYNAMIC_TAG { |
252 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
253 | if unsafe { &*entry }.ref_count.fetch_sub(val:1, order:SeqCst) == 1 { |
254 | drop_slow(self) |
255 | } |
256 | } |
257 | |
258 | // Out of line to guide inlining. |
259 | fn drop_slow<Static>(this: &mut Atom<Static>) { |
260 | dynamic_set().remove(ptr:this.unsafe_data.get() as *mut Entry); |
261 | } |
262 | } |
263 | } |
264 | |
impl<Static: StaticAtomSet> ops::Deref for Atom<Static> {
    type Target = str;

    /// Borrow the atom's string, dispatching on the representation tag.
    #[inline]
    fn deref(&self) -> &str {
        unsafe {
            match self.tag() {
                DYNAMIC_TAG => {
                    // Dynamic: the data is a pointer to a live set `Entry`.
                    let entry: *const Entry = self.unsafe_data.get() as *const Entry;
                    &(*entry).string
                }
                INLINE_TAG => {
                    // Inline: length lives in bits 4..8; bytes in the upper 7 bytes.
                    let len: u64 = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET;
                    debug_assert!(len as usize <= MAX_INLINE_LEN);
                    let src: &[u8] = inline_atom_slice(&self.unsafe_data);
                    // Inline atoms are only ever created from valid UTF-8 prefixes.
                    str::from_utf8_unchecked(src.get_unchecked(..(len as usize)))
                }
                // Static: index into the static set's string table.
                STATIC_TAG => Static::get().atoms[self.static_index() as usize],
                _ => debug_unreachable!(),
            }
        }
    }
}
288 | |
289 | impl<Static: StaticAtomSet> fmt::Debug for Atom<Static> { |
290 | #[inline] |
291 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
292 | let ty_str: &'static str = unsafe { |
293 | match self.tag() { |
294 | DYNAMIC_TAG => "dynamic", |
295 | INLINE_TAG => "inline", |
296 | STATIC_TAG => "static", |
297 | _ => debug_unreachable!(), |
298 | } |
299 | }; |
300 | |
301 | write!(f, "Atom('{} ' type={} )", &*self, ty_str) |
302 | } |
303 | } |
304 | |
305 | impl<Static: StaticAtomSet> PartialOrd for Atom<Static> { |
306 | #[inline] |
307 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
308 | if self.unsafe_data == other.unsafe_data { |
309 | return Some(Equal); |
310 | } |
311 | self.as_ref().partial_cmp(other.as_ref()) |
312 | } |
313 | } |
314 | |
315 | impl<Static: StaticAtomSet> Ord for Atom<Static> { |
316 | #[inline] |
317 | fn cmp(&self, other: &Self) -> Ordering { |
318 | if self.unsafe_data == other.unsafe_data { |
319 | return Equal; |
320 | } |
321 | self.as_ref().cmp(other.as_ref()) |
322 | } |
323 | } |
324 | |
325 | // AsciiExt requires mutating methods, so we just implement the non-mutating ones. |
326 | // We don't need to implement is_ascii because there's no performance improvement |
327 | // over the one from &str. |
328 | impl<Static: StaticAtomSet> Atom<Static> { |
329 | fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self { |
330 | let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit(); |
331 | let buffer = unsafe { &mut *buffer.as_mut_ptr() }; |
332 | |
333 | if let Some(buffer_prefix) = buffer.get_mut(..s.len()) { |
334 | buffer_prefix.copy_from_slice(s.as_bytes()); |
335 | let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }; |
336 | f(as_str); |
337 | Atom::from(&*as_str) |
338 | } else { |
339 | let mut string = s.to_owned(); |
340 | f(&mut string); |
341 | Atom::from(string) |
342 | } |
343 | } |
344 | |
345 | /// Like [`to_ascii_uppercase`]. |
346 | /// |
347 | /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase |
348 | pub fn to_ascii_uppercase(&self) -> Self { |
349 | for (i, b) in self.bytes().enumerate() { |
350 | if let b'a'..= b'z'= b { |
351 | return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase()); |
352 | } |
353 | } |
354 | self.clone() |
355 | } |
356 | |
357 | /// Like [`to_ascii_lowercase`]. |
358 | /// |
359 | /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase |
360 | pub fn to_ascii_lowercase(&self) -> Self { |
361 | for (i, b) in self.bytes().enumerate() { |
362 | if let b'A'..= b'Z'= b { |
363 | return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase()); |
364 | } |
365 | } |
366 | self.clone() |
367 | } |
368 | |
369 | /// Like [`eq_ignore_ascii_case`]. |
370 | /// |
371 | /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case |
372 | pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { |
373 | (self == other) || self.eq_str_ignore_ascii_case(&**other) |
374 | } |
375 | |
376 | /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`. |
377 | /// |
378 | /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case |
379 | pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool { |
380 | (&**self).eq_ignore_ascii_case(other) |
381 | } |
382 | } |
383 | |
/// View the 7 payload bytes of an inline atom's representation as a slice.
/// The tag/length byte is the numerically lowest byte of the `u64`: first in
/// memory on little-endian targets, last on big-endian, so it is skipped on
/// little-endian and simply excluded from the length on big-endian.
#[inline(always)]
fn inline_atom_slice(x: &NonZeroU64) -> &[u8] {
    let x: *const NonZeroU64 = x;
    let mut data: *const u8 = x as *const u8;
    // All except the lowest byte, which is first in little-endian, last in big-endian.
    if cfg!(target_endian = "little") {
        // SAFETY: stepping one byte into an 8-byte value stays in bounds.
        data = unsafe { data.offset(1) };
    }
    let len: usize = 7;
    // SAFETY: `data` points at 7 readable bytes inside the u64, which outlives
    // the returned slice via the borrow of `x`.
    unsafe { slice::from_raw_parts(data, len) }
}
395 | |
/// Mutable view of the 7 payload bytes of an inline atom's representation.
/// Skips the tag/length byte (the numerically lowest byte of the `u64`; first
/// in memory on little-endian, last on big-endian).
#[inline(always)]
fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] {
    let x: *mut u64 = x;
    let mut data: *mut u8 = x as *mut u8;
    // All except the lowest byte, which is first in little-endian, last in big-endian.
    if cfg!(target_endian = "little") {
        // SAFETY: stepping one byte into an 8-byte value stays in bounds.
        data = unsafe { data.offset(1) };
    }
    let len: usize = 7;
    // SAFETY: `data` points at 7 writable bytes inside the u64; the exclusive
    // borrow of `x` guarantees no aliasing for the slice's lifetime.
    unsafe { slice::from_raw_parts_mut(data, len) }
}
407 |
Definitions
- Atom
- unsafe_data
- phantom
- pack_static
- pack_inline
- tag
- unsafe_data
- is_static
- is_dynamic
- is_inline
- static_index
- get_hash
- try_static
- try_static_internal
- default
- hash
- from
- clone
- drop
- drop_slow
- Target
- deref
- fmt
- partial_cmp
- cmp
- from_mutated_str
- to_ascii_uppercase
- to_ascii_lowercase
- eq_ignore_ascii_case
- eq_str_ignore_ascii_case
- inline_atom_slice
Learn Rust with the experts
Find out more