1 | // Copyright 2014 The Servo Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use crate::dynamic_set::{Entry, DYNAMIC_SET}; |
11 | use crate::static_sets::StaticAtomSet; |
12 | use debug_unreachable::debug_unreachable; |
13 | |
14 | use std::borrow::Cow; |
15 | use std::cmp::Ordering::{self, Equal}; |
16 | use std::fmt; |
17 | use std::hash::{Hash, Hasher}; |
18 | use std::marker::PhantomData; |
19 | use std::mem; |
20 | use std::num::NonZeroU64; |
21 | use std::ops; |
22 | use std::slice; |
23 | use std::str; |
24 | use std::sync::atomic::Ordering::SeqCst; |
25 | |
26 | const DYNAMIC_TAG: u8 = 0b_00; |
27 | const INLINE_TAG: u8 = 0b_01; // len in upper nybble |
28 | const STATIC_TAG: u8 = 0b_10; |
29 | const TAG_MASK: u64 = 0b_11; |
30 | const LEN_OFFSET: u64 = 4; |
31 | const LEN_MASK: u64 = 0xF0; |
32 | |
33 | const MAX_INLINE_LEN: usize = 7; |
34 | const STATIC_SHIFT_BITS: usize = 32; |
35 | |
36 | /// Represents a string that has been interned. |
37 | /// |
38 | /// While the type definition for `Atom` indicates that it generic on a particular |
39 | /// implementation of an atom set, you don't need to worry about this. Atoms can be static |
40 | /// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they |
41 | /// can be dynamic and created by you on an `EmptyStaticAtomSet`. |
42 | /// |
43 | /// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted; |
44 | /// this means that you may need to `.clone()` an atom to keep copies to it in different |
45 | /// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`. |
46 | /// |
47 | /// ## Creating an atom at runtime |
48 | /// |
49 | /// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code |
50 | /// may then do something like read data from somewhere and extract tokens that need to be |
51 | /// compared to the atoms. In this case, you can use `Atom::from(&str)` or |
52 | /// `Atom::from(String)`. These create a reference-counted atom which will be |
53 | /// automatically freed when all references to it are dropped. |
54 | /// |
55 | /// This means that your application can safely have a loop which tokenizes data, creates |
56 | /// atoms from the tokens, and compares the atoms to a predefined set of keywords, without |
57 | /// running the risk of arbitrary memory consumption from creating large numbers of atoms — |
58 | /// as long as your application does not store clones of the atoms it creates along the |
59 | /// way. |
60 | /// |
61 | /// For example, the following is safe and will not consume arbitrary amounts of memory: |
62 | /// |
63 | /// ```ignore |
64 | /// let untrusted_data = "large amounts of text ..." ; |
65 | /// |
66 | /// for token in untrusted_data.split_whitespace() { |
67 | /// let atom = Atom::from(token); // interns the string |
68 | /// |
69 | /// if atom == Atom::from("keyword" ) { |
70 | /// // handle that keyword |
71 | /// } else if atom == Atom::from("another_keyword" ) { |
72 | /// // handle that keyword |
73 | /// } else { |
74 | /// println!("unknown keyword" ); |
75 | /// } |
76 | /// } // atom is dropped here, so it is not kept around in memory |
77 | /// ``` |
78 | #[derive (PartialEq, Eq)] |
79 | // NOTE: Deriving PartialEq requires that a given string must always be interned the same way. |
80 | pub struct Atom<Static> { |
81 | unsafe_data: NonZeroU64, |
82 | phantom: PhantomData<Static>, |
83 | } |
84 | |
85 | // FIXME: bound removed from the struct definition before of this error for pack_static: |
86 | // "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" |
87 | // https://github.com/rust-lang/rust/issues/57563 |
88 | impl<Static> Atom<Static> { |
89 | /// For the atom!() macros |
90 | #[inline (always)] |
91 | #[doc (hidden)] |
92 | pub const fn pack_static(n: u32) -> Self { |
93 | Self { |
94 | unsafe_data: unsafe { |
95 | // STATIC_TAG ensures this is non-zero |
96 | NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS)) |
97 | }, |
98 | phantom: PhantomData, |
99 | } |
100 | } |
101 | |
102 | fn tag(&self) -> u8 { |
103 | (self.unsafe_data.get() & TAG_MASK) as u8 |
104 | } |
105 | } |
106 | |
107 | impl<Static: StaticAtomSet> Atom<Static> { |
108 | /// Return the internal representation. For testing. |
109 | #[doc (hidden)] |
110 | pub fn unsafe_data(&self) -> u64 { |
111 | self.unsafe_data.get() |
112 | } |
113 | |
114 | /// Return true if this is a static Atom. For testing. |
115 | #[doc (hidden)] |
116 | pub fn is_static(&self) -> bool { |
117 | self.tag() == STATIC_TAG |
118 | } |
119 | |
120 | /// Return true if this is a dynamic Atom. For testing. |
121 | #[doc (hidden)] |
122 | pub fn is_dynamic(&self) -> bool { |
123 | self.tag() == DYNAMIC_TAG |
124 | } |
125 | |
126 | /// Return true if this is an inline Atom. For testing. |
127 | #[doc (hidden)] |
128 | pub fn is_inline(&self) -> bool { |
129 | self.tag() == INLINE_TAG |
130 | } |
131 | |
132 | fn static_index(&self) -> u64 { |
133 | self.unsafe_data.get() >> STATIC_SHIFT_BITS |
134 | } |
135 | |
136 | /// Get the hash of the string as it is stored in the set. |
137 | pub fn get_hash(&self) -> u32 { |
138 | match self.tag() { |
139 | DYNAMIC_TAG => { |
140 | let entry = self.unsafe_data.get() as *const Entry; |
141 | unsafe { (*entry).hash } |
142 | } |
143 | STATIC_TAG => Static::get().hashes[self.static_index() as usize], |
144 | INLINE_TAG => { |
145 | let data = self.unsafe_data.get(); |
146 | // This may or may not be great... |
147 | ((data >> 32) ^ data) as u32 |
148 | } |
149 | _ => unsafe { debug_unreachable!() }, |
150 | } |
151 | } |
152 | |
153 | pub fn try_static(string_to_add: &str) -> Option<Self> { |
154 | Self::try_static_internal(string_to_add).ok() |
155 | } |
156 | |
157 | fn try_static_internal(string_to_add: &str) -> Result<Self, phf_shared::Hashes> { |
158 | let static_set = Static::get(); |
159 | let hash = phf_shared::hash(&*string_to_add, &static_set.key); |
160 | let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len()); |
161 | |
162 | if static_set.atoms[index as usize] == string_to_add { |
163 | Ok(Self::pack_static(index)) |
164 | } else { |
165 | Err(hash) |
166 | } |
167 | } |
168 | } |
169 | |
170 | impl<Static: StaticAtomSet> Default for Atom<Static> { |
171 | #[inline ] |
172 | fn default() -> Self { |
173 | Atom::pack_static(Static::empty_string_index()) |
174 | } |
175 | } |
176 | |
177 | impl<Static: StaticAtomSet> Hash for Atom<Static> { |
178 | #[inline ] |
179 | fn hash<H>(&self, state: &mut H) |
180 | where |
181 | H: Hasher, |
182 | { |
183 | state.write_u32(self.get_hash()) |
184 | } |
185 | } |
186 | |
187 | impl<'a, Static: StaticAtomSet> From<Cow<'a, str>> for Atom<Static> { |
188 | fn from(string_to_add: Cow<'a, str>) -> Self { |
189 | Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { |
190 | let len = string_to_add.len(); |
191 | if len <= MAX_INLINE_LEN { |
192 | let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); |
193 | { |
194 | let dest = inline_atom_slice_mut(&mut data); |
195 | dest[..len].copy_from_slice(string_to_add.as_bytes()) |
196 | } |
197 | Atom { |
198 | // INLINE_TAG ensures this is never zero |
199 | unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, |
200 | phantom: PhantomData, |
201 | } |
202 | } else { |
203 | let ptr: std::ptr::NonNull<Entry> = DYNAMIC_SET.insert(string_to_add, hash.g); |
204 | let data = ptr.as_ptr() as u64; |
205 | debug_assert!(0 == data & TAG_MASK); |
206 | Atom { |
207 | // The address of a ptr::NonNull is non-zero |
208 | unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, |
209 | phantom: PhantomData, |
210 | } |
211 | } |
212 | }) |
213 | } |
214 | } |
215 | |
216 | impl<Static: StaticAtomSet> Clone for Atom<Static> { |
217 | #[inline (always)] |
218 | fn clone(&self) -> Self { |
219 | if self.tag() == DYNAMIC_TAG { |
220 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
221 | unsafe { &*entry }.ref_count.fetch_add(val:1, order:SeqCst); |
222 | } |
223 | Atom { ..*self } |
224 | } |
225 | } |
226 | |
227 | impl<Static> Drop for Atom<Static> { |
228 | #[inline ] |
229 | fn drop(&mut self) { |
230 | if self.tag() == DYNAMIC_TAG { |
231 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
232 | if unsafe { &*entry }.ref_count.fetch_sub(val:1, order:SeqCst) == 1 { |
233 | drop_slow(self) |
234 | } |
235 | } |
236 | |
237 | // Out of line to guide inlining. |
238 | fn drop_slow<Static>(this: &mut Atom<Static>) { |
239 | DYNAMIC_SET.remove(ptr:this.unsafe_data.get() as *mut Entry); |
240 | } |
241 | } |
242 | } |
243 | |
244 | impl<Static: StaticAtomSet> ops::Deref for Atom<Static> { |
245 | type Target = str; |
246 | |
247 | #[inline ] |
248 | fn deref(&self) -> &str { |
249 | unsafe { |
250 | match self.tag() { |
251 | DYNAMIC_TAG => { |
252 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
253 | &(*entry).string |
254 | } |
255 | INLINE_TAG => { |
256 | let len: u64 = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; |
257 | let src: &[u8] = inline_atom_slice(&self.unsafe_data); |
258 | str::from_utf8_unchecked(&src[..(len as usize)]) |
259 | } |
260 | STATIC_TAG => Static::get().atoms[self.static_index() as usize], |
261 | _ => debug_unreachable!(), |
262 | } |
263 | } |
264 | } |
265 | } |
266 | |
267 | impl<Static: StaticAtomSet> fmt::Debug for Atom<Static> { |
268 | #[inline ] |
269 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
270 | let ty_str: &str = unsafe { |
271 | match self.tag() { |
272 | DYNAMIC_TAG => "dynamic" , |
273 | INLINE_TAG => "inline" , |
274 | STATIC_TAG => "static" , |
275 | _ => debug_unreachable!(), |
276 | } |
277 | }; |
278 | |
279 | write!(f, "Atom(' {}' type= {})" , &*self, ty_str) |
280 | } |
281 | } |
282 | |
283 | impl<Static: StaticAtomSet> PartialOrd for Atom<Static> { |
284 | #[inline ] |
285 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
286 | if self.unsafe_data == other.unsafe_data { |
287 | return Some(Equal); |
288 | } |
289 | self.as_ref().partial_cmp(other.as_ref()) |
290 | } |
291 | } |
292 | |
293 | impl<Static: StaticAtomSet> Ord for Atom<Static> { |
294 | #[inline ] |
295 | fn cmp(&self, other: &Self) -> Ordering { |
296 | if self.unsafe_data == other.unsafe_data { |
297 | return Equal; |
298 | } |
299 | self.as_ref().cmp(other.as_ref()) |
300 | } |
301 | } |
302 | |
303 | // AsciiExt requires mutating methods, so we just implement the non-mutating ones. |
304 | // We don't need to implement is_ascii because there's no performance improvement |
305 | // over the one from &str. |
306 | impl<Static: StaticAtomSet> Atom<Static> { |
307 | fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self { |
308 | let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit(); |
309 | let buffer = unsafe { &mut *buffer.as_mut_ptr() }; |
310 | |
311 | if let Some(buffer_prefix) = buffer.get_mut(..s.len()) { |
312 | buffer_prefix.copy_from_slice(s.as_bytes()); |
313 | let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }; |
314 | f(as_str); |
315 | Atom::from(&*as_str) |
316 | } else { |
317 | let mut string = s.to_owned(); |
318 | f(&mut string); |
319 | Atom::from(string) |
320 | } |
321 | } |
322 | |
323 | /// Like [`to_ascii_uppercase`]. |
324 | /// |
325 | /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase |
326 | pub fn to_ascii_uppercase(&self) -> Self { |
327 | for (i, b) in self.bytes().enumerate() { |
328 | if let b'a' ..=b'z' = b { |
329 | return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase()); |
330 | } |
331 | } |
332 | self.clone() |
333 | } |
334 | |
335 | /// Like [`to_ascii_lowercase`]. |
336 | /// |
337 | /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase |
338 | pub fn to_ascii_lowercase(&self) -> Self { |
339 | for (i, b) in self.bytes().enumerate() { |
340 | if let b'A' ..=b'Z' = b { |
341 | return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase()); |
342 | } |
343 | } |
344 | self.clone() |
345 | } |
346 | |
347 | /// Like [`eq_ignore_ascii_case`]. |
348 | /// |
349 | /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case |
350 | pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { |
351 | (self == other) || self.eq_str_ignore_ascii_case(&**other) |
352 | } |
353 | |
354 | /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`. |
355 | /// |
356 | /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case |
357 | pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool { |
358 | (&**self).eq_ignore_ascii_case(other) |
359 | } |
360 | } |
361 | |
362 | #[inline (always)] |
363 | fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { |
364 | unsafe { |
365 | let x: *const NonZeroU64 = x; |
366 | let mut data: *const u8 = x as *const u8; |
367 | // All except the lowest byte, which is first in little-endian, last in big-endian. |
368 | if cfg!(target_endian = "little" ) { |
369 | data = data.offset(count:1); |
370 | } |
371 | let len: usize = 7; |
372 | slice::from_raw_parts(data, len) |
373 | } |
374 | } |
375 | |
376 | #[inline (always)] |
377 | fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { |
378 | unsafe { |
379 | let x: *mut u64 = x; |
380 | let mut data: *mut u8 = x as *mut u8; |
381 | // All except the lowest byte, which is first in little-endian, last in big-endian. |
382 | if cfg!(target_endian = "little" ) { |
383 | data = data.offset(count:1); |
384 | } |
385 | let len: usize = 7; |
386 | slice::from_raw_parts_mut(data, len) |
387 | } |
388 | } |
389 | |