1// Copyright 2014 The Servo Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use crate::dynamic_set::{Entry, DYNAMIC_SET};
11use crate::static_sets::StaticAtomSet;
12use debug_unreachable::debug_unreachable;
13
14use std::borrow::Cow;
15use std::cmp::Ordering::{self, Equal};
16use std::fmt;
17use std::hash::{Hash, Hasher};
18use std::marker::PhantomData;
19use std::mem;
20use std::num::NonZeroU64;
21use std::ops;
22use std::slice;
23use std::str;
24use std::sync::atomic::Ordering::SeqCst;
25
// An `Atom` is a single tagged `u64`. The low two bits hold the tag that
// selects one of three encodings; the meaning of the remaining bits depends
// on the tag.
const DYNAMIC_TAG: u8 = 0b_00;
const INLINE_TAG: u8 = 0b_01; // len in upper nybble
const STATIC_TAG: u8 = 0b_10;
const TAG_MASK: u64 = 0b_11;
// For inline atoms, the string length is stored in bits 4..8 of the low byte.
const LEN_OFFSET: u64 = 4;
const LEN_MASK: u64 = 0xF0;

// After the tag/len byte, 7 bytes of the u64 remain for inline string data.
const MAX_INLINE_LEN: usize = 7;
// Static atoms store their index into the static set in the upper 32 bits.
const STATIC_SHIFT_BITS: usize = 32;
35
/// Represents a string that has been interned.
///
/// While the type definition for `Atom` indicates that it generic on a particular
/// implementation of an atom set, you don't need to worry about this. Atoms can be static
/// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they
/// can be dynamic and created by you on an `EmptyStaticAtomSet`.
///
/// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted;
/// this means that you may need to `.clone()` an atom to keep copies to it in different
/// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`.
///
/// ## Creating an atom at runtime
///
/// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code
/// may then do something like read data from somewhere and extract tokens that need to be
/// compared to the atoms. In this case, you can use `Atom::from(&str)` or
/// `Atom::from(String)`. These create a reference-counted atom which will be
/// automatically freed when all references to it are dropped.
///
/// This means that your application can safely have a loop which tokenizes data, creates
/// atoms from the tokens, and compares the atoms to a predefined set of keywords, without
/// running the risk of arbitrary memory consumption from creating large numbers of atoms —
/// as long as your application does not store clones of the atoms it creates along the
/// way.
///
/// For example, the following is safe and will not consume arbitrary amounts of memory:
///
/// ```ignore
/// let untrusted_data = "large amounts of text ...";
///
/// for token in untrusted_data.split_whitespace() {
///     let atom = Atom::from(token); // interns the string
///
///     if atom == Atom::from("keyword") {
///         // handle that keyword
///     } else if atom == Atom::from("another_keyword") {
///         // handle that keyword
///     } else {
///         println!("unknown keyword");
///     }
/// } // atom is dropped here, so it is not kept around in memory
/// ```
#[derive(PartialEq, Eq)]
// NOTE: Deriving PartialEq requires that a given string must always be interned the same way.
pub struct Atom<Static> {
    // Tagged word: tag in the low two bits (see `DYNAMIC_TAG` etc.), the rest
    // is an `Entry` pointer, inline bytes + length, or a static-set index.
    // `NonZeroU64` gives `Option<Atom>` the niche optimization for free.
    unsafe_data: NonZeroU64,
    // Ties the atom to its static set at the type level without storing it.
    phantom: PhantomData<Static>,
}
84
// FIXME: bound removed from the struct definition because of this error for pack_static:
// "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable"
// https://github.com/rust-lang/rust/issues/57563
impl<Static> Atom<Static> {
    /// For the atom!() macros
    ///
    /// Packs static-set index `n` into the tagged representation: index in the
    /// upper 32 bits, `STATIC_TAG` in the low two bits.
    #[inline(always)]
    #[doc(hidden)]
    pub const fn pack_static(n: u32) -> Self {
        Self {
            unsafe_data: unsafe {
                // STATIC_TAG ensures this is non-zero
                NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS))
            },
            phantom: PhantomData,
        }
    }

    /// Returns the encoding tag (the low two bits of the word).
    fn tag(&self) -> u8 {
        (self.unsafe_data.get() & TAG_MASK) as u8
    }
}
106
impl<Static: StaticAtomSet> Atom<Static> {
    /// Return the internal representation. For testing.
    #[doc(hidden)]
    pub fn unsafe_data(&self) -> u64 {
        self.unsafe_data.get()
    }

    /// Return true if this is a static Atom. For testing.
    #[doc(hidden)]
    pub fn is_static(&self) -> bool {
        self.tag() == STATIC_TAG
    }

    /// Return true if this is a dynamic Atom. For testing.
    #[doc(hidden)]
    pub fn is_dynamic(&self) -> bool {
        self.tag() == DYNAMIC_TAG
    }

    /// Return true if this is an inline Atom. For testing.
    #[doc(hidden)]
    pub fn is_inline(&self) -> bool {
        self.tag() == INLINE_TAG
    }

    /// Index of a static atom within `Static::get().atoms` (upper 32 bits).
    fn static_index(&self) -> u64 {
        self.unsafe_data.get() >> STATIC_SHIFT_BITS
    }

    /// Get the hash of the string as it is stored in the set.
    pub fn get_hash(&self) -> u32 {
        match self.tag() {
            DYNAMIC_TAG => {
                // SAFETY: a dynamic tag means the word is a valid `Entry`
                // pointer created by `From<Cow<str>>`.
                let entry = self.unsafe_data.get() as *const Entry;
                unsafe { (*entry).hash }
            }
            STATIC_TAG => Static::get().hashes[self.static_index() as usize],
            INLINE_TAG => {
                // Inline atoms have no stored hash; derive one from the word.
                let data = self.unsafe_data.get();
                // This may or may not be great...
                ((data >> 32) ^ data) as u32
            }
            _ => unsafe { debug_unreachable!() },
        }
    }

    /// Return the static atom for `string_to_add`, if it is in the static set.
    pub fn try_static(string_to_add: &str) -> Option<Self> {
        Self::try_static_internal(string_to_add).ok()
    }

    // On a miss, returns the computed PHF hashes so `From<Cow<str>>` can reuse
    // them when inserting into the dynamic set instead of rehashing.
    fn try_static_internal(string_to_add: &str) -> Result<Self, phf_shared::Hashes> {
        let static_set = Static::get();
        let hash = phf_shared::hash(&*string_to_add, &static_set.key);
        let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len());

        // The PHF lookup only yields a candidate slot; confirm the string matches.
        if static_set.atoms[index as usize] == string_to_add {
            Ok(Self::pack_static(index))
        } else {
            Err(hash)
        }
    }
}
169
170impl<Static: StaticAtomSet> Default for Atom<Static> {
171 #[inline]
172 fn default() -> Self {
173 Atom::pack_static(Static::empty_string_index())
174 }
175}
176
177impl<Static: StaticAtomSet> Hash for Atom<Static> {
178 #[inline]
179 fn hash<H>(&self, state: &mut H)
180 where
181 H: Hasher,
182 {
183 state.write_u32(self.get_hash())
184 }
185}
186
impl<'a, Static: StaticAtomSet> From<Cow<'a, str>> for Atom<Static> {
    /// Interns `string_to_add`: tries the static set first, then the inline
    /// encoding (up to 7 bytes), and finally the shared dynamic set.
    fn from(string_to_add: Cow<'a, str>) -> Self {
        Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| {
            let len = string_to_add.len();
            if len <= MAX_INLINE_LEN {
                // Start with tag + length in the low byte, then copy the
                // string bytes into the remaining 7 bytes of the word.
                let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET);
                {
                    let dest = inline_atom_slice_mut(&mut data);
                    dest[..len].copy_from_slice(string_to_add.as_bytes())
                }
                Atom {
                    // INLINE_TAG ensures this is never zero
                    unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
                    phantom: PhantomData,
                }
            } else {
                // Reuse the PHF hash computed during the static-set miss.
                let ptr: std::ptr::NonNull<Entry> = DYNAMIC_SET.insert(string_to_add, hash.g);
                let data = ptr.as_ptr() as u64;
                // Presumably `Entry` is aligned enough that the low tag bits
                // of its address are always zero; this assert checks that.
                debug_assert!(0 == data & TAG_MASK);
                Atom {
                    // The address of a ptr::NonNull is non-zero
                    unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
                    phantom: PhantomData,
                }
            }
        })
    }
}
215
216impl<Static: StaticAtomSet> Clone for Atom<Static> {
217 #[inline(always)]
218 fn clone(&self) -> Self {
219 if self.tag() == DYNAMIC_TAG {
220 let entry: *const Entry = self.unsafe_data.get() as *const Entry;
221 unsafe { &*entry }.ref_count.fetch_add(val:1, order:SeqCst);
222 }
223 Atom { ..*self }
224 }
225}
226
227impl<Static> Drop for Atom<Static> {
228 #[inline]
229 fn drop(&mut self) {
230 if self.tag() == DYNAMIC_TAG {
231 let entry: *const Entry = self.unsafe_data.get() as *const Entry;
232 if unsafe { &*entry }.ref_count.fetch_sub(val:1, order:SeqCst) == 1 {
233 drop_slow(self)
234 }
235 }
236
237 // Out of line to guide inlining.
238 fn drop_slow<Static>(this: &mut Atom<Static>) {
239 DYNAMIC_SET.remove(ptr:this.unsafe_data.get() as *mut Entry);
240 }
241 }
242}
243
impl<Static: StaticAtomSet> ops::Deref for Atom<Static> {
    type Target = str;

    /// Resolves the atom back to its string contents, decoding according to
    /// the tag: dynamic entry, inline bytes, or static-set lookup.
    #[inline]
    fn deref(&self) -> &str {
        // SAFETY: the tag discriminates which encoding the word holds, and
        // each arm only reads data valid for that encoding.
        unsafe {
            match self.tag() {
                DYNAMIC_TAG => {
                    let entry: *const Entry = self.unsafe_data.get() as *const Entry;
                    &(*entry).string
                }
                INLINE_TAG => {
                    // Length lives in bits 4..8; bytes follow in the word.
                    let len: u64 = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET;
                    let src: &[u8] = inline_atom_slice(&self.unsafe_data);
                    // Inline bytes were copied from a `str`, so they are UTF-8.
                    str::from_utf8_unchecked(&src[..(len as usize)])
                }
                STATIC_TAG => Static::get().atoms[self.static_index() as usize],
                _ => debug_unreachable!(),
            }
        }
    }
}
266
267impl<Static: StaticAtomSet> fmt::Debug for Atom<Static> {
268 #[inline]
269 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
270 let ty_str: &str = unsafe {
271 match self.tag() {
272 DYNAMIC_TAG => "dynamic",
273 INLINE_TAG => "inline",
274 STATIC_TAG => "static",
275 _ => debug_unreachable!(),
276 }
277 };
278
279 write!(f, "Atom('{}' type={})", &*self, ty_str)
280 }
281}
282
283impl<Static: StaticAtomSet> PartialOrd for Atom<Static> {
284 #[inline]
285 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
286 if self.unsafe_data == other.unsafe_data {
287 return Some(Equal);
288 }
289 self.as_ref().partial_cmp(other.as_ref())
290 }
291}
292
293impl<Static: StaticAtomSet> Ord for Atom<Static> {
294 #[inline]
295 fn cmp(&self, other: &Self) -> Ordering {
296 if self.unsafe_data == other.unsafe_data {
297 return Equal;
298 }
299 self.as_ref().cmp(other.as_ref())
300 }
301}
302
303// AsciiExt requires mutating methods, so we just implement the non-mutating ones.
304// We don't need to implement is_ascii because there's no performance improvement
305// over the one from &str.
306impl<Static: StaticAtomSet> Atom<Static> {
307 fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self {
308 let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit();
309 let buffer = unsafe { &mut *buffer.as_mut_ptr() };
310
311 if let Some(buffer_prefix) = buffer.get_mut(..s.len()) {
312 buffer_prefix.copy_from_slice(s.as_bytes());
313 let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) };
314 f(as_str);
315 Atom::from(&*as_str)
316 } else {
317 let mut string = s.to_owned();
318 f(&mut string);
319 Atom::from(string)
320 }
321 }
322
323 /// Like [`to_ascii_uppercase`].
324 ///
325 /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
326 pub fn to_ascii_uppercase(&self) -> Self {
327 for (i, b) in self.bytes().enumerate() {
328 if let b'a'..=b'z' = b {
329 return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase());
330 }
331 }
332 self.clone()
333 }
334
335 /// Like [`to_ascii_lowercase`].
336 ///
337 /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
338 pub fn to_ascii_lowercase(&self) -> Self {
339 for (i, b) in self.bytes().enumerate() {
340 if let b'A'..=b'Z' = b {
341 return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase());
342 }
343 }
344 self.clone()
345 }
346
347 /// Like [`eq_ignore_ascii_case`].
348 ///
349 /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case
350 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
351 (self == other) || self.eq_str_ignore_ascii_case(&**other)
352 }
353
354 /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`.
355 ///
356 /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case
357 pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool {
358 (&**self).eq_ignore_ascii_case(other)
359 }
360}
361
/// Views the 7 payload bytes of an inline atom's word.
///
/// The tag/length byte is the least significant byte of the `u64`, which is
/// the first byte in memory on little-endian targets and the last on
/// big-endian ones; the other 7 bytes hold the inline string data.
#[inline(always)]
fn inline_atom_slice(x: &NonZeroU64) -> &[u8] {
    let data = x as *const NonZeroU64 as *const u8;
    // Skip the lowest byte, which is first in little-endian, last in big-endian.
    let offset = if cfg!(target_endian = "little") { 1 } else { 0 };
    // SAFETY: `x` is 8 bytes and `offset + 7 <= 8`, so the range is in
    // bounds; the borrow of `x` keeps it alive for the returned lifetime.
    unsafe { slice::from_raw_parts(data.add(offset), 7) }
}
375
/// Mutable counterpart of `inline_atom_slice`: the 7 payload bytes of an
/// in-progress inline atom word, excluding the tag/length byte.
#[inline(always)]
fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] {
    let data = x as *mut u64 as *mut u8;
    // Skip the lowest byte, which is first in little-endian, last in big-endian.
    let offset = if cfg!(target_endian = "little") { 1 } else { 0 };
    // SAFETY: `x` is 8 bytes and `offset + 7 <= 8`, so the range is in
    // bounds; the unique borrow of `x` guarantees exclusive access.
    unsafe { slice::from_raw_parts_mut(data.add(offset), 7) }
}
389