| 1 | // Copyright 2014 The Servo Project Developers. See the COPYRIGHT |
| 2 | // file at the top-level directory of this distribution. |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 7 | // option. This file may not be copied, modified, or distributed |
| 8 | // except according to those terms. |
| 9 | |
| 10 | use crate::dynamic_set::{dynamic_set, Entry}; |
| 11 | use crate::static_sets::StaticAtomSet; |
| 12 | use debug_unreachable::debug_unreachable; |
| 13 | |
| 14 | use std::borrow::Cow; |
| 15 | use std::cmp::Ordering::{self, Equal}; |
| 16 | use std::fmt; |
| 17 | use std::hash::{Hash, Hasher}; |
| 18 | use std::marker::PhantomData; |
| 19 | use std::mem; |
| 20 | use std::num::NonZeroU64; |
| 21 | use std::ops; |
| 22 | use std::slice; |
| 23 | use std::str; |
| 24 | use std::sync::atomic::Ordering::SeqCst; |
| 25 | |
| 26 | const DYNAMIC_TAG: u8 = 0b_00; |
| 27 | const INLINE_TAG: u8 = 0b_01; // len in upper nybble |
| 28 | const STATIC_TAG: u8 = 0b_10; |
| 29 | const TAG_MASK: u64 = 0b_11; |
| 30 | const LEN_OFFSET: u64 = 4; |
| 31 | const LEN_MASK: u64 = 0xF0; |
| 32 | |
| 33 | const MAX_INLINE_LEN: usize = 7; |
| 34 | const STATIC_SHIFT_BITS: usize = 32; |
| 35 | |
| 36 | /// Represents a string that has been interned. |
| 37 | /// |
| 38 | /// While the type definition for `Atom` indicates that it generic on a particular |
| 39 | /// implementation of an atom set, you don't need to worry about this. Atoms can be static |
| 40 | /// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they |
| 41 | /// can be dynamic and created by you on an `EmptyStaticAtomSet`. |
| 42 | /// |
| 43 | /// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted; |
| 44 | /// this means that you may need to `.clone()` an atom to keep copies to it in different |
| 45 | /// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`. |
| 46 | /// |
| 47 | /// ## Creating an atom at runtime |
| 48 | /// |
| 49 | /// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code |
| 50 | /// may then do something like read data from somewhere and extract tokens that need to be |
| 51 | /// compared to the atoms. In this case, you can use `Atom::from(&str)` or |
| 52 | /// `Atom::from(String)`. These create a reference-counted atom which will be |
| 53 | /// automatically freed when all references to it are dropped. |
| 54 | /// |
| 55 | /// This means that your application can safely have a loop which tokenizes data, creates |
| 56 | /// atoms from the tokens, and compares the atoms to a predefined set of keywords, without |
| 57 | /// running the risk of arbitrary memory consumption from creating large numbers of atoms — |
| 58 | /// as long as your application does not store clones of the atoms it creates along the |
| 59 | /// way. |
| 60 | /// |
| 61 | /// For example, the following is safe and will not consume arbitrary amounts of memory: |
| 62 | /// |
| 63 | /// ```ignore |
| 64 | /// let untrusted_data = "large amounts of text ..." ; |
| 65 | /// |
| 66 | /// for token in untrusted_data.split_whitespace() { |
| 67 | /// let atom = Atom::from(token); // interns the string |
| 68 | /// |
| 69 | /// if atom == Atom::from("keyword" ) { |
| 70 | /// // handle that keyword |
| 71 | /// } else if atom == Atom::from("another_keyword" ) { |
| 72 | /// // handle that keyword |
| 73 | /// } else { |
| 74 | /// println!("unknown keyword" ); |
| 75 | /// } |
| 76 | /// } // atom is dropped here, so it is not kept around in memory |
| 77 | /// ``` |
| 78 | #[derive (PartialEq, Eq)] |
| 79 | // NOTE: Deriving PartialEq requires that a given string must always be interned the same way. |
| 80 | pub struct Atom<Static> { |
| 81 | unsafe_data: NonZeroU64, |
| 82 | phantom: PhantomData<Static>, |
| 83 | } |
| 84 | |
| 85 | // FIXME: bound removed from the struct definition before of this error for pack_static: |
| 86 | // "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" |
| 87 | // https://github.com/rust-lang/rust/issues/57563 |
| 88 | impl<Static> Atom<Static> { |
| 89 | /// For the atom!() macros |
| 90 | #[inline (always)] |
| 91 | #[doc (hidden)] |
| 92 | pub const fn pack_static(n: u32) -> Self { |
| 93 | Self { |
| 94 | unsafe_data: unsafe { |
| 95 | // STATIC_TAG ensures this is non-zero |
| 96 | NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS)) |
| 97 | }, |
| 98 | phantom: PhantomData, |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | /// For the atom!() macros |
| 103 | #[inline (always)] |
| 104 | #[doc (hidden)] |
| 105 | pub const fn pack_inline(mut n: u64, len: u8) -> Self { |
| 106 | if cfg!(target_endian = "big" ) { |
| 107 | // Reverse order of top 7 bytes. |
| 108 | // Bottom 8 bits of `n` are zero, and we need that to remain so. |
| 109 | // String data is stored in top 7 bytes, tag and length in bottom byte. |
| 110 | n = n.to_le() << 8; |
| 111 | } |
| 112 | |
| 113 | let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; |
| 114 | Self { |
| 115 | // INLINE_TAG ensures this is never zero |
| 116 | unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, |
| 117 | phantom: PhantomData, |
| 118 | } |
| 119 | } |
| 120 | |
| 121 | fn tag(&self) -> u8 { |
| 122 | (self.unsafe_data.get() & TAG_MASK) as u8 |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | impl<Static: StaticAtomSet> Atom<Static> { |
| 127 | /// Return the internal representation. For testing. |
| 128 | #[doc (hidden)] |
| 129 | pub fn unsafe_data(&self) -> u64 { |
| 130 | self.unsafe_data.get() |
| 131 | } |
| 132 | |
| 133 | /// Return true if this is a static Atom. For testing. |
| 134 | #[doc (hidden)] |
| 135 | pub fn is_static(&self) -> bool { |
| 136 | self.tag() == STATIC_TAG |
| 137 | } |
| 138 | |
| 139 | /// Return true if this is a dynamic Atom. For testing. |
| 140 | #[doc (hidden)] |
| 141 | pub fn is_dynamic(&self) -> bool { |
| 142 | self.tag() == DYNAMIC_TAG |
| 143 | } |
| 144 | |
| 145 | /// Return true if this is an inline Atom. For testing. |
| 146 | #[doc (hidden)] |
| 147 | pub fn is_inline(&self) -> bool { |
| 148 | self.tag() == INLINE_TAG |
| 149 | } |
| 150 | |
| 151 | fn static_index(&self) -> u64 { |
| 152 | self.unsafe_data.get() >> STATIC_SHIFT_BITS |
| 153 | } |
| 154 | |
| 155 | /// Get the hash of the string as it is stored in the set. |
| 156 | pub fn get_hash(&self) -> u32 { |
| 157 | match self.tag() { |
| 158 | DYNAMIC_TAG => { |
| 159 | let entry = self.unsafe_data.get() as *const Entry; |
| 160 | unsafe { (*entry).hash } |
| 161 | } |
| 162 | STATIC_TAG => Static::get().hashes[self.static_index() as usize], |
| 163 | INLINE_TAG => { |
| 164 | let data = self.unsafe_data.get(); |
| 165 | // This may or may not be great... |
| 166 | ((data >> 32) ^ data) as u32 |
| 167 | } |
| 168 | _ => unsafe { debug_unreachable!() }, |
| 169 | } |
| 170 | } |
| 171 | |
| 172 | pub fn try_static(string_to_add: &str) -> Option<Self> { |
| 173 | Self::try_static_internal(string_to_add).ok() |
| 174 | } |
| 175 | |
| 176 | fn try_static_internal(string_to_add: &str) -> Result<Self, phf_shared::Hashes> { |
| 177 | let static_set = Static::get(); |
| 178 | let hash = phf_shared::hash(&*string_to_add, &static_set.key); |
| 179 | let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len()); |
| 180 | |
| 181 | if static_set.atoms[index as usize] == string_to_add { |
| 182 | Ok(Self::pack_static(index)) |
| 183 | } else { |
| 184 | Err(hash) |
| 185 | } |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | impl<Static: StaticAtomSet> Default for Atom<Static> { |
| 190 | #[inline ] |
| 191 | fn default() -> Self { |
| 192 | Atom::pack_static(Static::empty_string_index()) |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | impl<Static: StaticAtomSet> Hash for Atom<Static> { |
| 197 | #[inline ] |
| 198 | fn hash<H>(&self, state: &mut H) |
| 199 | where |
| 200 | H: Hasher, |
| 201 | { |
| 202 | state.write_u32(self.get_hash()) |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | impl<'a, Static: StaticAtomSet> From<Cow<'a, str>> for Atom<Static> { |
| 207 | fn from(string_to_add: Cow<'a, str>) -> Self { |
| 208 | let len = string_to_add.len(); |
| 209 | if len == 0 { |
| 210 | Self::pack_static(Static::empty_string_index()) |
| 211 | } else if len <= MAX_INLINE_LEN { |
| 212 | let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); |
| 213 | { |
| 214 | let dest = inline_atom_slice_mut(&mut data); |
| 215 | dest[..len].copy_from_slice(string_to_add.as_bytes()); |
| 216 | } |
| 217 | Atom { |
| 218 | // INLINE_TAG ensures this is never zero |
| 219 | unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, |
| 220 | phantom: PhantomData, |
| 221 | } |
| 222 | } else { |
| 223 | Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { |
| 224 | let ptr: std::ptr::NonNull<Entry> = dynamic_set().insert(string_to_add, hash.g); |
| 225 | let data = ptr.as_ptr() as u64; |
| 226 | debug_assert!(0 == data & TAG_MASK); |
| 227 | Atom { |
| 228 | // The address of a ptr::NonNull is non-zero |
| 229 | unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, |
| 230 | phantom: PhantomData, |
| 231 | } |
| 232 | }) |
| 233 | } |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | impl<Static: StaticAtomSet> Clone for Atom<Static> { |
| 238 | #[inline (always)] |
| 239 | fn clone(&self) -> Self { |
| 240 | if self.tag() == DYNAMIC_TAG { |
| 241 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
| 242 | unsafe { &*entry }.ref_count.fetch_add(val:1, order:SeqCst); |
| 243 | } |
| 244 | Atom { ..*self } |
| 245 | } |
| 246 | } |
| 247 | |
| 248 | impl<Static> Drop for Atom<Static> { |
| 249 | #[inline ] |
| 250 | fn drop(&mut self) { |
| 251 | if self.tag() == DYNAMIC_TAG { |
| 252 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
| 253 | if unsafe { &*entry }.ref_count.fetch_sub(val:1, order:SeqCst) == 1 { |
| 254 | drop_slow(self) |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | // Out of line to guide inlining. |
| 259 | fn drop_slow<Static>(this: &mut Atom<Static>) { |
| 260 | dynamic_set().remove(ptr:this.unsafe_data.get() as *mut Entry); |
| 261 | } |
| 262 | } |
| 263 | } |
| 264 | |
| 265 | impl<Static: StaticAtomSet> ops::Deref for Atom<Static> { |
| 266 | type Target = str; |
| 267 | |
| 268 | #[inline ] |
| 269 | fn deref(&self) -> &str { |
| 270 | unsafe { |
| 271 | match self.tag() { |
| 272 | DYNAMIC_TAG => { |
| 273 | let entry: *const Entry = self.unsafe_data.get() as *const Entry; |
| 274 | &(*entry).string |
| 275 | } |
| 276 | INLINE_TAG => { |
| 277 | let len: u64 = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; |
| 278 | debug_assert!(len as usize <= MAX_INLINE_LEN); |
| 279 | let src: &[u8] = inline_atom_slice(&self.unsafe_data); |
| 280 | str::from_utf8_unchecked(src.get_unchecked(..(len as usize))) |
| 281 | } |
| 282 | STATIC_TAG => Static::get().atoms[self.static_index() as usize], |
| 283 | _ => debug_unreachable!(), |
| 284 | } |
| 285 | } |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | impl<Static: StaticAtomSet> fmt::Debug for Atom<Static> { |
| 290 | #[inline ] |
| 291 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 292 | let ty_str: &'static str = unsafe { |
| 293 | match self.tag() { |
| 294 | DYNAMIC_TAG => "dynamic" , |
| 295 | INLINE_TAG => "inline" , |
| 296 | STATIC_TAG => "static" , |
| 297 | _ => debug_unreachable!(), |
| 298 | } |
| 299 | }; |
| 300 | |
| 301 | write!(f, "Atom(' {}' type= {})" , &*self, ty_str) |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | impl<Static: StaticAtomSet> PartialOrd for Atom<Static> { |
| 306 | #[inline ] |
| 307 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
| 308 | if self.unsafe_data == other.unsafe_data { |
| 309 | return Some(Equal); |
| 310 | } |
| 311 | self.as_ref().partial_cmp(other.as_ref()) |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | impl<Static: StaticAtomSet> Ord for Atom<Static> { |
| 316 | #[inline ] |
| 317 | fn cmp(&self, other: &Self) -> Ordering { |
| 318 | if self.unsafe_data == other.unsafe_data { |
| 319 | return Equal; |
| 320 | } |
| 321 | self.as_ref().cmp(other.as_ref()) |
| 322 | } |
| 323 | } |
| 324 | |
| 325 | // AsciiExt requires mutating methods, so we just implement the non-mutating ones. |
| 326 | // We don't need to implement is_ascii because there's no performance improvement |
| 327 | // over the one from &str. |
| 328 | impl<Static: StaticAtomSet> Atom<Static> { |
| 329 | fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self { |
| 330 | let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit(); |
| 331 | let buffer = unsafe { &mut *buffer.as_mut_ptr() }; |
| 332 | |
| 333 | if let Some(buffer_prefix) = buffer.get_mut(..s.len()) { |
| 334 | buffer_prefix.copy_from_slice(s.as_bytes()); |
| 335 | let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }; |
| 336 | f(as_str); |
| 337 | Atom::from(&*as_str) |
| 338 | } else { |
| 339 | let mut string = s.to_owned(); |
| 340 | f(&mut string); |
| 341 | Atom::from(string) |
| 342 | } |
| 343 | } |
| 344 | |
| 345 | /// Like [`to_ascii_uppercase`]. |
| 346 | /// |
| 347 | /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase |
| 348 | pub fn to_ascii_uppercase(&self) -> Self { |
| 349 | for (i, b) in self.bytes().enumerate() { |
| 350 | if let b'a' ..=b'z' = b { |
| 351 | return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase()); |
| 352 | } |
| 353 | } |
| 354 | self.clone() |
| 355 | } |
| 356 | |
| 357 | /// Like [`to_ascii_lowercase`]. |
| 358 | /// |
| 359 | /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase |
| 360 | pub fn to_ascii_lowercase(&self) -> Self { |
| 361 | for (i, b) in self.bytes().enumerate() { |
| 362 | if let b'A' ..=b'Z' = b { |
| 363 | return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase()); |
| 364 | } |
| 365 | } |
| 366 | self.clone() |
| 367 | } |
| 368 | |
| 369 | /// Like [`eq_ignore_ascii_case`]. |
| 370 | /// |
| 371 | /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case |
| 372 | pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { |
| 373 | (self == other) || self.eq_str_ignore_ascii_case(&**other) |
| 374 | } |
| 375 | |
| 376 | /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`. |
| 377 | /// |
| 378 | /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case |
| 379 | pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool { |
| 380 | (&**self).eq_ignore_ascii_case(other) |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | #[inline (always)] |
| 385 | fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { |
| 386 | let x: *const NonZeroU64 = x; |
| 387 | let mut data: *const u8 = x as *const u8; |
| 388 | // All except the lowest byte, which is first in little-endian, last in big-endian. |
| 389 | if cfg!(target_endian = "little" ) { |
| 390 | data = unsafe { data.offset(count:1) }; |
| 391 | } |
| 392 | let len: usize = 7; |
| 393 | unsafe { slice::from_raw_parts(data, len) } |
| 394 | } |
| 395 | |
| 396 | #[inline (always)] |
| 397 | fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { |
| 398 | let x: *mut u64 = x; |
| 399 | let mut data: *mut u8 = x as *mut u8; |
| 400 | // All except the lowest byte, which is first in little-endian, last in big-endian. |
| 401 | if cfg!(target_endian = "little" ) { |
| 402 | data = unsafe { data.offset(count:1) }; |
| 403 | } |
| 404 | let len: usize = 7; |
| 405 | unsafe { slice::from_raw_parts_mut(data, len) } |
| 406 | } |
| 407 | |