| 1 | //! A string table implementation with a tree-like encoding. |
| 2 | //! |
| 3 | //! Each entry in the table represents a string and is encoded as a list of |
| 4 | //! components where each component can either be |
| 5 | //! |
| 6 | //! 1. a string _value_ that contains actual UTF-8 string content, |
| 7 | //! 2. a string _ID_ that contains a reference to another entry, or |
| 8 | //! 3. a terminator tag which marks the end of a component list. |
| 9 | //! |
| 10 | //! The string _content_ of an entry is defined as the concatenation of the |
| 11 | //! content of its components. The content of a string value is its actual |
| 12 | //! UTF-8 bytes. The content of a string ID is the contents of the entry |
| 13 | //! it references. |
| 14 | //! |
| 15 | //! The byte-level encoding of component lists uses the structure of UTF-8 in |
| 16 | //! order to save space: |
| 17 | //! |
| 18 | //! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use |
| 19 | //! of this fact by letting all string ID components start with this `0xFE` |
| 20 | //! prefix. Thus when we parse the contents of a value we know to stop if |
| 21 | //! we encounter this byte. |
| 22 | //! |
| 23 | //! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely |
| 24 | //! use `0xFF` as our component list terminator. |
| 25 | //! |
| 26 | //! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be |
| 27 | //! encoded as: |
| 28 | //! |
| 29 | //! ```ignore |
| 30 | //! ['a' , 'b' , 'c' , 254, 42, 0, 0, 0, 'd' , 'e' , 'f' , 255] |
| 31 | //! ^^^^^^^^^^^^^^^^ ^^^ |
| 32 | //! string ID with 0xFE prefix terminator (0xFF) |
| 33 | //! ``` |
| 34 | //! |
| 35 | //! As you can see string IDs are encoded in little endian format. |
| 36 | //! |
| 37 | //! ---------------------------------------------------------------------------- |
| 38 | //! |
| 39 | //! Each string in the table is referred to via a `StringId`. `StringId`s may |
| 40 | //! be generated in two ways: |
| 41 | //! |
| 42 | //! 1. Calling `StringTableBuilder::alloc()` which returns the `StringId` for |
| 43 | //! the allocated string. |
| 44 | //! 2. Calling `StringId::new_virtual()` to create a "virtual" `StringId` that |
| 45 | //! later can be mapped to an actual string via |
| 46 | //! `StringTableBuilder::map_virtual_to_concrete_string()`. |
| 47 | //! |
| 48 | //! String IDs allow you to deduplicate strings by allocating a string |
| 49 | //! once and then referring to it by id over and over. This is a useful trick |
| 50 | //! for strings which are recorded many times and it can significantly reduce |
| 51 | //! the size of profile trace files. |
| 52 | //! |
| 53 | //! `StringId`s are partitioned according to type: |
| 54 | //! |
| 55 | //! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ] |
| 56 | //! |
| 57 | //! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings. |
| 58 | //! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) |
| 59 | //! which is used internally by `measureme` to record additional metadata about |
| 60 | //! the profiling session. After `METADATA_STRING_ID` are all other `StringId` |
| 61 | //! values. |
| 62 | |
| 63 | use crate::file_header::{ |
| 64 | write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX, |
| 65 | }; |
| 66 | use crate::serialization::Addr; |
| 67 | use crate::serialization::SerializationSink; |
| 68 | use std::{error::Error, sync::Arc}; |
| 69 | |
| 70 | /// A `StringId` is used to identify a string in the `StringTable`. It is |
| 71 | /// either a regular `StringId`, meaning that it contains the absolute address |
| 72 | /// of a string within the string table data. Or it is "virtual", which means |
| 73 | /// that the address it points to is resolved via the string table index data, |
| 74 | /// that maps virtual `StringId`s to addresses. |
| 75 | #[derive (Clone, Copy, Eq, PartialEq, Debug, Hash)] |
| 76 | #[repr (C)] |
| 77 | pub struct StringId(u64); |
| 78 | |
| 79 | impl StringId { |
| 80 | pub const INVALID: StringId = StringId(INVALID_STRING_ID); |
| 81 | |
| 82 | #[inline ] |
| 83 | pub fn new(id: impl Into<u64>) -> StringId { |
| 84 | StringId(id.into()) |
| 85 | } |
| 86 | |
| 87 | #[inline ] |
| 88 | pub fn new_virtual(id: impl Into<u64>) -> StringId { |
| 89 | let id = id.into(); |
| 90 | assert!(id <= MAX_USER_VIRTUAL_STRING_ID); |
| 91 | StringId(id) |
| 92 | } |
| 93 | |
| 94 | #[inline ] |
| 95 | pub fn is_virtual(self) -> bool { |
| 96 | self.0 <= METADATA_STRING_ID |
| 97 | } |
| 98 | |
| 99 | #[inline ] |
| 100 | pub fn as_u64(self) -> u64 { |
| 101 | self.0 |
| 102 | } |
| 103 | |
| 104 | #[inline ] |
| 105 | pub fn from_addr(addr: Addr) -> StringId { |
| 106 | let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap(); |
| 107 | StringId::new(id) |
| 108 | } |
| 109 | |
| 110 | #[inline ] |
| 111 | pub fn to_addr(self) -> Addr { |
| 112 | Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap()) |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | // See module-level documentation for more information on the encoding. |
| 117 | pub const TERMINATOR: u8 = 0xFF; |
| 118 | pub const STRING_REF_TAG: u8 = 0xFE; |
| 119 | pub const STRING_REF_ENCODED_SIZE: usize = 9; |
| 120 | |
| 121 | /// The maximum id value a virtual string may be. |
| 122 | const MAX_USER_VIRTUAL_STRING_ID: u64 = 100_000_000; |
| 123 | |
| 124 | /// The id of the profile metadata string entry. |
| 125 | pub const METADATA_STRING_ID: u64 = MAX_USER_VIRTUAL_STRING_ID + 1; |
| 126 | |
| 127 | /// Some random string ID that we make sure cannot be generated or assigned to. |
| 128 | const INVALID_STRING_ID: u64 = METADATA_STRING_ID + 1; |
| 129 | |
| 130 | pub const FIRST_REGULAR_STRING_ID: u64 = INVALID_STRING_ID + 1; |
| 131 | |
| 132 | /// Write-only version of the string table |
| 133 | pub struct StringTableBuilder { |
| 134 | data_sink: Arc<SerializationSink>, |
| 135 | index_sink: Arc<SerializationSink>, |
| 136 | } |
| 137 | |
| 138 | /// Anything that implements `SerializableString` can be written to a |
| 139 | /// `StringTable`. |
| 140 | pub trait SerializableString { |
| 141 | fn serialized_size(&self) -> usize; |
| 142 | fn serialize(&self, bytes: &mut [u8]); |
| 143 | } |
| 144 | |
| 145 | // A single string is encoded as `[UTF-8 bytes][TERMINATOR]` |
| 146 | impl SerializableString for str { |
| 147 | #[inline ] |
| 148 | fn serialized_size(&self) -> usize { |
| 149 | self.len() + // actual bytes |
| 150 | 1 // terminator |
| 151 | } |
| 152 | |
| 153 | #[inline ] |
| 154 | fn serialize(&self, bytes: &mut [u8]) { |
| 155 | let last_byte_index: usize = bytes.len() - 1; |
| 156 | bytes[0..last_byte_index].copy_from_slice(self.as_bytes()); |
| 157 | bytes[last_byte_index] = TERMINATOR; |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | /// A single component of a string. Used for building composite table entries. |
| 162 | pub enum StringComponent<'s> { |
| 163 | Value(&'s str), |
| 164 | Ref(StringId), |
| 165 | } |
| 166 | |
| 167 | impl<'s> StringComponent<'s> { |
| 168 | #[inline ] |
| 169 | fn serialized_size(&self) -> usize { |
| 170 | match *self { |
| 171 | StringComponent::Value(s) => s.len(), |
| 172 | StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE, |
| 173 | } |
| 174 | } |
| 175 | |
| 176 | #[inline ] |
| 177 | fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] { |
| 178 | match *self { |
| 179 | StringComponent::Value(s) => { |
| 180 | bytes[..s.len()].copy_from_slice(s.as_bytes()); |
| 181 | &mut bytes[s.len()..] |
| 182 | } |
| 183 | StringComponent::Ref(string_id) => { |
| 184 | // The code below assumes we use a 9-byte encoding for string |
| 185 | // refs, where the first byte is STRING_REF_TAG and the |
| 186 | // following 8 bytes are a little-endian u64 string ID value. |
| 187 | assert!(STRING_REF_ENCODED_SIZE == 9); |
| 188 | |
| 189 | bytes[0] = STRING_REF_TAG; |
| 190 | bytes[1..9].copy_from_slice(&string_id.0.to_le_bytes()); |
| 191 | &mut bytes[9..] |
| 192 | } |
| 193 | } |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | impl<'a> SerializableString for [StringComponent<'a>] { |
| 198 | #[inline ] |
| 199 | fn serialized_size(&self) -> usize { |
| 200 | self.iter().map(|c: &StringComponent<'_>| c.serialized_size()).sum::<usize>() + // size of components |
| 201 | 1 // terminator |
| 202 | } |
| 203 | |
| 204 | #[inline ] |
| 205 | fn serialize(&self, mut bytes: &mut [u8]) { |
| 206 | assert!(bytes.len() == self.serialized_size()); |
| 207 | for component: &StringComponent<'_> in self.iter() { |
| 208 | bytes = component.serialize(bytes); |
| 209 | } |
| 210 | |
| 211 | // Assert that we used the exact number of bytes we anticipated. |
| 212 | assert!(bytes.len() == 1); |
| 213 | bytes[0] = TERMINATOR; |
| 214 | } |
| 215 | } |
| 216 | |
| 217 | macro_rules! impl_serializable_string_for_fixed_size { |
| 218 | ($n:expr) => { |
| 219 | impl<'a> SerializableString for [StringComponent<'a>; $n] { |
| 220 | #[inline(always)] |
| 221 | fn serialized_size(&self) -> usize { |
| 222 | (&self[..]).serialized_size() |
| 223 | } |
| 224 | |
| 225 | #[inline(always)] |
| 226 | fn serialize(&self, bytes: &mut [u8]) { |
| 227 | (&self[..]).serialize(bytes); |
| 228 | } |
| 229 | } |
| 230 | }; |
| 231 | } |
| 232 | |
| 233 | impl_serializable_string_for_fixed_size!(0); |
| 234 | impl_serializable_string_for_fixed_size!(1); |
| 235 | impl_serializable_string_for_fixed_size!(2); |
| 236 | impl_serializable_string_for_fixed_size!(3); |
| 237 | impl_serializable_string_for_fixed_size!(4); |
| 238 | impl_serializable_string_for_fixed_size!(5); |
| 239 | impl_serializable_string_for_fixed_size!(6); |
| 240 | impl_serializable_string_for_fixed_size!(7); |
| 241 | impl_serializable_string_for_fixed_size!(8); |
| 242 | impl_serializable_string_for_fixed_size!(9); |
| 243 | impl_serializable_string_for_fixed_size!(10); |
| 244 | impl_serializable_string_for_fixed_size!(11); |
| 245 | impl_serializable_string_for_fixed_size!(12); |
| 246 | impl_serializable_string_for_fixed_size!(13); |
| 247 | impl_serializable_string_for_fixed_size!(14); |
| 248 | impl_serializable_string_for_fixed_size!(15); |
| 249 | impl_serializable_string_for_fixed_size!(16); |
| 250 | |
| 251 | fn serialize_index_entry(sink: &SerializationSink, id: StringId, addr: Addr) { |
| 252 | sink.write_atomic(num_bytes:16, |bytes: &mut [u8]| { |
| 253 | bytes[0..8].copy_from_slice(&id.0.to_le_bytes()); |
| 254 | bytes[8..16].copy_from_slice(&addr.0.to_le_bytes()); |
| 255 | }); |
| 256 | } |
| 257 | |
| 258 | impl StringTableBuilder { |
| 259 | pub fn new( |
| 260 | data_sink: Arc<SerializationSink>, |
| 261 | index_sink: Arc<SerializationSink>, |
| 262 | ) -> Result<StringTableBuilder, Box<dyn Error + Send + Sync>> { |
| 263 | // The first thing in every stream we generate must be the stream header. |
| 264 | write_file_header(&mut data_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_DATA)?; |
| 265 | write_file_header(&mut index_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_INDEX)?; |
| 266 | |
| 267 | Ok(StringTableBuilder { |
| 268 | data_sink, |
| 269 | index_sink, |
| 270 | }) |
| 271 | } |
| 272 | |
| 273 | /// Creates a mapping so that `virtual_id` will resolve to the contents of |
| 274 | /// `concrete_id` when reading the string table. |
| 275 | pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) { |
| 276 | // This assertion does not use `is_virtual` on purpose because that |
| 277 | // would also allow to overwrite `METADATA_STRING_ID`. |
| 278 | assert!(virtual_id.0 <= MAX_USER_VIRTUAL_STRING_ID); |
| 279 | serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr()); |
| 280 | } |
| 281 | |
| 282 | pub fn bulk_map_virtual_to_single_concrete_string<I>( |
| 283 | &self, |
| 284 | virtual_ids: I, |
| 285 | concrete_id: StringId, |
| 286 | ) where |
| 287 | I: Iterator<Item = StringId> + ExactSizeIterator, |
| 288 | { |
| 289 | // TODO: Index data encoding could have a special bulk mode that assigns |
| 290 | // multiple StringIds to the same addr, so we don't have to repeat |
| 291 | // the `concrete_id` over and over. |
| 292 | |
| 293 | type MappingEntry = [u64; 2]; |
| 294 | assert!(std::mem::size_of::<MappingEntry>() == 16); |
| 295 | |
| 296 | let to_addr_le = concrete_id.to_addr().0.to_le(); |
| 297 | |
| 298 | let serialized: Vec<MappingEntry> = virtual_ids |
| 299 | .map(|from| { |
| 300 | let id = from.0; |
| 301 | assert!(id <= MAX_USER_VIRTUAL_STRING_ID); |
| 302 | [id.to_le(), to_addr_le] |
| 303 | }) |
| 304 | .collect(); |
| 305 | |
| 306 | let num_bytes = serialized.len() * std::mem::size_of::<MappingEntry>(); |
| 307 | let byte_ptr = serialized.as_ptr() as *const u8; |
| 308 | |
| 309 | let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, num_bytes) }; |
| 310 | |
| 311 | self.index_sink.write_bytes_atomic(bytes); |
| 312 | } |
| 313 | |
| 314 | pub fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) { |
| 315 | let concrete_id = self.alloc(s); |
| 316 | let virtual_id = StringId(METADATA_STRING_ID); |
| 317 | assert!(virtual_id.is_virtual()); |
| 318 | serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr()); |
| 319 | } |
| 320 | |
| 321 | pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId { |
| 322 | let size_in_bytes = s.serialized_size(); |
| 323 | let addr = self.data_sink.write_atomic(size_in_bytes, |mem| { |
| 324 | s.serialize(mem); |
| 325 | }); |
| 326 | |
| 327 | StringId::from_addr(addr) |
| 328 | } |
| 329 | } |
| 330 | |