1//! A string table implementation with a tree-like encoding.
2//!
3//! Each entry in the table represents a string and is encoded as a list of
4//! components where each component can either be
5//!
6//! 1. a string _value_ that contains actual UTF-8 string content,
7//! 2. a string _ID_ that contains a reference to another entry, or
8//! 3. a terminator tag which marks the end of a component list.
9//!
10//! The string _content_ of an entry is defined as the concatenation of the
11//! content of its components. The content of a string value is its actual
12//! UTF-8 bytes. The content of a string ID is the contents of the entry
13//! it references.
14//!
15//! The byte-level encoding of component lists uses the structure of UTF-8 in
16//! order to save space:
17//!
18//! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
19//! of this fact by letting all string ID components start with this `0xFE`
20//! prefix. Thus when we parse the contents of a value we know to stop if
21//! we encounter this byte.
22//!
23//! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
24//! use `0xFF` as our component list terminator.
25//!
26//! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
27//! encoded as:
28//!
29//! ```ignore
30//! ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
31//! ^^^^^^^^^^^^^^^^ ^^^
32//! string ID with 0xFE prefix terminator (0xFF)
33//! ```
34//!
35//! As you can see string IDs are encoded in little endian format.
36//!
37//! ----------------------------------------------------------------------------
38//!
39//! Each string in the table is referred to via a `StringId`. `StringId`s may
40//! be generated in two ways:
41//!
42//! 1. Calling `StringTableBuilder::alloc()` which returns the `StringId` for
43//! the allocated string.
44//! 2. Calling `StringId::new_virtual()` to create a "virtual" `StringId` that
45//! later can be mapped to an actual string via
46//! `StringTableBuilder::map_virtual_to_concrete_string()`.
47//!
48//! String IDs allow you to deduplicate strings by allocating a string
49//! once and then referring to it by id over and over. This is a useful trick
50//! for strings which are recorded many times and it can significantly reduce
51//! the size of profile trace files.
52//!
53//! `StringId`s are partitioned according to type:
54//!
55//! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
56//!
57//! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
58//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
59//! which is used internally by `measureme` to record additional metadata about
60//! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
61//! values.
62
63use crate::file_header::{
64 write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
65};
66use crate::serialization::Addr;
67use crate::serialization::SerializationSink;
68use std::{error::Error, sync::Arc};
69
70/// A `StringId` is used to identify a string in the `StringTable`. It is
71/// either a regular `StringId`, meaning that it contains the absolute address
72/// of a string within the string table data. Or it is "virtual", which means
73/// that the address it points to is resolved via the string table index data,
74/// that maps virtual `StringId`s to addresses.
75#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)]
76#[repr(C)]
77pub struct StringId(u64);
78
79impl StringId {
80 pub const INVALID: StringId = StringId(INVALID_STRING_ID);
81
82 #[inline]
83 pub fn new(id: impl Into<u64>) -> StringId {
84 StringId(id.into())
85 }
86
87 #[inline]
88 pub fn new_virtual(id: impl Into<u64>) -> StringId {
89 let id = id.into();
90 assert!(id <= MAX_USER_VIRTUAL_STRING_ID);
91 StringId(id)
92 }
93
94 #[inline]
95 pub fn is_virtual(self) -> bool {
96 self.0 <= METADATA_STRING_ID
97 }
98
99 #[inline]
100 pub fn as_u64(self) -> u64 {
101 self.0
102 }
103
104 #[inline]
105 pub fn from_addr(addr: Addr) -> StringId {
106 let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
107 StringId::new(id)
108 }
109
110 #[inline]
111 pub fn to_addr(self) -> Addr {
112 Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
113 }
114}
115
116// See module-level documentation for more information on the encoding.
117pub const TERMINATOR: u8 = 0xFF;
118pub const STRING_REF_TAG: u8 = 0xFE;
119pub const STRING_REF_ENCODED_SIZE: usize = 9;
120
121/// The maximum id value a virtual string may be.
122const MAX_USER_VIRTUAL_STRING_ID: u64 = 100_000_000;
123
124/// The id of the profile metadata string entry.
125pub const METADATA_STRING_ID: u64 = MAX_USER_VIRTUAL_STRING_ID + 1;
126
127/// Some random string ID that we make sure cannot be generated or assigned to.
128const INVALID_STRING_ID: u64 = METADATA_STRING_ID + 1;
129
130pub const FIRST_REGULAR_STRING_ID: u64 = INVALID_STRING_ID + 1;
131
132/// Write-only version of the string table
133pub struct StringTableBuilder {
134 data_sink: Arc<SerializationSink>,
135 index_sink: Arc<SerializationSink>,
136}
137
138/// Anything that implements `SerializableString` can be written to a
139/// `StringTable`.
140pub trait SerializableString {
141 fn serialized_size(&self) -> usize;
142 fn serialize(&self, bytes: &mut [u8]);
143}
144
145// A single string is encoded as `[UTF-8 bytes][TERMINATOR]`
146impl SerializableString for str {
147 #[inline]
148 fn serialized_size(&self) -> usize {
149 self.len() + // actual bytes
150 1 // terminator
151 }
152
153 #[inline]
154 fn serialize(&self, bytes: &mut [u8]) {
155 let last_byte_index: usize = bytes.len() - 1;
156 bytes[0..last_byte_index].copy_from_slice(self.as_bytes());
157 bytes[last_byte_index] = TERMINATOR;
158 }
159}
160
161/// A single component of a string. Used for building composite table entries.
162pub enum StringComponent<'s> {
163 Value(&'s str),
164 Ref(StringId),
165}
166
167impl<'s> StringComponent<'s> {
168 #[inline]
169 fn serialized_size(&self) -> usize {
170 match *self {
171 StringComponent::Value(s) => s.len(),
172 StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
173 }
174 }
175
176 #[inline]
177 fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] {
178 match *self {
179 StringComponent::Value(s) => {
180 bytes[..s.len()].copy_from_slice(s.as_bytes());
181 &mut bytes[s.len()..]
182 }
183 StringComponent::Ref(string_id) => {
184 // The code below assumes we use a 9-byte encoding for string
185 // refs, where the first byte is STRING_REF_TAG and the
186 // following 8 bytes are a little-endian u64 string ID value.
187 assert!(STRING_REF_ENCODED_SIZE == 9);
188
189 bytes[0] = STRING_REF_TAG;
190 bytes[1..9].copy_from_slice(&string_id.0.to_le_bytes());
191 &mut bytes[9..]
192 }
193 }
194 }
195}
196
197impl<'a> SerializableString for [StringComponent<'a>] {
198 #[inline]
199 fn serialized_size(&self) -> usize {
200 self.iter().map(|c: &StringComponent<'_>| c.serialized_size()).sum::<usize>() + // size of components
201 1 // terminator
202 }
203
204 #[inline]
205 fn serialize(&self, mut bytes: &mut [u8]) {
206 assert!(bytes.len() == self.serialized_size());
207 for component: &StringComponent<'_> in self.iter() {
208 bytes = component.serialize(bytes);
209 }
210
211 // Assert that we used the exact number of bytes we anticipated.
212 assert!(bytes.len() == 1);
213 bytes[0] = TERMINATOR;
214 }
215}
216
217macro_rules! impl_serializable_string_for_fixed_size {
218 ($n:expr) => {
219 impl<'a> SerializableString for [StringComponent<'a>; $n] {
220 #[inline(always)]
221 fn serialized_size(&self) -> usize {
222 (&self[..]).serialized_size()
223 }
224
225 #[inline(always)]
226 fn serialize(&self, bytes: &mut [u8]) {
227 (&self[..]).serialize(bytes);
228 }
229 }
230 };
231}
232
233impl_serializable_string_for_fixed_size!(0);
234impl_serializable_string_for_fixed_size!(1);
235impl_serializable_string_for_fixed_size!(2);
236impl_serializable_string_for_fixed_size!(3);
237impl_serializable_string_for_fixed_size!(4);
238impl_serializable_string_for_fixed_size!(5);
239impl_serializable_string_for_fixed_size!(6);
240impl_serializable_string_for_fixed_size!(7);
241impl_serializable_string_for_fixed_size!(8);
242impl_serializable_string_for_fixed_size!(9);
243impl_serializable_string_for_fixed_size!(10);
244impl_serializable_string_for_fixed_size!(11);
245impl_serializable_string_for_fixed_size!(12);
246impl_serializable_string_for_fixed_size!(13);
247impl_serializable_string_for_fixed_size!(14);
248impl_serializable_string_for_fixed_size!(15);
249impl_serializable_string_for_fixed_size!(16);
250
251fn serialize_index_entry(sink: &SerializationSink, id: StringId, addr: Addr) {
252 sink.write_atomic(num_bytes:16, |bytes: &mut [u8]| {
253 bytes[0..8].copy_from_slice(&id.0.to_le_bytes());
254 bytes[8..16].copy_from_slice(&addr.0.to_le_bytes());
255 });
256}
257
258impl StringTableBuilder {
259 pub fn new(
260 data_sink: Arc<SerializationSink>,
261 index_sink: Arc<SerializationSink>,
262 ) -> Result<StringTableBuilder, Box<dyn Error + Send + Sync>> {
263 // The first thing in every stream we generate must be the stream header.
264 write_file_header(&mut data_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_DATA)?;
265 write_file_header(&mut index_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_INDEX)?;
266
267 Ok(StringTableBuilder {
268 data_sink,
269 index_sink,
270 })
271 }
272
273 /// Creates a mapping so that `virtual_id` will resolve to the contents of
274 /// `concrete_id` when reading the string table.
275 pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
276 // This assertion does not use `is_virtual` on purpose because that
277 // would also allow to overwrite `METADATA_STRING_ID`.
278 assert!(virtual_id.0 <= MAX_USER_VIRTUAL_STRING_ID);
279 serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr());
280 }
281
282 pub fn bulk_map_virtual_to_single_concrete_string<I>(
283 &self,
284 virtual_ids: I,
285 concrete_id: StringId,
286 ) where
287 I: Iterator<Item = StringId> + ExactSizeIterator,
288 {
289 // TODO: Index data encoding could have a special bulk mode that assigns
290 // multiple StringIds to the same addr, so we don't have to repeat
291 // the `concrete_id` over and over.
292
293 type MappingEntry = [u64; 2];
294 assert!(std::mem::size_of::<MappingEntry>() == 16);
295
296 let to_addr_le = concrete_id.to_addr().0.to_le();
297
298 let serialized: Vec<MappingEntry> = virtual_ids
299 .map(|from| {
300 let id = from.0;
301 assert!(id <= MAX_USER_VIRTUAL_STRING_ID);
302 [id.to_le(), to_addr_le]
303 })
304 .collect();
305
306 let num_bytes = serialized.len() * std::mem::size_of::<MappingEntry>();
307 let byte_ptr = serialized.as_ptr() as *const u8;
308
309 let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, num_bytes) };
310
311 self.index_sink.write_bytes_atomic(bytes);
312 }
313
314 pub fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) {
315 let concrete_id = self.alloc(s);
316 let virtual_id = StringId(METADATA_STRING_ID);
317 assert!(virtual_id.is_virtual());
318 serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr());
319 }
320
321 pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId {
322 let size_in_bytes = s.serialized_size();
323 let addr = self.data_sink.write_atomic(size_in_bytes, |mem| {
324 s.serialize(mem);
325 });
326
327 StringId::from_addr(addr)
328 }
329}
330