1 | //! A string table implementation with a tree-like encoding. |
2 | //! |
3 | //! Each entry in the table represents a string and is encoded as a list of |
4 | //! components where each component can either be |
5 | //! |
6 | //! 1. a string _value_ that contains actual UTF-8 string content, |
7 | //! 2. a string _ID_ that contains a reference to another entry, or |
8 | //! 3. a terminator tag which marks the end of a component list. |
9 | //! |
10 | //! The string _content_ of an entry is defined as the concatenation of the |
11 | //! content of its components. The content of a string value is its actual |
12 | //! UTF-8 bytes. The content of a string ID is the contents of the entry |
13 | //! it references. |
14 | //! |
15 | //! The byte-level encoding of component lists uses the structure of UTF-8 in |
16 | //! order to save space: |
17 | //! |
18 | //! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use |
19 | //! of this fact by letting all string ID components start with this `0xFE` |
20 | //! prefix. Thus when we parse the contents of a value we know to stop if |
21 | //! we encounter this byte. |
22 | //! |
23 | //! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely |
24 | //! use `0xFF` as our component list terminator. |
25 | //! |
26 | //! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be |
27 | //! encoded as: |
28 | //! |
29 | //! ```ignore |
30 | //! ['a' , 'b' , 'c' , 254, 42, 0, 0, 0, 'd' , 'e' , 'f' , 255] |
31 | //! ^^^^^^^^^^^^^^^^ ^^^ |
32 | //! string ID with 0xFE prefix terminator (0xFF) |
33 | //! ``` |
34 | //! |
35 | //! As you can see string IDs are encoded in little endian format. |
36 | //! |
37 | //! ---------------------------------------------------------------------------- |
38 | //! |
39 | //! Each string in the table is referred to via a `StringId`. `StringId`s may |
40 | //! be generated in two ways: |
41 | //! |
42 | //! 1. Calling `StringTableBuilder::alloc()` which returns the `StringId` for |
43 | //! the allocated string. |
44 | //! 2. Calling `StringId::new_virtual()` to create a "virtual" `StringId` that |
45 | //! later can be mapped to an actual string via |
46 | //! `StringTableBuilder::map_virtual_to_concrete_string()`. |
47 | //! |
48 | //! String IDs allow you to deduplicate strings by allocating a string |
49 | //! once and then referring to it by id over and over. This is a useful trick |
50 | //! for strings which are recorded many times and it can significantly reduce |
51 | //! the size of profile trace files. |
52 | //! |
53 | //! `StringId`s are partitioned according to type: |
54 | //! |
55 | //! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ] |
56 | //! |
57 | //! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings. |
58 | //! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) |
59 | //! which is used internally by `measureme` to record additional metadata about |
60 | //! the profiling session. After `METADATA_STRING_ID` are all other `StringId` |
61 | //! values. |
62 | |
63 | use crate::file_header::{ |
64 | write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX, |
65 | }; |
66 | use crate::serialization::Addr; |
67 | use crate::serialization::SerializationSink; |
68 | use std::{error::Error, sync::Arc}; |
69 | |
70 | /// A `StringId` is used to identify a string in the `StringTable`. It is |
71 | /// either a regular `StringId`, meaning that it contains the absolute address |
72 | /// of a string within the string table data. Or it is "virtual", which means |
73 | /// that the address it points to is resolved via the string table index data, |
74 | /// that maps virtual `StringId`s to addresses. |
75 | #[derive (Clone, Copy, Eq, PartialEq, Debug, Hash)] |
76 | #[repr (C)] |
77 | pub struct StringId(u64); |
78 | |
79 | impl StringId { |
80 | pub const INVALID: StringId = StringId(INVALID_STRING_ID); |
81 | |
82 | #[inline ] |
83 | pub fn new(id: impl Into<u64>) -> StringId { |
84 | StringId(id.into()) |
85 | } |
86 | |
87 | #[inline ] |
88 | pub fn new_virtual(id: impl Into<u64>) -> StringId { |
89 | let id = id.into(); |
90 | assert!(id <= MAX_USER_VIRTUAL_STRING_ID); |
91 | StringId(id) |
92 | } |
93 | |
94 | #[inline ] |
95 | pub fn is_virtual(self) -> bool { |
96 | self.0 <= METADATA_STRING_ID |
97 | } |
98 | |
99 | #[inline ] |
100 | pub fn as_u64(self) -> u64 { |
101 | self.0 |
102 | } |
103 | |
104 | #[inline ] |
105 | pub fn from_addr(addr: Addr) -> StringId { |
106 | let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap(); |
107 | StringId::new(id) |
108 | } |
109 | |
110 | #[inline ] |
111 | pub fn to_addr(self) -> Addr { |
112 | Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap()) |
113 | } |
114 | } |
115 | |
116 | // See module-level documentation for more information on the encoding. |
117 | pub const TERMINATOR: u8 = 0xFF; |
118 | pub const STRING_REF_TAG: u8 = 0xFE; |
119 | pub const STRING_REF_ENCODED_SIZE: usize = 9; |
120 | |
121 | /// The maximum id value a virtual string may be. |
122 | const MAX_USER_VIRTUAL_STRING_ID: u64 = 100_000_000; |
123 | |
124 | /// The id of the profile metadata string entry. |
125 | pub const METADATA_STRING_ID: u64 = MAX_USER_VIRTUAL_STRING_ID + 1; |
126 | |
127 | /// Some random string ID that we make sure cannot be generated or assigned to. |
128 | const INVALID_STRING_ID: u64 = METADATA_STRING_ID + 1; |
129 | |
130 | pub const FIRST_REGULAR_STRING_ID: u64 = INVALID_STRING_ID + 1; |
131 | |
132 | /// Write-only version of the string table |
133 | pub struct StringTableBuilder { |
134 | data_sink: Arc<SerializationSink>, |
135 | index_sink: Arc<SerializationSink>, |
136 | } |
137 | |
138 | /// Anything that implements `SerializableString` can be written to a |
139 | /// `StringTable`. |
140 | pub trait SerializableString { |
141 | fn serialized_size(&self) -> usize; |
142 | fn serialize(&self, bytes: &mut [u8]); |
143 | } |
144 | |
145 | // A single string is encoded as `[UTF-8 bytes][TERMINATOR]` |
146 | impl SerializableString for str { |
147 | #[inline ] |
148 | fn serialized_size(&self) -> usize { |
149 | self.len() + // actual bytes |
150 | 1 // terminator |
151 | } |
152 | |
153 | #[inline ] |
154 | fn serialize(&self, bytes: &mut [u8]) { |
155 | let last_byte_index: usize = bytes.len() - 1; |
156 | bytes[0..last_byte_index].copy_from_slice(self.as_bytes()); |
157 | bytes[last_byte_index] = TERMINATOR; |
158 | } |
159 | } |
160 | |
161 | /// A single component of a string. Used for building composite table entries. |
162 | pub enum StringComponent<'s> { |
163 | Value(&'s str), |
164 | Ref(StringId), |
165 | } |
166 | |
167 | impl<'s> StringComponent<'s> { |
168 | #[inline ] |
169 | fn serialized_size(&self) -> usize { |
170 | match *self { |
171 | StringComponent::Value(s) => s.len(), |
172 | StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE, |
173 | } |
174 | } |
175 | |
176 | #[inline ] |
177 | fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] { |
178 | match *self { |
179 | StringComponent::Value(s) => { |
180 | bytes[..s.len()].copy_from_slice(s.as_bytes()); |
181 | &mut bytes[s.len()..] |
182 | } |
183 | StringComponent::Ref(string_id) => { |
184 | // The code below assumes we use a 9-byte encoding for string |
185 | // refs, where the first byte is STRING_REF_TAG and the |
186 | // following 8 bytes are a little-endian u64 string ID value. |
187 | assert!(STRING_REF_ENCODED_SIZE == 9); |
188 | |
189 | bytes[0] = STRING_REF_TAG; |
190 | bytes[1..9].copy_from_slice(&string_id.0.to_le_bytes()); |
191 | &mut bytes[9..] |
192 | } |
193 | } |
194 | } |
195 | } |
196 | |
197 | impl<'a> SerializableString for [StringComponent<'a>] { |
198 | #[inline ] |
199 | fn serialized_size(&self) -> usize { |
200 | self.iter().map(|c: &StringComponent<'_>| c.serialized_size()).sum::<usize>() + // size of components |
201 | 1 // terminator |
202 | } |
203 | |
204 | #[inline ] |
205 | fn serialize(&self, mut bytes: &mut [u8]) { |
206 | assert!(bytes.len() == self.serialized_size()); |
207 | for component: &StringComponent<'_> in self.iter() { |
208 | bytes = component.serialize(bytes); |
209 | } |
210 | |
211 | // Assert that we used the exact number of bytes we anticipated. |
212 | assert!(bytes.len() == 1); |
213 | bytes[0] = TERMINATOR; |
214 | } |
215 | } |
216 | |
217 | macro_rules! impl_serializable_string_for_fixed_size { |
218 | ($n:expr) => { |
219 | impl<'a> SerializableString for [StringComponent<'a>; $n] { |
220 | #[inline(always)] |
221 | fn serialized_size(&self) -> usize { |
222 | (&self[..]).serialized_size() |
223 | } |
224 | |
225 | #[inline(always)] |
226 | fn serialize(&self, bytes: &mut [u8]) { |
227 | (&self[..]).serialize(bytes); |
228 | } |
229 | } |
230 | }; |
231 | } |
232 | |
233 | impl_serializable_string_for_fixed_size!(0); |
234 | impl_serializable_string_for_fixed_size!(1); |
235 | impl_serializable_string_for_fixed_size!(2); |
236 | impl_serializable_string_for_fixed_size!(3); |
237 | impl_serializable_string_for_fixed_size!(4); |
238 | impl_serializable_string_for_fixed_size!(5); |
239 | impl_serializable_string_for_fixed_size!(6); |
240 | impl_serializable_string_for_fixed_size!(7); |
241 | impl_serializable_string_for_fixed_size!(8); |
242 | impl_serializable_string_for_fixed_size!(9); |
243 | impl_serializable_string_for_fixed_size!(10); |
244 | impl_serializable_string_for_fixed_size!(11); |
245 | impl_serializable_string_for_fixed_size!(12); |
246 | impl_serializable_string_for_fixed_size!(13); |
247 | impl_serializable_string_for_fixed_size!(14); |
248 | impl_serializable_string_for_fixed_size!(15); |
249 | impl_serializable_string_for_fixed_size!(16); |
250 | |
251 | fn serialize_index_entry(sink: &SerializationSink, id: StringId, addr: Addr) { |
252 | sink.write_atomic(num_bytes:16, |bytes: &mut [u8]| { |
253 | bytes[0..8].copy_from_slice(&id.0.to_le_bytes()); |
254 | bytes[8..16].copy_from_slice(&addr.0.to_le_bytes()); |
255 | }); |
256 | } |
257 | |
258 | impl StringTableBuilder { |
259 | pub fn new( |
260 | data_sink: Arc<SerializationSink>, |
261 | index_sink: Arc<SerializationSink>, |
262 | ) -> Result<StringTableBuilder, Box<dyn Error + Send + Sync>> { |
263 | // The first thing in every stream we generate must be the stream header. |
264 | write_file_header(&mut data_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_DATA)?; |
265 | write_file_header(&mut index_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_INDEX)?; |
266 | |
267 | Ok(StringTableBuilder { |
268 | data_sink, |
269 | index_sink, |
270 | }) |
271 | } |
272 | |
273 | /// Creates a mapping so that `virtual_id` will resolve to the contents of |
274 | /// `concrete_id` when reading the string table. |
275 | pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) { |
276 | // This assertion does not use `is_virtual` on purpose because that |
277 | // would also allow to overwrite `METADATA_STRING_ID`. |
278 | assert!(virtual_id.0 <= MAX_USER_VIRTUAL_STRING_ID); |
279 | serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr()); |
280 | } |
281 | |
282 | pub fn bulk_map_virtual_to_single_concrete_string<I>( |
283 | &self, |
284 | virtual_ids: I, |
285 | concrete_id: StringId, |
286 | ) where |
287 | I: Iterator<Item = StringId> + ExactSizeIterator, |
288 | { |
289 | // TODO: Index data encoding could have a special bulk mode that assigns |
290 | // multiple StringIds to the same addr, so we don't have to repeat |
291 | // the `concrete_id` over and over. |
292 | |
293 | type MappingEntry = [u64; 2]; |
294 | assert!(std::mem::size_of::<MappingEntry>() == 16); |
295 | |
296 | let to_addr_le = concrete_id.to_addr().0.to_le(); |
297 | |
298 | let serialized: Vec<MappingEntry> = virtual_ids |
299 | .map(|from| { |
300 | let id = from.0; |
301 | assert!(id <= MAX_USER_VIRTUAL_STRING_ID); |
302 | [id.to_le(), to_addr_le] |
303 | }) |
304 | .collect(); |
305 | |
306 | let num_bytes = serialized.len() * std::mem::size_of::<MappingEntry>(); |
307 | let byte_ptr = serialized.as_ptr() as *const u8; |
308 | |
309 | let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, num_bytes) }; |
310 | |
311 | self.index_sink.write_bytes_atomic(bytes); |
312 | } |
313 | |
314 | pub fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) { |
315 | let concrete_id = self.alloc(s); |
316 | let virtual_id = StringId(METADATA_STRING_ID); |
317 | assert!(virtual_id.is_virtual()); |
318 | serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr()); |
319 | } |
320 | |
321 | pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId { |
322 | let size_in_bytes = s.serialized_size(); |
323 | let addr = self.data_sink.write_atomic(size_in_bytes, |mem| { |
324 | s.serialize(mem); |
325 | }); |
326 | |
327 | StringId::from_addr(addr) |
328 | } |
329 | } |
330 | |