1 | //! Definition for encoding of custom sections within core wasm modules of |
2 | //! component-model related data. |
3 | //! |
4 | //! When creating a component from a source language the high-level process for |
5 | //! doing this is that code will be generated into the source language by |
6 | //! `wit-bindgen` or a similar tool which will be compiled down to core wasm. |
7 | //! The core wasm file is then fed into `wit-component` and a component is |
8 | //! created. This means that the componentization process is decoupled from the |
9 | //! binding generation process and intentionally affords for linking together |
10 | //! libraries into the main core wasm module that import different interfaces. |
11 | //! |
12 | //! The purpose of this module is to define an intermediate format to reside in |
13 | //! a custom section in the core wasm output. This intermediate format is |
14 | //! carried through the wasm linker through a custom section whose name starts |
15 | //! with `component-type`. This custom section is created |
16 | //! per-language-binding-generation and consumed by slurping up all the |
17 | //! sections during the component creation process. |
18 | //! |
19 | //! Currently the encoding of this custom section is itself a component. The |
20 | //! component has a single export which is a component type which represents the |
21 | //! `world` that was bound during bindings generation. This single export is |
22 | //! used to decode back into a `Resolve` with a WIT representation. |
23 | //! |
24 | //! Currently the component additionally has a custom section named |
25 | //! `wit-component-encoding` (see `CUSTOM_SECTION_NAME`). This section is |
26 | //! currently defined as 2 bytes: |
27 | //! |
28 | //! * The first byte is `CURRENT_VERSION` to help protect against future and |
29 | //! past changes. |
30 | //! * The second byte indicates the string encoding used for imports/exports as |
31 | //! part of the bindings process. The mapping is defined by |
32 | //! `encode_string_encoding`. |
33 | //! |
34 | //! This means that the top-level `encode` function takes a `Resolve`, a |
35 | //! `WorldId`, and a `StringEncoding`. Note that the top-level `decode` function |
//! is slightly different because it's taking all custom sections in a core
37 | //! wasm binary, possibly from multiple invocations of bindgen, and unioning |
38 | //! them all together. This means that the output is a `Bindgen` which |
39 | //! represents the union of all previous bindings. |
40 | //! |
//! The dual of `encode` is the `decode_custom_section` function which decodes
42 | //! the three arguments originally passed to `encode`. |
43 | |
44 | use crate::{DecodedWasm, StringEncoding}; |
45 | use anyhow::{bail, Context, Result}; |
46 | use indexmap::{IndexMap, IndexSet}; |
47 | use std::borrow::Cow; |
48 | use wasm_encoder::{ |
49 | ComponentBuilder, ComponentExportKind, ComponentType, ComponentTypeRef, CustomSection, |
50 | }; |
51 | use wasm_metadata::Producers; |
52 | use wasmparser::{BinaryReader, Encoding, Parser, Payload}; |
53 | use wit_parser::{Package, PackageName, Resolve, World, WorldId, WorldItem, WorldKey}; |
54 | |
/// Format version written by `encode` as the first byte of the
/// `CUSTOM_SECTION_NAME` custom section and required by the free
/// `decode_custom_section` function when reading it back.
const CURRENT_VERSION: u8 = 0x04;
/// Name of the custom section, embedded in the encoded component, that holds
/// the version byte followed by the string-encoding byte.
const CUSTOM_SECTION_NAME: &str = "wit-component-encoding";
57 | |
/// Binding information recovered from a core WebAssembly binary.
///
/// This structure is returned by [`decode`] and represents the
/// component-level interface of the binary that was decoded.
pub struct Bindgen {
    /// Interface and type information for this binary.
    pub resolve: Resolve,
    /// Identifier, within `resolve`, of the world that was bound.
    pub world: WorldId,
    /// Metadata about this specific module that was bound (the string
    /// encodings of its imports and exports).
    pub metadata: ModuleMetadata,
    /// Producer information about the tools used to produce this specific
    /// module, when any was recorded.
    pub producers: Option<Producers>,
}
72 | |
73 | impl Default for Bindgen { |
74 | fn default() -> Bindgen { |
75 | let mut resolve = Resolve::default(); |
76 | let package = resolve.packages.alloc(Package { |
77 | name: PackageName { |
78 | namespace: "root" .to_string(), |
79 | name: "root" .to_string(), |
80 | version: None, |
81 | }, |
82 | docs: Default::default(), |
83 | interfaces: Default::default(), |
84 | worlds: Default::default(), |
85 | }); |
86 | let world = resolve.worlds.alloc(World { |
87 | name: "root" .to_string(), |
88 | docs: Default::default(), |
89 | imports: Default::default(), |
90 | exports: Default::default(), |
91 | includes: Default::default(), |
92 | include_names: Default::default(), |
93 | package: Some(package), |
94 | stability: Default::default(), |
95 | }); |
96 | resolve.packages[package] |
97 | .worlds |
98 | .insert("root" .to_string(), world); |
99 | Bindgen { |
100 | resolve, |
101 | world, |
102 | metadata: ModuleMetadata::default(), |
103 | producers: None, |
104 | } |
105 | } |
106 | } |
107 | |
/// Module-level metadata that's specific to one core WebAssembly module. This
/// is extracted alongside a [`Bindgen`] and stored in its `metadata` field.
#[derive(Default)]
pub struct ModuleMetadata {
    /// Per-function options for functions imported into the core wasm
    /// module, currently only related to string encoding.
    pub import_encodings: EncodingMap,

    /// Per-function options for functions exported from the core wasm
    /// module, currently only related to string encoding.
    pub export_encodings: EncodingMap,
}
120 | |
/// Internal map that keeps track of encodings for various world imports and
/// exports.
///
/// Stored in [`ModuleMetadata`].
#[derive(Default)]
pub struct EncodingMap {
    /// A map of an "identifying string" for world items to what string
    /// encoding the import or export is using.
    ///
    /// The keys of this map are created by `EncodingMap::key` and are
    /// specifically chosen to be able to be looked up during both insertion and
    /// fetching. Note that in particular this map does not use `*Id` types such
    /// as `InterfaceId` from `wit_parser`. This is due to the fact that during
    /// world merging new interfaces are created for named imports (e.g. `import
    /// x: interface { ... }`) as inline interfaces are copied from one world to
    /// another. Additionally during world merging different interfaces at the
    /// same version may be deduplicated.
    ///
    /// For these reasons a string-based key is chosen to avoid juggling IDs
    /// through the world merging process. Additionally versions are chopped off
    /// for now to help with a problem such as:
    ///
    /// * The main module imports a:b/c@0.1.0
    /// * An adapter imports a:b/c@0.1.1
    /// * The final world uses a:b/c@0.1.1, but the main module has no
    ///   encoding listed for that exact item.
    ///
    /// By chopping off versions this is able to get everything registered
    /// correctly even in the face of merging interfaces and worlds.
    encodings: IndexMap<String, StringEncoding>,
}
152 | |
153 | impl EncodingMap { |
154 | fn insert_all( |
155 | &mut self, |
156 | resolve: &Resolve, |
157 | set: &IndexMap<WorldKey, WorldItem>, |
158 | encoding: StringEncoding, |
159 | ) { |
160 | for (name, item) in set { |
161 | match item { |
162 | WorldItem::Function(func) => { |
163 | let key = self.key(resolve, name, &func.name); |
164 | self.encodings.insert(key, encoding); |
165 | } |
166 | WorldItem::Interface { id, .. } => { |
167 | for (func, _) in resolve.interfaces[*id].functions.iter() { |
168 | let key = self.key(resolve, name, func); |
169 | self.encodings.insert(key, encoding); |
170 | } |
171 | } |
172 | WorldItem::Type(_) => {} |
173 | } |
174 | } |
175 | } |
176 | |
177 | /// Looks up the encoding of the function `func` which is scoped under `key` |
178 | /// in the world in question. |
179 | pub fn get(&self, resolve: &Resolve, key: &WorldKey, func: &str) -> Option<StringEncoding> { |
180 | let key = self.key(resolve, key, func); |
181 | self.encodings.get(&key).copied() |
182 | } |
183 | |
184 | fn key(&self, resolve: &Resolve, key: &WorldKey, func: &str) -> String { |
185 | format!( |
186 | " {}/ {func}" , |
187 | match key { |
188 | WorldKey::Name(name) => name.to_string(), |
189 | WorldKey::Interface(id) => { |
190 | let iface = &resolve.interfaces[*id]; |
191 | let pkg = &resolve.packages[iface.package.unwrap()]; |
192 | format!( |
193 | " {}: {}/ {}" , |
194 | pkg.name.namespace, |
195 | pkg.name.name, |
196 | iface.name.as_ref().unwrap() |
197 | ) |
198 | } |
199 | } |
200 | ) |
201 | } |
202 | |
203 | fn merge(&mut self, other: EncodingMap) -> Result<()> { |
204 | for (key, encoding) in other.encodings { |
205 | if let Some(prev) = self.encodings.insert(key.clone(), encoding) { |
206 | if prev != encoding { |
207 | bail!("conflicting string encodings specified for ` {key}`" ); |
208 | } |
209 | } |
210 | } |
211 | Ok(()) |
212 | } |
213 | } |
214 | |
215 | /// This function will parse the core `wasm` binary given as input and return a |
216 | /// [`Bindgen`] which extracts the custom sections describing component-level |
217 | /// types from within the binary itself. |
218 | /// |
219 | /// This is used to parse the output of `wit-bindgen`-generated modules and is |
220 | /// one of the earliest phases in transitioning such a module to a component. |
221 | /// The extraction here provides the metadata necessary to continue the process |
222 | /// later on. |
223 | /// |
224 | /// This will return an error if `wasm` is not a valid WebAssembly module. |
225 | /// |
226 | /// If a `component-type` custom section was found then a new binary is |
227 | /// optionally returned with the custom sections stripped out. If no |
228 | /// `component-type` custom sections are found then `None` is returned. |
229 | pub fn decode(wasm: &[u8]) -> Result<(Option<Vec<u8>>, Bindgen)> { |
230 | let mut ret = Bindgen::default(); |
231 | let mut new_module = wasm_encoder::Module::new(); |
232 | |
233 | let mut found_custom = false; |
234 | for payload in wasmparser::Parser::new(0).parse_all(wasm) { |
235 | let payload = payload.context("decoding item in module" )?; |
236 | match payload { |
237 | wasmparser::Payload::CustomSection(cs) if cs.name().starts_with("component-type" ) => { |
238 | let data = Bindgen::decode_custom_section(cs.data()) |
239 | .with_context(|| format!("decoding custom section {}" , cs.name()))?; |
240 | ret.merge(data) |
241 | .with_context(|| format!("updating metadata for section {}" , cs.name()))?; |
242 | found_custom = true; |
243 | } |
244 | wasmparser::Payload::Version { encoding, .. } if encoding != Encoding::Module => { |
245 | bail!("decoding a component is not supported" ) |
246 | } |
247 | _ => { |
248 | if let Some((id, range)) = payload.as_section() { |
249 | new_module.section(&wasm_encoder::RawSection { |
250 | id, |
251 | data: &wasm[range], |
252 | }); |
253 | } |
254 | } |
255 | } |
256 | } |
257 | |
258 | if found_custom { |
259 | Ok((Some(new_module.finish()), ret)) |
260 | } else { |
261 | Ok((None, ret)) |
262 | } |
263 | } |
264 | |
265 | /// Creates a `component-type*` custom section to be decoded by `decode` above. |
266 | /// |
267 | /// This is primarily created by wit-bindgen-based guest generators to embed |
268 | /// into the final core wasm binary. The core wasm binary is later fed |
269 | /// through `wit-component` to produce the actual component where this returned |
270 | /// section will be decoded. |
271 | pub fn encode( |
272 | resolve: &Resolve, |
273 | world: WorldId, |
274 | string_encoding: StringEncoding, |
275 | extra_producers: Option<&Producers>, |
276 | ) -> Result<Vec<u8>> { |
277 | let ty = crate::encoding::encode_world(resolve, world)?; |
278 | |
279 | let world = &resolve.worlds[world]; |
280 | let mut outer_ty = ComponentType::new(); |
281 | outer_ty.ty().component(&ty); |
282 | outer_ty.export( |
283 | &resolve.id_of_name(world.package.unwrap(), &world.name), |
284 | ComponentTypeRef::Component(0), |
285 | ); |
286 | |
287 | let mut builder = ComponentBuilder::default(); |
288 | |
289 | let string_encoding = encode_string_encoding(string_encoding); |
290 | builder.custom_section(&CustomSection { |
291 | name: CUSTOM_SECTION_NAME.into(), |
292 | data: Cow::Borrowed(&[CURRENT_VERSION, string_encoding]), |
293 | }); |
294 | |
295 | let ty = builder.type_component(&outer_ty); |
296 | builder.export(&world.name, ComponentExportKind::Type, ty, None); |
297 | |
298 | let mut producers = crate::base_producers(); |
299 | if let Some(p) = extra_producers { |
300 | producers.merge(&p); |
301 | } |
302 | builder.raw_custom_section(&producers.raw_custom_section()); |
303 | Ok(builder.finish()) |
304 | } |
305 | |
306 | fn decode_custom_section(wasm: &[u8]) -> Result<(Resolve, WorldId, StringEncoding)> { |
307 | let (resolve: Resolve, world: Id) = wit_parser::decoding::decode_world(wasm)?; |
308 | let mut custom_section: Option<&[u8]> = None; |
309 | |
310 | for payload: Result, BinaryReaderError> in Parser::new(0).parse_all(data:wasm) { |
311 | match payload? { |
312 | Payload::CustomSection(s: CustomSectionReader<'_>) if s.name() == CUSTOM_SECTION_NAME => { |
313 | custom_section = Some(s.data()); |
314 | } |
315 | _ => {} |
316 | } |
317 | } |
318 | let string_encoding: StringEncoding = match custom_section { |
319 | None => bail!("missing custom section of name ` {CUSTOM_SECTION_NAME}`" ), |
320 | Some([CURRENT_VERSION, byte: &u8]) => decode_string_encoding(*byte)?, |
321 | Some([]) => bail!("custom section ` {CUSTOM_SECTION_NAME}` in unknown format" ), |
322 | Some([version: &u8, ..]) => bail!( |
323 | "custom section ` {CUSTOM_SECTION_NAME}` uses format {version} but only {CURRENT_VERSION} is supported" |
324 | ), |
325 | }; |
326 | Ok((resolve, world, string_encoding)) |
327 | } |
328 | |
329 | fn encode_string_encoding(e: StringEncoding) -> u8 { |
330 | match e { |
331 | StringEncoding::UTF8 => 0x00, |
332 | StringEncoding::UTF16 => 0x01, |
333 | StringEncoding::CompactUTF16 => 0x02, |
334 | } |
335 | } |
336 | |
337 | fn decode_string_encoding(byte: u8) -> Result<StringEncoding> { |
338 | match byte { |
339 | 0x00 => Ok(StringEncoding::UTF8), |
340 | 0x01 => Ok(StringEncoding::UTF16), |
341 | 0x02 => Ok(StringEncoding::CompactUTF16), |
342 | byte: u8 => bail!("invalid string encoding {byte:#x}" ), |
343 | } |
344 | } |
345 | |
346 | impl Bindgen { |
347 | fn decode_custom_section(data: &[u8]) -> Result<Bindgen> { |
348 | let wasm; |
349 | let world; |
350 | let resolve; |
351 | let encoding; |
352 | |
353 | let mut reader = BinaryReader::new(data, 0); |
354 | match reader.read_u8()? { |
355 | // Historical 0x03 format where the support here will be deleted in |
356 | // the future |
357 | 0x03 => { |
358 | encoding = decode_string_encoding(reader.read_u8()?)?; |
359 | let world_name = reader.read_string()?; |
360 | wasm = &data[reader.original_position()..]; |
361 | |
362 | let (r, pkg) = match crate::decode(wasm)? { |
363 | DecodedWasm::WitPackage(resolve, pkgs) => (resolve, pkgs), |
364 | DecodedWasm::Component(..) => bail!("expected encoded wit package(s)" ), |
365 | }; |
366 | resolve = r; |
367 | world = resolve.select_world(pkg, Some(world_name.into()))?; |
368 | } |
369 | |
370 | // Current format where `data` is a wasm component itself. |
371 | _ => { |
372 | wasm = data; |
373 | (resolve, world, encoding) = decode_custom_section(wasm)?; |
374 | } |
375 | } |
376 | |
377 | Ok(Bindgen { |
378 | metadata: ModuleMetadata::new(&resolve, world, encoding), |
379 | producers: wasm_metadata::Producers::from_wasm(wasm)?, |
380 | resolve, |
381 | world, |
382 | }) |
383 | } |
384 | |
385 | /// Merges another `BindgenMetadata` into this one. |
386 | /// |
387 | /// This operation is intended to be akin to "merging worlds" when the |
388 | /// abstraction level for that is what we're working at here. For now the |
389 | /// merge operation only succeeds if the two metadata descriptions are |
390 | /// entirely disjoint. |
391 | /// |
392 | /// Note that at this time there's no support for changing string encodings |
393 | /// between metadata. |
394 | /// |
395 | /// This function returns the set of exports that the main world of |
396 | /// `other` added to the world in `self`. |
397 | pub fn merge(&mut self, other: Bindgen) -> Result<IndexSet<WorldKey>> { |
398 | let Bindgen { |
399 | resolve, |
400 | world, |
401 | metadata: |
402 | ModuleMetadata { |
403 | import_encodings, |
404 | export_encodings, |
405 | }, |
406 | producers, |
407 | } = other; |
408 | |
409 | let remap = self |
410 | .resolve |
411 | .merge(resolve) |
412 | .context("failed to merge WIT package sets together" )?; |
413 | let world = remap.map_world(world, None)?; |
414 | let exports = self.resolve.worlds[world].exports.keys().cloned().collect(); |
415 | self.resolve |
416 | .merge_worlds(world, self.world) |
417 | .context("failed to merge worlds from two documents" )?; |
418 | |
419 | self.metadata.import_encodings.merge(import_encodings)?; |
420 | self.metadata.export_encodings.merge(export_encodings)?; |
421 | if let Some(producers) = producers { |
422 | if let Some(mine) = &mut self.producers { |
423 | mine.merge(&producers); |
424 | } else { |
425 | self.producers = Some(producers); |
426 | } |
427 | } |
428 | |
429 | Ok(exports) |
430 | } |
431 | } |
432 | |
433 | impl ModuleMetadata { |
434 | /// Creates a new `ModuleMetadata` instance holding the given set of |
435 | /// interfaces which are expected to all use the `encoding` specified. |
436 | pub fn new(resolve: &Resolve, world: WorldId, encoding: StringEncoding) -> ModuleMetadata { |
437 | let mut ret: ModuleMetadata = ModuleMetadata::default(); |
438 | |
439 | let world: &World = &resolve.worlds[world]; |
440 | ret.export_encodings |
441 | .insert_all(resolve, &world.exports, encoding); |
442 | ret.import_encodings |
443 | .insert_all(resolve, &world.imports, encoding); |
444 | |
445 | ret |
446 | } |
447 | } |
448 | |