1 | //! A lightweight Datalog engine in Rust |
2 | //! |
3 | //! The intended design is that one has static `Relation` types that are sets |
4 | //! of tuples, and `Variable` types that represent monotonically increasing |
5 | //! sets of tuples. |
6 | //! |
7 | //! The types are mostly wrappers around `Vec<Tuple>` indicating sorted-ness, |
8 | //! and the intent is that this code can be dropped in the middle of an otherwise |
9 | //! normal Rust program, run to completion, and then the results extracted as |
10 | //! vectors again. |
11 | |
12 | #![forbid (missing_docs)] |
13 | |
14 | use std::cell::RefCell; |
15 | use std::cmp::Ordering; |
16 | use std::iter::FromIterator; |
17 | use std::rc::Rc; |
18 | |
19 | mod join; |
20 | mod map; |
21 | mod test; |
22 | mod treefrog; |
23 | pub use crate::join::JoinInput; |
24 | pub use crate::treefrog::{ |
25 | extend_anti::ExtendAnti, |
26 | extend_with::ExtendWith, |
27 | filter_anti::FilterAnti, |
28 | filter_with::FilterWith, |
29 | filters::{PrefixFilter, ValueFilter}, |
30 | Leaper, Leapers, RelationLeaper, |
31 | }; |
32 | |
33 | /// A static, ordered list of key-value pairs. |
34 | /// |
35 | /// A relation represents a fixed set of key-value pairs. In many places in a |
36 | /// Datalog computation we want to be sure that certain relations are not able |
37 | /// to vary (for example, in antijoins). |
38 | #[derive (Clone)] |
39 | pub struct Relation<Tuple: Ord> { |
40 | /// Sorted list of distinct tuples. |
41 | pub elements: Vec<Tuple>, |
42 | } |
43 | |
44 | impl<Tuple: Ord> Relation<Tuple> { |
45 | /// Merges two relations into their union. |
46 | pub fn merge(self, other: Self) -> Self { |
47 | let Relation { |
48 | elements: mut elements1, |
49 | } = self; |
50 | let Relation { |
51 | elements: mut elements2, |
52 | } = other; |
53 | |
54 | // If one of the element lists is zero-length, we don't need to do any work |
55 | if elements1.is_empty() { |
56 | return Relation { |
57 | elements: elements2, |
58 | }; |
59 | } |
60 | |
61 | if elements2.is_empty() { |
62 | return Relation { |
63 | elements: elements1, |
64 | }; |
65 | } |
66 | |
67 | // Make sure that elements1 starts with the lower element |
68 | // Will not panic since both collections must have at least 1 element at this point |
69 | if elements1[0] > elements2[0] { |
70 | std::mem::swap(&mut elements1, &mut elements2); |
71 | } |
72 | |
73 | // Fast path for when all the new elements are after the exiting ones |
74 | if elements1[elements1.len() - 1] < elements2[0] { |
75 | elements1.extend(elements2.into_iter()); |
76 | // println!("fast path"); |
77 | return Relation { |
78 | elements: elements1, |
79 | }; |
80 | } |
81 | |
82 | let mut elements = Vec::with_capacity(elements1.len() + elements2.len()); |
83 | let mut elements1 = elements1.drain(..); |
84 | let mut elements2 = elements2.drain(..).peekable(); |
85 | |
86 | elements.push(elements1.next().unwrap()); |
87 | if elements.first() == elements2.peek() { |
88 | elements2.next(); |
89 | } |
90 | |
91 | for elem in elements1 { |
92 | while elements2.peek().map(|x| x.cmp(&elem)) == Some(Ordering::Less) { |
93 | elements.push(elements2.next().unwrap()); |
94 | } |
95 | if elements2.peek().map(|x| x.cmp(&elem)) == Some(Ordering::Equal) { |
96 | elements2.next(); |
97 | } |
98 | elements.push(elem); |
99 | } |
100 | |
101 | // Finish draining second list |
102 | elements.extend(elements2); |
103 | |
104 | Relation { elements } |
105 | } |
106 | |
107 | /// Creates a `Relation` from the elements of the `iterator`. |
108 | /// |
109 | /// Same as the `from_iter` method from `std::iter::FromIterator` trait. |
110 | pub fn from_iter<I>(iterator: I) -> Self |
111 | where |
112 | I: IntoIterator<Item = Tuple>, |
113 | { |
114 | iterator.into_iter().collect() |
115 | } |
116 | |
117 | /// Creates a `Relation` using the `leapjoin` logic; |
118 | /// see [`Variable::from_leapjoin`] |
119 | pub fn from_leapjoin<'leap, SourceTuple: Ord, Val: Ord + 'leap>( |
120 | source: &Relation<SourceTuple>, |
121 | leapers: impl Leapers<'leap, SourceTuple, Val>, |
122 | logic: impl FnMut(&SourceTuple, &Val) -> Tuple, |
123 | ) -> Self { |
124 | treefrog::leapjoin(&source.elements, leapers, logic) |
125 | } |
126 | |
127 | /// Creates a `Relation` by joining the values from `input1` and |
128 | /// `input2` and then applying `logic`. Like |
129 | /// [`Variable::from_join`] except for use where the inputs are |
130 | /// not varying across iterations. |
131 | pub fn from_join<Key: Ord, Val1: Ord, Val2: Ord>( |
132 | input1: &Relation<(Key, Val1)>, |
133 | input2: &Relation<(Key, Val2)>, |
134 | logic: impl FnMut(&Key, &Val1, &Val2) -> Tuple, |
135 | ) -> Self { |
136 | join::join_into_relation(input1, input2, logic) |
137 | } |
138 | |
139 | /// Creates a `Relation` by removing all values from `input1` that |
140 | /// share a key with `input2`, and then transforming the resulting |
141 | /// tuples with the `logic` closure. Like |
142 | /// [`Variable::from_antijoin`] except for use where the inputs |
143 | /// are not varying across iterations. |
144 | pub fn from_antijoin<Key: Ord, Val1: Ord>( |
145 | input1: &Relation<(Key, Val1)>, |
146 | input2: &Relation<Key>, |
147 | logic: impl FnMut(&Key, &Val1) -> Tuple, |
148 | ) -> Self { |
149 | join::antijoin(input1, input2, logic) |
150 | } |
151 | |
152 | /// Construct a new relation by mapping another one. Equivalent to |
153 | /// creating an iterator but perhaps more convenient. Analogous to |
154 | /// `Variable::from_map`. |
155 | pub fn from_map<T2: Ord>(input: &Relation<T2>, logic: impl FnMut(&T2) -> Tuple) -> Self { |
156 | input.iter().map(logic).collect() |
157 | } |
158 | |
159 | /// Creates a `Relation` from a vector of tuples. |
160 | pub fn from_vec(mut elements: Vec<Tuple>) -> Self { |
161 | elements.sort(); |
162 | elements.dedup(); |
163 | Relation { elements } |
164 | } |
165 | } |
166 | |
167 | impl<Tuple: Ord> From<Vec<Tuple>> for Relation<Tuple> { |
168 | fn from(iterator: Vec<Tuple>) -> Self { |
169 | Self::from_vec(elements:iterator) |
170 | } |
171 | } |
172 | |
173 | impl<Tuple: Ord> FromIterator<Tuple> for Relation<Tuple> { |
174 | fn from_iter<I>(iterator: I) -> Self |
175 | where |
176 | I: IntoIterator<Item = Tuple>, |
177 | { |
178 | Relation::from_vec(elements:iterator.into_iter().collect()) |
179 | } |
180 | } |
181 | |
182 | impl<'tuple, Tuple: 'tuple + Copy + Ord> FromIterator<&'tuple Tuple> for Relation<Tuple> { |
183 | fn from_iter<I>(iterator: I) -> Self |
184 | where |
185 | I: IntoIterator<Item = &'tuple Tuple>, |
186 | { |
187 | Relation::from_vec(elements:iterator.into_iter().cloned().collect()) |
188 | } |
189 | } |
190 | |
191 | impl<Tuple: Ord> std::ops::Deref for Relation<Tuple> { |
192 | type Target = [Tuple]; |
193 | fn deref(&self) -> &Self::Target { |
194 | &self.elements[..] |
195 | } |
196 | } |
197 | |
198 | /// An iterative context for recursive evaluation. |
199 | /// |
200 | /// An `Iteration` tracks monotonic variables, and monitors their progress. |
201 | /// It can inform the user if they have ceased changing, at which point the |
202 | /// computation should be done. |
203 | pub struct Iteration { |
204 | variables: Vec<Box<dyn VariableTrait>>, |
205 | } |
206 | |
207 | impl Iteration { |
208 | /// Create a new iterative context. |
209 | pub fn new() -> Self { |
210 | Iteration { |
211 | variables: Vec::new(), |
212 | } |
213 | } |
214 | /// Reports whether any of the monitored variables have changed since |
215 | /// the most recent call. |
216 | pub fn changed(&mut self) -> bool { |
217 | let mut result = false; |
218 | for variable in self.variables.iter_mut() { |
219 | if variable.changed() { |
220 | result = true; |
221 | } |
222 | } |
223 | result |
224 | } |
225 | /// Creates a new named variable associated with the iterative context. |
226 | pub fn variable<Tuple: Ord + 'static>(&mut self, name: &str) -> Variable<Tuple> { |
227 | let variable = Variable::new(name); |
228 | self.variables.push(Box::new(variable.clone())); |
229 | variable |
230 | } |
231 | /// Creates a new named variable associated with the iterative context. |
232 | /// |
233 | /// This variable will not be maintained distinctly, and may advertise tuples as |
234 | /// recent multiple times (perhaps unboundedly many times). |
235 | pub fn variable_indistinct<Tuple: Ord + 'static>(&mut self, name: &str) -> Variable<Tuple> { |
236 | let mut variable = Variable::new(name); |
237 | variable.distinct = false; |
238 | self.variables.push(Box::new(variable.clone())); |
239 | variable |
240 | } |
241 | } |
242 | |
243 | /// A type that can report on whether it has changed. |
244 | trait VariableTrait { |
245 | /// Reports whether the variable has changed since it was last asked. |
246 | fn changed(&mut self) -> bool; |
247 | } |
248 | |
249 | /// An monotonically increasing set of `Tuple`s. |
250 | /// |
251 | /// There are three stages in the lifecycle of a tuple: |
252 | /// |
253 | /// 1. A tuple is added to `self.to_add`, but is not yet visible externally. |
254 | /// 2. Newly added tuples are then promoted to `self.recent` for one iteration. |
255 | /// 3. After one iteration, recent tuples are moved to `self.tuples` for posterity. |
256 | /// |
257 | /// Each time `self.changed()` is called, the `recent` relation is folded into `tuples`, |
258 | /// and the `to_add` relations are merged, potentially deduplicated against `tuples`, and |
259 | /// then made `recent`. This way, across calls to `changed()` all added tuples are in |
260 | /// `recent` at least once and eventually all are in `tuples`. |
261 | /// |
262 | /// A `Variable` may optionally be instructed not to de-duplicate its tuples, for reasons |
263 | /// of performance. Such a variable cannot be relied on to terminate iterative computation, |
264 | /// and it is important that any cycle of derivations have at least one de-duplicating |
265 | /// variable on it. |
266 | pub struct Variable<Tuple: Ord> { |
267 | /// Should the variable be maintained distinctly. |
268 | distinct: bool, |
269 | /// A useful name for the variable. |
270 | name: String, |
271 | /// A list of relations whose union are the accepted tuples. |
272 | pub stable: Rc<RefCell<Vec<Relation<Tuple>>>>, |
273 | /// A list of recent tuples, still to be processed. |
274 | pub recent: Rc<RefCell<Relation<Tuple>>>, |
275 | /// A list of future tuples, to be introduced. |
276 | to_add: Rc<RefCell<Vec<Relation<Tuple>>>>, |
277 | } |
278 | |
279 | // Operator implementations. |
280 | impl<Tuple: Ord> Variable<Tuple> { |
281 | /// Adds tuples that result from joining `input1` and `input2` -- |
282 | /// each of the inputs must be a set of (Key, Value) tuples. Both |
283 | /// `input1` and `input2` must have the same type of key (`K`) but |
284 | /// they can have distinct value types (`V1` and `V2` |
285 | /// respectively). The `logic` closure will be invoked for each |
286 | /// key that appears in both inputs; it is also given the two |
287 | /// values, and from those it should construct the resulting |
288 | /// value. |
289 | /// |
290 | /// Note that `input1` must be a variable, but `input2` can be a |
291 | /// relation or a variable. Therefore, you cannot join two |
292 | /// relations with this method. This is not because the result |
293 | /// would be wrong, but because it would be inefficient: the |
294 | /// result from such a join cannot vary across iterations (as |
295 | /// relations are fixed), so you should prefer to invoke `insert` |
296 | /// on a relation created by `Relation::from_join` instead. |
297 | /// |
298 | /// # Examples |
299 | /// |
300 | /// This example starts a collection with the pairs (x, x+1) and (x+1, x) for x in 0 .. 10. |
301 | /// It then adds pairs (y, z) for which (x, y) and (x, z) are present. Because the initial |
302 | /// pairs are symmetric, this should result in all pairs (x, y) for x and y in 0 .. 11. |
303 | /// |
304 | /// ``` |
305 | /// use datafrog::{Iteration, Relation}; |
306 | /// |
307 | /// let mut iteration = Iteration::new(); |
308 | /// let variable = iteration.variable::<(usize, usize)>("source" ); |
309 | /// variable.extend((0 .. 10).map(|x| (x, x + 1))); |
310 | /// variable.extend((0 .. 10).map(|x| (x + 1, x))); |
311 | /// |
312 | /// while iteration.changed() { |
313 | /// variable.from_join(&variable, &variable, |&key, &val1, &val2| (val1, val2)); |
314 | /// } |
315 | /// |
316 | /// let result = variable.complete(); |
317 | /// assert_eq!(result.len(), 121); |
318 | /// ``` |
319 | pub fn from_join<'me, K: Ord, V1: Ord, V2: Ord>( |
320 | &self, |
321 | input1: &'me Variable<(K, V1)>, |
322 | input2: impl JoinInput<'me, (K, V2)>, |
323 | logic: impl FnMut(&K, &V1, &V2) -> Tuple, |
324 | ) { |
325 | join::join_into(input1, input2, self, logic) |
326 | } |
327 | |
328 | /// Adds tuples from `input1` whose key is not present in `input2`. |
329 | /// |
330 | /// Note that `input1` must be a variable: if you have a relation |
331 | /// instead, you can use `Relation::from_antijoin` and then |
332 | /// `Variable::insert`. Note that the result will not vary during |
333 | /// the iteration. |
334 | /// |
335 | /// # Examples |
336 | /// |
337 | /// This example starts a collection with the pairs (x, x+1) for x in 0 .. 10. It then |
338 | /// adds any pairs (x+1,x) for which x is not a multiple of three. That excludes four |
339 | /// pairs (for 0, 3, 6, and 9) which should leave us with 16 total pairs. |
340 | /// |
341 | /// ``` |
342 | /// use datafrog::{Iteration, Relation}; |
343 | /// |
344 | /// let mut iteration = Iteration::new(); |
345 | /// let variable = iteration.variable::<(usize, usize)>("source" ); |
346 | /// variable.extend((0 .. 10).map(|x| (x, x + 1))); |
347 | /// |
348 | /// let relation: Relation<_> = (0 .. 10).filter(|x| x % 3 == 0).collect(); |
349 | /// |
350 | /// while iteration.changed() { |
351 | /// variable.from_antijoin(&variable, &relation, |&key, &val| (val, key)); |
352 | /// } |
353 | /// |
354 | /// let result = variable.complete(); |
355 | /// assert_eq!(result.len(), 16); |
356 | /// ``` |
357 | pub fn from_antijoin<K: Ord, V: Ord>( |
358 | &self, |
359 | input1: &Variable<(K, V)>, |
360 | input2: &Relation<K>, |
361 | logic: impl FnMut(&K, &V) -> Tuple, |
362 | ) { |
363 | self.insert(join::antijoin(input1, input2, logic)) |
364 | } |
365 | |
366 | /// Adds tuples that result from mapping `input`. |
367 | /// |
368 | /// # Examples |
369 | /// |
370 | /// This example starts a collection with the pairs (x, x) for x in 0 .. 10. It then |
371 | /// repeatedly adds any pairs (x, z) for (x, y) in the collection, where z is the Collatz |
372 | /// step for y: it is y/2 if y is even, and 3*y + 1 if y is odd. This produces all of the |
373 | /// pairs (x, y) where x visits y as part of its Collatz journey. |
374 | /// |
375 | /// ``` |
376 | /// use datafrog::{Iteration, Relation}; |
377 | /// |
378 | /// let mut iteration = Iteration::new(); |
379 | /// let variable = iteration.variable::<(usize, usize)>("source" ); |
380 | /// variable.extend((0 .. 10).map(|x| (x, x))); |
381 | /// |
382 | /// while iteration.changed() { |
383 | /// variable.from_map(&variable, |&(key, val)| |
384 | /// if val % 2 == 0 { |
385 | /// (key, val/2) |
386 | /// } |
387 | /// else { |
388 | /// (key, 3*val + 1) |
389 | /// }); |
390 | /// } |
391 | /// |
392 | /// let result = variable.complete(); |
393 | /// assert_eq!(result.len(), 74); |
394 | /// ``` |
395 | pub fn from_map<T2: Ord>(&self, input: &Variable<T2>, logic: impl FnMut(&T2) -> Tuple) { |
396 | map::map_into(input, self, logic) |
397 | } |
398 | |
399 | /// Adds tuples that result from combining `source` with the |
400 | /// relations given in `leapers`. This operation is very flexible |
401 | /// and can be used to do a combination of joins and anti-joins. |
402 | /// The main limitation is that the things being combined must |
403 | /// consist of one dynamic variable (`source`) and then several |
404 | /// fixed relations (`leapers`). |
405 | /// |
406 | /// The idea is as follows: |
407 | /// |
408 | /// - You will be inserting new tuples that result from joining (and anti-joining) |
409 | /// some dynamic variable `source` of source tuples (`SourceTuple`) |
410 | /// with some set of values (of type `Val`). |
411 | /// - You provide these values by combining `source` with a set of leapers |
412 | /// `leapers`, each of which is derived from a fixed relation. The `leapers` |
413 | /// should be either a single leaper (of suitable type) or else a tuple of leapers. |
414 | /// You can create a leaper in one of two ways: |
415 | /// - Extension: In this case, you have a relation of type `(K, Val)` for some |
416 | /// type `K`. You provide a closure that maps from `SourceTuple` to the key |
417 | /// `K`. If you use `relation.extend_with`, then any `Val` values the |
418 | /// relation provides will be added to the set of values; if you use |
419 | /// `extend_anti`, then the `Val` values will be removed. |
420 | /// - Filtering: In this case, you have a relation of type `K` for some |
421 | /// type `K` and you provide a closure that maps from `SourceTuple` to |
422 | /// the key `K`. Filters don't provide values but they remove source |
423 | /// tuples. |
424 | /// - Finally, you get a callback `logic` that accepts each `(SourceTuple, Val)` |
425 | /// that was successfully joined (and not filtered) and which maps to the |
426 | /// type of this variable. |
427 | pub fn from_leapjoin<'leap, SourceTuple: Ord, Val: Ord + 'leap>( |
428 | &self, |
429 | source: &Variable<SourceTuple>, |
430 | leapers: impl Leapers<'leap, SourceTuple, Val>, |
431 | logic: impl FnMut(&SourceTuple, &Val) -> Tuple, |
432 | ) { |
433 | self.insert(treefrog::leapjoin(&source.recent.borrow(), leapers, logic)); |
434 | } |
435 | } |
436 | |
437 | impl<Tuple: Ord> Clone for Variable<Tuple> { |
438 | fn clone(&self) -> Self { |
439 | Variable { |
440 | distinct: self.distinct, |
441 | name: self.name.clone(), |
442 | stable: self.stable.clone(), |
443 | recent: self.recent.clone(), |
444 | to_add: self.to_add.clone(), |
445 | } |
446 | } |
447 | } |
448 | |
449 | impl<Tuple: Ord> Variable<Tuple> { |
450 | fn new(name: &str) -> Self { |
451 | Variable { |
452 | distinct: true, |
453 | name: name.to_string(), |
454 | stable: Rc::new(RefCell::new(Vec::new())), |
455 | recent: Rc::new(RefCell::new(Vec::new().into())), |
456 | to_add: Rc::new(RefCell::new(Vec::new())), |
457 | } |
458 | } |
459 | |
460 | /// Inserts a relation into the variable. |
461 | /// |
462 | /// This is most commonly used to load initial values into a variable. |
463 | /// it is not obvious that it should be commonly used otherwise, but |
464 | /// it should not be harmful. |
465 | pub fn insert(&self, relation: Relation<Tuple>) { |
466 | if !relation.is_empty() { |
467 | self.to_add.borrow_mut().push(relation); |
468 | } |
469 | } |
470 | |
471 | /// Extend the variable with values from the iterator. |
472 | /// |
473 | /// This is most commonly used to load initial values into a variable. |
474 | /// it is not obvious that it should be commonly used otherwise, but |
475 | /// it should not be harmful. |
476 | pub fn extend<T>(&self, iterator: impl IntoIterator<Item = T>) |
477 | where |
478 | Relation<Tuple>: FromIterator<T>, |
479 | { |
480 | self.insert(iterator.into_iter().collect()); |
481 | } |
482 | |
483 | /// Consumes the variable and returns a relation. |
484 | /// |
485 | /// This method removes the ability for the variable to develop, and |
486 | /// flattens all internal tuples down to one relation. The method |
487 | /// asserts that iteration has completed, in that `self.recent` and |
488 | /// `self.to_add` should both be empty. |
489 | pub fn complete(self) -> Relation<Tuple> { |
490 | assert!(self.recent.borrow().is_empty()); |
491 | assert!(self.to_add.borrow().is_empty()); |
492 | let mut result: Relation<Tuple> = Vec::new().into(); |
493 | while let Some(batch) = self.stable.borrow_mut().pop() { |
494 | result = result.merge(batch); |
495 | } |
496 | result |
497 | } |
498 | } |
499 | |
500 | impl<Tuple: Ord> VariableTrait for Variable<Tuple> { |
501 | fn changed(&mut self) -> bool { |
502 | // 1. Merge self.recent into self.stable. |
503 | if !self.recent.borrow().is_empty() { |
504 | let mut recent = |
505 | ::std::mem::replace(&mut (*self.recent.borrow_mut()), Vec::new().into()); |
506 | while self |
507 | .stable |
508 | .borrow() |
509 | .last() |
510 | .map(|x| x.len() <= 2 * recent.len()) |
511 | == Some(true) |
512 | { |
513 | let last = self.stable.borrow_mut().pop().unwrap(); |
514 | recent = recent.merge(last); |
515 | } |
516 | self.stable.borrow_mut().push(recent); |
517 | } |
518 | |
519 | // 2. Move self.to_add into self.recent. |
520 | let to_add = self.to_add.borrow_mut().pop(); |
521 | if let Some(mut to_add) = to_add { |
522 | while let Some(to_add_more) = self.to_add.borrow_mut().pop() { |
523 | to_add = to_add.merge(to_add_more); |
524 | } |
525 | // 2b. Restrict `to_add` to tuples not in `self.stable`. |
526 | if self.distinct { |
527 | for batch in self.stable.borrow().iter() { |
528 | let mut slice = &batch[..]; |
529 | // Only gallop if the slice is relatively large. |
530 | if slice.len() > 4 * to_add.elements.len() { |
531 | to_add.elements.retain(|x| { |
532 | slice = join::gallop(slice, |y| y < x); |
533 | slice.is_empty() || &slice[0] != x |
534 | }); |
535 | } else { |
536 | to_add.elements.retain(|x| { |
537 | while !slice.is_empty() && &slice[0] < x { |
538 | slice = &slice[1..]; |
539 | } |
540 | slice.is_empty() || &slice[0] != x |
541 | }); |
542 | } |
543 | } |
544 | } |
545 | *self.recent.borrow_mut() = to_add; |
546 | } |
547 | |
548 | // let mut total = 0; |
549 | // for tuple in self.stable.borrow().iter() { |
550 | // total += tuple.len(); |
551 | // } |
552 | |
553 | // println!("Variable\t{}\t{}\t{}", self.name, total, self.recent.borrow().len()); |
554 | |
555 | !self.recent.borrow().is_empty() |
556 | } |
557 | } |
558 | |
559 | // impl<Tuple: Ord> Drop for Variable<Tuple> { |
560 | // fn drop(&mut self) { |
561 | // let mut total = 0; |
562 | // for batch in self.stable.borrow().iter() { |
563 | // total += batch.len(); |
564 | // } |
565 | // println!("FINAL: {:?}\t{:?}", self.name, total); |
566 | // } |
567 | // } |
568 | |