1use std::cmp::{max, min};
2use std::collections::HashMap;
3use std::hash::Hash;
4use utils::calculate_ratio;
5
6#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
7pub struct Match {
8 pub first_start: usize,
9 pub second_start: usize,
10 pub size: usize,
11}
12
13impl Match {
14 fn new(first_start: usize, second_start: usize, size: usize) -> Match {
15 Match {
16 first_start,
17 second_start,
18 size,
19 }
20 }
21}
22
23#[derive(Debug, Clone, PartialEq)]
24pub struct Opcode {
25 pub tag: String,
26 pub first_start: usize,
27 pub first_end: usize,
28 pub second_start: usize,
29 pub second_end: usize,
30}
31
32impl Opcode {
33 fn new(
34 tag: String,
35 first_start: usize,
36 first_end: usize,
37 second_start: usize,
38 second_end: usize,
39 ) -> Opcode {
40 Opcode {
41 tag,
42 first_start,
43 first_end,
44 second_start,
45 second_end,
46 }
47 }
48}
49
50pub trait Sequence: Eq + Hash {}
51impl<T: Eq + Hash> Sequence for T {}
52
53pub struct SequenceMatcher<'a, T: 'a + Sequence> {
54 first_sequence: &'a [T],
55 second_sequence: &'a [T],
56 matching_blocks: Option<Vec<Match>>,
57 opcodes: Option<Vec<Opcode>>,
58 is_junk: Option<fn(&T) -> bool>,
59 second_sequence_elements: HashMap<&'a T, Vec<usize>>,
60}
61
62impl<'a, T: Sequence> SequenceMatcher<'a, T> {
63 pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T>
64 where
65 S: AsRef<[T]> + ?Sized,
66 {
67 let mut matcher = SequenceMatcher {
68 first_sequence: first_sequence.as_ref(),
69 second_sequence: second_sequence.as_ref(),
70 matching_blocks: None,
71 opcodes: None,
72 is_junk: None,
73 second_sequence_elements: HashMap::new(),
74 };
75 matcher.set_seqs(first_sequence, second_sequence);
76 matcher
77 }
78
79 pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) {
80 self.is_junk = is_junk;
81 self.matching_blocks = None;
82 self.opcodes = None;
83 self.chain_second_seq();
84 }
85
86 pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S)
87 where
88 S: AsRef<[T]> + ?Sized,
89 {
90 self.set_first_seq(first_sequence);
91 self.set_second_seq(second_sequence);
92 }
93
94 pub fn set_first_seq<S>(&mut self, sequence: &'a S)
95 where
96 S: AsRef<[T]> + ?Sized,
97 {
98 self.first_sequence = sequence.as_ref();
99 self.matching_blocks = None;
100 self.opcodes = None;
101 }
102
103 pub fn set_second_seq<S>(&mut self, sequence: &'a S)
104 where
105 S: AsRef<[T]> + ?Sized,
106 {
107 self.second_sequence = sequence.as_ref();
108 self.matching_blocks = None;
109 self.opcodes = None;
110 self.chain_second_seq();
111 }
112
113 fn chain_second_seq(&mut self) {
114 let second_sequence = self.second_sequence;
115 let mut second_sequence_elements = HashMap::new();
116 for (i, item) in second_sequence.iter().enumerate() {
117 let mut counter = second_sequence_elements
118 .entry(item)
119 .or_insert_with(Vec::new);
120 counter.push(i);
121 }
122 if let Some(junk_func) = self.is_junk {
123 second_sequence_elements = second_sequence_elements
124 .into_iter()
125 .filter(|&(element, _)| !junk_func(element))
126 .collect();
127 }
128 // Filter out popular elements
129 let len = second_sequence.len();
130 if len >= 200 {
131 let test_len = (len as f32 / 100.0).floor() as usize + 1;
132 second_sequence_elements = second_sequence_elements
133 .into_iter()
134 .filter(|&(_, ref indexes)| indexes.len() > test_len)
135 .collect();
136 }
137 self.second_sequence_elements = second_sequence_elements;
138 }
139
140 pub fn find_longest_match(
141 &self,
142 first_start: usize,
143 first_end: usize,
144 second_start: usize,
145 second_end: usize,
146 ) -> Match {
147 let first_sequence = &self.first_sequence;
148 let second_sequence = &self.second_sequence;
149 let second_sequence_elements = &self.second_sequence_elements;
150 let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0);
151 let mut j2len: HashMap<usize, usize> = HashMap::new();
152 for (i, item) in first_sequence
153 .iter()
154 .enumerate()
155 .take(first_end)
156 .skip(first_start)
157 {
158 let mut new_j2len: HashMap<usize, usize> = HashMap::new();
159 if let Some(indexes) = second_sequence_elements.get(item) {
160 for j in indexes {
161 let j = *j;
162 if j < second_start {
163 continue;
164 };
165 if j >= second_end {
166 break;
167 };
168 let mut size = 0;
169 if j > 0 {
170 if let Some(k) = j2len.get(&(j - 1)) {
171 size = *k;
172 }
173 }
174 size += 1;
175 new_j2len.insert(j, size);
176 if size > best_size {
177 best_i = i + 1 - size;
178 best_j = j + 1 - size;
179 best_size = size;
180 }
181 }
182 }
183 j2len = new_j2len;
184 }
185 for _ in 0..2 {
186 while best_i > first_start
187 && best_j > second_start
188 && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1)
189 {
190 best_i -= 1;
191 best_j -= 1;
192 best_size += 1;
193 }
194 while best_i + best_size < first_end
195 && best_j + best_size < second_end
196 && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size)
197 {
198 best_size += 1;
199 }
200 }
201 Match::new(best_i, best_j, best_size)
202 }
203
204 pub fn get_matching_blocks(&mut self) -> Vec<Match> {
205 if self.matching_blocks.as_ref().is_some() {
206 return self.matching_blocks.as_ref().unwrap().clone();
207 }
208 let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len());
209 let mut matches = Vec::new();
210 let mut queue = vec![(0, first_length, 0, second_length)];
211 while !queue.is_empty() {
212 let (first_start, first_end, second_start, second_end) = queue.pop().unwrap();
213 let m = self.find_longest_match(first_start, first_end, second_start, second_end);
214 match m.size {
215 0 => {}
216 _ => {
217 if first_start < m.first_start && second_start < m.second_start {
218 queue.push((first_start, m.first_start, second_start, m.second_start));
219 }
220 if m.first_start + m.size < first_end && m.second_start + m.size < second_end {
221 queue.push((
222 m.first_start + m.size,
223 first_end,
224 m.second_start + m.size,
225 second_end,
226 ));
227 }
228 matches.push(m);
229 }
230 }
231 }
232 matches.sort_by(|a, b| a.cmp(b));
233 let (mut first_start, mut second_start, mut size) = (0, 0, 0);
234 let mut non_adjacent = Vec::new();
235 for m in &matches {
236 if first_start + size == m.first_start && second_start + size == m.second_start {
237 size += m.size
238 } else {
239 if size != 0 {
240 non_adjacent.push(Match::new(first_start, second_start, size));
241 }
242 first_start = m.first_start;
243 second_start = m.second_start;
244 size = m.size;
245 }
246 }
247 if size != 0 {
248 non_adjacent.push(Match::new(first_start, second_start, size));
249 }
250 non_adjacent.push(Match::new(first_length, second_length, 0));
251 self.matching_blocks = Some(non_adjacent);
252 self.matching_blocks.as_ref().unwrap().clone()
253 }
254
255 pub fn get_opcodes(&mut self) -> Vec<Opcode> {
256 if self.opcodes.as_ref().is_some() {
257 return self.opcodes.as_ref().unwrap().clone();
258 }
259 let mut opcodes = Vec::new();
260 let (mut i, mut j) = (0, 0);
261 for m in self.get_matching_blocks() {
262 let mut tag = String::new();
263 if i < m.first_start && j < m.second_start {
264 tag = String::from("replace");
265 } else if i < m.first_start {
266 tag = String::from("delete");
267 } else if j < m.second_start {
268 tag = String::from("insert");
269 }
270 if !tag.is_empty() {
271 opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start));
272 }
273 i = m.first_start + m.size;
274 j = m.second_start + m.size;
275 if m.size != 0 {
276 opcodes.push(Opcode::new(
277 String::from("equal"),
278 m.first_start,
279 i,
280 m.second_start,
281 j,
282 ));
283 }
284 }
285 self.opcodes = Some(opcodes);
286 self.opcodes.as_ref().unwrap().clone()
287 }
288
289 pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> {
290 let mut res = Vec::new();
291 let mut codes = self.get_opcodes();
292 if codes.is_empty() {
293 codes.push(Opcode::new("equal".to_string(), 0, 1, 0, 1));
294 }
295
296 if codes.first().unwrap().tag == "equal" {
297 let opcode = codes.first_mut().unwrap();
298 opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n));
299 opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n));
300 }
301 if codes.last().unwrap().tag == "equal" {
302 let opcode = codes.last_mut().unwrap();
303 opcode.first_end = min(opcode.first_start + n, opcode.first_end);
304 opcode.second_end = min(opcode.second_start + n, opcode.second_end);
305 }
306 let nn = n + n;
307 let mut group = Vec::new();
308 for code in &codes {
309 let (mut first_start, mut second_start) = (code.first_start, code.second_start);
310 if code.tag == "equal" && code.first_end - code.first_start > nn {
311 group.push(Opcode::new(
312 code.tag.clone(),
313 code.first_start,
314 min(code.first_end, code.first_start + n),
315 code.second_start,
316 min(code.second_end, code.second_start + n),
317 ));
318 res.push(group.clone());
319 group.clear();
320 first_start = max(first_start, code.first_end.saturating_sub(n));
321 second_start = max(second_start, code.second_end.saturating_sub(n));
322 }
323 group.push(Opcode::new(
324 code.tag.clone(),
325 first_start,
326 code.first_end,
327 second_start,
328 code.second_end,
329 ));
330 }
331 if !(group.len() == 1 && group.first().unwrap().tag == "equal") || group.is_empty() {
332 res.push(group.clone());
333 }
334 res
335 }
336
337 pub fn ratio(&mut self) -> f32 {
338 let matches = self.get_matching_blocks()
339 .iter()
340 .fold(0, |res, &m| res + m.size);
341 calculate_ratio(
342 matches,
343 self.first_sequence.len() + self.second_sequence.len(),
344 )
345 }
346}
347