1 | use std::cmp::{max, min}; |
2 | use std::collections::HashMap; |
3 | use std::hash::Hash; |
4 | use utils::calculate_ratio; |
5 | |
6 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)] |
7 | pub struct Match { |
8 | pub first_start: usize, |
9 | pub second_start: usize, |
10 | pub size: usize, |
11 | } |
12 | |
13 | impl Match { |
14 | fn new(first_start: usize, second_start: usize, size: usize) -> Match { |
15 | Match { |
16 | first_start, |
17 | second_start, |
18 | size, |
19 | } |
20 | } |
21 | } |
22 | |
23 | #[derive(Debug, Clone, PartialEq)] |
24 | pub struct Opcode { |
25 | pub tag: String, |
26 | pub first_start: usize, |
27 | pub first_end: usize, |
28 | pub second_start: usize, |
29 | pub second_end: usize, |
30 | } |
31 | |
32 | impl Opcode { |
33 | fn new( |
34 | tag: String, |
35 | first_start: usize, |
36 | first_end: usize, |
37 | second_start: usize, |
38 | second_end: usize, |
39 | ) -> Opcode { |
40 | Opcode { |
41 | tag, |
42 | first_start, |
43 | first_end, |
44 | second_start, |
45 | second_end, |
46 | } |
47 | } |
48 | } |
49 | |
50 | pub trait Sequence: Eq + Hash {} |
51 | impl<T: Eq + Hash> Sequence for T {} |
52 | |
53 | pub struct SequenceMatcher<'a, T: 'a + Sequence> { |
54 | first_sequence: &'a [T], |
55 | second_sequence: &'a [T], |
56 | matching_blocks: Option<Vec<Match>>, |
57 | opcodes: Option<Vec<Opcode>>, |
58 | is_junk: Option<fn(&T) -> bool>, |
59 | second_sequence_elements: HashMap<&'a T, Vec<usize>>, |
60 | } |
61 | |
62 | impl<'a, T: Sequence> SequenceMatcher<'a, T> { |
63 | pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T> |
64 | where |
65 | S: AsRef<[T]> + ?Sized, |
66 | { |
67 | let mut matcher = SequenceMatcher { |
68 | first_sequence: first_sequence.as_ref(), |
69 | second_sequence: second_sequence.as_ref(), |
70 | matching_blocks: None, |
71 | opcodes: None, |
72 | is_junk: None, |
73 | second_sequence_elements: HashMap::new(), |
74 | }; |
75 | matcher.set_seqs(first_sequence, second_sequence); |
76 | matcher |
77 | } |
78 | |
79 | pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) { |
80 | self.is_junk = is_junk; |
81 | self.matching_blocks = None; |
82 | self.opcodes = None; |
83 | self.chain_second_seq(); |
84 | } |
85 | |
86 | pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S) |
87 | where |
88 | S: AsRef<[T]> + ?Sized, |
89 | { |
90 | self.set_first_seq(first_sequence); |
91 | self.set_second_seq(second_sequence); |
92 | } |
93 | |
94 | pub fn set_first_seq<S>(&mut self, sequence: &'a S) |
95 | where |
96 | S: AsRef<[T]> + ?Sized, |
97 | { |
98 | self.first_sequence = sequence.as_ref(); |
99 | self.matching_blocks = None; |
100 | self.opcodes = None; |
101 | } |
102 | |
103 | pub fn set_second_seq<S>(&mut self, sequence: &'a S) |
104 | where |
105 | S: AsRef<[T]> + ?Sized, |
106 | { |
107 | self.second_sequence = sequence.as_ref(); |
108 | self.matching_blocks = None; |
109 | self.opcodes = None; |
110 | self.chain_second_seq(); |
111 | } |
112 | |
113 | fn chain_second_seq(&mut self) { |
114 | let second_sequence = self.second_sequence; |
115 | let mut second_sequence_elements = HashMap::new(); |
116 | for (i, item) in second_sequence.iter().enumerate() { |
117 | let mut counter = second_sequence_elements |
118 | .entry(item) |
119 | .or_insert_with(Vec::new); |
120 | counter.push(i); |
121 | } |
122 | if let Some(junk_func) = self.is_junk { |
123 | second_sequence_elements = second_sequence_elements |
124 | .into_iter() |
125 | .filter(|&(element, _)| !junk_func(element)) |
126 | .collect(); |
127 | } |
128 | // Filter out popular elements |
129 | let len = second_sequence.len(); |
130 | if len >= 200 { |
131 | let test_len = (len as f32 / 100.0).floor() as usize + 1; |
132 | second_sequence_elements = second_sequence_elements |
133 | .into_iter() |
134 | .filter(|&(_, ref indexes)| indexes.len() > test_len) |
135 | .collect(); |
136 | } |
137 | self.second_sequence_elements = second_sequence_elements; |
138 | } |
139 | |
140 | pub fn find_longest_match( |
141 | &self, |
142 | first_start: usize, |
143 | first_end: usize, |
144 | second_start: usize, |
145 | second_end: usize, |
146 | ) -> Match { |
147 | let first_sequence = &self.first_sequence; |
148 | let second_sequence = &self.second_sequence; |
149 | let second_sequence_elements = &self.second_sequence_elements; |
150 | let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0); |
151 | let mut j2len: HashMap<usize, usize> = HashMap::new(); |
152 | for (i, item) in first_sequence |
153 | .iter() |
154 | .enumerate() |
155 | .take(first_end) |
156 | .skip(first_start) |
157 | { |
158 | let mut new_j2len: HashMap<usize, usize> = HashMap::new(); |
159 | if let Some(indexes) = second_sequence_elements.get(item) { |
160 | for j in indexes { |
161 | let j = *j; |
162 | if j < second_start { |
163 | continue; |
164 | }; |
165 | if j >= second_end { |
166 | break; |
167 | }; |
168 | let mut size = 0; |
169 | if j > 0 { |
170 | if let Some(k) = j2len.get(&(j - 1)) { |
171 | size = *k; |
172 | } |
173 | } |
174 | size += 1; |
175 | new_j2len.insert(j, size); |
176 | if size > best_size { |
177 | best_i = i + 1 - size; |
178 | best_j = j + 1 - size; |
179 | best_size = size; |
180 | } |
181 | } |
182 | } |
183 | j2len = new_j2len; |
184 | } |
185 | for _ in 0..2 { |
186 | while best_i > first_start |
187 | && best_j > second_start |
188 | && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1) |
189 | { |
190 | best_i -= 1; |
191 | best_j -= 1; |
192 | best_size += 1; |
193 | } |
194 | while best_i + best_size < first_end |
195 | && best_j + best_size < second_end |
196 | && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size) |
197 | { |
198 | best_size += 1; |
199 | } |
200 | } |
201 | Match::new(best_i, best_j, best_size) |
202 | } |
203 | |
204 | pub fn get_matching_blocks(&mut self) -> Vec<Match> { |
205 | if self.matching_blocks.as_ref().is_some() { |
206 | return self.matching_blocks.as_ref().unwrap().clone(); |
207 | } |
208 | let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len()); |
209 | let mut matches = Vec::new(); |
210 | let mut queue = vec![(0, first_length, 0, second_length)]; |
211 | while !queue.is_empty() { |
212 | let (first_start, first_end, second_start, second_end) = queue.pop().unwrap(); |
213 | let m = self.find_longest_match(first_start, first_end, second_start, second_end); |
214 | match m.size { |
215 | 0 => {} |
216 | _ => { |
217 | if first_start < m.first_start && second_start < m.second_start { |
218 | queue.push((first_start, m.first_start, second_start, m.second_start)); |
219 | } |
220 | if m.first_start + m.size < first_end && m.second_start + m.size < second_end { |
221 | queue.push(( |
222 | m.first_start + m.size, |
223 | first_end, |
224 | m.second_start + m.size, |
225 | second_end, |
226 | )); |
227 | } |
228 | matches.push(m); |
229 | } |
230 | } |
231 | } |
232 | matches.sort_by(|a, b| a.cmp(b)); |
233 | let (mut first_start, mut second_start, mut size) = (0, 0, 0); |
234 | let mut non_adjacent = Vec::new(); |
235 | for m in &matches { |
236 | if first_start + size == m.first_start && second_start + size == m.second_start { |
237 | size += m.size |
238 | } else { |
239 | if size != 0 { |
240 | non_adjacent.push(Match::new(first_start, second_start, size)); |
241 | } |
242 | first_start = m.first_start; |
243 | second_start = m.second_start; |
244 | size = m.size; |
245 | } |
246 | } |
247 | if size != 0 { |
248 | non_adjacent.push(Match::new(first_start, second_start, size)); |
249 | } |
250 | non_adjacent.push(Match::new(first_length, second_length, 0)); |
251 | self.matching_blocks = Some(non_adjacent); |
252 | self.matching_blocks.as_ref().unwrap().clone() |
253 | } |
254 | |
255 | pub fn get_opcodes(&mut self) -> Vec<Opcode> { |
256 | if self.opcodes.as_ref().is_some() { |
257 | return self.opcodes.as_ref().unwrap().clone(); |
258 | } |
259 | let mut opcodes = Vec::new(); |
260 | let (mut i, mut j) = (0, 0); |
261 | for m in self.get_matching_blocks() { |
262 | let mut tag = String::new(); |
263 | if i < m.first_start && j < m.second_start { |
264 | tag = String::from("replace" ); |
265 | } else if i < m.first_start { |
266 | tag = String::from("delete" ); |
267 | } else if j < m.second_start { |
268 | tag = String::from("insert" ); |
269 | } |
270 | if !tag.is_empty() { |
271 | opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start)); |
272 | } |
273 | i = m.first_start + m.size; |
274 | j = m.second_start + m.size; |
275 | if m.size != 0 { |
276 | opcodes.push(Opcode::new( |
277 | String::from("equal" ), |
278 | m.first_start, |
279 | i, |
280 | m.second_start, |
281 | j, |
282 | )); |
283 | } |
284 | } |
285 | self.opcodes = Some(opcodes); |
286 | self.opcodes.as_ref().unwrap().clone() |
287 | } |
288 | |
289 | pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> { |
290 | let mut res = Vec::new(); |
291 | let mut codes = self.get_opcodes(); |
292 | if codes.is_empty() { |
293 | codes.push(Opcode::new("equal" .to_string(), 0, 1, 0, 1)); |
294 | } |
295 | |
296 | if codes.first().unwrap().tag == "equal" { |
297 | let opcode = codes.first_mut().unwrap(); |
298 | opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n)); |
299 | opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n)); |
300 | } |
301 | if codes.last().unwrap().tag == "equal" { |
302 | let opcode = codes.last_mut().unwrap(); |
303 | opcode.first_end = min(opcode.first_start + n, opcode.first_end); |
304 | opcode.second_end = min(opcode.second_start + n, opcode.second_end); |
305 | } |
306 | let nn = n + n; |
307 | let mut group = Vec::new(); |
308 | for code in &codes { |
309 | let (mut first_start, mut second_start) = (code.first_start, code.second_start); |
310 | if code.tag == "equal" && code.first_end - code.first_start > nn { |
311 | group.push(Opcode::new( |
312 | code.tag.clone(), |
313 | code.first_start, |
314 | min(code.first_end, code.first_start + n), |
315 | code.second_start, |
316 | min(code.second_end, code.second_start + n), |
317 | )); |
318 | res.push(group.clone()); |
319 | group.clear(); |
320 | first_start = max(first_start, code.first_end.saturating_sub(n)); |
321 | second_start = max(second_start, code.second_end.saturating_sub(n)); |
322 | } |
323 | group.push(Opcode::new( |
324 | code.tag.clone(), |
325 | first_start, |
326 | code.first_end, |
327 | second_start, |
328 | code.second_end, |
329 | )); |
330 | } |
331 | if !(group.len() == 1 && group.first().unwrap().tag == "equal" ) || group.is_empty() { |
332 | res.push(group.clone()); |
333 | } |
334 | res |
335 | } |
336 | |
337 | pub fn ratio(&mut self) -> f32 { |
338 | let matches = self.get_matching_blocks() |
339 | .iter() |
340 | .fold(0, |res, &m| res + m.size); |
341 | calculate_ratio( |
342 | matches, |
343 | self.first_sequence.len() + self.second_sequence.len(), |
344 | ) |
345 | } |
346 | } |
347 | |