| 1 | use std::cmp::{max, min}; |
| 2 | use std::collections::HashMap; |
| 3 | use std::hash::Hash; |
| 4 | use utils::calculate_ratio; |
| 5 | |
| 6 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)] |
| 7 | pub struct Match { |
| 8 | pub first_start: usize, |
| 9 | pub second_start: usize, |
| 10 | pub size: usize, |
| 11 | } |
| 12 | |
| 13 | impl Match { |
| 14 | fn new(first_start: usize, second_start: usize, size: usize) -> Match { |
| 15 | Match { |
| 16 | first_start, |
| 17 | second_start, |
| 18 | size, |
| 19 | } |
| 20 | } |
| 21 | } |
| 22 | |
| 23 | #[derive(Debug, Clone, PartialEq)] |
| 24 | pub struct Opcode { |
| 25 | pub tag: String, |
| 26 | pub first_start: usize, |
| 27 | pub first_end: usize, |
| 28 | pub second_start: usize, |
| 29 | pub second_end: usize, |
| 30 | } |
| 31 | |
| 32 | impl Opcode { |
| 33 | fn new( |
| 34 | tag: String, |
| 35 | first_start: usize, |
| 36 | first_end: usize, |
| 37 | second_start: usize, |
| 38 | second_end: usize, |
| 39 | ) -> Opcode { |
| 40 | Opcode { |
| 41 | tag, |
| 42 | first_start, |
| 43 | first_end, |
| 44 | second_start, |
| 45 | second_end, |
| 46 | } |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | pub trait Sequence: Eq + Hash {} |
| 51 | impl<T: Eq + Hash> Sequence for T {} |
| 52 | |
| 53 | pub struct SequenceMatcher<'a, T: 'a + Sequence> { |
| 54 | first_sequence: &'a [T], |
| 55 | second_sequence: &'a [T], |
| 56 | matching_blocks: Option<Vec<Match>>, |
| 57 | opcodes: Option<Vec<Opcode>>, |
| 58 | is_junk: Option<fn(&T) -> bool>, |
| 59 | second_sequence_elements: HashMap<&'a T, Vec<usize>>, |
| 60 | } |
| 61 | |
| 62 | impl<'a, T: Sequence> SequenceMatcher<'a, T> { |
| 63 | pub fn new<S>(first_sequence: &'a S, second_sequence: &'a S) -> SequenceMatcher<'a, T> |
| 64 | where |
| 65 | S: AsRef<[T]> + ?Sized, |
| 66 | { |
| 67 | let mut matcher = SequenceMatcher { |
| 68 | first_sequence: first_sequence.as_ref(), |
| 69 | second_sequence: second_sequence.as_ref(), |
| 70 | matching_blocks: None, |
| 71 | opcodes: None, |
| 72 | is_junk: None, |
| 73 | second_sequence_elements: HashMap::new(), |
| 74 | }; |
| 75 | matcher.set_seqs(first_sequence, second_sequence); |
| 76 | matcher |
| 77 | } |
| 78 | |
| 79 | pub fn set_is_junk(&mut self, is_junk: Option<fn(&T) -> bool>) { |
| 80 | self.is_junk = is_junk; |
| 81 | self.matching_blocks = None; |
| 82 | self.opcodes = None; |
| 83 | self.chain_second_seq(); |
| 84 | } |
| 85 | |
| 86 | pub fn set_seqs<S>(&mut self, first_sequence: &'a S, second_sequence: &'a S) |
| 87 | where |
| 88 | S: AsRef<[T]> + ?Sized, |
| 89 | { |
| 90 | self.set_first_seq(first_sequence); |
| 91 | self.set_second_seq(second_sequence); |
| 92 | } |
| 93 | |
| 94 | pub fn set_first_seq<S>(&mut self, sequence: &'a S) |
| 95 | where |
| 96 | S: AsRef<[T]> + ?Sized, |
| 97 | { |
| 98 | self.first_sequence = sequence.as_ref(); |
| 99 | self.matching_blocks = None; |
| 100 | self.opcodes = None; |
| 101 | } |
| 102 | |
| 103 | pub fn set_second_seq<S>(&mut self, sequence: &'a S) |
| 104 | where |
| 105 | S: AsRef<[T]> + ?Sized, |
| 106 | { |
| 107 | self.second_sequence = sequence.as_ref(); |
| 108 | self.matching_blocks = None; |
| 109 | self.opcodes = None; |
| 110 | self.chain_second_seq(); |
| 111 | } |
| 112 | |
| 113 | fn chain_second_seq(&mut self) { |
| 114 | let second_sequence = self.second_sequence; |
| 115 | let mut second_sequence_elements = HashMap::new(); |
| 116 | for (i, item) in second_sequence.iter().enumerate() { |
| 117 | let mut counter = second_sequence_elements |
| 118 | .entry(item) |
| 119 | .or_insert_with(Vec::new); |
| 120 | counter.push(i); |
| 121 | } |
| 122 | if let Some(junk_func) = self.is_junk { |
| 123 | second_sequence_elements = second_sequence_elements |
| 124 | .into_iter() |
| 125 | .filter(|&(element, _)| !junk_func(element)) |
| 126 | .collect(); |
| 127 | } |
| 128 | // Filter out popular elements |
| 129 | let len = second_sequence.len(); |
| 130 | if len >= 200 { |
| 131 | let test_len = (len as f32 / 100.0).floor() as usize + 1; |
| 132 | second_sequence_elements = second_sequence_elements |
| 133 | .into_iter() |
| 134 | .filter(|&(_, ref indexes)| indexes.len() > test_len) |
| 135 | .collect(); |
| 136 | } |
| 137 | self.second_sequence_elements = second_sequence_elements; |
| 138 | } |
| 139 | |
| 140 | pub fn find_longest_match( |
| 141 | &self, |
| 142 | first_start: usize, |
| 143 | first_end: usize, |
| 144 | second_start: usize, |
| 145 | second_end: usize, |
| 146 | ) -> Match { |
| 147 | let first_sequence = &self.first_sequence; |
| 148 | let second_sequence = &self.second_sequence; |
| 149 | let second_sequence_elements = &self.second_sequence_elements; |
| 150 | let (mut best_i, mut best_j, mut best_size) = (first_start, second_start, 0); |
| 151 | let mut j2len: HashMap<usize, usize> = HashMap::new(); |
| 152 | for (i, item) in first_sequence |
| 153 | .iter() |
| 154 | .enumerate() |
| 155 | .take(first_end) |
| 156 | .skip(first_start) |
| 157 | { |
| 158 | let mut new_j2len: HashMap<usize, usize> = HashMap::new(); |
| 159 | if let Some(indexes) = second_sequence_elements.get(item) { |
| 160 | for j in indexes { |
| 161 | let j = *j; |
| 162 | if j < second_start { |
| 163 | continue; |
| 164 | }; |
| 165 | if j >= second_end { |
| 166 | break; |
| 167 | }; |
| 168 | let mut size = 0; |
| 169 | if j > 0 { |
| 170 | if let Some(k) = j2len.get(&(j - 1)) { |
| 171 | size = *k; |
| 172 | } |
| 173 | } |
| 174 | size += 1; |
| 175 | new_j2len.insert(j, size); |
| 176 | if size > best_size { |
| 177 | best_i = i + 1 - size; |
| 178 | best_j = j + 1 - size; |
| 179 | best_size = size; |
| 180 | } |
| 181 | } |
| 182 | } |
| 183 | j2len = new_j2len; |
| 184 | } |
| 185 | for _ in 0..2 { |
| 186 | while best_i > first_start |
| 187 | && best_j > second_start |
| 188 | && first_sequence.get(best_i - 1) == second_sequence.get(best_j - 1) |
| 189 | { |
| 190 | best_i -= 1; |
| 191 | best_j -= 1; |
| 192 | best_size += 1; |
| 193 | } |
| 194 | while best_i + best_size < first_end |
| 195 | && best_j + best_size < second_end |
| 196 | && first_sequence.get(best_i + best_size) == second_sequence.get(best_j + best_size) |
| 197 | { |
| 198 | best_size += 1; |
| 199 | } |
| 200 | } |
| 201 | Match::new(best_i, best_j, best_size) |
| 202 | } |
| 203 | |
| 204 | pub fn get_matching_blocks(&mut self) -> Vec<Match> { |
| 205 | if self.matching_blocks.as_ref().is_some() { |
| 206 | return self.matching_blocks.as_ref().unwrap().clone(); |
| 207 | } |
| 208 | let (first_length, second_length) = (self.first_sequence.len(), self.second_sequence.len()); |
| 209 | let mut matches = Vec::new(); |
| 210 | let mut queue = vec![(0, first_length, 0, second_length)]; |
| 211 | while !queue.is_empty() { |
| 212 | let (first_start, first_end, second_start, second_end) = queue.pop().unwrap(); |
| 213 | let m = self.find_longest_match(first_start, first_end, second_start, second_end); |
| 214 | match m.size { |
| 215 | 0 => {} |
| 216 | _ => { |
| 217 | if first_start < m.first_start && second_start < m.second_start { |
| 218 | queue.push((first_start, m.first_start, second_start, m.second_start)); |
| 219 | } |
| 220 | if m.first_start + m.size < first_end && m.second_start + m.size < second_end { |
| 221 | queue.push(( |
| 222 | m.first_start + m.size, |
| 223 | first_end, |
| 224 | m.second_start + m.size, |
| 225 | second_end, |
| 226 | )); |
| 227 | } |
| 228 | matches.push(m); |
| 229 | } |
| 230 | } |
| 231 | } |
| 232 | matches.sort_by(|a, b| a.cmp(b)); |
| 233 | let (mut first_start, mut second_start, mut size) = (0, 0, 0); |
| 234 | let mut non_adjacent = Vec::new(); |
| 235 | for m in &matches { |
| 236 | if first_start + size == m.first_start && second_start + size == m.second_start { |
| 237 | size += m.size |
| 238 | } else { |
| 239 | if size != 0 { |
| 240 | non_adjacent.push(Match::new(first_start, second_start, size)); |
| 241 | } |
| 242 | first_start = m.first_start; |
| 243 | second_start = m.second_start; |
| 244 | size = m.size; |
| 245 | } |
| 246 | } |
| 247 | if size != 0 { |
| 248 | non_adjacent.push(Match::new(first_start, second_start, size)); |
| 249 | } |
| 250 | non_adjacent.push(Match::new(first_length, second_length, 0)); |
| 251 | self.matching_blocks = Some(non_adjacent); |
| 252 | self.matching_blocks.as_ref().unwrap().clone() |
| 253 | } |
| 254 | |
| 255 | pub fn get_opcodes(&mut self) -> Vec<Opcode> { |
| 256 | if self.opcodes.as_ref().is_some() { |
| 257 | return self.opcodes.as_ref().unwrap().clone(); |
| 258 | } |
| 259 | let mut opcodes = Vec::new(); |
| 260 | let (mut i, mut j) = (0, 0); |
| 261 | for m in self.get_matching_blocks() { |
| 262 | let mut tag = String::new(); |
| 263 | if i < m.first_start && j < m.second_start { |
| 264 | tag = String::from("replace" ); |
| 265 | } else if i < m.first_start { |
| 266 | tag = String::from("delete" ); |
| 267 | } else if j < m.second_start { |
| 268 | tag = String::from("insert" ); |
| 269 | } |
| 270 | if !tag.is_empty() { |
| 271 | opcodes.push(Opcode::new(tag, i, m.first_start, j, m.second_start)); |
| 272 | } |
| 273 | i = m.first_start + m.size; |
| 274 | j = m.second_start + m.size; |
| 275 | if m.size != 0 { |
| 276 | opcodes.push(Opcode::new( |
| 277 | String::from("equal" ), |
| 278 | m.first_start, |
| 279 | i, |
| 280 | m.second_start, |
| 281 | j, |
| 282 | )); |
| 283 | } |
| 284 | } |
| 285 | self.opcodes = Some(opcodes); |
| 286 | self.opcodes.as_ref().unwrap().clone() |
| 287 | } |
| 288 | |
| 289 | pub fn get_grouped_opcodes(&mut self, n: usize) -> Vec<Vec<Opcode>> { |
| 290 | let mut res = Vec::new(); |
| 291 | let mut codes = self.get_opcodes(); |
| 292 | if codes.is_empty() { |
| 293 | codes.push(Opcode::new("equal" .to_string(), 0, 1, 0, 1)); |
| 294 | } |
| 295 | |
| 296 | if codes.first().unwrap().tag == "equal" { |
| 297 | let opcode = codes.first_mut().unwrap(); |
| 298 | opcode.first_start = max(opcode.first_start, opcode.first_end.saturating_sub(n)); |
| 299 | opcode.second_start = max(opcode.second_start, opcode.second_end.saturating_sub(n)); |
| 300 | } |
| 301 | if codes.last().unwrap().tag == "equal" { |
| 302 | let opcode = codes.last_mut().unwrap(); |
| 303 | opcode.first_end = min(opcode.first_start + n, opcode.first_end); |
| 304 | opcode.second_end = min(opcode.second_start + n, opcode.second_end); |
| 305 | } |
| 306 | let nn = n + n; |
| 307 | let mut group = Vec::new(); |
| 308 | for code in &codes { |
| 309 | let (mut first_start, mut second_start) = (code.first_start, code.second_start); |
| 310 | if code.tag == "equal" && code.first_end - code.first_start > nn { |
| 311 | group.push(Opcode::new( |
| 312 | code.tag.clone(), |
| 313 | code.first_start, |
| 314 | min(code.first_end, code.first_start + n), |
| 315 | code.second_start, |
| 316 | min(code.second_end, code.second_start + n), |
| 317 | )); |
| 318 | res.push(group.clone()); |
| 319 | group.clear(); |
| 320 | first_start = max(first_start, code.first_end.saturating_sub(n)); |
| 321 | second_start = max(second_start, code.second_end.saturating_sub(n)); |
| 322 | } |
| 323 | group.push(Opcode::new( |
| 324 | code.tag.clone(), |
| 325 | first_start, |
| 326 | code.first_end, |
| 327 | second_start, |
| 328 | code.second_end, |
| 329 | )); |
| 330 | } |
| 331 | if !(group.len() == 1 && group.first().unwrap().tag == "equal" ) || group.is_empty() { |
| 332 | res.push(group.clone()); |
| 333 | } |
| 334 | res |
| 335 | } |
| 336 | |
| 337 | pub fn ratio(&mut self) -> f32 { |
| 338 | let matches = self.get_matching_blocks() |
| 339 | .iter() |
| 340 | .fold(0, |res, &m| res + m.size); |
| 341 | calculate_ratio( |
| 342 | matches, |
| 343 | self.first_sequence.len() + self.second_sequence.len(), |
| 344 | ) |
| 345 | } |
| 346 | } |
| 347 | |