diff --git a/crates/differ/README.md b/crates/differ/README.md index 30b9d0c..459cc52 100644 --- a/crates/differ/README.md +++ b/crates/differ/README.md @@ -3,7 +3,7 @@ [![CI](https://github.com/nlp-rs/differ.rs/actions/workflows/main.yml/badge.svg)](https://github.com/nlp-rs/differ.rs/actions/workflows/main.yml) [![Security audit](https://github.com/nlp-rs/differ.rs/actions/workflows/security-audit.yml/badge.svg)](https://github.com/nlp-rs/differ.rs/actions/workflows/security-audit.yml) > warning: **Differ.rs is currently experimental** -This crate provides edit distance, delta vectors between 2 words, and lets you apply delta vectors in order to transform words. +This crate provides edit distance, deltas between 2 words, and lets you apply deltas in order to transform words. ## Install ```shell @@ -16,40 +16,38 @@ differ-rs = "0.0.0" ``` ## Features -* `apply_diff`: Allows users to apply delta vectors in order to transform a words. -* `extra_traits`: all `struct`s implemented in `differ-rs` are `HammingDistance` and `LevenshteinDistance`. Each Struct implements the `diff` and `distance` methods. - +* `Diff` struct: Contains a `Box<[StringDiffOp]>` of operations between two strings. Also keeps track of the length of the longer string. Its `distance()` method returns the edit distance between the two words, and its public `ops` field exposes the delta operations. +* `apply_diff()`: Allows users to apply deltas in order to transform a word. +* `levenshtein()`: Returns a `Diff` struct between string 1 and string 2. The Levenshtein algorithm can detect insertions, deletions, and substitutions. +* `hamming()`: Returns a `Diff` struct between string 1 and string 2. The Hamming algorithm can only detect substitutions, and string 1 and string 2 must be of equal length. + ## How it works -* `apply_diff` works by giving a string and a transformation vector to the method. Then the transformation vector is applied to the string given in the first argument. 
-* `StringDiffAlgorithm` provides two methods `diff` which gives you a transformation vector from the first to second string. The `distance` method gives you the edit distance from the frist argument to the second argument. The structs `HammingDistance` and `LevenshteinDistance` have their own implementations for each method. +* `apply_diff()` works by giving a string and a transformation vector to the method. Then the transformation vector is applied to the string given in the first argument. +* `Diff` holds a `Box<[StringDiffOp]>` of operations and the length of the longer of the two strings. Both `levenshtein()` and `hamming()` return this struct. ## Examples Getting the edit distance between two words using Levenshtein algorithm ```rs -use differ_rs::{LevenshteinDistance, StringDiffAlgorithm}; +use differ_rs::levenshtein; fn main(){ - let my_levensthein = LevenshteinDistance {}; + let levensthein_edit_distance = levenshtein("Sitting", "Kitten").distance(); - let edit_distance = my_levensthein.distance("Sitting", "Kitten"); - - assert_eq!(3, edit_distance) + assert_eq!(3, levensthein_edit_distance); } ``` > **Note**: We are getting the edit distance to get from "Sitting" to "Kitten". 
-Getting the delta vectors between two words using Levenshtein algorithm +To view the delta between two words using Levenshtein algorithm ```rs -use differ_rs::{LevenshteinDistance, StringDiffAlgorithm}; +use differ_rs::levenshtein; fn main(){ - let my_levensthein = LevenshteinDistance {}; + let my_levensthein = levenshtein("Sitting", "Kitten"); - let delta_vec = my_levensthein.diff("Sitting", "Kitten"); - - for i in delta_vec.iter(){ - println!("{:?}", i); + for diff_op in my_levensthein.ops.iter() { + println!("{:?}", diff_op); } } ``` @@ -57,40 +55,36 @@ fn main(){ This example outputs: ```text -StringDiffOp { kind: Delete('g'), index: 6 } +StringDiffOp { kind: Delete, index: 6 } StringDiffOp { kind: Substitute('i', 'e'), index: 4 } StringDiffOp { kind: Substitute('S', 'K'), index: 0 } ``` Getting the edit distance between two words using Hamming algorithm ```rs -use differ_rs::{HammingDistance, StringDiffAlgorithm}; +use differ_rs::hamming; fn main(){ - let my_hamming = HammingDistance {}; + let kathrin_edit_distance = hamming("karolin", "kathrin").distance(); - let edit_distance = my_hamming.distance("karolin", "kathrin"); - - assert_eq!(3, edit_distance); + assert_eq!(3, kathrin_edit_distance); } ``` -Note: We are getting the edit distance to get from "karolin" to "kathrin", +> **Note**: We are getting the edit distance to get from "karolin" to "kathrin", additionally the first string and second string must be the same length, or will cause a panic to be triggered. 
-Getting the delta vectors between two words using Hamming algorithm +Getting the deltas between two words using Hamming algorithm ```rs -use differ_rs::{HammingDistance, StringDiffAlgorithm}; +use differ_rs::hamming; fn main(){ - let my_hamming = HammingDistance {}; + let kathrin_edit_distance = hamming("karolin", "kathrin"); - let delta_vec = my_hamming.diff("karolin", "kathrin"); - - for i in delta_vec.iter(){ - println!("{:?}", i); - } + for diff_op in kathrin_edit_distance.ops.iter() { + println!("{:?}", diff_op); + } } ``` This example outputs: @@ -101,18 +95,17 @@ StringDiffOp { kind: Substitute('o', 'h'), index: 3 } StringDiffOp { kind: Substitute('l', 'r'), index: 4 } ``` -Applying delta vectors to words +Applying deltas to words ```rs -use differ_rs::{HammingDistance, LevenshteinDistance, StringDiffAlgorithm,apply_diff}; +use differ_rs::{hamming, levenshtein, apply_diff}; fn main(){ - let my_levensthein = LevenshteinDistance {}; - let levensthein_delta_vec = my_levensthein.diff("sitting", "kitten"); - let delta_applied_v1 = apply_diff("sitting", levensthein_delta_vec); + let my_levensthein = levenshtein("sitting", "kitten"); + let delta_applied_v1 = apply_diff("sitting", my_levensthein.ops.to_vec()); + - let my_hamming = HammingDistance {}; - let hamming_delta_vec = my_hamming.diff("karolin", "kathrin"); - let delta_applied_v2 = apply_diff("karolin", hamming_delta_vec); + let my_hamming = hamming("karolin", "kathrin"); + let delta_applied_v2 = apply_diff("karolin", my_hamming.ops.to_vec()); assert_eq!("kitten", delta_applied_v1); assert_eq!("kathrin", delta_applied_v2); diff --git a/crates/differ/src/apply_diff.rs b/crates/differ/src/apply_diff.rs index 9fd06da..16f1ec6 100644 --- a/crates/differ/src/apply_diff.rs +++ b/crates/differ/src/apply_diff.rs @@ -37,47 +37,50 @@ mod tests { #[test] fn test_apply_diffs() { - let test_vec: Vec = vec![ + let test_box: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('g', 6), 
StringDiffOp::new_substitute('e', 'i', 4), StringDiffOp::new_substitute('k', 's', 0), - ]; + ]); - let test_vec_2: Vec = vec![ + let test_box_2: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_substitute('r', 'n', 4), StringDiffOp::new_delete(2), StringDiffOp::new_delete(1), - ]; + ]); - let test_vec_3: Vec = vec![ + let test_box_3: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('S', 5), StringDiffOp::new_delete(1), StringDiffOp::new_delete(0), - ]; + ]); - let test_vec_4 = vec![ + let test_box_4: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('e', 1), StringDiffOp::new_insert('o', 3), - ]; + ]); - let test_vec_5 = vec![ + let test_box_5: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('r', 4), StringDiffOp::new_insert('s', 0), - ]; + ]); assert_eq!( String::from("sitting"), - super::apply_diff("kitten", test_vec) + super::apply_diff("kitten", test_box.to_vec()) ); assert_eq!( String::from("Sunday"), - super::apply_diff("Saturday", test_vec_2) + super::apply_diff("Saturday", test_box_2.to_vec()) ); - assert_eq!(String::from("SETS"), super::apply_diff("RESET", test_vec_3)); - assert_eq!(String::from("heeoy"), super::apply_diff("hey", test_vec_4)); + assert_eq!( + String::from("SETS"), + super::apply_diff("RESET", test_box_3.to_vec()) + ); + assert_eq!(String::from("heeoy"), super::apply_diff("hey", test_box_4.to_vec())); assert_eq!( String::from("skater"), - super::apply_diff("kate", test_vec_5) + super::apply_diff("kate", test_box_5.to_vec()) ); } } diff --git a/crates/differ/src/hamming.rs b/crates/differ/src/hamming.rs index 0e66036..21b8983 100644 --- a/crates/differ/src/hamming.rs +++ b/crates/differ/src/hamming.rs @@ -1,84 +1,82 @@ -use crate::{StringDiffAlgorithm, StringDiffOp}; +use crate::{Diff, StringDiffOp}; use std::iter::zip; -pub struct HammingDistance {} -impl StringDiffAlgorithm for HammingDistance { - fn diff<'a>(&self, s1: &'a str, s2: &'a str) -> Vec { - if s1.len() != s2.len() { - panic!("Strings must be same 
length"); - } +pub fn hamming<'a>(s1: &'a str, s2: &'a str) -> Diff { + if s1.len() != s2.len() { + panic!("Strings must be same length"); + } - let mut opp_vec: Vec = Vec::new(); - let iter = zip(s1.chars(), s2.chars()); + let mut opp_vec: Vec = Vec::new(); + let iter = zip(s1.chars(), s2.chars()); - for (i, (char1, char2)) in iter.enumerate() { - if char1 != char2 { - opp_vec.push(StringDiffOp::new_substitute(char1, char2, i)); - } + for (i, (char1, char2)) in iter.enumerate() { + if char1 != char2 { + opp_vec.push(StringDiffOp::new_substitute(char1, char2, i)); } - opp_vec - } - - fn distance<'a>(&self, s1: &'a str, s2: &'a str) -> usize { - self.diff(s1, s2).len() } + Diff::new(opp_vec, s1.len()) } #[cfg(test)] mod tests { - use crate::{StringDiffAlgorithm, StringDiffOp}; - - #[test] - fn test_hamming_distance_edit_distance() { - let test_struct = super::HammingDistance {}; - - assert_eq!(3, test_struct.distance("karolin", "kathrin")); - assert_eq!(3, test_struct.distance("karolin", "kerstin")); - assert_eq!(4, test_struct.distance("kathrin", "kerstin")); - assert_eq!(4, test_struct.distance("0000", "1111")); - assert_eq!(3, test_struct.distance("2173896", "2233796")); - } + use crate::StringDiffOp; #[test] fn test_hamming_distance_op_distance() { - let test_struct = super::HammingDistance {}; + use crate::hamming::hamming; + use crate::Diff; - let test_vec: Vec = vec![ - StringDiffOp::new_substitute('r', 't', 2), - StringDiffOp::new_substitute('o', 'h', 3), - StringDiffOp::new_substitute('l', 'r', 4), - ]; + let test_diff = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('r', 't', 2), + StringDiffOp::new_substitute('o', 'h', 3), + StringDiffOp::new_substitute('l', 'r', 4), + ]), + total_len: 7, + }; - let test_vec_2: Vec = vec![ - StringDiffOp::new_substitute('a', 'e', 1), - StringDiffOp::new_substitute('o', 's', 3), - StringDiffOp::new_substitute('l', 't', 4), - ]; + let test_diff_2 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('a', 'e', 
1), + StringDiffOp::new_substitute('o', 's', 3), + StringDiffOp::new_substitute('l', 't', 4), + ]), + total_len: 7, + }; - let test_vec_3: Vec = vec![ - StringDiffOp::new_substitute('a', 'e', 1), - StringDiffOp::new_substitute('t', 'r', 2), - StringDiffOp::new_substitute('h', 's', 3), - StringDiffOp::new_substitute('r', 't', 4), - ]; + let test_diff_3 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('a', 'e', 1), + StringDiffOp::new_substitute('t', 'r', 2), + StringDiffOp::new_substitute('h', 's', 3), + StringDiffOp::new_substitute('r', 't', 4), + ]), + total_len: 7, + }; - let test_vec_4: Vec = vec![ - StringDiffOp::new_substitute('0', '1', 0), - StringDiffOp::new_substitute('0', '1', 1), - StringDiffOp::new_substitute('0', '1', 2), - StringDiffOp::new_substitute('0', '1', 3), - ]; + let test_diff_4 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('0', '1', 0), + StringDiffOp::new_substitute('0', '1', 1), + StringDiffOp::new_substitute('0', '1', 2), + StringDiffOp::new_substitute('0', '1', 3), + ]), + total_len: 4, + }; - let test_vec_5: Vec = vec![ - StringDiffOp::new_substitute('1', '2', 1), - StringDiffOp::new_substitute('7', '3', 2), - StringDiffOp::new_substitute('8', '7', 4), - ]; + let test_diff_5 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('1', '2', 1), + StringDiffOp::new_substitute('7', '3', 2), + StringDiffOp::new_substitute('8', '7', 4), + ]), + total_len: 7, + }; - assert_eq!(&test_vec, &test_struct.diff("karolin", "kathrin")); - assert_eq!(&test_vec_2, &test_struct.diff("karolin", "kerstin")); - assert_eq!(&test_vec_3, &test_struct.diff("kathrin", "kerstin")); - assert_eq!(&test_vec_4, &test_struct.diff("0000", "1111")); - assert_eq!(&test_vec_5, &test_struct.diff("2173896", "2233796")); + assert_eq!(test_diff, hamming("karolin", "kathrin")); + assert_eq!(test_diff_2, hamming("karolin", "kerstin")); + assert_eq!(test_diff_3, hamming("kathrin", "kerstin")); + assert_eq!(test_diff_4, hamming("0000", "1111")); + 
assert_eq!(test_diff_5, hamming("2173896", "2233796")); } } diff --git a/crates/differ/src/levenshtein.rs b/crates/differ/src/levenshtein.rs index 81c62ce..1540824 100644 --- a/crates/differ/src/levenshtein.rs +++ b/crates/differ/src/levenshtein.rs @@ -1,207 +1,169 @@ -use crate::{StringDiffAlgorithm, StringDiffOp}; +use crate::{get_operation_matrix, Diff, StringDiffOp}; use core::panic; -pub struct LevenshteinDistance {} - -impl LevenshteinDistance { - /// At a given (x,y) we must choose the minimum value between a cells - /// Top, Left, and Diagonal value. Depending on which cell is chosen between - /// the three it will tell us if its a deletion, insertion or substitution operation. - /// if we chooze x(The value above the cell) as the min value its a insertion operation (symbolized by '^') - /// if we choose y(The value left of the cell) as the min value its a deletion operation(symbolized by '<') - /// if we choose z(The value diagnal of the cell) as the min value its a substitution operation( sybmolized by '\' ) - /// we should always return either x,y,z if somehow we dont we panic with the unrechable macro. 
- pub(crate) fn min_dist_with_dir(x: usize, y: usize, z: usize) -> (usize, char) { - if x <= y && x <= z { - return (x, '^'); - } - if y <= x && y <= z { - return (y, '<'); - } - if z <= x && z <= y { - return (z, '\\'); - } - unreachable!() +pub(crate) fn reverse_vec_and_indexes(my_vec: &mut Vec, mut top_string_len: usize) { + my_vec.reverse(); + for i in my_vec.iter_mut() { + i.index = top_string_len; + top_string_len += 1; } +} - pub(crate) fn reverse_vec_and_indexes( - my_vec: &mut Vec, - mut top_string_len: usize, - ) { - my_vec.reverse(); - for i in my_vec.iter_mut() { - i.index = top_string_len; - top_string_len += 1; +pub(crate) fn get_operations( + my_opp: &Vec>, + left_string: &str, + top_string: &str, +) -> Vec { + let mut diff_ops: Vec = Vec::new(); + let mut top_str_len = top_string.len(); + let mut left_str_len = left_string.len(); + let mut prev_char: char = ' '; + + loop { + if top_str_len == 0 && left_str_len == 0 { + break; } - } - pub(crate) fn get_operations( - my_opp: &Vec>, - left_string: &str, - top_string: &str, - ) -> Vec { - let mut diff_ops: Vec = Vec::new(); - let mut top_str_len = top_string.len(); - let mut left_str_len = left_string.len(); - let mut prev_char: char = ' '; - - loop { - if top_str_len == 0 && left_str_len == 0 { - break; + //Rows Columns + match my_opp[left_str_len][top_str_len] { + //insertion + '^' => { + let insertion_op = + StringDiffOp::new_insert(left_string.chars().nth(left_str_len - 1).unwrap(), 0); + + left_str_len -= 1; + diff_ops.push(insertion_op); + prev_char = '^'; } + //substitution + '\\' => { + if prev_char == '^' { + reverse_vec_and_indexes(&mut diff_ops, top_str_len); + } - //Rows Columns - match my_opp[left_str_len][top_str_len] { - //insertion - '^' => { - let insertion_op = StringDiffOp::new_insert( + if left_string.chars().nth(left_str_len - 1).unwrap() + != top_string.chars().nth(top_str_len - 1).unwrap() + { + let substitution_op = StringDiffOp::new_substitute( + 
top_string.chars().nth(top_str_len - 1).unwrap(), left_string.chars().nth(left_str_len - 1).unwrap(), - 0, + top_str_len - 1, ); - left_str_len -= 1; - diff_ops.push(insertion_op); - prev_char = '^'; + diff_ops.push(substitution_op); } - //substitution - '\\' => { - if prev_char == '^' { - Self::reverse_vec_and_indexes(&mut diff_ops, top_str_len); - } - - if left_string.chars().nth(left_str_len - 1).unwrap() - != top_string.chars().nth(top_str_len - 1).unwrap() - { - let substitution_op = StringDiffOp::new_substitute( - top_string.chars().nth(top_str_len - 1).unwrap(), - left_string.chars().nth(left_str_len - 1).unwrap(), - top_str_len - 1, - ); - - diff_ops.push(substitution_op); - } - left_str_len -= 1; - top_str_len -= 1; - prev_char = '\\'; + left_str_len -= 1; + top_str_len -= 1; + prev_char = '\\'; + } + //deletion + '<' => { + if prev_char == '^' { + reverse_vec_and_indexes(&mut diff_ops, top_str_len) } - //deletion - '<' => { - if prev_char == '^' { - Self::reverse_vec_and_indexes(&mut diff_ops, top_str_len) - } - let deletion_op = StringDiffOp::new_delete(top_str_len - 1); + let deletion_op = StringDiffOp::new_delete(top_str_len - 1); - top_str_len -= 1; - diff_ops.push(deletion_op); - prev_char = '<'; - } - _ => { - panic!("UNRECOGNIZED SYMBOL OPERATION !") - } + top_str_len -= 1; + diff_ops.push(deletion_op); + prev_char = '<'; + } + _ => { + panic!("UNRECOGNIZED SYMBOL OPERATION !") } } - - diff_ops } - pub(crate) fn get_operation_matrix(s1: &str, s2: &str) -> Vec> { - let first_string_len: usize = s1.len(); - let second_string_len: usize = s2.len(); - - let mut dist_vector = vec![vec![0usize; first_string_len + 1]; second_string_len + 1]; - let mut dir_vector: Vec> = - vec![vec![' '; first_string_len + 1]; second_string_len + 1]; - - for i in 0..first_string_len + 1 { - dist_vector[0][i] = i; - } - for j in 0..second_string_len + 1 { - dist_vector[j][0] = j; - } - - dir_vector[0][0] = '\\'; - for j in 1..second_string_len + 1 { - dir_vector[j][0] = 
'^'; - } - for i in 1..first_string_len + 1 { - dir_vector[0][i] = '<'; - } + diff_ops +} - let mut sub_cost: usize = 0; - for i in 1..second_string_len + 1 { - for j in 1..first_string_len + 1 { - if s1.chars().nth(j - 1).unwrap() == s2.chars().nth(i - 1).unwrap() { - sub_cost = 0; - } else { - sub_cost = 1; - } - (dist_vector[i][j], dir_vector[i][j]) = LevenshteinDistance::min_dist_with_dir( - dist_vector[i - 1][j] + 1, //deletion - dist_vector[i][j - 1] + 1, //insertion - dist_vector[i - 1][j - 1] + sub_cost, - ); //substitution - } - } - dir_vector +/// At a given (x,y) we must choose the minimum value between a cells +/// Top, Left, and Diagonal value. Depending on which cell is chosen between +/// the three it will tell us if its a deletion, insertion or substitution operation. +/// if we chooze x(The value above the cell) as the min value its a insertion operation (symbolized by '^') +/// if we choose y(The value left of the cell) as the min value its a deletion operation(symbolized by '<') +/// if we choose z(The value diagnal of the cell) as the min value its a substitution operation( sybmolized by '\' ) +/// we should always return either x,y,z if somehow we dont we panic with the unrechable macro. 
+pub(crate) fn min_dist_with_dir(x: isize, y: isize, z: isize) -> (isize, char) { + if x <= y && x <= z { + return (x, '^'); + } + if y <= x && y <= z { + return (y, '<'); + } + if z <= x && z <= y { + return (z, '\\'); } + unreachable!() } -impl StringDiffAlgorithm for LevenshteinDistance { - fn diff<'a>(&self, s1: &'a str, s2: &'a str) -> Vec { - let dir_matrix = LevenshteinDistance::get_operation_matrix(s1, s2); - LevenshteinDistance::get_operations(&dir_matrix, s2, s1) +pub(crate) fn my_init_vec(my_vec: &mut Vec>, top_str_len: usize, left_str_len: usize) { + for i in 0..top_str_len { + my_vec[0][i] = i as isize; } - - fn distance<'a>(&self, s1: &'a str, s2: &'a str) -> usize { - let dir_matrix = LevenshteinDistance::get_operation_matrix(s1, s2); - LevenshteinDistance::get_operations(&dir_matrix, s2, s1).len() + for j in 0..left_str_len { + my_vec[j][0] = j as isize; } } -#[cfg(test)] -mod tests { - use crate::{StringDiffAlgorithm, StringDiffOp}; +pub fn levenshtein<'a>(s1: &'a str, s2: &'a str) -> Diff { + let dir_matrix = get_operation_matrix(s1, s2, min_dist_with_dir, my_init_vec, 0, 1, 1); + let temp = get_operations(&dir_matrix, s2, s1).clone(); + let val: usize = if s1.len() >= s2.len() { + s1.len() + } else { + s2.len() + }; - #[test] - fn test_levenshtein_distance_edit_distance() { - let test_struct = super::LevenshteinDistance {}; + Diff::new(temp, val) +} - assert_eq!(3, test_struct.distance("reset", "sets")); - assert_eq!(3, test_struct.distance("kitten", "sitting")); - assert_eq!(3, test_struct.distance("Saturday", "Sunday")); - } +#[cfg(test)] +mod tests { #[test] fn test_levenshtein_distance_op_distance() { - let test_struct = super::LevenshteinDistance {}; - - let test_vec: Vec = vec![ - super::StringDiffOp::new_insert('g', 6), - super::StringDiffOp::new_substitute('e', 'i', 4), - super::StringDiffOp::new_substitute('k', 's', 0), - ]; - - let test_vec_2: Vec = vec![ - super::StringDiffOp::new_substitute('r', 'n', 4), - 
super::StringDiffOp::new_delete(2), - super::StringDiffOp::new_delete(1), - ]; - - let test_vec_3: Vec = vec![ - super::StringDiffOp::new_insert('S', 5), - super::StringDiffOp::new_delete(1), - super::StringDiffOp::new_delete(0), - ]; - - let test_vec_4: Vec = vec![ - super::StringDiffOp::new_insert('E', 5), - super::StringDiffOp::new_insert('R', 6), - ]; - - assert_eq!(&test_vec, &test_struct.diff("kitten", "sitting")); - assert_eq!(&test_vec_2, &test_struct.diff("Saturday", "Sunday")); - assert_eq!(&test_vec_3, &test_struct.diff("RESET", "SETS")); - assert_eq!(&test_vec_4, &test_struct.diff("RESET", "RESETER")); + use crate::levenshtein::levenshtein; + use crate::Diff; + + let test_diff = Diff { + ops: Box::new([ + super::StringDiffOp::new_insert('g', 6), + super::StringDiffOp::new_substitute('e', 'i', 4), + super::StringDiffOp::new_substitute('k', 's', 0), + ]), + total_len: 7, + }; + + let test_diff_2 = Diff { + ops: Box::new([ + super::StringDiffOp::new_substitute('r', 'n', 4), + super::StringDiffOp::new_delete(2), + super::StringDiffOp::new_delete(1), + ]), + total_len: 8, + }; + + let test_diff_3 = Diff { + ops: Box::new([ + super::StringDiffOp::new_insert('S', 5), + super::StringDiffOp::new_delete(1), + super::StringDiffOp::new_delete(0), + ]), + total_len: 5, + }; + + let test_diff_4 = Diff { + ops: Box::new([ + super::StringDiffOp::new_insert('E', 5), + super::StringDiffOp::new_insert('R', 6), + ]), + total_len: 7, + }; + + assert_eq!(test_diff, levenshtein("kitten", "sitting")); + assert_eq!(test_diff_2, levenshtein("Saturday", "Sunday")); + assert_eq!(test_diff_3, levenshtein("RESET", "SETS")); + assert_eq!(test_diff_4, levenshtein("RESET", "RESETER")); } } diff --git a/crates/differ/src/lib.rs b/crates/differ/src/lib.rs index 6eee171..56e62ec 100644 --- a/crates/differ/src/lib.rs +++ b/crates/differ/src/lib.rs @@ -1,13 +1,12 @@ #![doc = include_str!("../README.md")] - mod apply_diff; pub use crate::apply_diff::apply_diff; mod hamming; -pub use 
crate::hamming::HammingDistance; +pub use crate::hamming::hamming; mod levenshtein; -pub use crate::levenshtein::LevenshteinDistance; +pub use crate::levenshtein::levenshtein; -#[derive(PartialEq, Eq, Debug)] +#[derive(PartialEq, Eq, Debug, Clone)] pub enum StringDiffOpKind { Substitute(char, char), Insert(char), @@ -15,7 +14,7 @@ pub enum StringDiffOpKind { Transpose, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct StringDiffOp { pub kind: StringDiffOpKind, pub index: usize, @@ -39,7 +38,69 @@ impl StringDiffOp { } } -pub trait StringDiffAlgorithm { - fn diff<'a>(&self, s1: &'a str, s2: &'a str) -> Vec; - fn distance<'a>(&self, s1: &'a str, s2: &'a str) -> usize; +#[derive(Debug, PartialEq, Eq)] +pub struct Diff { + pub ops: Box<[StringDiffOp]>, + pub total_len: usize, +} + +impl Diff { + pub fn new(diffs: Vec, total_len: usize) -> Self { + Self { + ops: diffs.into_boxed_slice(), + total_len: total_len, + } + } + + pub fn distance(&self) -> usize { + self.ops.len() + } +} + +pub(crate) fn get_operation_matrix( + s1: &str, + s2: &str, + dist_with_dir: fn(isize, isize, isize) -> (isize, char), + init_vec: fn(&mut Vec>, usize, usize), + char_match: isize, + not_char_match: isize, + indent_cost: isize, +) -> Vec> { + let first_string_len: usize = s1.len(); + let second_string_len: usize = s2.len(); + + let mut dist_vector = vec![vec![0isize; first_string_len + 1]; second_string_len + 1]; + let mut dir_vector: Vec> = + vec![vec![' '; first_string_len + 1]; second_string_len + 1]; + + init_vec( + &mut dist_vector, + first_string_len + 1, + second_string_len + 1, + ); + + dir_vector[0][0] = '\\'; + for j in 1..second_string_len + 1 { + dir_vector[j][0] = '^'; + } + for i in 1..first_string_len + 1 { + dir_vector[0][i] = '<'; + } + + for i in 1..second_string_len + 1 { + for j in 1..first_string_len + 1 { + let diagonal_gap_cost: isize; + if s1.chars().nth(j - 1).unwrap() == s2.chars().nth(i - 1).unwrap() { + diagonal_gap_cost = 
char_match; + } else { + diagonal_gap_cost = not_char_match; + } + (dist_vector[i][j], dir_vector[i][j]) = dist_with_dir( + dist_vector[i - 1][j] + indent_cost, //deletion + dist_vector[i][j - 1] + indent_cost, //insertion + dist_vector[i - 1][j - 1] + diagonal_gap_cost, + ); //substitution + } + } + dir_vector }