From 8e7d8922707b33aca6e0ab8c73fb223c77fd68ba Mon Sep 17 00:00:00 2001 From: Alfredo Date: Fri, 6 Jan 2023 10:51:37 -0600 Subject: [PATCH 1/2] refactor: better architecture for differ library fixes #36 --- crates/differ/README.md | 71 ++++--- crates/differ/src/apply_diff.rs | 35 ++-- crates/differ/src/diff.rs | 26 +++ crates/differ/src/diff_score.rs | 41 ++++ crates/differ/src/hamming.rs | 125 +++++++------ crates/differ/src/levenshtein.rs | 309 ++++++++++++++----------------- crates/differ/src/lib.rs | 62 ++++++- 7 files changed, 369 insertions(+), 300 deletions(-) create mode 100644 crates/differ/src/diff.rs create mode 100644 crates/differ/src/diff_score.rs diff --git a/crates/differ/README.md b/crates/differ/README.md index 30b9d0c..5f75a13 100644 --- a/crates/differ/README.md +++ b/crates/differ/README.md @@ -3,7 +3,7 @@ [![CI](https://github.com/nlp-rs/differ.rs/actions/workflows/main.yml/badge.svg)](https://github.com/nlp-rs/differ.rs/actions/workflows/main.yml) [![Security audit](https://github.com/nlp-rs/differ.rs/actions/workflows/security-audit.yml/badge.svg)](https://github.com/nlp-rs/differ.rs/actions/workflows/security-audit.yml) > warning: **Differ.rs is currently experimental** -This crate provides edit distance, delta vectors between 2 words, and lets you apply delta vectors in order to transform words. +This crate provides edit distance, deltas between 2 words, and lets you apply deltas in order to transform words. ## Install ```shell @@ -16,82 +16,73 @@ differ-rs = "0.0.0" ``` ## Features -* `apply_diff`: Allows users to apply delta vectors in order to transform a words. -* `extra_traits`: all `struct`s implemented in `differ-rs` are `HammingDistance` and `LevenshteinDistance`. Each Struct implements the `diff` and `distance` methods. +* `apply_diff` function: Allows users to apply deltas in order to transform a words. +* `Diff` struct: Contains a Box<> of operations between two strings. Also keeps track of length of longest string. 
Has methods that allows users to get the edit distance between two words, and view delta operations. +* `levenshtein` function: Returns a Diff struct between string 1 and string 2 using levenshtein algorithm. +* `hamming` function: Returns a Diff struct between string 1 and string 2 hamming algorithm. ## How it works * `apply_diff` works by giving a string and a transformation vector to the method. Then the transformation vector is applied to the string given in the first argument. -* `StringDiffAlgorithm` provides two methods `diff` which gives you a transformation vector from the first to second string. The `distance` method gives you the edit distance from the frist argument to the second argument. The structs `HammingDistance` and `LevenshteinDistance` have their own implementations for each method. +* `Diff` works by hodling a Box<> of operations, and longest length between any two strings. Both the `levenshtein`, `hamming` algorithm return this struct. ## Examples Getting the edit distance between two words using Levenshtein algorithm ```rs -use differ_rs::{LevenshteinDistance, StringDiffAlgorithm}; +use differ_rs::levenshtein; fn main(){ - let my_levensthein = LevenshteinDistance {}; + let levensthein_edit_distance = levenshtein("Sitting", "Kitten").distance(); - let edit_distance = my_levensthein.distance("Sitting", "Kitten"); - - assert_eq!(3, edit_distance) + assert_eq!(3, levensthein_edit_distance); } ``` > **Note**: We are getting the edit distance to get from "Sitting" to "Kitten". 
-Getting the delta vectors between two words using Levenshtein algorithm +To view the delta between two words using Levenshtein algorithm ```rs -use differ_rs::{LevenshteinDistance, StringDiffAlgorithm}; +use differ_rs::levenshtein; fn main(){ - let my_levensthein = LevenshteinDistance {}; + let my_levensthein = levenshtein("Sitting", "Kitten"); - let delta_vec = my_levensthein.diff("Sitting", "Kitten"); - - for i in delta_vec.iter(){ - println!("{:?}", i); - } + my_levensthein.operations(); } ``` This example outputs: ```text -StringDiffOp { kind: Delete('g'), index: 6 } +StringDiffOp { kind: Delete, index: 6 } StringDiffOp { kind: Substitute('i', 'e'), index: 4 } StringDiffOp { kind: Substitute('S', 'K'), index: 0 } ``` Getting the edit distance between two words using Hamming algorithm ```rs -use differ_rs::{HammingDistance, StringDiffAlgorithm}; +use differ_rs::hamming; fn main(){ - let my_hamming = HammingDistance {}; + let kathrin_edit_distance = hamming("karolin", "kathrin").distance(); - let edit_distance = my_hamming.distance("karolin", "kathrin"); - - assert_eq!(3, edit_distance); + assert_eq!(3, kathrin_edit_distance); } ``` -Note: We are getting the edit distance to get from "karolin" to "kathrin", +> **Note**: We are getting the edit distance to get from "karolin" to "kathrin", additionally the first string and second string must be the same length, or will cause a panic to be triggered. 
-Getting the delta vectors between two words using Hamming algorithm +Getting the deltas between two words using Hamming algorithm ```rs -use differ_rs::{HammingDistance, StringDiffAlgorithm}; +use differ_rs::hamming; fn main(){ - let my_hamming = HammingDistance {}; + let kathrin_edit_distance = hamming("karolin", "kathrin"); - let delta_vec = my_hamming.diff("karolin", "kathrin"); - - for i in delta_vec.iter(){ - println!("{:?}", i); - } + kathrin_edit_distance.operations(); } + ``` This example outputs: @@ -101,22 +92,22 @@ StringDiffOp { kind: Substitute('o', 'h'), index: 3 } StringDiffOp { kind: Substitute('l', 'r'), index: 4 } ``` -Applying delta vectors to words +Applying deltas to words ```rs -use differ_rs::{HammingDistance, LevenshteinDistance, StringDiffAlgorithm,apply_diff}; +use differ_rs::{hamming, levenshtein, apply_diff}; fn main(){ - let my_levensthein = LevenshteinDistance {}; - let levensthein_delta_vec = my_levensthein.diff("sitting", "kitten"); - let delta_applied_v1 = apply_diff("sitting", levensthein_delta_vec); + let my_levensthein = levenshtein("sitting", "kitten"); + let delta_applied_v1 = apply_diff("sitting", &my_levensthein.ops); + - let my_hamming = HammingDistance {}; - let hamming_delta_vec = my_hamming.diff("karolin", "kathrin"); - let delta_applied_v2 = apply_diff("karolin", hamming_delta_vec); + let my_hamming = hamming("karolin", "kathrin"); + let delta_applied_v2 = apply_diff("karolin", &my_hamming.ops); assert_eq!("kitten", delta_applied_v1); assert_eq!("kathrin", delta_applied_v2); } + ``` ## License diff --git a/crates/differ/src/apply_diff.rs b/crates/differ/src/apply_diff.rs index 9fd06da..d36b0ad 100644 --- a/crates/differ/src/apply_diff.rs +++ b/crates/differ/src/apply_diff.rs @@ -11,7 +11,7 @@ pub(crate) fn remove(start: usize, stop: usize, s: &str) -> String { result } -pub fn apply_diff(s: &str, diffs: Vec) -> String { +pub fn apply_diff(s: &str, diffs: &Box<[StringDiffOp]>) -> String { let mut new_string: String = 
s.into(); for i in diffs.iter() { @@ -37,47 +37,50 @@ mod tests { #[test] fn test_apply_diffs() { - let test_vec: Vec = vec![ + let test_box: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('g', 6), StringDiffOp::new_substitute('e', 'i', 4), StringDiffOp::new_substitute('k', 's', 0), - ]; + ]); - let test_vec_2: Vec = vec![ + let test_box_2: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_substitute('r', 'n', 4), StringDiffOp::new_delete(2), StringDiffOp::new_delete(1), - ]; + ]); - let test_vec_3: Vec = vec![ + let test_box_3: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('S', 5), StringDiffOp::new_delete(1), StringDiffOp::new_delete(0), - ]; + ]); - let test_vec_4 = vec![ + let test_box_4: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('e', 1), StringDiffOp::new_insert('o', 3), - ]; + ]); - let test_vec_5 = vec![ + let test_box_5: Box<[StringDiffOp]> = Box::new([ StringDiffOp::new_insert('r', 4), StringDiffOp::new_insert('s', 0), - ]; + ]); assert_eq!( String::from("sitting"), - super::apply_diff("kitten", test_vec) + super::apply_diff("kitten", &test_box) ); assert_eq!( String::from("Sunday"), - super::apply_diff("Saturday", test_vec_2) + super::apply_diff("Saturday", &test_box_2) ); - assert_eq!(String::from("SETS"), super::apply_diff("RESET", test_vec_3)); - assert_eq!(String::from("heeoy"), super::apply_diff("hey", test_vec_4)); + assert_eq!( + String::from("SETS"), + super::apply_diff("RESET", &test_box_3) + ); + assert_eq!(String::from("heeoy"), super::apply_diff("hey", &test_box_4)); assert_eq!( String::from("skater"), - super::apply_diff("kate", test_vec_5) + super::apply_diff("kate", &test_box_5) ); } } diff --git a/crates/differ/src/diff.rs b/crates/differ/src/diff.rs new file mode 100644 index 0000000..12f5608 --- /dev/null +++ b/crates/differ/src/diff.rs @@ -0,0 +1,26 @@ +use crate::StringDiffOp; + +#[derive(Debug, PartialEq)] +pub struct Diff { + pub ops: Box<[StringDiffOp]>, + pub total_len: usize, +} + +impl Diff 
{ + pub fn new(diffs: Vec, total_len: usize) -> Self { + Self { + ops: diffs.into_boxed_slice(), + total_len: total_len, + } + } + + pub fn distance(&self) -> usize { + self.ops.len() + } + + pub fn operations(&self) { + for i in self.ops.iter() { + println!("{:?}", i); + } + } +} diff --git a/crates/differ/src/diff_score.rs b/crates/differ/src/diff_score.rs new file mode 100644 index 0000000..10edac5 --- /dev/null +++ b/crates/differ/src/diff_score.rs @@ -0,0 +1,41 @@ +pub struct DiffScoreConfig { + pub sub_cost: f32, + pub lowercase_sub_cost: f32, + pub indel_cost: f32, + pub transpose_cost: f32, + // future properties here as needed +} + +impl Default for DiffScoreConfig { + fn default() -> Self { + Self { + sub_cost: 1.0, + lowercase_sub_cost: 1.0, + indel_cost: 1.0, + transpose_cost: 1.0, + } + } +} + +#[cfg(test)] +mod tests { + + #[test] + fn test_default() { + let test_struct = super::DiffScoreConfig::default(); + assert_eq!(test_struct.sub_cost, 1.0); + assert_eq!(test_struct.lowercase_sub_cost, 1.0); + assert_eq!(test_struct.indel_cost, 1.0); + assert_eq!(test_struct.transpose_cost, 1.0); + + let mut test_struct = super::DiffScoreConfig::default(); + test_struct.sub_cost = 2.0; + test_struct.lowercase_sub_cost = 2.0; + test_struct.indel_cost = 2.0; + test_struct.transpose_cost = 2.0; + assert_eq!(test_struct.sub_cost, 2.0); + assert_eq!(test_struct.lowercase_sub_cost, 2.0); + assert_eq!(test_struct.indel_cost, 2.0); + assert_eq!(test_struct.transpose_cost, 2.0); + } +} diff --git a/crates/differ/src/hamming.rs b/crates/differ/src/hamming.rs index 0e66036..0d3d102 100644 --- a/crates/differ/src/hamming.rs +++ b/crates/differ/src/hamming.rs @@ -1,84 +1,83 @@ -use crate::{StringDiffAlgorithm, StringDiffOp}; +use crate::diff::Diff; +use crate::StringDiffOp; use std::iter::zip; -pub struct HammingDistance {} -impl StringDiffAlgorithm for HammingDistance { - fn diff<'a>(&self, s1: &'a str, s2: &'a str) -> Vec { - if s1.len() != s2.len() { - panic!("Strings must 
be same length"); - } +pub fn hamming<'a>(s1: &'a str, s2: &'a str) -> Diff { + if s1.len() != s2.len() { + panic!("Strings must be same length"); + } - let mut opp_vec: Vec = Vec::new(); - let iter = zip(s1.chars(), s2.chars()); + let mut opp_vec: Vec = Vec::new(); + let iter = zip(s1.chars(), s2.chars()); - for (i, (char1, char2)) in iter.enumerate() { - if char1 != char2 { - opp_vec.push(StringDiffOp::new_substitute(char1, char2, i)); - } + for (i, (char1, char2)) in iter.enumerate() { + if char1 != char2 { + opp_vec.push(StringDiffOp::new_substitute(char1, char2, i)); } - opp_vec - } - - fn distance<'a>(&self, s1: &'a str, s2: &'a str) -> usize { - self.diff(s1, s2).len() } + Diff::new(opp_vec, s1.len()) } #[cfg(test)] mod tests { - use crate::{StringDiffAlgorithm, StringDiffOp}; - - #[test] - fn test_hamming_distance_edit_distance() { - let test_struct = super::HammingDistance {}; - - assert_eq!(3, test_struct.distance("karolin", "kathrin")); - assert_eq!(3, test_struct.distance("karolin", "kerstin")); - assert_eq!(4, test_struct.distance("kathrin", "kerstin")); - assert_eq!(4, test_struct.distance("0000", "1111")); - assert_eq!(3, test_struct.distance("2173896", "2233796")); - } + use crate::StringDiffOp; #[test] fn test_hamming_distance_op_distance() { - let test_struct = super::HammingDistance {}; + use crate::diff::Diff; + use crate::hamming::hamming; - let test_vec: Vec = vec![ - StringDiffOp::new_substitute('r', 't', 2), - StringDiffOp::new_substitute('o', 'h', 3), - StringDiffOp::new_substitute('l', 'r', 4), - ]; + let test_diff = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('r', 't', 2), + StringDiffOp::new_substitute('o', 'h', 3), + StringDiffOp::new_substitute('l', 'r', 4), + ]), + total_len: 7, + }; - let test_vec_2: Vec = vec![ - StringDiffOp::new_substitute('a', 'e', 1), - StringDiffOp::new_substitute('o', 's', 3), - StringDiffOp::new_substitute('l', 't', 4), - ]; + let test_diff_2 = Diff { + ops: Box::new([ + 
StringDiffOp::new_substitute('a', 'e', 1), + StringDiffOp::new_substitute('o', 's', 3), + StringDiffOp::new_substitute('l', 't', 4), + ]), + total_len: 7, + }; - let test_vec_3: Vec = vec![ - StringDiffOp::new_substitute('a', 'e', 1), - StringDiffOp::new_substitute('t', 'r', 2), - StringDiffOp::new_substitute('h', 's', 3), - StringDiffOp::new_substitute('r', 't', 4), - ]; + let test_diff_3 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('a', 'e', 1), + StringDiffOp::new_substitute('t', 'r', 2), + StringDiffOp::new_substitute('h', 's', 3), + StringDiffOp::new_substitute('r', 't', 4), + ]), + total_len: 7, + }; - let test_vec_4: Vec = vec![ - StringDiffOp::new_substitute('0', '1', 0), - StringDiffOp::new_substitute('0', '1', 1), - StringDiffOp::new_substitute('0', '1', 2), - StringDiffOp::new_substitute('0', '1', 3), - ]; + let test_diff_4 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('0', '1', 0), + StringDiffOp::new_substitute('0', '1', 1), + StringDiffOp::new_substitute('0', '1', 2), + StringDiffOp::new_substitute('0', '1', 3), + ]), + total_len: 4, + }; - let test_vec_5: Vec = vec![ - StringDiffOp::new_substitute('1', '2', 1), - StringDiffOp::new_substitute('7', '3', 2), - StringDiffOp::new_substitute('8', '7', 4), - ]; + let test_diff_5 = Diff { + ops: Box::new([ + StringDiffOp::new_substitute('1', '2', 1), + StringDiffOp::new_substitute('7', '3', 2), + StringDiffOp::new_substitute('8', '7', 4), + ]), + total_len: 7, + }; - assert_eq!(&test_vec, &test_struct.diff("karolin", "kathrin")); - assert_eq!(&test_vec_2, &test_struct.diff("karolin", "kerstin")); - assert_eq!(&test_vec_3, &test_struct.diff("kathrin", "kerstin")); - assert_eq!(&test_vec_4, &test_struct.diff("0000", "1111")); - assert_eq!(&test_vec_5, &test_struct.diff("2173896", "2233796")); + assert_eq!(test_diff, hamming("karolin", "kathrin")); + assert_eq!(test_diff_2, hamming("karolin", "kerstin")); + assert_eq!(test_diff_3, hamming("kathrin", "kerstin")); + 
assert_eq!(test_diff_4, hamming("0000", "1111")); + assert_eq!(test_diff_5, hamming("2173896", "2233796")); } } diff --git a/crates/differ/src/levenshtein.rs b/crates/differ/src/levenshtein.rs index 81c62ce..d8b3de4 100644 --- a/crates/differ/src/levenshtein.rs +++ b/crates/differ/src/levenshtein.rs @@ -1,207 +1,170 @@ -use crate::{StringDiffAlgorithm, StringDiffOp}; +use crate::diff::Diff; +use crate::{get_operation_matrix, StringDiffOp}; use core::panic; -pub struct LevenshteinDistance {} - -impl LevenshteinDistance { - /// At a given (x,y) we must choose the minimum value between a cells - /// Top, Left, and Diagonal value. Depending on which cell is chosen between - /// the three it will tell us if its a deletion, insertion or substitution operation. - /// if we chooze x(The value above the cell) as the min value its a insertion operation (symbolized by '^') - /// if we choose y(The value left of the cell) as the min value its a deletion operation(symbolized by '<') - /// if we choose z(The value diagnal of the cell) as the min value its a substitution operation( sybmolized by '\' ) - /// we should always return either x,y,z if somehow we dont we panic with the unrechable macro. 
- pub(crate) fn min_dist_with_dir(x: usize, y: usize, z: usize) -> (usize, char) { - if x <= y && x <= z { - return (x, '^'); - } - if y <= x && y <= z { - return (y, '<'); - } - if z <= x && z <= y { - return (z, '\\'); - } - unreachable!() +pub(crate) fn reverse_vec_and_indexes(my_vec: &mut Vec, mut top_string_len: usize) { + my_vec.reverse(); + for i in my_vec.iter_mut() { + i.index = top_string_len; + top_string_len += 1; } +} - pub(crate) fn reverse_vec_and_indexes( - my_vec: &mut Vec, - mut top_string_len: usize, - ) { - my_vec.reverse(); - for i in my_vec.iter_mut() { - i.index = top_string_len; - top_string_len += 1; +pub(crate) fn get_operations( + my_opp: &Vec>, + left_string: &str, + top_string: &str, +) -> Vec { + let mut diff_ops: Vec = Vec::new(); + let mut top_str_len = top_string.len(); + let mut left_str_len = left_string.len(); + let mut prev_char: char = ' '; + + loop { + if top_str_len == 0 && left_str_len == 0 { + break; } - } - pub(crate) fn get_operations( - my_opp: &Vec>, - left_string: &str, - top_string: &str, - ) -> Vec { - let mut diff_ops: Vec = Vec::new(); - let mut top_str_len = top_string.len(); - let mut left_str_len = left_string.len(); - let mut prev_char: char = ' '; - - loop { - if top_str_len == 0 && left_str_len == 0 { - break; + //Rows Columns + match my_opp[left_str_len][top_str_len] { + //insertion + '^' => { + let insertion_op = + StringDiffOp::new_insert(left_string.chars().nth(left_str_len - 1).unwrap(), 0); + + left_str_len -= 1; + diff_ops.push(insertion_op); + prev_char = '^'; } + //substitution + '\\' => { + if prev_char == '^' { + reverse_vec_and_indexes(&mut diff_ops, top_str_len); + } - //Rows Columns - match my_opp[left_str_len][top_str_len] { - //insertion - '^' => { - let insertion_op = StringDiffOp::new_insert( + if left_string.chars().nth(left_str_len - 1).unwrap() + != top_string.chars().nth(top_str_len - 1).unwrap() + { + let substitution_op = StringDiffOp::new_substitute( + 
top_string.chars().nth(top_str_len - 1).unwrap(), left_string.chars().nth(left_str_len - 1).unwrap(), - 0, + top_str_len - 1, ); - left_str_len -= 1; - diff_ops.push(insertion_op); - prev_char = '^'; + diff_ops.push(substitution_op); } - //substitution - '\\' => { - if prev_char == '^' { - Self::reverse_vec_and_indexes(&mut diff_ops, top_str_len); - } - - if left_string.chars().nth(left_str_len - 1).unwrap() - != top_string.chars().nth(top_str_len - 1).unwrap() - { - let substitution_op = StringDiffOp::new_substitute( - top_string.chars().nth(top_str_len - 1).unwrap(), - left_string.chars().nth(left_str_len - 1).unwrap(), - top_str_len - 1, - ); - - diff_ops.push(substitution_op); - } - left_str_len -= 1; - top_str_len -= 1; - prev_char = '\\'; + left_str_len -= 1; + top_str_len -= 1; + prev_char = '\\'; + } + //deletion + '<' => { + if prev_char == '^' { + reverse_vec_and_indexes(&mut diff_ops, top_str_len) } - //deletion - '<' => { - if prev_char == '^' { - Self::reverse_vec_and_indexes(&mut diff_ops, top_str_len) - } - let deletion_op = StringDiffOp::new_delete(top_str_len - 1); + let deletion_op = StringDiffOp::new_delete(top_str_len - 1); - top_str_len -= 1; - diff_ops.push(deletion_op); - prev_char = '<'; - } - _ => { - panic!("UNRECOGNIZED SYMBOL OPERATION !") - } + top_str_len -= 1; + diff_ops.push(deletion_op); + prev_char = '<'; + } + _ => { + panic!("UNRECOGNIZED SYMBOL OPERATION !") } } - - diff_ops } - pub(crate) fn get_operation_matrix(s1: &str, s2: &str) -> Vec> { - let first_string_len: usize = s1.len(); - let second_string_len: usize = s2.len(); - - let mut dist_vector = vec![vec![0usize; first_string_len + 1]; second_string_len + 1]; - let mut dir_vector: Vec> = - vec![vec![' '; first_string_len + 1]; second_string_len + 1]; - - for i in 0..first_string_len + 1 { - dist_vector[0][i] = i; - } - for j in 0..second_string_len + 1 { - dist_vector[j][0] = j; - } - - dir_vector[0][0] = '\\'; - for j in 1..second_string_len + 1 { - dir_vector[j][0] = 
'^'; - } - for i in 1..first_string_len + 1 { - dir_vector[0][i] = '<'; - } + diff_ops +} - let mut sub_cost: usize = 0; - for i in 1..second_string_len + 1 { - for j in 1..first_string_len + 1 { - if s1.chars().nth(j - 1).unwrap() == s2.chars().nth(i - 1).unwrap() { - sub_cost = 0; - } else { - sub_cost = 1; - } - (dist_vector[i][j], dir_vector[i][j]) = LevenshteinDistance::min_dist_with_dir( - dist_vector[i - 1][j] + 1, //deletion - dist_vector[i][j - 1] + 1, //insertion - dist_vector[i - 1][j - 1] + sub_cost, - ); //substitution - } - } - dir_vector +/// At a given (x,y) we must choose the minimum value between a cells +/// Top, Left, and Diagonal value. Depending on which cell is chosen between +/// the three it will tell us if its a deletion, insertion or substitution operation. +/// if we chooze x(The value above the cell) as the min value its a insertion operation (symbolized by '^') +/// if we choose y(The value left of the cell) as the min value its a deletion operation(symbolized by '<') +/// if we choose z(The value diagnal of the cell) as the min value its a substitution operation( sybmolized by '\' ) +/// we should always return either x,y,z if somehow we dont we panic with the unrechable macro. 
+pub(crate) fn min_dist_with_dir(x: isize, y: isize, z: isize) -> (isize, char) { + if x <= y && x <= z { + return (x, '^'); + } + if y <= x && y <= z { + return (y, '<'); + } + if z <= x && z <= y { + return (z, '\\'); } + unreachable!() } -impl StringDiffAlgorithm for LevenshteinDistance { - fn diff<'a>(&self, s1: &'a str, s2: &'a str) -> Vec { - let dir_matrix = LevenshteinDistance::get_operation_matrix(s1, s2); - LevenshteinDistance::get_operations(&dir_matrix, s2, s1) +pub(crate) fn my_init_vec(my_vec: &mut Vec>, top_str_len: usize, left_str_len: usize) { + for i in 0..top_str_len { + my_vec[0][i] = i as isize; } - - fn distance<'a>(&self, s1: &'a str, s2: &'a str) -> usize { - let dir_matrix = LevenshteinDistance::get_operation_matrix(s1, s2); - LevenshteinDistance::get_operations(&dir_matrix, s2, s1).len() + for j in 0..left_str_len { + my_vec[j][0] = j as isize; } } -#[cfg(test)] -mod tests { - use crate::{StringDiffAlgorithm, StringDiffOp}; +pub fn levenshtein<'a>(s1: &'a str, s2: &'a str) -> Diff { + let dir_matrix = get_operation_matrix(s1, s2, min_dist_with_dir, my_init_vec, 0, 1, 1); + let temp = get_operations(&dir_matrix, s2, s1).clone(); + let val: usize = if s1.len() >= s2.len() { + s1.len() + } else { + s2.len() + }; - #[test] - fn test_levenshtein_distance_edit_distance() { - let test_struct = super::LevenshteinDistance {}; + Diff::new(temp, val) +} - assert_eq!(3, test_struct.distance("reset", "sets")); - assert_eq!(3, test_struct.distance("kitten", "sitting")); - assert_eq!(3, test_struct.distance("Saturday", "Sunday")); - } +#[cfg(test)] +mod tests { #[test] fn test_levenshtein_distance_op_distance() { - let test_struct = super::LevenshteinDistance {}; - - let test_vec: Vec = vec![ - super::StringDiffOp::new_insert('g', 6), - super::StringDiffOp::new_substitute('e', 'i', 4), - super::StringDiffOp::new_substitute('k', 's', 0), - ]; - - let test_vec_2: Vec = vec![ - super::StringDiffOp::new_substitute('r', 'n', 4), - 
super::StringDiffOp::new_delete(2), - super::StringDiffOp::new_delete(1), - ]; - - let test_vec_3: Vec = vec![ - super::StringDiffOp::new_insert('S', 5), - super::StringDiffOp::new_delete(1), - super::StringDiffOp::new_delete(0), - ]; - - let test_vec_4: Vec = vec![ - super::StringDiffOp::new_insert('E', 5), - super::StringDiffOp::new_insert('R', 6), - ]; - - assert_eq!(&test_vec, &test_struct.diff("kitten", "sitting")); - assert_eq!(&test_vec_2, &test_struct.diff("Saturday", "Sunday")); - assert_eq!(&test_vec_3, &test_struct.diff("RESET", "SETS")); - assert_eq!(&test_vec_4, &test_struct.diff("RESET", "RESETER")); + use crate::diff::Diff; + use crate::levenshtein::levenshtein; + + let test_diff = Diff { + ops: Box::new([ + super::StringDiffOp::new_insert('g', 6), + super::StringDiffOp::new_substitute('e', 'i', 4), + super::StringDiffOp::new_substitute('k', 's', 0), + ]), + total_len: 7, + }; + + let test_diff_2 = Diff { + ops: Box::new([ + super::StringDiffOp::new_substitute('r', 'n', 4), + super::StringDiffOp::new_delete(2), + super::StringDiffOp::new_delete(1), + ]), + total_len: 8, + }; + + let test_diff_3 = Diff { + ops: Box::new([ + super::StringDiffOp::new_insert('S', 5), + super::StringDiffOp::new_delete(1), + super::StringDiffOp::new_delete(0), + ]), + total_len: 5, + }; + + let test_diff_4 = Diff { + ops: Box::new([ + super::StringDiffOp::new_insert('E', 5), + super::StringDiffOp::new_insert('R', 6), + ]), + total_len: 7, + }; + + assert_eq!(test_diff, levenshtein("kitten", "sitting")); + assert_eq!(test_diff_2, levenshtein("Saturday", "Sunday")); + assert_eq!(test_diff_3, levenshtein("RESET", "SETS")); + assert_eq!(test_diff_4, levenshtein("RESET", "RESETER")); } } diff --git a/crates/differ/src/lib.rs b/crates/differ/src/lib.rs index 6eee171..260aa80 100644 --- a/crates/differ/src/lib.rs +++ b/crates/differ/src/lib.rs @@ -1,13 +1,16 @@ #![doc = include_str!("../README.md")] - mod apply_diff; pub use crate::apply_diff::apply_diff; mod hamming; -pub use 
crate::hamming::HammingDistance; +pub use crate::hamming::hamming; mod levenshtein; -pub use crate::levenshtein::LevenshteinDistance; +pub use crate::levenshtein::levenshtein; +mod diff_score; +pub use crate::diff_score::DiffScoreConfig; +mod diff; +pub use crate::diff::Diff; -#[derive(PartialEq, Eq, Debug)] +#[derive(PartialEq, Eq, Debug, Clone)] pub enum StringDiffOpKind { Substitute(char, char), Insert(char), @@ -15,7 +18,7 @@ pub enum StringDiffOpKind { Transpose, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct StringDiffOp { pub kind: StringDiffOpKind, pub index: usize, @@ -39,7 +42,50 @@ impl StringDiffOp { } } -pub trait StringDiffAlgorithm { - fn diff<'a>(&self, s1: &'a str, s2: &'a str) -> Vec; - fn distance<'a>(&self, s1: &'a str, s2: &'a str) -> usize; +pub(crate) fn get_operation_matrix( + s1: &str, + s2: &str, + dist_with_dir: fn(isize, isize, isize) -> (isize, char), + init_vec: fn(&mut Vec>, usize, usize), + char_match: isize, + not_char_match: isize, + indent_cost: isize, +) -> Vec> { + let first_string_len: usize = s1.len(); + let second_string_len: usize = s2.len(); + + let mut dist_vector = vec![vec![0isize; first_string_len + 1]; second_string_len + 1]; + let mut dir_vector: Vec> = + vec![vec![' '; first_string_len + 1]; second_string_len + 1]; + + init_vec( + &mut dist_vector, + first_string_len + 1, + second_string_len + 1, + ); + + dir_vector[0][0] = '\\'; + for j in 1..second_string_len + 1 { + dir_vector[j][0] = '^'; + } + for i in 1..first_string_len + 1 { + dir_vector[0][i] = '<'; + } + + for i in 1..second_string_len + 1 { + for j in 1..first_string_len + 1 { + let diagnal_gap_cost: isize; + if s1.chars().nth(j - 1).unwrap() == s2.chars().nth(i - 1).unwrap() { + diagnal_gap_cost = char_match; + } else { + diagnal_gap_cost = not_char_match; + } + (dist_vector[i][j], dir_vector[i][j]) = dist_with_dir( + dist_vector[i - 1][j] + indent_cost, //deletion + dist_vector[i][j - 1] + indent_cost, //insertion + 
dist_vector[i - 1][j - 1] + diagnal_gap_cost, + ); //substitution + } + } + dir_vector } From fda6a62c4884e7babe5bac1386dff3a6d9a1bd4e Mon Sep 17 00:00:00 2001 From: Alfredo Date: Fri, 6 Jan 2023 23:49:53 -0600 Subject: [PATCH 2/2] fix: comments made 1/06/2023 PR #38 --- crates/differ/README.md | 30 ++++++++++++----------- crates/differ/src/apply_diff.rs | 12 +++++----- crates/differ/src/diff.rs | 26 -------------------- crates/differ/src/diff_score.rs | 41 -------------------------------- crates/differ/src/hamming.rs | 5 ++-- crates/differ/src/levenshtein.rs | 5 ++-- crates/differ/src/lib.rs | 31 +++++++++++++++++------- 7 files changed, 49 insertions(+), 101 deletions(-) delete mode 100644 crates/differ/src/diff.rs delete mode 100644 crates/differ/src/diff_score.rs diff --git a/crates/differ/README.md b/crates/differ/README.md index 5f75a13..459cc52 100644 --- a/crates/differ/README.md +++ b/crates/differ/README.md @@ -16,14 +16,14 @@ differ-rs = "0.0.0" ``` ## Features -* `apply_diff` function: Allows users to apply deltas in order to transform a words. * `Diff` struct: Contains a Box<> of operations between two strings. Also keeps track of length of longest string. Has methods that allows users to get the edit distance between two words, and view delta operations. -* `levenshtein` function: Returns a Diff struct between string 1 and string 2 using levenshtein algorithm. -* `hamming` function: Returns a Diff struct between string 1 and string 2 hamming algorithm. - +* `apply_diff()`: Allows users to apply deltas in order to transform words. +* `levenshtein()`: Returns a Diff struct between string 1 and string 2. Levenshtein algorithm can detect insertions, deletions, and substitutions. +* `hamming()`: Returns a Diff struct between string 1 and string 2. Hamming algorithm can only detect substitutions, and string 1 and string 2 must be of equal length. + ## How it works -* `apply_diff` works by giving a string and a transformation vector to the method. 
Then the transformation vector is applied to the string given in the first argument. -* `Diff` works by hodling a Box<> of operations, and longest length between any two strings. Both the `levenshtein`, `hamming` algorithm return this struct. +* `apply_diff()` works by giving a string and a transformation vector to the method. Then the transformation vector is applied to the string given in the first argument. +* `Diff` holds a `Box`, and the longest length of any two strings. Both `levenshtein()`, and `hamming()` return this struct. ## Examples @@ -32,7 +32,7 @@ Getting the edit distance between two words using Levenshtein algorithm use differ_rs::levenshtein; fn main(){ - let levensthein_edit_distance = levenshtein("Sitting", "Kitten").distance(); + let levensthein_edit_distance = levenshtein("Sitting", "Kitten").distance(); assert_eq!(3, levensthein_edit_distance); } @@ -44,9 +44,11 @@ To view the delta between two words using Levenshtein algorithm use differ_rs::levenshtein; fn main(){ - let my_levensthein = levenshtein("Sitting", "Kitten"); + let my_levensthein = levenshtein("Sitting", "Kitten"); - my_levensthein.operations(); + for diff_op in my_levensthein.ops.iter() { + println!("{:?}", diff_op); + } } ``` @@ -80,9 +82,10 @@ use differ_rs::hamming; fn main(){ let kathrin_edit_distance = hamming("karolin", "kathrin"); - kathrin_edit_distance.operations(); + for diff_op in kathrin_edit_distance.ops.iter() { + println!("{:?}", diff_op); + } } - ``` This example outputs: @@ -98,16 +101,15 @@ use differ_rs::{hamming, levenshtein, apply_diff}; fn main(){ let my_levensthein = levenshtein("sitting", "kitten"); - let delta_applied_v1 = apply_diff("sitting", &my_levensthein.ops); + let delta_applied_v1 = apply_diff("sitting", my_levensthein.ops.to_vec()); let my_hamming = hamming("karolin", "kathrin"); - let delta_applied_v2 = apply_diff("karolin", &my_hamming.ops); + let delta_applied_v2 = apply_diff("karolin", my_hamming.ops.to_vec()); assert_eq!("kitten", 
delta_applied_v1); assert_eq!("kathrin", delta_applied_v2); } - ``` ## License diff --git a/crates/differ/src/apply_diff.rs b/crates/differ/src/apply_diff.rs index d36b0ad..16f1ec6 100644 --- a/crates/differ/src/apply_diff.rs +++ b/crates/differ/src/apply_diff.rs @@ -11,7 +11,7 @@ pub(crate) fn remove(start: usize, stop: usize, s: &str) -> String { result } -pub fn apply_diff(s: &str, diffs: &Box<[StringDiffOp]>) -> String { +pub fn apply_diff(s: &str, diffs: Vec) -> String { let mut new_string: String = s.into(); for i in diffs.iter() { @@ -67,20 +67,20 @@ mod tests { assert_eq!( String::from("sitting"), - super::apply_diff("kitten", &test_box) + super::apply_diff("kitten", test_box.to_vec()) ); assert_eq!( String::from("Sunday"), - super::apply_diff("Saturday", &test_box_2) + super::apply_diff("Saturday", test_box_2.to_vec()) ); assert_eq!( String::from("SETS"), - super::apply_diff("RESET", &test_box_3) + super::apply_diff("RESET", test_box_3.to_vec()) ); - assert_eq!(String::from("heeoy"), super::apply_diff("hey", &test_box_4)); + assert_eq!(String::from("heeoy"), super::apply_diff("hey", test_box_4.to_vec())); assert_eq!( String::from("skater"), - super::apply_diff("kate", &test_box_5) + super::apply_diff("kate", test_box_5.to_vec()) ); } } diff --git a/crates/differ/src/diff.rs b/crates/differ/src/diff.rs deleted file mode 100644 index 12f5608..0000000 --- a/crates/differ/src/diff.rs +++ /dev/null @@ -1,26 +0,0 @@ -use crate::StringDiffOp; - -#[derive(Debug, PartialEq)] -pub struct Diff { - pub ops: Box<[StringDiffOp]>, - pub total_len: usize, -} - -impl Diff { - pub fn new(diffs: Vec, total_len: usize) -> Self { - Self { - ops: diffs.into_boxed_slice(), - total_len: total_len, - } - } - - pub fn distance(&self) -> usize { - self.ops.len() - } - - pub fn operations(&self) { - for i in self.ops.iter() { - println!("{:?}", i); - } - } -} diff --git a/crates/differ/src/diff_score.rs b/crates/differ/src/diff_score.rs deleted file mode 100644 index 10edac5..0000000 
--- a/crates/differ/src/diff_score.rs +++ /dev/null @@ -1,41 +0,0 @@ -pub struct DiffScoreConfig { - pub sub_cost: f32, - pub lowercase_sub_cost: f32, - pub indel_cost: f32, - pub transpose_cost: f32, - // future properties here as needed -} - -impl Default for DiffScoreConfig { - fn default() -> Self { - Self { - sub_cost: 1.0, - lowercase_sub_cost: 1.0, - indel_cost: 1.0, - transpose_cost: 1.0, - } - } -} - -#[cfg(test)] -mod tests { - - #[test] - fn test_default() { - let test_struct = super::DiffScoreConfig::default(); - assert_eq!(test_struct.sub_cost, 1.0); - assert_eq!(test_struct.lowercase_sub_cost, 1.0); - assert_eq!(test_struct.indel_cost, 1.0); - assert_eq!(test_struct.transpose_cost, 1.0); - - let mut test_struct = super::DiffScoreConfig::default(); - test_struct.sub_cost = 2.0; - test_struct.lowercase_sub_cost = 2.0; - test_struct.indel_cost = 2.0; - test_struct.transpose_cost = 2.0; - assert_eq!(test_struct.sub_cost, 2.0); - assert_eq!(test_struct.lowercase_sub_cost, 2.0); - assert_eq!(test_struct.indel_cost, 2.0); - assert_eq!(test_struct.transpose_cost, 2.0); - } -} diff --git a/crates/differ/src/hamming.rs b/crates/differ/src/hamming.rs index 0d3d102..21b8983 100644 --- a/crates/differ/src/hamming.rs +++ b/crates/differ/src/hamming.rs @@ -1,5 +1,4 @@ -use crate::diff::Diff; -use crate::StringDiffOp; +use crate::{Diff, StringDiffOp}; use std::iter::zip; pub fn hamming<'a>(s1: &'a str, s2: &'a str) -> Diff { @@ -24,8 +23,8 @@ mod tests { #[test] fn test_hamming_distance_op_distance() { - use crate::diff::Diff; use crate::hamming::hamming; + use crate::Diff; let test_diff = Diff { ops: Box::new([ diff --git a/crates/differ/src/levenshtein.rs b/crates/differ/src/levenshtein.rs index d8b3de4..1540824 100644 --- a/crates/differ/src/levenshtein.rs +++ b/crates/differ/src/levenshtein.rs @@ -1,5 +1,4 @@ -use crate::diff::Diff; -use crate::{get_operation_matrix, StringDiffOp}; +use crate::{get_operation_matrix, Diff, StringDiffOp}; use core::panic; 
pub(crate) fn reverse_vec_and_indexes(my_vec: &mut Vec<StringDiffOp>, mut top_string_len: usize) { @@ -124,8 +123,8 @@ mod tests { #[test] fn test_levenshtein_distance_op_distance() { - use crate::diff::Diff; use crate::levenshtein::levenshtein; + use crate::Diff; let test_diff = Diff { ops: Box::new([ diff --git a/crates/differ/src/lib.rs b/crates/differ/src/lib.rs index 260aa80..56e62ec 100644 --- a/crates/differ/src/lib.rs +++ b/crates/differ/src/lib.rs @@ -5,10 +5,6 @@ mod hamming; pub use crate::hamming::hamming; mod levenshtein; pub use crate::levenshtein::levenshtein; -mod diff_score; -pub use crate::diff_score::DiffScoreConfig; -mod diff; -pub use crate::diff::Diff; #[derive(PartialEq, Eq, Debug, Clone)] pub enum StringDiffOpKind { @@ -42,6 +38,25 @@ impl StringDiffOp { } } +#[derive(Debug, PartialEq, Eq)] +pub struct Diff { + pub ops: Box<[StringDiffOp]>, + pub total_len: usize, +} + +impl Diff { + pub fn new(diffs: Vec<StringDiffOp>, total_len: usize) -> Self { + Self { + ops: diffs.into_boxed_slice(), + total_len: total_len, + } + } + + pub fn distance(&self) -> usize { + self.ops.len() + } +} + pub(crate) fn get_operation_matrix( s1: &str, s2: &str, @@ -74,16 +89,16 @@ pub(crate) fn get_operation_matrix( for i in 1..second_string_len + 1 { for j in 1..first_string_len + 1 { - let diagnal_gap_cost: isize; + let diagonal_gap_cost: isize; if s1.chars().nth(j - 1).unwrap() == s2.chars().nth(i - 1).unwrap() { - diagnal_gap_cost = char_match; + diagonal_gap_cost = char_match; } else { - diagnal_gap_cost = not_char_match; + diagonal_gap_cost = not_char_match; } (dist_vector[i][j], dir_vector[i][j]) = dist_with_dir( dist_vector[i - 1][j] + indent_cost, //deletion dist_vector[i][j - 1] + indent_cost, //insertion - dist_vector[i - 1][j - 1] + diagnal_gap_cost, + dist_vector[i - 1][j - 1] + diagonal_gap_cost, ); //substitution } }