GH-673: add TfidfFeaturizer
rain1024 committed Jul 31, 2023
1 parent b04d521 commit c84fd54
Showing 12 changed files with 269 additions and 42 deletions.
1 change: 1 addition & 0 deletions examples/classification/.gitignore
@@ -0,0 +1 @@
outputs
42 changes: 1 addition & 41 deletions examples/classification/README.md
@@ -1,41 +1 @@
# Vietnamese Text Classification with underthesea

```
Author: Vu Anh
Date: July 27, 2023
```

Vietnamese text classification is a core task in Natural Language Processing (NLP) for the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification.

## Methodologies and Approaches

Zero-shot classification is a technique that allows models to categorize data into previously unseen classes without direct training. Leveraging large language models such as GPT-3 and GPT-4, this method is invaluable when labeled data is scarce or when it is impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges.

## Results

The following table presents the results of Vietnamese text classification using Large Language Models (LLMs):

| Dataset | LLM     | F1 Score |
|---------|---------|----------|
|         | GPT-3.5 |          |
|         | GPT-4   |          |

# Classification
17 changes: 17 additions & 0 deletions examples/classification/preprocess_data.py
@@ -0,0 +1,17 @@
from datasets import load_dataset

# Load the UIT Vietnamese students' feedback dataset from the Hugging Face Hub
dataset = load_dataset("uit-nlp/vietnamese_students_feedback")
print(dataset)

print(dataset["train"][0])

# Collect the first 10 training sentences
sentences = []
for i in range(10):
    item = dataset["train"][i]
    sentence = item["sentence"]
    sentences.append(sentence)
    print(item)

# Write the sentences, one per line, to a temporary training file
with open("tmp/train.txt", "w") as f:
    content = "\n".join(sentences)
    f.write(content)
41 changes: 41 additions & 0 deletions examples/classification/technical_report.md
@@ -0,0 +1,41 @@
# Vietnamese Text Classification with underthesea

```
Author: Vu Anh
Date: July 27, 2023
```

Vietnamese text classification is a core task in Natural Language Processing (NLP) for the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification.

## Methodologies and Approaches

Zero-shot classification is a technique that allows models to categorize data into previously unseen classes without direct training. Leveraging large language models such as GPT-3 and GPT-4, this method is invaluable when labeled data is scarce or when it is impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges, as the sketch below illustrates.
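
To make this concrete, here is a minimal sketch of how a zero-shot prompt could be assembled. The label set, prompt wording, and `build_zero_shot_prompt` helper are illustrative assumptions; the commit itself ships no prompting code.

```
// A hypothetical prompt builder for zero-shot classification.
// The labels and wording below are examples, not part of this commit.
fn build_zero_shot_prompt(text: &str, labels: &[&str]) -> String {
    format!(
        "Classify the following Vietnamese text into one of these categories: {}.\nText: {}\nCategory:",
        labels.join(", "),
        text
    )
}

fn main() {
    let prompt = build_zero_shot_prompt(
        "giảng viên dạy rất dễ hiểu", // "the lecturer teaches very clearly"
        &["positive", "negative", "neutral"],
    );
    println!("{}", prompt); // this string would then be sent to GPT-3.5 or GPT-4
}
```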

## Results

The following table presents the results of Vietnamese text classification using Large Language Models (LLMs):

| Dataset | LLM     | F1 Score |
|---------|---------|----------|
|         | GPT-3.5 |          |
|         | GPT-4   |          |

2 changes: 2 additions & 0 deletions examples/classification/tmp/.gitignore
@@ -0,0 +1,2 @@
*
!.gitignore
1 change: 1 addition & 0 deletions extensions/underthesea_core/Cargo.toml
@@ -29,6 +29,7 @@ serde = { version = "1.0", features = [ "derive" ] }
regex = "1"
rayon = "1.5"
crfs = "0.1"
nalgebra = "0.29"

[dependencies.pyo3]
version = "0.15.0"
5 changes: 5 additions & 0 deletions extensions/underthesea_core/HISTORY.rst
@@ -2,6 +2,11 @@
History
================================================================================

1.0.5 (2023-07-31)
--------------------------------------------------------------------------------

* Add metrics::cosine_similarity

1.0.4 (2023-04-28)
--------------------------------------------------------------------------------

2 changes: 2 additions & 0 deletions extensions/underthesea_core/src/lib.rs
@@ -5,6 +5,8 @@ use pyo3::prelude::*;
use std::collections::HashSet;

pub mod featurizers;
pub mod tfidf;
pub mod metrics;

#[pyclass]
pub struct CRFFeaturizer {
35 changes: 35 additions & 0 deletions extensions/underthesea_core/src/metrics/cosine_similarity.rs
@@ -0,0 +1,35 @@
//! Cosine similarity
//!
//! Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them.
//!
//! # Author: Vu Anh
//! # Date: 2023-07-30
use nalgebra::DVector;

pub fn cosine_similarity(a: &Vec<f64>, b: &Vec<f64>) -> f64 {
    let va = DVector::from_vec(a.clone());
    let vb = DVector::from_vec(b.clone());

    va.dot(&vb) / (va.norm() * vb.norm())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cosine_similarity_1() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0, 3.0];

        assert!((cosine_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];

        assert!((cosine_similarity(&a, &b)).abs() < f64::EPSILON);
    }
}
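
For reference, the same quantity can be computed without nalgebra, directly from the definition cos(θ) = (a · b) / (‖a‖ ‖b‖). This is a standalone sketch for illustration; `cosine_similarity_manual` is a hypothetical helper, not part of this commit.

```
// Cosine similarity computed straight from its definition,
// as a cross-check on the nalgebra-based implementation above.
fn cosine_similarity_manual(a: &[f64], b: &[f64]) -> f64 {
    let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f64>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f64>().sqrt();
    dot / (norm_a * norm_b)
}

fn main() {
    let a = [1.0, 2.0, 3.0];
    let b = [4.0, 5.0, 6.0];
    // dot = 32, |a| = sqrt(14), |b| = sqrt(77), so this prints ~0.9746
    println!("{}", cosine_similarity_manual(&a, &b));
}
```
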
3 changes: 3 additions & 0 deletions extensions/underthesea_core/src/metrics/mod.rs
@@ -0,0 +1,3 @@
mod cosine_similarity;

pub use cosine_similarity::cosine_similarity;
160 changes: 160 additions & 0 deletions extensions/underthesea_core/src/tfidf.rs
@@ -0,0 +1,160 @@
//! tfidf.rs
//!
//! Provides functionality for computing Term Frequency-Inverse Document Frequency (TFIDF) vectors.
//!
//! Author: Vu Anh
//! Date: 2023-07-29

use std::collections::{HashMap, HashSet};

pub struct TfidfFeaturizer {
    idf: Vec<f64>,
    term_to_index: HashMap<String, usize>
}

impl TfidfFeaturizer {
    pub fn new() -> Self {
        TfidfFeaturizer {
            idf: Vec::new(),
            term_to_index: HashMap::new()
        }
    }

    pub fn get_idf(&self) -> &Vec<f64> {
        &self.idf
    }

    fn compute_idf(&mut self, documents: &[Vec<String>]) {
        let n = documents.len() as f64;

        // document frequency per term index
        let mut word_freq = HashMap::new();

        for doc in documents.iter() {
            let mut seen_terms = HashSet::new();

            for term in doc {
                if !seen_terms.contains(term) {
                    // assign each unseen term the next free vocabulary index
                    let idx = match self.term_to_index.get(term) {
                        Some(&existing_idx) => existing_idx,
                        None => {
                            let new_idx = self.term_to_index.len();
                            self.term_to_index.insert(term.clone(), new_idx);
                            new_idx
                        }
                    };
                    // count a term at most once per document
                    *word_freq.entry(idx).or_insert(0.0) += 1.0;
                    seen_terms.insert(term.clone());
                }
            }
        }

        // idf(t) = ln(N / df(t))
        self.idf.resize(self.term_to_index.len(), 0.0);
        for (&idx, &freq) in &word_freq {
            self.idf[idx] = (n / freq).ln();
        }
    }

    pub fn train(&mut self, texts: &[&str]) {
        let documents: Vec<Vec<String>> = texts.iter().map(|text| {
            text.split_whitespace().map(|word| word.to_string()).collect()
        }).collect();

        self.compute_idf(&documents);
    }

    pub fn predict(&self, texts: &Vec<&str>) -> Vec<Vec<f64>> {
        texts.iter().map(|text| {
            let words: Vec<String> = text.split_whitespace().map(|word| word.to_string()).collect();
            let mut tfidf_vector = vec![0.0; self.term_to_index.len()];

            // compute raw term counts for this text
            let mut tf = HashMap::new();
            for word in &words {
                *tf.entry(word).or_insert(0.0) += 1.0;
            }

            // normalize counts by document length to get term frequencies
            let keys: Vec<_> = tf.keys().cloned().collect();
            for word in keys {
                if let Some(freq) = tf.get_mut(&word) {
                    *freq /= words.len() as f64;
                }
            }

            // tfidf(t, d) = tf(t, d) * idf(t)
            for (word, &index) in &self.term_to_index {
                if let Some(&term_freq) = tf.get(word) {
                    tfidf_vector[index] = term_freq * self.idf[index];
                }
            }

            tfidf_vector
        }).collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::cosine_similarity;

    #[test]
    fn test_constructor() {
        TfidfFeaturizer::new();
    }

    #[test]
    fn test_train_tfidf() {
        let mut tfidf_featurizer = TfidfFeaturizer::new();
        let texts = vec![
            "i love you",
            "you hate me",
            "me too"
        ];

        // Train tfidf vectorizer
        tfidf_featurizer.train(&texts);

        // vocab: i love you hate me too

        let idf_actual = tfidf_featurizer.get_idf();
        assert_eq!(idf_actual.len(), 6);

        let idf_expected = vec![
            (3.0f64 / 1.0f64).ln(),
            (3.0f64 / 1.0f64).ln(),
            (3.0f64 / 2.0f64).ln(),
            (3.0f64 / 1.0f64).ln(),
            (3.0f64 / 2.0f64).ln(),
            (3.0f64 / 1.0f64).ln(),
        ];
        assert!((cosine_similarity(&idf_actual, &idf_expected) - 1.0).abs() < 1e-9);

        // Predict tfidf values
        let output = tfidf_featurizer.predict(&texts);
        assert!(output.len() == 3);

        // Document 1: "i love you"
        let doc1_actual = output[0].clone();
        let doc1_expected = vec![
            (1.0f64 / 3.0f64) * (3.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            0.0f64,
            0.0f64,
            0.0f64
        ];
        assert!((cosine_similarity(&doc1_actual, &doc1_expected) - 1.0).abs() < 1e-9);

        // Document 2: "you hate me"
        let doc2_actual = output[1].clone();
        let doc2_expected = vec![
            0.0f64,
            0.0f64,
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 1.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            0.0f64
        ];
        assert!((cosine_similarity(&doc2_actual, &doc2_expected) - 1.0).abs() < 1e-9);
    }
}
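
For orientation, here is a minimal usage sketch of the new featurizer. The `underthesea_core::tfidf::TfidfFeaturizer` import path is an assumption based on the `pub mod tfidf;` export added to lib.rs above; the example itself is not part of the commit.

```
// Hypothetical external usage; inside the crate the path would be crate::tfidf.
use underthesea_core::tfidf::TfidfFeaturizer;

fn main() {
    let mut featurizer = TfidfFeaturizer::new();
    featurizer.train(&["i love you", "you hate me", "me too"]);

    // "you" occurs in 2 of the 3 documents, so idf("you") = ln(3/2) ≈ 0.405;
    // its term frequency in "i love you" is 1/3, giving tf-idf ≈ 0.135.
    let vectors = featurizer.predict(&vec!["i love you"]);
    println!("{:?}", vectors[0]);
}
```
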
2 changes: 1 addition & 1 deletion extensions/underthesea_core/tests/models.rs
@@ -5,6 +5,6 @@ mod tests {
    #[test]
    fn test_crfs(){
        let buf = fs::read("tests/wt_crf_2018_09_13.bin").unwrap();
        let model = crfs::Model::new(&buf).unwrap();
        let _model = crfs::Model::new(&buf).unwrap();
    }
}
