GH-673: add TfidfFeaturizer
rain1024 committed Jul 31, 2023
1 parent b04d521 commit c84fd54
Showing 12 changed files with 269 additions and 42 deletions.
1 change: 1 addition & 0 deletions examples/classification/.gitignore
@@ -0,0 +1 @@
outputs
42 changes: 1 addition & 41 deletions examples/classification/README.md
@@ -1,41 +1 @@
# Vietnamese Text Classification with underthesea

```
Author: Vu Anh
Date: July 27, 2023
```

Vietnamese text classification is a core task in Natural Language Processing (NLP) for the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification.

## Methodologies and Approaches

Zero-shot classification is a technique that allows models to categorize data into previously unseen classes without direct training. Leveraging large language models such as GPT-3 and GPT-4, this method is invaluable when labeled data is scarce or when it is impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges.

## Results

The following table presents the results of Vietnamese text classification using Large Language Models (LLMs):

| Dataset | LLM     | F1 Score |
|---------|---------|----------|
|         | GPT-3.5 |          |
|         | GPT-4   |          |

# Classification
17 changes: 17 additions & 0 deletions examples/classification/preprocess_data.py
@@ -0,0 +1,17 @@
from datasets import load_dataset

# Load the UIT Vietnamese students' feedback dataset from the Hugging Face Hub
dataset = load_dataset("uit-nlp/vietnamese_students_feedback")
print(dataset)

print(dataset["train"][0])

# Collect the first 10 training sentences
sentences = []
for i in range(10):
    item = dataset["train"][i]
    sentence = item["sentence"]
    sentences.append(sentence)
    print(item)

# Write the sentences, one per line, to a temporary training file
with open("tmp/train.txt", "w") as f:
    content = "\n".join(sentences)
    f.write(content)
41 changes: 41 additions & 0 deletions examples/classification/technical_report.md
@@ -0,0 +1,41 @@
# Vietnamese Text Classification with underthesea

```
Author: Vu Anh
Date: July 27, 2023
```

Vietnamese text classification is a core task in Natural Language Processing (NLP) for the Vietnamese language. The objective of text classification is to assign predefined labels or categories to a given text based on its content. This report presents an overview of the challenges, methodologies, and advancements in Vietnamese text classification.

## Methodologies and Approaches

Zero-shot classification is a technique that allows models to categorize data into previously unseen classes without direct training. Leveraging large language models such as GPT-3 and GPT-4, this method is invaluable when labeled data is scarce or when it is impractical to gather annotations for every conceivable class. Such models use their vast training knowledge to generalize across tasks, making them versatile and adaptable to new classification challenges, as the sketch below illustrates.
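
To make this concrete, here is a minimal sketch of how a zero-shot prompt could be assembled. The label set, prompt wording, and `build_zero_shot_prompt` helper are illustrative assumptions; the commit itself ships no prompting code.

```
// A hypothetical prompt builder for zero-shot classification.
// The labels and wording below are examples, not part of this commit.
fn build_zero_shot_prompt(text: &str, labels: &[&str]) -> String {
    format!(
        "Classify the following Vietnamese text into one of these categories: {}.\nText: {}\nCategory:",
        labels.join(", "),
        text
    )
}

fn main() {
    let prompt = build_zero_shot_prompt(
        "giảng viên dạy rất dễ hiểu", // "the lecturer teaches very clearly"
        &["positive", "negative", "neutral"],
    );
    println!("{}", prompt); // this string would then be sent to GPT-3.5 or GPT-4
}
```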

## Results

The following table presents the results of Vietnamese text classification using Large Language Models (LLMs):

| Dataset | LLM     | F1 Score |
|---------|---------|----------|
|         | GPT-3.5 |          |
|         | GPT-4   |          |

2 changes: 2 additions & 0 deletions examples/classification/tmp/.gitignore
@@ -0,0 +1,2 @@
*
!.gitignore
1 change: 1 addition & 0 deletions extensions/underthesea_core/Cargo.toml
@@ -29,6 +29,7 @@ serde = { version = "1.0", features = [ "derive" ] }
regex = "1"
rayon = "1.5"
crfs = "0.1"
nalgebra = "0.29"

[dependencies.pyo3]
version = "0.15.0"
5 changes: 5 additions & 0 deletions extensions/underthesea_core/HISTORY.rst
@@ -2,6 +2,11 @@
History
================================================================================

1.0.5 (2023-07-31)
--------------------------------------------------------------------------------

* Add metrics::cosine_similarity

1.0.4 (2023-04-28)
--------------------------------------------------------------------------------

2 changes: 2 additions & 0 deletions extensions/underthesea_core/src/lib.rs
@@ -5,6 +5,8 @@ use pyo3::prelude::*;
use std::collections::HashSet;

pub mod featurizers;
pub mod tfidf;
pub mod metrics;

#[pyclass]
pub struct CRFFeaturizer {
35 changes: 35 additions & 0 deletions extensions/underthesea_core/src/metrics/cosine_similarity.rs
@@ -0,0 +1,35 @@
//! Cosine similarity
//!
//! Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them.
//!
//! # Author: Vu Anh
//! # Date: 2023-07-30
use nalgebra::DVector;

pub fn cosine_similarity(a: &Vec<f64>, b: &Vec<f64>) -> f64 {
    let va = DVector::from_vec(a.clone());
    let vb = DVector::from_vec(b.clone());

    va.dot(&vb) / (va.norm() * vb.norm())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cosine_similarity_1() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0, 3.0];

        assert!((cosine_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];

        assert!((cosine_similarity(&a, &b)).abs() < f64::EPSILON);
    }
}
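
For reference, the same quantity can be computed without nalgebra, directly from the definition cos(θ) = (a · b) / (‖a‖ ‖b‖). This is a standalone sketch for illustration; `cosine_similarity_manual` is a hypothetical helper, not part of this commit.

```
// Cosine similarity computed straight from its definition,
// as a cross-check on the nalgebra-based implementation above.
fn cosine_similarity_manual(a: &[f64], b: &[f64]) -> f64 {
    let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f64>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f64>().sqrt();
    dot / (norm_a * norm_b)
}

fn main() {
    let a = [1.0, 2.0, 3.0];
    let b = [4.0, 5.0, 6.0];
    // dot = 32, |a| = sqrt(14), |b| = sqrt(77), so this prints ~0.9746
    println!("{}", cosine_similarity_manual(&a, &b));
}
```
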
3 changes: 3 additions & 0 deletions extensions/underthesea_core/src/metrics/mod.rs
@@ -0,0 +1,3 @@
mod cosine_similarity;

pub use cosine_similarity::cosine_similarity;
160 changes: 160 additions & 0 deletions extensions/underthesea_core/src/tfidf.rs
@@ -0,0 +1,160 @@
//! tfidf.rs
//!
//! Provides functionality for computing Term Frequency-Inverse Document Frequency (TFIDF) vectors.
//!
//! Author: Vu Anh
//! Date: 2023-07-29

use std::collections::{HashMap, HashSet};

pub struct TfidfFeaturizer {
    idf: Vec<f64>,
    term_to_index: HashMap<String, usize>
}

impl TfidfFeaturizer {
    pub fn new() -> Self {
        TfidfFeaturizer {
            idf: Vec::new(),
            term_to_index: HashMap::new()
        }
    }

    pub fn get_idf(&self) -> &Vec<f64> {
        &self.idf
    }

    fn compute_idf(&mut self, documents: &[Vec<String>]) {
        let n = documents.len() as f64;

        // document frequency per term index
        let mut word_freq = HashMap::new();

        for doc in documents.iter() {
            let mut seen_terms = HashSet::new();

            for term in doc {
                if !seen_terms.contains(term) {
                    // assign each unseen term the next free vocabulary index
                    let idx = match self.term_to_index.get(term) {
                        Some(&existing_idx) => existing_idx,
                        None => {
                            let new_idx = self.term_to_index.len();
                            self.term_to_index.insert(term.clone(), new_idx);
                            new_idx
                        }
                    };
                    // count a term at most once per document
                    *word_freq.entry(idx).or_insert(0.0) += 1.0;
                    seen_terms.insert(term.clone());
                }
            }
        }

        // idf(t) = ln(N / df(t))
        self.idf.resize(self.term_to_index.len(), 0.0);
        for (&idx, &freq) in &word_freq {
            self.idf[idx] = (n / freq).ln();
        }
    }

    pub fn train(&mut self, texts: &[&str]) {
        let documents: Vec<Vec<String>> = texts.iter().map(|text| {
            text.split_whitespace().map(|word| word.to_string()).collect()
        }).collect();

        self.compute_idf(&documents);
    }

    pub fn predict(&self, texts: &Vec<&str>) -> Vec<Vec<f64>> {
        texts.iter().map(|text| {
            let words: Vec<String> = text.split_whitespace().map(|word| word.to_string()).collect();
            let mut tfidf_vector = vec![0.0; self.term_to_index.len()];

            // compute raw term counts for this text
            let mut tf = HashMap::new();
            for word in &words {
                *tf.entry(word).or_insert(0.0) += 1.0;
            }

            // normalize counts by document length to get term frequencies
            let keys: Vec<_> = tf.keys().cloned().collect();
            for word in keys {
                if let Some(freq) = tf.get_mut(&word) {
                    *freq /= words.len() as f64;
                }
            }

            // tfidf(t, d) = tf(t, d) * idf(t)
            for (word, &index) in &self.term_to_index {
                if let Some(&term_freq) = tf.get(word) {
                    tfidf_vector[index] = term_freq * self.idf[index];
                }
            }

            tfidf_vector
        }).collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::cosine_similarity;

    #[test]
    fn test_constructor() {
        TfidfFeaturizer::new();
    }

    #[test]
    fn test_train_tfidf() {
        let mut tfidf_featurizer = TfidfFeaturizer::new();
        let texts = vec![
            "i love you",
            "you hate me",
            "me too"
        ];

        // Train tfidf vectorizer
        tfidf_featurizer.train(&texts);

        // vocab: i love you hate me too

        let idf_actual = tfidf_featurizer.get_idf();
        assert_eq!(idf_actual.len(), 6);

        let idf_expected = vec![
            (3.0f64 / 1.0f64).ln(),
            (3.0f64 / 1.0f64).ln(),
            (3.0f64 / 2.0f64).ln(),
            (3.0f64 / 1.0f64).ln(),
            (3.0f64 / 2.0f64).ln(),
            (3.0f64 / 1.0f64).ln(),
        ];
        assert!((cosine_similarity(&idf_actual, &idf_expected) - 1.0).abs() < 1e-9);

        // Predict tfidf values
        let output = tfidf_featurizer.predict(&texts);
        assert!(output.len() == 3);

        // Document 1: "i love you"
        let doc1_actual = output[0].clone();
        let doc1_expected = vec![
            (1.0f64 / 3.0f64) * (3.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            0.0f64,
            0.0f64,
            0.0f64
        ];
        assert!((cosine_similarity(&doc1_actual, &doc1_expected) - 1.0).abs() < 1e-9);

        // Document 2: "you hate me"
        let doc2_actual = output[1].clone();
        let doc2_expected = vec![
            0.0f64,
            0.0f64,
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 1.0f64).ln(),
            (1.0f64 / 3.0f64) * (3.0f64 / 2.0f64).ln(),
            0.0f64
        ];
        assert!((cosine_similarity(&doc2_actual, &doc2_expected) - 1.0).abs() < 1e-9);
    }
}
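
For orientation, here is a minimal usage sketch of the new featurizer. The `underthesea_core::tfidf::TfidfFeaturizer` import path is an assumption based on the `pub mod tfidf;` export added to lib.rs above; the example itself is not part of the commit.

```
// Hypothetical external usage; inside the crate the path would be crate::tfidf.
use underthesea_core::tfidf::TfidfFeaturizer;

fn main() {
    let mut featurizer = TfidfFeaturizer::new();
    featurizer.train(&["i love you", "you hate me", "me too"]);

    // "you" occurs in 2 of the 3 documents, so idf("you") = ln(3/2) ≈ 0.405;
    // its term frequency in "i love you" is 1/3, giving tf-idf ≈ 0.135.
    let vectors = featurizer.predict(&vec!["i love you"]);
    println!("{:?}", vectors[0]);
}
```
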
2 changes: 1 addition & 1 deletion extensions/underthesea_core/tests/models.rs
@@ -5,6 +5,6 @@ mod tests {
    #[test]
    fn test_crfs(){
        let buf = fs::read("tests/wt_crf_2018_09_13.bin").unwrap();
        let model = crfs::Model::new(&buf).unwrap();
        let _model = crfs::Model::new(&buf).unwrap();
    }
}
