Optimize memory footprint of resources #151

Merged · 8 commits · Sep 10, 2019

4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,10 @@
# Changelog
All notable changes to this project will be documented in this file.

## [Unreleased] - 2019-09-10
### Fixed
- Optimize memory footprint of resources [#151](https://github.com/snipsco/snips-nlu-rs/pull/151)

## [0.65.2] - 2019-09-06
### Fixed
- Freeze chrono to 0.4.8 to fix issue with rustling-ontology [#149](https://github.com/snipsco/snips-nlu-rs/pull/149)
15 changes: 10 additions & 5 deletions src/intent_classifier/featurizer.rs
@@ -394,7 +394,10 @@ fn get_custom_entity_feature_name(entity_name: &str, language: NluUtilsLanguage)
format!("entityfeature{}", e)
}

fn get_word_clusters(query_tokens: &[String], word_clusterer: Arc<dyn WordClusterer>) -> Vec<String> {
fn get_word_clusters(
query_tokens: &[String],
word_clusterer: Arc<dyn WordClusterer>,
) -> Vec<String> {
let tokens_ref = query_tokens.iter().map(|t| t.as_ref()).collect_vec();
compute_all_ngrams(tokens_ref.as_ref(), tokens_ref.len())
.into_iter()
@@ -777,10 +780,12 @@ mod tests {
// Given
let language = Language::EN;
let query_tokens = tokenize_light("I, love House, muSic", language);
let word_clusterer = HashMapWordClusterer::from_iter(vec![
("love".to_string(), "cluster_love".to_string()),
("house".to_string(), "cluster_house".to_string()),
]);
let clusters: &[u8] = r#"
love cluster_love
house cluster_house
"#
.as_ref();
let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap();

// When
let augmented_query = get_word_clusters(&query_tokens, Arc::new(word_clusterer));
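
Note on this test change: the clusterer's internal map is now keyed by hashed values (see src/resources/word_clusterer.rs below), so the test builds it through from_reader, the same parsing path production uses, instead of assembling a plain string map with from_iter. A minimal sketch of the new-style setup; the module path and the tab-separated "word<TAB>cluster" fixture format are both assumptions, not confirmed by this diff:

```rust
// Inside the crate's test code, where HashMapWordClusterer is visible
// (the module path below is an assumption).
use crate::resources::word_clusterer::{HashMapWordClusterer, WordClusterer};

fn build_test_clusterer() -> HashMapWordClusterer {
    // Assumed fixture format: one tab-separated "word<TAB>cluster" per line.
    let clusters: &[u8] = b"love\tcluster_love\nhouse\tcluster_house\n";
    // from_reader hashes the keys on the way in, exactly as in production.
    HashMapWordClusterer::from_reader(clusters).unwrap()
}
```
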
33 changes: 16 additions & 17 deletions src/intent_parser/lookup_intent_parser.rs
@@ -1,27 +1,26 @@
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;

use failure::ResultExt;
use itertools::Itertools;
use log::debug;
use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language};
use snips_nlu_utils::language::Language as NluUtilsLanguage;
use snips_nlu_utils::string::normalize;
use snips_nlu_utils::string::{hash_str_to_i32, substring_with_char_range, suffix_from_char_index};
use snips_nlu_utils::token::tokenize_light;

use crate::errors::*;
use crate::intent_parser::InternalParsingResult;
use crate::language::FromLanguage;
use crate::models::LookupParserModel;
use crate::resources::SharedResources;
use crate::slot_utils::*;
use crate::utils::{deduplicate_overlapping_entities, IntentName, MatchedEntity, SlotName};
use crate::IntentParser;
use crate::{EntityScope, GroupedEntityScope, InputHash, IntentId, SlotId};

use super::{IntentParser, InternalParsingResult};
use failure::ResultExt;
use itertools::Itertools;
use log::debug;
use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language};
use snips_nlu_utils::language::Language as NluUtilsLanguage;
use snips_nlu_utils::string::{
hash_str_to_i32, normalize, substring_with_char_range, suffix_from_char_index,
};
use snips_nlu_utils::token::tokenize_light;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;

/// HashMap based Intent Parser. The normalized/canonical form of an utterance
/// serves as the key and the value is tuple of (intent_id, [vec_of_slots_ids])
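
The reshuffled imports here are mostly cosmetic, but the newly imported hash_str_to_i32 together with the InputHash alias hints at how this parser saves memory: the lookup table is keyed by a 32-bit hash of the normalized utterance rather than the utterance string itself. A self-contained sketch of that shape; the aliases mirror the names imported above, but their concrete definitions are assumptions, since the model layout is not part of this diff:

```rust
use std::collections::HashMap;

// Aliases mirroring InputHash, IntentId, and SlotId imported above;
// the concrete definitions are assumptions.
type InputHash = i32;
type IntentId = i32;
type SlotId = i32;

fn main() {
    // Lookup table keyed by a 32-bit hash of the normalized utterance:
    // each key costs 4 bytes instead of a full utterance String.
    let mut map: HashMap<InputHash, (IntentId, Vec<SlotId>)> = HashMap::new();
    // Stand-in for hash_str_to_i32(&normalize(input)).
    let hashed_input: InputHash = 0x1234_5678;
    map.insert(hashed_input, (0, vec![1, 2]));
    assert_eq!(map.get(&hashed_input), Some(&(0, vec![1, 2])));
}
```
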
17 changes: 10 additions & 7 deletions src/resources/gazetteer.rs
@@ -1,25 +1,25 @@
use crate::errors::*;
use snips_nlu_utils::string::hash_str_to_i32;
use std::collections::HashSet;
use std::io::{BufRead, BufReader, Read};
use std::iter::FromIterator;

use crate::errors::*;

pub trait Gazetteer: Send + Sync {
fn contains(&self, value: &str) -> bool;
}

pub struct HashSetGazetteer {
values: HashSet<String>,
values: HashSet<i32>,
}

impl HashSetGazetteer {
pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
let reader = BufReader::new(reader);
let mut values = HashSet::<String>::new();
let mut values = HashSet::new();
for line in reader.lines() {
let word = line?;
if !word.is_empty() {
values.insert(word);
values.insert(hash_str_to_i32(&*word));
}
}
Ok(Self { values })
@@ -29,14 +29,17 @@ impl HashSetGazetteer {
impl FromIterator<String> for HashSetGazetteer {
fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
Self {
values: HashSet::from_iter(iter),
values: iter
.into_iter()
.map(|str_value| hash_str_to_i32(&*str_value))
.collect(),
}
}
}

impl Gazetteer for HashSetGazetteer {
fn contains(&self, value: &str) -> bool {
self.values.contains(value)
self.values.contains(&hash_str_to_i32(value))
}
}

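This file shows the heart of the optimization: the gazetteer no longer keeps the entries themselves, only 32-bit hashes of them, so each entry costs 4 bytes instead of a heap-allocated String. The trade-off is a small false-positive probability when two entries collide on the same i32. A self-contained sketch of the idea, with a stand-in hash; the real hash_str_to_i32 lives in snips-nlu-utils and differs in detail:

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashSet;
use std::hash::{Hash, Hasher};

// Stand-in for snips_nlu_utils::string::hash_str_to_i32 (assumption:
// the real function is a different, stable 32-bit string hash).
fn hash_str_to_i32(s: &str) -> i32 {
    let mut hasher = DefaultHasher::new();
    s.hash(&mut hasher);
    hasher.finish() as i32 // truncate the 64-bit hash to 32 bits
}

struct HashSetGazetteer {
    // 4 bytes per entry instead of a heap-allocated String.
    values: HashSet<i32>,
}

impl HashSetGazetteer {
    fn from_entries<'a>(entries: impl IntoIterator<Item = &'a str>) -> Self {
        Self {
            values: entries.into_iter().map(hash_str_to_i32).collect(),
        }
    }

    fn contains(&self, value: &str) -> bool {
        // May report a false positive if two entries collide on the
        // same i32 hash; exactness is traded for memory.
        self.values.contains(&hash_str_to_i32(value))
    }
}

fn main() {
    let gazetteer = HashSetGazetteer::from_entries(["paris", "london"]);
    assert!(gazetteer.contains("paris"));
    assert!(!gazetteer.contains("berlin"));
}
```
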
19 changes: 10 additions & 9 deletions src/resources/stemmer.rs
@@ -1,22 +1,20 @@
use crate::errors::*;
use snips_nlu_utils::string::{hash_str_to_i32, normalize};
use std::collections::HashMap;
use std::io::Read;
use std::iter::FromIterator;

use snips_nlu_utils::string::normalize;

use crate::errors::*;

pub trait Stemmer: Send + Sync {
fn stem(&self, value: &str) -> String;
}

pub struct HashMapStemmer {
values: HashMap<String, String>,
values: HashMap<i32, String>,
}

impl HashMapStemmer {
pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
let mut values = HashMap::<String, String>::new();
let mut values = HashMap::new();
let mut csv_reader = csv::ReaderBuilder::new()
.delimiter(b',')
.quoting(false)
@@ -28,7 +26,7 @@ impl HashMapStemmer {
let elements = record?;
let stem = &elements[0];
for value in elements.iter().skip(1) {
values.insert(value.to_string(), stem.to_string());
values.insert(hash_str_to_i32(value), stem.to_string());
}
}
Ok(Self { values })
@@ -38,15 +36,18 @@
impl FromIterator<(String, String)> for HashMapStemmer {
fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
Self {
values: HashMap::from_iter(iter),
values: iter
.into_iter()
.map(|(str_key, str_value)| (hash_str_to_i32(&*str_key), str_value))
.collect(),
}
}
}

impl Stemmer for HashMapStemmer {
fn stem(&self, value: &str) -> String {
self.values
.get(&*normalize(value))
.get(&hash_str_to_i32(&*normalize(value)))
.map(|v| v.to_string())
.unwrap_or_else(|| value.to_string())
}
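
The stemmer gets the same treatment: only the stems remain as Strings, while the far more numerous inflected forms are stored as 4-byte hashes of their normalized form. A self-contained sketch, again with stand-ins for the snips-nlu-utils normalize and hash helpers (both assumptions that differ from the real implementations):

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

// Stand-in for snips_nlu_utils::string::normalize (assumption).
fn normalize(s: &str) -> String {
    s.to_lowercase()
}

// Stand-in for snips_nlu_utils::string::hash_str_to_i32 (assumption).
fn hash_str_to_i32(s: &str) -> i32 {
    let mut hasher = DefaultHasher::new();
    s.hash(&mut hasher);
    hasher.finish() as i32
}

struct HashMapStemmer {
    // Inflected forms are stored as 4-byte hashes; only stems stay Strings.
    values: HashMap<i32, String>,
}

impl HashMapStemmer {
    fn stem(&self, value: &str) -> String {
        self.values
            .get(&hash_str_to_i32(&normalize(value)))
            .cloned()
            // Unknown words stem to themselves, as in the diff above.
            .unwrap_or_else(|| value.to_string())
    }
}

fn main() {
    let mut values = HashMap::new();
    values.insert(hash_str_to_i32("running"), "run".to_string());
    let stemmer = HashMapStemmer { values };
    assert_eq!(stemmer.stem("Running"), "run");
    assert_eq!(stemmer.stem("flying"), "flying");
}
```
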
113 changes: 78 additions & 35 deletions src/resources/word_clusterer.rs
@@ -1,17 +1,19 @@
use crate::errors::*;
use itertools::Either;
use snips_nlu_ontology::Language;
use snips_nlu_utils::string::hash_str_to_i32;
use std::collections::HashMap;
use std::io::Read;
use std::iter::FromIterator;

use snips_nlu_ontology::Language;

use crate::errors::*;
use std::str::FromStr;

pub trait WordClusterer: Send + Sync {
fn get_cluster(&self, word: &str) -> Option<String>;
}

pub struct HashMapWordClusterer {
values: HashMap<String, String>,
/// This implementation allows to support both u16 and raw string representations for
/// word clusters
values: Either<HashMap<i32, u16>, HashMap<i32, String>>,
}

impl HashMapWordClusterer {
@@ -21,27 +23,53 @@ impl HashMapWordClusterer {
.quoting(false)
.has_headers(false)
.from_reader(reader);
let mut values = HashMap::<String, String>::new();
// This flag is switched to false as soon as a record is found which cannot
// be converted to a u16
let mut u16_casting_ok = true;
let mut u16_values = HashMap::new();
let mut str_values = HashMap::new();
for record in csv_reader.records() {
let elements = record?;
values.insert(elements[0].to_string(), elements[1].to_string());
}

Ok(Self { values })
}
}

impl FromIterator<(String, String)> for HashMapWordClusterer {
fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
Self {
values: HashMap::from_iter(iter),
let hashed_key = hash_str_to_i32(elements[0].as_ref());
// Casting into u16 is attempted only when all previous clusters were converted
// successfully
if u16_casting_ok {
match u16::from_str(elements[1].as_ref()) {
Ok(u16_value) => {
u16_values.insert(hashed_key, u16_value);
}
Err(_) => {
// A word cluster cannot be converted into a u16, let's move all the
// previously stored clusters into a raw string representation
for (hash, value) in u16_values.iter() {
str_values.insert(*hash, format!("{}", value));
}
str_values.insert(hashed_key, elements[1].to_string());
u16_casting_ok = false;
u16_values.clear();
}
}
} else {
str_values.insert(hashed_key, elements[1].to_string());
}
}
Ok(Self {
values: if u16_casting_ok {
Either::Left(u16_values)
} else {
Either::Right(str_values)
},
})
}
}

impl WordClusterer for HashMapWordClusterer {
fn get_cluster(&self, word: &str) -> Option<String> {
self.values.get(word).map(|v| v.to_string())
let hashed_key = hash_str_to_i32(word);
match &self.values {
Either::Left(u16_values) => u16_values.get(&hashed_key).map(|v| format!("{}", v)),
Either::Right(str_values) => str_values.get(&hashed_key).cloned(),
}
}
}

@@ -56,12 +84,12 @@ mod tests {
use super::*;

#[test]
fn test_hashmap_word_clusterer() {
fn test_hashmap_word_clusterer_with_non_u16_values() {
// Given
let clusters: &[u8] = r#"
hello 1111111111111
world 1111110111111
"yolo 1111100111111
hello 42
world 123
"yolo cluster_which_is_not_u16
"#
.as_ref();

@@ -71,18 +99,33 @@
// Then
assert!(clusterer.is_ok());
let clusterer = clusterer.unwrap();
assert_eq!(
clusterer.get_cluster("hello"),
Some("1111111111111".to_string())
);
assert_eq!(
clusterer.get_cluster("world"),
Some("1111110111111".to_string())
);
assert_eq!(
clusterer.get_cluster("\"yolo"),
Some("1111100111111".to_string())
);
assert!(clusterer.values.is_right());
assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string()));
assert_eq!(clusterer.get_cluster("world"), Some("123".to_string()));
assert_eq!(clusterer.get_cluster("\"yolo"), Some("cluster_which_is_not_u16".to_string()));
assert_eq!(clusterer.get_cluster("unknown"), None);
}

#[test]
fn test_hashmap_word_clusterer_with_u16_values() {
// Given
let clusters: &[u8] = r#"
hello 42
world 123
yolo 65500
"#
.as_ref();

// When
let clusterer = HashMapWordClusterer::from_reader(clusters);

// Then
assert!(clusterer.is_ok());
let clusterer = clusterer.unwrap();
assert!(clusterer.values.is_left());
assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string()));
assert_eq!(clusterer.get_cluster("world"), Some("123".to_string()));
assert_eq!(clusterer.get_cluster("yolo"), Some("65500".to_string()));
assert_eq!(clusterer.get_cluster("unknown"), None);
}
}
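
The clusterer adds a second trick on top of key hashing: cluster values are kept as u16 (2 bytes each) as long as every value in the file parses as one, and the map falls back to Strings on the first value that does not, which is exactly what the two tests above assert via is_left/is_right. A self-contained sketch of that fallback, using a local enum where the real code uses itertools::Either:

```rust
use std::collections::HashMap;

// Local stand-in for itertools::Either, to keep the sketch dependency-free.
enum Clusters {
    Compact(HashMap<i32, u16>), // 2 bytes per cluster value
    Raw(HashMap<i32, String>),  // fallback for non-numeric clusters
}

// Keep every cluster value as a u16 if possible; on the first value
// that does not parse, downgrade everything to the String representation.
fn build(pairs: &[(i32, &str)]) -> Clusters {
    let mut compact = HashMap::new();
    for (i, (key, value)) in pairs.iter().enumerate() {
        match value.parse::<u16>() {
            Ok(v) => {
                compact.insert(*key, v);
            }
            Err(_) => {
                // Move the already-parsed values back to Strings...
                let mut raw: HashMap<i32, String> = compact
                    .iter()
                    .map(|(k, v)| (*k, v.to_string()))
                    .collect();
                // ...and store the rest without further parsing attempts.
                for (k, v) in &pairs[i..] {
                    raw.insert(*k, (*v).to_string());
                }
                return Clusters::Raw(raw);
            }
        }
    }
    Clusters::Compact(compact)
}

fn main() {
    match build(&[(1, "42"), (2, "123")]) {
        Clusters::Compact(m) => assert_eq!(m[&1], 42),
        Clusters::Raw(_) => unreachable!(),
    }
    match build(&[(1, "42"), (2, "not_a_u16")]) {
        Clusters::Raw(m) => assert_eq!(m[&2], "not_a_u16"),
        Clusters::Compact(_) => unreachable!(),
    }
}
```
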
10 changes: 6 additions & 4 deletions src/slot_filler/features.rs
@@ -834,9 +834,11 @@ mod tests {
fn test_word_cluster_feature() {
// Given
let language = NluUtilsLanguage::EN;
let word_clusterer = HashMapWordClusterer::from_iter(
vec![("bird".to_string(), "010101".to_string())].into_iter(),
);
let clusters: &[u8] = r#"
bird 42
"#
.as_ref();
let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap();
let tokens = tokenize("I love this bird", language);
let feature = WordClusterFeature {
cluster_name: "test_clusters".to_string(),
@@ -849,7 +851,7 @@
.collect();

// Then
let expected_results = vec![None, None, None, Some("010101".to_string())];
let expected_results = vec![None, None, None, Some("42".to_string())];
assert_eq!(expected_results, results);
}
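
For completeness, what this updated test exercises: the feature emits, per token, the token's cluster if the clusterer knows it, and None otherwise. A stand-alone sketch of that observable behavior; the real WordClusterFeature also carries a cluster_name and plugs into the CRF feature-extraction machinery, which this sketch deliberately leaves out:

```rust
use std::collections::HashMap;

// Per-token cluster lookup, mirroring only the behavior the test checks.
fn word_cluster_feature(
    clusters: &HashMap<&str, String>,
    tokens: &[&str],
) -> Vec<Option<String>> {
    tokens
        .iter()
        .map(|token| clusters.get(*token).cloned())
        .collect()
}

fn main() {
    let mut clusters = HashMap::new();
    clusters.insert("bird", "42".to_string());
    let results = word_cluster_feature(&clusters, &["I", "love", "this", "bird"]);
    assert_eq!(results, vec![None, None, None, Some("42".to_string())]);
}
```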
}