Skip to content
This repository has been archived by the owner on Mar 9, 2022. It is now read-only.

Commit

Permalink
Upgrade lindera 0.6.0 and yada 0.4.0 (#9)
Browse files: browse the repository at this point in the history
* Upgrade lindera 0.6.0 and yada 0.4.0

Closes #7

* Upgrade lindera 0.6.0 and yada 0.4.0

Closes #7
  • Loading branch information
johtani authored Oct 12, 2020
1 parent b5d9271 commit 529338f
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 56 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## Unreleased
- Upgrade lindera 0.6.0 & yada 0.4.0 #9 @johtani
- Update 2018 edition #5 @johtani

## 0.1.2 (2020-05-22)
Expand Down
54 changes: 18 additions & 36 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 9 additions & 8 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ categories = ["text-processing"]
license = "MIT"

[dependencies]
bincode = "1.2.1"
byteorder = "1.3.4"
clap = "2.33.0"
csv = "1.1.3"
encoding = "0.2.33"
glob = "0.3.0"
lindera-core = "0.4.0"
lindera-fst = "0.1.0"
bincode = "1.2"
byteorder = "1.3"
clap = "2.33"
csv = "1.1"
encoding = "0.2"
glob = "0.3"
yada = "0.4"

lindera-core = "0.6"

[[bin]]
name = "lindera-ko-dic"
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ tag:
git push origin v$(LINDERA_KO_DIC_BUILDER_VERSION)

publish:
ifeq ($(shell cargo show --json lindera-ko-dic-builder | jq -r '.versions[].num' | grep $(LINDERA_KO_DIC_BUILDER_VERSION)),)
ifeq ($(shell curl -s -XGET https://crates.io/api/v1/crates/lindera-ko-dic-builder | jq -r '.versions[].num' | grep $(LINDERA_KO_DIC_BUILDER_VERSION)),)
cargo package && cargo publish
endif
29 changes: 18 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ use csv::StringRecord;
use encoding::all::UTF_16LE;
use encoding::{DecoderTrap, Encoding};
use glob::glob;
use yada::builder::DoubleArrayBuilder;

use lindera_core::core::character_definition::{
CategoryData, CategoryId, CharacterDefinitions, LookupTable,
};
use lindera_core::core::unknown_dictionary::UnknownDictionary;
use lindera_core::core::word_entry::{WordEntry, WordId};
use lindera_fst::MapBuilder;

#[derive(Debug)]
pub enum ParsingError {
Expand Down Expand Up @@ -88,8 +89,8 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
println!("sorting entries");
rows.sort_by_key(|row| row[0].to_string().clone());

let wtr_fst = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.fst"))).unwrap(),
let mut wtr_da = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.da"))).unwrap(),
);
let mut wtr_vals = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.vals"))).unwrap(),
Expand All @@ -102,7 +103,7 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
.entry(row[0].to_string())
.or_insert_with(Vec::new)
.push(WordEntry {
word_id: WordId(row_id as u32),
word_id: WordId(row_id as u32, true),
word_cost: i16::from_str(row[3].trim()).unwrap(),
cost_id: u16::from_str(row[1].trim()).unwrap(),
});
Expand Down Expand Up @@ -135,17 +136,23 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
wtr_words.flush()?;
wtr_words_idx.flush()?;

let mut id = 0u64;
let mut id = 0u32;

println!("building fst");
let mut fst_build = MapBuilder::new(wtr_fst).unwrap();
println!("building da");
let mut keyset: Vec<(&[u8], u32)> = vec![];
let mut lastlen = 0;
for (key, word_entries) in &word_entry_map {
let len = word_entries.len() as u64;
let len = word_entries.len() as u32;
let val = (id << 5) | len;
fst_build.insert(&key, val).unwrap();
keyset.push((key.as_bytes(), val));
id += len;
lastlen += len;
}
fst_build.finish().unwrap();
let da_bytes = DoubleArrayBuilder::build(&keyset);
assert!(da_bytes.is_some(), "DoubleArray build error. ");
wtr_da.write_all(&da_bytes.unwrap()[..])?;

println!("Last len is {}", lastlen);

println!("building values");
for word_entries in word_entry_map.values() {
Expand Down Expand Up @@ -376,7 +383,7 @@ fn make_costs_array(entries: &[DictionaryEntry]) -> Vec<WordEntry> {
// in `unk.def` are not the same.
//assert_eq!(e.left_id, e.right_id);
WordEntry {
word_id: WordId(std::u32::MAX),
word_id: WordId(std::u32::MAX, true),
cost_id: e.left_id as u16,
word_cost: e.word_cost as i16,
}
Expand Down

0 comments on commit 529338f

Please sign in to comment.