Skip to content
This repository has been archived by the owner on Mar 9, 2022. It is now read-only.

Upgrade lindera 0.6.0 and yada 0.4.0 #9

Merged
merged 2 commits into from
Oct 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## Unreleased
- Upgrade to lindera 0.6.0 & yada 0.4.0 #9 @johtani
- Update to 2018 edition #5 @johtani

## 0.1.2 (2020-05-22)
Expand Down
54 changes: 18 additions & 36 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 9 additions & 8 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ categories = ["text-processing"]
license = "MIT"

[dependencies]
bincode = "1.2.1"
byteorder = "1.3.4"
clap = "2.33.0"
csv = "1.1.3"
encoding = "0.2.33"
glob = "0.3.0"
lindera-core = "0.4.0"
lindera-fst = "0.1.0"
bincode = "1.2"
byteorder = "1.3"
clap = "2.33"
csv = "1.1"
encoding = "0.2"
glob = "0.3"
yada = "0.4"

lindera-core = "0.6"

[[bin]]
name = "lindera-ko-dic"
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ tag:
git push origin v$(LINDERA_KO_DIC_BUILDER_VERSION)

publish:
ifeq ($(shell cargo show --json lindera-ko-dic-builder | jq -r '.versions[].num' | grep $(LINDERA_KO_DIC_BUILDER_VERSION)),)
ifeq ($(shell curl -s -XGET https://crates.io/api/v1/crates/lindera-ko-dic-builder | jq -r '.versions[].num' | grep $(LINDERA_KO_DIC_BUILDER_VERSION)),)
cargo package && cargo publish
endif
29 changes: 18 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ use csv::StringRecord;
use encoding::all::UTF_16LE;
use encoding::{DecoderTrap, Encoding};
use glob::glob;
use yada::builder::DoubleArrayBuilder;

use lindera_core::core::character_definition::{
CategoryData, CategoryId, CharacterDefinitions, LookupTable,
};
use lindera_core::core::unknown_dictionary::UnknownDictionary;
use lindera_core::core::word_entry::{WordEntry, WordId};
use lindera_fst::MapBuilder;

#[derive(Debug)]
pub enum ParsingError {
Expand Down Expand Up @@ -88,8 +89,8 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
println!("sorting entries");
rows.sort_by_key(|row| row[0].to_string().clone());

let wtr_fst = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.fst"))).unwrap(),
let mut wtr_da = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.da"))).unwrap(),
);
let mut wtr_vals = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.vals"))).unwrap(),
Expand All @@ -102,7 +103,7 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
.entry(row[0].to_string())
.or_insert_with(Vec::new)
.push(WordEntry {
word_id: WordId(row_id as u32),
word_id: WordId(row_id as u32, true),
word_cost: i16::from_str(row[3].trim()).unwrap(),
cost_id: u16::from_str(row[1].trim()).unwrap(),
});
Expand Down Expand Up @@ -135,17 +136,23 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
wtr_words.flush()?;
wtr_words_idx.flush()?;

let mut id = 0u64;
let mut id = 0u32;

println!("building fst");
let mut fst_build = MapBuilder::new(wtr_fst).unwrap();
println!("building da");
let mut keyset: Vec<(&[u8], u32)> = vec![];
let mut lastlen = 0;
for (key, word_entries) in &word_entry_map {
let len = word_entries.len() as u64;
let len = word_entries.len() as u32;
let val = (id << 5) | len;
fst_build.insert(&key, val).unwrap();
keyset.push((key.as_bytes(), val));
id += len;
lastlen += len;
}
fst_build.finish().unwrap();
let da_bytes = DoubleArrayBuilder::build(&keyset);
assert!(da_bytes.is_some(), "DoubleArray build error. ");
wtr_da.write_all(&da_bytes.unwrap()[..])?;

println!("Last len is {}", lastlen);

println!("building values");
for word_entries in word_entry_map.values() {
Expand Down Expand Up @@ -376,7 +383,7 @@ fn make_costs_array(entries: &[DictionaryEntry]) -> Vec<WordEntry> {
// in `unk.def` are not the same.
//assert_eq!(e.left_id, e.right_id);
WordEntry {
word_id: WordId(std::u32::MAX),
word_id: WordId(std::u32::MAX, true),
cost_id: e.left_id as u16,
word_cost: e.word_cost as i16,
}
Expand Down