Here, we explain how to generate tokenized data for pretraining.
The following command will generate `pretrain_sentence_wiki.txt` under `../data/pretraining`.
```bash
cd ./compare-ja-tokenizer/preprocessing_for_pretraining
python src/make_data_wiki.py \
    --tokenizr_path ../data/dict/mecab_bpe.json \
    --pretokenizer_type mecab
```
The following command will generate `pretrain_sentence_cc100.txt` under `../data/pretraining`.
```bash
python src/make_data_cc100.py \
    --tokenizr_path ../data/dict/mecab_bpe.json \
    --pretokenizer_type mecab
```
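If you want to sanity-check the dictionary file passed via `--tokenizr_path` before generating data, a minimal sketch along these lines should work, assuming the JSON is a Hugging Face `tokenizers` file (the sample sentence is made up):

```python
from tokenizers import Tokenizer

# Assumption: ../data/dict/mecab_bpe.json is a Hugging Face `tokenizers` JSON file.
tok = Tokenizer.from_file("../data/dict/mecab_bpe.json")

# Encode an arbitrary sample sentence and inspect the subword tokens and IDs.
enc = tok.encode("日本語のトークナイザを比較します。")
print(enc.tokens)
print(enc.ids)
```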
- Pre-tokenize texts with MeCab (takes time); a Python sketch of the same step follows the commands below.
```bash
# Wikipedia
cat ../data/pretraining/pretrain_sentence_wiki.txt | mecab --input-buffer-size=1048576 -Owakati > ../data/pretraining/mecab/pretrain_sentence_wiki.txt
# CC-100
cat ../data/pretraining/pretrain_sentence_cc100.txt | mecab -Owakati > ../data/pretraining/mecab/pretrain_sentence_cc100.txt
```
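The same segmentation can also be reproduced from Python with `fugashi` (a MeCab wrapper). This is only an illustrative sketch; the dictionary bundled with `fugashi` may differ from the one your `mecab` command uses, so the segmentation can differ slightly.

```python
from fugashi import Tagger

# Assumption: fugashi and a dictionary (e.g. unidic-lite) are installed.
tagger = Tagger()

def wakati(line: str) -> str:
    # Join surface forms with spaces, mirroring `mecab -Owakati`.
    return " ".join(word.surface for word in tagger(line))

print(wakati("すもももももももものうち"))  # e.g. "すもも も もも も もも の うち"
```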
- Generate tokenized dataset (see the sketch after the commands).
```bash
# BPE
bash ./scripts/mecab/bpe.sh
# WordPiece
bash ./scripts/mecab/wordpiece.sh
# Unigram
bash ./scripts/mecab/unigram.sh
```
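The scripts above encapsulate the actual dataset generation. Conceptually, the step amounts to something like the following sketch, which encodes each pre-tokenized line with the corresponding subword tokenizer; the output file name and format here are assumptions, not necessarily what the scripts produce.

```python
from tokenizers import Tokenizer

# Assumptions: the dictionary is a Hugging Face `tokenizers` JSON file and the
# output is one line of space-separated token IDs per input sentence.
tok = Tokenizer.from_file("../data/dict/mecab_bpe.json")

with open("../data/pretraining/mecab/pretrain_sentence_wiki.txt") as fin, \
        open("tokenized_wiki_mecab_bpe.txt", "w") as fout:  # hypothetical output name
    for line in fin:
        ids = tok.encode(line.rstrip("\n")).ids
        fout.write(" ".join(map(str, ids)) + "\n")
```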
- Pre-tokenize texts with Juman++ (takes time); a Python sketch of the same step follows the commands below.
```bash
# Wikipedia
jumanpp ../data/pretraining/pretrain_sentence_wiki.txt --segment > ../data/pretraining/juman++/pretrain_sentence_wiki.txt
# CC-100
jumanpp ../data/pretraining/pretrain_sentence_cc100.txt --segment > ../data/pretraining/juman++/pretrain_sentence_cc100.txt
```
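If you prefer to drive Juman++ from Python, `pyknp` offers a wrapper. This is only a sketch and assumes the `jumanpp` binary is on your PATH.

```python
from pyknp import Juman

# Assumption: pyknp is installed and uses the jumanpp binary found on PATH.
jumanpp = Juman()

def segment(line: str) -> str:
    # Join the surface forms (midasi) of all morphemes with spaces.
    return " ".join(m.midasi for m in jumanpp.analysis(line).mrph_list())

print(segment("すもももももももものうち"))
```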
- Generate tokenized dataset
```bash
# BPE
bash ./scripts/juman++/bpe.sh
# WordPiece
bash ./scripts/juman++/wordpiece.sh
# Unigram
bash ./scripts/juman++/unigram.sh
```
- Pre-tokenize texts with Sudachi (takes time); a stand-in sketch of the script follows the commands below.
```bash
# Wikipedia
python src/sudachi/pretokenize_sudachi.py \
    --input_path ../data/pretraining/pretrain_sentence_wiki.txt \
    --output_path ../data/pretraining/sudachi/pretrain_sentence_wiki.txt
# CC-100
python src/sudachi/pretokenize_sudachi.py \
    --input_path ../data/pretraining/pretrain_sentence_cc100.txt \
    --output_path ../data/pretraining/sudachi/pretrain_sentence_cc100.txt
```
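`src/sudachi/pretokenize_sudachi.py` is part of this repository; for reference, a minimal stand-in built on `sudachipy` might look as follows. The split mode and dictionary are assumptions, so the repository script may behave differently.

```python
from sudachipy import dictionary, tokenizer

# Assumptions: sudachipy and sudachidict_core are installed; SplitMode.C is a guess.
tokenizer_obj = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C

def pretokenize(line: str) -> str:
    # Join morpheme surface forms with spaces.
    return " ".join(m.surface() for m in tokenizer_obj.tokenize(line, mode))

with open("../data/pretraining/pretrain_sentence_wiki.txt") as fin, \
        open("../data/pretraining/sudachi/pretrain_sentence_wiki.txt", "w") as fout:
    for line in fin:
        fout.write(pretokenize(line.rstrip("\n")) + "\n")
```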
- Generate tokenized dataset
```bash
# BPE
bash ./scripts/sudachi/bpe.sh
# WordPiece
bash ./scripts/sudachi/wordpiece.sh
# Unigram
bash ./scripts/sudachi/unigram.sh
```
- Pre-tokenize texts with Vaporetto (takes time); a Python equivalent of the `sed` post-processing follows the commands below.
```bash
# Wikipedia
cat ../data/pretraining/pretrain_sentence_wiki.txt | cargo run --release -p predict -- --model /path/to/bccwj-suw+unidic+tag.model.zst > ../data/pretraining/vaporetto/temp_pretrain_sentence_wiki.txt
sed -e 's/\\//g' ../data/pretraining/vaporetto/temp_pretrain_sentence_wiki.txt > ../data/pretraining/vaporetto/pretrain_sentence_wiki.txt
# CC-100
cat ../data/pretraining/pretrain_sentence_cc100.txt | cargo run --release -p predict -- --model /path/to/bccwj-suw+unidic+tag.model.zst > ../data/pretraining/vaporetto/temp_pretrain_sentence_cc100.txt
sed -e 's/\\//g' ../data/pretraining/vaporetto/temp_pretrain_sentence_cc100.txt > ../data/pretraining/vaporetto/pretrain_sentence_cc100.txt
```
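The `sed` call simply strips the backslash escapes in Vaporetto's output before the text is used as pre-tokenized input. The same post-processing step in Python:

```python
# Python equivalent of `sed -e 's/\\//g'`: remove every backslash from the
# temporary Vaporetto output and write the cleaned file.
with open("../data/pretraining/vaporetto/temp_pretrain_sentence_wiki.txt") as fin, \
        open("../data/pretraining/vaporetto/pretrain_sentence_wiki.txt", "w") as fout:
    for line in fin:
        fout.write(line.replace("\\", ""))
```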
- Generate tokenized dataset
```bash
# BPE
bash ./scripts/vaporetto/bpe.sh
# WordPiece
bash ./scripts/vaporetto/wordpiece.sh
# Unigram
bash ./scripts/vaporetto/unigram.sh
```
- Generate tokenized dataset without any pre-tokenization (the Nothing setting); a short illustration follows the commands below.
```bash
# BPE
bash ./scripts/Nothing/bpe.sh
# WordPiece
bash ./scripts/Nothing/wordpiece.sh
# Unigram
bash ./scripts/Nothing/unigram.sh
```
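In the Nothing setting, raw sentences go straight into the subword tokenizer with no morphological pre-tokenization. As a quick illustration (the dictionary file name below is hypothetical, following the naming pattern of `mecab_bpe.json`):

```python
from tokenizers import Tokenizer

# Hypothetical dictionary name for the no-pre-tokenizer setting; adjust to
# whatever file was actually produced under ../data/dict/.
tok = Tokenizer.from_file("../data/dict/nothing_bpe.json")

# The raw sentence is encoded directly, without any prior segmentation.
print(tok.encode("日本語のトークナイザを比較します。").tokens)
```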