Skip to content
This repository has been archived by the owner on Feb 15, 2023. It is now read-only.

Commit

Permalink
GH-22: update preprocess
Browse files: browse the repository at this point in the history
  • Loading branch information
rain1024 committed Dec 24, 2018
2 parents a9a81c3 + 6836b95 commit 1af7b6c
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions util/preprocess_vlsp2013/preprocess.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,12 +3,15 @@
from os.path import dirname

# Preprocess Train Data
FOLDER = "/data/projects/undertheseanlp/word_tokenize/data/vlsp2013"

FOLDER = "../../data/vlsp2013"


folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1"
folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2"
count = 0
output_filepath = "tmp/train_dev.txt"

if exists(output_filepath):
remove(output_filepath)
output = open(output_filepath, "a")
Expand Down Expand Up @@ -38,8 +41,11 @@
output.write(line)
count += 1
print("Number of sentences in Trainset-Segmentation-2 folder:", count)
<<<<<<< HEAD

# Train-dev split
=======
>>>>>>> 6836b95536410c9c8ac353e87d31bfd34d43209f

# Preprocess Test Data
count = 0
Expand All @@ -55,6 +61,6 @@
count += 1
content = " ".join(tokens) + "\n"
output.write(content)
print(count)
print("Number of sentences in Testset:", count)


Empty file.
File renamed without changes.

0 comments on commit 1af7b6c

Please sign in to comment.