preprocessing and better iters #111

Open · wants to merge 24 commits into base: master
6 changes: 6 additions & 0 deletions tests/data/corpora/sentencise/text.txt
@@ -0,0 +1,6 @@
3.14 is a number, not some B.S. You know M.D. Bob is my friend.
I've got my Ph.D. in 2014. I have Ph.D. I got it in 2014.

А по-русски слабо? Что делать с гос. служащими?
富士山が見える。こんにちは

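These fixture sentences exercise abbreviation handling ("B.S.", "M.D.", "Ph.D.", the Russian "гос.") and the "。" delimiter in the streaming sentence splitter added in vecto/corpus/preprocess.py below. A minimal sketch of feeding one fixture line through it; the import path is an assumption based on the new file's location:

# Sketch only: assumes the new module is importable as vecto.corpus.preprocess.
from vecto.corpus.preprocess import sentence_iter, simple_char_iter

text = "3.14 is a number, not some B.S. You know M.D. Bob is my friend."
for sent in sentence_iter(simple_char_iter(text)):
    print(sent)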
2 changes: 1 addition & 1 deletion vecto/_version.py
@@ -1,3 +1,3 @@
"""Version of vecto package."""

VERSION = "0.2.16"
VERSION = "0.2.21"
12 changes: 7 additions & 5 deletions vecto/corpus/corpus.py
@@ -6,8 +6,8 @@
from vecto.utils.data import get_uncompressed_size
from vecto.utils.metadata import WithMetaData

from .iterators import (DirIterator, FileIterator, FileLineIterator,
LoopedLineIterator, SequenceIterator,
from .iterators import (CharIterator, DirIterator, FileIterator,
FileLineIterator, LoopedLineIterator, SequenceIterator,
SlidingWindowIterator, TokenIterator,
TokenizedSequenceIterator, ViewLineIterator)
from .tokenization import (DEFAULT_JAP_TOKENIZER, DEFAULT_SENT_TOKENIZER,
@@ -51,7 +51,7 @@ def get_token_iterator(self, tokenizer=None, verbose=False):
return TokenIterator(self.get_sentence_iterator(tokenizer, verbose))

def get_character_iterator(self, verbose=False):
return TokenIterator(self.get_line_iterator(verbose))
return CharIterator(self.get_line_iterator(verbose))

def get_sentence_iterator(self, tokenizer=None, verbose=False):
if tokenizer is None:
@@ -68,10 +68,12 @@ def get_sequence_iterator(self, sequence_length, tokenizer):
sequence_length=sequence_length,
tokenizer=tokenizer)

def get_looped_sequence_iterator(self, sequence_length, tokenizer, rank, size):
def get_looped_sequence_iterator(self, sequence_length, tokenizer, rank, size, min_length=0, reset_on_new_line=False):
return SequenceIterator(self.get_looped_line_iterator(rank, size),
sequence_length=sequence_length,
tokenizer=tokenizer)
tokenizer=tokenizer,
minimal_length=min_length,
reset_on_new_line=reset_on_new_line)


class Corpus(BaseCorpus):
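For context, a rough usage sketch of the extended get_looped_sequence_iterator; the fixture path and the plain str.split stand-in tokenizer are assumptions, and the snippet is untested against this branch:

# Sketch only: mirrors the Corpus(...) / load_dir_strucute() pattern used in preprocess.py below.
from vecto.corpus import Corpus

corpus = Corpus("./tests/data/corpora/sentencise")
corpus.load_dir_strucute()
it = corpus.get_looped_sequence_iterator(
    sequence_length=128,
    tokenizer=str.split,      # any callable mapping a line to a list of tokens
    rank=0, size=1,           # single-worker sharding
    min_length=5,             # discard buffered tails shorter than 5 tokens
    reset_on_new_line=False)  # keep accumulating tokens across line boundaries
for i, seq in enumerate(it):
    print(len(seq))
    if i >= 2:                # the looped iterator is expected to cycle, so stop early
        break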
33 changes: 25 additions & 8 deletions vecto/corpus/iterators.py
@@ -35,11 +35,14 @@ def __init__(self, dirname, verbose=0):
self.dirname = dirname

def _generate_samples(self):
for root, _, files in os.walk(self.dirname, followlinks=True):
for good_fname in sorted(fnmatch.filter(files, "*")):
full_file_path = os.path.join(root, good_fname)
logger.info("processing " + full_file_path)
yield full_file_path
if os.path.isfile(self.dirname):
yield self.dirname
else:
for root, _, files in os.walk(self.dirname, followlinks=True):
for good_fname in sorted(fnmatch.filter(files, "*")):
full_file_path = os.path.join(root, good_fname)
logger.info("processing " + full_file_path)
yield full_file_path


class FileLineIterator(BaseIterator):
@@ -110,6 +113,7 @@ def _generate_samples(self):
filename = self.tree[self.id_file][0]
file_in = detect_archive_format_and_open(filename)
seek_unicode(file_in, self.start_offset)
file_in.readline()  # skip the first, possibly partial, line after seeking to start_offset
while True:
for line in file_in:
line = line.strip()
@@ -150,20 +154,26 @@ def _generate_samples(self):


class SequenceIterator(BaseIterator):
def __init__(self, line_terator, sequence_length, tokenizer):
def __init__(self, line_terator, sequence_length, tokenizer, minimal_length=0, reset_on_new_line=False):
super().__init__()
self.line_iterator = line_terator
self.sequence_length = sequence_length
self.tokenizer = tokenizer
self.buffer = []
self.minimal_length = minimal_length
self.reset_on_new_line = reset_on_new_line

def _generate_samples(self):
# TODO: consider removing too-small chunks of sentences at the end
# TODO: consider leveraging the sentence iterator if the corpus has mark-up
for line in self.line_iterator:
tokens = self.tokenizer(line)
if self.reset_on_new_line:
self.buffer = []
elif len(self.buffer) < self.minimal_length:
self.buffer = []
self.buffer += tokens
while len(self.buffer) > self.sequence_length:
while len(self.buffer) > self.sequence_length - self.minimal_length:
s = self.buffer[: self.sequence_length]
self.buffer = self.buffer[self.sequence_length:]
yield s
@@ -184,13 +194,20 @@ def __init__(self, parent_iterator, verbose=0):


class TokenIterator(BaseNestedIterator):

def _generate_samples(self):
for tokenized_str in self.parent_iterator:
for token in tokenized_str:
yield token


class CharIterator(BaseNestedIterator):
def _generate_samples(self):
for line in self.parent_iterator:
for c in line:
yield c
yield " "


def iter_sliding_window(seq, left_ctx_size, right_ctx_size):
for i, current in enumerate(seq):
ctx = []
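To illustrate the new SequenceIterator options, here is a self-contained sketch of the same buffering rule in plain Python (not the vecto class itself): tokens accumulate across lines into fixed-length chunks; when reset_on_new_line is set the buffer is cleared on every new line, and otherwise a leftover tail shorter than minimal_length is discarded before the next line's tokens are appended.

# Standalone sketch of the chunking behaviour above, using str.split as the tokenizer.
def sequence_chunks(lines, sequence_length, minimal_length=0, reset_on_new_line=False):
    buffer = []
    for line in lines:
        tokens = line.split()
        if reset_on_new_line or len(buffer) < minimal_length:
            buffer = []
        buffer += tokens
        while len(buffer) > sequence_length - minimal_length:
            yield buffer[:sequence_length]
            buffer = buffer[sequence_length:]

print(list(sequence_chunks(["a b c d e", "f g h"], sequence_length=4, minimal_length=1)))
# [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h']]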
142 changes: 142 additions & 0 deletions vecto/corpus/preprocess.py
@@ -0,0 +1,142 @@
# import spacy
# import numpy as np
# from nltk.tokenize import sent_tokenize
# import nltk
import json
import random
import sys

from transformers import AutoTokenizer
from vecto.corpus import Corpus


def simple_char_iter(text):
for c in text:
yield c


# def sentencize(text):
# nlp = spacy.load("en_core_web_sm")
# doc = nlp(text)
# sents = [sent.text for sent in doc.sents]
# return [s for s in sentence_iter(char_iter(text))]
# sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# return sent_detector.tokenize(text)


# TODO: streaming sentence splitter:
# ingest character by character,
# append to the current sentence unless a sentence end is detected
other_delimiters = {"?", "!", "。"}

known_abbreviations = {"md", "bs", "mr", "ms"}


def is_abbreviation(token):
if "." in token:
return True
if len(token) == 1:
return True
if token.lower() in known_abbreviations:
return True
return False


def sentence_iter(char_iter):
size_buffer = 10000
buffer = [" "] * size_buffer
pos = 0
prev_char = ""
prev_token = ""
for c in char_iter:
is_sentence_end = False
if c == " " and prev_char == ".":
# print(prev_token)
if not is_abbreviation(prev_token[:-1]):
is_sentence_end = True
if prev_char in other_delimiters and c != "\"":
is_sentence_end = True
#buffer[pos] = c
#pos += 1
if is_sentence_end:
if pos > 0:
yield "".join(buffer[: pos]).strip()
buffer = [" "] * size_buffer
pos = 0
continue
prev_char = c
if pos >= len(buffer):
print("buffer overflow:")
# print("".join(buffer[:100]))
print("".join(buffer[-100:]))
pos = 0
buffer[pos] = c
prev_token += c
if c == " ":
prev_token = ""
pos += 1
if pos > 0:
yield "".join(buffer[: pos])


def main():
# samples = []
# samples.append("Hey how do you do? M.D. Bob is my friend. Mr. John too.")
# samples.append("А по-русски слабо? Что делать с гос. служащими?")
# samples.append("富士山が見える。こんにちは")
# for s in samples:
# tokenized = sentencize(s)
# print(tokenized)
# path = "./tests/data/corpora/sentencise"
path = sys.argv[1]
# path = "/mnt/storage/Data/NLP/corpora/wiki_clean.txt"
# path = "/mnt/storage/Data/NLP/corpora/toronto_clean.txt"
# path = "./quotes/13th_Reality-1.txt"
name_tokenizer = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(name_tokenizer)
corpus = Corpus(path)
corpus.load_dir_strucute()
char_iter = corpus.get_character_iterator()
sent_iter = sentence_iter(char_iter)
# cnt = 0
sample = [tokenizer.cls_token_id]
max_length = 128
cnt = 0
proba_shortening = 0.1
with open("lines.jsonl", "w") as f_out:
for line in sent_iter:
tokens = tokenizer(line,
add_special_tokens=False,
return_attention_mask=False,)["input_ids"]
sample += tokens
if len(sample) > max_length - 10:
sample = sample[:max_length - 1]
min_length = 5
if random.random() < proba_shortening:
sample = sample[: random.randint(min_length, len(sample))]
sample += [tokenizer.sep_token_id]
sample += [tokenizer.pad_token_id] * (max_length - len(sample))
# print(len(sample))
serialized = json.dumps(sample)
if ":" in serialized:
print(sample)
print(serialized)
f_out.write(serialized)
f_out.write("\n")
#print(tokenizer.decode(sample))
#print(len(sample))
#print()
sample = [tokenizer.cls_token_id]
cnt += 1
if cnt % 10000 == 0:
print(cnt, "last line", len(tokens))
# print(tokenizer.convert_ids_to_tokens(tokens))
# print(line)
# print()
# if cnt > 100:
# break
# cnt += 1


if __name__ == "__main__":
main()
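As a usage note: the script takes a single positional argument (a corpus file or directory) and writes one padded roberta-base sample per line to lines.jsonl in the working directory; as written, each line should hold exactly max_length (128) token ids. A small sketch of reading that output back:

# Sketch only: consumes the lines.jsonl produced by main() above.
import json

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
with open("lines.jsonl") as f_in:
    for line in f_in:
        ids = json.loads(line)
        print(len(ids), tokenizer.decode(ids)[:80])  # 128, then a prefix of the decoded text
        break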