preprocess.py
#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# file to edit: dev_nb/preprocess.ipynb
from fastai import *
from fastai.text import *

path = Path('../data')

def preprocess(data, bs=64):
    "Build and save a fastai language-model DataBunch for the chosen dataset."
    if data == 'gutenberg':
        # Plain-text files under ../data/gutenberg, one document per .txt file.
        data_lm = (TextList.from_folder(path/'gutenberg', extensions='.txt', recurse=False)
                   .split_by_rand_pct(0.1)
                   .label_for_lm()
                   .databunch(bs=bs))
        data_lm.save('gutenberg_tmp')
    elif data == 'lyrics':
        # Line-delimited JSON with one lyric per record in the 'text' field.
        df = pd.read_json(path/'lyrics.json', lines=True)
        df['text'] = df['text'].astype('str')
        data_lm = (TextList.from_df(df, cols='text', path=path)
                   .split_by_rand_pct(0.1)
                   .label_for_lm()
                   .databunch(bs=bs))
        data_lm.save('tmp_lyrics')
    elif data == 'poems':
        # Poems are separated by long runs of blank lines in a single text file.
        poems = (path/'poems.txt').open().read().split('\n\n\n\n\n\n\n')
        poems_df = pd.DataFrame(poems)
        tokenizer = Tokenizer(SpacyTokenizer, 'en')
        processor = [TokenizeProcessor(tokenizer=tokenizer),
                     NumericalizeProcessor(min_freq=1, max_vocab=60000)]
        data_lm = (TextList.from_df(poems_df, path, cols=0, processor=processor)
                   .split_by_rand_pct(0.1)
                   .label_for_lm()
                   .databunch(bs=bs))
        data_lm.save('poems_tmp')
    elif data == 'metaphors':
        df = pd.read_csv(path/'metaphors.csv')
        df['metaphor'] = df['metaphor'].astype('str')
        # Strip stray markup and quote characters from each metaphor.
        cleaned_metaphors = []
        for metaphor in df['metaphor']:
            metaphor = (metaphor.replace('/ ', '').replace('"', '').replace("'", '')
                        .replace('<i> ', '').replace(' </i>', ''))
            cleaned_metaphors.append(metaphor)
        df_met = pd.DataFrame(cleaned_metaphors)
        tokenizer = Tokenizer(SpacyTokenizer, 'en')
        processor = [TokenizeProcessor(tokenizer=tokenizer),
                     NumericalizeProcessor(min_freq=1, max_vocab=50000)]
        data_lm = (TextList.from_df(df_met, cols=0, path=path, processor=processor)
                   .split_by_rand_pct(0.1)
                   .label_for_lm()
                   .databunch(bs=bs))
        data_lm.save('tmp_metaphors')
    else:
        print(f'data {data} not supported')
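
# --- Usage sketch (illustrative, not part of the notebook export) ---
# Assuming the raw datasets exist under ../data, a saved DataBunch can be
# rebuilt and reloaded for language-model training. The exact load call
# depends on the fastai v1 release: newer versions use load_data(), older
# ones TextLMDataBunch.load(); the learner settings below are assumptions,
# not something defined in this repo.
#
#   preprocess('lyrics', bs=64)
#   data_lm = load_data(path, 'tmp_lyrics', bs=64)   # fastai >= 1.0.45
#   learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)
#   learn.fit_one_cycle(1, 1e-2)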