-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbrown_corp_generator.py
88 lines (70 loc) · 2.36 KB
/
brown_corp_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# this file generates sample classification data from the brown corpus
# it saves the sentence(s) and genres into separate files
# edit the topics list below to select topics
import codecs, re
from nltk.corpus import brown
import pandas as pd
# ################## edit params here ##################
lower_lim = 8     # minimum words per sentence (exclusive bound; see loop below)
upper_lim = 100   # maximum words per sentence (exclusive bound)
max_ex = 1000     # maximum examples (clusters) per genre
max_clusters = 5  # sentences per example

# Make sure the output directory exists before opening files for writing
# (FIX: the original crashed with FileNotFoundError when 'datasets/' was absent).
import os
os.makedirs('datasets', exist_ok=True)

# Output files: one cluster of sentences per line, with the matching genre
# label on the same line number of the topics file. The handles stay open
# for the duration of the script and are written to by the main loop.
# FIX: use the built-in open() with an encoding argument instead of the
# legacy codecs.open() — recommended since Python 3.
sents = open('datasets/brown_sents.txt', 'w', encoding='utf-8')
classes = open('datasets/brown_topics.txt', 'w', encoding='utf-8')

# Genres to extract; edit freely from the list documented below.
topics = ['religion', 'government', 'romance', 'news', 'science_fiction']
'''
choose some from the following genres:
adventure
belles_lettres
editorial
fiction
government
hobbies
humor
learned
lore
mystery
news
religion
reviews
romance
science_fiction
'''
# Characters stripped from every sentence before writing.
striplist = ["`", "'", '!', '?', '.', ',', ':', ';', '-', '(', ')', ]
counts_list = []  # per-topic cluster counts, summarized at the end
csv_sents = []    # accumulated documents for the CSV export
csv_labels = []   # accumulated labels for the CSV export
for topic in topics:
    good_count = 0      # completed clusters written for this topic
    cluster_sents = []  # cleaned sentences accumulated toward the current cluster
    for sentence in brown.sents(categories=[topic]):
        # Keep only sentences within the configured length band
        # (both bounds exclusive, as in the original comparison).
        if not (lower_lim < len(sentence) < upper_lim):
            continue
        this_string = ' '.join(sentence).lower()  # lowercase
        for punct in striplist:
            this_string = this_string.replace(punct, '')  # remove punctuation etc
        this_string = re.sub(r'\d', '#', this_string)      # sub # for digits
        this_string = re.sub(r'[\s]+', ' ', this_string)   # collapse whitespace
        # FIX: the original concatenated sentences with no separator
        # (fusing words across sentence boundaries) and silently dropped
        # every (max_clusters + 1)-th qualifying sentence when flushing.
        # Collect sentences in a list and join with spaces instead.
        cluster_sents.append(this_string)
        if len(cluster_sents) == max_clusters:
            cluster = ' '.join(cluster_sents)
            good_count += 1
            sents.write(cluster)
            sents.write('\n')
            csv_sents.append(cluster)
            classes.write(topic)
            classes.write('\n')
            csv_labels.append(topic)
            cluster_sents = []
            # FIX: original used `> max_ex`, which emitted max_ex + 1 clusters.
            # NOTE(review): the scraped source lost indentation, so the cap's
            # original placement is ambiguous; inside the sentence loop is the
            # only placement where a per-genre cap actually takes effect.
            if good_count >= max_ex:
                break
    print(good_count, "sentence (clusters) for", topic)
    counts_list.append(good_count)
# All topics processed — close the text outputs so buffered writes are
# flushed to disk (FIX: the original never closed these handles).
sents.close()
classes.close()
# Export the paired (document, topic) rows as a tab-separated CSV for
# convenient loading with pandas.
df = pd.DataFrame({'document': csv_sents, 'topic': csv_labels})
df.to_csv('datasets/brown.csv', sep='\t')
# Summary: total clusters written, followed by the per-topic breakdown.
print(sum(counts_list), counts_list)