-
Notifications
You must be signed in to change notification settings - Fork 0
/
handle_tf-idf.py
142 lines (123 loc) · 3.96 KB
/
handle_tf-idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from sklearn.feature_extraction.text import TfidfVectorizer
from openpyxl import load_workbook
from pathlib import Path
import pandas as pd
# check headdata
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)
# Working directory captured once at import time; the workbook loaders below
# build their data-file paths relative to it.
gwd = Path.cwd()
def get_kr_priciple():
    """Build a lookup table from the second sheet of ``krdata/krdata.xlsx``.

    Every row from 1 through the sheet's ``max_row`` contributes one entry:
    the value in column 1 is the key and the value in column 3 is the mapped
    value.  When a key repeats, the later row wins.

    Returns:
        dict: column-1 value -> column-3 value.
    """
    workbook = load_workbook(gwd / "krdata" / "krdata.xlsx")
    sheet = workbook.worksheets[1]
    return {
        sheet.cell(row=row_no, column=1).value: sheet.cell(row=row_no, column=3).value
        for row_no in range(1, sheet.max_row + 1)
    }
def get_tw_priciple():
    """Build a lookup table from the second sheet of ``twdata/twdata.xlsx``.

    Every row from 1 through the sheet's ``max_row`` contributes one entry:
    the value in column 1 is the key and the value in column 3 is the mapped
    value.  When a key repeats, the later row wins.

    Returns:
        dict: column-1 value -> column-3 value.
    """
    workbook = load_workbook(gwd / "twdata" / "twdata.xlsx")
    sheet = workbook.worksheets[1]
    return {
        sheet.cell(row=row_no, column=1).value: sheet.cell(row=row_no, column=3).value
        for row_no in range(1, sheet.max_row + 1)
    }
def handle_tw_predata():
    """Read the TW inputs from two workbooks under ``twdata``.

    * ``tw_participle.xlsx`` (active sheet): column 1 of every row, starting
      at row 1, is collected as the first returned list.
    * ``twdata.xlsx`` (active sheet): column 6 of every row from row 3 to the
      last row is collected as the second returned list.

    Returns:
        tuple[list, list]: ``(participle_values, index_values)``.
    """
    data_path = gwd / "twdata" / "twdata.xlsx"
    participle_path = gwd / "twdata" / "tw_participle.xlsx"
    data_book = load_workbook(data_path)
    participle_book = load_workbook(participle_path)
    participle_sheet = participle_book.active
    data_sheet = data_book.active

    # Rows 1 and 2 of the data sheet are skipped; reading starts at row 3.
    index_values = [
        data_sheet.cell(row=row_no, column=6).value
        for row_no in range(3, data_sheet.max_row + 1)
    ]
    participle_values = [
        participle_sheet.cell(row=row_no, column=1).value
        for row_no in range(1, participle_sheet.max_row + 1)
    ]

    data_book.close()
    participle_book.close()
    return participle_values, index_values
def handle_kr_predata():
    """Read the KR inputs from the active sheet of ``krdata/krdata.xlsx``.

    For every row from 1 to the last row, column 9 is collected into the
    first returned list and column 7 into the second.

    Returns:
        tuple[list, list]: ``(column_9_values, column_7_values)``.
    """
    workbook = load_workbook(gwd / "krdata" / "krdata.xlsx")
    sheet = workbook.active
    texts = []
    labels = []
    for row_no in range(1, sheet.max_row + 1):
        text = sheet.cell(row=row_no, column=9).value
        label = sheet.cell(row=row_no, column=7).value
        labels.append(label)
        texts.append(text)
    workbook.close()
    return texts, labels
def handle_jp_predata():
    """Read the JP inputs from the first sheet of ``jpdata.xlsx``.

    For every row from 3 to the last row, column 9 is collected into the
    first returned list and column 6 into the second.

    Returns:
        tuple[list, list]: ``(column_9_values, column_6_values)``.
    """
    workbook = load_workbook(gwd / "jpdata.xlsx")
    sheet = workbook.worksheets[0]
    # Rows 1 and 2 are skipped; reading starts at row 3.
    pairs = [
        (sheet.cell(row=row_no, column=9).value, sheet.cell(row=row_no, column=6).value)
        for row_no in range(3, sheet.max_row + 1)
    ]
    workbook.close()
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return texts, labels
def handle_tf_idf(predata, index_data):
    """Compute a TF-IDF table for ``predata``.

    A ``TfidfVectorizer`` with ``norm=None`` is fitted on ``predata``; the
    resulting score matrix is transposed and densified so that each row is a
    feature name and each column is labelled by the matching entry of
    ``index_data``.

    Args:
        predata: iterable of documents passed to ``fit_transform``.
        index_data: column labels, one per document.

    Returns:
        pandas.DataFrame: features (rows) x documents (columns).
    """
    model = TfidfVectorizer(norm=None)
    scores = model.fit_transform(predata)
    terms = model.get_feature_names_out()
    scores_by_term = scores.T.todense()
    return pd.DataFrame(scores_by_term, index=terms, columns=index_data)
def handle_sum(df):
    """Return the sum of each row of ``df``.

    The frame is transposed first and then summed column-wise, so the result
    is a Series indexed by the row labels of ``df``.
    """
    return df.transpose().sum()
def handle_mean(df):
    """Return the mean of each row of ``df``.

    The frame is transposed first and then averaged column-wise, so the
    result is a Series indexed by the row labels of ``df``.
    """
    return df.transpose().mean()
def handle_max(df):
    """Return the maximum of each row of ``df``.

    The frame is transposed first and then reduced column-wise, so the
    result is a Series indexed by the row labels of ``df``.
    """
    return df.transpose().max()
def handle_jp_sum():
    """Write per-feature TF-IDF totals for the JP data to ``sum.csv``.

    Loads the JP documents and their labels, builds the TF-IDF table, sums
    each feature's scores across all documents and saves the resulting
    Series as CSV with the ``utf-8_sig`` encoding.
    """
    documents, labels = handle_jp_predata()
    totals = handle_sum(handle_tf_idf(documents, labels))
    totals.to_csv("sum.csv", encoding="utf-8_sig")
def handle_kr_sum():
    """Annotate the KR sums with a ``type`` column and write ``sum.csv``.

    Reads ``krdata/kr_sum.csv``, looks up every value of its ``word`` column
    in the table returned by ``get_kr_priciple()`` and inserts the results as
    a new ``type`` column at position 2.  The annotated frame is written to
    ``sum.csv`` with the ``utf-8_sig`` encoding.

    Raises:
        KeyError: if a word in the CSV has no entry in the lookup table.
    """
    # Build the path with pathlib instead of a "krdata\kr_sum.csv" literal:
    # "\k" is an invalid escape sequence and the backslash separator only
    # works on Windows.
    df = pd.read_csv(gwd / "krdata" / "kr_sum.csv")
    dicts = get_kr_priciple()
    # Iterate the column directly; Series.iteritems() no longer exists in
    # pandas 2.x.  The lookup stays strict so unknown words still fail loudly.
    records = [dicts[word] for word in df["word"]]
    df.insert(2, column="type", value=records)
    df.to_csv("sum.csv", encoding="utf-8_sig")
def handle_tw_sum():
    """Annotate the TW sums with a ``type`` column and write ``sum.csv``.

    Reads ``twdata/tw_sum.csv`` and looks up every value of its ``word``
    column in the table returned by ``get_tw_priciple()``.  Rows whose word
    has no entry, or whose entry is falsy (e.g. ``None`` or an empty string),
    are discarded.  The remaining rows receive the looked-up value in a new
    ``type`` column at position 2.  The index is then reset, which keeps the
    original row labels in an ``index`` column, and the frame is written to
    ``sum.csv`` with the ``utf-8_sig`` encoding.
    """
    # pathlib keeps the path portable; a backslash separator only works on
    # Windows.
    df = pd.read_csv(gwd / "twdata" / "tw_sum.csv")
    dicts = get_tw_priciple()

    looked_up = [dicts.get(word) for word in df["word"]]
    # Filter once with a boolean mask instead of dropping rows in place while
    # iterating, which mutates the frame mid-iteration and copies it per drop.
    keep = pd.Series([bool(entry) for entry in looked_up], index=df.index, dtype=bool)
    df = df.loc[keep].copy()

    records = [entry for entry in looked_up if entry]
    df.insert(2, column="type", value=records)
    df.reset_index(inplace=True)
    df.to_csv("sum.csv", encoding="utf-8_sig")