-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_wordlist_as_csv.py
36 lines (27 loc) · 1.01 KB
/
generate_wordlist_as_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import csv
from collections import Counter
import re
import pandas as pd
# Define the path to the CSV file
csv_file = 'alpaca_data_cleaned.csv'
# Define the column names in the CSV file
message_id_column = 'message_id'
msg_column = 'message'
# Create an empty list to store the words
word_list = []
# Read the CSV file and filter the data
with open(csv_file, 'r', encoding='utf-8') as file:
reader = csv.DictReader(file)
for row in reader:
if row[message_id_column] == '0':
# Tokenize the text into words (ignoring punctuation)
words = re.findall(r'\b\w+\b', row[msg_column].lower())
word_list.extend(words)
# Count the occurrences of each word
word_count = Counter(word_list)
# Sort the word count by count in descending order
sorted_word_count = dict(sorted(word_count.items(), key=lambda item: item[1], reverse=True))
data = {'words': list(sorted_word_count.keys()),
'counts': list(sorted_word_count.values())}
df = pd.DataFrame(data)
df.to_csv('word_count.csv')