-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment_explore.py
84 lines (69 loc) · 2.9 KB
/
sentiment_explore.py
1
from typing import TextIO, List, Union, Dict, Tuple
from sentiment import *
from operator import itemgetter
import matplotlib.pyplot as plt

# Your exploration functions here
# Follow FDR


def occurrence(item: Tuple[str, List[int]]) -> int:
    '''Given item as a (key, value) tuple, return the second integer in value.

    Used as the sort key when ranking the entries of a kss dictionary; the
    rest of this module treats value[1] as the word's occurrence count.
    '''
    return item[1][1]


def most_common_words(count: int, kss: Dict[str, List[int]],
                      exclude: Union[List[str], None]) -> List[str]:
    '''Return up to count of the most common words in kss, skipping any word
    that appears in exclude.

    count is the number of desired words; a count of zero or less yields an
    empty list.
    kss maps each review word to a list of integers whose second element is
    used as the word's number of occurrences.
    exclude is a collection of words to leave out (for example common English
    words), or None to exclude nothing.

    Words are returned from most common to least common; words with equal
    counts keep the order in which they appear in kss. If fewer than count
    eligible words exist, all of them are returned.
    '''
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan over a word list that may hold thousands of entries.
    excluded = set(exclude) if exclude is not None else set()

    # sort the words from most common to least common
    ranked = sorted(kss.items(), key=occurrence, reverse=True)

    return_list = []
    for word, _ in ranked:
        if len(return_list) >= count:
            break
        if word not in excluded:
            return_list.append(word)
    # Always hand back the list, including when kss runs out of eligible
    # words before count is reached (previously this path returned None).
    return return_list


def word_occurrence(x: List[str], kss: Dict[str, List[int]]) -> List[int]:
    '''Given the list of the common words and a dictionary of review words
    with the number of their occurrences, return the list of the number of
    occurrences of these words, in the same order as x.

    Raises KeyError if a word in x is not a key of kss.
    '''
    return [kss[word][1] for word in x]


if __name__ == "__main__":

    # Pick a dataset
    #dataset = 'tiny.txt'
    #dataset = 'small.txt'
    #dataset = 'medium.txt'
    dataset = 'full.txt'

    # Pick 5000 most common English words dataset
    word_dataset = 'most_common_english_words.txt'
    # Pick 10000 common English words dataset
    #word_dataset = '10000_most_common_english_words.txt'

    # Get the score and frequency of words in the dataset
    with open(dataset, 'r') as dataset_file:
        kss = extract_kss(dataset_file)

    # Get a list of common English words (one word per line)
    with open(word_dataset, 'r') as word_file:
        common_en_words = [line.strip() for line in word_file]

    while True:
        try:
            count = int(input('Enter the number of desired words: '))
        except ValueError:
            # Not an integer: ask again instead of aborting the session.
            print('Please enter a whole number.')
            continue
        except EOFError:
            # Input stream closed: nothing more to read, so stop cleanly.
            break

        print('The ' + str(count) + ' most common words are: ')
        print(most_common_words(count, kss, None))

        # Computed once and reused for both the printout and the chart.
        x = most_common_words(count, kss, common_en_words)
        print('The ' + str(count) + ' most common movie-specific words are: ')
        print(x)

        # visualization
        y = word_occurrence(x, kss)
        plt.figure(dpi=100)
        plt.bar(x, y, width=0.5)
        # Name the dataset actually loaded so the title stays accurate when a
        # different dataset is selected above.
        plt.title('Number of Occurrences of the Most Common Movie-Specific Words in ' + dataset)
        plt.xlabel('Most Common Movie-Specific Words')
        plt.xticks(rotation=90)
        plt.ylabel('Number of Occurrences')
        plt.grid(linestyle='--', alpha=0.5)
        plt.tight_layout()
        plt.show()