# sentiment_explore.py

'''Explore the most frequent words of a movie-review dataset.

Run as a script, this module loads a review dataset through
sentiment.extract_kss, prints the most common words (with and without
common English words filtered out) and plots the filtered words'
occurrence counts as a bar chart.
'''

# NOTE: the import order is kept as in the original file on purpose: the
# star import may define names that the later imports deliberately override.
from typing import TextIO, List, Union, Dict, Tuple
from sentiment import *
from operator import itemgetter
import matplotlib.pyplot as plt


# Your exploration functions here
# Follow FDR

def occurrence(item: Tuple[str, List[int]]) -> int:
    '''Given item as a (key, value) tuple, return the second integer
    in value.

    Used as the sort key for ranking words; word_occurrence reads the same
    slot (value[1]) as the word's number of occurrences.

    >>> occurrence(('movie', [10, 4]))
    4
    '''
    return item[1][1]


def most_common_words(count: int, kss: Dict[str, List[int]],
                      exclude: Union[List[str], None]) -> List[str]:
    '''Return up to count words from kss, ordered from most to least
    common, skipping every word present in exclude.

    count is the number of desired words; zero or a negative count yields
    an empty list.
    kss maps each review word to a list whose second element is the value
    the words are ranked by (see occurrence).
    exclude is a list of words to leave out (for example common English
    words), or None to keep every word.

    A list is always returned: if fewer than count eligible words exist,
    all of them are returned.

    >>> kss = {'the': [9, 5], 'plot': [6, 3], 'actor': [2, 1]}
    >>> most_common_words(2, kss, None)
    ['the', 'plot']
    >>> most_common_words(5, kss, ['the'])
    ['plot', 'actor']
    '''
    # A set gives O(1) membership tests; exclude may hold thousands of words
    # and is consulted once per candidate word.
    excluded = frozenset(exclude) if exclude is not None else frozenset()

    # sorted() is stable, so words with equal counts keep their kss order.
    ranked = sorted(kss.items(), key=occurrence, reverse=True)

    selected = []
    for word, _ in ranked:
        if len(selected) >= count:
            break
        if word not in excluded:
            selected.append(word)
    # Returning after the loop (rather than only from inside it) guarantees a
    # list even when the words run out before count is reached.
    return selected


def word_occurrence(x: List[str], kss: Dict[str, List[int]]) -> List[int]:
    '''Return the number of occurrences of each word in x, in the same
    order as x.

    x is a list of words, each of which must be a key of kss.
    kss maps each review word to a list whose second element is taken as
    the word's number of occurrences.

    Raises KeyError if a word in x is not a key of kss.

    >>> word_occurrence(['plot', 'the'], {'the': [9, 5], 'plot': [6, 3]})
    [3, 5]
    '''
    return [kss[word][1] for word in x]


if __name__ == "__main__":
    # Pick a dataset
    #dataset = 'tiny.txt'
    #dataset = 'small.txt'
    #dataset = 'medium.txt'
    dataset = 'full.txt'

    # Pick 5000 most common English words dataset
    word_dataset = 'most_common_english_words.txt'
    # Pick 10000 common English words dataset
    #word_dataset = '10000_most_common_english_words.txt'

    # Get the score and frequency of words in the dataset
    with open(dataset, 'r') as dataset_file:
        kss = extract_kss(dataset_file)

    # Get a list of common English words (one word per line)
    with open(word_dataset, 'r') as word_file:
        common_en_words = [line.strip() for line in word_file]

    while True:
        try:
            answer = input('Enter the number of desired words: ').strip()
        except EOFError:
            # Input stream closed (e.g. Ctrl-D or piped input exhausted).
            break
        if answer.lower() in ('', 'q', 'quit'):
            break
        try:
            count = int(answer)
        except ValueError:
            print('Please enter a whole number (or press Enter to quit).')
            continue

        print(f'The {count} most common words are: ')
        print(most_common_words(count, kss, None))

        # Computed once and reused for both the printout and the chart.
        movie_words = most_common_words(count, kss, common_en_words)
        print(f'The {count} most common movie-specific words are: ')
        print(movie_words)

        # visualization
        x = movie_words
        y = word_occurrence(x, kss)
        plt.figure(dpi=100)
        plt.bar(x, y, width=0.5)
        # Name the dataset actually loaded instead of a hard-coded file name.
        plt.title('Number of Occurrences of the Most Common '
                  f'Movie-Specific Words in {dataset}')
        plt.xlabel('Most Common Movie-Specific Words')
        plt.xticks(rotation=90)
        plt.ylabel('Number of Occurrences')
        plt.grid(linestyle='--', alpha=0.5)
        plt.tight_layout()
        plt.show()