Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

257 add nmf code #271

Merged
merged 7 commits into the base branch from the feature branch
May 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def get_args(command_line_arguments):
parser.add_argument("-ih", "--id_header", default=None, help=argparse.SUPPRESS)
parser.add_argument("-c", "--cite", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-pt", "--path", default='data', help=argparse.SUPPRESS)
parser.add_argument("-nmf", "--n_nmf_topics", type=int, default=0, help=argparse.SUPPRESS)
# help="NMF topic modelling - number of topics (e.g. 20 or 40)")

# Focus source and function
parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'],
Expand Down Expand Up @@ -153,6 +155,8 @@ def main(supplied_args):
outputs.append('report')
if args.term_counts:
outputs.append('termcounts')
if args.n_nmf_topics >0:
outputs.append('nmf')

docs_mask_dict = argscheck.get_docs_mask_dict()
terms_mask_dict = argscheck.get_terms_mask_dict()
Expand All @@ -171,7 +175,8 @@ def main(supplied_args):
prefilter_terms=args.prefilter_terms, pickled_tf_idf_file_name=pickled_tf_idf_path,
output_name=args.outputs_name, emerging_technology=args.emerging_technology)

pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)
pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name,
nterms=args.num_ngrams_report, n_nmf_topics=args.n_nmf_topics)

# emtech integration
if args.emerging_technology:
Expand Down
31 changes: 31 additions & 0 deletions scripts/nmf_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from sklearn.decomposition import NMF
import pandas as pd


def nmf_topic_modelling(n_nmf_topics, tfidf_mat):  # Experimental only
    """Fit a non-negative matrix factorisation model to a TFIDF matrix.

    :param n_nmf_topics: number of topics (NMF components) to extract
    :param tfidf_mat: TFIDF document-term matrix to decompose
    :return: the fitted sklearn NMF model
    """
    # NOTE(review): 'alpha' and 'l1_ratio' are deprecated in newer sklearn
    # releases (superseded by alpha_W / alpha_H) — confirm the pinned version.
    model = NMF(n_components=n_nmf_topics, random_state=1, alpha=.1,
                l1_ratio=.5, init='nndsvd')
    return model.fit(tfidf_mat)


def calculate_weights(nmf, feature_names, term_weights_to_sum=10, term_weights_to_print=50):
    """Summarise term weights across the topics of a fitted NMF model.

    For each topic the top ``term_weights_to_sum`` (term, weight) pairs are
    collected (or, when it is 0, every term's weight is summed over all
    topics); the collected weights are then summed per term and the
    highest-scoring ``term_weights_to_print`` terms are printed.

    :param nmf: fitted NMF model exposing a ``components_`` (topics x terms) array
    :param feature_names: term name for each column of ``nmf.components_``
    :param term_weights_to_sum: number of top terms to take from each topic;
        0 sums every term's weight over all topics (default 10, as before)
    :param term_weights_to_print: number of summary rows to print (default 50)
    :return: DataFrame with 'feature' and 'score' columns sorted by descending
        score (the function previously returned None, so this is backward
        compatible with callers that ignore the return value)
    """
    # create list of all, or of the top n, (term, weight) pairs for every topic
    top_features = []
    if term_weights_to_sum == 0:
        term_weights = nmf.components_.sum(axis=0)
        top_features = zip(feature_names, term_weights)
    else:
        for topic_idx, term_weights in enumerate(nmf.components_):
            # argsort()[:-n-1:-1] walks the ascending sort order backwards,
            # yielding the indices of the n largest weights
            for idx in term_weights.argsort()[:-term_weights_to_sum - 1:-1]:
                top_features.append((feature_names[idx], term_weights[idx]))

    # sum term weights over topics and print the strongest terms
    top_features_df = pd.DataFrame(top_features, columns=['feature', 'score'])
    top_features_df = top_features_df.groupby(top_features_df.feature).sum(). \
        sort_values(by='score', ascending=False).reset_index()
    print("Term weights extracted from topics (sum over all topics of term weights associated with each topic):")
    print(top_features_df[0:term_weights_to_print])
    print()
    return top_features_df
25 changes: 24 additions & 1 deletion scripts/output_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import pickle
from os import makedirs, path

from scripts.nmf_wrapper import nmf_topic_modelling
from scripts.terms_graph import TermsGraph
from scripts.visualization.wordclouds.multicloudplot import MultiCloudPlot


def create(output_type, output, wordcloud_title=None, tfidf_reduce_obj=None, name=None, nterms=50,
term_counts_data=None, date_dict=None, pick=None, doc_pickle_file_name=None, time=None):
term_counts_data=None, date_dict=None, pick=None, doc_pickle_file_name=None, time=None, nmf_topics=0):

if output_type == 'report':
filename_and_path = path.join('outputs', 'reports', name + '.txt')
Expand Down Expand Up @@ -66,6 +67,28 @@ def create(output_type, output, wordcloud_title=None, tfidf_reduce_obj=None, nam

with open(json_file_name, 'w') as json_file:
json.dump(json_data, json_file)
elif output_type =='nmf':
# topic modelling
topic_terms_to_print = 10
nmf = nmf_topic_modelling(nmf_topics, tfidf_reduce_obj.tfidf_masked)
filename_and_path = path.join('outputs', 'reports', name + '_nmf.txt')
with open(filename_and_path, 'w') as file:

# print topics
print()
print('*** NMF topic modelling (experimental only) ***')
file.write('*** NMF topic modelling (experimental only) *** \n')
print('Topics:')
file.write('Topics \n')
feature_names = tfidf_reduce_obj.feature_names
for topic_idx, term_weights in enumerate(nmf.components_):
print("%d:" % (topic_idx), end='')
file.write("%d: " % (topic_idx))
topic_names = ", ".join(
[feature_names[i] for i in term_weights.argsort()[:-topic_terms_to_print - 1:-1]])
print(topic_names)
file.write(topic_names + '\n')
print()
file.write('\n')
else:
assert 0, "Bad output type: " + output_type
12 changes: 5 additions & 7 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)

number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
f'to {number_of_ngrams_after:,}')
Expand Down Expand Up @@ -126,11 +125,11 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

# todo: this mutiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights
# mask the tfidf matrix
tfidf_matrix = self.__tfidf_obj.tfidf_matrix
tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

tfidf_masked, self.__dataframe = utils.remove_all_null_rows_global(tfidf_masked, self.__dataframe)
print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')
print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

# todo: no advantage in classes - just create term_count and extract_ngrams as functions

Expand All @@ -152,15 +151,14 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
def term_counts_data(self):
return self.__term_counts_data

def output(self, output_types, wordcloud_title=None, outname=None, nterms=50):
def output(self, output_types, wordcloud_title=None, outname=None, nterms=50, n_nmf_topics=0):

for output_type in output_types:
output_factory.create(output_type, self.__term_score_tuples, wordcloud_title=wordcloud_title,
tfidf_reduce_obj=self.__tfidf_reduce_obj, name=outname,
nterms=nterms, term_counts_data=self.__term_counts_data,
date_dict=self.__date_dict, pick=self.__pick_method,
doc_pickle_file_name=self.__data_filename, time=self.__time)

doc_pickle_file_name=self.__data_filename, time=self.__time, nmf_topics=n_nmf_topics)

@property
def term_score_tuples(self):
Expand Down
51 changes: 51 additions & 0 deletions tests/test_nmf_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import unittest
from numpy.testing import assert_almost_equal
import pandas as pd

from scripts import FilePaths
from scripts.nmf_wrapper import nmf_topic_modelling
from scripts.text_processing import LemmaTokenizer
from scripts.tfidf_wrapper import tfidf_from_text


class TestNMFWrapper(unittest.TestCase):
    """Regression tests pinning individual topic/term weights produced by
    nmf_topic_modelling on a fixed 100-patent sample.

    The wrapper fixes random_state, so the factorisation is deterministic
    and exact component values can be asserted (to 3 decimal places).
    """
    # NOTE(review): removed GitHub review-UI text ("...marked this
    # conversation as resolved" / "Show resolved Hide resolved") that had
    # been pasted into the source and was not valid Python.

    @classmethod
    def setUpClass(cls):
        # Build the TFIDF matrix and fit the 5-topic model once; it is
        # shared (read-only) by every test method below.
        df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
        tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=(1, 3), max_document_frequency=0.1,
                                    tokenizer=LemmaTokenizer())
        nmf_topics = 5
        cls.__nmf = nmf_topic_modelling(nmf_topics, tfidf_obj.tfidf_matrix)

    def test_nmf_topic1(self):
        # components_[topic][term] — expected values captured from a known-good run
        actual_topic_1_score = self.__nmf.components_[0][3302]
        expected_topic_1_score = 0.2044937886411859

        assert_almost_equal(actual_topic_1_score, expected_topic_1_score, decimal=3)

    def test_nmf_topic2(self):
        actual_topic_2_score = self.__nmf.components_[1][281]
        expected_topic_2_score = 0.276781

        assert_almost_equal(actual_topic_2_score, expected_topic_2_score, decimal=3)

    def test_nmf_topic3(self):
        actual_topic_3_score = self.__nmf.components_[2][2983]
        expected_topic_3_score = 0.441

        assert_almost_equal(actual_topic_3_score, expected_topic_3_score, decimal=3)

    def test_nmf_topic4(self):
        actual_topic_4_score = self.__nmf.components_[3][1683]
        expected_topic_4_score = 0.219

        assert_almost_equal(actual_topic_4_score, expected_topic_4_score, decimal=3)

    def test_nmf_topic5(self):
        actual_topic_5_score = self.__nmf.components_[4][252]
        expected_topic_5_score = 0.275

        assert_almost_equal(actual_topic_5_score, expected_topic_5_score, decimal=3)