Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

257 add nmf code #271

Merged
merged 7 commits into the base branch from the feature branch
May 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def get_args(command_line_arguments):
parser.add_argument("-ih", "--id_header", default=None, help=argparse.SUPPRESS)
parser.add_argument("-c", "--cite", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-pt", "--path", default='data', help=argparse.SUPPRESS)
parser.add_argument("-nmf", "--n_nmf_topics", type=int, default=0, help=argparse.SUPPRESS)
# help="NMF topic modelling - number of topics (e.g. 20 or 40)")

# Focus source and function
parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'],
Expand Down Expand Up @@ -153,6 +155,8 @@ def main(supplied_args):
outputs.append('report')
if args.term_counts:
outputs.append('termcounts')
if args.n_nmf_topics >0:
outputs.append('nmf')

docs_mask_dict = argscheck.get_docs_mask_dict()
terms_mask_dict = argscheck.get_terms_mask_dict()
Expand All @@ -171,7 +175,8 @@ def main(supplied_args):
prefilter_terms=args.prefilter_terms, pickled_tf_idf_file_name=pickled_tf_idf_path,
output_name=args.outputs_name, emerging_technology=args.emerging_technology)

pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)
pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name,
nterms=args.num_ngrams_report, n_nmf_topics=args.n_nmf_topics)

# emtech integration
if args.emerging_technology:
Expand Down
31 changes: 31 additions & 0 deletions scripts/nmf_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from sklearn.decomposition import NMF
import pandas as pd


def nmf_topic_modelling(n_nmf_topics, tfidf_mat):  # Experimental only
    """Fit a non-negative matrix factorisation model to a TFIDF matrix.

    :param n_nmf_topics: number of topics (NMF components) to extract
    :param tfidf_mat: TFIDF document-term matrix to decompose
    :return: the fitted sklearn NMF model
    """
    # NOTE(review): 'alpha' and 'l1_ratio' are deprecated in newer sklearn
    # releases (superseded by alpha_W / alpha_H) — confirm the pinned version.
    model = NMF(n_components=n_nmf_topics, random_state=1, alpha=.1,
                l1_ratio=.5, init='nndsvd')
    return model.fit(tfidf_mat)


def calculate_weights(nmf, feature_names, term_weights_to_sum=10, term_weights_to_print=50):
    """Summarise term weights across the topics of a fitted NMF model.

    For each topic the top ``term_weights_to_sum`` (term, weight) pairs are
    collected (or, when it is 0, every term's weight is summed over all
    topics); the collected weights are then summed per term and the
    highest-scoring ``term_weights_to_print`` terms are printed.

    :param nmf: fitted NMF model exposing a ``components_`` (topics x terms) array
    :param feature_names: term name for each column of ``nmf.components_``
    :param term_weights_to_sum: number of top terms to take from each topic;
        0 sums every term's weight over all topics (default 10, as before)
    :param term_weights_to_print: number of summary rows to print (default 50)
    :return: DataFrame with 'feature' and 'score' columns sorted by descending
        score (the function previously returned None, so this is backward
        compatible with callers that ignore the return value)
    """
    # create list of all, or of the top n, (term, weight) pairs for every topic
    top_features = []
    if term_weights_to_sum == 0:
        term_weights = nmf.components_.sum(axis=0)
        top_features = zip(feature_names, term_weights)
    else:
        for topic_idx, term_weights in enumerate(nmf.components_):
            # argsort()[:-n-1:-1] walks the ascending sort order backwards,
            # yielding the indices of the n largest weights
            for idx in term_weights.argsort()[:-term_weights_to_sum - 1:-1]:
                top_features.append((feature_names[idx], term_weights[idx]))

    # sum term weights over topics and print the strongest terms
    top_features_df = pd.DataFrame(top_features, columns=['feature', 'score'])
    top_features_df = top_features_df.groupby(top_features_df.feature).sum(). \
        sort_values(by='score', ascending=False).reset_index()
    print("Term weights extracted from topics (sum over all topics of term weights associated with each topic):")
    print(top_features_df[0:term_weights_to_print])
    print()
    return top_features_df
25 changes: 24 additions & 1 deletion scripts/output_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import pickle
from os import makedirs, path

from scripts.nmf_wrapper import nmf_topic_modelling
from scripts.terms_graph import TermsGraph
from scripts.visualization.wordclouds.multicloudplot import MultiCloudPlot


def create(output_type, output, wordcloud_title=None, tfidf_reduce_obj=None, name=None, nterms=50,
term_counts_data=None, date_dict=None, pick=None, doc_pickle_file_name=None, time=None):
term_counts_data=None, date_dict=None, pick=None, doc_pickle_file_name=None, time=None, nmf_topics=0):

if output_type == 'report':
filename_and_path = path.join('outputs', 'reports', name + '.txt')
Expand Down Expand Up @@ -66,6 +67,28 @@ def create(output_type, output, wordcloud_title=None, tfidf_reduce_obj=None, nam

with open(json_file_name, 'w') as json_file:
json.dump(json_data, json_file)
elif output_type =='nmf':
# topic modelling
topic_terms_to_print = 10
nmf = nmf_topic_modelling(nmf_topics, tfidf_reduce_obj.tfidf_masked)
filename_and_path = path.join('outputs', 'reports', name + '_nmf.txt')
with open(filename_and_path, 'w') as file:

# print topics
print()
print('*** NMF topic modelling (experimental only) ***')
file.write('*** NMF topic modelling (experimental only) *** \n')
print('Topics:')
file.write('Topics \n')
feature_names = tfidf_reduce_obj.feature_names
for topic_idx, term_weights in enumerate(nmf.components_):
print("%d:" % (topic_idx), end='')
file.write("%d: " % (topic_idx))
topic_names = ", ".join(
[feature_names[i] for i in term_weights.argsort()[:-topic_terms_to_print - 1:-1]])
print(topic_names)
file.write(topic_names + '\n')
print()
file.write('\n')
else:
assert 0, "Bad output type: " + output_type
12 changes: 5 additions & 7 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)

number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
f'to {number_of_ngrams_after:,}')
Expand Down Expand Up @@ -126,11 +125,11 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

# todo: this mutiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights
# mask the tfidf matrix
tfidf_matrix = self.__tfidf_obj.tfidf_matrix
tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

tfidf_masked, self.__dataframe = utils.remove_all_null_rows_global(tfidf_masked, self.__dataframe)
print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')
print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

# todo: no advantage in classes - just create term_count and extract_ngrams as functions

Expand All @@ -152,15 +151,14 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
def term_counts_data(self):
return self.__term_counts_data

def output(self, output_types, wordcloud_title=None, outname=None, nterms=50):
def output(self, output_types, wordcloud_title=None, outname=None, nterms=50, n_nmf_topics=0):

for output_type in output_types:
output_factory.create(output_type, self.__term_score_tuples, wordcloud_title=wordcloud_title,
tfidf_reduce_obj=self.__tfidf_reduce_obj, name=outname,
nterms=nterms, term_counts_data=self.__term_counts_data,
date_dict=self.__date_dict, pick=self.__pick_method,
doc_pickle_file_name=self.__data_filename, time=self.__time)

doc_pickle_file_name=self.__data_filename, time=self.__time, nmf_topics=n_nmf_topics)

@property
def term_score_tuples(self):
Expand Down
51 changes: 51 additions & 0 deletions tests/test_nmf_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import unittest
from numpy.testing import assert_almost_equal
import pandas as pd

from scripts import FilePaths
from scripts.nmf_wrapper import nmf_topic_modelling
from scripts.text_processing import LemmaTokenizer
from scripts.tfidf_wrapper import tfidf_from_text


class TestNMFWrapper(unittest.TestCase):
    """Regression tests pinning individual topic/term weights produced by
    nmf_topic_modelling on a fixed 100-patent sample.

    The wrapper fixes random_state, so the factorisation is deterministic
    and exact component values can be asserted (to 3 decimal places).
    """
    # NOTE(review): removed GitHub review-UI text ("...marked this
    # conversation as resolved" / "Show resolved Hide resolved") that had
    # been pasted into the source and was not valid Python.

    @classmethod
    def setUpClass(cls):
        # Build the TFIDF matrix and fit the 5-topic model once; it is
        # shared (read-only) by every test method below.
        df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
        tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=(1, 3), max_document_frequency=0.1,
                                    tokenizer=LemmaTokenizer())
        nmf_topics = 5
        cls.__nmf = nmf_topic_modelling(nmf_topics, tfidf_obj.tfidf_matrix)

    def test_nmf_topic1(self):
        # components_[topic][term] — expected values captured from a known-good run
        actual_topic_1_score = self.__nmf.components_[0][3302]
        expected_topic_1_score = 0.2044937886411859

        assert_almost_equal(actual_topic_1_score, expected_topic_1_score, decimal=3)

    def test_nmf_topic2(self):
        actual_topic_2_score = self.__nmf.components_[1][281]
        expected_topic_2_score = 0.276781

        assert_almost_equal(actual_topic_2_score, expected_topic_2_score, decimal=3)

    def test_nmf_topic3(self):
        actual_topic_3_score = self.__nmf.components_[2][2983]
        expected_topic_3_score = 0.441

        assert_almost_equal(actual_topic_3_score, expected_topic_3_score, decimal=3)

    def test_nmf_topic4(self):
        actual_topic_4_score = self.__nmf.components_[3][1683]
        expected_topic_4_score = 0.219

        assert_almost_equal(actual_topic_4_score, expected_topic_4_score, decimal=3)

    def test_nmf_topic5(self):
        actual_topic_5_score = self.__nmf.components_[4][252]
        expected_topic_5_score = 0.275

        assert_almost_equal(actual_topic_5_score, expected_topic_5_score, decimal=3)