#328 added tests for example command line (#329)

* #328 added tests for example command line
* fixed: date not defined when not required causes failure
* #328 corrected execution folder for README tests
datasciencecampus · Sep 24, 2019 · 80d8524 · 80d8524
1 parent 4db6fb8
commit 80d8524
Show file tree

Hide file tree

Showing 8 changed files with 95 additions and 22 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -47,7 +47,7 @@ install:
   - python -m easy_install -U setuptools
   # command to install dependencies
   #  - python setup.py install
-  - pip install -e .
+  - pip install -e .[test]
 
 script:
   # for codecov support

diff --git a/README.md b/README.md
@@ -180,13 +180,13 @@ unbias results to avoid double or triple counting contained n-grams.
 This argument can be used to filter documents to a certain timeframe. For example, the below will restrict the document cohort to only those from 20 Feb 2000 up to now (the default start date being 1 Jan 1900).
 
 ```
-python pygrams.py -df=2000/02/20
+python pygrams.py -dh publication_date -df=2000/02/20
 ```
 
 The following will restrict the document cohort to only those between 1 March 2000 and 31 July 2016.
 
 ```
-python pygrams.py -df=2000/03/01 -dt=2016/07/31
+python pygrams.py -dh publication_date -df=2000/03/01 -dt=2016/07/31
 ```
 
 #### Column features filters (-fh, -fb)
@@ -208,7 +208,7 @@ This filter assumes that values are '0'/'1', or 'Yes'/'No'.
 This subsets the chosen patents dataset to a particular Cooperative Patent Classification (CPC) class, for example Y02. The Y02 classification is for "technologies or applications for mitigation or adaptation against climate change". An example script is:
 
 ```
-python pygrams.py -cpc=Y02 -ps=USPTO-random-10000.pkl.bz2
+python pygrams.py -cpc=Y02 -ds=USPTO-random-10000.pkl.bz2
 ```
 
 In the console the number of subset patents will be stated. For example, for `python pygrams.py -cpc=Y02 -ps=USPTO-random-10000.pkl.bz2` the number of Y02 patents is 197. Thus, the TFIDF will be run for 197 patents.
@@ -314,8 +314,8 @@ Python pygrams.py -nrm=False
 Pygrams outputs a report of top ranked terms (popular or emergent). Additional command line arguments provide alternative options, for example a word cloud or 'graph summary'.
 
 ```
-python pygrams.py -o='wordcloud'
-python pygrams.py -o='graph'
+python pygrams.py -o wordcloud
+python pygrams.py -o graph
 ```
 
 The output options generate:

diff --git a/appveyor.yml b/appveyor.yml
@@ -17,7 +17,7 @@ install:
   - python -m pip install -U pip
   - python -m easy_install -U setuptools
   # command to install dependencies
-  - python setup.py install
+  - pip install -e .[test]
   # also need to download punkt tokeniser data
   - python -m nltk.downloader punkt averaged_perceptron_tagger wordnet
 

diff --git a/pygrams.py b/pygrams.py
@@ -135,9 +135,9 @@ def get_args(command_line_arguments):
                         help="number of steps ahead to analyse for")
 
     parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
-                        help="Emergence calculation to use (default: %(default))")
+                        help="Emergence calculation to use")
     parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
-                        help="Time series smoothing to use (default: %(default))")
+                        help="Time series smoothing to use")
 
     parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
                         help="analyse using exponential type fit or not")

diff --git a/scripts/pipeline.py b/scripts/pipeline.py
@@ -62,12 +62,16 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
                       f'to {number_of_ngrams_after:,}')
 
             self.__cpc_dict = utils.cpc_dict(dataframe)
-            self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])
-
-            min_date = min(self.__dates)
-            max_date = max(self.__dates)
+            if docs_mask_dict['date_header'] is None:
+                self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
+                self.__dates = None
+            else:
+                self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
+                                                                                 docs_mask_dict['date_header'])
+                min_date = min(self.__dates)
+                max_date = max(self.__dates)
+                self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')
 
-            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')
             utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
             utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
             utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)

diff --git a/scripts/utils/utils.py b/scripts/utils/utils.py
@@ -220,12 +220,10 @@ def stop_tup(tuples, unigrams, ngrams, digits=True):
 
 
 def checkdf(df, emtec, docs_mask_dict, text_header):
-    app_exit = False
 
     if emtec or docs_mask_dict['date'] is not None:
         if docs_mask_dict['date_header'] not in df.columns:
-            print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
-            app_exit = True
+            raise ValueError(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
 
     if docs_mask_dict['date_header'] is not None:
         if is_string_dtype(df[docs_mask_dict['date_header']]):
@@ -238,11 +236,7 @@ def checkdf(df, emtec, docs_mask_dict, text_header):
         print('Document dates not specified')
 
     if text_header not in df.columns:
-        print(f"text_header '{text_header}' not in dataframe")
-        app_exit = True
-
-    if app_exit:
-        exit(0)
+        raise ValueError(f"text_header '{text_header}' not in dataframe")
 
 
 def remove_empty_documents(data_frame, text_header):

diff --git a/setup.py b/setup.py
@@ -72,6 +72,7 @@ def setup_package():
                           'xlrd', 'python-Levenshtein', 'gensim==3.4.0', 'statsmodels', 'keras', 'tensorflow',
                           'keras_tqdm', 'patsy', 'humanfriendly', 'psutil', 'jinja2', 'urllib3==1.22'],
         # extras_require={'dev': ['check-manifest'],'test': ['coverage'],},
+        extras_require={'test': ['pytest']},
         python_requires='>=3.6',
         cmdclass={
             'install': CustomInstaller,

diff --git a/tests/test_readme.py b/tests/test_readme.py
@@ -0,0 +1,74 @@
+import os
+import unittest
+
+import pygrams
+
+
+# @pytest.mark.skipif('TRAVIS' not in os.environ, reason="Only execute with Travis due to speed")
+class TestReadme(unittest.TestCase):
+    """
+    Batch of tests to execute same commands as mentioned in the README.md, to ensure they work without crashing.
+    Note that the tests need to be run at the main user folder as this shows the files exist - not to be run in the
+    tests folder.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        os.chdir('..')
+
+    @classmethod
+    def tearDownClass(cls):
+        os.chdir('tests')
+
+    def test_no_arguments_and_use_cache(self):
+        # clear cached result
+        import shutil
+        shutil.rmtree(os.path.join('cached', 'out-mdf-0.05'), ignore_errors=True)
+
+        # should make cache
+        pygrams.main([])
+
+        # load cache
+        pygrams.main(['-uc', 'out-mdf-0.05'])
+
+    def test_10000_patents(self):
+        pygrams.main(['-ds', 'USPTO-random-10000.pkl.bz2'])
+
+    def test_mn_mx_uni_bi_trigrams(self):
+        pygrams.main(['-mn', '1', '-mx', '3'])
+
+    def test_mn_mx_unigrams(self):
+        pygrams.main(['-mn', '1', '-mx', '1'])
+
+    def test_mdf(self):
+        pygrams.main(['-mdf', '0.05'])
+
+    def test_pt(self):
+        pygrams.main(['-pt', '0'])
+
+    def test_prefilter_terms_10000(self):
+        pygrams.main(['--prefilter_terms', '10000'])
+
+    def test_date_from(self):
+        pygrams.main(['-dh', 'publication_date', '-df', '2000/02/20'])
+
+    def test_date_from_and_to(self):
+        pygrams.main(['-dh', 'publication_date', '-df', '2000/03/01', '-dt', '2016/07/31'])
+
+    # def test_filter(self):
+    #     pygrams.main(['-fc', "['female','british']", '-fb', 'union'])
+
+    def test_cpc(self):
+        pygrams.main(['-cpc', 'Y02', '-ds', 'USPTO-random-10000.pkl.bz2'])
+
+    def test_search_terms(self):
+        pygrams.main(['-st', 'pharmacy', 'medicine', 'chemist'])
+
+    def test_wordcloud(self):
+        pygrams.main(['-o', 'wordcloud'])
+
+    def test_graph(self):
+        pygrams.main(['-o', 'graph'])
+
+    # def test_help(self):
+    #     pygrams.main(['-h'])