Skip to content

Commit

Permalink
#328 added tests for example command line (#329)
Browse files Browse the repository at this point in the history
* #328 added tests for example command line
* fixed: date not defined when not required causes failure
* #328 corrected execution folder for README tests
  • Loading branch information
IanGrimstead authored and thanasions committed Sep 24, 2019
1 parent 4db6fb8 commit 80d8524
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ install:
- python -m easy_install -U setuptools
# command to install dependencies
# - python setup.py install
- pip install -e .
- pip install -e .[test]

script:
# for codecov support
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,13 @@ unbias results to avoid double or triple counting contained n-grams.
This argument can be used to filter documents to a certain timeframe. For example, the below will restrict the document cohort to only those from 20 Feb 2000 up to now (the default start date being 1 Jan 1900).

```
python pygrams.py -df=2000/02/20
python pygrams.py -dh publication_date -df=2000/02/20
```

The following will restrict the document cohort to only those between 1 March 2000 and 31 July 2016.

```
python pygrams.py -df=2000/03/01 -dt=2016/07/31
python pygrams.py -dh publication_date -df=2000/03/01 -dt=2016/07/31
```

#### Column features filters (-fh, -fb)
Expand All @@ -208,7 +208,7 @@ This filter assumes that values are '0'/'1', or 'Yes'/'No'.
This subsets the chosen patents dataset to a particular Cooperative Patent Classification (CPC) class, for example Y02. The Y02 classification is for "technologies or applications for mitigation or adaptation against climate change". An example script is:

```
python pygrams.py -cpc=Y02 -ps=USPTO-random-10000.pkl.bz2
python pygrams.py -cpc=Y02 -ds=USPTO-random-10000.pkl.bz2
```

In the console the number of subset patents will be stated. For example, for `python pygrams.py -cpc=Y02 -ds=USPTO-random-10000.pkl.bz2` the number of Y02 patents is 197. Thus, the TFIDF will be run for 197 patents.
Expand Down Expand Up @@ -314,8 +314,8 @@ Python pygrams.py -nrm=False
Pygrams outputs a report of top ranked terms (popular or emergent). Additional command line arguments provide alternative options, for example a word cloud or 'graph summary'.

```
python pygrams.py -o='wordcloud'
python pygrams.py -o='graph'
python pygrams.py -o wordcloud
python pygrams.py -o graph
```

The output options generate:
Expand Down
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ install:
- python -m pip install -U pip
- python -m easy_install -U setuptools
# command to install dependencies
- python setup.py install
- pip install -e .[test]
# also need to download punkt tokeniser data
- python -m nltk.downloader punkt averaged_perceptron_tagger wordnet

Expand Down
4 changes: 2 additions & 2 deletions pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,9 @@ def get_args(command_line_arguments):
help="number of steps ahead to analyse for")

parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
help="Emergence calculation to use (default: %(default))")
help="Emergence calculation to use")
parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
help="Time series smoothing to use (default: %(default))")
help="Time series smoothing to use")

parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
help="analyse using exponential type fit or not")
Expand Down
14 changes: 9 additions & 5 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,16 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
f'to {number_of_ngrams_after:,}')

self.__cpc_dict = utils.cpc_dict(dataframe)
self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

min_date = min(self.__dates)
max_date = max(self.__dates)
if docs_mask_dict['date_header'] is None:
self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
self.__dates = None
else:
self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
docs_mask_dict['date_header'])
min_date = min(self.__dates)
max_date = max(self.__dates)
self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')
utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)
Expand Down
10 changes: 2 additions & 8 deletions scripts/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,10 @@ def stop_tup(tuples, unigrams, ngrams, digits=True):


def checkdf(df, emtec, docs_mask_dict, text_header):
app_exit = False

if emtec or docs_mask_dict['date'] is not None:
if docs_mask_dict['date_header'] not in df.columns:
print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
app_exit = True
raise ValueError(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")

if docs_mask_dict['date_header'] is not None:
if is_string_dtype(df[docs_mask_dict['date_header']]):
Expand All @@ -238,11 +236,7 @@ def checkdf(df, emtec, docs_mask_dict, text_header):
print('Document dates not specified')

if text_header not in df.columns:
print(f"text_header '{text_header}' not in dataframe")
app_exit = True

if app_exit:
exit(0)
raise ValueError(f"text_header '{text_header}' not in dataframe")


def remove_empty_documents(data_frame, text_header):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def setup_package():
'xlrd', 'python-Levenshtein', 'gensim==3.4.0', 'statsmodels', 'keras', 'tensorflow',
'keras_tqdm', 'patsy', 'humanfriendly', 'psutil', 'jinja2', 'urllib3==1.22'],
# extras_require={'dev': ['check-manifest'],'test': ['coverage'],},
extras_require={'test': ['pytest']},
python_requires='>=3.6',
cmdclass={
'install': CustomInstaller,
Expand Down
74 changes: 74 additions & 0 deletions tests/test_readme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os
import unittest

import pygrams


# @pytest.mark.skipif('TRAVIS' not in os.environ, reason="Only execute with Travis due to speed")
class TestReadme(unittest.TestCase):
    """
    Smoke tests that execute the same command lines shown in README.md, to ensure they run without crashing.

    The commands use paths relative to the current working directory (for example the 'cached' folder),
    so the suite switches to the repository root for its duration. The root is derived from this file's
    location (tests/test_readme.py) rather than from the launch directory, so the tests behave the same
    whether they are started from the repository root or from inside the tests folder.
    """

    # Working directory in effect before the suite started; restored by tearDownClass.
    _original_cwd = None

    @classmethod
    def setUpClass(cls):
        """Remember the caller's working directory and move to the repository root."""
        cls._original_cwd = os.getcwd()
        # This file lives in <root>/tests, so the parent of its folder is the repository root.
        repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        os.chdir(repo_root)

    @classmethod
    def tearDownClass(cls):
        """Return to the working directory that was active before the suite ran."""
        if cls._original_cwd is not None:
            os.chdir(cls._original_cwd)
            cls._original_cwd = None

    def test_no_arguments_and_use_cache(self):
        # clear cached result
        import shutil
        shutil.rmtree(os.path.join('cached', 'out-mdf-0.05'), ignore_errors=True)

        # should make cache
        pygrams.main([])

        # load cache
        pygrams.main(['-uc', 'out-mdf-0.05'])

    def test_10000_patents(self):
        pygrams.main(['-ds', 'USPTO-random-10000.pkl.bz2'])

    def test_mn_mx_uni_bi_trigrams(self):
        pygrams.main(['-mn', '1', '-mx', '3'])

    def test_mn_mx_unigrams(self):
        pygrams.main(['-mn', '1', '-mx', '1'])

    def test_mdf(self):
        pygrams.main(['-mdf', '0.05'])

    def test_pt(self):
        pygrams.main(['-pt', '0'])

    def test_prefilter_terms_10000(self):
        pygrams.main(['--prefilter_terms', '10000'])

    def test_date_from(self):
        pygrams.main(['-dh', 'publication_date', '-df', '2000/02/20'])

    def test_date_from_and_to(self):
        pygrams.main(['-dh', 'publication_date', '-df', '2000/03/01', '-dt', '2016/07/31'])

    # def test_filter(self):
    #     pygrams.main(['-fc', "['female','british']", '-fb', 'union'])

    def test_cpc(self):
        pygrams.main(['-cpc', 'Y02', '-ds', 'USPTO-random-10000.pkl.bz2'])

    def test_search_terms(self):
        pygrams.main(['-st', 'pharmacy', 'medicine', 'chemist'])

    def test_wordcloud(self):
        pygrams.main(['-o', 'wordcloud'])

    def test_graph(self):
        pygrams.main(['-o', 'graph'])

# def test_help(self):
# pygrams.main(['-h'])

0 comments on commit 80d8524

Please sign in to comment.