225 ridof pmdarima #226

Merged
6 commits merged on Apr 1, 2019
pygrams.py: 2 changes (1 addition, 1 deletion)
@@ -165,7 +165,7 @@ def main(supplied_args):
                         pickled_tf_idf_file_name=pickled_tf_idf_path,
                         output_name=args.outputs_name, emerging_technology=args.emerging_technology)

-    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=50)
+    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)

     # emtech integration
     if args.emerging_technology:
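The new call assumes pygrams.py already defines a num_ngrams_report command-line option whose value replaces the hard-coded 50. A minimal argparse sketch of that assumed wiring is shown below; the real flag name, default, and help text in pygrams.py may differ.

```python
import argparse

# Hedged sketch: the real option lives in pygrams.py and may use a different
# flag name, default value, and help text.
parser = argparse.ArgumentParser(description="pygrams argument sketch")
parser.add_argument("--num_ngrams_report", type=int, default=250,
                    help="number of n-grams to include in the report output")

args = parser.parse_args(["--num_ngrams_report", "50"])
# The value now flows through to the output stage instead of a hard-coded 50:
# pipeline.output(..., nterms=args.num_ngrams_report)
print(args.num_ngrams_report)  # 50
```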
scripts/algorithms/arima.py: 72 changes (59 additions, 13 deletions)
@@ -1,26 +1,72 @@
+import warnings
+import numpy as np
 from numpy import clip, inf
-from pmdarima.arima import auto_arima
+from sklearn.metrics import mean_squared_error
+from statsmodels.tsa.arima_model import ARIMA


 class ARIMAForecast(object):

-    def __init__(self, data_in, num_prediction_periods):
+    def __evaluate_models(self, dataset, p_values, d_values, q_values):
+        dataset = np.array(dataset)
+        dataset = dataset.astype('float32')
+        best_score, best_cfg = float("inf"), None
+        for p in p_values:
+            for d in d_values:
+                for q in q_values:
+                    order = (p, d, q)
+                    try:
+                        mse = self.__evaluate_arima_model(dataset, order, ground_truth_in_history=True)
+                        if mse < best_score:
+                            best_score = mse
+                            best_cfg = order
+                    except:
+                        continue
+        return best_cfg, best_score
+
+    def __evaluate_arima_model(self, X, arima_order, ground_truth_in_history=False):
+
+        train_ratio = 0.8
+        train_size = int(len(X) * train_ratio)
+        train, test = X[0:train_size], X[train_size:]
+        history = [x for x in train]
+        predictions = list()
+
+        for t in range(len(test)):
+            model = ARIMA(history, order=arima_order)
+            model_fit = model.fit(disp=0, maxiter=200)
+            yhat = model_fit.forecast()[0][0]
+            predictions.append(yhat)
+            history.append(test[t] if ground_truth_in_history else yhat)
+        error = mean_squared_error(test, predictions)
+        return error
+
+    def __arima_model_predict(self, X, arima_order, steps_ahead):
+        # make predictions
+        predictions = list()
+        for t in range(steps_ahead):
+            model = ARIMA(X, order=arima_order)
+            model_fit = model.fit(disp=0)
+            yhat = model_fit.forecast()[0][0]
+            predictions.append(yhat)
+            X = np.append(X, yhat)
+
+        return predictions
+
+    def __init__(self, data_in, num_prediction_periods):
         if not all(isinstance(x, float) for x in data_in):
             raise ValueError('Time series must be all float values')

         self.__history = data_in
         self.__num_prediction_periods = num_prediction_periods

-        self.__stepwise_model = auto_arima(
-            data_in,
-            seasonal=False,
-            # error_action='ignore', suppress_warnings=True,
-            stepwise=True
-        )
+        p_values = [0, 1, 2, 4, 6]
+        d_values = range(0, 3)
+        q_values = range(0, 3)
+        warnings.filterwarnings("ignore")
+        self.__order, score = self.__evaluate_models(data_in, p_values, d_values, q_values)
+        self.__predictions = self.__arima_model_predict(data_in, self.__order, num_prediction_periods)

     @property
     def configuration(self):
-        return self.__stepwise_model.order
+        return self.__order

     def predict_counts(self):
-        return clip(self.__stepwise_model.predict(n_periods=self.__num_prediction_periods), 0, inf)
+        return clip(self.__predictions, 0, inf)
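Taken together, the new class replaces pmdarima's auto_arima with an explicit grid search over (p, d, q) orders, scored by walk-forward mean squared error on an 80/20 train/test split, and produces forecasts by refitting statsmodels' ARIMA one step ahead at a time. A minimal usage sketch of the class, assuming a statsmodels release that still provides statsmodels.tsa.arima_model.ARIMA (later releases removed it) and using made-up series values:

```python
import numpy as np
from scripts.algorithms.arima import ARIMAForecast

# Made-up quarterly counts, purely for illustration.
quarterly_counts = [float(x) for x in np.linspace(10.0, 50.0, num=24)]

forecast = ARIMAForecast(quarterly_counts, num_prediction_periods=4)

print(forecast.configuration)     # (p, d, q) order picked by the grid search
print(forecast.predict_counts())  # 4 forecast values, clipped to be non-negative
```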
scripts/pipeline.py: 7 changes (6 additions, 1 deletion)
@@ -11,7 +11,7 @@
 from scripts.documents_filter import DocumentsFilter
 from scripts.documents_weights import DocumentsWeights
 from scripts.filter_terms import FilterTerms
-from scripts.text_processing import LemmaTokenizer, WordAnalyzer
+from scripts.text_processing import LemmaTokenizer, WordAnalyzer, lowercase_strip_accents_and_ownership
 from scripts.tfidf_mask import TfidfMask
 from scripts.tfidf_reduce import TfidfReduce
 from scripts.tfidf_wrapper import TFIDF
@@ -140,6 +140,11 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
                                              docs_mask_dict['date_header'])
         # if other outputs
         self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
+
+        WordAnalyzer.init(
+            tokenizer=LemmaTokenizer(),
+            preprocess=lowercase_strip_accents_and_ownership,
+            ngram_range=ngram_range)
         self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)

         # todo: no output method; just if statements to call output functions...?
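WordAnalyzer.init is now called before utils.stop_tup because stop_tup filters the ranked term/score tuples against WordAnalyzer's stemmed stop-word sets, which only exist after initialization. utils.stop_tup itself is not part of this diff; the sketch below only illustrates that kind of filtering under an assumed (score, term) tuple layout.

```python
# Illustration only -- not the repository's utils.stop_tup. Assumes each tuple
# is (score, term) and that the stop-word sets contain stemmed tokens.
def stop_tup_sketch(term_score_tuples, stop_words_uni, stop_words_ngram):
    kept = []
    for score, term in term_score_tuples:
        tokens = term.split()
        if len(tokens) == 1:
            if tokens[0] in stop_words_uni:
                continue  # drop stop-word unigrams
        elif any(token in stop_words_ngram for token in tokens):
            continue  # drop n-grams containing an n-gram stop word
        kept.append((score, term))
    return kept
```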
setup.py: 4 changes (2 additions, 2 deletions)
@@ -53,8 +53,8 @@ def setup_package():
             'License :: MIT License',
             'Programming Language :: Python :: 3.6',
         ],
-        install_requires=['matplotlib', 'numpy', 'scipy', 'wordcloud', 'pandas', 'tqdm', 'nltk', 'scikit-learn', 'xlrd',
-                          'python-Levenshtein', 'gensim', 'pmdarima>=1.1.0', 'keras', 'tensorflow', 'keras_tqdm',
+        install_requires=['matplotlib', 'numpy', 'scipy>=1.2.1', 'wordcloud', 'pandas', 'tqdm', 'nltk', 'scikit-learn', 'xlrd',
+                          'python-Levenshtein', 'gensim', 'statsmodels', 'keras', 'tensorflow', 'keras_tqdm',
                           'patsy', 'humanfriendly', 'psutil', 'jinja2'],
         # extras_require={'dev': ['check-manifest'],'test': ['coverage'],},
         python_requires='>=3.6',
tests/algorithms/test_arima.py: 16 changes (2 additions, 14 deletions)
@@ -12,7 +12,6 @@
 import platform; print(platform.platform())
 import sys; print("Python", sys.version)
 import os
-import pmdarima; print("pmdarima", pmdarima.__version__)
 import pandas as pd
 import numpy as np; print("NumPy", np.__version__)
 import scipy; print("SciPy", scipy.__version__)
@@ -52,22 +51,11 @@ def test_linearly_increasing_sequence_fuel_cell(self):

         np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=0)

-    def test_linearly_increasing_sequence_combustion_engine(self):
-        time_series = pd.read_csv(os.path.join('tests','data', 'combustion_engine_quarterly.csv')).values.tolist()
-        time_series = [item for sublist in time_series for item in sublist]
-        num_predicted_periods = 4
-        expected_prediction = [333., 333., 334., 335.]
-        arima = ARIMAForecast(np.array(time_series).astype(float), num_predicted_periods)
-
-        actual_prediction = arima.predict_counts()
-
-        np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=0)
-
-    def test_linearly_increasing_sequence_image_data(self):
+    def test_linearly_decreasing_sequence_image_data(self):
         time_series = pd.read_csv(os.path.join('tests','data', 'image_data_quarterly.csv')).values.tolist()
         time_series = [item for sublist in time_series for item in sublist]
         num_predicted_periods = 4
-        expected_prediction = [577., 583., 590., 597.]
+        expected_prediction = [562., 561., 558., 556.]
         arima = ARIMAForecast(np.array(time_series).astype(float), num_predicted_periods)

         actual_prediction = arima.predict_counts()
tests/data/combustion_engine_quarterly.csv: 52 changes (0 additions, 52 deletions)

This file was deleted.