diff --git a/README.md b/README.md index 3ecd50fd..444ca62f 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,8 @@ environment `text_venv`: ### Install from PyPI 1. Install `pxtextmining` and its PyPI dependencies: - - `pip3 install pxtextmining==0.3.0` (Linux & MacOS); - - `pip install pxtextmining==0.3.0` (Windows); + - `pip3 install pxtextmining==0.3.2` (Linux & MacOS); + - `pip install pxtextmining==0.3.2` (Windows); 1. We also need to install a couple of [`spaCy`](https://github.com/explosion/spacy-models) models. @@ -65,7 +65,7 @@ All steps in one go: ``` python3 -m venv text_venv source text_venv/bin/activate - pip3 install pxtextmining==0.3.0 + pip3 install pxtextmining==0.3.2 pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz ``` @@ -73,7 +73,7 @@ All steps in one go: ``` python -m venv text_venv text_venv\Scripts\activate - pip install pxtextmining==0.3.0 + pip install pxtextmining==0.3.2 pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz ``` diff --git a/build/lib/pxtextmining/__init__.py b/build/lib/pxtextmining/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/pxtextmining/factories/__init__.py b/build/lib/pxtextmining/factories/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/pxtextmining/factories/factory_data_load_and_split.py b/build/lib/pxtextmining/factories/factory_data_load_and_split.py index 320ec086..ca828d80 100644 --- a/build/lib/pxtextmining/factories/factory_data_load_and_split.py +++ b/build/lib/pxtextmining/factories/factory_data_load_and_split.py @@ -4,13 +4,18 @@ from sklearn.model_selection import train_test_split -def factory_data_load_and_split(filename, target, predictor, test_size=0.33): +def factory_data_load_and_split(filename, target, predictor, test_size=0.33, reduce_criticality=False, theme=None): """ Function loads the dataset, renames the response and predictor as "target" and "predictor" respectively, and splits the dataset into training and test sets. - :param str filename: Dataset name (CSV), including the data type suffix. The dataset should be placed in folder - ``pxtextmining/datasets``. If ``filename`` is ``None``, the data are read from the database. + **NOTE:** As described later, arguments `reduce_criticality` and `theme` are for internal use by Nottinghamshire + Healthcare NHS Foundation Trust or other trusts who use the theme ("Access", "Environment/ facilities" etc.) and + criticality labels. They can otherwise be safely ignored. + + :param str, pandas.DataFrame filename: A ``pandas.DataFrame`` with the data (class and text columns), otherwise the + dataset name (CSV), including full path to the data folder (if not in the project's working directory), and the + data type suffix (".csv"). If ``filename`` is ``None``, the data are read from the database. **NOTE:** The feature that reads data from the database is for internal use only. Experienced users who would like to pull their data from their own databases can, of course, achieve that by slightly modifying the relevant lines in the script. 
A "my.conf" file will need to be placed in the root, with five lines, as follows @@ -24,6 +29,19 @@ def factory_data_load_and_split(filename, target, predictor, test_size=0.33): :param str target: Name of the response variable. :param str predictor: Name of the predictor variable. :param float test_size: Proportion of data that will form the test dataset. + :param bool reduce_criticality: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts + that hold data on criticality. If `True`, then all records with a criticality of "-5" (respectively, "5") are + assigned a criticality of "-4" (respectively, "4"). This is to avoid situations where the pipeline breaks due to + a lack of sufficient data for "-5" and/or "5". Defaults to `False`. + :param str theme: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts + that use theme labels ("Access", "Environment/ facilities" etc.). The column name of the theme variable. + Defaults to `None`. If supplied, the theme variable will be used as a predictor (along with the text predictor) + in the model that is fitted with criticality as the response variable. The rationale is two-fold. First, to + help the model improve predictions on criticality when the theme labels are readily available. Second, to force + the criticality for "Couldn't be improved" to always be "3" in the training and test data, as well as in the + predictions. This is the only criticality value that "Couldn't be improved" can take, so by forcing it to always + be "3", we are improving model performance, but are also correcting possible erroneous assignments of values + other than "3" that are attributed to human error. :return: A tuple of length 4: predictor-train, predictor-test, target-train and target-test datasets. 
""" @@ -31,38 +49,54 @@ def factory_data_load_and_split(filename, target, predictor, test_size=0.33): # Choose to read CSV from folder or table directly from database if filename is not None: - data_path = path.join('datasets', filename) - text_data = pd.read_csv(data_path, encoding='utf-8') + if isinstance(filename, str): + text_data = pd.read_csv(filename, encoding='utf-8') + else: + text_data = filename else: db = mysql.connector.connect(option_files="my.conf", use_pure=True) - with db.cursor() as cursor: - cursor.execute( - "SELECT " + target + ", " + predictor + " FROM text_data" - ) - text_data = cursor.fetchall() - text_data = pd.DataFrame(text_data) - text_data.columns = cursor.column_names + if theme is None: + with db.cursor() as cursor: + cursor.execute( + "SELECT " + target + ", " + predictor + " FROM text_data" + ) + text_data = cursor.fetchall() + text_data = pd.DataFrame(text_data) + text_data.columns = cursor.column_names + else: + with db.cursor() as cursor: + cursor.execute( + "SELECT " + target + ", " + predictor + ", " + theme + " FROM text_data" + ) + text_data = cursor.fetchall() + text_data = pd.DataFrame(text_data) + text_data.columns = cursor.column_names - text_data = text_data.rename(columns={target: "target", predictor: "predictor"}) - text_data = text_data.loc[text_data.target.notnull()].copy() - text_data = text_data.loc[text_data.target.notna()].copy() - text_data['predictor'] = text_data.predictor.fillna('__none__') + text_data = text_data.rename(columns={target: 'target', predictor: 'predictor'}) + if theme is not None: + text_data = text_data.rename(columns={theme: 'theme'}) + text_data = text_data.dropna(subset=['target', 'predictor']).copy() + text_data['predictor'] = text_data.predictor.fillna('__notext__') - # This is specific to NHS patient feedback data labelled with "criticality" classes. Should remove when a - # proper API is developed for this function. 
-    if target == 'criticality':
+    # This is specific to NHS patient feedback data labelled with "criticality" classes
+    if reduce_criticality:
         text_data = text_data.query("target in ('-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '5')")
         text_data.loc[text_data.target == '-5', 'target'] = '-4'
         text_data.loc[text_data.target == '5', 'target'] = '4'
+        if theme is not None:
+            text_data.loc[text_data['theme'] == "Couldn't be improved", 'target'] = '3'

     print('Preparing training and test sets...')
-    x = pd.DataFrame(text_data["predictor"])
-    y = text_data["target"].to_numpy()
-    x_train, x_test, y_train, y_test = train_test_split(x, y,
-                                                        test_size=test_size,
-                                                        stratify=y,
-                                                        shuffle=True,
-                                                        # random_state=42 # https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn
-                                                        )
+    x = text_data[['predictor']].copy()  # Needs to be a DataFrame, not a pandas Series
+    if theme is not None:
+        x['theme'] = text_data['theme'].copy()
+    y = text_data['target'].to_numpy()
+    x_train, x_test, y_train, y_test, index_training_data, index_test_data = \
+        train_test_split(x, y, x.index,
+                         test_size=test_size,
+                         stratify=y,
+                         shuffle=True
+                         )
+    print("Done")

-    return x_train, x_test, y_train, y_test
+    return x_train, x_test, y_train, y_test, index_training_data, index_test_data
diff --git a/build/lib/pxtextmining/factories/factory_model_performance.py b/build/lib/pxtextmining/factories/factory_model_performance.py
new file mode 100644
index 00000000..da319521
--- /dev/null
+++ b/build/lib/pxtextmining/factories/factory_model_performance.py
@@ -0,0 +1,126 @@
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.metrics import balanced_accuracy_score, confusion_matrix, matthews_corrcoef
+from pxtextmining.helpers.metrics import class_balance_accuracy_score
+
+
+def factory_model_performance(pipe, x_train, y_train, x_test, y_test,
+                              metric):
+
+    """
+    Evaluate the performance of a fitted pipeline.
+
+    :param pipe: Fitted `sklearn.pipeline.Pipeline
+        `_/
+        `imblearn.pipeline.Pipeline
+        `_
+    :param x_train: Training data (predictor).
+    :param y_train: Training data (response).
+    :param x_test: Test data (predictor).
+    :param y_test: Test data (response).
+    :param str metric: Scorer that was used in pipeline tuning ("accuracy_score", "balanced_accuracy_score",
+        "matthews_corrcoef", "class_balance_accuracy_score").
+    :return: A ``tuple`` of length 5:
+
+        - The fitted ``Scikit-learn``/``imblearn`` pipeline;
+        - A ``pandas.DataFrame`` with all (hyper)parameter values and models tried during fitting;
+        - A ``pandas.DataFrame`` with the predictions on the test set;
+        - A ``pandas.DataFrame`` with accuracies per class;
+        - A bar plot comparing the mean scores (of the user-supplied metric parameter) from the cross-validation on
+          the training set, for the best (hyper)parameter values for each learner;
+    """
+
+    refit = metric.replace("_", " ").replace(" score", "").title()
+
+    aux = pd.DataFrame(pipe.best_params_.items())
+    best_estimator = aux[aux[0] == "clf__estimator"].reset_index()[1][0]
+    estimator_position = len(pipe.best_estimator_) - 1
+    pipe.best_estimator_.steps.pop(estimator_position)
+    pipe.best_estimator_.steps.append(("clf", best_estimator))
+    pipe.best_estimator_.fit(x_train, y_train)
+
+    print("The best estimator is %s" % (pipe.best_estimator_.named_steps["clf"]))
+    print("The best parameters are:")
+    for param, value in pipe.best_params_.items():
+        print("{}: {}".format(param, value))
+    print("The best score from the cross-validation for \n the supplied scorer (" +
+          refit + ") is %s"
+          % (round(pipe.best_score_, 2)))
+
+    pred = pipe.best_estimator_.predict(x_test)
+    cm = confusion_matrix(y_test, pred)
+
+    print("Model accuracy on the test set is %s percent"
+          % (int(pipe.best_estimator_.score(x_test, y_test) * 100)))
+    print("Balanced accuracy on the test set is %s percent"
+          % (int(balanced_accuracy_score(y_test, pred) * 100)))
+    print("Class balance accuracy on the test set is %s percent"
+          % (int(class_balance_accuracy_score(y_test, pred) * 100)))
+    print("Matthews correlation on the test set is %s "
+          % (round(matthews_corrcoef(y_test, pred), 2)))
+
+    accuracy_per_class = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
+    accuracy_per_class = pd.DataFrame(accuracy_per_class.diagonal())
+    accuracy_per_class.columns = ["accuracy"]
+    unique, frequency = np.unique(y_test, return_counts=True)
+    accuracy_per_class["class"], accuracy_per_class["counts"] = unique, frequency
+    accuracy_per_class = accuracy_per_class[["class", "counts", "accuracy"]]
+
+    tuning_results = pd.DataFrame(pipe.cv_results_)
+    tuned_learners = []
+    for i in tuning_results["param_clf__estimator"]:
+        tuned_learners.append(i.__class__.__name__)
+    tuning_results["learner"] = tuned_learners
+    y_axis = "mean_test_" + refit
+    tuning_results = tuning_results.sort_values(y_axis, ascending=False)
+    tuning_results.columns = tuning_results.columns.str.replace('alltrans__process__', '')  # When using ordinal with theme='label', names are too long.
+
+    # Convert non-numeric to strings. This is to ensure that writing to MySQL won't throw an error.
+    # (There MUST be a better way of fixing this!)
+    for i in tuning_results.columns:
+        if (
+                (not isinstance(tuning_results[i][0], float)) and
+                (not isinstance(tuning_results[i][0], int)) and
+                (not isinstance(tuning_results[i][0], str))
+        ):
+            tuning_results[i] = tuning_results[i].apply(str)
+
+    print("Plotting performance of the best of each estimator...")
+
+    # Find the best tunings for each model. #
+    # Note that SGDClassifier fits a logistic regression when loss is "log", but a Linear SVM when loss is "hinge".
+    # Looking at column "learner" in "tuning results", one cannot tell which of the two models SGD is.
+    # Let's make that clear.
+    if 'param_clf__estimator__loss' in tuning_results.columns:  # Need this check, as models other than SGD don't have a loss parameter.
+ learners = [] + for i, j in zip(tuning_results["learner"], tuning_results["param_clf__estimator__loss"]): + if j == "log": + learners.append("Logistic") + elif j == "hinge": + learners.append("Linear SVM") + else: + learners.append(i) + tuning_results["learner"] = learners + + # Now, let's find the best tunings for each of the fitted models + aux = tuning_results.filter(regex="mean_test|learner").groupby(["learner"]).max().reset_index() + aux = aux.sort_values([y_axis], ascending=False) + aux = aux.melt("learner") + aux["variable"] = aux["variable"].str.replace("mean_test_", "") + aux["learner"] = aux["learner"].str.replace("Classifier", "") + + p_compare_models_bar = sns.barplot(x="learner", y="value", hue="variable", + data=aux) + p_compare_models_bar.figure.set_size_inches(15, 13) + p_compare_models_bar.set_xticklabels(p_compare_models_bar.get_xticklabels(), + rotation=90) + plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0) + p_compare_models_bar.set(xlabel=None, ylabel=None, + title="Learner performance ordered by " + refit) + + print("Fitting optimal pipeline on whole dataset...") + pipe.best_estimator_.fit(pd.concat([x_train, x_test]), np.concatenate([y_train, y_test])) + + return pipe, tuning_results, pred, accuracy_per_class, p_compare_models_bar diff --git a/build/lib/pxtextmining/factories/factory_pipeline.py b/build/lib/pxtextmining/factories/factory_pipeline.py new file mode 100644 index 00000000..a9c96ff7 --- /dev/null +++ b/build/lib/pxtextmining/factories/factory_pipeline.py @@ -0,0 +1,423 @@ +from imblearn import FunctionSampler +from imblearn.pipeline import Pipeline +# from sklearn.pipeline import Pipeline +from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, matthews_corrcoef +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, OneHotEncoder, StandardScaler +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.feature_selection import SelectPercentile, chi2, f_classif +from sklearn.model_selection import RandomizedSearchCV +# from sklearn.svm import LinearSVC +from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier +from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB +from sklearn.neighbors import KNeighborsClassifier, NearestCentroid +from sklearn.ensemble import RandomForestClassifier +from pxtextmining.helpers.text_preprocessor import text_preprocessor +from pxtextmining.helpers.sentiment_scores import sentiment_scores +from pxtextmining.helpers.text_length import text_length +from pxtextmining.helpers.tokenization import LemmaTokenizer +from pxtextmining.helpers.word_vectorization import EmbeddingsTransformer +from pxtextmining.helpers.oversampling import random_over_sampler_data_generator +from pxtextmining.helpers.metrics import class_balance_accuracy_score +from pxtextmining.helpers.estimator_switcher import ClfSwitcher +from pxtextmining.helpers.ordinal_classification import OrdinalClassifier +from pxtextmining.helpers.scaler_switcher import ScalerSwitcher +from pxtextmining.helpers.feature_selection_switcher import FeatureSelectionSwitcher +from pxtextmining.helpers.text_transformer_switcher import TextTransformerSwitcher +from pxtextmining.helpers.theme_binarization import ThemeBinarizer + + +def factory_pipeline(x, y, tknz="spacy", + ordinal=False, + metric="class_balance_accuracy_score", + cv=5, n_iter=100, n_jobs=5, verbose=3, + learners=[ + "SGDClassifier", + 
"RidgeClassifier", + "Perceptron", + "PassiveAggressiveClassifier", + "BernoulliNB", + "ComplementNB", + "MultinomialNB", + # "KNeighborsClassifier", + # "NearestCentroid", + "RandomForestClassifier" + ], + theme=None): + + """ + Prepare and fit a text classification pipeline. + + The pipeline's parameter grid switches between two approaches to text classification: Bag-of-Words and Embeddings. + For the former, both TF-IDF and raw counts are tried out. + + The pipeline does the following: + + - Feature engineering: + + * Converts text into TF-IDFs or `GloVe `_ word vectors with + `spaCy `_; + * Creates a new feature that is the length of the text in each record; + * Performs sentiment analysis on the text feature and creates new features that are all scores/indicators + produced by `TextBlob `_ + and `vaderSentiment `_. + * Applies `sklearn.preprocessing.KBinsDiscretizer + `_ to the text + length and sentiment indicator features, and `sklearn.preprocessing.StandardScaler + `_ to the + embeddings (word vectors); + - Up-sampling of rare classes: uses `imblearn.over_sampling.RandomOverSampler + `_ + to up-sample rare classes. Currently the threshold to consider a class as rare and the up-balancing values are + fixed and cannot be user-defined. + - Tokenization and lemmatization of the text feature: uses ``spaCy`` (default) or `NLTK `_. + It also strips punctuation, excess spaces, and metacharacters "r" and "n" from the text. It converts emojis into + "__text__" (where "text" is the emoji name), and NA/NULL values into "__notext__" (the pipeline does get rid of + records with no text, but this conversion at least deals with any escaping ones). + - Feature selection: Uses `sklearn.feature_selection.SelectPercentile + `_ + with `sklearn.feature_selection.chi2 + `_ + for TF-IDFs or `sklearn.feature_selection.f_classif + `_ + for embeddings. + - Fitting and benchmarking of user-supplied ``Scikit-learn`` `estimators + `_. + + The numeric values in the grid are currently lists/tuples of values that are defined either empirically or + are based on the published literature (e.g. for Random Forest, see `Probst et al. 2019 + `_). Values may be replaced by appropriate distributions in a future release. + + **NOTE:** As described later, argument `theme` is for internal use by Nottinghamshire Healthcare NHS Foundation + Trust or other trusts who use the theme ("Access", "Environment/ facilities" etc.) labels. It can otherwise be + safely ignored. + + :param bool ordinal: Whether to fit an ordinal classification model. The ordinal model is the implementation of + `Frank and Hall (2001) `_ that can use any + standard classification model that calculates probabilities. + :param x: The text feature. + :param y: The response variable. + :param str tknz: Tokenizer to use ("spacy" or "wordnet"). + :param str metric: Scorer to use during pipeline tuning ("accuracy_score", "balanced_accuracy_score", + "matthews_corrcoef", "class_balance_accuracy_score"). + :param int cv: Number of cross-validation folds. + :param int n_iter: Number of parameter settings that are sampled (see `sklearn.model_selection.RandomizedSearchCV + `_). + :param int n_jobs: Number of jobs to run in parallel (see ``sklearn.model_selection.RandomizedSearchCV``). + :param int verbose: Controls the verbosity (see ``sklearn.model_selection.RandomizedSearchCV``). + :param str, list[str] learners: A list of ``Scikit-learn`` names of the learners to tune. 
Must be one or more of + "SGDClassifier", "RidgeClassifier", "Perceptron", "PassiveAggressiveClassifier", "BernoulliNB", "ComplementNB", + "MultinomialNB", "KNeighborsClassifier", "NearestCentroid", "RandomForestClassifier". When a single model is + used, it can be passed as a string. + :param str theme: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts + that use theme labels ("Access", "Environment/ facilities" etc.). The column name of the theme variable. + Defaults to `None`. If supplied, the theme variable will be used as a predictor (along with the text predictor) + in the model that is fitted with criticality as the response variable. The rationale is two-fold. First, to + help the model improve predictions on criticality when the theme labels are readily available. Second, to force + the criticality for "Couldn't be improved" to always be "3" in the training and test data, as well as in the + predictions. This is the only criticality value that "Couldn't be improved" can take, so by forcing it to always + be "3", we are improving model performance, but are also correcting possible erroneous assignments of values + other than "3" that are attributed to human error. + :return: A tuned `sklearn.pipeline.Pipeline + `_/ + `imblearn.pipeline.Pipeline + `_. + """ + + features_text = 'predictor' + + # Define transformers for pipeline # + # Transformer that calculates text length and transforms it. + transformer_text_length = Pipeline(steps=[ + ('length', (FunctionTransformer(text_length))), + ('scaler', (ScalerSwitcher())) + ]) + + # Transformer that calculates sentiment indicators (e.g. TextBlob, VADER) and transforms them. + transformer_sentiment = Pipeline(steps=[ + ('sentiment', (FunctionTransformer(sentiment_scores))), + ('scaler', (ScalerSwitcher())) + ]) + + # Transformer that converts text to Bag-of-Words or embeddings. + transformer_text = Pipeline(steps=[ + ('text', (TextTransformerSwitcher())) + ]) + + # Gather transformers. + preprocessor = ColumnTransformer( + transformers=[ + ('sentimenttr', transformer_sentiment, features_text), + ('lengthtr', transformer_text_length, features_text), + ('texttr', transformer_text, features_text)]) + + # Up-sampling step # + oversampler = FunctionSampler(func=random_over_sampler_data_generator, + kw_args={'threshold': 200, + 'up_balancing_counts': 300, + 'random_state': 0}, + validate=False) + + # Make pipeline # + if ordinal and theme is not None: + # This is for internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts that use theme + # labels ("Access", "Environment/ facilities" etc.). We want the criticality for "Couldn't be improved" to + # always be "3". The theme label is passed as a one-hot encoded set of columns (or as a "binarized" column where + # 1 is for "Couldn't be improved" and 0 is for everything else) of which the first is for # "Couldn't be + # improved". The one-hot encoded columns (or the binarized column) are (is) actually the first column(s) of the + # whole sparse matrix that has the TF-IDFs, sentiment features etc. that is produced when fitting by the + # pipeline. When running the ordinal classification model, we want to find the records with "Couldn't be + # improved" (i.e. records with a value of 1) in the first column and replace the predicted criticality values + # with "3". + # When one-hot encoded, we pass all of the theme's columns into the model, so we handle them separately from + # text predictor to avoid the feature selection step for them. 
We thus make a separate pipeline with the
+        # preprocessor and feature selection steps for the text predictor (pipe_all_but_theme) and one-hot encode the
+        # theme column in all_transforms. We want to place "Couldn't be improved" in position 0 (first column) of the
+        # thus produced sparse matrix so as to easily access it in the code for the ordinal model (OrdinalClassifier()).
+        pipe_all_but_theme = Pipeline([
+            ('preprocessor', preprocessor),
+            ('featsel', FeatureSelectionSwitcher())
+        ])
+
+        all_transforms = ColumnTransformer([
+            ('theme', ScalerSwitcher(), ['theme']),  # Try out OneHotEncoder() or ThemeBinarizer().
+            ('process', pipe_all_but_theme, [features_text])
+        ])
+
+        pipe = Pipeline([
+            ('sampling', oversampler),
+            ('alltrans', all_transforms),
+            ('clf', OrdinalClassifier(theme='theme', target_class_value='3', theme_class_value=1))
+        ])
+    elif ordinal and theme is None:
+        pipe = Pipeline([
+            ('sampling', oversampler),
+            ('preprocessor', preprocessor),
+            ('featsel', FeatureSelectionSwitcher()),
+            ('clf', OrdinalClassifier())])
+    else:
+        pipe = Pipeline([
+            ('sampling', oversampler),
+            ('preprocessor', preprocessor),
+            ('featsel', FeatureSelectionSwitcher()),
+            ('clf', ClfSwitcher())])
+
+    # Define (hyper)parameter grid #
+    # A few initial value ranges for some (hyper)parameters.
+    param_grid_preproc = {
+        'sampling__kw_args': [{'threshold': 100, 'up_balancing_counts': 300},
+                              {'threshold': 100, 'up_balancing_counts': 800},
+                              {'threshold': 200, 'up_balancing_counts': 300},
+                              {'threshold': 200, 'up_balancing_counts': 800}],
+        'clf__estimator': None,
+        'preprocessor__sentimenttr__scaler__scaler': None,
+        'preprocessor__lengthtr__scaler__scaler': None,
+        'preprocessor__texttr__text__transformer': None,
+        'featsel__selector': [SelectPercentile()],
+        'featsel__selector__percentile': [70, 85, 100]
+    }
+
+    if ordinal and theme is not None:
+        param_grid_preproc['alltrans__theme__scaler'] = None
+
+    # If a single model is passed as a string, convert to list
+    if isinstance(learners, str):
+        learners = [learners]
+
+    # Just in case the user has supplied the same learner more than once
+    learners = list(set(learners))
+
+    # For Frank and Hall's (2001) ordinal method to work, we need models that can calculate probs/scores.
+    if ordinal:
+        learners = [lrn for lrn in learners if lrn not in ["RidgeClassifier", "Perceptron",
+                                                           "PassiveAggressiveClassifier", "NearestCentroid"]]
+
+    # Replace each learner name with the learner class in the 'learners' function argument.
+    for i in learners:
+        if i == "SGDClassifier":
+            learners[learners.index(i)] = SGDClassifier()
+        if i == "RidgeClassifier":
+            learners[learners.index(i)] = RidgeClassifier()
+        if i == "Perceptron":
+            learners[learners.index(i)] = Perceptron()
+        if i == "PassiveAggressiveClassifier":
+            learners[learners.index(i)] = PassiveAggressiveClassifier()
+        if i == "BernoulliNB":
+            learners[learners.index(i)] = BernoulliNB()
+        if i == "ComplementNB":
+            learners[learners.index(i)] = ComplementNB()
+        if i == "MultinomialNB":
+            learners[learners.index(i)] = MultinomialNB()
+        if i == "KNeighborsClassifier":
+            learners[learners.index(i)] = KNeighborsClassifier()
+        if i == "NearestCentroid":
+            learners[learners.index(i)] = NearestCentroid()
+        if i == "RandomForestClassifier":
+            learners[learners.index(i)] = RandomForestClassifier()
+
+    # Further populate (hyper)parameter grid.
+    # NOTE ABOUT PROCESS BELOW:
+    # Use TfidfVectorizer() as CountVectorizer() also, to determine if raw
+    # counts instead of frequencies improve performance. This requires
+    # use_idf=False and norm=None.
We want to ensure that norm=None
+    # will not be combined with use_idf=True inside the grid search, so we
+    # create a separate parameter set to prevent this from happening. We do
+    # this below with temp list aux1.
+    # Meanwhile, we want norm='l2' (the default) for the grid defined by temp
+    # list aux. If we don't explicitly set norm='l2' in aux, the
+    # norm column in the table of the CV results (following fitting) is
+    # always empty. My speculation is that Scikit-learn does consider norm
+    # to be 'l2' for aux, but it doesn't print it. That's because unless we
+    # explicitly run aux['preprocessor__texttr__text__transformer__norm'] = ['l2'],
+    # setting norm as 'l2' in aux is implicit (i.e. it's the default), while setting
+    # norm as None in aux1 is explicit (i.e. done by the user). But we want
+    # the column norm in the CV results to clearly state which runs used the
+    # 'l2' norm, hence we explicitly run command
+    # aux['preprocessor__texttr__text__transformer__norm'] = ['l2'].

+    param_grid = []
+    for i in learners:
+        for j in [TfidfVectorizer(), EmbeddingsTransformer()]:
+            aux = param_grid_preproc.copy()
+            aux['clf__estimator'] = [i]
+            aux['preprocessor__texttr__text__transformer'] = [j]
+            if ordinal and theme is not None:
+                onehot_categories = [["Couldn't be improved", 'Access', 'Care received', 'Communication', 'Dignity',
+                                      'Environment/ facilities', 'Miscellaneous', 'Staff', 'Transition/coordination']]
+                aux['alltrans__theme__scaler'] = \
+                    [OneHotEncoder(categories=onehot_categories), ThemeBinarizer(class_col='theme',
+                                                                                 target_class="Couldn't be improved")]

+            # if i.__class__.__name__ == LinearSVC().__class__.__name__:
+            #     aux['clf__estimator__max_iter'] = [10000]
+            #     aux['clf__estimator__class_weight'] = [None, 'balanced']
+            #     # aux['clf__estimator__dual'] = [True, False]  # https://stackoverflow.com/questions/52670012/convergencewarning-liblinear-failed-to-converge-increase-the-number-of-iterati
+            if i.__class__.__name__ == BernoulliNB().__class__.__name__:
+                aux['clf__estimator__alpha'] = (0.1, 0.5, 1)
+            if i.__class__.__name__ == ComplementNB().__class__.__name__:
+                aux['clf__estimator__alpha'] = (0.1, 0.5, 1)
+            if i.__class__.__name__ == MultinomialNB().__class__.__name__:
+                aux['clf__estimator__alpha'] = (0.1, 0.5, 1)
+            if i.__class__.__name__ == SGDClassifier().__class__.__name__:
+                aux['clf__estimator__max_iter'] = [10000]
+                aux['clf__estimator__class_weight'] = [None, 'balanced']
+                aux['clf__estimator__penalty'] = ('l2', 'elasticnet')
+                if ordinal:
+                    aux['clf__estimator__loss'] = ['log']
+                else:
+                    aux['clf__estimator__loss'] = ['hinge', 'log']
+            if i.__class__.__name__ == RidgeClassifier().__class__.__name__:
+                aux['clf__estimator__class_weight'] = [None, 'balanced']
+                aux['clf__estimator__alpha'] = (0.1, 1.0, 10.0)
+            if i.__class__.__name__ == Perceptron().__class__.__name__:
+                aux['clf__estimator__class_weight'] = [None, 'balanced']
+                aux['clf__estimator__penalty'] = ('l2', 'elasticnet')
+            if i.__class__.__name__ == RandomForestClassifier().__class__.__name__:
+                aux['clf__estimator__max_features'] = ('sqrt', 0.666)

+            if j.__class__.__name__ == TfidfVectorizer().__class__.__name__:
+                aux['featsel__selector__score_func'] = [chi2]
+                aux['preprocessor__texttr__text__transformer__tokenizer'] = [LemmaTokenizer(tknz)]
+                aux['preprocessor__texttr__text__transformer__preprocessor'] = [text_preprocessor]
+                aux['preprocessor__texttr__text__transformer__norm'] = ['l2']
+                aux['preprocessor__texttr__text__transformer__ngram_range'] = ((1, 3), (2, 3), (3, 3))
aux['preprocessor__texttr__text__transformer__max_df'] = [0.7, 0.95]
+                aux['preprocessor__texttr__text__transformer__min_df'] = [3, 1]
+                aux['preprocessor__texttr__text__transformer__use_idf'] = [True, False]

+                # The transformation is a k-means discretizer with 3 bins:
+                # 1. The three bins represent short, medium and long text length. Reluctant to make n_bins a tunable
+                #    parameter for efficiency reasons;
+                # 2. Discretizing and one-hot encoding satisfies the data format requirements for Chi^2-based feature
+                #    selection;
+                # 3. An added benefit is that this data format is acceptable by different models, some of which may
+                #    not be scale-invariant, while others do not accept negative or continuous values other than
+                #    TF-IDFs;
+                aux['preprocessor__lengthtr__scaler__scaler'] = \
+                    [KBinsDiscretizer(n_bins=3, encode='onehot', strategy='kmeans')]

+                # The transformation is a k-means discretizer with 4 or 8 bins supplied as a tunable argument later on:
+                # 1. The 4 bins represent weak, weak-medium, medium-strong and strong for values in [0, 1];
+                # 2. The 8 bins represent weak, weak-medium, medium-strong and strong for values in [-1, 0] and [0, 1]
+                #    (i.e. 8 bins for values in [-1, 1]);
+                # 3. We also allow for the possibility of 8 bins for [0, 1] and 4 bins for [-1, 1]; no harm in trying;
+                # 4. Discretizing and one-hot encoding satisfies the data format requirements for Chi^2-based feature
+                #    selection;
+                # 5. An added benefit is that this data format is acceptable by different models, some of which may
+                #    not be scale-invariant, while others do not accept negative or continuous values other than
+                #    TF-IDFs;
+                aux['preprocessor__sentimenttr__scaler__scaler'] = [KBinsDiscretizer(encode='onehot', strategy='kmeans')]
+                aux['preprocessor__sentimenttr__scaler__scaler__n_bins'] = [4, 8]  # Based on the idea of having 4 (8) bins for indicators in [0, 1] ([-1, 1]), but open to trying 8 (4) for [0, 1] ([-1, 1]) too.

+                param_grid.append(aux)

+                aux1 = aux.copy()
+                aux1['preprocessor__texttr__text__transformer__use_idf'] = [False]
+                aux1['preprocessor__texttr__text__transformer__norm'] = [None]

+                param_grid.append(aux1)

+            if j.__class__.__name__ == EmbeddingsTransformer().__class__.__name__:
+                aux['featsel__selector__score_func'] = [f_classif]
+                aux['preprocessor__lengthtr__scaler__scaler'] = [StandardScaler()]
+                aux['preprocessor__sentimenttr__scaler__scaler'] = [StandardScaler()]

+                # We don't want learners that can't handle negative data in the embeddings.
+                if (i.__class__.__name__ == BernoulliNB().__class__.__name__) or \
+                        (i.__class__.__name__ == ComplementNB().__class__.__name__) or \
+                        (i.__class__.__name__ == MultinomialNB().__class__.__name__):
+                    aux = None

+                param_grid.append(aux)

+    param_grid = [x for x in param_grid if x is not None]

+    # When a theme is supplied for the ordinal model, the pipeline steps are a little different. Step "alltrans"
+    # includes the steps for both the preprocessing of the text feature, and the one-hot encoding of the theme feature.
+    # So, a parameter such as "featsel__selector" in the pipeline without a theme feature would be
+    # "alltrans__process__featsel__selector" in this one. We need to pass these correct names to the tuning grid.
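+    # For example (illustrative): 'featsel__selector__percentile' becomes
+    # 'alltrans__process__featsel__selector__percentile' after the renaming below.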
+    if ordinal and theme is not None:
+        ordinal_with_theme_params = [
+            'featsel__selector',
+            'featsel__selector__percentile',
+            'featsel__selector__score_func',
+            'preprocessor__sentimenttr__scaler__scaler',
+            'preprocessor__sentimenttr__scaler__scaler__n_bins',
+            'preprocessor__lengthtr__scaler__scaler',
+            'preprocessor__texttr__text__transformer',
+            'preprocessor__texttr__text__transformer__tokenizer',
+            'preprocessor__texttr__text__transformer__preprocessor',
+            'preprocessor__texttr__text__transformer__norm',
+            'preprocessor__texttr__text__transformer__ngram_range',
+            'preprocessor__texttr__text__transformer__max_df',
+            'preprocessor__texttr__text__transformer__min_df',
+            'preprocessor__texttr__text__transformer__use_idf']

+        for i in range(len(param_grid)):
+            for j in ordinal_with_theme_params:
+                if j in param_grid[i].keys():
+                    old_key = j
+                    new_key = 'alltrans__process__' + old_key
+                    param_grid[i][new_key] = param_grid[i].pop(old_key)

+    # Define fitting metric (refit) and other useful performance metrics.
+    refit = metric.replace('_', ' ').replace(' score', '').title()
+    scoring = {'Accuracy': make_scorer(accuracy_score),
+               'Balanced Accuracy': make_scorer(balanced_accuracy_score),
+               'Matthews Correlation Coefficient': make_scorer(matthews_corrcoef),
+               'Class Balance Accuracy': make_scorer(class_balance_accuracy_score)}

+    # Define pipeline #
+    pipe_cv = RandomizedSearchCV(pipe, param_grid, n_jobs=n_jobs, return_train_score=False,
+                                 cv=cv, verbose=verbose,
+                                 scoring=scoring, refit=refit, n_iter=n_iter)

+    # These messages are for function helpers.text_preprocessor which is used by
+    # TfidfVectorizer() and EmbeddingsTransformer(). Having them inside text_preprocessor() prints
+    # them in each iteration, which is redundant. Having them here prints them once.
+    print('Stripping punctuation from text...')
+    print("Stripping excess spaces, whitespaces and line breaks from text...")

+    # Fit pipeline #
+    pipe_cv.fit(x, y)

+    return pipe_cv
diff --git a/build/lib/pxtextmining/factories/factory_predict_unlabelled_text.py b/build/lib/pxtextmining/factories/factory_predict_unlabelled_text.py
new file mode 100644
index 00000000..c9c055e9
--- /dev/null
+++ b/build/lib/pxtextmining/factories/factory_predict_unlabelled_text.py
@@ -0,0 +1,80 @@
+import pandas as pd
+import joblib
+from itertools import chain
+
+
+def factory_predict_unlabelled_text(dataset, predictor, pipe_path_or_object,
+                                    preds_column=None, column_names='all_cols', theme=None):
+    """
+    Predict unlabelled text data using a fitted `sklearn.pipeline.Pipeline
+    `_/`imblearn.pipeline.Pipeline
+    `_.
+
+    **NOTE:** As described later, argument `theme` is for internal use by Nottinghamshire Healthcare NHS Foundation
+    Trust or other trusts who use the theme ("Access", "Environment/ facilities" etc.) labels. It can otherwise be
+    safely ignored.
+
+    :param dataset: A ``pandas.DataFrame`` (or an object that can be converted into such) with the text data to predict
+        classes for.
+    :param str predictor: The column name of the text variable.
+    :param str, sklearn.model_selection._search.RandomizedSearchCV pipe_path_or_object: A string in the form
+        "path_to_fitted_pipeline/pipeline.sav", where "pipeline" is the name of the SAV file with the fitted
+        ``Scikit-learn``/``imblearn.pipeline.Pipeline``, or a ``sklearn.model_selection._search.RandomizedSearchCV``.
+    :param str preds_column: The user-specified name of the column that will have the predictions. If ``None`` (default),
+        then the name will be ``predictor + '_preds'``.
:param column_names: A ``list``/``tuple`` of strings with the names of the columns of the supplied data frame (incl.
+        ``predictor``) to be added to the returned ``pandas.DataFrame``. If "preds_only", then the only column in
+        the returned data frame will be ``preds_column``. Defaults to "all_cols".
+    :param str theme: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts
+        that use theme labels ("Access", "Environment/ facilities" etc.). The column name of the theme variable.
+        Defaults to `None`. If supplied, the theme variable will be used as a predictor (along with the text predictor)
+        in the model that is fitted with criticality as the response variable. The rationale is two-fold. First, to
+        help the model improve predictions on criticality when the theme labels are readily available. Second, to force
+        the criticality for "Couldn't be improved" to always be "3" in the training and test data, as well as in the
+        predictions. This is the only criticality value that "Couldn't be improved" can take, so by forcing it to always
+        be "3", we are improving model performance, but are also correcting possible erroneous assignments of values
+        other than "3" that are attributed to human error.
+    :return: A ``pandas.DataFrame`` with the predictions and any other columns supplied in ``column_names``.
+    """

+    data_unlabelled = pd.DataFrame(dataset)

+    # Rename predictor column to names pipeline knows and replace NAs with empty string.
+    if theme is None:
+        data_unlabelled = data_unlabelled.rename(columns={predictor: 'predictor'})
+    else:
+        data_unlabelled = data_unlabelled.rename(columns={predictor: 'predictor', theme: 'theme'})
+    data_unlabelled['predictor'] = data_unlabelled.predictor.fillna('')

+    # Load pipeline (if not already supplied) and make predictions
+    if isinstance(pipe_path_or_object, str):
+        pipe = joblib.load(pipe_path_or_object)
+    else:
+        pipe = pipe_path_or_object
+    if theme is None:
+        predictions = pipe.predict(data_unlabelled[['predictor']])
+    else:
+        predictions = pipe.predict(data_unlabelled[['predictor', 'theme']])

+    if preds_column is None:
+        preds_column = predictor + '_preds'
+    data_unlabelled[preds_column] = predictions

+    # Rename back to original variable names
+    data_unlabelled = data_unlabelled.rename(columns={'predictor': predictor})
+    if theme is not None:
+        data_unlabelled = data_unlabelled.rename(columns={'theme': theme})

+    # Set column names of columns to return in final data frame
+    if column_names == 'all_cols':
+        column_names = [col for col in data_unlabelled.columns if col != preds_column]
+    elif column_names == 'preds_only':
+        column_names = None
+    elif type(column_names) is str:
+        column_names = [column_names]

+    returned_cols = [[preds_column], column_names]  # column_names is a list. Put preds_column in a list to create a list
+    # of lists to unnest later to get a list of strings.
+    returned_cols = [x for x in returned_cols if x is not None]
+    returned_cols = list(chain.from_iterable(returned_cols))  # Unnest list of lists.
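+    # Illustrative walk-through (hypothetical names): with predictor='feedback', preds_column=None and the default
+    # column_names='all_cols', returned_cols ends up as ['feedback_preds', 'feedback', ...other supplied columns].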
+ + return data_unlabelled[returned_cols] diff --git a/build/lib/pxtextmining/factories/factory_write_results.py b/build/lib/pxtextmining/factories/factory_write_results.py index e2690795..92339852 100644 --- a/build/lib/pxtextmining/factories/factory_write_results.py +++ b/build/lib/pxtextmining/factories/factory_write_results.py @@ -10,7 +10,7 @@ def factory_write_results(pipe, tuning_results, pred, accuracy_per_class, p_compare_models_bar, - target, x_train, x_test, metric, + target, x_train, x_test, index_training_data, index_test_data, metric, objects_to_save=[ "pipeline", "tuning results", @@ -68,7 +68,7 @@ def factory_write_results(pipe, tuning_results, pred, accuracy_per_class, p_comp :param str save_pipeline_as: Name of saved pipeline. If "default", then it will be saved as ``'pipeline_' + target + '.sav'``. :param str results_folder_name: Name of the folder that will contain all saved results specified in - ``objects_to_save``. + ``objects_to_save``. If the folder already exists, it will be overwritten. :return: A ``tuple`` of length 3 with the following ``pandas.DataFrame`` objects: - The predictions on the test set; @@ -76,8 +76,8 @@ def factory_write_results(pipe, tuning_results, pred, accuracy_per_class, p_comp - The row indices of the test data; """ - index_training_data = pd.DataFrame(x_train.index, columns=["row_index"]) - index_test_data = pd.DataFrame(x_test.index, columns=["row_index"]) + index_training_data = pd.DataFrame(index_training_data, columns=["row_index"]) + index_test_data = pd.DataFrame(index_test_data, columns=["row_index"]) pred = pd.DataFrame(pred, columns=[target + "_pred"]) pred["row_index"] = index_test_data diff --git a/build/lib/pxtextmining/helpers/__init__.py b/build/lib/pxtextmining/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/pxtextmining/helpers/estimator_switcher.py b/build/lib/pxtextmining/helpers/estimator_switcher.py new file mode 100644 index 00000000..9d0ae65a --- /dev/null +++ b/build/lib/pxtextmining/helpers/estimator_switcher.py @@ -0,0 +1,30 @@ +from sklearn.base import BaseEstimator +from sklearn.linear_model import SGDClassifier + + +class ClfSwitcher(BaseEstimator): + """ + Class to add different learners as pipeline parameters in a + `sklearn.pipeline.Pipeline `_/ + `imblearn.pipeline.Pipeline + `_ + pipeline. + Code taken from `this post + `_. + """ + + def __init__(self, estimator=SGDClassifier(max_iter=10000)): + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) \ No newline at end of file diff --git a/build/lib/pxtextmining/helpers/feature_selection_switcher.py b/build/lib/pxtextmining/helpers/feature_selection_switcher.py new file mode 100644 index 00000000..01c1d407 --- /dev/null +++ b/build/lib/pxtextmining/helpers/feature_selection_switcher.py @@ -0,0 +1,21 @@ +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_selection import SelectPercentile + + +class FeatureSelectionSwitcher(BaseEstimator, TransformerMixin): + """ + Class for choosing between ``Scikit-learn`` `feature selection tests + `_ for use with + `sklearn.feature_selection.SelectPercentile + `_. 
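+
+    **Example** (an illustrative sketch; the grid keys mirror how ``factory_pipeline`` addresses this step):
+
+    .. code-block:: python
+
+        from sklearn.feature_selection import chi2
+
+        switcher = FeatureSelectionSwitcher()
+        switcher.set_params(selector__score_func=chi2, selector__percentile=85)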
+    """

+    def __init__(self, selector=SelectPercentile()):
+        self.selector = selector

+    def fit(self, X, y, **kwargs):
+        self.selector.fit(X, y)
+        return self

+    def transform(self, X, y=None, **kwargs):
+        return self.selector.transform(X)
diff --git a/build/lib/pxtextmining/helpers/metrics.py b/build/lib/pxtextmining/helpers/metrics.py
new file mode 100644
index 00000000..6fdff910
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/metrics.py
@@ -0,0 +1,22 @@
+import numpy as np
+from sklearn.metrics import confusion_matrix
+
+
+def class_balance_accuracy_score(y_true, y_pred):
+    """
+    Function for Class Balance Accuracy scorer
+    (p. 40 in `Mosley 2013 `_).
+
+    :param array y_true: True classes, shape = [n_samples].
+    :param array y_pred: Predicted classes, shape = [n_samples].
+    :return: cba (`float`): The Class Balance Accuracy score.
+    """
+
+    cm = confusion_matrix(y_true, y_pred)
+    c_i_dot = np.sum(cm, axis=1)
+    c_dot_i = np.sum(cm, axis=0)
+    cba = []
+    for i in range(len(c_dot_i)):
+        cba.append(cm[i][i] / max(c_i_dot[i], c_dot_i[i]))
+    cba = sum(cba) / len(cba)
+    return cba
diff --git a/build/lib/pxtextmining/helpers/ordinal_classification.py b/build/lib/pxtextmining/helpers/ordinal_classification.py
new file mode 100644
index 00000000..2e039b62
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/ordinal_classification.py
@@ -0,0 +1,111 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import clone
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import LogisticRegression
+
+
+class OrdinalClassifier(BaseEstimator):
+
+    """
+    Estimator class for building an ordinal classification model using the method of
+    `Frank and Hall (2001) `_. The code in this class is
+    based on code published online in `this post
+    `_.
+
+    **NOTE:** As described later, argument `theme` is for internal use by Nottinghamshire Healthcare NHS Foundation
+    Trust or other trusts who use the theme ("Access", "Environment/ facilities" etc.) labels. It can otherwise be
+    safely ignored.
+
+    :param estimator: A Scikit-learn classifier.
+    :param dict clfs: Helper variable. Defined inside the class.
+    :param y_factorized: Helper variable. Defined inside the class.
+    :param unique_class: Helper variable. Defined inside the class.
+    :param dict class_dict: Helper variable. Defined inside the class.
+    :param str theme: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts
+        that use theme labels ("Access", "Environment/ facilities" etc.). The column name of the theme variable.
+        Defaults to `None`. If supplied, the theme variable will be used as a predictor (along with the text predictor)
+        in the model that is fitted with criticality as the response variable. The rationale is two-fold. First, to
+        help the model improve predictions on criticality when the theme labels are readily available. Second, to force
+        the criticality for "Couldn't be improved" to always be "3" in the training and test data, as well as in the
+        predictions. This is the only criticality value that "Couldn't be improved" can take, so by forcing it to always
+        be "3", we are improving model performance, but are also correcting possible erroneous assignments of values
+        other than "3" that are attributed to human error.
+    :param str target_class_value: The criticality value to assign to "Couldn't be improved".
+    :param int theme_class_value: The value of "Couldn't be improved" in the transformed (e.g. one-hot encoded) theme
+        column.
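+
+    **Example** (a sketch of the Frank & Hall decomposition; three ordinal classes "1" < "2" < "3" are assumed):
+    two binary models are fitted, one for Pr(y > "1") and one for Pr(y > "2"), and the class probabilities are then
+    recovered as Pr("1") = 1 - Pr(y > "1"), Pr("2") = Pr(y > "1") - Pr(y > "2") and Pr("3") = Pr(y > "2").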
+    """

+    def __init__(self, estimator=LogisticRegression(), clfs={}, y_factorized=None, unique_class=None, class_dict=None,
+                 theme=None, target_class_value='3', theme_class_value=1):
+        self.estimator = estimator
+        self.clfs = clfs
+        self.y_factorized = y_factorized
+        self.unique_class = unique_class
+        self.class_dict = class_dict
+        self.theme = theme
+        self.target_class_value = target_class_value
+        self.theme_class_value = theme_class_value

+    def fit(self, X, y=None, **kwargs):
+        self.y_factorized = pd.Series(y.astype('int64')).factorize(sort=True)[0]
+        self.unique_class = np.sort(np.unique(self.y_factorized))
+        self.class_dict = dict(zip(self.y_factorized, y))

+        if self.unique_class.shape[0] > 2:
+            for i in range(self.unique_class.shape[0] - 1):
+                # For each of the k - 1 ordinal splits, fit a binary classification problem
+                y_binary = (self.y_factorized > self.unique_class[i]).astype(np.uint8)
+                estimator = clone(self.estimator)
+                estimator.fit(X, y_binary)
+                self.clfs[i] = estimator
+        return self

+    def predict_proba_all(self, X):
+        clfs_predict = {k: self.clfs[k].predict_proba(X) for k in self.clfs}
+        predicted = []

+        if self.unique_class.shape[0] > 2:
+            for i in self.unique_class:
+                if i == 0:
+                    # V1 = 1 - Pr(y > V1)
+                    predicted.append(1 - clfs_predict[i][:, 1])
+                elif i in clfs_predict:
+                    # Vi = Pr(y > Vi-1) - Pr(y > Vi)
+                    predicted.append(clfs_predict[i - 1][:, 1] - clfs_predict[i][:, 1])
+                else:
+                    # Vk = Pr(y > Vk-1)
+                    predicted.append(clfs_predict[i - 1][:, 1])
+        return np.vstack(predicted).T

+    def predict_proba(self, X):
+        return np.max(self.predict_proba_all(X), axis=1)

+    def predict(self, X):
+        y_pred = np.argmax(self.predict_proba_all(X), axis=1)
+        y_pred_orig_class_names = []
+        for i in y_pred:
+            y_pred_orig_class_names.append(self.class_dict[i])
+        preds = np.array(y_pred_orig_class_names)

+        # This is for internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts that use theme
+        # labels ("Access", "Environment/ facilities" etc.). We want the criticality for "Couldn't be improved" to
+        # always be "3" (or, more generally, target_class_value). The theme label is passed as a one-hot encoded set of
+        # columns, of which the first is for "Couldn't be improved". The one-hot encoded columns are actually the first
+        # columns of the whole sparse matrix that has the TF-IDFs, sentiment features etc. So we want to find the
+        # records with "Couldn't be improved" (i.e. records with a value of 1) in the first, one-hot encoded, column
+        # and replace the predicted criticality values with "3".
+        if self.theme is not None:
+            if isinstance(X[:, 0], np.ndarray):
+                theme_col = pd.DataFrame(X[:, 0])
+            else:
+                theme_col = pd.DataFrame(X[:, 0].todense())
+            no_improvements_index = theme_col.loc[theme_col.iloc[:, 0] == self.theme_class_value].index
+            preds = pd.DataFrame(preds, columns=['aux'], index=theme_col.index)
+            preds.loc[no_improvements_index] = self.target_class_value
+            preds = np.array(preds.aux)
+        return preds

+    def score(self, X, y):
+        return self.estimator.score(X, y)
+
diff --git a/build/lib/pxtextmining/helpers/oversampling.py b/build/lib/pxtextmining/helpers/oversampling.py
new file mode 100644
index 00000000..5952233d
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/oversampling.py
@@ -0,0 +1,78 @@
+import numpy as np
+import pandas as pd
+from imblearn.over_sampling import RandomOverSampler
+
+
+def random_over_sampler_dictionary(y, threshold=200, up_balancing_counts=300):
+    """
+    Function that detects rare classes.
+
+    Finds classes with counts fewer than a specified threshold.
The function performs a few validity checks:

+    1. The threshold must be smaller than the up-balancing number(s). When it is not,
+       the latter takes the value of the former.

+    2. When the up-balancing number is zero or the threshold is smaller than all class counts, then the function
+       returns the original counts.

+    The validity checks ensure that the function does not stop the script. It is completely the user's responsibility
+    to ensure that the supplied values are meaningful. For example, if each of the rare classes is > 200 in number but
+    the threshold were 100 in one run and 150 in another run of the pipeline, then the result would be the original
+    counts in both cases, i.e. there would be a redundant repetition of runs.
+    Finally, the up-balancing number can be 0, an integer or a list of integers with length = number of rare classes.
+    It is the user's responsibility to ensure that, when it is a list, it has the correct length.

+    :param ndarray y: The dependent variable. Shape (n_samples, ).
+    :param int threshold: The class count below which a class is considered rare.
+    :param array[int] up_balancing_counts: The number by which to up-balance a class.
+    :return: rare_classes (`dict`): Keys are the rare classes and values are the user-specified up-balancing numbers for
+        each class.
+    """

+    unique, frequency = np.unique(y, return_counts=True)
+    rare_classes = pd.DataFrame()
+    rare_classes['counts'], rare_classes.index = frequency, unique

+    if type(up_balancing_counts) is int:
+        up_balancing_counts = [up_balancing_counts]

+    aux = list(filter(lambda x: up_balancing_counts[x] < threshold,
+                      range(len(up_balancing_counts))))
+    if any(x < threshold for x in up_balancing_counts):
+        for i in aux:
+            print("The supplied up-balancing value " +
+                  str(up_balancing_counts[i]) +
+                  " is smaller than the supplied threshold value. "
+                  "Setting up_balancing_counts = threshold for this class")
+            up_balancing_counts[i] = threshold

+    if (len(rare_classes[rare_classes.counts < threshold]) == 0) or (up_balancing_counts == [0]):
+        rare_classes = rare_classes.to_dict()['counts']
+    else:
+        rare_classes = rare_classes[rare_classes.counts < threshold]

+        if len(up_balancing_counts) != 1:
+            rare_classes.counts = up_balancing_counts
+        else:
+            rare_classes.counts = up_balancing_counts * len(rare_classes.counts)
+        rare_classes = rare_classes.to_dict()['counts']
+    return rare_classes
+
+
+def random_over_sampler_data_generator(X, y, threshold=200, up_balancing_counts=300, random_state=0):
+    """
+    Uses random_over_sampler_dictionary() to return the up-balanced dataset.
+    Can be passed to imblearn.FunctionSampler to be then passed to imblearn.pipeline.
+
+    :param ndarray X: The features table. Shape (n_samples, n_features)
+    :param ndarray y: The dependent variable. Shape (n_samples, ).
+    :param int threshold: The class count below which a class is considered rare.
+    :param array[int] up_balancing_counts: The number by which to up-balance a class.
+    :param int random_state: RandomState instance or ``None``, optional (default=``None``).
+    :return: The resampled (up-balanced) ``X`` and ``y``.
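+
+    **Example** (made-up counts): if ``y`` contains 80 records of class "a" and 500 of class "b", then with
+    ``threshold=200`` and ``up_balancing_counts=300`` class "a" is randomly over-sampled to 300 records while
+    class "b" is left unchanged.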
+ """ + + aux = random_over_sampler_dictionary(y, threshold, up_balancing_counts) + return RandomOverSampler( + sampling_strategy=aux, + random_state=random_state).fit_resample(X, y) diff --git a/build/lib/pxtextmining/helpers/passthrough.py b/build/lib/pxtextmining/helpers/passthrough.py new file mode 100644 index 00000000..e5ec6e21 --- /dev/null +++ b/build/lib/pxtextmining/helpers/passthrough.py @@ -0,0 +1,18 @@ +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin + + +class Passthrough(BaseEstimator, TransformerMixin): + """ + Class for passing through features that require no preprocessing. + https://stackoverflow.com/questions/54592115/appending-the-columntransformer-result-to-the-original-data-within-a-pipeline + """ + + def fit(self, X, y=None): + return self + + def transform(self, X): + # Single-column data frames are Pandas series, which Scikit-learn doesn't know how to deal with. Make sure that + # result is always a data frame. + X = pd.DataFrame(X) + return X diff --git a/build/lib/pxtextmining/helpers/scaler_switcher.py b/build/lib/pxtextmining/helpers/scaler_switcher.py new file mode 100644 index 00000000..2e0345da --- /dev/null +++ b/build/lib/pxtextmining/helpers/scaler_switcher.py @@ -0,0 +1,19 @@ +from sklearn.base import BaseEstimator +from sklearn.base import TransformerMixin +from sklearn.preprocessing import MinMaxScaler + + +class ScalerSwitcher(BaseEstimator, TransformerMixin): + """ + Class for choosing between ``Scikit-learn`` + `scalers and preprocessors `_. + """ + + def __init__(self, scaler=MinMaxScaler()): + self.scaler = scaler + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + return self.scaler.fit_transform(X) diff --git a/build/lib/pxtextmining/helpers/sentiment_scores.py b/build/lib/pxtextmining/helpers/sentiment_scores.py new file mode 100644 index 00000000..b36e6f0b --- /dev/null +++ b/build/lib/pxtextmining/helpers/sentiment_scores.py @@ -0,0 +1,38 @@ +import pandas as pd +from textblob import TextBlob +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + + +def sentiment_scores(X): + """ + Calculate sentiment indicators from `TextBlob `_ (polarity and + subjectivity) and `vaderSentiment `_ (positive, negative and neutral + sentiments and compound score). + + :param X: A dictionary, ``pandas.DataFrame``, tuple or list with the text strings. If it is a dictionary + (``pandas.DataFrame``), it must have a single key (column). + :return: A ``pandas.DataFrame`` with the sentiment scores for each text record. Shape [n_samples, 6]. 
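+
+    **Example** (illustrative input; the column names follow from the TextBlob/VADER outputs):
+
+    .. code-block:: python
+
+        scores = sentiment_scores(pd.DataFrame({'predictor': ['Lovely staff, terrible parking.']}))
+        # Columns: text_blob_polarity, text_blob_subjectivity,
+        #          vader_neg, vader_neu, vader_pos, vader_compound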
+    """

+    vader_analyser = SentimentIntensityAnalyzer()
+    X = pd.DataFrame(X).copy().rename(lambda x: 'predictor', axis='columns')
+    text_blob_scores = []
+    vader_scores = []

+    for i in X.index:
+        text = X.loc[i, 'predictor']
+        if text is None or str(text) == 'nan':
+            text = ''
+        text_blob_scores.append(TextBlob(text).sentiment)
+        vader_scores.append(vader_analyser.polarity_scores(text))

+    text_blob_scores_df = pd.DataFrame(text_blob_scores)
+    text_blob_scores_df.columns = 'text_blob_' + text_blob_scores_df.columns
+    text_blob_scores_df.index = X.index

+    vader_scores_df = pd.DataFrame.from_dict(vader_scores)
+    vader_scores_df.columns = 'vader_' + vader_scores_df.columns
+    vader_scores_df.index = X.index

+    all_scores = pd.concat([text_blob_scores_df, vader_scores_df], axis=1, ignore_index=False)
+    return all_scores
diff --git a/build/lib/pxtextmining/helpers/text_length.py b/build/lib/pxtextmining/helpers/text_length.py
new file mode 100644
index 00000000..5d17854e
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/text_length.py
@@ -0,0 +1,27 @@
+import pandas as pd
+
+
+def text_length(X):
+    """
+    Calculate the length of a given text.
+
+    :param X: A dictionary, ``pandas.DataFrame``, tuple or list with the text strings.
+        If it is a dictionary (``pandas.DataFrame``), it must have a single key (column).
+    :return: A ``pandas.DataFrame`` with the length of each text record. Shape [n_samples, 1].
+    """
+
+    X = pd.DataFrame(X).copy().rename(lambda x: 'predictor', axis='columns')
+    text_length = []
+
+    for i in X.index:
+        text = X.loc[i, 'predictor']
+        if text is None or str(text) == 'nan':
+            text_length.append(len(''))
+        else:
+            text_length.append(len(text))
+
+    text_length_df = pd.DataFrame(text_length)
+    text_length_df.columns = ['text_length']
+    text_length_df.index = X.index
+
+    return text_length_df
diff --git a/build/lib/pxtextmining/helpers/text_preprocessor.py b/build/lib/pxtextmining/helpers/text_preprocessor.py
new file mode 100644
index 00000000..137d6b65
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/text_preprocessor.py
@@ -0,0 +1,46 @@
+import re
+import emojis
+
+
+def text_preprocessor(text_string):
+    """
+    Strips punctuation, excess spaces, and the "\\r" and "\\n" escape sequences from the text. Converts emojis into
+    "__text__" (where "text" is the emoji name) and any NAs resulting from text preprocessing into "__notext__".
+
+    :param str text_string: Text string that is passed from
+        `sklearn.feature_extraction.text.TfidfVectorizer `_.
+    :return: text_string (str): Cleaned text string.
+    """
+
+    text_string = str(text_string)
+    text_string = emojis.decode(text_string)
+    pattern = r"\:(.*?)\:"  # Decoded emojis are enclosed inside ":", e.g. ":blush:"
+    pattern_search = re.search(pattern, text_string)
+
+    # We want to tell the model that words inside ":" are decoded emojis.
+    # However, "[^\w]" removes ":". It doesn't remove "_" or "__" though, so we may enclose decoded emojis
+    # inside "__" instead.
+    if pattern_search is not None:
+        # An alternative, should we ever want to drop emojis instead of keeping them:
+        # text_string = re.sub(pattern, "", text_string)
+        text_string = re.sub(pattern, r"__\1__", text_string)  # Each emoji keeps its own name.
+        # Sometimes emojis are consecutive e.g. ❤❤ is encoded into __heart____heart__. Split them.
+        text_string = re.sub("____", "__ __", text_string)
+
+    # Remove non-alphanumeric characters
+    text_string = re.sub(r"[^\w]", " ", text_string)
+
+    # Remove excess whitespace
+    text_string = re.sub(" +", " ", text_string)
+    text_string = text_string.rstrip()  # Removes trailing spaces.
+
+    if str(text_string) in ("nan", "None", "", " "):  # "" catches strings that were nothing but punctuation/whitespace.
+        text_string = "__notext__"
+
+    return text_string
diff --git a/build/lib/pxtextmining/helpers/text_transformer_switcher.py b/build/lib/pxtextmining/helpers/text_transformer_switcher.py
new file mode 100644
index 00000000..fb950a9b
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/text_transformer_switcher.py
@@ -0,0 +1,18 @@
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+class TextTransformerSwitcher(BaseEstimator, TransformerMixin):
+    """
+    Class for choosing between Bag-of-Words and embeddings transformers.
+    """
+
+    def __init__(self, transformer=TfidfVectorizer()):
+        self.transformer = transformer
+
+    def fit(self, X, y=None, **kwargs):
+        self.transformer.fit(X)
+        return self
+
+    def transform(self, X, y=None, **kwargs):
+        return self.transformer.transform(X)
diff --git a/build/lib/pxtextmining/helpers/theme_binarization.py b/build/lib/pxtextmining/helpers/theme_binarization.py
new file mode 100644
index 00000000..5a82ec4c
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/theme_binarization.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class ThemeBinarizer(BaseEstimator, TransformerMixin):
+    """
+    Class for binarizing categories.
+
+    Sets a selected category to `set_class_to` (default 1) and the rest to `set_rest_to` (default 0).
+
+    **NOTE:** This class is for internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts who
+    use the theme ("Access", "Environment/ facilities" etc.) labels. It can otherwise be safely ignored.
+
+    :param str class_col: The name of the column with the classes to binarize.
+    :param target_class: The name (if a string) or value (if numeric) of the class that will be set to `set_class_to`.
+    :param int set_class_to: The value to set the `target_class` to. Defaults to 1.
+    :param int set_rest_to: The value to set all classes but `target_class` to. Defaults to 0.
+    """
+
+    def __init__(self, class_col=None,
+                 target_class=None,
+                 set_class_to=1, set_rest_to=0):
+        self.class_col = class_col
+        self.target_class = target_class
+        self.set_class_to = set_class_to
+        self.set_rest_to = set_rest_to
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        # The class column may temporarily hold a mix of numbers and strings; a pandas object column handles this.
+        X.loc[X[self.class_col] == self.target_class, self.class_col] = self.set_class_to
+        X.loc[X[self.class_col] != self.set_class_to, self.class_col] = self.set_rest_to
+        # X[self.class_col] = X[self.class_col].apply(pd.to_numeric, errors='coerce', downcast='integer').copy()
+        return X
diff --git a/build/lib/pxtextmining/helpers/tokenization.py b/build/lib/pxtextmining/helpers/tokenization.py
new file mode 100644
index 00000000..2d7514d9
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/tokenization.py
@@ -0,0 +1,25 @@
+from nltk import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import spacy
+nlp = spacy.load("en_core_web_sm")  # Don't load this inside the tokenizer; loading it in every CV iteration would tremendously slow down the pipeline.
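+# Illustrative usage (an assumption for clarity, not taken from the package's docs): an instance of the LemmaTokenizer
+# class defined below can be passed to scikit-learn's TfidfVectorizer, e.g.
+# TfidfVectorizer(tokenizer=LemmaTokenizer(tknz='spacy')).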
+
+
+class LemmaTokenizer:
+    """
+    Class for custom lemmatization in `sklearn.feature_extraction.text.TfidfVectorizer
+    `_
+    (see `this `_). Uses `spaCy
+    `_ (``tknz == 'spacy'``) or `NLTK `_ (``tknz == 'wordnet'``).
+
+    """
+
+    def __init__(self, tknz='wordnet'):
+        self.tknz = tknz
+
+    def __call__(self, doc):
+        if self.tknz == 'wordnet':
+            wln = WordNetLemmatizer()
+            return [wln.lemmatize(t) for t in word_tokenize(doc)]
+        if self.tknz == 'spacy':
+            return [t.lemma_ for t in nlp(doc,
+                                          disable=["tagger", "parser", "ner"])]
\ No newline at end of file
diff --git a/build/lib/pxtextmining/helpers/word_vectorization.py b/build/lib/pxtextmining/helpers/word_vectorization.py
new file mode 100644
index 00000000..fb02090e
--- /dev/null
+++ b/build/lib/pxtextmining/helpers/word_vectorization.py
@@ -0,0 +1,24 @@
+from sklearn.base import BaseEstimator, TransformerMixin
+import numpy as np
+import spacy
+from pxtextmining.helpers.text_preprocessor import text_preprocessor
+nlp = spacy.load("en_core_web_lg")  # Don't load this inside the transformer; loading it in every CV iteration would tremendously slow down the pipeline.
+
+
+class EmbeddingsTransformer(TransformerMixin, BaseEstimator):
+    """
+    Class for converting text into `GloVe `_ word vectors with
+    `spaCy `_. Helpful resource `here
+    `_.
+    """
+
+    def __init__(self, model=None):
+        self.model = model
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X_processed = [text_preprocessor(doc) for doc in X]
+        return np.concatenate([nlp(doc,
+                                   disable=["tagger", "parser", "ner"]).vector.reshape(1, -1) for doc in X_processed])
diff --git a/build/lib/pxtextmining/pipelines/__init__.py b/build/lib/pxtextmining/pipelines/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/build/lib/pxtextmining/pipelines/text_classification_pipeline.py b/build/lib/pxtextmining/pipelines/text_classification_pipeline.py
new file mode 100644
index 00000000..bf6d0d89
--- /dev/null
+++ b/build/lib/pxtextmining/pipelines/text_classification_pipeline.py
@@ -0,0 +1,131 @@
+from pxtextmining.factories.factory_data_load_and_split import factory_data_load_and_split
+from pxtextmining.factories.factory_pipeline import factory_pipeline
+from pxtextmining.factories.factory_model_performance import factory_model_performance
+from pxtextmining.factories.factory_write_results import factory_write_results
+
+
+def text_classification_pipeline(filename, target, predictor, test_size=0.33,
+                                 ordinal=False,
+                                 tknz="spacy",
+                                 metric="class_balance_accuracy_score",
+                                 cv=5, n_iter=100, n_jobs=5, verbose=3,
+                                 learners=["SGDClassifier"],
+                                 objects_to_save=[
+                                     "pipeline",
+                                     "tuning results",
+                                     "predictions",
+                                     "accuracy per class",
+                                     "index - training data",
+                                     "index - test data",
+                                     "bar plot"
+                                 ],
+                                 save_objects_to_server=True,
+                                 save_objects_to_disk=False,
+                                 save_pipeline_as="default",
+                                 results_folder_name="results",
+                                 reduce_criticality=True,
+                                 theme=None):
+
+    """
+    Fit and evaluate the pipeline and write the results. Writes between 1 and 7 files, depending on the value of argument
+        ``objects_to_save``:
+
+        - The fitted pipeline (SAV);
+        - All (hyper)parameters tried during fitting and the associated pipeline performance metrics (CSV);
+        - The predictions on the test set (CSV);
+        - Accuracies per class (CSV);
+        - The row indices of the training data (CSV);
+        - The row indices of the test data (CSV);
+        - A bar plot comparing the mean scores (of the user-supplied metric parameter) from the cross-validation on
+          the training set, for the best (hyper)parameter values for each learner (PNG);
+
+    **NOTE:** As described later, arguments `reduce_criticality` and `theme` are for internal use by Nottinghamshire
+    Healthcare NHS Foundation Trust or other trusts who use the theme ("Access", "Environment/ facilities" etc.) and
+    criticality labels. They can otherwise be safely ignored.
+
+    :param str, pandas.DataFrame filename: A ``pandas.DataFrame`` with the data (class and text columns), otherwise
+        the dataset name (CSV), including the data type suffix. If ``None``, the data are read from the database.
+    :param str target: Name of the response variable.
+    :param str predictor: Name of the predictor variable.
+    :param float test_size: Proportion of data that will form the test dataset.
+    :param bool ordinal: Whether to fit an ordinal classification model. The ordinal model is the implementation of
+        `Frank and Hall (2001) `_ that can use any standard classification model.
+    :param str tknz: Tokenizer to use ("spacy" or "wordnet").
+    :param str metric: Scorer to use during pipeline tuning ("accuracy_score", "balanced_accuracy_score",
+        "matthews_corrcoef", "class_balance_accuracy_score").
+    :param int cv: Number of cross-validation folds.
+    :param int n_iter: Number of parameter settings that are sampled
+        (see `sklearn.model_selection.RandomizedSearchCV
+        `_).
+    :param int n_jobs: Number of jobs to run in parallel (see ``sklearn.model_selection.RandomizedSearchCV``).
+    :param int verbose: Controls the verbosity (see ``sklearn.model_selection.RandomizedSearchCV``).
+    :param list[str] learners: A list of ``Scikit-learn`` names of the learners to tune. Must be one or more of
+        "SGDClassifier", "RidgeClassifier", "Perceptron", "PassiveAggressiveClassifier", "BernoulliNB", "ComplementNB",
+        "MultinomialNB", "KNeighborsClassifier", "NearestCentroid", "RandomForestClassifier".
+    :param list[str] objects_to_save: Objects to save following pipeline fitting and assessment. These are:
+
+        - the pipeline (SAV file);
+        - table with all (hyper)parameter values tried out and performance indicators on the cross-validation data;
+        - table with predictions on the test set;
+        - table with accuracy and counts per class;
+        - row indices for the training set;
+        - row indices for the test set;
+        - bar plot with the best-performing models; plotted values are the mean scores from a k-fold CV on the training
+          set, for the best (hyper)parameter values for each learner;
+    :param bool save_objects_to_server: Whether to save the results to the server. **NOTE:** The feature that writes
+        results to the database is for internal use only. It will be removed when a proper API is developed for this
+        function.
+    :param bool save_objects_to_disk: Whether to save the results to disk. See ``results_folder_name``.
+    :param str save_pipeline_as: Save the pipeline as ``save_pipeline_as + '.sav'``.
+    :param str results_folder_name: Name of folder in which to save the results. It will create a new folder or
+        overwrite an existing one that has the same name.
+    :param bool reduce_criticality: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts
+        that hold data on criticality. If `True`, then all records with a criticality of "-5" (respectively, "5") are
+        assigned a criticality of "-4" (respectively, "4"). This is to avoid situations where the pipeline breaks due to
+        a lack of sufficient data for "-5" and/or "5". Defaults to `True`.
+    :param str theme: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts
+        that use theme labels ("Access", "Environment/ facilities" etc.). The column name of the theme variable.
+        Defaults to `None`. If supplied, the theme variable will be used as a predictor (along with the text predictor)
+        in the model that is fitted with criticality as the response variable. The rationale is twofold. First, to
+        help the model improve predictions on criticality when the theme labels are readily available. Second, to force
+        the criticality for "Couldn't be improved" to always be "3" in the training and test data, as well as in the
+        predictions. This is the only criticality value that "Couldn't be improved" can take, so by forcing it to always
+        be "3", we are improving model performance, but are also correcting possible erroneous assignments of values
+        other than "3" that are attributed to human error.
+    :return: A ``tuple`` of length 7:
+
+        - The fitted ``Scikit-learn``/``imblearn`` pipeline;
+        - A ``pandas.DataFrame`` with all (hyper)parameter values and models tried during fitting;
+        - A ``pandas.DataFrame`` with the predictions on the test set;
+        - A ``pandas.DataFrame`` with accuracies per class;
+        - A bar plot comparing the mean scores (of the user-supplied metric parameter) from the cross-validation on
+          the training set, for the best (hyper)parameter values for each learner;
+        - The row indices of the training data;
+        - The row indices of the test data.
+    """
+
+    x_train, x_test, y_train, y_test, index_training_data, index_test_data = \
+        factory_data_load_and_split(filename, target, predictor, test_size, reduce_criticality, theme)
+
+    pipe = factory_pipeline(x_train, y_train, tknz, ordinal, metric, cv, n_iter, n_jobs, verbose, learners, theme)
+
+    pipe, tuning_results, pred, accuracy_per_class, p_compare_models_bar = \
+        factory_model_performance(pipe, x_train, y_train, x_test, y_test, metric)
+
+    pred, index_training_data, index_test_data = factory_write_results(pipe,
+                                                                       tuning_results,
+                                                                       pred,
+                                                                       accuracy_per_class,
+                                                                       p_compare_models_bar,
+                                                                       target,
+                                                                       x_train,
+                                                                       x_test,
+                                                                       index_training_data,
+                                                                       index_test_data,
+                                                                       metric,
+                                                                       objects_to_save,
+                                                                       save_objects_to_server,
+                                                                       save_objects_to_disk,
+                                                                       save_pipeline_as,
+                                                                       results_folder_name)
+
+    return pipe, tuning_results, pred, accuracy_per_class, p_compare_models_bar, index_training_data, index_test_data
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e4204903..5711525c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,7 +22,7 @@
 author = 'Andreas D Soteriades'
 
 # The full version, including alpha/beta/rc tags
-release = '0.3.0'
+release = '0.3.2'
 
 # -- General configuration ---------------------------------------------------
diff --git a/pxtextmining/factories/factory_model_performance.py b/pxtextmining/factories/factory_model_performance.py
index ea9909b2..da319521 100644
--- a/pxtextmining/factories/factory_model_performance.py
+++ b/pxtextmining/factories/factory_model_performance.py
@@ -93,15 +93,16 @@ def factory_model_performance(pipe, x_train, y_train, x_test, y_test,
     # Note that SGDClassifier fits a logistic regression when loss is "log", but a Linear SVM when loss is "hinge".
     # Looking at column "learner" in "tuning results", one cannot tell which of the two models SGD is.
     # Let's make that clear.
-    learners = []
-    for i, j in zip(tuning_results["learner"], tuning_results["param_clf__estimator__loss"]):
-        if j == "log":
-            learners.append("Logistic")
-        elif j == "hinge":
-            learners.append("Linear SVM")
-        else:
-            learners.append(i)
-    tuning_results["learner"] = learners
+    if 'param_clf__estimator__loss' in tuning_results.columns:  # Guard needed: models other than SGDClassifier have no "loss".
+        learners = []
+        for i, j in zip(tuning_results["learner"], tuning_results["param_clf__estimator__loss"]):
+            if j == "log":
+                learners.append("Logistic")
+            elif j == "hinge":
+                learners.append("Linear SVM")
+            else:
+                learners.append(i)
+        tuning_results["learner"] = learners
 
     # Now, let's find the best tunings for each of the fitted models
     aux = tuning_results.filter(regex="mean_test|learner").groupby(["learner"]).max().reset_index()
diff --git a/setup.py b/setup.py
index 2fca3e60..bfdfd745 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setuptools.setup(
     name="pxtextmining",
-    version="0.3.0",
+    version="0.3.2",
     author="Andreas D Soteriades",
     author_email="andreas.soteriades@nottshc.nhs.uk",
     description="Text Classification of Patient Experience feedback",
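
For reference, a minimal usage sketch of the `text_classification_pipeline` entry point added above. This is illustrative only: the file name and column names (`feedback_data.csv`, `label`, `feedback`) are hypothetical placeholders, not files or columns shipped with the package.

```
from pxtextmining.pipelines.text_classification_pipeline import text_classification_pipeline

# 'feedback_data.csv', 'label' and 'feedback' are hypothetical placeholders.
pipe, tuning_results, pred, accuracy_per_class, p_compare_models_bar, \
    index_training_data, index_test_data = text_classification_pipeline(
        filename='feedback_data.csv',           # CSV with a class column and a text column
        target='label', predictor='feedback',
        test_size=0.33,
        tknz='spacy',
        metric='class_balance_accuracy_score',
        cv=5, n_iter=100, n_jobs=5, verbose=3,
        learners=['SGDClassifier'],
        save_objects_to_server=False,           # the database writer is for internal use only
        save_objects_to_disk=True,
        save_pipeline_as='sgd_pipeline',
        results_folder_name='results')
```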