Merge branch 'release-0.3.2' into main

The-Strategy-Unit · Sep 28, 2021 · 248094b · 248094b
2 parents 005279d + e91dd17
commit 248094b
Show file tree

Hide file tree

Showing 28 changed files with 1,355 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -39,8 +39,8 @@ environment `text_venv`:
 ### Install from PyPI
 
 1. Install `pxtextmining` and its PyPI dependencies:
-   - `pip3 install pxtextmining==0.3.0`  (Linux & MacOS);
-   - `pip install pxtextmining==0.3.0` (Windows);
+   - `pip3 install pxtextmining==0.3.2`  (Linux & MacOS);
+   - `pip install pxtextmining==0.3.2` (Windows);
 1. We also need to install a couple of 
    [`spaCy`](https://github.com/explosion/spacy-models) models. 
 
@@ -65,15 +65,15 @@ All steps in one go:
    ```
    python3 -m venv text_venv
    source text_venv/bin/activate
-   pip3 install pxtextmining==0.3.0
+   pip3 install pxtextmining==0.3.2
    pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
    pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz
    ```
 1. **Windows**
    ```
    python -m venv text_venv
    text_venv\Scripts\activate
-   pip install pxtextmining==0.3.0
+   pip install pxtextmining==0.3.2
    pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
    pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz
    ```

diff --git a/build/lib/pxtextmining/__init__.py b/build/lib/pxtextmining/__init__.py
diff --git a/build/lib/pxtextmining/factories/__init__.py b/build/lib/pxtextmining/factories/__init__.py
diff --git a/build/lib/pxtextmining/factories/factory_data_load_and_split.py b/build/lib/pxtextmining/factories/factory_data_load_and_split.py
@@ -4,13 +4,18 @@
 from sklearn.model_selection import train_test_split
 
 
-def factory_data_load_and_split(filename, target, predictor, test_size=0.33):
+def factory_data_load_and_split(filename, target, predictor, test_size=0.33, reduce_criticality=False, theme=None):
     """
     Function loads the dataset, renames the response and predictor as "target" and "predictor" respectively,
     and splits the dataset into training and test sets.
 
-    :param str filename: Dataset name (CSV), including the data type suffix. The dataset should be placed in folder
-        ``pxtextmining/datasets``. If ``filename`` is ``None``, the data are read from the database.
+    **NOTE:** As described later, arguments `reduce_criticality` and `theme` are for internal use by Nottinghamshire
+    Healthcare NHS Foundation Trust or other trusts who use the theme ("Access", "Environment/ facilities" etc.) and
+    criticality labels. They can otherwise be safely ignored.
+
+    :param str, pandas.DataFrame filename: A ``pandas.DataFrame`` with the data (class and text columns), otherwise the
+        dataset name (CSV), including full path to the data folder (if not in the project's working directory), and the
+        data type suffix (".csv"). If ``filename`` is ``None``, the data are read from the database.
         **NOTE:** The feature that reads data from the database is for internal use only. Experienced users who would
         like to pull their data from their own databases can, of course, achieve that by slightly modifying the
         relevant lines in the script. A "my.conf" file will need to be placed in the root, with five lines, as follows
@@ -24,45 +29,74 @@ def factory_data_load_and_split(filename, target, predictor, test_size=0.33):
     :param str target: Name of the response variable.
     :param str predictor: Name of the predictor variable.
     :param float test_size: Proportion of data that will form the test dataset.
+    :param bool reduce_criticality: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts
+        that hold data on criticality. If `True`, then all records with a criticality of "-5" (respectively, "5") are
+        assigned a criticality of "-4" (respectively, "4"). This is to avoid situations where the pipeline breaks due to
+        a lack of sufficient data for "-5" and/or "5". Defaults to `False`.
+    :param str theme: For internal use by Nottinghamshire Healthcare NHS Foundation Trust or other trusts
+        that use theme labels ("Access", "Environment/ facilities" etc.). The column name of the theme variable.
+        Defaults to `None`. If supplied, the theme variable will be used as a predictor (along with the text predictor)
+        in the model that is fitted with criticality as the response variable. The rationale is two-fold. First, to
+        help the model improve predictions on criticality when the theme labels are readily available. Second, to force
+        the criticality for "Couldn't be improved" to always be "3" in the training and test data, as well as in the
+        predictions. This is the only criticality value that "Couldn't be improved" can take, so by forcing it to always
+        be "3", we are improving model performance, but are also correcting possible erroneous assignments of values
+        other than "3" that are attributed to human error.
     :return: A tuple of length 4: predictor-train, predictor-test, target-train and target-test datasets.
     """
 
     print('Loading dataset...')
 
     # Choose to read CSV from folder or table directly from database
     if filename is not None:
-        data_path = path.join('datasets', filename)
-        text_data = pd.read_csv(data_path, encoding='utf-8')
+        if isinstance(filename, str):
+            text_data = pd.read_csv(filename, encoding='utf-8')
+        else:
+            text_data = filename
     else:
         db = mysql.connector.connect(option_files="my.conf", use_pure=True)
-        with db.cursor() as cursor:
-            cursor.execute(
-                "SELECT  " + target + ", " + predictor + " FROM text_data"
-            )
-            text_data = cursor.fetchall()
-            text_data = pd.DataFrame(text_data)
-            text_data.columns = cursor.column_names
+        if theme is None:
+            with db.cursor() as cursor:
+                cursor.execute(
+                    "SELECT  " + target + ", " + predictor + " FROM text_data"
+                )
+                text_data = cursor.fetchall()
+                text_data = pd.DataFrame(text_data)
+                text_data.columns = cursor.column_names
+        else:
+            with db.cursor() as cursor:
+                cursor.execute(
+                    "SELECT  " + target + ", " + predictor + ", " + theme + " FROM text_data"
+                )
+                text_data = cursor.fetchall()
+                text_data = pd.DataFrame(text_data)
+                text_data.columns = cursor.column_names
 
-    text_data = text_data.rename(columns={target: "target", predictor: "predictor"})
-    text_data = text_data.loc[text_data.target.notnull()].copy()
-    text_data = text_data.loc[text_data.target.notna()].copy()
-    text_data['predictor'] = text_data.predictor.fillna('__none__')
+    text_data = text_data.rename(columns={target: 'target', predictor: 'predictor'})
+    if theme is not None:
+        text_data = text_data.rename(columns={theme: 'theme'})
+    text_data = text_data.dropna(subset=['target', 'predictor']).copy()
+    text_data['predictor'] = text_data.predictor.fillna('__notext__')
 
-    # This is specific to NHS patient feedback data labelled with "criticality" classes. Should remove when a
-    # proper API is developed for this function.
-    if target == 'criticality':
+    # This is specific to NHS patient feedback data labelled with "criticality" classes
+    if reduce_criticality:
         text_data = text_data.query("target in ('-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '5')")
         text_data.loc[text_data.target == '-5', 'target'] = '-4'
         text_data.loc[text_data.target == '5', 'target'] = '4'
+        if theme is not None:
+            text_data.loc[text_data['theme'] == "Couldn't be improved", 'target'] = '3'
 
     print('Preparing training and test sets...')
-    x = pd.DataFrame(text_data["predictor"])
-    y = text_data["target"].to_numpy()
-    x_train, x_test, y_train, y_test = train_test_split(x, y,
-                                                        test_size=test_size,
-                                                        stratify=y,
-                                                        shuffle=True,
-                                                        # random_state=42 # https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn
-                                                        )
+    x = text_data[['predictor']] # Needs to be an array of a data frame- can't be a pandas Series
+    if theme is not None:
+        x['theme'] = text_data['theme'].copy()
+    y = text_data['target'].to_numpy()
+    x_train, x_test, y_train, y_test, index_training_data, index_test_data = \
+        train_test_split(x, y, pd.DataFrame(x).index,
+                         test_size=test_size,
+                         stratify=y,
+                         shuffle=True
+                         )
+    print("Done")
 
-    return x_train, x_test, y_train, y_test
+    return x_train, x_test, y_train, y_test, index_training_data, index_test_data
diff --git a/build/lib/pxtextmining/factories/factory_model_performance.py b/build/lib/pxtextmining/factories/factory_model_performance.py
@@ -0,0 +1,126 @@
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.metrics import balanced_accuracy_score, confusion_matrix, matthews_corrcoef
+from pxtextmining.helpers.metrics import class_balance_accuracy_score
+
+
+def factory_model_performance(pipe, x_train, y_train, x_test, y_test,
+                              metric):
+
+    """
+    Evaluate the performance of a fitted pipeline.
+
+    **NOTE:** ``pipe`` must expose ``best_params_``, ``best_estimator_``, ``best_score_`` and ``cv_results_``.
+    ``pipe.best_estimator_`` is modified in place: its last step is replaced by the best estimator found during
+    tuning, and it is refitted on the training set for the evaluation and, just before returning, on the training
+    and test sets combined.
+
+    :param pipe: Fitted `sklearn.pipeline.Pipeline
+        <https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`_/
+        `imblearn.pipeline.Pipeline
+        <https://imbalanced-learn.org/stable/references/generated/imblearn.pipeline.Pipeline.html#imblearn.pipeline.Pipeline>`_
+    :param x_train: Training data (predictor).
+    :param y_train: Training data (response).
+    :param x_test: Test data (predictor).
+    :param y_test: Test data (response).
+    :param str metric: Scorer that was used in pipeline tuning ("accuracy_score", "balanced_accuracy_score",
+        "matthews_corrcoef", "class_balance_accuracy_score"). It is converted to title case with " score" removed
+        (e.g. "Balanced Accuracy") and must then match a "mean_test_<name>" column of ``pipe.cv_results_``.
+    :return: A ``tuple`` of length 5:
+
+            - The fitted ``Scikit-learn``/``imblearn`` pipeline;
+            - A ``pandas.DataFrame`` with all (hyper)parameter values and models tried during fitting;
+            - The predictions on the test set, as returned by the pipeline's ``predict`` method;
+            - A ``pandas.DataFrame`` with accuracies per class;
+            - A bar plot comparing the mean scores (of the user-supplied metric parameter) from the cross-validation on
+              the training set, for the best (hyper)parameter values for each learner;
+    """
+
+    refit = metric.replace("_", " ").replace(" score", "").title()
+
+    # Swap the pipeline's last step for the best estimator found during tuning, then refit on the training set.
+    best_estimator = pipe.best_params_["clf__estimator"]
+    estimator_position = len(pipe.best_estimator_) - 1
+    pipe.best_estimator_.steps.pop(estimator_position)
+    pipe.best_estimator_.steps.append(("clf", best_estimator))
+    pipe.best_estimator_.fit(x_train, y_train)
+
+    print("The best estimator is %s" % (pipe.best_estimator_.named_steps["clf"]))
+    print("The best parameters are:")
+    for param, value in pipe.best_params_.items():
+        print("{}: {}".format(param, value))
+    print("The best score from the cross-validation for \n the supplied scorer (" +
+          refit + ") is %s"
+          % (round(pipe.best_score_, 2)))
+
+    pred = pipe.best_estimator_.predict(x_test)
+    cm = confusion_matrix(y_test, pred)
+
+    print("Model accuracy on the test set is %s percent"
+          % (int(pipe.best_estimator_.score(x_test, y_test) * 100)))
+    print("Balanced accuracy on the test set is %s percent"
+          % (int(balanced_accuracy_score(y_test, pred) * 100)))
+    print("Class balance accuracy on the test set is %s percent"
+          % (int(class_balance_accuracy_score(y_test, pred) * 100)))
+    print("Matthews correlation on the test set is %s "
+          % (round(matthews_corrcoef(y_test, pred), 2)))
+
+    accuracy_per_class = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
+    accuracy_per_class = pd.DataFrame(accuracy_per_class.diagonal())
+    accuracy_per_class.columns = ["accuracy"]
+    unique, frequency = np.unique(y_test, return_counts=True)
+    accuracy_per_class["class"], accuracy_per_class["counts"] = unique, frequency
+    accuracy_per_class = accuracy_per_class[["class", "counts", "accuracy"]]
+
+    tuning_results = pd.DataFrame(pipe.cv_results_)
+    tuning_results["learner"] = [i.__class__.__name__ for i in tuning_results["param_clf__estimator"]]
+    y_axis = "mean_test_" + refit
+    tuning_results = tuning_results.sort_values(y_axis, ascending=False)
+    tuning_results.columns = tuning_results.columns.str.replace('alltrans__process__', '') # When using ordinal with theme='label', names are too long.
+
+    # Convert non-numeric to strings. This is to ensure that writing to MySQL won't throw an error.
+    # (There MUST be a better way of fixing this!)
+    for i in tuning_results.columns:
+        if not isinstance(tuning_results[i][0], (float, int, str)):
+            tuning_results[i] = tuning_results[i].apply(str)
+
+    print("Plotting performance of the best of each estimator...")
+
+    # Find the best tunings for each model. #
+    # Note that SGDClassifier fits a logistic regression when loss is "log", but a Linear SVM when loss is "hinge".
+    # Looking at column "learner" in "tuning results", one cannot tell which of the two models SGD is.
+    # Let's make that clear.
+    # NOTE: the column name needs the double underscore before "loss", exactly as in the column read in the loop.
+    if 'param_clf__estimator__loss' in tuning_results.columns: # Need statement as models other than SGD don't have loss.
+        learners = []
+        for i, j in zip(tuning_results["learner"], tuning_results["param_clf__estimator__loss"]):
+            if j == "log":
+                learners.append("Logistic")
+            elif j == "hinge":
+                learners.append("Linear SVM")
+            else:
+                learners.append(i)
+        tuning_results["learner"] = learners
+
+    # Now, let's find the best tunings for each of the fitted models
+    aux = tuning_results.filter(regex="mean_test|learner").groupby(["learner"]).max().reset_index()
+    aux = aux.sort_values([y_axis], ascending=False)
+    aux = aux.melt("learner")
+    aux["variable"] = aux["variable"].str.replace("mean_test_", "")
+    aux["learner"] = aux["learner"].str.replace("Classifier", "")
+
+    p_compare_models_bar = sns.barplot(x="learner", y="value", hue="variable",
+                                       data=aux)
+    p_compare_models_bar.figure.set_size_inches(15, 13)
+    p_compare_models_bar.set_xticklabels(p_compare_models_bar.get_xticklabels(),
+                                         rotation=90)
+    plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
+    p_compare_models_bar.set(xlabel=None, ylabel=None,
+                             title="Learner performance ordered by " + refit)
+
+    print("Fitting optimal pipeline on whole dataset...")
+    pipe.best_estimator_.fit(pd.concat([x_train, x_test]), np.concatenate([y_train, y_test]))
+
+    return pipe, tuning_results, pred, accuracy_per_class, p_compare_models_bar