127 adjust threshold #136

Merged
merged 21 commits on Sep 8, 2023
Commits
42b2a3e
wrote get_thresholds function
yiwen-h Sep 1, 2023
b0c9788
added test and docstring for get_thresholds
yiwen-h Sep 4, 2023
8b57aee
moved np divide-by-zero error suppression to specific codeblock
yiwen-h Sep 4, 2023
4874366
Merge branch 'development' of github.com:CDU-data-science-team/pxtext…
yiwen-h Sep 4, 2023
1949238
took out already_encoded as positional argument
yiwen-h Sep 4, 2023
dc57f0f
added test and function for turn_probs_into_binary taking custom thre…
yiwen-h Sep 5, 2023
09ec65a
refactored code, taking out model_type
yiwen-h Sep 5, 2023
2d8a0f4
added custom_threshold_dict to predict_multilabel_bert and get_multil…
yiwen-h Sep 5, 2023
14325c4
added threshold_dict to sklearn_pipeline
yiwen-h Sep 5, 2023
b36f1b2
added custom_threshold to the svc pipeline. removed two_layer pipelin…
yiwen-h Sep 5, 2023
b8dad02
added test and threshold dict to additional_analysis function
yiwen-h Sep 6, 2023
39e5b2a
added custom_threshold_dict to write_model_analysis
yiwen-h Sep 6, 2023
e88eaef
write_results now takes preds_df rather than making predictions itself
yiwen-h Sep 6, 2023
daa21b3
added check for length to write_model_preds
yiwen-h Sep 6, 2023
1520ec7
moved prediction step out of get_multilabel_metrics function
yiwen-h Sep 6, 2023
2dca18c
fixed bug with creating threshold_dict
yiwen-h Sep 6, 2023
c775c03
fixed bug with too many arguments for get_multilabel_metrics
yiwen-h Sep 7, 2023
632ae12
trying different values in gridsearch
yiwen-h Sep 7, 2023
0c9bb7f
moved label_fix to before custom threshold in bert_predict
yiwen-h Sep 7, 2023
4cd664a
more resilient turn_probs_into_binary by using dict.get
yiwen-h Sep 7, 2023
5c1c217
fixed bug with threshold_dict and added new coverage
yiwen-h Sep 7, 2023
600 changes: 304 additions & 296 deletions coverage.xml

Large diffs are not rendered by default.

65 changes: 16 additions & 49 deletions pxtextmining/factories/factory_model_performance.py
@@ -8,8 +8,6 @@

from pxtextmining.factories.factory_predict_unlabelled_text import (
predict_multiclass_bert,
predict_multilabel_bert,
predict_multilabel_sklearn,
)


@@ -64,7 +62,6 @@ def get_multiclass_metrics(
x_test,
model,
additional_features=additional_features,
already_encoded=False,
)
elif is_classifier(model) is True:
metrics_string += f"\n{model}\n"
@@ -83,33 +80,26 @@


def get_multilabel_metrics(
x_test,
preds_df,
y_test,
labels,
random_state,
model_type,
model,
training_time=None,
additional_features=False,
already_encoded=False,
enhance_with_rules=False,
):
"""Creates a string detailing various performance metrics for a multilabel model, which can then be written to
a text file.

Args:
x_test (pd.DataFrame): DataFrame containing test dataset features
preds_df (pd.DataFrame): DataFrame containing model predictions
y_test (pd.DataFrame): DataFrame containing test dataset true target values
labels (list): List containing the target labels
random_state (int): Seed used to control the shuffling of the data, to enable reproducible results.
model_type (str): Type of model used. Options are 'bert', or 'sklearn'. Defaults to None.
model (tf.keras or sklearn model): Trained estimator.
training_time (str, optional): Amount of time taken for model to train. Defaults to None.
additional_features (bool, optional): Whether or not additional features (e.g. question type) have been included in training the model. Defaults to False.
already_encoded (bool, optional): Whether or not, if a `bert` model was used, x_test has already been encoded. Defaults to False.

Raises:
ValueError: Only model_type 'bert', 'tf' or 'sklearn' are allowed.
ValueError: Only sklearn and tensorflow keras models allowed.

Returns:
(str): String containing the model architecture/hyperparameters, random state used for the train test split, and performance metrics including: exact accuracy, hamming loss, macro jaccard score, and classification report.
@@ -120,40 +110,22 @@ def get_multilabel_metrics(
f"\n Random state seed for train test split is: {random_state} \n\n"
)
model_metrics = {}
# TF Keras models output probabilities with model.predict, whilst sklearn models output binary outcomes
# Get them both to output the same (binary outcomes) and take max prob as label if no labels predicted at all
if model_type == "bert":
y_pred_df = predict_multilabel_bert(
x_test,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
enhance_with_rules=enhance_with_rules,
already_encoded=already_encoded,
)
elif model_type == "sklearn":
y_pred_df = predict_multilabel_sklearn(
x_test,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
enhance_with_probs=True,
enhance_with_rules=enhance_with_rules,
)
if isinstance(model, Model) is True:
stringlist = []
model.summary(print_fn=lambda x: stringlist.append(x))
model_summary = "\n".join(stringlist)
elif is_classifier(model) is True:
model_summary = model
else:
raise ValueError(
'Please select valid model_type. Options are "bert" or "sklearn"'
)
y_pred = np.array(y_pred_df[labels]).astype("int64")
raise ValueError("invalid model type")
y_pred = np.array(preds_df[labels]).astype("int64")
# Calculate various metrics
model_metrics["exact_accuracy"] = metrics.accuracy_score(y_test, y_pred)
model_metrics["hamming_loss"] = metrics.hamming_loss(y_test, y_pred)
model_metrics["macro_jaccard_score"] = metrics.jaccard_score(
y_test, y_pred, average="macro"
)
y_probs = y_pred_df.filter(like="Probability", axis=1)
y_probs = preds_df.filter(like="Probability", axis=1)
model_metrics["macro_roc_auc"] = metrics.roc_auc_score(
y_test, y_probs, multi_class="ovr"
)
@@ -164,13 +136,7 @@
y_probs,
)
# Model summary
if model_type in ("bert", "tf"):
stringlist = []
model.summary(print_fn=lambda x: stringlist.append(x))
model_summary = "\n".join(stringlist)
metrics_string += f"\n{model_summary}\n"
else:
metrics_string += f"\n{model}\n"
metrics_string += f"\n{model_summary}\n"
metrics_string += f"\n\nTraining time: {training_time}\n"
for k, v in model_metrics.items():
metrics_string += f"\n{k}: {v}"
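Per commit 1520ec7, the prediction step now happens outside this function: callers build preds_df first (with predict_multilabel_sklearn or predict_multilabel_bert) and pass it in. A hypothetical call pattern, with illustrative argument values and assuming x_test, y_test, labels and a fitted model are already in scope as in the pipeline scripts:

from pxtextmining.factories.factory_model_performance import get_multilabel_metrics
from pxtextmining.factories.factory_predict_unlabelled_text import predict_multilabel_sklearn

# Predictions are made up front...
preds_df = predict_multilabel_sklearn(
    x_test,
    model,
    labels=labels,
    additional_features=False,
    label_fix=True,
    enhance_with_probs=True,
)
# ...and get_multilabel_metrics only evaluates them.
metrics_string = get_multilabel_metrics(
    preds_df,
    y_test,
    labels=labels,
    random_state=42,
    model=model,
    training_time="0:05:21",
)
print(metrics_string)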
@@ -257,7 +223,7 @@ def get_y_score(probs):
return score


def additional_analysis(preds_df, y_true, labels):
def additional_analysis(preds_df, y_true, labels, custom_threshold_dict=None):
"""For given predictions, returns dataframe containing: macro one-vs-one ROC AUC score, number of True Positives, True Negatives, False Positives, and False Negatives.

Args:
@@ -268,7 +234,6 @@ def additional_analysis(preds_df, y_true, labels):
Returns:
(pd.DataFrame): dataframe containing: macro one-vs-one ROC AUC score, number of True Positives, True Negatives, False Positives, and False Negatives.
"""
# include threshold?? (later)
y_score = np.array(preds_df.filter(like="Probability", axis=1))
cm = metrics.multilabel_confusion_matrix(y_true, np.array(preds_df[labels]))
cm_dict = {}
@@ -288,4 +253,6 @@
df = pd.DataFrame.from_dict(cm_dict, orient="index")
average_precision = pd.Series(average_precision)
df["average_precision_score"] = average_precision
if custom_threshold_dict is not None:
df["custom_threshold"] = custom_threshold_dict
return df
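A hypothetical call showing the new custom_threshold_dict argument; the threshold values are illustrative, and preds_df, y_test and labels are assumed to be the same objects passed to get_multilabel_metrics above.

# Start every label at the default 0.5, then adjust individual labels as needed.
custom_threshold_dict = {label: 0.5 for label in labels}
custom_threshold_dict[labels[0]] = 0.4  # lower the threshold for one label
analysis_df = additional_analysis(
    preds_df, y_test, labels, custom_threshold_dict=custom_threshold_dict
)
# analysis_df now carries a "custom_threshold" column alongside the confusion-matrix
# counts and average precision scores.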
15 changes: 7 additions & 8 deletions pxtextmining/factories/factory_pipeline.py
@@ -353,13 +353,12 @@ def create_sklearn_pipeline(model_type, tokenizer=None, additional_features=True
n_jobs=-1,
),
)
params["multioutputclassifier__estimator__C"] = [10, 15, 20]
params["multioutputclassifier__estimator__gamma"] = np.logspace(-9, 3, 13)
# params["multioutputclassifier__estimator__kernel"] = [
# "linear",
# "rbf",
# "sigmoid",
# ]
params["multioutputclassifier__estimator__C"] = [1, 5, 10, 15, 20]
params["multioutputclassifier__estimator__kernel"] = [
"linear",
"rbf",
"sigmoid",
]
if model_type == "rfc":
pipe = make_pipeline(preproc, RandomForestClassifier(n_jobs=-1))
params["randomforestclassifier__max_depth"] = stats.randint(5, 50)
@@ -422,7 +421,7 @@ def search_sklearn_pipelines(
search = RandomizedSearchCV(
pipe,
params,
scoring="f1_macro",
scoring="average_precision",
n_iter=100,
cv=4,
n_jobs=-2,
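The factory_pipeline changes swap the search metric from f1_macro to average precision and widen the SVC grid. Below is a self-contained sketch of that kind of search; the data, grid values and cv settings are illustrative, and the real pipeline builds its preprocessor and estimator inside create_sklearn_pipeline and simply passes scoring="average_precision". The explicit scorer callable here is only a convenience so the example handles multilabel probability output regardless of scikit-learn version.

import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def macro_average_precision(estimator, X, y):
    # MultiOutputClassifier.predict_proba returns one (n_samples, 2) array per label;
    # stack the positive-class columns into an (n_samples, n_labels) score matrix.
    probs = estimator.predict_proba(X)
    if isinstance(probs, list):
        probs = np.transpose([p[:, 1] for p in probs])
    return average_precision_score(y, probs, average="macro")

# Toy multilabel dataset standing in for the patient experience text features.
X, y = make_multilabel_classification(n_samples=300, n_classes=5, random_state=0)

pipe = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(SVC(probability=True, class_weight="balanced"), n_jobs=-1),
)
params = {
    "multioutputclassifier__estimator__C": [1, 5, 10, 15, 20],
    "multioutputclassifier__estimator__kernel": ["linear", "rbf", "sigmoid"],
}
search = RandomizedSearchCV(
    pipe,
    params,
    scoring=macro_average_precision,
    n_iter=10,  # the diff uses n_iter=100
    cv=4,
    n_jobs=-2,
    random_state=0,
)
search.fit(X, y)
print(search.best_params_, search.best_score_)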