Merge pull request #147 from CDU-data-science-team/138_final_API_updates

138 final api updates
The-Strategy-Unit · Oct 6, 2023 · 535665f
2 parents 7eefe7c + 562312d
commit 535665f
Show file tree

Hide file tree

Showing 31 changed files with 1,489 additions and 3,365 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -2,7 +2,7 @@
 omit = tests\*
        *\__init__.py
        *\params.py
-       api\test_api.py
+       api\test_api_locally.py
        setup.py
        test_rules.py
 

diff --git a/.gitignore b/.gitignore
@@ -2,8 +2,6 @@
 *__pycache__*
 notebooks/*
 my.conf
-test_results_label/*
-test_results_criticality/*
 site/
 dist/
 .vscode/
@@ -12,6 +10,6 @@ test_multilabel/*
 .env
 api/rsconnect-python/*
 .coverage
-api/bert*
 *_labels.xlsx
-current_best_model/final_bert/bert
+current_best_model/final_bert/bert_multilabel
+current_best_model/sentiment/bert_sentiment
diff --git a/Dockerfile b/Dockerfile
@@ -1,12 +1,16 @@
-FROM python:3.10.12-slim-bookworm
+FROM python:3.10.13-slim
 VOLUME /data
 
-COPY docker-requirements.txt requirements.txt
+COPY pxtextmining /pxtextmining
+COPY pyproject.toml /pyproject.toml
+COPY docker_README.md /README.md
 RUN pip install --upgrade pip setuptools \
-  && pip install -r requirements.txt \
+  && pip install . \
   && rm -rf /root/.cache
-
-COPY api/bert_sentiment bert_sentiment
+COPY current_best_model/sentiment/bert_sentiment bert_sentiment
+COPY current_best_model/final_bert/bert_multilabel bert_multilabel
+COPY current_best_model/final_svc/final_svc.sav /final_svc.sav
+COPY current_best_model/final_xgb/final_xgb.sav /final_xgb.sav
 COPY --chmod=755 docker_run.py docker_run.py
 
 LABEL org.opencontainers.image.source=https://github.com/cdu-data-science-team/pxtextmining

diff --git a/api/api.py b/api/api.py
@@ -1,70 +1,15 @@
 import os
 import pickle
-from typing import List, Union
+from typing import List
 
 import pandas as pd
 from fastapi import FastAPI
-from pydantic import BaseModel, validator
-from tensorflow.keras.saving import load_model
 
+from api import schemas
 from pxtextmining.factories.factory_predict_unlabelled_text import (
     predict_multilabel_sklearn,
-    predict_sentiment_bert,
 )
-
-minor_cats_v5 = [
-    "Gratitude/ good experience",
-    "Negative experience",
-    "Not assigned",
-    "Organisation & efficiency",
-    "Funding & use of financial resources",
-    "Non-specific praise for staff",
-    "Non-specific dissatisfaction with staff",
-    "Staff manner & personal attributes",
-    "Number & deployment of staff",
-    "Staff responsiveness",
-    "Staff continuity",
-    "Competence & training",
-    "Unspecified communication",
-    "Staff listening, understanding & involving patients",
-    "Information directly from staff during care",
-    "Information provision & guidance",
-    "Being kept informed, clarity & consistency of information",
-    "Service involvement with family/ carers",
-    "Patient contact with family/ carers",
-    "Contacting services",
-    "Appointment arrangements",
-    "Appointment method",
-    "Timeliness of care",
-    "Pain management",
-    "Diagnosis & triage",
-    "Referals & continuity of care",
-    "Length of stay/ duration of care",
-    "Discharge",
-    "Care plans",
-    "Patient records",
-    "Links with non-NHS organisations",
-    "Cleanliness, tidiness & infection control",
-    "Safety & security",
-    "Provision of medical equipment",
-    "Service location",
-    "Transport to/ from services",
-    "Parking",
-    "Electronic entertainment",
-    "Feeling safe",
-    "Patient appearance & grooming",
-    "Mental Health Act",
-    "Equality, Diversity & Inclusion",
-    "Admission",
-    "Collecting patients feedback",
-    "Labelling not possible",
-    "Environment & Facilities",
-    "Supplying & understanding medication",
-    "Activities & access to fresh air",
-    "Food & drink provision & facilities",
-    "Sensory experience",
-    "Impact of treatment/ care",
-]
+from pxtextmining.params import minor_cats
 
 description = """
 This API is for classifying patient experience qualitative data,
@@ -77,101 +22,17 @@
         "name": "multilabel",
         "description": "Generate multilabel predictions for given text.",
     },
-    {
-        "name": "sentiment",
-        "description": "Generate predicted sentiment for given text.",
-    },
 ]
 
 
-async def load_sentiment_model():
-    model_path = "bert_sentiment"
-    if not os.path.exists(model_path):
-        model_path = os.path.join("api", model_path)
-    loaded_model = load_model(model_path)
-    return loaded_model
-
-
-async def get_sentiment_predictions(
-    text_to_predict, loaded_model, preprocess_text, additional_features
-):
-    predictions = predict_sentiment_bert(
-        text_to_predict,
-        loaded_model,
-        preprocess_text=preprocess_text,
-        additional_features=additional_features,
-    )
-    return predictions
-
-
-class Test(BaseModel):
-    test: str
-
-    class Config:
-        schema_extra = {"example": {"test": "Hello"}}
-
-
-class ItemIn(BaseModel):
-    comment_id: str
-    comment_text: str
-    question_type: str
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "comment_id": "01",
-                "comment_text": "Nurses were friendly. Parking was awful.",
-                "question_type": "nonspecific",
-            }
-        }
-
-    @validator("question_type")
-    def question_type_validation(cls, v):
-        if v not in ["what_good", "could_improve", "nonspecific"]:
-            raise ValueError(
-                "question_type must be one of what_good, could_improve, or nonspecific"
-            )
-        return v
-
-
-class MultilabelOut(BaseModel):
-    comment_id: str
-    comment_text: str
-    labels: list
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "comment_id": "01",
-                "comment_text": "Nurses were friendly. Parking was awful.",
-                "labels": ["Staff manner & personal attributes", "Parking"],
-            }
-        }
-
-
-class SentimentOut(BaseModel):
-    comment_id: str
-    comment_text: str
-    sentiment: Union[int, str]
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "comment_id": "01",
-                "comment_text": "Nurses were friendly. Parking was awful.",
-                "sentiment": 3,
-            }
-        }
-
-
 app = FastAPI(
     title="pxtextmining API",
     description=description,
-    version="0.0.1",
+    version="1.0.0",
     contact={
         "name": "Patient Experience Qualitative Data Categorisation",
         "url": "https://cdu-data-science-team.github.io/PatientExperience-QDC/",
-        "email": "CDUDataScience@nottshc.nhs.uk",
+        "email": "chris.beeley1@nhs.net",
     },
     license_info={
         "name": "MIT License",
@@ -181,99 +42,53 @@ class Config:
 )
 
 
-@app.get("/", response_model=Test, tags=["index"])
+@app.get("/", response_model=schemas.Test, tags=["index"])
 def index():
     return {"test": "Hello"}
 
 
 @app.post(
-    "/predict_multilabel", response_model=List[MultilabelOut], tags=["multilabel"]
+    "/predict_multilabel",
+    response_model=List[schemas.MultilabelOut],
+    tags=["multilabel"],
 )
-async def predict_multilabel(items: List[ItemIn]):
-    """Accepts comment ids, comment text and question type as JSON in a POST request. Makes predictions using trained SVC model.
+async def predict_multilabel(items: List[schemas.ItemIn]):
+    """Accepts comment ids and comment text as JSON in a POST request. Makes predictions using trained SVC model.
 
     Args:
         items (List[ItemIn]): JSON list of dictionaries with the following compulsory keys:
         - `comment_id` (str)
         - `comment_text` (str)
-        - `question_type` (str)
-        The 'question_type' must be one of three values: 'nonspecific', 'what_good', and 'could_improve'.
-        For example, `[{'comment_id': '1', 'comment_text': 'Thank you', 'question_type': 'what_good'},
-        {'comment_id': '2', 'comment_text': 'Food was cold', 'question_type': 'could_improve'}]`
 
     Returns:
-        (dict): Keys are: `comment_id`, `comment_text`, and predicted `labels`.
+        (List[dict]): A list of records, each with keys `comment_id` and predicted `labels`.
     """
 
     # Process received data
     df = pd.DataFrame([i.dict() for i in items], dtype=str)
-    df_newindex = df.set_index("comment_id")
-    if df_newindex.index.duplicated().sum() != 0:
-        raise ValueError("comment_id must all be unique values")
-    df_newindex.index.rename("Comment ID", inplace=True)
-    text_to_predict = df_newindex[["comment_text", "question_type"]]
-    text_to_predict = text_to_predict.rename(
-        columns={"comment_text": "FFT answer", "question_type": "FFT_q_standardised"}
+    df_for_preds = df.copy().rename(
+        columns={"comment_id": "Comment ID", "comment_text": "FFT answer"}
     )
+    df_for_preds = df_for_preds.set_index("Comment ID")
+    if df_for_preds.index.duplicated().sum() != 0:
+        raise ValueError("comment_id must all be unique values")
+    text_to_predict = df_for_preds["FFT answer"]
     # Make predictions
-    model_path = "svc_minorcats_v5.sav"
+    model_path = "final_svc.sav"
     if not os.path.isfile(model_path):
         model_path = os.path.join("api", model_path)
     with open(model_path, "rb") as model:
         loaded_model = pickle.load(model)
     preds_df = predict_multilabel_sklearn(
-        text_to_predict, loaded_model, labels=minor_cats_v5, additional_features=True
+        text_to_predict, loaded_model, labels=minor_cats, additional_features=False
     )
     # Join predicted labels with received data
     preds_df["comment_id"] = preds_df.index.astype(str)
     merged = pd.merge(df, preds_df, how="left", on="comment_id")
     merged["labels"] = merged["labels"].fillna("").apply(list)
     for i in merged["labels"].index:
-        if len(merged["labels"].loc[i]) < 1:
-            merged["labels"].loc[i].append("Labelling not possible")
-    return_dict = merged[["comment_id", "comment_text", "labels"]].to_dict(
-        orient="records"
-    )
-    return return_dict
-
-
-@app.post("/predict_sentiment", response_model=List[SentimentOut], tags=["sentiment"])
-async def predict_sentiment(items: List[ItemIn]):
-    """Accepts comment ids, comment text and question type as JSON in a POST request. Makes predictions using trained Tensorflow Keras model.
-
-    Args:
-        items (List[ItemIn]): JSON list of dictionaries with the following compulsory keys:
-        - `comment_id` (str)
-        - `comment_text` (str)
-        - `question_type` (str)
-        The 'question_type' must be one of three values: 'nonspecific', 'what_good', and 'could_improve'.
-        For example, `[{'comment_id': '1', 'comment_text': 'Thank you', 'question_type': 'what_good'},
-        {'comment_id': '2', 'comment_text': 'Food was cold', 'question_type': 'could_improve'}]`
-
-    Returns:
-        (dict): Keys are: `comment_id`, `comment_text`, and predicted `labels`.
-    """
-
-    # Process received data
-    loaded_model = await load_sentiment_model()
-    df = pd.DataFrame([i.dict() for i in items], dtype=str)
-    df_newindex = df.set_index("comment_id")
-    if df_newindex.index.duplicated().sum() != 0:
-        raise ValueError("comment_id must all be unique values")
-    df_newindex.index.rename("Comment ID", inplace=True)
-    text_to_predict = df_newindex[["comment_text", "question_type"]]
-    text_to_predict = text_to_predict.rename(
-        columns={"comment_text": "FFT answer", "question_type": "FFT_q_standardised"}
-    )
-    # Make predictions
-    preds_df = await get_sentiment_predictions(
-        text_to_predict, loaded_model, preprocess_text=False, additional_features=True
-    )
-    # Join predicted labels with received data
-    preds_df["comment_id"] = preds_df.index.astype(str)
-    merged = pd.merge(df, preds_df, how="left", on="comment_id")
-    merged["sentiment"] = merged["sentiment"].fillna("Labelling not possible")
-    return_dict = merged[["comment_id", "comment_text", "sentiment"]].to_dict(
-        orient="records"
-    )
+        label_list = merged.loc[i, "labels"]
+        if len(label_list) < 1:
+            merged.loc[i, "labels"].append("Labelling not possible")
+    return_dict = merged[["comment_id", "labels"]].to_dict(orient="records")
     return return_dict
diff --git a/...nt_best_model/final_svc_noq/final_svc.sav → api/final_svc.sav b/...nt_best_model/final_svc_noq/final_svc.sav → api/final_svc.sav