Skip to content

Commit

Permalink
Merge pull request #77 from nestauk/76-remove-jq-threshold-from-confi…
Browse files Browse the repository at this point in the history
…g-and-find_job_quality

Remove JQ threshold
  • Loading branch information
RFOxbury authored Sep 26, 2024
2 parents 5e87e69 + d3c0da5 commit c3cff38
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 16 deletions.
1 change: 0 additions & 1 deletion dap_job_quality/config/jobbert_config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
model: jjzha/jobbert-base-cased
max_length: 128
seed: 42
jq_threshold: 0.3
cs_threshold:
CAREER: 0.6
FLEX_HOURS: 0.65
Expand Down
16 changes: 1 addition & 15 deletions dap_job_quality/pipeline/find_job_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,10 @@ class JobQuality(object):

def __init__(
self,
JQ_THRESHOLD: float = jobbert_config["jq_threshold"],
CS_THRESHOLD: float = jobbert_config["cs_threshold"],
batch_size: int = jobbert_config["train_config"]["per_device_train_batch_size"],
MAX_LENGTH: int = jobbert_config["max_length"],
):
self.JQ_THRESHOLD = JQ_THRESHOLD
self.CS_THRESHOLD = CS_THRESHOLD
self.batch_size = batch_size
self.MAX_LENGTH = MAX_LENGTH
Expand All @@ -213,17 +211,10 @@ def load(self):
nltk.download("punkt")
nltk.download("stopwords")

# The sentence embedding model to use for encoding the sentences for the sentence classifier.
# not used
# self.sentence_classifier_bert_transformer = SentenceTransformer(
# "jjzha/jobbert-base-cased", device=self.device
# )

# The sentence embedding model to use for encoding the n-grams and target phrases.
self.ngram_match_bert_transformer = SentenceTransformer(
"all-MiniLM-L6-v2", device=self.device
)
# self.ngram_match_bert_transformer.max_seq_length = self.MAX_LENGTH #I don't think this part needs truncation

sentence_classifier_model, sentence_classifier_tokenizer = get_jobbert_jq(
max_length=self.MAX_LENGTH
Expand Down Expand Up @@ -327,12 +318,7 @@ def extract_job_quality_sentences(
jobs_df["job_quality_label"] = labels
jobs_df["job_quality_prob"] = pred_scores

job_quality_df = jobs_df[
(
(jobs_df["job_quality_label"] == "LABEL_1")
& (jobs_df["job_quality_prob"] >= self.JQ_THRESHOLD)
)
]
job_quality_df = jobs_df[(jobs_df["job_quality_label"] == "LABEL_1")]

return job_quality_df.reset_index(drop=True)

Expand Down
58 changes: 58 additions & 0 deletions dap_job_quality/tests/test_extract_job_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pytest
import pandas as pd
from dap_job_quality.pipeline.find_job_quality import JobQuality


@pytest.fixture
def sample_job_adverts():
    """
    Fixture returning a small DataFrame of synthetic job adverts.

    Two rows with an ``id`` and a free-text ``description`` column, each
    description mentioning benefits for the extractor to pick up.
    """
    records = [
        (
            123,
            "This is a job adverts. It has many benefits such as a pension scheme and a cycle to work scheme.",
        ),
        (
            234,
            "This is a job adverts for a job at a bank. There are free childcare vouchers. We also offer a yearly bonus and generous salary.",
        ),
    ]
    return pd.DataFrame(records, columns=["id", "description"])


@pytest.fixture
def job_quality_model():
    """
    Fixture that builds a JobQuality instance and loads its models.

    Note: ``load()`` downloads NLTK data and instantiates transformer
    models, so this fixture is comparatively slow.
    """
    model = JobQuality()
    model.load()
    return model


def test_extract_job_quality(job_quality_model, sample_job_adverts):
    """
    Check that extract_job_quality returns a filtered DataFrame and a
    mapping from advert id to the expected target phrases.
    """
    filtered_df, id_to_phrases = job_quality_model.extract_job_quality(
        sample_job_adverts,
        id_col="id",
        text_col="description",
    )

    # Expected phrases per advert id.
    # NOTE(review): comparing lists is order-sensitive — assumes the
    # extractor's output ordering is deterministic.
    expected = {
        123: ["Cycle to work", "benefits", "pension", "pension scheme"],
        234: ["childcare vouchers", "compensation", "performance bonus"],
    }

    assert isinstance(filtered_df, pd.DataFrame), "Output should be a DataFrame"
    assert isinstance(
        id_to_phrases, dict
    ), "job_id_to_target_phrase should be a dictionary"
    assert (
        id_to_phrases == expected
    ), "Extracted target phrases do not match expected output"

0 comments on commit c3cff38

Please sign in to comment.