Skip to content

Commit

Permalink
Merge pull request #77 from nestauk/76-remove-jq-threshold-from-confi…
Browse files Browse the repository at this point in the history
…g-and-find_job_quality

Remove JQ threshold
  • Loading branch information
RFOxbury authored Sep 26, 2024
2 parents 5e87e69 + d3c0da5 commit c3cff38
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 16 deletions.
1 change: 0 additions & 1 deletion dap_job_quality/config/jobbert_config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
model: jjzha/jobbert-base-cased
max_length: 128
seed: 42
jq_threshold: 0.3
cs_threshold:
CAREER: 0.6
FLEX_HOURS: 0.65
Expand Down
16 changes: 1 addition & 15 deletions dap_job_quality/pipeline/find_job_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,10 @@ class JobQuality(object):

def __init__(
self,
JQ_THRESHOLD: float = jobbert_config["jq_threshold"],
CS_THRESHOLD: float = jobbert_config["cs_threshold"],
batch_size: int = jobbert_config["train_config"]["per_device_train_batch_size"],
MAX_LENGTH: int = jobbert_config["max_length"],
):
self.JQ_THRESHOLD = JQ_THRESHOLD
self.CS_THRESHOLD = CS_THRESHOLD
self.batch_size = batch_size
self.MAX_LENGTH = MAX_LENGTH
Expand All @@ -213,17 +211,10 @@ def load(self):
nltk.download("punkt")
nltk.download("stopwords")

# The sentence embedding model to use for encoding the sentences for the sentence classifier.
# not used
# self.sentence_classifier_bert_transformer = SentenceTransformer(
# "jjzha/jobbert-base-cased", device=self.device
# )

# The sentence embedding model to use for encoding the n-grams and target phrases.
self.ngram_match_bert_transformer = SentenceTransformer(
"all-MiniLM-L6-v2", device=self.device
)
# self.ngram_match_bert_transformer.max_seq_length = self.MAX_LENGTH #I don't think this part needs truncation

sentence_classifier_model, sentence_classifier_tokenizer = get_jobbert_jq(
max_length=self.MAX_LENGTH
Expand Down Expand Up @@ -327,12 +318,7 @@ def extract_job_quality_sentences(
jobs_df["job_quality_label"] = labels
jobs_df["job_quality_prob"] = pred_scores

job_quality_df = jobs_df[
(
(jobs_df["job_quality_label"] == "LABEL_1")
& (jobs_df["job_quality_prob"] >= self.JQ_THRESHOLD)
)
]
job_quality_df = jobs_df[(jobs_df["job_quality_label"] == "LABEL_1")]

return job_quality_df.reset_index(drop=True)

Expand Down
58 changes: 58 additions & 0 deletions dap_job_quality/tests/test_extract_job_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pytest
import pandas as pd
from dap_job_quality.pipeline.find_job_quality import JobQuality


@pytest.fixture
def sample_job_adverts():
    """
    Fixture returning a small DataFrame of synthetic job adverts.

    Two rows with an ``id`` and a free-text ``description`` column, each
    description mentioning benefits for the extractor to pick up.
    """
    records = [
        (
            123,
            "This is a job adverts. It has many benefits such as a pension scheme and a cycle to work scheme.",
        ),
        (
            234,
            "This is a job adverts for a job at a bank. There are free childcare vouchers. We also offer a yearly bonus and generous salary.",
        ),
    ]
    return pd.DataFrame(records, columns=["id", "description"])


@pytest.fixture
def job_quality_model():
    """
    Fixture that builds a JobQuality instance and loads its models.

    Note: ``load()`` downloads NLTK data and instantiates transformer
    models, so this fixture is comparatively slow.
    """
    model = JobQuality()
    model.load()
    return model


def test_extract_job_quality(job_quality_model, sample_job_adverts):
    """
    Check that extract_job_quality returns a filtered DataFrame and a
    mapping from advert id to the expected target phrases.
    """
    filtered_df, id_to_phrases = job_quality_model.extract_job_quality(
        sample_job_adverts,
        id_col="id",
        text_col="description",
    )

    # Expected phrases per advert id.
    # NOTE(review): comparing lists is order-sensitive — assumes the
    # extractor's output ordering is deterministic.
    expected = {
        123: ["Cycle to work", "benefits", "pension", "pension scheme"],
        234: ["childcare vouchers", "compensation", "performance bonus"],
    }

    assert isinstance(filtered_df, pd.DataFrame), "Output should be a DataFrame"
    assert isinstance(
        id_to_phrases, dict
    ), "job_id_to_target_phrase should be a dictionary"
    assert (
        id_to_phrases == expected
    ), "Extracted target phrases do not match expected output"

0 comments on commit c3cff38

Please sign in to comment.