diff --git a/custom_dc/load_data.sh b/custom_dc/load_data.sh index 5d978880e4..134944f7f3 100755 --- a/custom_dc/load_data.sh +++ b/custom_dc/load_data.sh @@ -81,10 +81,6 @@ function setup_python { "https://download.pytorch.org/whl/cpu" echo_log "Installing Python requirements from $embeddings_req" run_cmd pip3 install -r "$embeddings_req" - # TODO: remove install once embeddings doesn't need nl_server/requirements.txt - nlserver_req="$WEBSITE_DIR/nl_server/requirements.txt" - echo_log "Installing Python requirements from $nlserver_req" - run_cmd pip3 install -r "$nlserver_req" fi fi } @@ -223,9 +219,45 @@ function generate_embeddings { echo_log "Building embeddings for sentences in $NL_DIR" local cwd="$PWD" cd "$WEBSITE_DIR" - # TODO: Enable with new build_embeddings.py - # run_cmd python -m tools.nl.embeddings.build_custom_dc_embeddings \ - # --input_file_path="$NL_DIR/sentences.csv" --output_dir="$NL_DIR" + + NL_EMBEDDINGS_DIR="$NL_DIR/embeddings" + EMBEDDINGS_PATH="$NL_EMBEDDINGS_DIR/embeddings.csv" + CUSTOM_EMBEDDING_INDEX="user_all_minilm_mem" + CUSTOM_MODEL="ft-final-v20230717230459-all-MiniLM-L6-v2" + CUSTOM_MODEL_PATH="gs://datcom-nl-models/ft_final_v20230717230459.all-MiniLM-L6-v2" + CUSTOM_CATALOG_DICT=$(cat <> $LOG 2>&1 + status=$? 
+ set +x + local duration=$(( $(date +%s) - $start_ts)) + [[ "$status" == "0" ]] || echo_fatal "Failed to build embeddings" + echo_log "Completed building embeddings with status:$status in $duration secs" cd "$cwd" } diff --git a/tools/nl/embeddings/requirements.txt b/tools/nl/embeddings/requirements.txt index 0f1447f0a7..d0c0854bf6 100644 --- a/tools/nl/embeddings/requirements.txt +++ b/tools/nl/embeddings/requirements.txt @@ -6,4 +6,9 @@ google-cloud-storage==2.15.0 lancedb==0.6.8 parameterized==0.8.1 sentence-transformers==2.2.2 -torchvision==0.17.2 \ No newline at end of file +torchvision==0.17.2 +# Downloading the named-entity recognition (NER) library spacy and the small EN model +# using the guidelines here: https://spacy.io/usage/models#production +# TODO: try using the large model +-f https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl +en_core_web_sm==3.7.1 \ No newline at end of file