Skip to content

Commit

Permalink
Refactor NL Server bootstrap code for readability (#3725)
Browse files Browse the repository at this point in the history
This is to make follow-on custom-DC-specific changes cleaner.

There is no intentional logic change other than using a single cache
entry (instead of one per index) in the test environment.

A key simplification is to more directly rely on the embeddings name to
extract the base / tuned-model (see `config._parse()`) instead of the
current code which uses `models.yaml` and also relies on the naming
partially.
  • Loading branch information
pradh authored Oct 26, 2023
1 parent fef25f3 commit 3c59c1b
Show file tree
Hide file tree
Showing 9 changed files with 220 additions and 151 deletions.
57 changes: 24 additions & 33 deletions nl_server/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,62 +12,53 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys

from flask import Flask
import torch
import yaml

from nl_server import config
import nl_server.loader as loader
import nl_server.routes as routes

_MODEL_YAML = 'models.yaml'
_EMBEDDINGS_YAML = 'embeddings.yaml'


def create_app():
  """Creates and configures the NL server Flask app.

  Reads models.yaml and embeddings.yaml (path depends on environment, see
  `get_env_path`), stores the embeddings version map in the app config, and
  delegates model/embeddings download and loading to the loader module.

  Raises:
    AssertionError: if either yaml file is empty or missing content.
  """
  app = Flask(__name__)
  app.register_blueprint(routes.bp)

  # Workaround for a sentence-transformers deadlock on macOS:
  # https://github.com/UKPLab/sentence-transformers/issues/1318
  if sys.version_info >= (3, 8) and sys.platform == "darwin":
    torch.set_num_threads(1)

  # Load the models config: model name -> model folder.
  with open(get_env_path(_MODEL_YAML)) as f:
    models_map = yaml.full_load(f)
  assert models_map, 'No models.yaml found!'

  # Load the embeddings config: index name -> embeddings file name.
  with open(get_env_path(_EMBEDDINGS_YAML)) as f:
    embeddings_map = yaml.full_load(f)
  assert embeddings_map, 'No embeddings.yaml found!'
  app.config[config.NL_EMBEDDINGS_VERSION_KEY] = embeddings_map

  # Downloads models/embeddings (if needed) and attaches them to app config.
  loader.load_server_state(app, embeddings_map, models_map)

  return app
#
# On prod the yaml files are in /datacommons/nl/, whereas
# in test-like environments it is the checked in path
# (deploy/nl/).
#
def get_env_path(file_name: str) -> str:
  """Returns the environment-appropriate path for a NL config yaml file.

  In test-like environments (FLASK_ENV in local/test/integration_test/
  webdriver) the checked-in `deploy/nl/` copy is used; otherwise the
  prod mount point `/datacommons/nl/` is used.
  """
  flask_env = os.environ.get('FLASK_ENV')
  if flask_env in ['local', 'test', 'integration_test', 'webdriver']:
    return os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        f'deploy/nl/{file_name}')

  return f'/datacommons/nl/{file_name}'
106 changes: 106 additions & 0 deletions nl_server/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Dict, List

from nl_server import embeddings
from nl_server import gcs

# Index constants. Passed in `url=`
CUSTOM_DC_INDEX = 'custom'
DEFAULT_INDEX_TYPE = 'medium_ft'

# The default base model we use.
EMBEDDINGS_BASE_MODEL_NAME = 'all-MiniLM-L6-v2'

# App Config constants.
NL_MODEL_KEY = 'NL_MODEL'
NL_EMBEDDINGS_KEY = 'NL_EMBEDDINGS'
NL_EMBEDDINGS_VERSION_KEY = 'NL_EMBEDDINGS_VERSION_MAP'


# Defines one embeddings index config.
@dataclass
class EmbeddingsIndex:
# Name provided in the yaml file, and set in `idx=` URL param.
name: str

# File name provided in the yaml file.
embeddings_file_name: str
# Local path.
embeddings_local_path: str = ""

# Fine-tuned model name ("" if embeddings uses base model).
tuned_model: str = ""
# Fine-tuned model local path.
tuned_model_local_path: str = ""


#
# Validates the config input, downloads all the files and returns a list of Indexes to load.
#
def load(embeddings_map: Dict[str, str],
models_map: Dict[str, str]) -> List[EmbeddingsIndex]:
# Create Index objects.
indexes = _parse(embeddings_map)

# This is just a sanity, we can soon deprecate models.yaml
tuned_models_provided = list(set(models_map.values()))
tuned_models_configured = list(
set([i.tuned_model for i in indexes if i.tuned_model]))
assert sorted(tuned_models_configured) == sorted(tuned_models_provided), \
f'{tuned_models_configured} vs. {tuned_models_provided}'

#
# Download all the models.
#
model2path = {d: gcs.download_model_folder(d) for d in tuned_models_configured}
for idx in indexes:
if idx.tuned_model:
idx.tuned_model_local_path = model2path[idx.tuned_model]

#
# Download all the embeddings.
#
for idx in indexes:
idx.embeddings_local_path = gcs.download_embeddings(
idx.embeddings_file_name)

return indexes


def _parse(embeddings_map: Dict[str, str]) -> List[EmbeddingsIndex]:
indexes: List[EmbeddingsIndex] = []

for key, value in embeddings_map.items():
idx = EmbeddingsIndex(name=key, embeddings_file_name=value)

parts = value.split('.')
assert parts[
-1] == 'csv', f'Embeddings file {value} name does not end with .csv!'

if len(parts) == 4:
# Expect: <embeddings_version>.<fine-tuned-model-version>.<base-model>.csv
# Example: embeddings_sdg_2023_09_12_16_38_04.ft_final_v20230717230459.all-MiniLM-L6-v2.csv
assert parts[
2] == EMBEDDINGS_BASE_MODEL_NAME, f'Unexpected base model {parts[3]}'
idx.tuned_model = f'{parts[1]}.{parts[2]}'
else:
# Expect: <embeddings_version>.csv
# Example: embeddings_small_2023_05_24_23_17_03.csv
assert len(parts) == 2, f'Unexpected file name format {value}'
indexes.append(idx)

return indexes
6 changes: 2 additions & 4 deletions nl_server/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Managing the embeddings."""
from dataclasses import dataclass
import logging
import os
from typing import Dict, List, Union
Expand All @@ -22,13 +21,12 @@
from sentence_transformers.util import semantic_search
import torch

from nl_server import config
from nl_server import query_util
from shared.lib import constants
from shared.lib import detected_variables as vars
from shared.lib import utils

MODEL_NAME = 'all-MiniLM-L6-v2'

# A value higher than the highest score.
_HIGHEST_SCORE = 1.0
_INIT_SCORE = (_HIGHEST_SCORE + 0.1)
Expand All @@ -52,7 +50,7 @@ def __init__(self,
assert os.path.exists(existing_model_path)
self.model = SentenceTransformer(existing_model_path)
else:
self.model = SentenceTransformer(MODEL_NAME)
self.model = SentenceTransformer(config.EMBEDDINGS_BASE_MODEL_NAME)
self.dataset_embeddings: torch.Tensor = None
self.dcids: List[str] = []
self.sentences: List[str] = []
Expand Down
37 changes: 37 additions & 0 deletions nl_server/embeddings_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from nl_server.config import DEFAULT_INDEX_TYPE
from nl_server.config import EmbeddingsIndex
from nl_server.embeddings import Embeddings


class Store:
  """A simple wrapper around multiple embeddings indexes, keyed by name.

  TODO: Handle custom DC specific logic here.
  """

  def __init__(self, indexes: List[EmbeddingsIndex]):
    # Build one Embeddings instance per configured index.
    self.embeddings_map = {
        index.name: Embeddings(index.embeddings_local_path,
                               index.tuned_model_local_path)
        for index in indexes
    }

  # Note: The caller takes care of exceptions.
  def get(self, index_type: str = DEFAULT_INDEX_TYPE) -> Embeddings:
    """Returns the Embeddings for the given index name (KeyError if absent)."""
    return self.embeddings_map[index_type]
Loading

0 comments on commit 3c59c1b

Please sign in to comment.