Add resource requirements to the retrieve from faiss component (#905)

When using the retrieve from faiss component with the LocalCluster, it is a bit tricky to find the right machine configuration. This change updates the initialisation of the LocalCluster: each worker is assigned 8 GiB of RAM, and the number of single-threaded workers is limited accordingly. --------- Co-authored-by: Robbe Sneyders <robbe.sneyders@gmail.com>
ml6team · Mar 13, 2024 · 2743dbe · 2743dbe
1 parent acb6d0a
commit 2743dbe
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 5 deletions.
diff --git a/src/fondant/components/caption_images/Dockerfile b/src/fondant/components/caption_images/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 # Install Fondant
 # This is split from other requirements to leverage caching
 ARG FONDANT_VERSION=main
-RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+RUN pip3 install fondant[aws,azure,gcp,gpu]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
 
 # Set the working directory to the component folder
 WORKDIR /component
@@ -25,4 +25,4 @@ RUN python -m pytest tests
 
 FROM base
 WORKDIR /component/src
-ENTRYPOINT ["fondant", "execute", "main"]
+ENTRYPOINT ["fondant", "execute", "main"]
diff --git a/src/fondant/components/caption_images/src/main.py b/src/fondant/components/caption_images/src/main.py
@@ -4,9 +4,12 @@
 import os
 import typing as t
 
+import dask
 import numpy as np
 import pandas as pd
 import torch
+from dask.distributed import Client
+from dask_cuda import LocalCUDACluster
 from fondant.component import PandasTransformComponent
 from PIL import Image
 from transformers import BatchEncoding, BlipForConditionalGeneration, BlipProcessor
@@ -90,6 +93,16 @@ def __init__(
         self.batch_size = batch_size
         self.max_new_tokens = max_new_tokens
 
+    def setup(self) -> Client:
+        """Setup LocalCudaCluster if gpu is available."""
+        dask.config.set({"dataframe.convert-string": False})
+        dask.config.set({"distributed.worker.daemon": False})
+
+        if self.device == "cuda":
+            cluster = LocalCUDACluster()
+            return Client(cluster)
+        return super().setup()
+
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         images = dataframe["image"]
 

diff --git a/src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py b/src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py
@@ -2,12 +2,13 @@
 import os
 import typing as t
 
+import dask
 import dask.dataframe as dd
 import faiss
 import fsspec
 import pandas as pd
 import torch
-from dask.distributed import Client, get_worker
+from dask.distributed import Client, LocalCluster, get_worker
 from dask_cuda import LocalCUDACluster
 from fondant.component import PandasTransformComponent
 from transformers import AutoTokenizer, CLIPTextModelWithProjection
@@ -47,11 +48,27 @@ def __init__(  # PLR0913
 
     def setup(self) -> Client:
         """Setup LocalCudaCluster if gpu is available."""
+        dask.config.set({"dataframe.convert-string": False})
+        dask.config.set({"distributed.worker.daemon": False})
+
         if self.device == "cuda":
             cluster = LocalCUDACluster()
             return Client(cluster)
 
-        return super().setup()
+        total_memory = (os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")) / (
+            1024**3
+        )
+        # We need at least 8Gb RAM for the datacomp small faiss index
+        # We should consider calculating the memory required for the index based on the faiss
+        # index size
+        cores_to_utilise = total_memory // 8
+        cluster = LocalCluster(
+            processes=True,
+            n_workers=cores_to_utilise,
+            threads_per_worker=1,
+            memory_limit="8 GiB",
+        )
+        return Client(cluster)
 
     def embed_prompt(self, prompt: str):
         """Embed prompt using CLIP model."""

diff --git a/src/fondant/components/segment_images/Dockerfile b/src/fondant/components/segment_images/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 # Install Fondant
 # This is split from other requirements to leverage caching
 ARG FONDANT_VERSION=main
-RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+RUN pip3 install fondant[aws,azure,gcp,gpu]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
 
 # Set the working directory to the component folder
 WORKDIR /component/src

diff --git a/src/fondant/components/segment_images/src/main.py b/src/fondant/components/segment_images/src/main.py
@@ -4,9 +4,12 @@
 import os
 import typing as t
 
+import dask
 import numpy as np
 import pandas as pd
 import torch
+from dask.distributed import Client
+from dask_cuda import LocalCUDACluster
 from fondant.component import PandasTransformComponent
 from palette import palette
 from PIL import Image
@@ -127,6 +130,16 @@ def __init__(
 
         self.batch_size = batch_size
 
+    def setup(self) -> Client:
+        """Setup LocalCudaCluster if gpu is available."""
+        dask.config.set({"dataframe.convert-string": False})
+        dask.config.set({"distributed.worker.daemon": False})
+
+        if self.device == "cuda":
+            cluster = LocalCUDACluster()
+            return Client(cluster)
+        return super().setup()
+
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         images = dataframe["image"]