diff --git a/src/fondant/components/caption_images/Dockerfile b/src/fondant/components/caption_images/Dockerfile
index 26213627..9565f1c3 100644
--- a/src/fondant/components/caption_images/Dockerfile
+++ b/src/fondant/components/caption_images/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 # Install Fondant
 # This is split from other requirements to leverage caching
 ARG FONDANT_VERSION=main
-RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+RUN pip3 install fondant[aws,azure,gcp,gpu]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
 
 # Set the working directory to the component folder
 WORKDIR /component
@@ -25,4 +25,4 @@ RUN python -m pytest tests
 FROM base
 WORKDIR /component/src
 
-ENTRYPOINT ["fondant", "execute", "main"]
+ENTRYPOINT ["fondant", "execute", "main"]
\ No newline at end of file
diff --git a/src/fondant/components/caption_images/src/main.py b/src/fondant/components/caption_images/src/main.py
index 234aa28a..6f6a4bc2 100644
--- a/src/fondant/components/caption_images/src/main.py
+++ b/src/fondant/components/caption_images/src/main.py
@@ -4,9 +4,12 @@
 import os
 import typing as t
 
+import dask
 import numpy as np
 import pandas as pd
 import torch
+from dask.distributed import Client
+from dask_cuda import LocalCUDACluster
 from fondant.component import PandasTransformComponent
 from PIL import Image
 from transformers import BatchEncoding, BlipForConditionalGeneration, BlipProcessor
@@ -90,6 +93,16 @@ def __init__(
         self.batch_size = batch_size
         self.max_new_tokens = max_new_tokens
 
+    def setup(self) -> Client:
+        """Set up a LocalCUDACluster if a GPU is available."""
+        dask.config.set({"dataframe.convert-string": False})
+        dask.config.set({"distributed.worker.daemon": False})
+
+        if self.device == "cuda":
+            cluster = LocalCUDACluster()
+            return Client(cluster)
+        return super().setup()
+
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         images = dataframe["image"]
diff --git a/src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py b/src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py
index 44612d67..7cd18297 100644
--- a/src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py
+++ b/src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py
@@ -2,12 +2,13 @@
 import os
 import typing as t
 
+import dask
 import dask.dataframe as dd
 import faiss
 import fsspec
 import pandas as pd
 import torch
-from dask.distributed import Client, get_worker
+from dask.distributed import Client, LocalCluster, get_worker
 from dask_cuda import LocalCUDACluster
 from fondant.component import PandasTransformComponent
 from transformers import AutoTokenizer, CLIPTextModelWithProjection
@@ -47,11 +48,27 @@ def __init__(  # PLR0913
 
     def setup(self) -> Client:
         """Setup LocalCudaCluster if gpu is available."""
+        dask.config.set({"dataframe.convert-string": False})
+        dask.config.set({"distributed.worker.daemon": False})
+
         if self.device == "cuda":
             cluster = LocalCUDACluster()
             return Client(cluster)
-        return super().setup()
+
+        total_memory = (os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")) / (
+            1024**3
+        )
+        # We need at least 8 GiB of RAM per worker for the datacomp small faiss index.
+        # We should consider calculating the required memory based on the actual faiss
+        # index size.
+        cores_to_utilise = max(1, int(total_memory // 8))
+        cluster = LocalCluster(
+            processes=True,
+            n_workers=cores_to_utilise,
+            threads_per_worker=1,
+            memory_limit="8 GiB",
+        )
+        return Client(cluster)
 
     def embed_prompt(self, prompt: str):
         """Embed prompt using CLIP model."""
diff --git a/src/fondant/components/segment_images/Dockerfile b/src/fondant/components/segment_images/Dockerfile
index c93dca10..6d0b5391 100644
--- a/src/fondant/components/segment_images/Dockerfile
+++ b/src/fondant/components/segment_images/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 # Install Fondant
 # This is split from other requirements to leverage caching
 ARG FONDANT_VERSION=main
-RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+RUN pip3 install fondant[aws,azure,gcp,gpu]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
 
 # Set the working directory to the component folder
 WORKDIR /component/src
diff --git a/src/fondant/components/segment_images/src/main.py b/src/fondant/components/segment_images/src/main.py
index a77b866a..40760fa0 100644
--- a/src/fondant/components/segment_images/src/main.py
+++ b/src/fondant/components/segment_images/src/main.py
@@ -4,9 +4,12 @@
 import os
 import typing as t
 
+import dask
 import numpy as np
 import pandas as pd
 import torch
+from dask.distributed import Client
+from dask_cuda import LocalCUDACluster
 from fondant.component import PandasTransformComponent
 from palette import palette
 from PIL import Image
@@ -127,6 +130,16 @@ def __init__(
         self.batch_size = batch_size
 
+    def setup(self) -> Client:
+        """Set up a LocalCUDACluster if a GPU is available."""
+        dask.config.set({"dataframe.convert-string": False})
+        dask.config.set({"distributed.worker.daemon": False})
+
+        if self.device == "cuda":
+            cluster = LocalCUDACluster()
+            return Client(cluster)
+        return super().setup()
+
     def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         images = dataframe["image"]