Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add resource requirements to the retrieve from faiss component #905

Merged
20 changes: 18 additions & 2 deletions src/fondant/components/retrieve_from_faiss_by_prompt/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import os
import typing as t

import dask
import dask.dataframe as dd
import faiss
import fsspec
import pandas as pd
import torch
from dask.distributed import Client, get_worker
from dask.distributed import Client, LocalCluster, get_worker
from dask_cuda import LocalCUDACluster
from fondant.component import PandasTransformComponent
from transformers import AutoTokenizer, CLIPTextModelWithProjection
Expand Down Expand Up @@ -51,7 +52,22 @@ def setup(self) -> Client:
cluster = LocalCUDACluster()
return Client(cluster)

return super().setup()
dask.config.set({"dataframe.convert-string": False})
dask.config.set({"distributed.worker.daemon": False})
mrchtr marked this conversation as resolved.
Show resolved Hide resolved
total_memory = (os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")) / (
1024**3
)
mrchtr marked this conversation as resolved.
Show resolved Hide resolved
# Each worker needs at least 8 GiB of RAM to hold the datacomp small faiss index.
# TODO: consider calculating the memory required per worker from the actual faiss
# index size instead of hard-coding it.
cores_to_utilise = int(total_memory / 8)
mrchtr marked this conversation as resolved.
Show resolved Hide resolved
cluster = LocalCluster(
processes=True,
n_workers=cores_to_utilise,
threads_per_worker=1,
memory_limit="8 GiB",
)
return Client(cluster)

def embed_prompt(self, prompt: str):
"""Embed prompt using CLIP model."""
Expand Down
Loading