Skip to content

Commit

Permalink
[feat] Enable control over chunking of URL lists (#168)
Browse files Browse the repository at this point in the history
Adds an optional `url_chunk_size` parameter to the `upload_files` utility,
which controls how many URLs from a provided URL list are submitted in a
single upload task. It defaults to 1, which preserves the previous
behaviour of submitting one upload task per URL.

Signed-off-by: Christoph Auer <CAU@zurich.ibm.com>
Co-authored-by: Christoph Auer <CAU@zurich.ibm.com>
  • Loading branch information
cau-git and cau-git authored Feb 29, 2024
1 parent bf7c4b5 commit bc5c684
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions deepsearch/cps/data_indices/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def upload_files(
local_file: Optional[Union[str, Path]] = None,
s3_coordinates: Optional[S3Coordinates] = None,
conv_settings: Optional[ConversionSettings] = None,
url_chunk_size: int = 1,
):
"""
Orchestrate document conversion and upload to an index in a project
Expand All @@ -43,7 +44,9 @@ def upload_files(
else:
urls = url

return process_url_input(api=api, coords=coords, urls=urls)
return process_url_input(
api=api, coords=coords, urls=urls, url_chunk_size=url_chunk_size
)
elif url is None and local_file is not None and s3_coordinates is None:
return process_local_file(
api=api,
Expand All @@ -64,27 +67,32 @@ def process_url_input(
api: CpsApi,
coords: ElasticProjectDataCollectionSource,
urls: List[str],
url_chunk_size: int,
progress_bar: bool = False,
):
"""
Individual urls are uploaded for conversion and storage in data index.
"""

chunk_list = lambda lst, n: [lst[i : i + n] for i in range(0, len(lst), n)]

root_dir = create_root_dir()

# container list for task_ids
task_ids = []
# submit urls
count_urls = len(urls)
url_chunks = chunk_list(urls, url_chunk_size)
count_urls = len(url_chunks)
with tqdm(
total=count_urls,
desc=f"{'Submitting input:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
for url in urls:
file_url_array = [url]

for url_chunk in url_chunks:
file_url_array = url_chunk
payload = {"file_url": file_url_array}
task_id = api.data_indices.upload_file(coords=coords, body=payload)
task_ids.append(task_id)
Expand Down

0 comments on commit bc5c684

Please sign in to comment.