From 8f83ccd1fc4958c281efe239bed9cef5fdbe3399 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Thu, 19 Oct 2023 18:31:28 +0200
Subject: [PATCH 1/3] Move uploader code to the main api

Signed-off-by: Michele Dolfi
---
 deepsearch/cps/client/api.py                 |  3 +
 deepsearch/cps/client/components/uploader.py | 67 ++++++++++++++++++++
 deepsearch/cps/data_indices/utils.py         |  6 +-
 deepsearch/documents/core/convert.py         | 30 +--------
 4 files changed, 76 insertions(+), 30 deletions(-)
 create mode 100644 deepsearch/cps/client/components/uploader.py

diff --git a/deepsearch/cps/client/api.py b/deepsearch/cps/client/api.py
index 69862c82..46ceaa4e 100644
--- a/deepsearch/cps/client/api.py
+++ b/deepsearch/cps/client/api.py
@@ -24,6 +24,7 @@
     CpsApiTasks,
     DSApiDocuments,
 )
+from deepsearch.cps.client.components.uploader import DSApiUploader
 
 
 class CpsApiClient:
@@ -101,6 +102,7 @@ class CpsApi:
     elastic: CpsApiElastic
     data_indices: CpsApiDataIndices
     documents: DSApiDocuments
+    uploader: DSApiUploader
 
     def __init__(self, client: CpsApiClient) -> None:
         self.client = client
@@ -115,6 +117,7 @@ def _create_members(self):
         self.elastic = CpsApiElastic(self)
         self.data_indices = CpsApiDataIndices(self)
         self.documents = DSApiDocuments(self)
+        self.uploader = DSApiUploader(self)
 
     def refresh_token(self, admin: bool = False):
         """Refresh access token
diff --git a/deepsearch/cps/client/components/uploader.py b/deepsearch/cps/client/components/uploader.py
new file mode 100644
index 00000000..c2fa10c1
--- /dev/null
+++ b/deepsearch/cps/client/components/uploader.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Union
+
+import requests
+from pydantic import BaseModel
+
+from deepsearch.cps.apis import public as sw_client
+from deepsearch.cps.apis.public.models.temporary_upload_file_result import (
+    TemporaryUploadFileResult,
+)
+from deepsearch.cps.client.components.projects import Project
+
+if TYPE_CHECKING:
+    from deepsearch.cps.client import CpsApi
+
+
+class UploadedFile(BaseModel):
+    download_url: str
+    internal_url: str
+
+
+class DSApiUploader:
+    def __init__(self, api: CpsApi) -> None:
+        self.api = api
+        self.upload_api = sw_client.UploadsApi(self.api.client.swagger_client)
+
+    def upload_file(
+        self,
+        project: Union[Project, str],
+        source_path: Union[Path, str],
+        tls_verify: bool = True,
+    ) -> UploadedFile:
+        """
+        Upload a file to the scratch storage of Deep Search.
+        The returned object provides the `download_url` and `internal_url` which can be
+        use for retrieving the file or submitting to other Deep Search APIs, respectively.
+        """
+
+        proj_key = project.key if isinstance(project, Project) else project
+        source_path = Path(source_path)
+
+        # Register file
+        source_basename = source_path.name
+        scratch_specs: TemporaryUploadFileResult = (
+            self.upload_api.create_project_scratch_file(
+                proj_key=proj_key, filename=source_basename
+            )
+        )
+
+        # Upload file
+        upload_specs = scratch_specs.upload
+        with source_path.open("rb") as f:
+            files = {"file": (source_basename, f)}
+            request_upload = requests.post(
+                url=upload_specs.url,
+                data=upload_specs.fields,
+                files=files,
+                verify=tls_verify,
+            )
+            request_upload.raise_for_status()
+
+        return UploadedFile(
+            download_url=scratch_specs.download.url,
+            internal_url=scratch_specs.download_private.url,
+        )
diff --git a/deepsearch/cps/data_indices/utils.py b/deepsearch/cps/data_indices/utils.py
index c4ba43e6..43389fe6 100644
--- a/deepsearch/cps/data_indices/utils.py
+++ b/deepsearch/cps/data_indices/utils.py
@@ -147,10 +147,10 @@ def process_local_file(
     # loop over all files
     for single_zip in files_zip:
         # upload file
-        private_download_url = convert.upload_single_file(
-            api=api, cps_proj_key=coords.proj_key, source_path=Path(single_zip)
+        uploaded_file = api.uploader.upload_file(
+            project=coords.proj_key, source_path=Path(single_zip)
         )
-        file_url_array = [private_download_url]
+        file_url_array = [uploaded_file.internal_url]
         payload: Dict[str, Any] = {
             "file_url": file_url_array,
         }
diff --git a/deepsearch/documents/core/convert.py b/deepsearch/documents/core/convert.py
index b238111a..d2b08afa 100644
--- a/deepsearch/documents/core/convert.py
+++ b/deepsearch/documents/core/convert.py
@@ -148,8 +148,8 @@ def send_files_for_conversion(
     # loop over all files
     for single_zip in files_zip:
         # upload file
-        private_download_url = upload_single_file(
-            api=api, cps_proj_key=cps_proj_key, source_path=Path(single_zip)
+        uploaded_file = api.uploader.upload_file(
+            project=cps_proj_key, source_path=Path(single_zip)
         )
         # submit url for conversion
         task_id = submit_conversion_payload(
@@ -157,7 +157,7 @@
             cps_proj_key=cps_proj_key,
             source={
                 "type": "url",
-                "download_url": private_download_url,
+                "download_url": uploaded_file.internal_url,
             },
             target=target,
             conversion_settings=conversion_settings,
@@ -296,30 +296,6 @@ def download_converted_documents(
     return
 
 
-def upload_single_file(api: CpsApi, cps_proj_key: str, source_path: Path) -> str:
-    """
-    Uploads a single file. Return internal download url.
-    """
-    filename = os.path.basename(source_path)
-    sw_api = sw_client.UploadsApi(api.client.swagger_client)
-
-    get_pointer: TemporaryUploadFileResult = sw_api.create_project_scratch_file(
-        proj_key=cps_proj_key, filename=filename
-    )
-    # upload file
-    upload = get_pointer.upload
-    private_download_url = get_pointer.download_private.url
-
-    with open(source_path, "rb") as f:
-        files = {"file": (os.path.basename(source_path), f)}
-        request_upload = requests.post(
-            url=upload.url, data=upload.fields, files=files, verify=False
-        )
-        request_upload.raise_for_status()
-
-    return private_download_url
-
-
 def send_urls_for_conversion(
     api: CpsApi,
     cps_proj_key: str,

From 6b9d4c9ea482815e015e260e215b4013dcba947d Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Thu, 19 Oct 2023 18:52:00 +0200
Subject: [PATCH 2/3] Add API function to upload data without convert.

Signed-off-by: Michele Dolfi
---
 .../cps/client/components/data_indices.py     | 55 ++++++++++++++++++-
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/deepsearch/cps/client/components/data_indices.py b/deepsearch/cps/client/components/data_indices.py
index e36ce278..92253944 100644
--- a/deepsearch/cps/client/components/data_indices.py
+++ b/deepsearch/cps/client/components/data_indices.py
@@ -13,6 +13,7 @@
 from deepsearch.cps.apis.public.models.attachment_upload_data import (
     AttachmentUploadData,
 )
+from deepsearch.cps.apis.public.models.task import Task
 from deepsearch.cps.apis.public.models.token_response import TokenResponse
 from deepsearch.cps.client.components.api_object import ApiConnectedObject
 
@@ -109,13 +110,61 @@ def upload_file(
         self,
         coords: ElasticProjectDataCollectionSource,
         body: Dict[str, Any],
     ) -> str:
+        """
+        Deprecated. Use upload_and_convert() instead.
+        """
+        return self.upload_and_convert(coords, body).task_id
+
+    def upload_and_convert(
+        self,
+        coords: ElasticProjectDataCollectionSource,
+        body: Dict[str, Any],
+    ) -> Task:
         """
         Call api for converting and uploading file to a project's data index.
         """
-        task_id = self.sw_api.ccs_convert_upload_file_project_data_index(
+        task: Task = self.sw_api.ccs_convert_upload_file_project_data_index(
             proj_key=coords.proj_key, index_key=coords.index_key, body=body
-        ).task_id
-        return task_id
+        )
+        return task
+
+    def upload(
+        self,
+        coords: ElasticProjectDataCollectionSource,
+        source_path: Optional[Union[str, Path]] = None,
+        source_url: Optional[str] = None,
+    ) -> Task:
+        """
+        Call api for uploading files to a project's data index.
+        The source files can be provided by URL with the `source_url` argument,
+        or by file path with the `source_path` argument.
+        The arguments `source_url` and `source_path` are mutually exclusive.
+        """
+
+        if source_url is not None and source_path is not None:
+            raise ValueError(
+                "Only one of `source_url` and `source_path` can be provided."
+            )
+
+        if source_path is not None and source_url is None:
+            uploaded_file = self.api.uploader.upload_file(
+                project=coords.proj_key, source_path=source_path
+            )
+            source_url = uploaded_file.internal_url
+
+        if source_url is None:
+            raise ValueError(
+                "One source between of `source_url` and `source_path` must be provided "
+            )
+
+        task = self.sw_api.upload_project_data_index_file(
+            proj_key=coords.proj_key,
+            index_key=coords.index_key,
+            params={
+                "file_url": source_url,
+            },
+        )
+        return task
 
 class ElasticProjectDataCollectionSource(BaseModel):

From 798a7767ec2178cf024ec20199cd07999b94548b Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 20 Oct 2023 14:06:18 +0200
Subject: [PATCH 3/3] simplify upload params

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 .../cps/client/components/data_indices.py     | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/deepsearch/cps/client/components/data_indices.py b/deepsearch/cps/client/components/data_indices.py
index 92253944..78c42bee 100644
--- a/deepsearch/cps/client/components/data_indices.py
+++ b/deepsearch/cps/client/components/data_indices.py
@@ -5,6 +5,7 @@
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from urllib.parse import urlparse
 
 import requests
 from pydantic import BaseModel
@@ -131,32 +132,22 @@ def upload_and_convert(
     def upload(
         self,
         coords: ElasticProjectDataCollectionSource,
-        source_path: Optional[Union[str, Path]] = None,
-        source_url: Optional[str] = None,
+        source: Union[Path, str],
     ) -> Task:
         """
         Call api for uploading files to a project's data index.
-        The source files can be provided by URL with the `source_url` argument,
-        or by file path with the `source_path` argument.
-        The arguments `source_url` and `source_path` are mutually exclusive.
+        The source files can be provided by local path or URL via `source`.
         """
 
-        if source_url is not None and source_path is not None:
-            raise ValueError(
-                "Only one of `source_url` and `source_path` can be provided."
-            )
-
-        if source_path is not None and source_url is None:
+        parsed = urlparse(str(source))
+        if parsed.scheme and parsed.netloc:  # is url
+            source_url = source
+        else:
             uploaded_file = self.api.uploader.upload_file(
-                project=coords.proj_key, source_path=source_path
+                project=coords.proj_key, source_path=source
            )
             source_url = uploaded_file.internal_url
 
-        if source_url is None:
-            raise ValueError(
-                "One source between of `source_url` and `source_path` must be provided "
-            )
-
         task = self.sw_api.upload_project_data_index_file(
             proj_key=coords.proj_key,
             index_key=coords.index_key,