From c4a7b92cfa3febba01e3076a19a77a832721e37a Mon Sep 17 00:00:00 2001 From: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> Date: Wed, 22 May 2024 09:47:28 +0100 Subject: [PATCH] feat: add target settings (#175) Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com> --- deepsearch/cps/cli/cli_options.py | 6 ++++++ deepsearch/cps/cli/data_indices_typer.py | 13 ++++++++++++- deepsearch/cps/data_indices/utils.py | 7 ++++++- deepsearch/documents/core/models.py | 15 ++++++++++++++- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/deepsearch/cps/cli/cli_options.py b/deepsearch/cps/cli/cli_options.py index bb70512b..220cdbe7 100644 --- a/deepsearch/cps/cli/cli_options.py +++ b/deepsearch/cps/cli/cli_options.py @@ -38,6 +38,12 @@ help="""Provide conversion settings to be used on local file upload""", ) +TARGET_SETTINGS = typer.Option( + None, + "--target-settings", + help="""Provide target conversion settings to be used on local file upload""", +) + SOURCE_PATH = typer.Option( None, "--input-file", diff --git a/deepsearch/cps/cli/data_indices_typer.py b/deepsearch/cps/cli/data_indices_typer.py index adbd8284..6dec71e4 100644 --- a/deepsearch/cps/cli/data_indices_typer.py +++ b/deepsearch/cps/cli/data_indices_typer.py @@ -17,6 +17,7 @@ INDEX_KEY, PROJ_KEY, SOURCE_PATH, + TARGET_SETTINGS, URL, ) from deepsearch.cps.client.api import CpsApi @@ -24,7 +25,7 @@ from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource from deepsearch.cps.data_indices import utils from deepsearch.documents.core.common_routines import ERROR_MSG -from deepsearch.documents.core.models import ConversionSettings +from deepsearch.documents.core.models import ConversionSettings, TargetSettings app = typer.Typer(no_args_is_help=True) @@ -138,6 +139,7 @@ def upload_files( index_key: str = INDEX_KEY, s3_coordinates: Path = COORDINATES_PATH, conv_settings: Optional[str] = CONV_SETTINGS, + target_settings: Optional[str] = TARGET_SETTINGS, ): """ Upload pdfs, zips, or online documents to a data index in a project @@ -173,6 +175,14 @@ def upload_files( else: final_conv_settings = None + if target_settings is not None: + try: + final_target_settings = TargetSettings.parse_file(target_settings) + except Exception as e: + raise e + else: + final_target_settings = None + utils.upload_files( api=api, coords=coords, @@ -180,6 +190,7 @@ def upload_files( local_file=local_file, s3_coordinates=cos_coordinates, conv_settings=final_conv_settings, + target_settings=final_target_settings, ) typer.echo("Tasks have been queued successfully") diff --git a/deepsearch/cps/data_indices/utils.py b/deepsearch/cps/data_indices/utils.py index 94233bc3..9cbd71a8 100644 --- a/deepsearch/cps/data_indices/utils.py +++ b/deepsearch/cps/data_indices/utils.py @@ -14,7 +14,7 @@ from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource from deepsearch.documents.core import convert, input_process from deepsearch.documents.core.common_routines import progressbar -from deepsearch.documents.core.models import ConversionSettings +from deepsearch.documents.core.models import ConversionSettings, TargetSettings from deepsearch.documents.core.utils import cleanup, create_root_dir logger = logging.getLogger(__name__) @@ -27,6 +27,7 @@ def upload_files( local_file: Optional[Union[str, Path]] = None, s3_coordinates: Optional[S3Coordinates] = None, conv_settings: Optional[ConversionSettings] = None, + target_settings: Optional[TargetSettings] = None, url_chunk_size: int = 1, ): """ @@ -53,6 +54,7 @@ def upload_files( coords=coords, local_file=Path(local_file), conv_settings=conv_settings, + target_settings=target_settings, ) elif url is None and local_file is None and s3_coordinates is not None: return process_external_cos( @@ -113,6 +115,7 @@ def process_local_file( local_file: Path, progress_bar: bool = False, conv_settings: Optional[ConversionSettings] = None, + target_settings: Optional[TargetSettings] = None, ): """ Individual files are uploaded for conversion and storage in data index. @@ -164,6 +167,8 @@ def process_local_file( } if conv_settings is not None: payload["conversion_settings"] = conv_settings.to_ccs_spec() + if target_settings is not None: + payload["target_settings"] = target_settings.dict(exclude_none=True) task_id = api.data_indices.upload_file(coords=coords, body=payload) task_ids.append(task_id) diff --git a/deepsearch/documents/core/models.py b/deepsearch/documents/core/models.py index 5709894d..0611135d 100644 --- a/deepsearch/documents/core/models.py +++ b/deepsearch/documents/core/models.py @@ -3,7 +3,7 @@ from textwrap import dedent from typing import ClassVar, Dict, List, Literal, Optional, Set, Union, get_args -from pydantic.v1 import BaseModel, Field, ValidationError, conlist, parse_obj_as +from pydantic.v1 import BaseModel, Field, ValidationError, conlist, root_validator from deepsearch import CpsApi from deepsearch.core.util.ccs_utils import get_ccs_project_key @@ -627,3 +627,16 @@ def to_ccs_spec(self): obj["metadata"] = self.metadata.to_ccs_spec() return obj + + +class TargetSettings(BaseModel): + add_raw_pages: Optional[bool] = None + add_annotations: Optional[bool] = None + + @root_validator() + def check_raw_or_ann(cls, values): + if (values.get("add_raw_pages") is None) and ( + values.get("add_annotations") is None + ): + raise ValueError("either 'add_raw_pages' or 'add_annotations' is required") + return values