feat: add target settings (#175)
Signed-off-by: Tiago Santana <54704492+SantanaTiago@users.noreply.github.com>
SantanaTiago committed May 22, 2024
1 parent e62b0c7 commit c4a7b92
Showing 4 changed files with 38 additions and 3 deletions.
6 changes: 6 additions & 0 deletions deepsearch/cps/cli/cli_options.py
@@ -38,6 +38,12 @@
     help="""Provide conversion settings to be used on local file upload""",
 )

+TARGET_SETTINGS = typer.Option(
+    None,
+    "--target-settings",
+    help="""Provide target conversion settings to be used on local file upload""",
+)
+
 SOURCE_PATH = typer.Option(
     None,
     "--input-file",
13 changes: 12 additions & 1 deletion deepsearch/cps/cli/data_indices_typer.py
@@ -17,14 +17,15 @@
     INDEX_KEY,
     PROJ_KEY,
     SOURCE_PATH,
+    TARGET_SETTINGS,
     URL,
 )
 from deepsearch.cps.client.api import CpsApi
 from deepsearch.cps.client.components.data_indices import S3Coordinates
 from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
 from deepsearch.cps.data_indices import utils
 from deepsearch.documents.core.common_routines import ERROR_MSG
-from deepsearch.documents.core.models import ConversionSettings
+from deepsearch.documents.core.models import ConversionSettings, TargetSettings

 app = typer.Typer(no_args_is_help=True)

@@ -138,6 +139,7 @@ def upload_files(
     index_key: str = INDEX_KEY,
     s3_coordinates: Path = COORDINATES_PATH,
     conv_settings: Optional[str] = CONV_SETTINGS,
+    target_settings: Optional[str] = TARGET_SETTINGS,
 ):
     """
     Upload pdfs, zips, or online documents to a data index in a project
@@ -173,13 +175,22 @@
     else:
         final_conv_settings = None

+    if target_settings is not None:
+        try:
+            final_target_settings = TargetSettings.parse_file(target_settings)
+        except Exception as e:
+            raise e
+    else:
+        final_target_settings = None
+
     utils.upload_files(
         api=api,
         coords=coords,
         url=urls,
         local_file=local_file,
         s3_coordinates=cos_coordinates,
         conv_settings=final_conv_settings,
+        target_settings=final_target_settings,
     )

     typer.echo("Tasks have been queued successfully")
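For illustration, the new --target-settings option takes a path to a JSON file, which upload_files parses with TargetSettings.parse_file in the same way the existing conversion-settings option is handled. A minimal sketch of that parsing step outside the CLI (the file name and field values below are made up for the example, not part of this commit):

import json
from pathlib import Path

from deepsearch.documents.core.models import TargetSettings

# Illustrative settings file; any path passed via --target-settings is handled the same way.
settings_file = Path("target_settings.json")
settings_file.write_text(json.dumps({"add_raw_pages": False, "add_annotations": True}))

# Same parsing step upload_files() performs on the --target-settings value.
final_target_settings = TargetSettings.parse_file(settings_file)
print(final_target_settings.dict(exclude_none=True))
# {'add_raw_pages': False, 'add_annotations': True}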
7 changes: 6 additions & 1 deletion deepsearch/cps/data_indices/utils.py
@@ -14,7 +14,7 @@
 from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
 from deepsearch.documents.core import convert, input_process
 from deepsearch.documents.core.common_routines import progressbar
-from deepsearch.documents.core.models import ConversionSettings
+from deepsearch.documents.core.models import ConversionSettings, TargetSettings
 from deepsearch.documents.core.utils import cleanup, create_root_dir

 logger = logging.getLogger(__name__)
@@ -27,6 +27,7 @@ def upload_files(
     local_file: Optional[Union[str, Path]] = None,
     s3_coordinates: Optional[S3Coordinates] = None,
     conv_settings: Optional[ConversionSettings] = None,
+    target_settings: Optional[TargetSettings] = None,
     url_chunk_size: int = 1,
 ):
     """
@@ -53,6 +54,7 @@
             coords=coords,
             local_file=Path(local_file),
             conv_settings=conv_settings,
+            target_settings=target_settings,
         )
     elif url is None and local_file is None and s3_coordinates is not None:
         return process_external_cos(
@@ -113,6 +115,7 @@ def process_local_file(
     local_file: Path,
     progress_bar: bool = False,
     conv_settings: Optional[ConversionSettings] = None,
+    target_settings: Optional[TargetSettings] = None,
 ):
     """
     Individual files are uploaded for conversion and storage in data index.
@@ -164,6 +167,8 @@ def process_local_file(
         }
        if conv_settings is not None:
            payload["conversion_settings"] = conv_settings.to_ccs_spec()
+       if target_settings is not None:
+           payload["target_settings"] = target_settings.dict(exclude_none=True)

        task_id = api.data_indices.upload_file(coords=coords, body=payload)
        task_ids.append(task_id)
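To show what the new field contributes to the upload body, here is a minimal sketch of the payload fragment built in process_local_file above (the flag values are illustrative); because of exclude_none=True, only the flags that were actually set are sent:

from deepsearch.documents.core.models import TargetSettings

target_settings = TargetSettings(add_raw_pages=True)

payload = {}  # stands in for the upload body assembled in process_local_file()
if target_settings is not None:
    payload["target_settings"] = target_settings.dict(exclude_none=True)

print(payload)
# {'target_settings': {'add_raw_pages': True}}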
15 changes: 14 additions & 1 deletion deepsearch/documents/core/models.py
@@ -3,7 +3,7 @@
 from textwrap import dedent
 from typing import ClassVar, Dict, List, Literal, Optional, Set, Union, get_args

-from pydantic.v1 import BaseModel, Field, ValidationError, conlist, parse_obj_as
+from pydantic.v1 import BaseModel, Field, ValidationError, conlist, root_validator

 from deepsearch import CpsApi
 from deepsearch.core.util.ccs_utils import get_ccs_project_key
@@ -627,3 +627,16 @@ def to_ccs_spec(self):
            obj["metadata"] = self.metadata.to_ccs_spec()

        return obj
+
+
+class TargetSettings(BaseModel):
+    add_raw_pages: Optional[bool] = None
+    add_annotations: Optional[bool] = None
+
+    @root_validator()
+    def check_raw_or_ann(cls, values):
+        if (values.get("add_raw_pages") is None) and (
+            values.get("add_annotations") is None
+        ):
+            raise ValueError("either 'add_raw_pages' or 'add_annotations' is required")
+        return values
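A short sketch of how the new model validates input, assuming the package is installed: the root_validator rejects an instance where neither flag is set, so a target-settings file that configures nothing is reported as an error rather than silently ignored.

from pydantic.v1 import ValidationError

from deepsearch.documents.core.models import TargetSettings

TargetSettings(add_annotations=True)  # valid: at least one flag is set

try:
    TargetSettings()  # neither flag set
except ValidationError as err:
    print(err)  # reports: either 'add_raw_pages' or 'add_annotations' is required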
