Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for specifying document conversion settings #48

Merged
merged 20 commits into from
Dec 13, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions deepsearch/core/util/ccs_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from deepsearch import CpsApi
from deepsearch.cps.apis import public as sw_client


def get_ccs_project_key(api: CpsApi, cps_proj_key: str):
    """
    Look up the CCS project associated with a CPS project.

    Requests the default values of the project identified by
    ``cps_proj_key`` through the swagger ``ProjectApi`` and returns a
    ``(ccs_proj_key, collection_name)`` tuple taken from the ``ccs_project``
    entry of the response.
    """
    project_api = sw_client.ProjectApi(api.client.swagger_client)
    project_defaults = project_api.get_project_default_values(
        proj_key=cps_proj_key
    )
    return (
        project_defaults.ccs_project.proj_key,
        project_defaults.ccs_project.collection_name,
    )
42 changes: 22 additions & 20 deletions deepsearch/documents/core/convert.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
import glob
import logging
import os
import pathlib
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional
from typing import Any, List, Optional

import requests
import urllib3
from pydantic import BaseModel, Field
from tqdm import tqdm

from deepsearch.cps.apis import public as sw_client
Expand All @@ -17,8 +13,9 @@
)
from deepsearch.cps.client.api import CpsApi

from ...core.util.ccs_utils import get_ccs_project_key
from .common_routines import ERROR_MSG, progressbar
from .models import ExportTarget, ZipTarget
from .models import ConversionSettings, ExportTarget, ZipTarget
from .utils import URLNavigator, collect_all_local_files, download_url

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Expand All @@ -28,6 +25,7 @@
def make_payload(
url_document: str,
target: Optional[ExportTarget],
conversion_settings: Optional[ConversionSettings],
collection_name: str = "_default",
):
"""
Expand All @@ -36,13 +34,17 @@ def make_payload(

target = target or ZipTarget()

if conversion_settings:
conversion_settings = conversion_settings.to_ccs_spec()

payload = {
"source": {
"type": "url",
"download_url": url_document,
},
"context": {
"collection_name": collection_name,
"conversion_settings": conversion_settings,
"keep_documents": "false",
},
"target": target.dict(),
Expand All @@ -65,19 +67,12 @@ def check_single_task_status(api: CpsApi, ccs_proj_key: str, task_id: str):
return request_status


def get_ccs_project_key(api: CpsApi, cps_proj_key: str):
    """
    Look up the CCS project associated with a CPS project.

    Requests the default values of the project identified by
    ``cps_proj_key`` through the swagger ``ProjectApi`` and returns a
    ``(ccs_proj_key, collection_name)`` tuple taken from the ``ccs_project``
    entry of the response.
    """
    project_api = sw_client.ProjectApi(api.client.swagger_client)
    project_defaults = project_api.get_project_default_values(
        proj_key=cps_proj_key
    )
    return (
        project_defaults.ccs_project.proj_key,
        project_defaults.ccs_project.collection_name,
    )


def submit_url_for_conversion(
api: CpsApi, cps_proj_key: str, url: str, target: Optional[ExportTarget]
api: CpsApi,
cps_proj_key: str,
url: str,
target: Optional[ExportTarget],
conversion_settings: Optional[ConversionSettings],
) -> str:
"""
Convert an online pdf using DeepSearch Technology.
Expand All @@ -87,7 +82,7 @@ def submit_url_for_conversion(
api=api, cps_proj_key=cps_proj_key
)
# submit conversion request
payload = make_payload(url, target, collection_name)
payload = make_payload(url, target, conversion_settings, collection_name)

try:
request_conversion_task_id = api.client.session.post(
Expand All @@ -110,6 +105,7 @@ def send_files_for_conversion(
cps_proj_key: str,
source_path: Path,
target: Optional[ExportTarget],
conversion_settings: Optional[ConversionSettings],
root_dir: Path,
progress_bar=False,
) -> list:
Expand Down Expand Up @@ -141,6 +137,7 @@ def send_files_for_conversion(
cps_proj_key=cps_proj_key,
url=private_download_url,
target=target,
conversion_settings=conversion_settings,
)
task_ids.append(task_id)
progress.update(1)
Expand Down Expand Up @@ -273,6 +270,7 @@ def send_urls_for_conversion(
cps_proj_key: str,
urls: List[str],
target: Optional[ExportTarget],
conversion_settings: Optional[ConversionSettings],
progress_bar=False,
) -> List[Any]:
"""
Expand All @@ -289,7 +287,11 @@ def send_urls_for_conversion(
) as progress:
for url in urls:
task_id = submit_url_for_conversion(
api=api, cps_proj_key=cps_proj_key, url=url, target=target
api=api,
cps_proj_key=cps_proj_key,
url=url,
target=target,
conversion_settings=conversion_settings,
)
task_ids.append(task_id)
progress.update(1)
Expand Down
2 changes: 1 addition & 1 deletion deepsearch/documents/core/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import requests
from tqdm import tqdm

from deepsearch.core.util.ccs_utils import get_ccs_project_key
from deepsearch.cps.client.api import CpsApi
from deepsearch.documents.core.common_routines import ERROR_MSG, progressbar
from deepsearch.documents.core.convert import get_ccs_project_key
from deepsearch.documents.core.utils import URLNavigator

logger = logging.getLogger(__name__)
Expand Down
6 changes: 5 additions & 1 deletion deepsearch/documents/core/input_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from deepsearch.cps.client.api import CpsApi
from deepsearch.cps.client.components.documents import DocumentConversionResult

from .models import ExportTarget
from .models import ConversionSettings, ExportTarget
from .utils import batch_single_files

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Expand All @@ -25,6 +25,7 @@ def process_local_input(
cps_proj_key: str,
source_path: Path,
target: Optional[ExportTarget],
conversion_settings: Optional[ConversionSettings],
progress_bar=False,
) -> DocumentConversionResult:
"""
Expand All @@ -44,6 +45,7 @@ def process_local_input(
cps_proj_key=cps_proj_key,
source_path=source_path,
target=target,
conversion_settings=conversion_settings,
root_dir=Path(tmpdir),
progress_bar=progress_bar,
)
Expand All @@ -68,6 +70,7 @@ def process_urls_input(
cps_proj_key: str,
urls: List[str],
target: Optional[ExportTarget],
conversion_settings: Optional[ConversionSettings],
progress_bar=False,
):
"""
Expand All @@ -78,6 +81,7 @@ def process_urls_input(
cps_proj_key=cps_proj_key,
urls=urls,
target=target,
conversion_settings=conversion_settings,
progress_bar=progress_bar,
)
statuses = check_status_running_tasks(
Expand Down
5 changes: 4 additions & 1 deletion deepsearch/documents/core/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
process_local_input,
process_urls_input,
)
from deepsearch.documents.core.models import ExportTarget
from deepsearch.documents.core.models import ConversionSettings, ExportTarget

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

Expand All @@ -19,6 +19,7 @@ def convert_documents(
urls: Optional[Union[str, List[str]]] = None,
source_path: Optional[Path] = None,
target: Optional[ExportTarget] = None,
conversion_settings: Optional[ConversionSettings] = None,
progress_bar=False,
):
"""
Expand Down Expand Up @@ -60,6 +61,7 @@ def convert_documents(
cps_proj_key=proj_key,
urls=urls,
target=target,
conversion_settings=conversion_settings,
progress_bar=progress_bar,
)
elif urls is None and source_path is not None:
Expand All @@ -68,6 +70,7 @@ def convert_documents(
cps_proj_key=proj_key,
source_path=Path(source_path).expanduser().resolve(),
target=target,
conversion_settings=conversion_settings,
progress_bar=progress_bar,
)

Expand Down
Loading