Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support data export #103

Merged
merged 1 commit into from
Oct 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions doccano_client/clients/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from typing import Any, Iterator, List

from doccano_client.client import DoccanoClient
from doccano_client.models.data_export import Option


class DataExportClient:
    """Client for interacting with the Doccano data export API.

    An export is a three-step flow: list the available formats with
    ``list_options``, start an export task with ``schedule_download``, then
    fetch the produced file with ``download``.
    """

    def __init__(self, client: DoccanoClient):
        """Keep the client that is used to send every request.

        Args:
            client (DoccanoClient): The client used to issue GET/POST requests.
        """
        self._client = client

    def list_options(self, project_id: int) -> List[Option]:
        """Return all download options.

        Args:
            project_id (int): The id of the project.

        Returns:
            List[Option]: The list of the download options.
        """
        response = self._client.get(f"projects/{project_id}/download-format")
        # Each element of the JSON array is parsed into an Option model.
        return [Option.parse_obj(item) for item in response.json()]

    def schedule_download(self, project_id: int, option: Option, only_approved: bool = False) -> str:
        """Schedule a download.

        Args:
            project_id (int): The id of the project.
            option (Option): The download option; only its ``name`` is sent.
            only_approved (bool): Whether to export approved data only.

        Returns:
            str: The celery task id.
        """
        payload = {"format": option.name, "exportApproved": only_approved}
        response = self._client.post(f"projects/{project_id}/download", json=payload)
        return response.json()["task_id"]

    def download(self, project_id: int, task_id: str) -> Iterator[Any]:
        """Download a file from the server.

        This is a generator: the request is only sent once iteration starts.

        Args:
            project_id (int): The id of the project.
            task_id (str): The celery task id.

        Yields:
            Iterator[Any]: The file content, in chunks of up to 65536 bytes.
        """
        response = self._client.get(
            f"projects/{project_id}/download",
            params={"taskId": task_id},
            stream=True,
        )
        try:
            yield from response.iter_content(chunk_size=65536)
        finally:
            # A streamed response keeps its connection open until the body is
            # fully read or the response is closed. Closing here also covers a
            # consumer that stops early or raises while iterating.
            # NOTE(review): assumes the client returns a requests-style
            # response exposing close() -- confirm against DoccanoClient.get.
            response.close()
6 changes: 6 additions & 0 deletions doccano_client/models/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from pydantic import BaseModel


class Option(BaseModel):
    """A download (export) format option."""

    # Name of the export format (e.g. "JSONL").
    name: str
    # Sample of data in this format; defaults to an empty string.
    example: str = ""
46 changes: 46 additions & 0 deletions tests/clients/test_data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pathlib
import time

import vcr

from doccano_client.client import DoccanoClient
from doccano_client.clients.data_export import DataExportClient
from doccano_client.models.data_export import Option
from tests.conftest import cassettes_path


class TestDataExportClient:
    """Tests for ``DataExportClient`` that run against recorded VCR cassettes."""

    @classmethod
    def setup_class(cls):
        """Log in once and share one ``DataExportClient`` across all tests."""
        with vcr.use_cassette(str(cassettes_path / "data_export/login.yaml"), mode="once"):
            client = DoccanoClient("http://localhost:8000")
            client.login(username="admin", password="password")
        cls.client = DataExportClient(client)
        cls.project_id = 16

    def test_list_options(self):
        """Listing options returns a non-empty list of ``Option`` models."""
        with vcr.use_cassette(str(cassettes_path / "data_export/options.yaml"), mode="once"):
            response = self.client.list_options(self.project_id)
            assert len(response) > 0
            assert all(isinstance(option, Option) for option in response)

    def test_schedule_download(self):
        """Scheduling a download returns the task id as a string."""
        with vcr.use_cassette(str(cassettes_path / "data_export/schedule_download.yaml"), mode="once"):
            option = Option(name="JSONL")
            task_id = self.client.schedule_download(self.project_id, option)
            assert task_id is not None
            assert isinstance(task_id, str)

    def test_download(self):
        """A scheduled export can be downloaded and written to a non-empty file."""
        with vcr.use_cassette(str(cassettes_path / "data_export/download.yaml"), mode="once") as cassette:
            option = Option(name="JSONL")
            task_id = self.client.schedule_download(self.project_id, option)
            # Only a live server needs time to build the export. ``dirty`` is
            # set when the cassette recorded a new interaction, so replaying
            # the stored cassette skips the 10-second wait.
            if cassette.dirty:
                time.sleep(10)  # lazy work
            chunks = self.client.download(self.project_id, task_id)
            file = pathlib.Path(__file__).parent / "data/annotation.zip"
            try:
                with file.open("wb") as f:
                    for chunk in chunks:
                        f.write(chunk)
                assert file.exists()
                assert file.stat().st_size > 0
            finally:
                # Remove the artifact even when a write or an assertion fails,
                # so a failing run does not leave a stray file in the tree.
                if file.exists():
                    file.unlink()
113 changes: 113 additions & 0 deletions tests/fixtures/cassettes/data_export/download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
interactions:
- request:
body: '{"format": "JSONL", "exportApproved": false}'
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '44'
Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/projects/16/download
response:
body:
string: '{"task_id":"775c0bde-3ee5-4e19-b9cd-6f4d53c3c120"}'
headers:
Allow:
- GET, POST, HEAD, OPTIONS
Connection:
- close
Content-Length:
- '50'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:57:46 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
- request:
body: null
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: GET
uri: http://localhost:8000/v1/projects/16/download?taskId=775c0bde-3ee5-4e19-b9cd-6f4d53c3c120
response:
body:
string: !!binary |
UEsDBBQAAAAIADc/QlW3l8FGmgAAAGsCAAALAAAAYWRtaW4uanNvbmytzTELwjAQhuHdn/HNGdRW
q9lsXdzcS4eIRwkmaSFRC6X/3WCxaEGpkOnuOHjeFvIMvojnScTgqHHgflgHBiVOpMDzgiGrtCbj
bH89H343V6UYNDnR792sHbQ4qLYKqq2DasmgUSN0rWg3RcxRV1Y6eSMUL7nFXV7kUZR02Hu7e69s
xpX0zwqDoVL8Ci4/gttxMJsU/GJH3n4AUEsBAhQDFAAAAAgANz9CVbeXwUaaAAAAawIAAAsAAAAA
AAAAAAAAAKSBAAAAAGFkbWluLmpzb25sUEsFBgAAAAABAAEAOQAAAMMAAAAAAA==
headers:
Allow:
- GET, POST, HEAD, OPTIONS
Connection:
- close
Content-Disposition:
- attachment; filename="169be049-d9b1-4882-a974-cd8625adcd67.zip"
Content-Length:
- '274'
Content-Type:
- application/zip
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:57:56 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
55 changes: 55 additions & 0 deletions tests/fixtures/cassettes/data_export/login.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
interactions:
- request:
body: '{"username": "admin", "password": "password"}'
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '45'
User-Agent:
- python-requests/2.28.1
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/auth/login/
response:
body:
string: '{"key":"b37c59f6e0db6bf8a829858ce925b9c2b4d7b03d"}'
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '50'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:53:26 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Set-Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
expires=Sun, 01 Oct 2023 07:53:26 GMT; Max-Age=31449600; Path=/; SameSite=Lax
- sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g; expires=Sun, 16 Oct 2022 07:53:26
GMT; HttpOnly; Max-Age=1209600; Path=/; SameSite=Lax
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
62 changes: 62 additions & 0 deletions tests/fixtures/cassettes/data_export/options.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
interactions:
- request:
body: null
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: GET
uri: http://localhost:8000/v1/projects/16/download-format
response:
body:
string: '[{"name":"CSV","example":"text,label\n\"Terrible customer service.\",\"negative\"\n\"Really
great transaction.\",\"positive\"\n\"Great price.\",\"positive\"\n"},{"name":"fastText","example":"__label__negative
Terrible customer service.\n__label__positive Really great transaction.\n__label__positive
Great price.\n"},{"name":"JSON","example":"[\n {\n \"text\": \"Terrible
customer service.\",\n \"label\": [\"negative\"]\n },\n {\n \"text\":
\"Really great transaction.\",\n \"label\": [\"positive\"]\n },\n {\n \"text\":
\"Great price.\",\n \"label\": [\"positive\"]\n }\n]\n"},{"name":"JSONL","example":"{\"text\":
\"Terrible customer service.\", \"label\": [\"negative\"]}\n{\"text\": \"Really
great transaction.\", \"label\": [\"positive\"]}\n{\"text\": \"Great price.\",
\"label\": [\"positive\"]}\n"}]'
headers:
Allow:
- GET, HEAD, OPTIONS
Connection:
- close
Content-Length:
- '867'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:53:26 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
Loading