Skip to content

Commit

Permalink
Merge pull request #103 from doccano/feature/support-data-export
Browse files Browse the repository at this point in the history
Support data export
  • Loading branch information
Hironsan authored Oct 2, 2022
2 parents de49c6c + 078e63e commit e1270d4
Show file tree
Hide file tree
Showing 7 changed files with 396 additions and 0 deletions.
59 changes: 59 additions & 0 deletions doccano_client/clients/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from typing import Any, Iterator, List

from doccano_client.client import DoccanoClient
from doccano_client.models.data_export import Option


class DataExportClient:
    """Client for interacting with the Doccano data export API.

    Every request is issued through the wrapped ``DoccanoClient``.
    """

    def __init__(self, client: DoccanoClient):
        """Keep a reference to the client used for all requests.

        Args:
            client (DoccanoClient): The client whose ``get``/``post`` methods
                are used to reach the server.
        """
        self._client = client

    def list_options(self, project_id: int) -> List[Option]:
        """Return all download options.

        Args:
            project_id (int): The id of the project

        Returns:
            List[Option]: The list of the download options.
        """
        resource = f"projects/{project_id}/download-format"
        response = self._client.get(resource)
        # Each element of the JSON payload is parsed into one Option.
        return [Option.parse_obj(option) for option in response.json()]

    def schedule_download(
        self,
        project_id: int,
        option: Option,
        only_approved: bool = False,
    ) -> str:
        """Schedule a download.

        Args:
            project_id (int): The id of the project
            option (Option): The download option; its ``name`` is sent as the
                requested format.
            only_approved (bool): Whether to export approved data only

        Returns:
            str: The celery task id

        Raises:
            KeyError: If the response payload has no ``task_id`` entry.
        """
        resource = f"projects/{project_id}/download"
        data = {"format": option.name, "exportApproved": only_approved}
        response = self._client.post(resource, json=data)
        return response.json()["task_id"]

    def download(self, project_id: int, task_id: str) -> Iterator[Any]:
        """Download a file from the server.

        This is a generator: the request is only sent once iteration starts.

        Args:
            project_id (int): The id of the project
            task_id (str): The celery task id

        Yields:
            Any: The chunks of the file content, as produced by
                ``response.iter_content(chunk_size=65536)``.
        """
        resource = f"projects/{project_id}/download"
        params = {"taskId": task_id}
        response = self._client.get(resource, params=params, stream=True)
        try:
            yield from response.iter_content(chunk_size=65536)
        finally:
            # A streamed response holds on to its connection until it is
            # closed; release it even when the caller stops iterating early
            # or an error interrupts the transfer.
            response.close()
6 changes: 6 additions & 0 deletions doccano_client/models/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from pydantic import BaseModel


class Option(BaseModel):
    # One export format entry: a required format name plus optional sample text.

    # Name of the export format; no default, so it must always be supplied.
    name: str

    # Example text for the format; an empty string when none is supplied.
    example: str = ""
46 changes: 46 additions & 0 deletions tests/clients/test_data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pathlib
import tempfile
import time

import vcr

from doccano_client.client import DoccanoClient
from doccano_client.clients.data_export import DataExportClient
from doccano_client.models.data_export import Option
from tests.conftest import cassettes_path


class TestDataExportClient:
    """Tests for ``DataExportClient`` that run against VCR cassettes."""

    @classmethod
    def setup_class(cls):
        """Log in once and share the export client and project id with all tests."""
        with vcr.use_cassette(str(cassettes_path / "data_export/login.yaml"), mode="once"):
            client = DoccanoClient("http://localhost:8000")
            client.login(username="admin", password="password")
        cls.client = DataExportClient(client)
        cls.project_id = 16

    def test_list_options(self):
        """Listing options returns a non-empty list of ``Option`` objects."""
        with vcr.use_cassette(str(cassettes_path / "data_export/options.yaml"), mode="once"):
            response = self.client.list_options(self.project_id)
        assert len(response) > 0
        assert all(isinstance(option, Option) for option in response)

    def test_schedule_download(self):
        """Scheduling a download returns the task id as a string."""
        with vcr.use_cassette(str(cassettes_path / "data_export/schedule_download.yaml"), mode="once"):
            option = Option(name="JSONL")
            task_id = self.client.schedule_download(self.project_id, option)
        assert task_id is not None
        assert isinstance(task_id, str)

    def test_download(self):
        """A scheduled export can be downloaded and written to a non-empty file."""
        cassette_file = pathlib.Path(cassettes_path / "data_export/download.yaml")
        # Without a cassette file the requests presumably reach a live server,
        # which needs time to finish the export task; with an existing cassette
        # there is nothing to wait for.
        # NOTE(review): confirm this matches the configured record mode.
        needs_wait = not cassette_file.exists()
        # A temporary directory removes the file even when an assertion fails
        # and does not depend on any directory existing in the source tree.
        with tempfile.TemporaryDirectory() as tmp_dir:
            file = pathlib.Path(tmp_dir) / "annotation.zip"
            with vcr.use_cassette(str(cassette_file), mode="once"):
                option = Option(name="JSONL")
                task_id = self.client.schedule_download(self.project_id, option)
                if needs_wait:
                    time.sleep(10)  # give the export task time to complete
                chunks = self.client.download(self.project_id, task_id)
                # ``download`` is a generator, so the GET request is only sent
                # while iterating; keep the loop inside the cassette context.
                with file.open("wb") as f:
                    for chunk in chunks:
                        f.write(chunk)
            assert file.exists()
            assert file.stat().st_size > 0
113 changes: 113 additions & 0 deletions tests/fixtures/cassettes/data_export/download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
interactions:
- request:
body: '{"format": "JSONL", "exportApproved": false}'
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '44'
Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/projects/16/download
response:
body:
string: '{"task_id":"775c0bde-3ee5-4e19-b9cd-6f4d53c3c120"}'
headers:
Allow:
- GET, POST, HEAD, OPTIONS
Connection:
- close
Content-Length:
- '50'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:57:46 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
- request:
body: null
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: GET
uri: http://localhost:8000/v1/projects/16/download?taskId=775c0bde-3ee5-4e19-b9cd-6f4d53c3c120
response:
body:
string: !!binary |
UEsDBBQAAAAIADc/QlW3l8FGmgAAAGsCAAALAAAAYWRtaW4uanNvbmytzTELwjAQhuHdn/HNGdRW
q9lsXdzcS4eIRwkmaSFRC6X/3WCxaEGpkOnuOHjeFvIMvojnScTgqHHgflgHBiVOpMDzgiGrtCbj
bH89H343V6UYNDnR792sHbQ4qLYKqq2DasmgUSN0rWg3RcxRV1Y6eSMUL7nFXV7kUZR02Hu7e69s
xpX0zwqDoVL8Ci4/gttxMJsU/GJH3n4AUEsBAhQDFAAAAAgANz9CVbeXwUaaAAAAawIAAAsAAAAA
AAAAAAAAAKSBAAAAAGFkbWluLmpzb25sUEsFBgAAAAABAAEAOQAAAMMAAAAAAA==
headers:
Allow:
- GET, POST, HEAD, OPTIONS
Connection:
- close
Content-Disposition:
- attachment; filename="169be049-d9b1-4882-a974-cd8625adcd67.zip"
Content-Length:
- '274'
Content-Type:
- application/zip
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:57:56 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
55 changes: 55 additions & 0 deletions tests/fixtures/cassettes/data_export/login.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
interactions:
- request:
body: '{"username": "admin", "password": "password"}'
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '45'
User-Agent:
- python-requests/2.28.1
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/auth/login/
response:
body:
string: '{"key":"b37c59f6e0db6bf8a829858ce925b9c2b4d7b03d"}'
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '50'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:53:26 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Set-Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
expires=Sun, 01 Oct 2023 07:53:26 GMT; Max-Age=31449600; Path=/; SameSite=Lax
- sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g; expires=Sun, 16 Oct 2022 07:53:26
GMT; HttpOnly; Max-Age=1209600; Path=/; SameSite=Lax
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
62 changes: 62 additions & 0 deletions tests/fixtures/cassettes/data_export/options.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
interactions:
- request:
body: null
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Cookie:
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ;
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: GET
uri: http://localhost:8000/v1/projects/16/download-format
response:
body:
string: '[{"name":"CSV","example":"text,label\n\"Terrible customer service.\",\"negative\"\n\"Really
great transaction.\",\"positive\"\n\"Great price.\",\"positive\"\n"},{"name":"fastText","example":"__label__negative
Terrible customer service.\n__label__positive Really great transaction.\n__label__positive
Great price.\n"},{"name":"JSON","example":"[\n {\n \"text\": \"Terrible
customer service.\",\n \"label\": [\"negative\"]\n },\n {\n \"text\":
\"Really great transaction.\",\n \"label\": [\"positive\"]\n },\n {\n \"text\":
\"Great price.\",\n \"label\": [\"positive\"]\n }\n]\n"},{"name":"JSONL","example":"{\"text\":
\"Terrible customer service.\", \"label\": [\"negative\"]}\n{\"text\": \"Really
great transaction.\", \"label\": [\"positive\"]}\n{\"text\": \"Great price.\",
\"label\": [\"positive\"]}\n"}]'
headers:
Allow:
- GET, HEAD, OPTIONS
Connection:
- close
Content-Length:
- '867'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 07:53:26 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
Loading

0 comments on commit e1270d4

Please sign in to comment.