-
Notifications
You must be signed in to change notification settings - Fork 64
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #103 from doccano/feature/support-data-export
Support data export
- Loading branch information
Showing
7 changed files
with
396 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any, Iterator, List | ||
|
||
from doccano_client.client import DoccanoClient | ||
from doccano_client.models.data_export import Option | ||
|
||
|
||
class DataExportClient:
    """Client for interacting with the Doccano data export API"""

    def __init__(self, client: DoccanoClient):
        """Store the underlying HTTP client used for every request.

        Args:
            client (DoccanoClient): The client used to send requests.
        """
        self._client = client

    def list_options(self, project_id: int) -> List[Option]:
        """Return all download options

        Args:
            project_id (int): The id of the project

        Returns:
            List[Option]: The list of the download options.
        """
        resource = f"projects/{project_id}/download-format"
        response = self._client.get(resource)
        # Each element of the JSON array describes one export format.
        return [Option.parse_obj(option) for option in response.json()]

    def schedule_download(self, project_id: int, option: Option, only_approved: bool = False) -> str:
        """Schedule a download

        Args:
            project_id (int): The id of the project
            option (Option): The download option
            only_approved (bool): Whether to export approved data only

        Returns:
            str: The celery task id
        """
        resource = f"projects/{project_id}/download"
        payload = {"format": option.name, "exportApproved": only_approved}
        response = self._client.post(resource, json=payload)
        return response.json()["task_id"]

    def download(self, project_id: int, task_id: str) -> Iterator[Any]:
        """Download a file from the server

        This is a generator: the request is only sent once the caller starts
        iterating, and the streamed response is closed when iteration ends,
        whether it is exhausted, abandoned early, or interrupted by an error.

        Args:
            project_id (int): The id of the project
            task_id (str): The celery task id

        Yields:
            Iterator[Any]: The file content, in chunks of up to 65536 bytes
        """
        resource = f"projects/{project_id}/download"
        params = {"taskId": task_id}
        response = self._client.get(resource, params=params, stream=True)
        try:
            yield from response.iter_content(chunk_size=65536)
        finally:
            # A streamed response keeps its connection until it is fully
            # consumed or explicitly closed; always release it.
            response.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from pydantic import BaseModel | ||
|
||
|
||
class Option(BaseModel):
    """A single export format offered by the data export API."""

    # Identifier of the export format, e.g. "JSONL".
    name: str
    # Sample of the exported output for this format; empty when not provided.
    example: str = ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import pathlib | ||
import time | ||
|
||
import vcr | ||
|
||
from doccano_client.client import DoccanoClient | ||
from doccano_client.clients.data_export import DataExportClient | ||
from doccano_client.models.data_export import Option | ||
from tests.conftest import cassettes_path | ||
|
||
|
||
class TestDataExportClient:
    """Tests for DataExportClient driven by recorded VCR cassettes."""

    @classmethod
    def setup_class(cls):
        """Log in once and share the export client across all tests."""
        # NOTE(review): vcrpy documents this option as `record_mode`; confirm
        # that `mode=` is actually honored rather than silently ignored.
        with vcr.use_cassette(str(cassettes_path / "data_export/login.yaml"), mode="once"):
            client = DoccanoClient("http://localhost:8000")
            client.login(username="admin", password="password")
        cls.client = DataExportClient(client)
        cls.project_id = 16

    def test_list_options(self):
        """Every listed option is parsed into an Option model."""
        with vcr.use_cassette(str(cassettes_path / "data_export/options.yaml"), mode="once"):
            response = self.client.list_options(self.project_id)
        assert len(response) > 0
        assert all(isinstance(option, Option) for option in response)

    def test_schedule_download(self):
        """Scheduling an export returns the task id as a string."""
        with vcr.use_cassette(str(cassettes_path / "data_export/schedule_download.yaml"), mode="once"):
            option = Option(name="JSONL")
            task_id = self.client.schedule_download(self.project_id, option)
        assert task_id is not None
        assert isinstance(task_id, str)

    def test_download(self):
        """A scheduled export can be downloaded as a non-empty file."""
        cassette_file = str(cassettes_path / "data_export/download.yaml")
        # An existing cassette means the interactions are replayed, so there
        # is no server-side export task to wait for.
        recording = not pathlib.Path(cassette_file).exists()
        file = pathlib.Path(__file__).parent / "data/annotation.zip"
        try:
            with vcr.use_cassette(cassette_file, mode="once"):
                option = Option(name="JSONL")
                task_id = self.client.schedule_download(self.project_id, option)
                if recording:
                    time.sleep(10)  # give the server time to finish the export task
                chunks = self.client.download(self.project_id, task_id)
                # `download` is lazy, so the chunks must be consumed while the
                # cassette is still active.
                with file.open("wb") as f:
                    for chunk in chunks:
                        f.write(chunk)
            assert file.exists()
            assert file.stat().st_size > 0
        finally:
            # Remove the artifact even when an assertion above fails.
            if file.exists():
                file.unlink()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
interactions: | ||
- request: | ||
body: '{"format": "JSONL", "exportApproved": false}' | ||
headers: | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Content-Length: | ||
- '44' | ||
Cookie: | ||
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ; | ||
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ | ||
accept: | ||
- application/json | ||
content-type: | ||
- application/json | ||
referer: | ||
- http://localhost:8000 | ||
method: POST | ||
uri: http://localhost:8000/v1/projects/16/download | ||
response: | ||
body: | ||
string: '{"task_id":"775c0bde-3ee5-4e19-b9cd-6f4d53c3c120"}' | ||
headers: | ||
Allow: | ||
- GET, POST, HEAD, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '50' | ||
Content-Type: | ||
- application/json | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 07:57:46 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Accept, Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
- request: | ||
body: null | ||
headers: | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Cookie: | ||
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ; | ||
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ | ||
accept: | ||
- application/json | ||
content-type: | ||
- application/json | ||
referer: | ||
- http://localhost:8000 | ||
method: GET | ||
uri: http://localhost:8000/v1/projects/16/download?taskId=775c0bde-3ee5-4e19-b9cd-6f4d53c3c120 | ||
response: | ||
body: | ||
string: !!binary | | ||
UEsDBBQAAAAIADc/QlW3l8FGmgAAAGsCAAALAAAAYWRtaW4uanNvbmytzTELwjAQhuHdn/HNGdRW | ||
q9lsXdzcS4eIRwkmaSFRC6X/3WCxaEGpkOnuOHjeFvIMvojnScTgqHHgflgHBiVOpMDzgiGrtCbj | ||
bH89H343V6UYNDnR792sHbQ4qLYKqq2DasmgUSN0rWg3RcxRV1Y6eSMUL7nFXV7kUZR02Hu7e69s | ||
xpX0zwqDoVL8Ci4/gttxMJsU/GJH3n4AUEsBAhQDFAAAAAgANz9CVbeXwUaaAAAAawIAAAsAAAAA | ||
AAAAAAAAAKSBAAAAAGFkbWluLmpzb25sUEsFBgAAAAABAAEAOQAAAMMAAAAAAA== | ||
headers: | ||
Allow: | ||
- GET, POST, HEAD, OPTIONS | ||
Connection: | ||
- close | ||
Content-Disposition: | ||
- attachment; filename="169be049-d9b1-4882-a974-cd8625adcd67.zip" | ||
Content-Length: | ||
- '274' | ||
Content-Type: | ||
- application/zip | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 07:57:56 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Accept, Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
version: 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
interactions: | ||
- request: | ||
body: '{"username": "admin", "password": "password"}' | ||
headers: | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Content-Length: | ||
- '45' | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
accept: | ||
- application/json | ||
content-type: | ||
- application/json | ||
referer: | ||
- http://localhost:8000 | ||
method: POST | ||
uri: http://localhost:8000/v1/auth/login/ | ||
response: | ||
body: | ||
string: '{"key":"b37c59f6e0db6bf8a829858ce925b9c2b4d7b03d"}' | ||
headers: | ||
Allow: | ||
- POST, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '50' | ||
Content-Type: | ||
- application/json | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 07:53:26 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Set-Cookie: | ||
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ; | ||
expires=Sun, 01 Oct 2023 07:53:26 GMT; Max-Age=31449600; Path=/; SameSite=Lax | ||
- sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g; expires=Sun, 16 Oct 2022 07:53:26 | ||
GMT; HttpOnly; Max-Age=1209600; Path=/; SameSite=Lax | ||
Vary: | ||
- Accept, Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
version: 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
interactions: | ||
- request: | ||
body: null | ||
headers: | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Cookie: | ||
- csrftoken=FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ; | ||
sessionid=rmbx2nr20xbrk8ofdi9u4ocgai6bze9g | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- FEvkld0o1MszakkYc27mcoDrcTxNvA0J9jjIzKW6OMzkxS0Zar9aK0aeOk3iALkQ | ||
accept: | ||
- application/json | ||
content-type: | ||
- application/json | ||
referer: | ||
- http://localhost:8000 | ||
method: GET | ||
uri: http://localhost:8000/v1/projects/16/download-format | ||
response: | ||
body: | ||
string: '[{"name":"CSV","example":"text,label\n\"Terrible customer service.\",\"negative\"\n\"Really | ||
great transaction.\",\"positive\"\n\"Great price.\",\"positive\"\n"},{"name":"fastText","example":"__label__negative | ||
Terrible customer service.\n__label__positive Really great transaction.\n__label__positive | ||
Great price.\n"},{"name":"JSON","example":"[\n {\n \"text\": \"Terrible | ||
customer service.\",\n \"label\": [\"negative\"]\n },\n {\n \"text\": | ||
\"Really great transaction.\",\n \"label\": [\"positive\"]\n },\n {\n \"text\": | ||
\"Great price.\",\n \"label\": [\"positive\"]\n }\n]\n"},{"name":"JSONL","example":"{\"text\": | ||
\"Terrible customer service.\", \"label\": [\"negative\"]}\n{\"text\": \"Really | ||
great transaction.\", \"label\": [\"positive\"]}\n{\"text\": \"Great price.\", | ||
\"label\": [\"positive\"]}\n"}]' | ||
headers: | ||
Allow: | ||
- GET, HEAD, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '867' | ||
Content-Type: | ||
- application/json | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 07:53:26 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Accept, Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
version: 1 |
Oops, something went wrong.