-
Notifications
You must be signed in to change notification settings - Fork 2
/
prepare_dataset.py
75 lines (56 loc) · 1.83 KB
/
prepare_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import click
import magic
import threading
import tempfile
import os
import json
import tarfile
import gzip
import shutil
from pathlib import Path
from heuristics import get_maindoc, EXCLUDED_SAMPLES
# Module-wide lock: every magic.detect_from_filename() call made by TestEnv
# happens while this lock is held.
# NOTE(review): the name suggests libmagic must not be called from several
# threads at once -- confirm against the libmagic binding in use.
_libmagic_threadsafe = threading.Lock()
class TestEnv(object):
    """Context manager that unpacks one gzipped sample into a scratch directory.

    The constructor creates a temporary directory and decompresses ``sample``
    into it.  When the decompressed payload is a tar archive it is extracted
    into the same directory and the intermediate tar file is removed.
    Entering the context yields the directory as a ``pathlib.Path``; leaving
    it deletes the whole directory tree.
    """

    def __init__(self, sample):
        """Unpack ``sample`` into a fresh temporary directory.

        Args:
            sample: ``pathlib.Path`` of a gzip-compressed file; its ``stem``
                is used as the name of the decompressed payload.

        Raises:
            AssertionError: if libmagic does not report ``sample`` as
                ``application/gzip``.
        """
        # NOTE: the first positional argument of mkdtemp() is the *suffix*,
        # so the directory name ends in "ttrac".
        self.tmpdir = Path(tempfile.mkdtemp('ttrac'))
        try:
            self._unpack(sample)
        except BaseException:
            # __exit__ never runs when the constructor fails, so the scratch
            # directory must be removed here or it would be leaked.
            shutil.rmtree(self.tmpdir, ignore_errors=True)
            raise

    @staticmethod
    def _mime_type(path):
        """Return the MIME type libmagic reports for ``path``."""
        # Hold the lock only for the libmagic call itself so that slow work
        # (decompression, tar extraction) does not block other threads.
        with _libmagic_threadsafe:
            return magic.detect_from_filename(path).mime_type

    def _unpack(self, sample):
        """Decompress ``sample`` into ``self.tmpdir``, expanding a tar payload."""
        mime = self._mime_type(sample)
        if mime != 'application/gzip':
            # Raised explicitly (rather than via ``assert``) so the check
            # still runs under ``python -O``.
            raise AssertionError(
                "expected application/gzip, got %r for %s" % (mime, sample))

        submission_data_path = self.tmpdir / sample.stem
        with gzip.open(sample) as gz, open(submission_data_path, "wb") as f:
            shutil.copyfileobj(gz, f)

        if self._mime_type(submission_data_path) == "application/x-tar":
            with tarfile.open(submission_data_path, 'r') as tar:
                # NOTE(review): extractall() trusts member paths inside the
                # archive.  If samples are untrusted, consider an extraction
                # filter (``filter="data"``, Python 3.12+).
                tar.extractall(path=self.tmpdir)
            # Only the extracted members are needed from here on.
            submission_data_path.unlink()

    def __enter__(self):
        """Return the scratch directory holding the unpacked sample."""
        return self.tmpdir

    def __exit__(self, exc, value, tb):
        """Delete the scratch directory; exceptions are not suppressed."""
        shutil.rmtree(self.tmpdir)
def prepare(sample):
    """Return the file name of the main document found in ``sample``.

    Args:
        sample: ``pathlib.Path`` of one corpus entry.

    Returns:
        The ``name`` attribute of whatever ``get_maindoc`` returns for the
        unpacked sample, or ``None`` when the sample is smaller than 100
        bytes, is listed in ``EXCLUDED_SAMPLES``, or ``get_maindoc`` returns
        a falsy value.
    """
    # Files under 100 bytes are treated as withdrawn submissions.
    is_withdrawn = sample.stat().st_size < 100
    if is_withdrawn or sample.stem in EXCLUDED_SAMPLES:
        return None

    with TestEnv(sample) as workdir:
        main_document = get_maindoc(workdir, sample)
        return main_document.name if main_document else None
@click.command()
@click.argument('corpus', type=click.Path(exists=True))
def prepare_dataset(corpus):
    """Scan CORPUS and write CORPUS.json mapping each sample to its main document.

    Samples for which no main document is found are left out of the mapping.
    """
    # Shell completion usually appends a trailing slash to directory names.
    # Strip it instead of aborting, so the output lands next to the corpus
    # directory ("corpus.json") rather than inside it ("corpus/.json").
    corpus_root = corpus.rstrip("/")
    if not corpus_root:
        # The argument consisted solely of slashes (the filesystem root);
        # there is no sensible sibling path for the JSON file.
        raise click.BadParameter(
            "cannot derive an output file name from %r" % corpus,
            param_hint="corpus")
    output_path = corpus_root + ".json"

    output = {}
    for sample in Path(corpus_root).iterdir():
        res = prepare(sample)
        if res:
            print(sample.stem, res)
            output[sample.stem] = res

    # Written once at the end: an interrupted run leaves no partial file.
    with open(output_path, "w") as f:
        json.dump(output, f)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    prepare_dataset()