-
Notifications
You must be signed in to change notification settings - Fork 16
/
data.py
76 lines (70 loc) · 1.87 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os as __os # add "__" if not want to be exported
from copy import deepcopy as __deepcopy
# Base directory from which every path below is derived.
# The `VL_DATA_DIR` environment variable takes precedence, so the location can
# be configured without editing this file. When the variable is unset, the
# placeholder literal is used unchanged (replace it with your own path).
data_dir = __os.environ.get("VL_DATA_DIR", 'your_data_path/anno')
if not data_dir:
    # An empty value would make os.path.join() return bare relative names
    # ("videos_images", ...), so fail early with an explicit message instead.
    raise ValueError("please set environment `VL_DATA_DIR` before continue")
# Sub-directories of `data_dir` used by the corpus definitions in this file.
data_root = __os.path.join(data_dir, "videos_images")
anno_root_pt = __os.path.join(data_dir, "anno_pretrain")
anno_root_downstream = __os.path.join(data_dir, "anno_downstream")
# ============== pretraining datasets =================
# Maps a corpus name to a list describing it:
#   [0] annotation JSON path under `anno_root_pt`
#   [1] a per-dataset placeholder string ("your_*_path") - presumably the media
#       location for that dataset; replace with a real path (TODO confirm)
#   [2] optional "video" tag, present only on the webvid entries - presumably
#       marks the media type for the loader (verify against the consumer)
available_corpus = {
    # pretraining datasets
    "cc3m": [
        f"{anno_root_pt}/cc3m_train.json",
        "your_cc3m_path",
    ],
    "cc12m": [
        f"{anno_root_pt}/cc12m_train.json",
        "your_cc12m_path",
    ],
    "sbu": [
        f"{anno_root_pt}/sbu.json",
        "your_sbu_path",
    ],
    "vg": [
        f"{anno_root_pt}/vg.json",
        "your_vg_path",
    ],
    "coco": [
        f"{anno_root_pt}/coco.json",
        "your_coco_path",
    ],
    "webvid": [
        f"{anno_root_pt}/webvid_train.json",
        "your_webvid_path",
        "video",
    ],
    "webvid_10m": [
        f"{anno_root_pt}/webvid_10m_train.json",
        "your_webvid10m_path",
        "video",
    ],
    # downstream datasets.
}
# composed datasets.
# Each composite is a list of existing `available_corpus` entries, looked up by
# key in the order given. The inner lists are the same objects as the
# single-corpus entries (shared by reference, not copied).
available_corpus["data_5m"] = [
    available_corpus[corpus_name]
    for corpus_name in ("webvid", "cc3m")
]
available_corpus["data_17m"] = [
    available_corpus[corpus_name]
    for corpus_name in ("webvid", "cc3m", "coco", "vg", "sbu", "cc12m")
]
# Same members as data_17m, except that "webvid_10m" replaces "webvid".
available_corpus["data_25m"] = [
    available_corpus[corpus_name]
    for corpus_name in ("webvid_10m", "cc3m", "coco", "vg", "sbu", "cc12m")
]
# ============== for validation =================
# Registers the MSRVTT 1k test entry: annotation JSON under
# `anno_root_downstream`, a placeholder path string to be replaced, and the
# "video" tag.
available_corpus.update(
    msrvtt_1k_test=[
        f"{anno_root_downstream}/msrvtt_test1k.json",
        "your_msrvtt_path",
        "video",
    ],
)