forked from PaddlePaddle/PaddleVideo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pptimesformer_k400_videos.yaml
163 lines (153 loc) · 5.28 KB
/
pptimesformer_k400_videos.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
MODEL:  # MODEL field: framework, backbone, head and runtime options.
  # Mandatory, indicates the type of network; associated with
  # 'paddlevideo/modeling/framework/'.
  framework: "RecognizerTransformer"
  # Mandatory, indicates the type of backbone; associated with
  # 'paddlevideo/modeling/backbones/'.
  backbone:
    name: "VisionTransformer_tweaks"  # Mandatory, the name of the backbone.
    # Optional, pretrained model path.
    pretrained: "data/vit_base_patch16_224_miil_21k_trans.pdparams"
    img_size: 224
    patch_size: 16
    in_channels: 3
    embed_dim: 768
    depth: 12
    num_heads: 12
    mlp_ratio: 4
    qkv_bias: false
    # Written with a decimal point on purpose: YAML 1.1 loaders (e.g. PyYAML)
    # resolve a bare `1e-6` as a string, not a float.
    epsilon: 1.0e-6
    num_seg: 8
    attention_type: 'divided_space_time'
  head:
    # Mandatory, indicates the type of head; associated with
    # 'paddlevideo/modeling/heads'.
    name: "ppTimeSformerHead"
    num_classes: 400  # Optional, the number of classes to be classified.
    in_channels: 768  # Input channels of the extracted feature.
    std: 0.02  # Std value used in parameter initialization.
    # NOTE(review): presumably the label-smoothing epsilon -- confirm in the head.
    ls_eps: 0.1
  # Configuration used when the model is in train or test mode.
  # NOTE(review): placement under MODEL was reconstructed from a flattened
  # source -- confirm against the upstream config.
  runtime_cfg:
    test:  # Test config.
      num_seg: 8
      avg_type: 'prob'  # 'score' or 'prob'
DATASET:  # DATASET field: loader sizes and per-split dataset definitions.
  batch_size: 8  # Mandatory, batch size.
  num_workers: 4  # Mandatory, the number of subprocesses on each GPU.
  test_batch_size: 1
  train:
    # Mandatory, indicates the type of dataset; associated with
    # 'paddlevideo/loader/dataset'.
    format: "VideoDataset"
    data_prefix: "data/k400/videos"
    file_path: "data/k400/train.list"  # Mandatory, train data index file path.
  valid:
    # Mandatory, indicates the type of dataset; associated with
    # 'paddlevideo/loader/dataset'.
    format: "VideoDataset"
    data_prefix: "data/k400/videos"
    file_path: "data/k400/val.list"  # Mandatory, valid data index file path.
  test:
    # Mandatory, indicates the type of dataset; associated with
    # 'paddlevideo/loader/dataset'.
    format: "VideoDataset"
    data_prefix: "data/k400/videos"
    # Mandatory, test data index file path (points at the validation list).
    file_path: "data/k400/val.list"
PIPELINE:  # PIPELINE field: decode -> sample -> transform, defined per split.
  # Mandatory, indicates the pipeline that processes the training data;
  # associated with 'paddlevideo/loader/pipelines/'.
  train:
    decode:
      name: "VideoDecoder"
      backend: 'pyav'
      mode: 'train'
      num_seg: 8
    sample:
      name: "Sampler"
      num_seg: 8
      seg_len: 1
      valid_mode: false
      linspace_sample: true
    # Mandatory, image transform operators, applied in the listed order.
    transform:
      - Normalization:
          mean: [0, 0, 0]
          std: [1, 1, 1]
          tensor_shape: [1, 1, 1, 3]
      - Image2Array:
          data_format: 'cthw'
      - JitterScale:
          min_size: 256
          max_size: 320
      - RandomCrop:
          target_size: 224
      # No parameters are given for this operator.
      - RandomFlip: null
  # Mandatory, indicates the pipeline that processes the validation data;
  # associated with 'paddlevideo/loader/pipelines/'.
  valid:
    decode:
      name: "VideoDecoder"
      backend: 'pyav'
      mode: 'valid'
      num_seg: 8
    sample:
      name: "Sampler"
      num_seg: 8
      seg_len: 1
      valid_mode: true
      linspace_sample: true
    transform:
      - Normalization:
          mean: [0, 0, 0]
          std: [1, 1, 1]
          tensor_shape: [1, 1, 1, 3]
      - Image2Array:
          data_format: 'cthw'
      # min_size == max_size, so the scale range is a single value (256).
      - JitterScale:
          min_size: 256
          max_size: 256
      - CenterCrop:
          target_size: 224
  # Pipeline that processes the test data.
  test:
    decode:
      name: "VideoDecoder"
      backend: 'pyav'
      mode: 'test'
      num_seg: 8
    sample:
      name: "Sampler"
      num_seg: 8
      seg_len: 1
      valid_mode: true
      linspace_sample: true
    transform:
      - Normalization:
          mean: [0, 0, 0]
          std: [1, 1, 1]
          tensor_shape: [1, 1, 1, 3]
      - Image2Array:
          data_format: 'cthw'
      # min_size == max_size, so the scale range is a single value (224).
      - JitterScale:
          min_size: 224
          max_size: 224
      - UniformCrop:
          target_size: 224
OPTIMIZER:  # OPTIMIZER field: optimizer, LR schedule, regularization, clipping.
  name: 'Momentum'
  momentum: 0.9
  learning_rate:
    # NOTE(review): presumably steps the scheduler per iteration rather than
    # per epoch -- confirm in the trainer.
    iter_step: true
    name: 'CustomWarmupCosineDecay'
    max_epoch: 20  # Same value as the top-level `epochs` setting.
    warmup_epochs: 2
    warmup_start_lr: 0.00025
    cosine_base_lr: 0.0025
  weight_decay:
    name: 'L2'
    value: 0.00007
  use_nesterov: true
  grad_clip:
    name: 'ClipGradByGlobalNorm'
    value: 40.0
GRADIENT_ACCUMULATION:
  # Specify the sum of batches to be calculated by all GPUs.
  global_batch_size: 64
MIX:  # Sample-mixing augmentation settings.
  name: "VideoMix"
  cutmix_prob: 0.5
  mixup_alpha: 0.2
  cutmix_alpha: 1.0
METRIC:  # Evaluation metric selection.
  name: 'CenterCropMetric'
INFERENCE:  # Inference helper settings.
  name: 'TimeSformer_Inference_helper'
  num_seg: 8
  target_size: 224
  # Same mean/std values as the Normalization operators in PIPELINE.
  mean: [0, 0, 0]
  std: [1, 1, 1]
# Top-level run settings.
model_name: 'ppTimeSformer'
# Optional, the interval of the logger, default: 10.
log_interval: 20
save_interval: 5
# Mandatory, total number of epochs.
epochs: 20
# Optional, the logger level, default: "INFO".
log_level: 'INFO'