Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update README and refine of MM-GDINO #11298

Merged
merged 20 commits into from
Dec 26, 2023
Merged
9 changes: 5 additions & 4 deletions configs/glip/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ Learning visual representations from natural language supervision has recently s

### Results on Flickr30k

| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
| ------------- | -------- | -------------- | ------- | ------- | -------- | -------- | -------- | --------- |
| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 |
| **GLIP-T(C)** | | O365, GoldG | 84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 |
| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
| ------------- | -------- | ------------------- | ------- | ------- | -------- | -------- | -------- | --------- |
| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 |
| **GLIP-T(C)** | | O365, GoldG | 84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 |
| **GLIP-T** | | O365, GoldG, CC3M, SBU | 85.3 | 95.5 | 96.9 | 86.0 | 95.9 | 97.2 |
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

lang_model_name = 'bert-base-uncased'

model = dict(bbox_head=dict(early_fuse=True), )
model = dict(bbox_head=dict(early_fuse=True))

dataset_type = 'Flickr30kDataset'
data_root = 'data/flickr30k/'
data_root = 'data/flickr30k_entities/'

test_pipeline = [
dict(
Expand All @@ -27,15 +27,15 @@
dataset_Flickr30k_val = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
ann_file='final_flickr_separateGT_val.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)

dataset_Flickr30k_test = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
ann_file='final_flickr_separateGT_test.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'

dataset_type = 'Flickr30kDataset'
data_root = 'data/flickr30k/'
data_root = 'data/flickr30k_entities/'

test_pipeline = [
dict(
Expand All @@ -23,15 +23,15 @@
dataset_Flickr30k_val = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
ann_file='final_flickr_separateGT_val.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)

dataset_Flickr30k_test = dict(
type=dataset_type,
data_root=data_root,
ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
ann_file='final_flickr_separateGT_test.json',
data_prefix=dict(img='flickr30k_images/'),
pipeline=test_pipeline,
)
Expand Down
432 changes: 319 additions & 113 deletions configs/mm_grounding_dino/README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2
data_root = 'data/brain_tumor_v2/'
class_name = ('label0', 'label1', 'label2')
label_name = '_annotations.coco.json'

palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142)]

metainfo = dict(classes=class_name, palette=palette)
Expand Down Expand Up @@ -64,20 +66,20 @@
pipeline=train_pipeline,
return_classes=True,
data_prefix=dict(img='train/'),
ann_file='train/_annotations.coco.json')))
ann_file='train/' + label_name)))

val_dataloader = dict(
dataset=dict(
metainfo=metainfo,
data_root=data_root,
return_classes=True,
ann_file='valid/_annotations.coco.json',
ann_file='valid/' + label_name,
data_prefix=dict(img='valid/')))
test_dataloader = val_dataloader

val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'valid/_annotations.coco.json',
ann_file=data_root + 'valid/' + label_name,
metric='bbox',
format_only=False)
test_evaluator = val_evaluator
Expand Down Expand Up @@ -107,4 +109,4 @@

default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = ''
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,4 @@
train_cfg = dict(max_epochs=max_epochs, val_interval=1)
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = ''
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
custom_keys={
'absolute_pos_embed': dict(decay_mult=0.),
'backbone': dict(lr_mult=0.1),
# 'language_model': dict(lr_mult=0),
'language_model': dict(lr_mult=0.1),
}))

# learning policy
Expand All @@ -75,11 +75,11 @@
begin=0,
end=max_epochs,
by_epoch=True,
milestones=[11],
milestones=[8, 11],
gamma=0.1)
]
train_cfg = dict(max_epochs=max_epochs, val_interval=1)

default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = ''
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,20 @@
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'microwave', 'oven', 'toaster',
'refrigerator', 'book', 'clock', 'vase', 'toothbrush')
'refrigerator', 'book', 'clock', 'vase', 'toothbrush') # 48
novel_classes = ('airplane', 'bus', 'cat', 'dog', 'cow', 'elephant',
'umbrella', 'tie', 'snowboard', 'skateboard', 'cup', 'knife',
'cake', 'couch', 'keyboard', 'sink', 'scissors')
all_classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'kite', 'skateboard', 'surfboard',
'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza',
'donut', 'cake', 'chair', 'couch', 'bed', 'toilet', 'tv',
'laptop', 'mouse', 'remote', 'keyboard', 'microwave', 'oven',
'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'toothbrush')
'cake', 'couch', 'keyboard', 'sink', 'scissors') # 17
all_classes = (
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard',
'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut',
'cake', 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse',
'remote', 'keyboard', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'toothbrush') # 65

train_metainfo = dict(classes=base_classes)
test_metainfo = dict(
Expand Down Expand Up @@ -95,7 +94,7 @@
type='CocoDataset',
metainfo=train_metainfo,
data_root=data_root,
ann_file='zero-shot/instances_train2017_seen_2.json',
ann_file='annotations/instances_train2017_seen_2.json',
data_prefix=dict(img='train2017/'),
return_classes=True,
filter_cfg=dict(filter_empty_gt=False, min_size=32),
Expand All @@ -111,7 +110,7 @@
type='CocoDataset',
metainfo=test_metainfo,
data_root=data_root,
ann_file='zero-shot/instances_val2017_all_2.json',
ann_file='annotations/instances_val2017_all_2.json',
data_prefix=dict(img='val2017/'),
test_mode=True,
pipeline=test_pipeline,
Expand All @@ -121,7 +120,7 @@

val_evaluator = dict(
type='OVCocoMetric',
ann_file=data_root + 'zero-shot/instances_val2017_all_2.json',
ann_file=data_root + 'annotations/instances_val2017_all_2.json',
metric='bbox',
format_only=False)
test_evaluator = val_evaluator
Expand Down Expand Up @@ -155,4 +154,4 @@
checkpoint=dict(
max_keep_ckpts=1, save_best='coco/novel_ap50', rule='greater'))

load_from = 'epoch_30.pth'
load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth' # noqa
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Overrides applied on top of the Objects365 pre-training config: a new
# training pipeline, an ODVGDataset training set rooted at `data_root`, the
# optimizer wrapper and a 12-epoch step schedule. Weights are initialised from
# the checkpoint referenced by `load_from` at the bottom of the file.
_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'

# Every dataset-relative path below is resolved against this directory.
data_root = 'data/coco/'

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomChoice',
        transforms=[
            # Branch 1: multi-scale resize only.
            [
                dict(
                    type='RandomChoiceResize',
                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                            (736, 1333), (768, 1333), (800, 1333)],
                    keep_ratio=True)
            ],
            # Branch 2: coarse resize -> random crop -> multi-scale resize.
            [
                dict(
                    type='RandomChoiceResize',
                    # The aspect ratio of all images in the train dataset
                    # is < 7; follow the original implementation.
                    scales=[(400, 4200), (500, 4200), (600, 4200)],
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=True),
                dict(
                    type='RandomChoiceResize',
                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                            (736, 1333), (768, 1333), (800, 1333)],
                    keep_ratio=True)
            ]
        ]),
    dict(
        type='RandomSamplingNegPos',
        # Tokenizer name is taken from the inherited base config.
        tokenizer_name=_base_.lang_model_name,
        num_sample_negative=20,  # ======= important =====
        # Built from `data_root` so it stays in sync with the dataset's own
        # `label_map_file` below if the data directory is moved.
        label_map_file=data_root + 'annotations/coco2017_label_map.json',
        max_tokens=256),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'flip', 'flip_direction', 'text',
                   'custom_entities', 'tokens_positive', 'dataset_mode'))
]

train_dataloader = dict(
    dataset=dict(
        # `_delete_=True` discards the inherited dataset dict instead of
        # merging these keys into it.
        _delete_=True,
        type='ODVGDataset',
        need_text=False,
        data_root=data_root,
        ann_file='annotations/instances_train2017_od.json',
        label_map_file='annotations/coco2017_label_map.json',
        data_prefix=dict(img='train2017/'),
        return_classes=True,
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=train_pipeline))

optim_wrapper = dict(
    # Replace the inherited optimizer wrapper wholesale.
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.00005, weight_decay=0.0001),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    paramwise_cfg=dict(
        custom_keys={
            'absolute_pos_embed': dict(decay_mult=0.),
            'backbone': dict(lr_mult=0.1),
            # A learning-rate multiplier of 0 for parameters whose name
            # contains 'language_model'.
            'language_model': dict(lr_mult=0.0),
        }))

# learning policy
max_epochs = 12
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=max_epochs,
        by_epoch=True,
        # Learning rate is scaled by `gamma` at each milestone epoch.
        milestones=[8, 11],
        gamma=0.1)
]
train_cfg = dict(max_epochs=max_epochs, val_interval=1)

# Keep a single checkpoint on disk and track the best one automatically.
default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
Loading