From 4b5b99773a11eef8ef06d34ed3b77f14adbf4542 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 29 Mar 2022 12:11:24 -0500 Subject: [PATCH 01/49] Mask2Former/MaskFormer instance only training/eval --- .../mask2former_r50_lsj_8x2_50e_coco_ins.py | 236 ++++++++++++++++ .../mask2former_r50_lsj_8x2_50e_coco_pan.py | 253 ++++++++++++++++++ ...r_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py | 62 +++++ ...r_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py | 62 +++++ mmdet/models/dense_heads/maskformer_head.py | 18 +- mmdet/models/detectors/maskformer.py | 40 +-- mmdet/models/utils/panoptic_gt_processing.py | 18 +- 7 files changed, 659 insertions(+), 30 deletions(-) create mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py create mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py create mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py create mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py new file mode 100644 index 00000000000..2e894e776c3 --- /dev/null +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -0,0 +1,236 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', '../_base_/default_runtime.py' +] +num_things_classes = 80 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + 
feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=False, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . + filter_low_score=True), + init_cfg=None) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) + +max_iters = 368750 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + 
dict(type='TensorboardLoggerHook', by_epoch=False) + ]) +interval = 5000 +workflow = [('train', interval)] +checkpoint_config = dict( + by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) + +# Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['bbox', 'segm']) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py new file mode 100644 index 00000000000..2c23625e139 --- /dev/null +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py @@ -0,0 +1,253 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + 
type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . + filter_low_score=True), + init_cfg=None) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data_root = 'data/coco/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + ), + test=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + )) + +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + gamma=0.1, + by_epoch=False, + step=[327778, 355092], + warmup='linear', + warmup_by_epoch=False, + warmup_ratio=1.0, # no warmup + warmup_iters=10) + +max_iters = 368750 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook', by_epoch=False), + dict(type='TensorboardLoggerHook', 
by_epoch=False) + ]) +interval = 5000 +workflow = [('train', interval)] +checkpoint_config = dict( + by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) + +# Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py new file mode 100644 index 00000000000..feeeb02313c --- /dev/null +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py @@ -0,0 +1,62 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_ins.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py new file mode 100644 index 00000000000..45054b41ff9 --- /dev/null +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py @@ -0,0 +1,62 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_pan.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + 
qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 4541e018c0d..3ae46002464 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -134,7 +134,8 @@ def init_weights(self): if p.dim() > 1: nn.init.xavier_uniform_(p) - def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): + def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs, + img_metas): """Preprocess the ground truth for all images. Args: @@ -143,13 +144,12 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): gt_masks_list (list[BitmapMasks]): Each is ground truth masks of each instances of a image, shape (num_gts, h, w). - gt_semantic_seg (Tensor): Ground truth of semantic + gt_semantic_seg (Tensor | None): Ground truth of semantic segmentation with the shape (batch_size, n, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, 255 means VOID. - target_shape (tuple[int]): Shape of output mask_preds. - Resize the masks to shape of mask_preds. + img_metas (list[dict]): List of image meta information. Returns: tuple: a tuple containing the following targets. @@ -161,10 +161,12 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs): """ num_things_list = [self.num_things_classes] * len(gt_labels_list) num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) + if gt_semantic_segs == None: + gt_semantic_segs = [None] * len(gt_labels_list) targets = multi_apply(preprocess_panoptic_gt, gt_labels_list, gt_masks_list, gt_semantic_segs, num_things_list, - num_stuff_list) + num_stuff_list, img_metas) labels, masks = targets return labels, masks @@ -494,8 +496,8 @@ def forward_train(self, each box, shape (num_gts,). gt_masks (list[BitmapMasks]): Each element is masks of instances of a image, shape (num_gts, h, w). - gt_semantic_seg (list[tensor]):Each element is the ground truth - of semantic segmentation with the shape (N, H, W). 
+ gt_semantic_seg (list[tensor] | None): Each element is the ground + truth of semantic segmentation with the shape (N, H, W). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, 255 means VOID. @@ -513,7 +515,7 @@ def forward_train(self, # preprocess ground truth gt_labels, gt_masks = self.preprocess_gt(gt_labels, gt_masks, - gt_semantic_seg) + gt_semantic_seg, img_metas) # loss losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks, diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index b626e070813..d83b130dfad 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -43,6 +43,9 @@ def __init__(self, self.train_cfg = train_cfg self.test_cfg = test_cfg + if self.num_stuff_classes > 0: + self.show_result = self._show_pan_result + def forward_dummy(self, img, img_metas): """Used for computing network flops. See `mmdetection/tools/analysis_tools/get_flops.py` @@ -67,7 +70,7 @@ def forward_train(self, gt_bboxes, gt_labels, gt_masks, - gt_semantic_seg, + gt_semantic_seg=None, gt_bboxes_ignore=None, **kargs): """ @@ -86,6 +89,7 @@ def forward_train(self, used if the architecture supports a segmentation task. gt_semantic_seg (list[tensor]): semantic segmentation mask for images. + Defaults to None. gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. Defaults to None. @@ -126,6 +130,9 @@ def simple_test(self, imgs, img_metas, **kwargs): }, ... ] + + | list(tuple): Formatted bbox and mask results of multiple \ + images when no panoptic segmentation classes exist. """ feats = self.extract_feat(imgs) mask_cls_results, mask_pred_results = self.panoptic_head.simple_test( @@ -151,6 +158,9 @@ def simple_test(self, imgs, img_metas, **kwargs): assert 'sem_results' not in results[i], 'segmantic segmentation '\ 'results are not supported yet.' + if self.num_stuff_classes == 0: + results = [res['ins_results'] for res in results] + return results def aug_test(self, imgs, img_metas, **kwargs): @@ -159,20 +169,20 @@ def aug_test(self, imgs, img_metas, **kwargs): def onnx_export(self, img, img_metas): raise NotImplementedError - def show_result(self, - img, - result, - score_thr=0.3, - bbox_color=(72, 101, 241), - text_color=(72, 101, 241), - mask_color=None, - thickness=2, - font_size=13, - win_name='', - show=False, - wait_time=0, - out_file=None): - """Draw `result` over `img`. + def _show_pan_result(self, + img, + result, + score_thr=0.3, + bbox_color=(72, 101, 241), + text_color=(72, 101, 241), + mask_color=None, + thickness=2, + font_size=13, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `panoptic result` over `img`. Args: img (str or Tensor): The image to be displayed. diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py index 513f644945c..711905a5fc6 100644 --- a/mmdet/models/utils/panoptic_gt_processing.py +++ b/mmdet/models/utils/panoptic_gt_processing.py @@ -3,7 +3,7 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, - num_stuff): + num_stuff, img_metas): """Preprocess the ground truth for a image. Args: @@ -11,13 +11,12 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, with shape (num_gts, ). gt_masks (BitmapMasks): Ground truth masks of each instances of a image, shape (num_gts, h, w). 
- gt_semantic_seg (Tensor): Ground truth of semantic + gt_semantic_seg (Tensor | None): Ground truth of semantic segmentation with the shape (1, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, 255 means VOID. - target_shape (tuple[int]): Shape of output mask_preds. - Resize the masks to shape of mask_preds. + img_metas (dict): List of image meta information. Returns: tuple: a tuple containing the following targets. @@ -29,12 +28,17 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, shape (n, h, w). """ num_classes = num_things + num_stuff - things_labels = gt_labels - gt_semantic_seg = gt_semantic_seg.squeeze(0) - things_masks = gt_masks.pad(gt_semantic_seg.shape[-2:], pad_val=0)\ + things_masks = gt_masks.pad(img_metas['pad_shape'][:2], pad_val=0)\ .to_tensor(dtype=torch.bool, device=gt_labels.device) + if gt_semantic_seg == None: + masks = things_masks.long() + return gt_labels, masks + + things_labels = gt_labels + gt_semantic_seg = gt_semantic_seg.squeeze(0) + semantic_labels = torch.unique( gt_semantic_seg, sorted=False, From b1e03bbd992ef6a1f82a8a5089028e3a3814cb2f Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 29 Mar 2022 12:28:45 -0500 Subject: [PATCH 02/49] obsolete config names --- .../mask2former_r50_lsj_8x2_50e_coco.py | 253 ------------------ ...ormer_swin-t-p4-w7-224_lsj_8x2_50e_coco.py | 62 ----- 2 files changed, 315 deletions(-) delete mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py delete mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py deleted file mode 100644 index 2c23625e139..00000000000 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,253 +0,0 @@ -_base_ = [ - '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' -] -num_things_classes = 80 -num_stuff_classes = 53 -num_classes = num_things_classes + num_stuff_classes -model = dict( - type='Mask2Former', - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), - panoptic_head=dict( - type='Mask2FormerHead', - in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside - strides=[4, 8, 16, 32], - feat_channels=256, - out_channels=256, - num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - num_queries=100, - num_transformer_feat_level=3, - pixel_decoder=dict( - type='MSDeformAttnPixelDecoder', - num_outs=3, - norm_cfg=dict(type='GN', num_groups=32), - act_cfg=dict(type='ReLU'), - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', - embed_dims=256, - num_heads=8, - num_levels=3, - num_points=4, - im2col_step=64, - dropout=0.0, - batch_first=False, - norm_cfg=None, - init_cfg=None), - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0.0, - act_cfg=dict(type='ReLU', inplace=True)), - operation_order=('self_attn', 'norm', 'ffn', 'norm')), - init_cfg=None), - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - init_cfg=None), - enforce_decoder_input_project=False, - 
positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - transformer_decoder=dict( - type='DetrTransformerDecoder', - return_intermediate=True, - num_layers=9, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - attn_drop=0.0, - proj_drop=0.0, - dropout_layer=None, - batch_first=False), - ffn_cfgs=dict( - embed_dims=256, - feedforward_channels=2048, - num_fcs=2, - act_cfg=dict(type='ReLU', inplace=True), - ffn_drop=0.0, - dropout_layer=None, - add_identity=True), - feedforward_channels=2048, - operation_order=('cross_attn', 'norm', 'self_attn', 'norm', - 'ffn', 'norm')), - init_cfg=None), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=2.0, - reduction='mean', - class_weight=[1.0] * num_classes + [0.1]), - loss_mask=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - reduction='mean', - loss_weight=5.0), - loss_dice=dict( - type='DiceLoss', - use_sigmoid=True, - activate=True, - reduction='mean', - naive_dice=True, - eps=1.0, - loss_weight=5.0)), - panoptic_fusion_head=dict( - type='MaskFormerFusionHead', - num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - loss_panoptic=None, - init_cfg=None), - train_cfg=dict( - num_points=12544, - oversample_ratio=3.0, - importance_sample_ratio=0.75, - assigner=dict( - type='MaskHungarianAssigner', - cls_cost=dict(type='ClassificationCost', weight=2.0), - mask_cost=dict( - type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), - dice_cost=dict( - type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), - sampler=dict(type='MaskPseudoSampler')), - test_cfg=dict( - panoptic_on=True, - # For now, the dataset does not support - # evaluating semantic segmentation metric. - semantic_on=False, - instance_on=True, - # max_per_image is for instance segmentation. - max_per_image=100, - iou_thr=0.8, - # In Mask2Former's panoptic postprocessing, - # it will filter mask area where score is less than 0.5 . 
- filter_low_score=True), - init_cfg=None) - -# dataset settings -image_size = (1024, 1024) -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile', to_float32=True), - dict( - type='LoadPanopticAnnotations', - with_bbox=True, - with_mask=True, - with_seg=True), - dict(type='RandomFlip', flip_ratio=0.5), - # large scale jittering - dict( - type='Resize', - img_scale=image_size, - ratio_range=(0.1, 2.0), - multiscale_mode='range', - keep_ratio=True), - dict( - type='RandomCrop', - crop_size=image_size, - crop_type='absolute', - recompute_bbox=True, - allow_negative_crop=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=image_size), - dict(type='DefaultFormatBundle', img_to_float=True), - dict( - type='Collect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 800), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data_root = 'data/coco/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict(pipeline=train_pipeline), - val=dict( - pipeline=test_pipeline, - ins_ann_file=data_root + 'annotations/instances_val2017.json', - ), - test=dict( - pipeline=test_pipeline, - ins_ann_file=data_root + 'annotations/instances_val2017.json', - )) - -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict( - custom_keys={ - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi, - }, - norm_decay_mult=0.0)) -optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) - -# learning policy -lr_config = dict( - policy='step', - gamma=0.1, - by_epoch=False, - step=[327778, 355092], - warmup='linear', - warmup_by_epoch=False, - warmup_ratio=1.0, # no warmup - warmup_iters=10) - -max_iters = 368750 -runner = dict(type='IterBasedRunner', max_iters=max_iters) - -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook', by_epoch=False), - dict(type='TensorboardLoggerHook', by_epoch=False) - ]) -interval = 5000 -workflow = [('train', interval)] -checkpoint_config = dict( - by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) - -# Before 365001th iteration, we do evaluation every 5000 iterations. -# After 365000th iteration, we do evaluation every 368750 iterations, -# which means that we do evaluation at the end of training. 
-dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] -evaluation = dict( - interval=interval, - dynamic_intervals=dynamic_intervals, - metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py deleted file mode 100644 index 70e3103e482..00000000000 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa - -depths = [2, 2, 6, 2] -model = dict( - type='Mask2Former', - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=96, - depths=depths, - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(0, 1, 2, 3), - with_cp=False, - convert_weights=True, - frozen_stages=-1, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict( - type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), - init_cfg=None) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to decay_multi=0.0 -backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) -}) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) From 9f6de5f1fdfef49911def1e62f47f203fb10daaa Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 29 Mar 2022 13:47:22 -0500 Subject: [PATCH 03/49] if cond is None fix --- mmdet/models/dense_heads/maskformer_head.py | 2 +- mmdet/models/utils/panoptic_gt_processing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 3ae46002464..591075185be 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -161,7 +161,7 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs, """ num_things_list = [self.num_things_classes] * len(gt_labels_list) num_stuff_list = [self.num_stuff_classes] * len(gt_labels_list) - if gt_semantic_segs == None: + if gt_semantic_segs is None: gt_semantic_segs = [None] * len(gt_labels_list) targets = multi_apply(preprocess_panoptic_gt, gt_labels_list, diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py index 711905a5fc6..94c1f5baeaf 100644 --- 
a/mmdet/models/utils/panoptic_gt_processing.py +++ b/mmdet/models/utils/panoptic_gt_processing.py @@ -32,7 +32,7 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, things_masks = gt_masks.pad(img_metas['pad_shape'][:2], pad_val=0)\ .to_tensor(dtype=torch.bool, device=gt_labels.device) - if gt_semantic_seg == None: + if gt_semantic_seg is None: masks = things_masks.long() return gt_labels, masks From fb92360f6c695eb1fa3b3431913a651851c04a8a Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 29 Mar 2022 13:59:24 -0500 Subject: [PATCH 04/49] white space --- mmdet/models/dense_heads/maskformer_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 591075185be..4ce2d13420d 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -496,7 +496,7 @@ def forward_train(self, each box, shape (num_gts,). gt_masks (list[BitmapMasks]): Each element is masks of instances of a image, shape (num_gts, h, w). - gt_semantic_seg (list[tensor] | None): Each element is the ground + gt_semantic_seg (list[tensor] | None): Each element is the ground truth of semantic segmentation with the shape (N, H, W). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, From 1082de97c366ffa622abcd017a21d1e592b0e125 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 29 Mar 2022 15:10:33 -0500 Subject: [PATCH 05/49] fix tests --- .pre-commit-config.yaml | 10 +++++----- .../test_dense_heads/test_mask2former_head.py | 2 ++ .../test_dense_heads/test_maskformer_head.py | 2 ++ tests/test_models/test_forward.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 82dd58c69c0..3f939a339a5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - - repo: https://github.com/PyCQA/flake8 - rev: 3.8.3 + - repo: https://gitlab.com/pycqa/flake8.git + rev: 3.9.2 hooks: - id: flake8 - repo: https://github.com/PyCQA/isort @@ -8,11 +8,11 @@ repos: hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.30.0 + rev: v0.32.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.1.0 + rev: v4.1.0 hooks: - id: trailing-whitespace - id: check-yaml @@ -38,7 +38,7 @@ repos: - mdformat_frontmatter - linkify-it-py - repo: https://github.com/myint/docformatter - rev: v1.3.1 + rev: v1.4 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py index 66d144301b2..a04c91c2921 100644 --- a/tests/test_models/test_dense_heads/test_mask2former_head.py +++ b/tests/test_models/test_dense_heads/test_mask2former_head.py @@ -11,10 +11,12 @@ def test_mask2former_head_loss(): base_channels = 64 img_metas = [{ 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (126, 160, 3), 'ori_shape': (63, 80, 3) }, { 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (120, 160, 3), 'ori_shape': (60, 80, 3) }] diff --git a/tests/test_models/test_dense_heads/test_maskformer_head.py b/tests/test_models/test_dense_heads/test_maskformer_head.py index f9cf3b2326f..c9bebee3774 100644 --- a/tests/test_models/test_dense_heads/test_maskformer_head.py +++ b/tests/test_models/test_dense_heads/test_maskformer_head.py @@ 
-12,10 +12,12 @@ def test_maskformer_head_loss(): # batch_input_shape = (128, 160) img_metas = [{ 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (126, 160, 3), 'ori_shape': (63, 80, 3) }, { 'batch_input_shape': (128, 160), + 'pad_shape': (128, 160, 3), 'img_shape': (120, 160, 3), 'ori_shape': (60, 80, 3) }] diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 77acc9c1aaf..08278208496 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -813,7 +813,7 @@ def test_maskformer_forward(): def test_mask2former_forward(): model_cfg = _get_detector_cfg( - 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py') + 'mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py') base_channels = 32 model_cfg.backbone.depth = 18 model_cfg.backbone.init_cfg = None From db1016199399f639e769857259771b88f038e25b Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 29 Mar 2022 16:20:34 -0500 Subject: [PATCH 06/49] yapf formatting fix --- configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index 2e894e776c3..166f1adce36 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -163,9 +163,7 @@ dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=image_size), dict(type='DefaultFormatBundle', img_to_float=True), - dict( - type='Collect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), From e50928122111614e4cfa134ec183262cf515b9d5 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 10:36:33 -0500 Subject: [PATCH 07/49] semantic_seg None docstring --- mmdet/models/dense_heads/maskformer_head.py | 4 ++-- mmdet/models/detectors/maskformer.py | 4 ++-- mmdet/models/utils/panoptic_gt_processing.py | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index 4ce2d13420d..c7d1753f49b 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -148,7 +148,7 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs, segmentation with the shape (batch_size, n, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. + 255 means VOID. Is None when training instance segmentation. img_metas (list[dict]): List of image meta information. Returns: @@ -500,7 +500,7 @@ def forward_train(self, truth of semantic segmentation with the shape (N, H, W). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. + 255 means VOID. Is None when training instance segmentation. gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored. Defaults to None. diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index d83b130dfad..d80984b1b99 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -88,8 +88,8 @@ def forward_train(self, gt_masks (list[BitmapMasks]): true segmentation masks for each box used if the architecture supports a segmentation task. 
gt_semantic_seg (list[tensor]): semantic segmentation mask for - images. - Defaults to None. + images for panoptic segmentation. + Defaults to None for instance segmentation. gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. Defaults to None. diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py index 94c1f5baeaf..0dda06a477c 100644 --- a/mmdet/models/utils/panoptic_gt_processing.py +++ b/mmdet/models/utils/panoptic_gt_processing.py @@ -15,7 +15,7 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, segmentation with the shape (1, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. + 255 means VOID. Is None when training instance segmentation. img_metas (dict): List of image meta information. Returns: @@ -25,7 +25,9 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, image, with shape (n, ), n is the sum of number of stuff type and number of instance in a image. - masks (Tensor): Ground truth mask for a image, with - shape (n, h, w). + shape (n, h, w). Contains stuff and things when training + panoptic segmentation, and things only when training + instance segmentation. """ num_classes = num_things + num_stuff From ea8014590fdafe3ca99fe893257aa01526268085 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 10:54:14 -0500 Subject: [PATCH 08/49] original config names --- ..._8x2_50e_coco_pan.py => mask2former_r50_lsj_8x2_50e_coco.py} | 0 ..._pan.py => mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py} | 2 +- tests/test_models/test_forward.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename configs/mask2former/{mask2former_r50_lsj_8x2_50e_coco_pan.py => mask2former_r50_lsj_8x2_50e_coco.py} (100%) rename configs/mask2former/{mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py => mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py} (97%) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py similarity index 100% rename from configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py rename to configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py similarity index 97% rename from configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py rename to configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py index 45054b41ff9..70e3103e482 100644 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_pan.py +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_pan.py'] +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa depths = [2, 2, 6, 2] diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 08278208496..77acc9c1aaf 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -813,7 +813,7 @@ def test_maskformer_forward(): def test_mask2former_forward(): model_cfg = _get_detector_cfg( - 'mask2former/mask2former_r50_lsj_8x2_50e_coco_pan.py') + 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py') base_channels = 32 
model_cfg.backbone.depth = 18 model_cfg.backbone.init_cfg = None From 5ff7bb6a827ad0bb59a59f31c354ef9d03d61a22 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 12:06:25 -0500 Subject: [PATCH 09/49] pan/ins unit test --- tests/test_models/test_forward.py | 62 +++++++++++++++++++------------ 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 77acc9c1aaf..5f74902b31e 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -812,8 +812,13 @@ def test_maskformer_forward(): def test_mask2former_forward(): - model_cfg = _get_detector_cfg( - 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py') + # Test Panoptic Segmentation and Instance Segmentation + _mask2former_forward('mask2former/mask2former_r50_lsj_8x2_50e_coco.py') + _mask2former_forward('mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py') + + +def _mask2former_forward(config): + model_cfg = _get_detector_cfg(config) base_channels = 32 model_cfg.backbone.depth = 18 model_cfg.backbone.init_cfg = None @@ -842,10 +847,25 @@ def test_mask2former_forward(): model_cfg.panoptic_head.transformer_decoder.\ transformerlayers.feedforward_channels = base_channels * 8 + num_stuff_classes = model_cfg.panoptic_head.num_stuff_classes + from mmdet.core import BitmapMasks from mmdet.models import build_detector detector = build_detector(model_cfg) + def _forward_train(): + losses = detector.forward( + img, + img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_masks=gt_masks, + gt_semantic_seg=gt_semantic_seg, + return_loss=True) + assert isinstance(losses, dict) + loss, _ = detector._parse_losses(losses) + assert float(loss.item()) > 0 + # Test forward train with non-empty truth batch detector.train() img_metas = [ @@ -872,17 +892,11 @@ def test_mask2former_forward(): gt_semantic_seg = [ stuff_mask1, ] - losses = detector.forward( - img=img, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_masks=gt_masks, - gt_semantic_seg=gt_semantic_seg, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = detector._parse_losses(losses) - assert float(loss.item()) > 0 + _forward_train() + + # Test forward train with non-empty truth batch and gt_semantic_seg=None + gt_semantic_seg = None + _forward_train() # Test forward train with an empty truth batch gt_bboxes = [ @@ -898,17 +912,11 @@ def test_mask2former_forward(): gt_semantic_seg = [ torch.randint(0, 133, (0, 128, 160)), ] - losses = detector.forward( - img, - img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_masks=gt_masks, - gt_semantic_seg=gt_semantic_seg, - return_loss=True) - assert isinstance(losses, dict) - loss, _ = detector._parse_losses(losses) - assert float(loss.item()) > 0 + _forward_train() + + # Test forward train with an empty truth batch and gt_semantic_seg=None + gt_semantic_seg = None + _forward_train() # Test forward test detector.eval() @@ -919,4 +927,10 @@ def test_mask2former_forward(): result = detector.forward([one_img], [[one_meta]], rescale=True, return_loss=False) + + if num_stuff_classes > 0: + assert isinstance(result[0], dict) + else: + assert isinstance(result[0], tuple) + batch_results.append(result) From d06bf2eeb1d629819686688d84653fa2774a0e9e Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 12:44:03 -0500 Subject: [PATCH 10/49] show_result comment --- mmdet/models/detectors/maskformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mmdet/models/detectors/maskformer.py 
b/mmdet/models/detectors/maskformer.py index d80984b1b99..6ed00b95c1d 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -43,6 +43,7 @@ def __init__(self, self.train_cfg = train_cfg self.test_cfg = test_cfg + # SingleStageDetector.show_result default for instance segmentation if self.num_stuff_classes > 0: self.show_result = self._show_pan_result From 066d2b7db016752edc736bd44a62ce629f5b4123 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 14:05:18 -0500 Subject: [PATCH 11/49] pan/ins head unit test --- .../test_dense_heads/test_mask2former_head.py | 163 ++++++++++-------- 1 file changed, 94 insertions(+), 69 deletions(-) diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py index a04c91c2921..25c29379e28 100644 --- a/tests/test_models/test_dense_heads/test_mask2former_head.py +++ b/tests/test_models/test_dense_heads/test_mask2former_head.py @@ -7,8 +7,19 @@ def test_mask2former_head_loss(): - """Tests head loss when truth is empty and non-empty.""" - base_channels = 64 + """Tests head loss when truth is empty and non-empty. + + Tests head loss as Panoptic Segmentation and Instance Segmentation. Tests + forward_train and simple_test with masks and None as gt_semantic_seg + """ + self = _init_model(num_stuff_classes=53) + _mask2former_head_loss(self, label_num=100) + + self = _init_model(num_stuff_classes=0) + _mask2former_head_loss(self, label_num=80) + + +def _mask2former_head_loss(self, label_num): img_metas = [{ 'batch_input_shape': (128, 160), 'pad_shape': (128, 160, 3), @@ -24,8 +35,87 @@ def test_mask2former_head_loss(): torch.rand((2, 64 * 2**i, 4 * 2**(3 - i), 5 * 2**(3 - i))) for i in range(4) ] + all_cls_scores, all_mask_preds = self.forward(feats, img_metas) + # Test that empty ground truth encourages the network to predict background + gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])] + gt_masks_list = [ + torch.zeros((0, 128, 160)).long(), + torch.zeros((0, 128, 160)).long() + ] + + empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas) + # When there is no truth, the cls loss should be nonzero but there should + # be no mask loss. 
+ for key, loss in empty_gt_losses.items(): + if 'cls' in key: + assert loss.item() > 0, 'cls loss should be non-zero' + elif 'mask' in key: + assert loss.item( + ) == 0, 'there should be no mask loss when there are no true mask' + elif 'dice' in key: + assert loss.item( + ) == 0, 'there should be no dice loss when there are no true mask' + + # when truth is non-empty then both cls, mask, dice loss should be nonzero + # random inputs + gt_labels_list = [ + torch.tensor([10, label_num]).long(), + torch.tensor([label_num, 10]).long() + ] + mask1 = torch.zeros((2, 128, 160)).long() + mask1[0, :50] = 1 + mask1[1, 50:] = 1 + mask2 = torch.zeros((2, 128, 160)).long() + mask2[0, :, :50] = 1 + mask2[1, :, 50:] = 1 + gt_masks_list = [mask1, mask2] + two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, + gt_masks_list, img_metas) + for loss in two_gt_losses.values(): + assert loss.item() > 0, 'all loss should be non-zero' + + # test forward_train + gt_bboxes = None + gt_labels = [ + torch.tensor([10]).long(), + torch.tensor([10]).long(), + ] + thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask1[0, :50] = 1 + thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32) + thing_mask2[0, :, 50:] = 1 + gt_masks = [ + BitmapMasks(thing_mask1, 128, 160), + BitmapMasks(thing_mask2, 128, 160), + ] + stuff_mask1 = torch.zeros((1, 128, 160)).long() + stuff_mask1[0, :50] = 10 + stuff_mask1[0, 50:] = 100 + stuff_mask2 = torch.zeros((1, 128, 160)).long() + stuff_mask2[0, :, 50:] = 10 + stuff_mask2[0, :, :50] = 100 + gt_semantic_seg = [stuff_mask1, stuff_mask2] + + self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, + gt_semantic_seg) + + # test inference mode + self.simple_test(feats, img_metas) + + # test when gt_semantic_seg is None + gt_semantic_seg = None + + self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, + gt_semantic_seg) + + # test inference mode + self.simple_test(feats, img_metas) + + +def _init_model(num_stuff_classes): + base_channels = 64 num_things_classes = 80 - num_stuff_classes = 53 num_classes = num_things_classes + num_stuff_classes config = ConfigDict( dict( @@ -149,70 +239,5 @@ def test_mask2former_head_loss(): iou_thr=0.8))) self = Mask2FormerHead(**config) self.init_weights() - all_cls_scores, all_mask_preds = self.forward(feats, img_metas) - # Test that empty ground truth encourages the network to predict background - gt_labels_list = [torch.LongTensor([]), torch.LongTensor([])] - gt_masks_list = [ - torch.zeros((0, 128, 160)).long(), - torch.zeros((0, 128, 160)).long() - ] - - empty_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, - gt_masks_list, img_metas) - # When there is no truth, the cls loss should be nonzero but there should - # be no mask loss. 
- for key, loss in empty_gt_losses.items(): - if 'cls' in key: - assert loss.item() > 0, 'cls loss should be non-zero' - elif 'mask' in key: - assert loss.item( - ) == 0, 'there should be no mask loss when there are no true mask' - elif 'dice' in key: - assert loss.item( - ) == 0, 'there should be no dice loss when there are no true mask' - - # when truth is non-empty then both cls, mask, dice loss should be nonzero - # random inputs - gt_labels_list = [ - torch.tensor([10, 100]).long(), - torch.tensor([100, 10]).long() - ] - mask1 = torch.zeros((2, 128, 160)).long() - mask1[0, :50] = 1 - mask1[1, 50:] = 1 - mask2 = torch.zeros((2, 128, 160)).long() - mask2[0, :, :50] = 1 - mask2[1, :, 50:] = 1 - gt_masks_list = [mask1, mask2] - two_gt_losses = self.loss(all_cls_scores, all_mask_preds, gt_labels_list, - gt_masks_list, img_metas) - for loss in two_gt_losses.values(): - assert loss.item() > 0, 'all loss should be non-zero' - - # test forward_train - gt_bboxes = None - gt_labels = [ - torch.tensor([10]).long(), - torch.tensor([10]).long(), - ] - thing_mask1 = np.zeros((1, 128, 160), dtype=np.int32) - thing_mask1[0, :50] = 1 - thing_mask2 = np.zeros((1, 128, 160), dtype=np.int32) - thing_mask2[0, :, 50:] = 1 - gt_masks = [ - BitmapMasks(thing_mask1, 128, 160), - BitmapMasks(thing_mask2, 128, 160), - ] - stuff_mask1 = torch.zeros((1, 128, 160)).long() - stuff_mask1[0, :50] = 10 - stuff_mask1[0, 50:] = 100 - stuff_mask2 = torch.zeros((1, 128, 160)).long() - stuff_mask2[0, :, 50:] = 10 - stuff_mask2[0, :, :50] = 100 - gt_semantic_seg = [stuff_mask1, stuff_mask2] - self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, - gt_semantic_seg) - - # test inference mode - self.simple_test(feats, img_metas) + return self From acca83a59ec39e8861fee4ff9a5144f06c5e45fa Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 14:09:03 -0500 Subject: [PATCH 12/49] redundant test --- tests/test_models/test_dense_heads/test_mask2former_head.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py index 25c29379e28..5088b3fc136 100644 --- a/tests/test_models/test_dense_heads/test_mask2former_head.py +++ b/tests/test_models/test_dense_heads/test_mask2former_head.py @@ -100,12 +100,8 @@ def _mask2former_head_loss(self, label_num): self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, gt_semantic_seg) - # test inference mode - self.simple_test(feats, img_metas) - # test when gt_semantic_seg is None gt_semantic_seg = None - self.forward_train(feats, img_metas, gt_bboxes, gt_labels, gt_masks, gt_semantic_seg) From c755f4fdf6d9bb7896518ae5b181856027b6bb36 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 16:14:52 -0500 Subject: [PATCH 13/49] inherit configs --- .../mask2former_r50_lsj_8x2_50e_coco_ins.py | 209 +++--------------- ...r_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py | 1 - 2 files changed, 26 insertions(+), 184 deletions(-) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index 166f1adce36..e2cd430fb0b 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -1,143 +1,16 @@ -_base_ = [ - '../_base_/datasets/coco_instance.py', '../_base_/default_runtime.py' -] +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] num_things_classes = 80 num_stuff_classes = 0 num_classes = 
num_things_classes + num_stuff_classes model = dict( - type='Mask2Former', - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), panoptic_head=dict( - type='Mask2FormerHead', - in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside - strides=[4, 8, 16, 32], - feat_channels=256, - out_channels=256, num_things_classes=num_things_classes, num_stuff_classes=num_stuff_classes, - num_queries=100, - num_transformer_feat_level=3, - pixel_decoder=dict( - type='MSDeformAttnPixelDecoder', - num_outs=3, - norm_cfg=dict(type='GN', num_groups=32), - act_cfg=dict(type='ReLU'), - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', - embed_dims=256, - num_heads=8, - num_levels=3, - num_points=4, - im2col_step=64, - dropout=0.0, - batch_first=False, - norm_cfg=None, - init_cfg=None), - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0.0, - act_cfg=dict(type='ReLU', inplace=True)), - operation_order=('self_attn', 'norm', 'ffn', 'norm')), - init_cfg=None), - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - init_cfg=None), - enforce_decoder_input_project=False, - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - transformer_decoder=dict( - type='DetrTransformerDecoder', - return_intermediate=True, - num_layers=9, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - attn_drop=0.0, - proj_drop=0.0, - dropout_layer=None, - batch_first=False), - ffn_cfgs=dict( - embed_dims=256, - feedforward_channels=2048, - num_fcs=2, - act_cfg=dict(type='ReLU', inplace=True), - ffn_drop=0.0, - dropout_layer=None, - add_identity=True), - feedforward_channels=2048, - operation_order=('cross_attn', 'norm', 'self_attn', 'norm', - 'ffn', 'norm')), - init_cfg=None), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=2.0, - reduction='mean', - class_weight=[1.0] * num_classes + [0.1]), - loss_mask=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - reduction='mean', - loss_weight=5.0), - loss_dice=dict( - type='DiceLoss', - use_sigmoid=True, - activate=True, - reduction='mean', - naive_dice=True, - eps=1.0, - loss_weight=5.0)), + loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])), panoptic_fusion_head=dict( - type='MaskFormerFusionHead', num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - loss_panoptic=None, - init_cfg=None), - train_cfg=dict( - num_points=12544, - oversample_ratio=3.0, - importance_sample_ratio=0.75, - assigner=dict( - type='MaskHungarianAssigner', - cls_cost=dict(type='ClassificationCost', weight=2.0), - mask_cost=dict( - type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), - dice_cost=dict( - type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), - sampler=dict(type='MaskPseudoSampler')), - test_cfg=dict( - panoptic_on=False, - # For now, the dataset does not support - # evaluating semantic segmentation metric. - semantic_on=False, - instance_on=True, - # max_per_image is for instance segmentation. 
- max_per_image=100, - iou_thr=0.8, - # In Mask2Former's panoptic postprocessing, - # it will filter mask area where score is less than 0.5 . - filter_low_score=True), - init_cfg=None) + num_stuff_classes=num_stuff_classes), + test_cfg=dict(panoptic_on=False)) # dataset settings image_size = (1024, 1024) @@ -180,55 +53,25 @@ dict(type='Collect', keys=['img']), ]) ] - -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict( - custom_keys={ - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi, - }, - norm_decay_mult=0.0)) -optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) - -# learning policy -lr_config = dict( - policy='step', - gamma=0.1, - by_epoch=False, - step=[327778, 355092], - warmup='linear', - warmup_by_epoch=False, - warmup_ratio=1.0, # no warmup - warmup_iters=10) - -max_iters = 368750 -runner = dict(type='IterBasedRunner', max_iters=max_iters) - -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook', by_epoch=False), - dict(type='TensorboardLoggerHook', by_epoch=False) - ]) -interval = 5000 -workflow = [('train', interval)] -checkpoint_config = dict( - by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) - -# Before 365001th iteration, we do evaluation every 5000 iterations. -# After 365000th iteration, we do evaluation every 368750 iterations, -# which means that we do evaluation at the end of training. -dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] -evaluation = dict( - interval=interval, - dynamic_intervals=dynamic_intervals, - metric=['bbox', 'segm']) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +data = dict( + _delete_=True, + samples_per_gpu=1, + workers_per_gpu=1, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py index feeeb02313c..66d69dd7698 100644 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py @@ -1,6 +1,5 @@ _base_ = ['./mask2former_r50_lsj_8x2_50e_coco_ins.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa - depths = [2, 2, 6, 2] model = dict( type='Mask2Former', From 99dfe4a25f4acd5c894acf55d77ad2d73046da25 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 31 Mar 2022 16:25:46 -0500 Subject: [PATCH 14/49] correct gpu # --- configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index e2cd430fb0b..a3ddc6abfe3 100644 --- 
a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -57,8 +57,8 @@ data_root = 'data/coco/' data = dict( _delete_=True, - samples_per_gpu=1, - workers_per_gpu=1, + samples_per_gpu=2, + workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', From 8df82f049488c2d44b4d74261743728c765598a6 Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Fri, 1 Apr 2022 07:51:26 -0500 Subject: [PATCH 15/49] revert version --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f939a339a5..b282bae2f9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://gitlab.com/pycqa/flake8.git - rev: 3.9.2 + rev: 3.8.3 hooks: - id: flake8 - repo: https://github.com/PyCQA/isort From 30ee9d7e04b6dddc9e6175a63722b2c3745f0f65 Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Fri, 1 Apr 2022 07:54:32 -0500 Subject: [PATCH 16/49] BaseDetector.show_result comment --- mmdet/models/detectors/maskformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index 6ed00b95c1d..ba2f7268bc5 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -43,7 +43,7 @@ def __init__(self, self.train_cfg = train_cfg self.test_cfg = test_cfg - # SingleStageDetector.show_result default for instance segmentation + # BaseDetector.show_result default for instance segmentation if self.num_stuff_classes > 0: self.show_result = self._show_pan_result From 3925920f68b38657be1f2fb1b66e3687b964f3ee Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Fri, 1 Apr 2022 08:12:55 -0500 Subject: [PATCH 17/49] revert more versions --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b282bae2f9a..32d48c1f040 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,11 +8,11 @@ repos: hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.32.0 + rev: v0.30.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v3.1.0 hooks: - id: trailing-whitespace - id: check-yaml @@ -38,7 +38,7 @@ repos: - mdformat_frontmatter - linkify-it-py - repo: https://github.com/myint/docformatter - rev: v1.4 + rev: v1.3.1 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] From ca5b67ff77bf5c70ca111d5e19f8c16b42040018 Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Fri, 1 Apr 2022 08:18:42 -0500 Subject: [PATCH 18/49] clarify comment --- mmdet/models/dense_heads/maskformer_head.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmdet/models/dense_heads/maskformer_head.py b/mmdet/models/dense_heads/maskformer_head.py index c7d1753f49b..abb17adef3a 100644 --- a/mmdet/models/dense_heads/maskformer_head.py +++ b/mmdet/models/dense_heads/maskformer_head.py @@ -148,7 +148,7 @@ def preprocess_gt(self, gt_labels_list, gt_masks_list, gt_semantic_segs, segmentation with the shape (batch_size, n, h, w). 
[0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. Is None when training instance segmentation. + 255 means VOID. It's None when training instance segmentation. img_metas (list[dict]): List of image meta information. Returns: @@ -500,7 +500,7 @@ def forward_train(self, truth of semantic segmentation with the shape (N, H, W). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. Is None when training instance segmentation. + 255 means VOID. It's None when training instance segmentation. gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored. Defaults to None. From a52285c6967793a085a66c0640aa6f1d328d3a40 Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Fri, 1 Apr 2022 08:19:19 -0500 Subject: [PATCH 19/49] clarify comment --- mmdet/models/utils/panoptic_gt_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdet/models/utils/panoptic_gt_processing.py b/mmdet/models/utils/panoptic_gt_processing.py index 0dda06a477c..7685ac96fb9 100644 --- a/mmdet/models/utils/panoptic_gt_processing.py +++ b/mmdet/models/utils/panoptic_gt_processing.py @@ -15,7 +15,7 @@ def preprocess_panoptic_gt(gt_labels, gt_masks, gt_semantic_seg, num_things, segmentation with the shape (1, h, w). [0, num_thing_class - 1] means things, [num_thing_class, num_class-1] means stuff, - 255 means VOID. Is None when training instance segmentation. + 255 means VOID. It's None when training instance segmentation. img_metas (dict): List of image meta information. Returns: From 3964689e43410de8b6dfc34baa8603f039b19b63 Mon Sep 17 00:00:00 2001 From: peter Date: Fri, 1 Apr 2022 09:11:43 -0500 Subject: [PATCH 20/49] add FilterAnnotations to data pipeline --- configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index a3ddc6abfe3..7d03a7a1830 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -33,6 +33,7 @@ crop_type='absolute', recompute_bbox=True, allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=image_size), dict(type='DefaultFormatBundle', img_to_float=True), From f46ee3e2814cab223822de72de624478f5f506ca Mon Sep 17 00:00:00 2001 From: peter Date: Fri, 1 Apr 2022 09:24:13 -0500 Subject: [PATCH 21/49] more complete Returns docstring --- mmdet/models/detectors/maskformer.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index ba2f7268bc5..050b9e7f998 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -116,13 +116,12 @@ def simple_test(self, imgs, img_metas, **kwargs): img_metas (list[dict]): List of image information. Returns: - list[dict[str, np.array | tuple]]: Semantic segmentation \ - results and panoptic segmentation results for each \ - image. - + list[dict[str, np.array | tuple[list]] | tuple[list]]: + Semantic segmentation results and panoptic segmentation \ + results of each image for panoptic segmentation, or formatted \ + bbox and mask results of each image for instance segmentation. .. 
code-block:: none - - [ + [# panoptic segmentation { 'pan_results': np.array, # shape = [h, w] 'ins_results': tuple[list], @@ -131,9 +130,14 @@ def simple_test(self, imgs, img_metas, **kwargs): }, ... ] - - | list(tuple): Formatted bbox and mask results of multiple \ - images when no panoptic segmentation classes exist. + # or + [# instance segmentation + ( + bboxes, # list[np.array] + masks # list[[np.array]] + ), + ... + ] """ feats = self.extract_feat(imgs) mask_cls_results, mask_pred_results = self.panoptic_head.simple_test( From abb13f3403c7dcf483429020d4147d6a1e31c8eb Mon Sep 17 00:00:00 2001 From: peter Date: Fri, 1 Apr 2022 10:08:57 -0500 Subject: [PATCH 22/49] use pytest.mark.parametrize decorator --- .../test_dense_heads/test_mask2former_head.py | 14 +++++--------- tests/test_models/test_forward.py | 13 ++++++------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/tests/test_models/test_dense_heads/test_mask2former_head.py b/tests/test_models/test_dense_heads/test_mask2former_head.py index 5088b3fc136..596a325222d 100644 --- a/tests/test_models/test_dense_heads/test_mask2former_head.py +++ b/tests/test_models/test_dense_heads/test_mask2former_head.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import torch from mmcv import ConfigDict @@ -6,20 +7,15 @@ from mmdet.models.dense_heads import Mask2FormerHead -def test_mask2former_head_loss(): +@pytest.mark.parametrize('num_stuff_classes, \ + label_num', [(53, 100), (0, 80)]) +def test_mask2former_head_loss(num_stuff_classes, label_num): """Tests head loss when truth is empty and non-empty. Tests head loss as Panoptic Segmentation and Instance Segmentation. Tests forward_train and simple_test with masks and None as gt_semantic_seg """ - self = _init_model(num_stuff_classes=53) - _mask2former_head_loss(self, label_num=100) - - self = _init_model(num_stuff_classes=0) - _mask2former_head_loss(self, label_num=80) - - -def _mask2former_head_loss(self, label_num): + self = _init_model(num_stuff_classes) img_metas = [{ 'batch_input_shape': (128, 160), 'pad_shape': (128, 160, 3), diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 5f74902b31e..241c2b71a2b 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -811,14 +811,13 @@ def test_maskformer_forward(): batch_results.append(result) -def test_mask2former_forward(): +@pytest.mark.parametrize('cfg_file', [ + 'mask2former/mask2former_r50_lsj_8x2_50e_coco.py', + 'mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py' +]) +def test_mask2former_forward(cfg_file): # Test Panoptic Segmentation and Instance Segmentation - _mask2former_forward('mask2former/mask2former_r50_lsj_8x2_50e_coco.py') - _mask2former_forward('mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py') - - -def _mask2former_forward(config): - model_cfg = _get_detector_cfg(config) + model_cfg = _get_detector_cfg(cfg_file) base_channels = 32 model_cfg.backbone.depth = 18 model_cfg.backbone.init_cfg = None From 390f7f58e8ab98108bbebdc48de0587a8609cd3f Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Fri, 1 Apr 2022 11:49:35 -0500 Subject: [PATCH 23/49] fix docstring formatting --- mmdet/models/detectors/maskformer.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index 050b9e7f998..8b33f55ef9e 100644 --- a/mmdet/models/detectors/maskformer.py +++ 
b/mmdet/models/detectors/maskformer.py @@ -120,8 +120,11 @@ def simple_test(self, imgs, img_metas, **kwargs): Semantic segmentation results and panoptic segmentation \ results of each image for panoptic segmentation, or formatted \ bbox and mask results of each image for instance segmentation. + .. code-block:: none - [# panoptic segmentation + + [ + # panoptic segmentation { 'pan_results': np.array, # shape = [h, w] 'ins_results': tuple[list], @@ -130,13 +133,18 @@ def simple_test(self, imgs, img_metas, **kwargs): }, ... ] - # or - [# instance segmentation - ( - bboxes, # list[np.array] - masks # list[[np.array]] + + or + + .. code-block:: none + + [ + # instance segmentation + ( + bboxes, # list[np.array] + masks # list[list[np.array]] ), - ... + ... ] """ feats = self.extract_feat(imgs) From d7f61eeaa91da7f5f4702a6f92ec2276fe2f55dc Mon Sep 17 00:00:00 2001 From: peter Date: Sat, 2 Apr 2022 10:26:03 -0500 Subject: [PATCH 24/49] lint --- mmdet/models/detectors/maskformer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mmdet/models/detectors/maskformer.py b/mmdet/models/detectors/maskformer.py index 8b33f55ef9e..df8b5c293c4 100644 --- a/mmdet/models/detectors/maskformer.py +++ b/mmdet/models/detectors/maskformer.py @@ -120,9 +120,9 @@ def simple_test(self, imgs, img_metas, **kwargs): Semantic segmentation results and panoptic segmentation \ results of each image for panoptic segmentation, or formatted \ bbox and mask results of each image for instance segmentation. - + .. code-block:: none - + [ # panoptic segmentation { @@ -137,7 +137,7 @@ def simple_test(self, imgs, img_metas, **kwargs): or .. code-block:: none - + [ # instance segmentation ( From de7d48c30a7534c7a6a00c78f2300382336731ed Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 11 Apr 2022 09:31:57 -0500 Subject: [PATCH 25/49] Include instances passing mask area test --- .../mask2former_r50_lsj_8x2_50e_coco_ins.py | 2 +- mmdet/datasets/pipelines/loading.py | 35 ++++++++++++++++--- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index 7d03a7a1830..f7a200f410e 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -33,7 +33,7 @@ crop_type='absolute', recompute_bbox=True, allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=image_size), dict(type='DefaultFormatBundle', img_to_float=True), diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index 41ccff5d31d..8c923dc134a 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -574,23 +574,45 @@ class FilterAnnotations: Args: min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth boxes. + min_gt_mask_area (int): Minimum foreground area of ground truth masks. + by_box (bool): Keep instances with bounding boxes meeting the + min_gt_bbox_wh threshold. + by_mask (bool): Keep instances with masks meeting min_gt_mask_area + threshold. keep_empty (bool): Whether to return None when it becomes an empty bbox after filtering. 
Default: True
 """

- def __init__(self, min_gt_bbox_wh, keep_empty=True):
+ def __init__(self,
+ min_gt_bbox_wh,
+ min_gt_mask_area=1,
+ by_box=True,
+ by_mask=False,
+ keep_empty=True):
 # TODO: add more filter options
+ assert by_box or by_mask
 self.min_gt_bbox_wh = min_gt_bbox_wh
+ self.min_gt_mask_area = min_gt_mask_area
+ self.by_box = by_box
+ self.by_mask = by_mask
 self.keep_empty = keep_empty

 def __call__(self, results):
 assert 'gt_bboxes' in results
+ if self.by_mask:
+ assert 'gt_masks' in results
 gt_bboxes = results['gt_bboxes']
- if gt_bboxes.shape[0] == 0:
+ instance_num = gt_bboxes.shape[0]
+ if instance_num == 0:
 return results
- w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
- h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
- keep = (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1])
+ keep = np.zeros(instance_num, dtype=bool)
+ if self.by_box:
+ w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+ h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
+ keep += (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1])
+ if self.by_mask:
+ gt_masks = results['gt_masks']
+ keep += gt_masks.areas >= self.min_gt_mask_area
 if not keep.any():
 if self.keep_empty:
 return None
@@ -606,4 +628,7 @@ def __call__(self, results):
 def __repr__(self):
 return self.__class__.__name__ + \
 f'(min_gt_bbox_wh={self.min_gt_bbox_wh},' \
+ f'min_gt_mask_area={self.min_gt_mask_area},' \
+ f'by_box={self.by_box},' \
+ f'by_mask={self.by_mask},' \
 f'always_keep={self.always_keep})'

From 0d496500d581a2f18afd0c60c6bb31017b9200f9 Mon Sep 17 00:00:00 2001
From: peter
Date: Mon, 18 Apr 2022 15:58:48 -0500
Subject: [PATCH 26/49] Make FilterAnnotations generic for masks or bboxes

---
 mmdet/datasets/pipelines/__init__.py | 22 +++++----
 mmdet/datasets/pipelines/loading.py | 49 +++++++++++--------
 .../test_data/test_pipelines/test_loading.py | 27 +++++++++-
 3 files changed, 67 insertions(+), 31 deletions(-)

diff --git a/mmdet/datasets/pipelines/__init__.py b/mmdet/datasets/pipelines/__init__.py
index dae4b8b188d..fd01bef6c67 100644
--- a/mmdet/datasets/pipelines/__init__.py
+++ b/mmdet/datasets/pipelines/__init__.py
@@ -6,9 +6,9 @@
 from .formatting import (Collect, DefaultFormatBundle, ImageToTensor,
 ToDataContainer, ToTensor, Transpose, to_tensor)
 from .instaboost import InstaBoost
-from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam,
- LoadMultiChannelImageFromFiles, LoadPanopticAnnotations,
- LoadProposals)
+from .loading import (FilterAnnotations, LoadAnnotations, LoadImageFromFile,
+ LoadImageFromWebcam, LoadMultiChannelImageFromFiles,
+ LoadPanopticAnnotations, LoadProposals)
 from .test_time_aug import MultiScaleFlipAug
 from .transforms import (Albu, CopyPaste, CutOut, Expand, MinIoURandomCrop,
 MixUp, Mosaic, Normalize, Pad, PhotoMetricDistortion,
@@ -20,11 +20,13 @@
 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer',
 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations',
 'LoadImageFromFile', 'LoadImageFromWebcam', 'LoadPanopticAnnotations',
- 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug',
- 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale',
- 'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu',
- 'InstaBoost', 'RandomCenterCropPad', 'AutoAugment', 'CutOut', 'Shear',
- 'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform',
- 'ContrastTransform', 'Translate', 'RandomShift', 'Mosaic', 'MixUp',
- 'RandomAffine', 'YOLOXHSVRandomAug', 'CopyPaste'
+
+ 'LoadMultiChannelImageFromFiles', 'LoadProposals',
'FilterAnnotations', + 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', + 'Normalize', 'SegRescale', 'MinIoURandomCrop', 'Expand', + 'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad', + 'AutoAugment', 'CutOut', 'Shear', 'Rotate', 'ColorTransform', + 'EqualizeTransform', 'BrightnessTransform', 'ContrastTransform', + 'Translate', 'RandomShift', 'Mosaic', 'MixUp', 'RandomAffine', + 'YOLOXHSVRandomAug' ] diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index 8c923dc134a..361dd23b8d1 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -572,19 +572,19 @@ class FilterAnnotations: """Filter invalid annotations. Args: - min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth + min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth boxes. min_gt_mask_area (int): Minimum foreground area of ground truth masks. - by_box (bool): Keep instances with bounding boxes meeting the + by_box (bool): Filter instances with bounding boxes not meeting the min_gt_bbox_wh threshold. - by_mask (bool): Keep instances with masks meeting min_gt_mask_area - threshold. + by_mask (bool): Filter instances with masks not meeting + min_gt_mask_area threshold. keep_empty (bool): Whether to return None when it becomes an empty bbox after filtering. Default: True """ def __init__(self, - min_gt_bbox_wh, + min_gt_bbox_wh=(1., 1.), min_gt_mask_area=1, by_box=True, by_mask=False, @@ -598,32 +598,41 @@ def __init__(self, self.keep_empty = keep_empty def __call__(self, results): - assert 'gt_bboxes' in results + assert self.by_box or self.by_mask + if self.by_box: + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + instance_num = gt_bboxes.shape[0] if self.by_mask: assert 'gt_masks' in results - gt_bboxes = results['gt_bboxes'] - instance_num = gt_bboxes.shape[0] + gt_masks = results['gt_masks'] + instance_num = len(gt_masks) + if instance_num == 0: return results - keep = np.zeros(instance_num, dtype=bool) + + tests = [] if self.by_box: w = gt_bboxes[:, 2] - gt_bboxes[:, 0] h = gt_bboxes[:, 3] - gt_bboxes[:, 1] - keep += (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1]) + tests.append((w > self.min_gt_bbox_wh[0]) + & (h > self.min_gt_bbox_wh[1])) if self.by_mask: gt_masks = results['gt_masks'] - keep += gt_masks.areas >= self.min_gt_mask_area - if not keep.any(): + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg') + for key in keys: + if key in results: + results[key] = results[key][keep] + if not tests[0].any(): if self.keep_empty: return None - else: - return results - else: - keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg') - for key in keys: - if key in results: - results[key] = results[key][keep] - return results + return results def __repr__(self): return self.__class__.__name__ + \ diff --git a/tests/test_data/test_pipelines/test_loading.py b/tests/test_data/test_pipelines/test_loading.py index 186d28db85e..b9944a1f174 100644 --- a/tests/test_data/test_pipelines/test_loading.py +++ b/tests/test_data/test_pipelines/test_loading.py @@ -4,8 +4,11 @@ import mmcv import numpy as np +import pytest -from mmdet.datasets.pipelines import (LoadImageFromFile, LoadImageFromWebcam, +from mmdet.core.mask import BitmapMasks +from mmdet.datasets.pipelines import (FilterAnnotations, LoadImageFromFile, + LoadImageFromWebcam, 
LoadMultiChannelImageFromFiles) @@ -89,3 +92,25 @@ def test_load_webcam_img(self): assert results['img'].dtype == np.uint8 assert results['img_shape'] == (288, 512, 3) assert results['ori_shape'] == (288, 512, 3) + + +kwargs = (dict(min_gt_bbox_wh=(100, 100)), + dict(min_gt_bbox_wh=(100, 100), keep_empty=False), + dict(min_gt_bbox_wh=(1, 1)), dict(min_gt_bbox_wh=(.01, .01)), + dict(min_gt_bbox_wh=(.01, .01), by_mask=True), dict(by_mask=True), + dict(by_box=False, by_mask=True)) +targets = (None, 0, 1, 2, 1, 1, 1) + + +@pytest.mark.parametrize('target, kwargs', list(zip(targets, kwargs))) +def test_filter_annotations(target, kwargs): + filter_ann = FilterAnnotations(**kwargs) + bboxes = np.array([[2., 10., 4., 14.], [2., 10., 2.1, 10.1]]) + raw_masks = np.zeros((2, 24, 24)) + raw_masks[0, 10:14, 2:4] = 1 + bitmap_masks = BitmapMasks(raw_masks, 24, 24) + results = dict(gt_bboxes=bboxes, gt_masks=bitmap_masks) + results = filter_ann(results) + if results is not None: + results = results['gt_bboxes'].shape[0] + assert results == target From 3efadfb530654ea2b7ae814d9105b5be5526a4b5 Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 18 Apr 2022 16:05:59 -0500 Subject: [PATCH 27/49] Duplicate assertion --- mmdet/datasets/pipelines/loading.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index 361dd23b8d1..c8124c1b37d 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -598,7 +598,6 @@ def __init__(self, self.keep_empty = keep_empty def __call__(self, results): - assert self.by_box or self.by_mask if self.by_box: assert 'gt_bboxes' in results gt_bboxes = results['gt_bboxes'] From 62da9285136eb1356e2adbac1feb0a6d848a7396 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 19 Apr 2022 14:14:22 -0500 Subject: [PATCH 28/49] Add pad config --- configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index f7a200f410e..9af19219a92 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -16,6 +16,7 @@ image_size = (1024, 1024) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +pad_cfg = dict(img=(0.0741, 0.2052, 0.4265), masks=0, seg=255) train_pipeline = [ dict(type='LoadImageFromFile', to_float32=True), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), @@ -35,7 +36,7 @@ allow_negative_crop=True), dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=image_size), + dict(type='Pad', size=image_size, pad_val=pad_cfg), dict(type='DefaultFormatBundle', img_to_float=True), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] @@ -49,7 +50,7 @@ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), + dict(type='Pad', size_divisor=32, pad_val=pad_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) From 556593d1213e95ae3bdb8216980546c22b8422bb Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 19 Apr 2022 15:07:32 -0500 Subject: [PATCH 29/49] Less hard coded padding setting --- configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py index 9af19219a92..81c71048924 100644 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py @@ -16,7 +16,7 @@ image_size = (1024, 1024) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -pad_cfg = dict(img=(0.0741, 0.2052, 0.4265), masks=0, seg=255) +pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255) train_pipeline = [ dict(type='LoadImageFromFile', to_float32=True), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), @@ -35,8 +35,8 @@ recompute_bbox=True, allow_negative_crop=True), dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), - dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=image_size, pad_val=pad_cfg), + dict(type='Normalize', **img_norm_cfg), dict(type='DefaultFormatBundle', img_to_float=True), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] @@ -49,8 +49,8 @@ transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32, pad_val=pad_cfg), + dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) From e09562ba30cdee71ed0e96f091809680311515b2 Mon Sep 17 00:00:00 2001 From: peter Date: Wed, 20 Apr 2022 08:36:39 -0500 Subject: [PATCH 30/49] Clarify test arguments --- mmdet/datasets/pipelines/__init__.py | 2 +- mmdet/datasets/pipelines/loading.py | 9 +++++---- tests/test_data/test_pipelines/test_loading.py | 18 +++++++++++------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/mmdet/datasets/pipelines/__init__.py b/mmdet/datasets/pipelines/__init__.py index fd01bef6c67..ed83fe86184 100644 --- a/mmdet/datasets/pipelines/__init__.py +++ b/mmdet/datasets/pipelines/__init__.py @@ -28,5 +28,5 @@ 'AutoAugment', 'CutOut', 'Shear', 'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform', 'ContrastTransform', 'Translate', 'RandomShift', 'Mosaic', 'MixUp', 'RandomAffine', - 'YOLOXHSVRandomAug' + 'YOLOXHSVRandomAug', 'CopyPaste' ] diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index c8124c1b37d..4d936f2dc8f 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -573,12 +573,13 @@ class FilterAnnotations: Args: min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth - boxes. + boxes. Default: (1., 1.) min_gt_mask_area (int): Minimum foreground area of ground truth masks. + Default: 1 by_box (bool): Filter instances with bounding boxes not meeting the - min_gt_bbox_wh threshold. + min_gt_bbox_wh threshold. Default: True by_mask (bool): Filter instances with masks not meeting - min_gt_mask_area threshold. + min_gt_mask_area threshold. Default: False keep_empty (bool): Whether to return None when it becomes an empty bbox after filtering. 
Default: True """ @@ -628,7 +629,7 @@ def __call__(self, results): for key in keys: if key in results: results[key] = results[key][keep] - if not tests[0].any(): + if not keep.any(): if self.keep_empty: return None return results diff --git a/tests/test_data/test_pipelines/test_loading.py b/tests/test_data/test_pipelines/test_loading.py index b9944a1f174..2e6bb0747a4 100644 --- a/tests/test_data/test_pipelines/test_loading.py +++ b/tests/test_data/test_pipelines/test_loading.py @@ -94,15 +94,19 @@ def test_load_webcam_img(self): assert results['ori_shape'] == (288, 512, 3) -kwargs = (dict(min_gt_bbox_wh=(100, 100)), - dict(min_gt_bbox_wh=(100, 100), keep_empty=False), - dict(min_gt_bbox_wh=(1, 1)), dict(min_gt_bbox_wh=(.01, .01)), - dict(min_gt_bbox_wh=(.01, .01), by_mask=True), dict(by_mask=True), - dict(by_box=False, by_mask=True)) -targets = (None, 0, 1, 2, 1, 1, 1) +def _build_filter_annotations_args(): + kwargs = (dict(min_gt_bbox_wh=(100, 100)), + dict(min_gt_bbox_wh=(100, 100), keep_empty=False), + dict(min_gt_bbox_wh=(1, 1)), dict(min_gt_bbox_wh=(.01, .01)), + dict(min_gt_bbox_wh=(.01, .01), + by_mask=True), dict(by_mask=True), + dict(by_box=False, by_mask=True)) + targets = (None, 0, 1, 2, 1, 1, 1) + return list(zip(targets, kwargs)) -@pytest.mark.parametrize('target, kwargs', list(zip(targets, kwargs))) + +@pytest.mark.parametrize('target, kwargs', _build_filter_annotations_args()) def test_filter_annotations(target, kwargs): filter_ann = FilterAnnotations(**kwargs) bboxes = np.array([[2., 10., 4., 14.], [2., 10., 2.1, 10.1]]) From c84ad220b9fcae0c21020249697dc9acc1042a3c Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 25 Apr 2022 08:26:18 -0500 Subject: [PATCH 31/49] Additional inst_seg configs --- .../mask2former_r101_lsj_8x2_50e_coco_ins.py | 7 ++++ ...b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py | 5 +++ ..._swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py | 42 +++++++++++++++++++ ...p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py | 26 ++++++++++++ ...r_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py | 37 ++++++++++++++++ 5 files changed, 117 insertions(+) create mode 100644 configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py create mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py create mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py create mode 100644 configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py create mode 100644 configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py new file mode 100644 index 00000000000..ce2a54c307c --- /dev/null +++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_ins.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py new file mode 100644 index 00000000000..eb42af012fc --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py @@ -0,0 +1,5 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # 
noqa + +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py new file mode 100644 index 00000000000..5d7cc63887a --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(in_channels=[128, 256, 512, 1024])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py new file mode 100644 index 00000000000..30d4d736081 --- /dev/null +++ b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py @@ -0,0 +1,26 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa + +model = dict( + backbone=dict( + embed_dims=192, + num_heads=[6, 12, 24, 48], + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536])) + +data = dict(samples_per_gpu=1, workers_per_gpu=1) + +lr_config = dict(step=[655556, 710184]) + +max_iters = 737500 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +# Before 735001th iteration, we do evaluation every 5000 iterations. +# After 735000th iteration, we do evaluation every 737500 iterations, +# which means that we do evaluation at the end of training.' 
+interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py new file mode 100644 index 00000000000..f33ed9b90a4 --- /dev/null +++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) From 83cc338ce6a6aafd42438cb802d408ebbc39c6ab Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 23 May 2022 10:08:03 -0500 Subject: [PATCH 32/49] delete configs --- .../mask2former_r101_lsj_8x2_50e_coco.py | 7 - .../mask2former_r101_lsj_8x2_50e_coco_ins.py | 7 - .../mask2former_r50_lsj_8x2_50e_coco.py | 253 ------------------ .../mask2former_r50_lsj_8x2_50e_coco_ins.py | 79 ------ ...win-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py | 5 - ...b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py | 5 - ...rmer_swin-b-p4-w12-384_lsj_8x2_50e_coco.py | 42 --- ..._swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py | 42 --- ...n-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py | 26 -- ...p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py | 26 -- ...ormer_swin-s-p4-w7-224_lsj_8x2_50e_coco.py | 37 --- ...r_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py | 37 --- ...ormer_swin-t-p4-w7-224_lsj_8x2_50e_coco.py | 62 ----- ...r_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py | 61 ----- configs/mask2former/metafile.yml | 159 ----------- 15 files changed, 848 deletions(-) delete mode 100644 configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py delete mode 100644 configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py delete mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py delete mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py delete mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py delete mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py delete mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py delete mode 
100644 configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py delete mode 100644 configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py delete mode 100644 configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py delete mode 100644 configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py delete mode 100644 configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py delete mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py delete mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py delete mode 100644 configs/mask2former/metafile.yml diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py deleted file mode 100644 index 27050585e18..00000000000 --- a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './mask2former_r50_lsj_8x2_50e_coco.py' - -model = dict( - backbone=dict( - depth=101, - init_cfg=dict(type='Pretrained', - checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py deleted file mode 100644 index ce2a54c307c..00000000000 --- a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco_ins.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_ins.py'] - -model = dict( - backbone=dict( - depth=101, - init_cfg=dict(type='Pretrained', - checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py deleted file mode 100644 index 2c23625e139..00000000000 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,253 +0,0 @@ -_base_ = [ - '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' -] -num_things_classes = 80 -num_stuff_classes = 53 -num_classes = num_things_classes + num_stuff_classes -model = dict( - type='Mask2Former', - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='pytorch', - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), - panoptic_head=dict( - type='Mask2FormerHead', - in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside - strides=[4, 8, 16, 32], - feat_channels=256, - out_channels=256, - num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - num_queries=100, - num_transformer_feat_level=3, - pixel_decoder=dict( - type='MSDeformAttnPixelDecoder', - num_outs=3, - norm_cfg=dict(type='GN', num_groups=32), - act_cfg=dict(type='ReLU'), - encoder=dict( - type='DetrTransformerEncoder', - num_layers=6, - transformerlayers=dict( - type='BaseTransformerLayer', - attn_cfgs=dict( - type='MultiScaleDeformableAttention', - embed_dims=256, - num_heads=8, - num_levels=3, - num_points=4, - im2col_step=64, - dropout=0.0, - batch_first=False, - norm_cfg=None, - init_cfg=None), - ffn_cfgs=dict( - type='FFN', - embed_dims=256, - feedforward_channels=1024, - num_fcs=2, - ffn_drop=0.0, - act_cfg=dict(type='ReLU', inplace=True)), - operation_order=('self_attn', 'norm', 'ffn', 'norm')), - init_cfg=None), - positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - init_cfg=None), - enforce_decoder_input_project=False, 
- positional_encoding=dict( - type='SinePositionalEncoding', num_feats=128, normalize=True), - transformer_decoder=dict( - type='DetrTransformerDecoder', - return_intermediate=True, - num_layers=9, - transformerlayers=dict( - type='DetrTransformerDecoderLayer', - attn_cfgs=dict( - type='MultiheadAttention', - embed_dims=256, - num_heads=8, - attn_drop=0.0, - proj_drop=0.0, - dropout_layer=None, - batch_first=False), - ffn_cfgs=dict( - embed_dims=256, - feedforward_channels=2048, - num_fcs=2, - act_cfg=dict(type='ReLU', inplace=True), - ffn_drop=0.0, - dropout_layer=None, - add_identity=True), - feedforward_channels=2048, - operation_order=('cross_attn', 'norm', 'self_attn', 'norm', - 'ffn', 'norm')), - init_cfg=None), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=2.0, - reduction='mean', - class_weight=[1.0] * num_classes + [0.1]), - loss_mask=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - reduction='mean', - loss_weight=5.0), - loss_dice=dict( - type='DiceLoss', - use_sigmoid=True, - activate=True, - reduction='mean', - naive_dice=True, - eps=1.0, - loss_weight=5.0)), - panoptic_fusion_head=dict( - type='MaskFormerFusionHead', - num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - loss_panoptic=None, - init_cfg=None), - train_cfg=dict( - num_points=12544, - oversample_ratio=3.0, - importance_sample_ratio=0.75, - assigner=dict( - type='MaskHungarianAssigner', - cls_cost=dict(type='ClassificationCost', weight=2.0), - mask_cost=dict( - type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), - dice_cost=dict( - type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), - sampler=dict(type='MaskPseudoSampler')), - test_cfg=dict( - panoptic_on=True, - # For now, the dataset does not support - # evaluating semantic segmentation metric. - semantic_on=False, - instance_on=True, - # max_per_image is for instance segmentation. - max_per_image=100, - iou_thr=0.8, - # In Mask2Former's panoptic postprocessing, - # it will filter mask area where score is less than 0.5 . 
- filter_low_score=True), - init_cfg=None) - -# dataset settings -image_size = (1024, 1024) -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile', to_float32=True), - dict( - type='LoadPanopticAnnotations', - with_bbox=True, - with_mask=True, - with_seg=True), - dict(type='RandomFlip', flip_ratio=0.5), - # large scale jittering - dict( - type='Resize', - img_scale=image_size, - ratio_range=(0.1, 2.0), - multiscale_mode='range', - keep_ratio=True), - dict( - type='RandomCrop', - crop_size=image_size, - crop_type='absolute', - recompute_bbox=True, - allow_negative_crop=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=image_size), - dict(type='DefaultFormatBundle', img_to_float=True), - dict( - type='Collect', - keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 800), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data_root = 'data/coco/' -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict(pipeline=train_pipeline), - val=dict( - pipeline=test_pipeline, - ins_ann_file=data_root + 'annotations/instances_val2017.json', - ), - test=dict( - pipeline=test_pipeline, - ins_ann_file=data_root + 'annotations/instances_val2017.json', - )) - -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict( - custom_keys={ - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi, - }, - norm_decay_mult=0.0)) -optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) - -# learning policy -lr_config = dict( - policy='step', - gamma=0.1, - by_epoch=False, - step=[327778, 355092], - warmup='linear', - warmup_by_epoch=False, - warmup_ratio=1.0, # no warmup - warmup_iters=10) - -max_iters = 368750 -runner = dict(type='IterBasedRunner', max_iters=max_iters) - -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook', by_epoch=False), - dict(type='TensorboardLoggerHook', by_epoch=False) - ]) -interval = 5000 -workflow = [('train', interval)] -checkpoint_config = dict( - by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3) - -# Before 365001th iteration, we do evaluation every 5000 iterations. -# After 365000th iteration, we do evaluation every 368750 iterations, -# which means that we do evaluation at the end of training. 
-dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] -evaluation = dict( - interval=interval, - dynamic_intervals=dynamic_intervals, - metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py deleted file mode 100644 index 81c71048924..00000000000 --- a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py +++ /dev/null @@ -1,79 +0,0 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] -num_things_classes = 80 -num_stuff_classes = 0 -num_classes = num_things_classes + num_stuff_classes -model = dict( - panoptic_head=dict( - num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes, - loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])), - panoptic_fusion_head=dict( - num_things_classes=num_things_classes, - num_stuff_classes=num_stuff_classes), - test_cfg=dict(panoptic_on=False)) - -# dataset settings -image_size = (1024, 1024) -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255) -train_pipeline = [ - dict(type='LoadImageFromFile', to_float32=True), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict(type='RandomFlip', flip_ratio=0.5), - # large scale jittering - dict( - type='Resize', - img_scale=image_size, - ratio_range=(0.1, 2.0), - multiscale_mode='range', - keep_ratio=True), - dict( - type='RandomCrop', - crop_size=image_size, - crop_type='absolute', - recompute_bbox=True, - allow_negative_crop=True), - dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), - dict(type='Pad', size=image_size, pad_val=pad_cfg), - dict(type='Normalize', **img_norm_cfg), - dict(type='DefaultFormatBundle', img_to_float=True), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 800), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Pad', size_divisor=32, pad_val=pad_cfg), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -dataset_type = 'CocoDataset' -data_root = 'data/coco/' -data = dict( - _delete_=True, - samples_per_gpu=2, - workers_per_gpu=2, - train=dict( - type=dataset_type, - ann_file=data_root + 'annotations/instances_train2017.json', - img_prefix=data_root + 'train2017/', - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=data_root + 'annotations/instances_val2017.json', - img_prefix=data_root + 'val2017/', - pipeline=test_pipeline), - test=dict( - type=dataset_type, - ann_file=data_root + 'annotations/instances_val2017.json', - img_prefix=data_root + 'val2017/', - pipeline=test_pipeline)) -evaluation = dict(metric=['bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py deleted file mode 100644 index d0cf3762139..00000000000 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,5 +0,0 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # noqa - -model = dict( - 
backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py deleted file mode 100644 index eb42af012fc..00000000000 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_ins.py +++ /dev/null @@ -1,5 +0,0 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # noqa - -model = dict( - backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py deleted file mode 100644 index d2a582598f4..00000000000 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,42 +0,0 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa - -depths = [2, 2, 18, 2] -model = dict( - backbone=dict( - pretrain_img_size=384, - embed_dims=128, - depths=depths, - num_heads=[4, 8, 16, 32], - window_size=12, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict(in_channels=[128, 256, 512, 1024])) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to decay_multi=0.0 -backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) -}) -# optimizer -optimizer = dict( - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py deleted file mode 100644 index 5d7cc63887a..00000000000 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py +++ /dev/null @@ -1,42 +0,0 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa - -depths = [2, 2, 18, 2] -model = dict( - backbone=dict( - pretrain_img_size=384, - embed_dims=128, - depths=depths, - num_heads=[4, 8, 16, 32], - window_size=12, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict(in_channels=[128, 256, 512, 1024])) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to decay_multi=0.0 -backbone_norm_multi = 
dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) -}) -# optimizer -optimizer = dict( - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py deleted file mode 100644 index 13aa28c4a9a..00000000000 --- a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py +++ /dev/null @@ -1,26 +0,0 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa - -model = dict( - backbone=dict( - embed_dims=192, - num_heads=[6, 12, 24, 48], - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536])) - -data = dict(samples_per_gpu=1, workers_per_gpu=1) - -lr_config = dict(step=[655556, 710184]) - -max_iters = 737500 -runner = dict(type='IterBasedRunner', max_iters=max_iters) - -# Before 735001th iteration, we do evaluation every 5000 iterations. -# After 735000th iteration, we do evaluation every 737500 iterations, -# which means that we do evaluation at the end of training.' -interval = 5000 -dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] -evaluation = dict( - interval=interval, - dynamic_intervals=dynamic_intervals, - metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py deleted file mode 100644 index 30d4d736081..00000000000 --- a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_ins.py +++ /dev/null @@ -1,26 +0,0 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_ins.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa - -model = dict( - backbone=dict( - embed_dims=192, - num_heads=[6, 12, 24, 48], - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536])) - -data = dict(samples_per_gpu=1, workers_per_gpu=1) - -lr_config = dict(step=[655556, 710184]) - -max_iters = 737500 -runner = dict(type='IterBasedRunner', max_iters=max_iters) - -# Before 735001th iteration, we do evaluation every 5000 iterations. -# After 735000th iteration, we do evaluation every 737500 iterations, -# which means that we do evaluation at the end of training.' 
-interval = 5000 -dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] -evaluation = dict( - interval=interval, - dynamic_intervals=dynamic_intervals, - metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py deleted file mode 100644 index 7b1b05abafe..00000000000 --- a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,37 +0,0 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa - -depths = [2, 2, 18, 2] -model = dict( - backbone=dict( - depths=depths, init_cfg=dict(type='Pretrained', - checkpoint=pretrained))) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to decay_multi=0.0 -backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) -}) -# optimizer -optimizer = dict( - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py deleted file mode 100644 index f33ed9b90a4..00000000000 --- a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_ins.py +++ /dev/null @@ -1,37 +0,0 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa - -depths = [2, 2, 18, 2] -model = dict( - backbone=dict( - depths=depths, init_cfg=dict(type='Pretrained', - checkpoint=pretrained))) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to decay_multi=0.0 -backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) 
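    # (illustrative note, not part of the diff) with depths = [2, 2, 18, 2]
    # this comprehension yields 'backbone.stages.0.downsample.norm' through
    # 'backbone.stages.2.downsample.norm'; together with the per-block update
    # above, every Swin norm layer ends up on lr_mult=0.1, decay_mult=0.0.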
-}) -# optimizer -optimizer = dict( - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py deleted file mode 100644 index 70e3103e482..00000000000 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py +++ /dev/null @@ -1,62 +0,0 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa - -depths = [2, 2, 6, 2] -model = dict( - type='Mask2Former', - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=96, - depths=depths, - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(0, 1, 2, 3), - with_cp=False, - convert_weights=True, - frozen_stages=-1, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict( - type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), - init_cfg=None) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to decay_multi=0.0 -backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) -}) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py deleted file mode 100644 index 66d69dd7698..00000000000 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_ins.py +++ /dev/null @@ -1,61 +0,0 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco_ins.py'] -pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa -depths = [2, 2, 6, 2] -model = dict( - type='Mask2Former', - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=96, - depths=depths, - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - patch_norm=True, - out_indices=(0, 1, 2, 3), - with_cp=False, - convert_weights=True, - frozen_stages=-1, - init_cfg=dict(type='Pretrained', checkpoint=pretrained)), - panoptic_head=dict( - type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), - init_cfg=None) - -# set all layers in backbone to lr_mult=0.1 -# set all norm layers, position_embeding, -# query_embeding, level_embeding to 
decay_multi=0.0 -backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) -backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) -embed_multi = dict(lr_mult=1.0, decay_mult=0.0) -custom_keys = { - 'backbone': dict(lr_mult=0.1, decay_mult=1.0), - 'backbone.patch_embed.norm': backbone_norm_multi, - 'backbone.norm': backbone_norm_multi, - 'absolute_pos_embed': backbone_embed_multi, - 'relative_position_bias_table': backbone_embed_multi, - 'query_embed': embed_multi, - 'query_feat': embed_multi, - 'level_embed': embed_multi -} -custom_keys.update({ - f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi - for stage_id, num_blocks in enumerate(depths) - for block_id in range(num_blocks) -}) -custom_keys.update({ - f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi - for stage_id in range(len(depths) - 1) -}) -# optimizer -optimizer = dict( - type='AdamW', - lr=0.0001, - weight_decay=0.05, - eps=1e-8, - betas=(0.9, 0.999), - paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/metafile.yml b/configs/mask2former/metafile.yml deleted file mode 100644 index 2ceed8056af..00000000000 --- a/configs/mask2former/metafile.yml +++ /dev/null @@ -1,159 +0,0 @@ -Collections: - - Name: Mask2Former - Metadata: - Training Data: COCO - Training Techniques: - - AdamW - - Weight Decay - Training Resources: 8x A100 GPUs - Architecture: - - Mask2Former - Paper: - URL: https://arxiv.org/pdf/2112.01527 - Title: 'Masked-attention Mask Transformer for Universal Image Segmentation' - README: configs/mask2former/README.md - Code: - URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/detectors/mask2former.py#L7 - Version: v2.23.0 - -Models: -- Name: mask2former_r50_lsj_8x2_50e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py - Metadata: - Training Memory (GB): 13.9 - Iterations: 368750 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 44.8 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 41.9 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 51.9 - Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220326_224516-0091ce2b.pth -- Name: mask2former_r101_lsj_8x2_50e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py - Metadata: - Training Memory (GB): 16.1 - Iterations: 368750 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 45.3 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 42.4 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 52.4 - Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220329_225104-bb4df090.pth -- Name: mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py - Metadata: - Training Memory (GB): 15.9 - Iterations: 368750 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 46.3 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 43.4 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 53.4 - Weights: 
https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220326_224553-c92f921c.pth -- Name: mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py - Metadata: - Training Memory (GB): 19.1 - Iterations: 368750 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 47.8 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 44.5 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 54.5 - Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220329_225200-9f633bcf.pth -- Name: mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py - Metadata: - Training Memory (GB): 26.0 - Iterations: 368750 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 48.2 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 44.9 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 55.1 - Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_20220331_002244-1db756b2.pth -- Name: mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py - Metadata: - Training Memory (GB): 25.8 - Iterations: 368750 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 50.0 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 46.3 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 56.3 - Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_20220329_230021-89d7c1b1.pth -- Name: mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco - In Collection: Mask2Former - Config: configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py - Metadata: - Training Memory (GB): 21.1 - Iterations: 737500 - Results: - - Task: Object Detection - Dataset: COCO - Metrics: - box AP: 52.2 - - Task: Instance Segmentation - Dataset: COCO - Metrics: - mask AP: 48.5 - - Task: Panoptic Segmentation - Dataset: COCO - Metrics: - PQ: 57.6 - Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_20220407_104949-c481ee28.pth From 792f38fc3ab6f1fa4d699f8c4936afb1074ee580 Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 23 May 2022 10:32:12 -0500 Subject: [PATCH 33/49] Include original dev branch configs --- .../mask2former_r101_lsj_8x2_50e_coco.py | 7 + .../mask2former_r50_lsj_8x2_50e_coco.py | 253 ++++++++++++++++++ ...win-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py | 5 + ...rmer_swin-b-p4-w12-384_lsj_8x2_50e_coco.py | 42 +++ ...n-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py | 26 ++ ...ormer_swin-s-p4-w7-224_lsj_8x2_50e_coco.py | 37 +++ ...ormer_swin-t-p4-w7-224_lsj_8x2_50e_coco.py | 62 +++++ configs/mask2former/metafile.yml | 159 +++++++++++ 8 files changed, 591 insertions(+) create mode 100644 configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py create 
mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py create mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py create mode 100644 configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py create mode 100644 configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py create mode 100644 configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py create mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py create mode 100644 configs/mask2former/metafile.yml diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..27050585e18 --- /dev/null +++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask2former_r50_lsj_8x2_50e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..2c23625e139 --- /dev/null +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py @@ -0,0 +1,253 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='Mask2Former', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + 
add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='MaskHungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=2.0), + mask_cost=dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dice_cost=dict( + type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . + filter_low_score=True), + init_cfg=None) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=image_size), + dict(type='DefaultFormatBundle', img_to_float=True), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data_root = 'data/coco/' +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + ), + test=dict( + pipeline=test_pipeline, + ins_ann_file=data_root + 'annotations/instances_val2017.json', + )) + +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +optimizer_config = dict(grad_clip=dict(max_norm=0.01, norm_type=2)) + 
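# How the paramwise_cfg above resolves per parameter (a sketch, kept in a
# comment so it does not leak into the parsed config; it assumes mmcv's
# DefaultOptimizerConstructor substring matching on parameter names):
#
#     base_lr, base_wd = 0.0001, 0.05
#     multipliers = {
#         'backbone': dict(lr_mult=0.1, decay_mult=1.0),
#         'query_embed': dict(lr_mult=1.0, decay_mult=0.0),
#     }
#     def effective(name):
#         for key, m in multipliers.items():
#             if key in name:
#                 return base_lr * m['lr_mult'], base_wd * m['decay_mult']
#         return base_lr, base_wd
#
#     effective('backbone.layer1.0.conv1.weight')    # -> (1e-05, 0.05)
#     effective('panoptic_head.query_embed.weight')  # -> (0.0001, 0.0)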
+# learning policy
+lr_config = dict(
+    policy='step',
+    gamma=0.1,
+    by_epoch=False,
+    step=[327778, 355092],
+    warmup='linear',
+    warmup_by_epoch=False,
+    warmup_ratio=1.0,  # no warmup
+    warmup_iters=10)
+
+max_iters = 368750
+runner = dict(type='IterBasedRunner', max_iters=max_iters)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', by_epoch=False),
+        dict(type='TensorboardLoggerHook', by_epoch=False)
+    ])
+interval = 5000
+workflow = [('train', interval)]
+checkpoint_config = dict(
+    by_epoch=False, interval=interval, save_last=True, max_keep_ckpts=3)
+
+# Up to iteration 365000 we evaluate every 5000 iterations; from
+# iteration 365001 the interval becomes 368750 iterations, so the last
+# evaluation runs exactly at the end of training.
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+evaluation = dict(
+    interval=interval,
+    dynamic_intervals=dynamic_intervals,
+    metric=['PQ', 'bbox', 'segm'])
diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py
new file mode 100644
index 00000000000..d0cf3762139
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py
@@ -0,0 +1,5 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained)))
diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py
new file mode 100644
index 00000000000..d2a582598f4
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py
@@ -0,0 +1,42 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=depths,
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(in_channels=[128, 256, 512, 1024]))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py
b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py new file mode 100644 index 00000000000..13aa28c4a9a --- /dev/null +++ b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py @@ -0,0 +1,26 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa + +model = dict( + backbone=dict( + embed_dims=192, + num_heads=[6, 12, 24, 48], + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536])) + +data = dict(samples_per_gpu=1, workers_per_gpu=1) + +lr_config = dict(step=[655556, 710184]) + +max_iters = 737500 +runner = dict(type='IterBasedRunner', max_iters=max_iters) + +# Before 735001th iteration, we do evaluation every 5000 iterations. +# After 735000th iteration, we do evaluation every 737500 iterations, +# which means that we do evaluation at the end of training.' +interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +evaluation = dict( + interval=interval, + dynamic_intervals=dynamic_intervals, + metric=['PQ', 'bbox', 'segm']) diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..7b1b05abafe --- /dev/null +++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..70e3103e482 --- /dev/null +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py @@ -0,0 +1,62 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + 
type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/metafile.yml b/configs/mask2former/metafile.yml new file mode 100644 index 00000000000..2ceed8056af --- /dev/null +++ b/configs/mask2former/metafile.yml @@ -0,0 +1,159 @@ +Collections: + - Name: Mask2Former + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - Mask2Former + Paper: + URL: https://arxiv.org/pdf/2112.01527 + Title: 'Masked-attention Mask Transformer for Universal Image Segmentation' + README: configs/mask2former/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/detectors/mask2former.py#L7 + Version: v2.23.0 + +Models: +- Name: mask2former_r50_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 13.9 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.9 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 51.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220326_224516-0091ce2b.pth +- Name: mask2former_r101_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 16.1 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.4 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 52.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220329_225104-bb4df090.pth +- Name: 
mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 15.9 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.4 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 53.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220326_224553-c92f921c.pth +- Name: mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 19.1 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.5 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 54.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220329_225200-9f633bcf.pth +- Name: mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 26.0 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.9 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 55.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco_20220331_002244-1db756b2.pth +- Name: mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py + Metadata: + Training Memory (GB): 25.8 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 46.3 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 56.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco_20220329_230021-89d7c1b1.pth +- Name: mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py + Metadata: + Training Memory (GB): 21.1 + Iterations: 737500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 48.5 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 57.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco_20220407_104949-c481ee28.pth From 235061e338a6fc456c7c52e0adbac33e7518b090 Mon Sep 17 00:00:00 2001 From: Peter Vennerstrom <36269250+PeterVennerstrom@users.noreply.github.com> Date: Wed, 25 May 2022 06:50:18 -0500 Subject: [PATCH 34/49] Fix indent --- mmdet/datasets/pipelines/loading.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py index 4d936f2dc8f..109276830b4 100644 --- a/mmdet/datasets/pipelines/loading.py +++ b/mmdet/datasets/pipelines/loading.py @@ -579,7 +579,7 @@ class FilterAnnotations: by_box (bool): Filter instances with bounding boxes not meeting the min_gt_bbox_wh threshold. Default: True by_mask (bool): Filter instances with masks not meeting - min_gt_mask_area threshold. Default: False + min_gt_mask_area threshold. Default: False keep_empty (bool): Whether to return None when it becomes an empty bbox after filtering. Default: True """ From cddde116cd394ebbc40ae081af7caa4d610e9949 Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 26 May 2022 15:08:07 -0500 Subject: [PATCH 35/49] fix lint error from merge conflict --- mmdet/datasets/pipelines/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mmdet/datasets/pipelines/__init__.py b/mmdet/datasets/pipelines/__init__.py index ed83fe86184..8260da64268 100644 --- a/mmdet/datasets/pipelines/__init__.py +++ b/mmdet/datasets/pipelines/__init__.py @@ -20,7 +20,6 @@ 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations', 'LoadImageFromFile', 'LoadImageFromWebcam', 'LoadPanopticAnnotations', - 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'FilterAnnotations', 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale', 'MinIoURandomCrop', 'Expand', From 5e45ea6243504cc3a1bd37a05d213b33c7a32e5d Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:41:35 +0800 Subject: [PATCH 36/49] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 32d48c1f040..82dd58c69c0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ repos: - - repo: https://gitlab.com/pycqa/flake8.git + - repo: https://github.com/PyCQA/flake8 rev: 3.8.3 hooks: - id: flake8 From db0b039792765d5a5c01957e87eb74bd813c24bb Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:47:00 +0800 Subject: [PATCH 37/49] Rename mask2former_r50_lsj_8x2_50e_coco.py to mask2former_r50_lsj_8x2_50e_coco-panoptic.py --- ...2_50e_coco.py => mask2former_r50_lsj_8x2_50e_coco-panoptic.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename configs/mask2former/{mask2former_r50_lsj_8x2_50e_coco.py => mask2former_r50_lsj_8x2_50e_coco-panoptic.py} (100%) diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py similarity index 100% rename from configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py From 025c7af58957b62a604f7d4733052f04ae220d6b Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:47:40 +0800 Subject: [PATCH 38/49] Update and rename mask2former_r101_lsj_8x2_50e_coco.py to mask2former_r101_lsj_8x2_50e_coco-panoptic.py --- ...0e_coco.py => mask2former_r101_lsj_8x2_50e_coco-panoptic.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename configs/mask2former/{mask2former_r101_lsj_8x2_50e_coco.py => mask2former_r101_lsj_8x2_50e_coco-panoptic.py} (72%) diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py similarity index 72% rename from 
configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py index 27050585e18..33fdde6ccc1 100644 --- a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = './mask2former_r50_lsj_8x2_50e_coco.py' +_base_ = './mask2former_r50_lsj_8x2_50e_coco-panoptic.py' model = dict( backbone=dict( From d20156622964f1e2b745f986efc52e89b9b7935a Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:48:19 +0800 Subject: [PATCH 39/49] Update and rename mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py to mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py --- ...former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename configs/mask2former/{mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py => mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py} (74%) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py similarity index 74% rename from configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py index d0cf3762139..f13f5e17843 100644 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # noqa model = dict( From bb96ebba0a28c9f54b86ae82ce2e2bd99a3ea82c Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:48:44 +0800 Subject: [PATCH 40/49] Update and rename mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py to mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py --- ... 
mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename configs/mask2former/{mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py => mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py} (95%) diff --git a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py similarity index 95% rename from configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py index d2a582598f4..33a805c35eb 100644 --- a/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa depths = [2, 2, 18, 2] From 135ba25e1227123fb37cc3558323537168a85478 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:49:10 +0800 Subject: [PATCH 41/49] Update and rename mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py to mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py --- ...rmer_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename configs/mask2former/{mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py => mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py} (92%) diff --git a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py similarity index 92% rename from configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py rename to configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py index 13aa28c4a9a..91a180d4b19 100644 --- a/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco.py +++ b/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa model = dict( From 4cc6e424afc88d0581871197b358830e8c903a84 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:49:35 +0800 Subject: [PATCH 42/49] Update and rename mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py to mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py --- ...> mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename configs/mask2former/{mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py => mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py} (95%) diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py similarity index 95% rename from configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py index 7b1b05abafe..b2b621ce781 100644 --- 
a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa depths = [2, 2, 18, 2] From 2ff2d098ea10da4ffd8afb55ff2177cb1b30b0a0 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:49:52 +0800 Subject: [PATCH 43/49] Update and rename mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py to mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py --- ...> mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename configs/mask2former/{mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py => mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py} (97%) diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py similarity index 97% rename from configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py rename to configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py index 70e3103e482..04b2f10eddc 100644 --- a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py +++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic.py @@ -1,4 +1,4 @@ -_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco-panoptic.py'] pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa depths = [2, 2, 6, 2] From 8b93b11d234fa23a402c7968c9a99853ced33d09 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:51:30 +0800 Subject: [PATCH 44/49] Create mask2former_r50_lsj_8x2_50e_coco.py --- .../mask2former_r50_lsj_8x2_50e_coco.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py diff --git a/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..eca6135ba7c --- /dev/null +++ b/configs/mask2former/mask2former_r50_lsj_8x2_50e_coco.py @@ -0,0 +1,79 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco-panoptic.py'] +num_things_classes = 80 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +model = dict( + panoptic_head=dict( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])), + panoptic_fusion_head=dict( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes), + test_cfg=dict(panoptic_on=False)) + +# dataset settings +image_size = (1024, 1024) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +pad_cfg = dict(img=(128, 128, 128), masks=0, seg=255) +train_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', flip_ratio=0.5), + # large scale jittering + dict( + type='Resize', + img_scale=image_size, + ratio_range=(0.1, 2.0), + multiscale_mode='range', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', 
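        # (illustrative note) the next two flags do the real work for LSJ
        # instance training: recompute_bbox refits each gt box to its mask
        # after the crop, and allow_negative_crop keeps crops that no longer
        # contain any gt; FilterAnnotations, two steps below, then drops
        # boxes that collapsed under 1e-5 px.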
+ recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), + dict(type='Pad', size=image_size, pad_val=pad_cfg), + dict(type='Normalize', **img_norm_cfg), + dict(type='DefaultFormatBundle', img_to_float=True), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Pad', size_divisor=32, pad_val=pad_cfg), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +data = dict( + _delete_=True, + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) From e01bf9e6c0012dd8a35d8f185557d2fbc3da6d9b Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:51:55 +0800 Subject: [PATCH 45/49] Create mask2former_r101_lsj_8x2_50e_coco.py --- configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py diff --git a/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..5543fb0ebf9 --- /dev/null +++ b/configs/mask2former/mask2former_r101_lsj_8x2_50e_coco.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) From c34cebeb773df153627449ec4567e05c058996f3 Mon Sep 17 00:00:00 2001 From: Cedric Luo Date: Fri, 27 May 2022 14:52:33 +0800 Subject: [PATCH 46/49] Create mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py --- ...ormer_swin-s-p4-w7-224_lsj_8x2_50e_coco.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py new file mode 100644 index 00000000000..7b1b05abafe --- /dev/null +++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = 
From c34cebeb773df153627449ec4567e05c058996f3 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Fri, 27 May 2022 14:52:33 +0800
Subject: [PATCH 46/49] Create mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py

---
 ...ormer_swin-s-p4-w7-224_lsj_8x2_50e_coco.py | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py

diff --git a/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py
new file mode 100644
index 00000000000..7b1b05abafe
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco.py
@@ -0,0 +1,37 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        depths=depths, init_cfg=dict(type='Pretrained',
+                                     checkpoint=pretrained)))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))

From 7ce5c083977a0bc1bea894dfce8e092355dd85f5 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Fri, 27 May 2022 14:52:54 +0800
Subject: [PATCH 47/49] Create mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py

---
 ...ormer_swin-t-p4-w7-224_lsj_8x2_50e_coco.py | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py

diff --git a/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
new file mode 100644
index 00000000000..0ccbe91c683
--- /dev/null
+++ b/configs/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco.py
@@ -0,0 +1,61 @@
+_base_ = ['./mask2former_r50_lsj_8x2_50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+depths = [2, 2, 6, 2]
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        type='Mask2FormerHead', in_channels=[96, 192, 384, 768]),
+    init_cfg=None)
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embedding,
+# query_embedding, level_embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.05,
+    eps=1e-8,
+    betas=(0.9, 0.999),
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
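
Both Swin configs above build the same paramwise table: any parameter whose
name contains one of the custom_keys entries has its learning rate and weight
decay scaled by the given multipliers. A simplified re-creation of the
matching rule applied by mmcv's DefaultOptimizerConstructor (longest matching
key wins; the parameter names below are illustrative, and this is a sketch of
the behavior, not the actual mmcv code):

    base_lr, base_wd = 0.0001, 0.05
    custom_keys = {
        'backbone': dict(lr_mult=0.1, decay_mult=1.0),
        'backbone.patch_embed.norm': dict(lr_mult=0.1, decay_mult=0.0),
        'query_embed': dict(lr_mult=1.0, decay_mult=0.0),
    }

    def hyperparams(name):
        # Longest key first, so the most specific entry takes precedence.
        for key in sorted(custom_keys, key=len, reverse=True):
            if key in name:
                mult = custom_keys[key]
                return base_lr * mult['lr_mult'], base_wd * mult['decay_mult']
        return base_lr, base_wd

    print(hyperparams('backbone.patch_embed.norm.weight'))  # lr*0.1, wd zeroed
    print(hyperparams('backbone.stages.0.blocks.0.attn.qkv.weight'))  # lr*0.1, wd kept
    print(hyperparams('panoptic_head.query_embed'))  # lr kept, wd zeroed
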
From 487a8ac5b6acdb51a4550d3d681cb59433cebf89 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Fri, 27 May 2022 15:08:32 +0800
Subject: [PATCH 48/49] Update test_forward.py

---
 tests/test_models/test_forward.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py
index 241c2b71a2b..98f75b83270 100644
--- a/tests/test_models/test_forward.py
+++ b/tests/test_models/test_forward.py
@@ -813,7 +813,7 @@ def test_maskformer_forward():
 
 @pytest.mark.parametrize('cfg_file', [
     'mask2former/mask2former_r50_lsj_8x2_50e_coco.py',
-    'mask2former/mask2former_r50_lsj_8x2_50e_coco_ins.py'
+    'mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic.py'
 ])
 def test_mask2former_forward(cfg_file):
     # Test Panoptic Segmentation and Instance Segmentation

From 6ab15c2ff3552de3e0f2ed8f3ffd39c0e2c4b7d6 Mon Sep 17 00:00:00 2001
From: Cedric Luo
Date: Fri, 27 May 2022 15:42:37 +0800
Subject: [PATCH 49/49] remove gt_sem_seg

---
 mmdet/datasets/pipelines/loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmdet/datasets/pipelines/loading.py b/mmdet/datasets/pipelines/loading.py
index 109276830b4..79bbf809981 100644
--- a/mmdet/datasets/pipelines/loading.py
+++ b/mmdet/datasets/pipelines/loading.py
@@ -625,7 +625,7 @@ def __call__(self, results):
         for t in tests[1:]:
             keep = keep & t
 
-        keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg')
+        keys = ('gt_bboxes', 'gt_labels', 'gt_masks')
        for key in keys:
             if key in results:
                 results[key] = results[key][keep]
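
The final patch narrows FilterAnnotations to per-instance keys only:
'gt_semantic_seg' is a per-pixel map rather than a per-instance array, so
indexing it with the per-instance boolean keep vector would be wrong, which
is presumably why it is dropped here; instance-only training does not load a
semantic map at all. A minimal mock of the loop after the change, with
made-up values:

    import numpy as np

    results = {
        'gt_bboxes': np.zeros((3, 4)),
        'gt_labels': np.array([1, 2, 3]),
    }
    keep = np.array([True, False, True])

    # Keys that are absent ('gt_masks' here) are simply skipped,
    # as in the real transform.
    for key in ('gt_bboxes', 'gt_labels', 'gt_masks'):
        if key in results:
            results[key] = results[key][keep]

    print(results['gt_labels'])  # [1 3]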