From f05f314e1c8b3972881929fc344cce2fe3f035f6 Mon Sep 17 00:00:00 2001
From: Marc-Antoine Maheux <35638081+mamaheux@users.noreply.github.com>
Date: Tue, 28 May 2024 12:22:11 -0400
Subject: [PATCH] Evaluate descriptor yolo (#102)

* Add a way to evaluate the descriptor yolo v7 model.

* Update descriptor yolo v7 weights.
---
 .../dnn_utils/scripts/export_models.bash      |   2 +-
 .../object_detection/datasets/__init__.py     |   4 +
 .../datasets/coco_detection_transforms.py     |  10 +-
 .../datasets/object_detection_objects365.py   |  89 +++++++++
 .../objects365_detection_transforms.py        |  46 +++++
 .../object_detection/descriptor_yolo_v7.py    |   4 +-
 tools/dnn_training/test_descriptor_yolo_v7.py | 171 ++++++++++++++++++
 7 files changed, 319 insertions(+), 7 deletions(-)
 create mode 100644 tools/dnn_training/object_detection/datasets/object_detection_objects365.py
 create mode 100644 tools/dnn_training/object_detection/datasets/objects365_detection_transforms.py
 create mode 100644 tools/dnn_training/test_descriptor_yolo_v7.py

diff --git a/ros/utils/dnn_utils/scripts/export_models.bash b/ros/utils/dnn_utils/scripts/export_models.bash
index 97ed3ca0..5d21f3e1 100755
--- a/ros/utils/dnn_utils/scripts/export_models.bash
+++ b/ros/utils/dnn_utils/scripts/export_models.bash
@@ -14,7 +14,7 @@ if [ -f Weights.zip ]; then
     OLD_TIME=$(stat Weights.zip -c %Y)
 fi
 
-if OUT=$(wget -N https://github.com/introlab/t-top/releases/download/DNN_Weights_v4.0.0/Weights.zip 2>&1); then
+if OUT=$(wget -N https://github.com/introlab/t-top/releases/download/DNN_Weights_v4.1.0/Weights.zip 2>&1); then
     # Output to stdout on success
     echo $OUT
 else
diff --git a/tools/dnn_training/object_detection/datasets/__init__.py b/tools/dnn_training/object_detection/datasets/__init__.py
index 7bdc419d..bf6f4e5c 100644
--- a/tools/dnn_training/object_detection/datasets/__init__.py
+++ b/tools/dnn_training/object_detection/datasets/__init__.py
@@ -5,3 +5,7 @@
 from object_detection.datasets.open_images_detection_transforms import OpenImagesDetectionTrainingTransforms, \
     OpenImagesDetectionValidationTransforms
 from object_detection.datasets.object_detection_open_images import ObjectDetectionOpenImages
+
+from object_detection.datasets.object_detection_objects365 import ObjectDetectionObjects365, \
+    COCO_OBJECTS365_CLASS_INDEXES
+from object_detection.datasets.objects365_detection_transforms import Objects365DetectionValidationTransforms
diff --git a/tools/dnn_training/object_detection/datasets/coco_detection_transforms.py b/tools/dnn_training/object_detection/datasets/coco_detection_transforms.py
index 0dce10ac..67714186 100644
--- a/tools/dnn_training/object_detection/datasets/coco_detection_transforms.py
+++ b/tools/dnn_training/object_detection/datasets/coco_detection_transforms.py
@@ -77,7 +77,7 @@ def _hflip_bbox(target, image_size):
         annotation[0] = image_size[1] - center_x
 
 
-def _convert_bbox_to_yolo(target, scale, image_size, one_hot_class):
+def _convert_bbox_to_yolo(target, scale, image_size, offset_x, offset_y, one_hot_class):
     if one_hot_class:
         class_count = len(CATEGORY_ID_TO_CLASS_INDEX_MAPPING)
         converted_target = {'bbox': torch.zeros(len(target), 4, dtype=torch.float),
@@ -97,8 +97,8 @@ def _convert_bbox_to_yolo(target, scale, image_size, one_hot_class):
         w = min([w, image_size[1] - x])
         h = min([h, image_size[0] - y])
 
-        center_x = x + w / 2
-        center_y = y + h / 2
+        center_x = x + w / 2 + offset_x
+        center_y = y + h / 2 + offset_y
 
         converted_target['bbox'][i] = torch.tensor([center_x, center_y, w, h], dtype=torch.float)
         if one_hot_class:
@@ -125,7 +125,7 @@ def __call__(self, image, target):
         image, target = _random_crop(image, target)
 
         resized_image, scale, offset_x, offset_y = _resize_image(image, self._image_size)
-        target = _convert_bbox_to_yolo(target, scale, self._image_size, self._one_hot_class)
+        target = _convert_bbox_to_yolo(target, scale, self._image_size, offset_x, offset_y, self._one_hot_class)
 
         if random.random() < self._horizontal_flip_p:
             resized_image = F.hflip(resized_image)
@@ -151,7 +151,7 @@ def __call__(self, image, target):
         resized_image_tensor = F.to_tensor(resized_image)
 
         if target is not None:
-            target = _convert_bbox_to_yolo(target, scale, self._image_size, self._one_hot_class)
+            target = _convert_bbox_to_yolo(target, scale, self._image_size, offset_x, offset_y, self._one_hot_class)
 
         metadata = {
             'scale': scale,
diff --git a/tools/dnn_training/object_detection/datasets/object_detection_objects365.py b/tools/dnn_training/object_detection/datasets/object_detection_objects365.py
new file mode 100644
index 00000000..a31d7c65
--- /dev/null
+++ b/tools/dnn_training/object_detection/datasets/object_detection_objects365.py
@@ -0,0 +1,89 @@
+import os
+from collections import defaultdict
+
+import torch
+from PIL import Image
+
+CLASS_COUNT = 365
+COCO_OBJECTS365_CLASS_INDEXES = {0, 46, 5, 58, 114, 55, 116, 65, 21, 40, 176, 127, 249, 24, 56, 139, 92, 78, 99, 96,
+                                 144, 295, 178, 180, 38, 39, 13, 43, 194, 219, 119, 173, 154, 137, 113, 145, 146, 204,
+                                 8, 35, 10, 88, 84, 93, 26, 112, 82, 265, 104, 141, 152, 234, 143, 150, 97, 2, 50, 25,
+                                 75, 98, 153, 37, 73, 115, 132, 106, 64, 163, 149, 277, 81, 133, 18, 94, 30, 169, 328,
+                                 226, 239, 156, 165, 177, 206}
+
+
+class ObjectDetectionObjects365(torch.utils.data.Dataset):
+    def __init__(self, root, split='training', transforms=None, ignored_classes=None):
+        if ignored_classes is None:
+            ignored_classes = set()
+        else:
+            ignored_classes = set(ignored_classes)
+
+        if split == 'training':
+            self._image_root = os.path.join(root, 'images', 'train')
+            self._label_root = os.path.join(root, 'labels', 'train')
+        elif split == 'validation':
+            self._image_root = os.path.join(root, 'images', 'val')
+            self._label_root = os.path.join(root, 'labels', 'val')
+        else:
+            raise ValueError('Invalid split')
+
+        self._image_files, self._bboxes = self._list_images(self._image_root, self._label_root, ignored_classes)
+        self._transforms = transforms
+
+    def _list_images(self, image_path, label_path, ignored_classes):
+        image_files = os.listdir(image_path)
+        bboxes = defaultdict(list)
+
+        for image_file in image_files:
+            with open(os.path.join(label_path, os.path.splitext(image_file)[0] + '.txt'), 'r') as f:
+                for line in f:
+                    values = line.split(' ')
+                    class_index = int(values[0])
+                    if class_index in ignored_classes:
+                        continue
+
+                    x_center = float(values[1])
+                    y_center = float(values[2])
+                    width = float(values[3])
+                    height = float(values[4])
+
+                    bboxes[image_file].append({
+                        'class_index': class_index,
+                        'x_center': x_center,
+                        'y_center': y_center,
+                        'width': width,
+                        'height': height
+                    })
+
+        return image_files, bboxes
+
+    def __len__(self):
+        return len(self._image_files)
+
+    def __getitem__(self, index):
+        image_file = self._image_files[index]
+        image = Image.open(os.path.join(self._image_root, image_file)).convert('RGB')
+
+        initial_width, initial_height = image.size
+
+        target = []
+        for i in range(len(self._bboxes[image_file])):
+            target.append({
+                'class_index': self._bboxes[image_file][i]['class_index'],
+                'x_center': self._bboxes[image_file][i]['x_center'] * initial_width,
+                'y_center': self._bboxes[image_file][i]['y_center'] * initial_height,
+                'width': self._bboxes[image_file][i]['width'] * initial_width,
+                'height': self._bboxes[image_file][i]['height'] * initial_height
+            })
+
+        image, target, transforms_metadata = self._transforms(image, target)
+        metadata = {
+            'initial_width': initial_width,
+            'initial_height': initial_height,
+            'scale': transforms_metadata['scale'],
+            'offset_x': transforms_metadata['offset_x'],
+            'offset_y': transforms_metadata['offset_y']
+        }
+
+        return image, target, metadata
diff --git a/tools/dnn_training/object_detection/datasets/objects365_detection_transforms.py b/tools/dnn_training/object_detection/datasets/objects365_detection_transforms.py
new file mode 100644
index 00000000..37312ba9
--- /dev/null
+++ b/tools/dnn_training/object_detection/datasets/objects365_detection_transforms.py
@@ -0,0 +1,46 @@
+import torch
+import torchvision.transforms.functional as F
+
+from object_detection.datasets.coco_detection_transforms import _resize_image
+from object_detection.datasets.object_detection_objects365 import CLASS_COUNT
+
+
+def _convert_bbox_to_yolo(target, scale, offset_x, offset_y, one_hot_class):
+    if one_hot_class:
+        converted_target = {'bbox': torch.zeros(len(target), 4, dtype=torch.float),
+                            'class': torch.zeros(len(target), CLASS_COUNT, dtype=torch.float)}
+    else:
+        converted_target = {'bbox': torch.zeros(len(target), 4, dtype=torch.float),
+                            'class': torch.zeros(len(target), dtype=torch.long)}
+
+    for i in range(len(target)):
+        converted_target['bbox'][i] = torch.tensor([target[i]['x_center'] * scale + offset_x,
+                                                    target[i]['y_center'] * scale + offset_y,
+                                                    target[i]['width'] * scale,
+                                                    target[i]['height'] * scale], dtype=torch.float)
+        if one_hot_class:
+            converted_target['class'][i, target[i]['class_index']] = 1.0
+        else:
+            converted_target['class'][i] = target[i]['class_index']
+
+    return converted_target
+
+
+class Objects365DetectionValidationTransforms:
+    def __init__(self, image_size, one_hot_class):
+        self._image_size = image_size
+        self._one_hot_class = one_hot_class
+
+    def __call__(self, image, target):
+        resized_image, scale, offset_x, offset_y = _resize_image(image, self._image_size)
+        resized_image_tensor = F.to_tensor(resized_image)
+
+        if target is not None:
+            target = _convert_bbox_to_yolo(target, scale, offset_x, offset_y, self._one_hot_class)
+
+        metadata = {
+            'scale': scale,
+            'offset_x': offset_x,
+            'offset_y': offset_y
+        }
+        return resized_image_tensor, target, metadata
diff --git a/tools/dnn_training/object_detection/descriptor_yolo_v7.py b/tools/dnn_training/object_detection/descriptor_yolo_v7.py
index 555435c2..edcdeab7 100644
--- a/tools/dnn_training/object_detection/descriptor_yolo_v7.py
+++ b/tools/dnn_training/object_detection/descriptor_yolo_v7.py
@@ -5,6 +5,8 @@
 import torch
 import torch.nn as nn
 
+from common.modules import NormalizedLinear
+
 from object_detection.modules.descriptor_yolo_layer import DescriptorYoloV7Layer
 from object_detection.modules.yolo_v7_modules import YoloV7SPPCSPC, RepConv
 
@@ -459,7 +461,7 @@ def __init__(self, class_count=80, embedding_size=128, class_probs=False):
             DescriptorYoloV7Layer(IMAGE_SIZE, 32, self._anchors[2], embedding_size)
         )
 
-        self._classifier = nn.Linear(embedding_size, class_count, bias=False)
+        self._classifier = NormalizedLinear(embedding_size, class_count)
         self._class_probs = class_probs
 
     def get_image_size(self):
diff --git a/tools/dnn_training/test_descriptor_yolo_v7.py b/tools/dnn_training/test_descriptor_yolo_v7.py
new file mode 100644
index 00000000..7c49f469
--- /dev/null
+++ b/tools/dnn_training/test_descriptor_yolo_v7.py
@@ -0,0 +1,171 @@
+import argparse
+import os
+
+
+import numpy as np
+
+import torch
+
+from tqdm import tqdm
+
+from common.metrics import RocDistancesThresholdsEvaluation
+from common.modules import load_checkpoint
+
+from object_detection.criterions.yolo_v4_loss import calculate_iou
+from object_detection.datasets import CocoDetectionValidationTransforms, ObjectDetectionCoco
+from object_detection.datasets import Objects365DetectionValidationTransforms, ObjectDetectionObjects365, \
+    COCO_OBJECTS365_CLASS_INDEXES
+from object_detection.descriptor_yolo_v7 import DescriptorYoloV7
+from object_detection.datasets.object_detection_coco import CLASS_COUNT
+from object_detection.filter_yolo_predictions import group_predictions, filter_yolo_predictions
+
+
+COMPARABLE_CONFIDENCE_THRESHOLD = 0.01
+NOT_COMPARABLE_CONFIDENCE_THRESHOLD = 0.25
+NMS_THRESHOLD = 0.45
+NOT_COMPARABLE_IOU_THRESHOLD = 0.5
+
+
+class CocoDescriptorEvaluation(RocDistancesThresholdsEvaluation):
+    def __init__(self, embeddings_class_pairs, interval, output_path):
+        super(CocoDescriptorEvaluation, self).__init__(output_path, thresholds=np.arange(0, 2, 0.0001))
+        self._embeddings = torch.stack([p[0] for p in embeddings_class_pairs], dim=0).half()
+        self._classes = torch.stack([p[1] for p in embeddings_class_pairs], dim=0).to(torch.int16)
+        self._interval = interval
+
+        if self._embeddings.device.type == 'cuda':
+            self._embeddings = self._embeddings.half()
+
+    def _calculate_distances(self):
+        N = self._embeddings.size(0)
+        distances = torch.zeros(self._calculate_pair_count(N),
+                                dtype=self._embeddings.dtype,
+                                device=self._embeddings.device)
+
+        k = 0
+        for i in range(N):
+            others = self._embeddings[i + 1::self._interval]
+            distances[k:k + others.size(0)] = (self._embeddings[i].repeat(others.size(0), 1) - others).pow(2).sum(dim=1).sqrt()
+            k += others.size(0)
+
+        torch.cuda.empty_cache()
+        return distances[::self._interval]
+
+    def _get_is_same_person_target(self):
+        N = self._classes.size(0)
+        is_same_person_target = torch.zeros(self._calculate_pair_count(N),
+                                            dtype=torch.bool,
+                                            device=self._classes.device)
+
+        k = 0
+        for i in range(N):
+            others = self._classes[i + 1::self._interval]
+            is_same_person_target[k:k + others.size(0)] = self._classes[i] == others
+            k += others.size(0)
+
+        torch.cuda.empty_cache()
+        return is_same_person_target[::self._interval]
+
+    def _calculate_pair_count(self, N):
+        c = 0
+        for i in range(N):
+            c += self._embeddings[i + 1::self._interval].size(0)
+
+        return c
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Test the specified descriptor yolo model')
+    parser.add_argument('--use_gpu', action='store_true', help='Use the GPU')
+    parser.add_argument('--embedding_size', type=int, help='Choose the embedding size', required=True)
+    parser.add_argument('--checkpoint', type=str, help='Choose the checkpoint file path', required=True)
+    parser.add_argument('--dataset_root', type=str, help='Choose the dataset root path', required=True)
+    parser.add_argument('--dataset_type', type=str, choices=['coco', 'objects365'], help='Choose the dataset type',
+                        required=True)
+    parser.add_argument('--comparable', action='store_true', help='Enable comparable results')
+    parser.add_argument('--output_path', type=str, help='Choose the output path', required=True)
+
+    args = parser.parse_args()
+
+    device = torch.device('cuda' if torch.cuda.is_available() and args.use_gpu else 'cpu')
+
+    model = DescriptorYoloV7(CLASS_COUNT, embedding_size=args.embedding_size, class_probs=False)
+    load_checkpoint(model, args.checkpoint)
+
+    if args.dataset_type == 'coco':
+        transforms = CocoDetectionValidationTransforms(model.get_image_size(), one_hot_class=False)
+        dataset = ObjectDetectionCoco(os.path.join(args.dataset_root, 'val2017'),
+                                      os.path.join(args.dataset_root, 'instances_val2017.json'),
+                                      transforms)
+        interval = 2 if args.comparable else 1
+    elif args.dataset_type == 'objects365':
+        transforms = Objects365DetectionValidationTransforms(model.get_image_size(), one_hot_class=False)
+        dataset = ObjectDetectionObjects365(os.path.join(args.dataset_root),
+                                            split='validation',
+                                            transforms=transforms,
+                                            ignored_classes=COCO_OBJECTS365_CLASS_INDEXES)
+        interval = 1000 if args.comparable else 30
+    else:
+        raise ValueError(f'Invalid dataset ({args.dataset_type})')
+
+    os.makedirs(args.output_path, exist_ok=True)
+
+
+    evaluate(model, args.embedding_size, dataset, device, args.comparable, interval, args.output_path)
+
+
+def evaluate(model, embedding_size, dataset, device, comparable, interval, output_path):
+    model = model.to(device)
+    model.eval()
+
+    embeddings_class_pairs = []
+
+    bbox_count = 0
+    with torch.no_grad():
+        for image, target, metadata in tqdm(dataset):
+            target['bbox'] = target['bbox'].to(device)
+            target['class'] = target['class'].to(device)
+
+            bbox_count += target['bbox'].size(0)
+            embeddings_class_pairs.extend(
+                compute_embedding(model, embedding_size, image.to(device), target, comparable))
+
+            torch.cuda.empty_cache()
+
+    print(f'{len(embeddings_class_pairs)} boxes out of {bbox_count} detected')
+    coco_descriptor_evaluation = CocoDescriptorEvaluation(embeddings_class_pairs, interval, output_path)
+    coco_descriptor_evaluation.evaluate()
+
+
+def compute_embedding(model, embedding_size, image_tensor, target, comparable):
+    predictions = model(image_tensor.unsqueeze(0))
+    predictions = group_predictions(predictions)[0]
+    C = predictions.size(1)
+    predictions = filter_yolo_predictions(predictions,
+                                          confidence_threshold=COMPARABLE_CONFIDENCE_THRESHOLD if comparable else NOT_COMPARABLE_CONFIDENCE_THRESHOLD,
+                                          nms_threshold=NMS_THRESHOLD)
+
+    if len(predictions) == 0:
+        print('Warning: No predictions found')
+        predicted_boxes = torch.zeros(1, C).to(image_tensor.device)
+    else:
+        predicted_boxes = torch.stack(predictions, dim=0)
+
+    embeddings_class_pairs = []
+
+    for i in range(target['bbox'].size(0)):
+        target_box = target['bbox'][i]
+        target_class = target['class'][i]
+
+        ious = calculate_iou(predicted_boxes[:, :4], target_box.repeat(len(predicted_boxes), 1))
+        best_index = ious.argmax()
+        best_predicted_box = predicted_boxes[best_index]
+
+        if comparable or ious[best_index] > NOT_COMPARABLE_IOU_THRESHOLD:
+            embeddings_class_pairs.append((best_predicted_box[-embedding_size:], target_class))
+
+    return embeddings_class_pairs
+
+
+if __name__ == '__main__':
+    main()
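
Usage note (not part of the patch): a typical invocation of the new evaluation script might look like the example below, assuming it is run from tools/dnn_training so the common and object_detection packages are importable. All flags come from the script's argument parser; the checkpoint, dataset and output paths are placeholders, and --embedding_size must match the checkpoint being tested.

    python test_descriptor_yolo_v7.py \
        --use_gpu \
        --embedding_size 128 \
        --checkpoint <path/to/descriptor_yolo_v7_checkpoint> \
        --dataset_root <path/to/objects365> \
        --dataset_type objects365 \
        --comparable \
        --output_path <path/to/evaluation_output>

With --dataset_type objects365, the script evaluates on the Objects365 validation split while ignoring the classes listed in COCO_OBJECTS365_CLASS_INDEXES; dropping --comparable switches to the stricter confidence threshold, applies the IoU filter on matched boxes, and uses a smaller sampling interval.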