diff --git a/src/Dockerfile b/src/Dockerfile new file mode 100755 index 0000000..1e4053b --- /dev/null +++ b/src/Dockerfile @@ -0,0 +1,3 @@ +FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime +RUN pip install pytorch-lightning==1.3.8 yacs==0.1.8 pandas==1.3.0 numpy==1.21 iopath==0.1.9 --no-cache-dir + diff --git a/src/checkpoints/README.md b/src/checkpoints/README.md new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/src/checkpoints/README.md @@ -0,0 +1 @@ + diff --git a/src/conf/eev.yaml b/src/conf/eev.yaml new file mode 100755 index 0000000..e5bc950 --- /dev/null +++ b/src/conf/eev.yaml @@ -0,0 +1,31 @@ +MODEL: + TEMPORAL_TYPE: 'tcn' + USE_POSITION: True + FC_HIDDEN: 128 + FEATURES: ['effb0', ] +TCN: + NUM_STACK: 2 + NUM_CHANNELS: 512 + DILATIONS: 4 + K_SIZE: 3 + DROPOUT: 0.3 + NORM: False +OPTIM: + BASE_LR: 0.005 + MAX_EPOCH: 20 + LR_POLICY: 'none' + USE_SWA: True +TRAIN: + ACCUM_GRAD_BATCHES: 32 +TEST: + WEIGHTS: '' +DATA_LOADER: + DATA_DIR: '/mnt/sXProject/EvokedExpression/' + NUM_WORKERS: 20 + PIN_MEMORY: False + +VERBOSE: True +OUT_DIR: './tmp' +RNG_SEED: 1 +FAST_DEV_RUN: 2 +LOGGER: "TensorBoard" diff --git a/src/conf/eev_audio.yaml b/src/conf/eev_audio.yaml new file mode 100755 index 0000000..1aec492 --- /dev/null +++ b/src/conf/eev_audio.yaml @@ -0,0 +1,36 @@ +MODEL: + TEMPORAL_TYPE: 'tcn' + USE_POSITION: True + FC_HIDDEN: 128 + FEATURES: ['audio', ] +TCN: + NUM_STACK: 2 + NUM_CHANNELS: 512 + DILATIONS: 4 + K_SIZE: 3 + DROPOUT: 0.3 + NORM: False +LSTM: + HIDDEN_SIZE: 512 + NUM_LAYERS: 2 + BIDIREC: False + DROPOUT: 0.3 +OPTIM: + BASE_LR: 0.005 + MAX_EPOCH: 20 + LR_POLICY: 'none' + USE_SWA: True +TRAIN: + ACCUM_GRAD_BATCHES: 32 +TEST: + WEIGHTS: '' +DATA_LOADER: + DATA_DIR: '/mnt/sXProject/EvokedExpression/dataset/eev2021/' + NUM_WORKERS: 20 + PIN_MEMORY: False + +VERBOSE: True +OUT_DIR: './tmp' +RNG_SEED: 1 +FAST_DEV_RUN: 2 +LOGGER: "TensorBoard" diff --git a/src/conf/eev_audio_mediaeval18.yaml b/src/conf/eev_audio_mediaeval18.yaml new file mode 100755 index 0000000..2c5571c --- /dev/null +++ b/src/conf/eev_audio_mediaeval18.yaml @@ -0,0 +1,37 @@ +MODEL: + TEMPORAL_TYPE: 'tcn' + USE_POSITION: True + FC_HIDDEN: 128 + FEATURES: ['audio', ] +TCN: + NUM_STACK: 2 + NUM_CHANNELS: 512 + DILATIONS: 4 + K_SIZE: 3 + DROPOUT: 0.3 + NORM: False +LSTM: + HIDDEN_SIZE: 512 + NUM_LAYERS: 2 + BIDIREC: False + DROPOUT: 0.3 +OPTIM: + BASE_LR: 0.005 + MAX_EPOCH: 20 + LR_POLICY: 'none' + USE_SWA: True +TRAIN: + ACCUM_GRAD_BATCHES: 32 +TEST: + WEIGHTS: '' +DATA_LOADER: + DATA_DIR: '/mnt/sXProject/EvokedExpression/dataset/mediaeval18/' + NUM_WORKERS: 20 + PIN_MEMORY: False + +DATA_NAME: 'mediaeval18' +VERBOSE: True +OUT_DIR: './tmp' +RNG_SEED: 1 +FAST_DEV_RUN: 2 +LOGGER: "TensorBoard" diff --git a/src/conf/eev_effb0.yaml b/src/conf/eev_effb0.yaml new file mode 100755 index 0000000..18ade24 --- /dev/null +++ b/src/conf/eev_effb0.yaml @@ -0,0 +1,36 @@ +MODEL: + TEMPORAL_TYPE: 'tcn' + USE_POSITION: True + FC_HIDDEN: 128 + FEATURES: ['effb0', ] +TCN: + NUM_STACK: 2 + NUM_CHANNELS: 512 + DILATIONS: 4 + K_SIZE: 3 + DROPOUT: 0.3 + NORM: False +LSTM: + HIDDEN_SIZE: 512 + NUM_LAYERS: 2 + BIDIREC: False + DROPOUT: 0.3 +OPTIM: + BASE_LR: 0.005 + MAX_EPOCH: 20 + LR_POLICY: 'none' + USE_SWA: True +TRAIN: + ACCUM_GRAD_BATCHES: 32 +TEST: + WEIGHTS: '' +DATA_LOADER: + DATA_DIR: '/mnt/sXProject/EvokedExpression/dataset/eev2021/' + NUM_WORKERS: 20 + PIN_MEMORY: False + +VERBOSE: True +OUT_DIR: './tmp' +RNG_SEED: 1 +FAST_DEV_RUN: 2 +LOGGER: "TensorBoard" diff --git a/src/conf/eev_effb0_mediaeval18.yaml 
b/src/conf/eev_effb0_mediaeval18.yaml new file mode 100755 index 0000000..427c217 --- /dev/null +++ b/src/conf/eev_effb0_mediaeval18.yaml @@ -0,0 +1,37 @@ +MODEL: + TEMPORAL_TYPE: 'tcn' + USE_POSITION: True + FC_HIDDEN: 128 + FEATURES: ['effb0', ] +TCN: + NUM_STACK: 2 + NUM_CHANNELS: 512 + DILATIONS: 4 + K_SIZE: 3 + DROPOUT: 0.3 + NORM: False +LSTM: + HIDDEN_SIZE: 512 + NUM_LAYERS: 2 + BIDIREC: False + DROPOUT: 0.3 +OPTIM: + BASE_LR: 0.005 + MAX_EPOCH: 20 + LR_POLICY: 'none' + USE_SWA: True +TRAIN: + ACCUM_GRAD_BATCHES: 32 +TEST: + WEIGHTS: '' +DATA_LOADER: + DATA_DIR: '/mnt/sXProject/EvokedExpression/dataset/mediaeval18/' + NUM_WORKERS: 20 + PIN_MEMORY: False + +DATA_NAME: 'mediaeval18' +VERBOSE: True +OUT_DIR: './tmp' +RNG_SEED: 1 +FAST_DEV_RUN: 2 +LOGGER: "TensorBoard" diff --git a/src/conf/eev_multi_mediaeval18.yaml b/src/conf/eev_multi_mediaeval18.yaml new file mode 100755 index 0000000..41a8e0d --- /dev/null +++ b/src/conf/eev_multi_mediaeval18.yaml @@ -0,0 +1,37 @@ +MODEL: + TEMPORAL_TYPE: 'tcn' + USE_POSITION: True + FC_HIDDEN: 128 + FEATURES: ['audio', 'effb0'] +TCN: + NUM_STACK: 2 + NUM_CHANNELS: 512 + DILATIONS: 4 + K_SIZE: 3 + DROPOUT: 0.3 + NORM: False +LSTM: + HIDDEN_SIZE: 512 + NUM_LAYERS: 2 + BIDIREC: False + DROPOUT: 0.3 +OPTIM: + BASE_LR: 0.005 + MAX_EPOCH: 20 + LR_POLICY: 'none' + USE_SWA: True +TRAIN: + ACCUM_GRAD_BATCHES: 32 +TEST: + WEIGHTS: '' +DATA_LOADER: + DATA_DIR: '/mnt/sXProject/EvokedExpression/dataset/mediaeval18/' + NUM_WORKERS: 20 + PIN_MEMORY: False + +DATA_NAME: 'mediaeval18' +VERBOSE: True +OUT_DIR: './tmp' +RNG_SEED: 1 +FAST_DEV_RUN: 2 +LOGGER: "TensorBoard" diff --git a/src/core/__init__.py b/src/core/__init__.py new file mode 100755 index 0000000..0ad619c --- /dev/null +++ b/src/core/__init__.py @@ -0,0 +1,2 @@ +from .models import EEVModel +from .eev_data import EEVDataModule \ No newline at end of file diff --git a/src/core/config.py b/src/core/config.py new file mode 100755 index 0000000..2f122af --- /dev/null +++ b/src/core/config.py @@ -0,0 +1,276 @@ +""" +Original source: https://github.com/facebookresearch/pycls/blob/master/pycls/core/config.py +Latest commit 2c152a6 on May 6, 2021 +""" + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +"""Configuration file (powered by YACS).""" + +import argparse +import os +import sys + +from .io import cache_url, pathmgr +from yacs.config import CfgNode + +# Global config object (example usage: from core.config import cfg) +_C = CfgNode() +cfg = _C + +# ---------------------------------- Model options ----------------------------------- # +_C.MODEL = CfgNode() + +# Number of classes +_C.MODEL.NUM_CLASSES = 15 + +# Loss function (see pycls/models/loss.py for options) +_C.MODEL.LOSS_FUN = "mse" + +# Number of hidden units in last layers +_C.MODEL.FC_HIDDEN = 32 + +# Temporal model type +_C.MODEL.TEMPORAL_TYPE = 'tcn' # tcn or lstm + +# Use position or not +_C.MODEL.USE_POSITION = True + +# List of pre-trained features +_C.MODEL.FEATURES = ['effb0'] +# ------------------------------- TCN options ------------------------------- # +_C.TCN = CfgNode() + +_C.TCN.NUM_CHANNELS = 512 +# TCN channels +_C.TCN.NUM_STACK = 2 + +# TCN Dilations +_C.TCN.DILATIONS = 4 + +# TCN Kernel size +_C.TCN.K_SIZE = 3 + +# TCN Dropout +_C.TCN.DROPOUT = 0. 
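The YAML files under src/conf/ override these YACS defaults rather than replacing them. A minimal sketch of that override flow (the file path and the override list are examples, not taken from this diff):

    from core.config import cfg

    # Values from the YAML replace matching defaults; unknown keys raise an error.
    cfg.merge_from_file('conf/eev.yaml')
    # KEY VALUE pairs, the same form that load_cfg_fom_args() accepts on the command line.
    cfg.merge_from_list(['OPTIM.BASE_LR', 0.01, 'TCN.NUM_STACK', 3])
    cfg.freeze()
    print(cfg.MODEL.FEATURES, cfg.TCN.NUM_CHANNELS)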
+ +# Use WeightNorm in TCN or not +_C.TCN.NORM = True + +# Number of temporal module (head) +_C.TCN.NUM_HEAD = 1 + +# ------------------------------- LSTM options ------------------------------- # +_C.LSTM = CfgNode() + +# LSTM HIDDEN_SIZE +_C.LSTM.HIDDEN_SIZE = 64 + +# LSTM Num layers +_C.LSTM.NUM_LAYERS = 4 + +# LSTM Bidirectional or not +_C.LSTM.BIDIREC = False + +# LSTM Dropout +_C.LSTM.DROPOUT = 0. + +# -------------------------------- Optimizer options --------------------------------- # +_C.OPTIM = CfgNode() + +# Learning rate ranges from BASE_LR to MIN_LR*BASE_LR according to the LR_POLICY +_C.OPTIM.BASE_LR = 0.1 +_C.OPTIM.MIN_LR = 0.0 + +# Learning rate policy select from {'cos', 'exp', 'lin', 'steps'} +_C.OPTIM.LR_POLICY = "cos" + +# Steps for 'steps' policy (in epochs) +_C.OPTIM.STEPS = [] + +# Learning rate multiplier for 'steps' policy +_C.OPTIM.LR_MULT = 0.1 + +# Maximal number of epochs +_C.OPTIM.MAX_EPOCH = 200 + +# Momentum +_C.OPTIM.MOMENTUM = 0.9 + +# Momentum dampening +_C.OPTIM.DAMPENING = 0.0 + +# Nesterov momentum +_C.OPTIM.NESTEROV = True + +# L2 regularization +_C.OPTIM.WEIGHT_DECAY = 5e-4 + +# Start the warm up from OPTIM.BASE_LR * OPTIM.WARMUP_FACTOR +_C.OPTIM.WARMUP_FACTOR = 0.1 + +# Gradually warm up the OPTIM.BASE_LR over this number of epochs +_C.OPTIM.WARMUP_EPOCHS = 0 + +# Exponential Moving Average (EMA) update value +_C.OPTIM.EMA_ALPHA = 1e-5 + +# Iteration frequency with which to update EMA weights +_C.OPTIM.EMA_UPDATE_PERIOD = 32 + +# Use swa or not +_C.OPTIM.USE_SWA = True +# --------------------------------- Training options --------------------------------- # +_C.TRAIN = CfgNode() + +# Dataset and split +_C.TRAIN.DATASET = "" +_C.TRAIN.SPLIT = "train" + +# Total mini-batch size +_C.TRAIN.BATCH_SIZE = 1 + +# If True train using mixed precision +_C.TRAIN.MIXED_PRECISION = False + +# Accumulated gradients runs K small batches of size N before doing a backwards pass +_C.TRAIN.ACCUM_GRAD_BATCHES = 1 + +# Resume training from the latest checkpoint in the output directory +_C.TRAIN.AUTO_RESUME = True + +_C.TRAIN.DROP_PERC = 0.3 + +# Weights to start training from +_C.TRAIN.WEIGHTS = "" +# --------------------------------- Testing options ---------------------------------- # +_C.TEST = CfgNode() + +# Dataset and split +_C.TEST.DATASET = "" +_C.TEST.SPLIT = "val" + +# Total mini-batch size +_C.TEST.BATCH_SIZE = 1 + +# Weights to use for testing +_C.TEST.WEIGHTS = "" + +# ------------------------------- Data loader options -------------------------------- # +_C.DATA_LOADER = CfgNode() + +# Number of data loader workers per process +_C.DATA_LOADER.NUM_WORKERS = 8 + +# Load data to pinned host memory +_C.DATA_LOADER.PIN_MEMORY = False + +# ROOT of DATASET +_C.DATA_LOADER.DATA_DIR = '/mnt/XProject/EvokedExpression/dataset' + +_C.DATA_LOADER.EMO_INDEX = -1 + + +_C.DATA_NAME = 'eev' +# ---------------------------------- CUDNN options ----------------------------------- # +_C.CUDNN = CfgNode() + +# Perform benchmarking to select fastest CUDNN algorithms (best for fixed input sizes) +_C.CUDNN.BENCHMARK = True + +# ----------------------------------- Misc options ----------------------------------- # +# Optional description of a config +_C.DESC = "" + +# If True output additional info to log +_C.VERBOSE = True + +# Number of GPUs to use (applies to both training and testing) +_C.NUM_GPUS = 1 + +# Output directory +_C.OUT_DIR = "./tmp" + +# Config destination (in OUT_DIR) +_C.CFG_DEST = "config.yaml" + +# Note that non-determinism is still be present due to 
non-deterministic GPU ops +_C.RNG_SEED = 1 + +# Log destination ('stdout' or 'file') +_C.LOG_DEST = "stdout" + +# Log period in iters +_C.LOG_PERIOD = 10 + +# Logger (wandb or TensorBoard) +_C.LOGGER = "TensorBoard" + +# Models weights referred to by URL are downloaded to this local cache +_C.DOWNLOAD_CACHE = "/tmp/pycls-download-cache" + +# Fast dev run, > 0 run fast dev only for check training/validation logic +_C.FAST_DEV_RUN = 0 +# ---------------------------------- Default config ---------------------------------- # +_CFG_DEFAULT = _C.clone() +_CFG_DEFAULT.freeze() + + +def assert_and_infer_cfg(cache_urls=True): + """Checks config values invariants.""" + err_str = "The first lr step must start at 0" + assert not _C.OPTIM.STEPS or _C.OPTIM.STEPS[0] == 0, err_str + data_splits = ["train", "val", "test"] + err_str = "Data split '{}' not supported" + assert _C.TRAIN.SPLIT in data_splits, err_str.format(_C.TRAIN.SPLIT) + assert _C.TEST.SPLIT in data_splits, err_str.format(_C.TEST.SPLIT) + err_str = "Mini-batch size should be a multiple of NUM_GPUS." + assert _C.TRAIN.BATCH_SIZE % _C.NUM_GPUS == 0, err_str + assert _C.TEST.BATCH_SIZE % _C.NUM_GPUS == 0, err_str + err_str = "Log destination '{}' not supported" + assert _C.LOG_DEST in ["stdout", "file"], err_str.format(_C.LOG_DEST) + if cache_urls: + cache_cfg_urls() + + +def cache_cfg_urls(): + """Download URLs in config, cache them, and rewrite cfg to use cached file.""" + _C.TRAIN.WEIGHTS = cache_url(_C.TRAIN.WEIGHTS, _C.DOWNLOAD_CACHE) + _C.TEST.WEIGHTS = cache_url(_C.TEST.WEIGHTS, _C.DOWNLOAD_CACHE) + + +def dump_cfg(): + """Dumps the config to the output directory.""" + cfg_file = os.path.join(_C.OUT_DIR, '{}_{}{}'.format(_C.CFG_DEST[:-5], '-'.join(_C.MODEL.FEATURES), _C.CFG_DEST[-5:])) + with pathmgr.open(cfg_file, "w") as f: + _C.dump(stream=f) + return cfg_file + + +def load_cfg(cfg_file): + """Loads config from specified file.""" + with pathmgr.open(cfg_file, "r") as f: + _C.merge_from_other_cfg(_C.load_cfg(f)) + + +def reset_cfg(): + """Reset config to initial state.""" + _C.merge_from_other_cfg(_CFG_DEFAULT) + + +def load_cfg_fom_args(description="Config file options."): + """Load config from command line arguments and set any specified options.""" + parser = argparse.ArgumentParser(description=description) + help_s = "Config file location" + parser.add_argument("--cfg", dest="cfg_file", help=help_s, required=True, type=str) + help_s = "See pycls/core/config.py for all options" + parser.add_argument("opts", help=help_s, default=None, nargs=argparse.REMAINDER) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + args = parser.parse_args() + load_cfg(args.cfg_file) + _C.merge_from_list(args.opts) diff --git a/src/core/eev_data.py b/src/core/eev_data.py new file mode 100755 index 0000000..64a5f8c --- /dev/null +++ b/src/core/eev_data.py @@ -0,0 +1,171 @@ +""" +Author: HuynhVanThong +Department of AI Convergence, Chonnam Natl. Univ. 
+""" +import sys +from copy import deepcopy + +import pytorch_lightning as pl +from torch.utils import data +import numpy as np +import pandas as pd +import torch +from torchvision import transforms +from core.config import cfg + + +class ToTensor(object): + """ Convert ndarrays in sample to Tensors""" + + def __call__(self, sample): + transforms_sample = {} + for sp_key in sample.keys(): + if sp_key == 'file_id': + transforms_sample[sp_key] = sample[sp_key] + elif sp_key == 'timestamps': + transforms_sample[sp_key] = torch.from_numpy(sample[sp_key]).type(torch.LongTensor), + else: + transforms_sample[sp_key] = torch.from_numpy(sample[sp_key]).type(torch.FloatTensor) + + return transforms_sample + + +class TimeDropout(object): + """ Timestamp dropout """ + def __init__(self, drop_perc=0.3): + self.drop_perc = drop_perc + + def __call__(self, sample): + random_drop_perc = np.random.rand() + mask = np.ones(sample['scores'].shape[0], dtype=np.bool) + + if random_drop_perc <= self.drop_perc: + drop_index = np.random.choice(len(mask), size=int(random_drop_perc * len(mask)), replace=False) + mask[drop_index] = False + + for sp_key in sample.keys(): + if sp_key != 'file_id': + sample[sp_key] = sample[sp_key][mask] if sample[sp_key].ndim == 1 else sample[sp_key][mask, :] + + return sample + +class EEVDataset(data.Dataset): + def __init__(self, root_path='/mnt/Work/Dataset/EEV/', split='train', features=('resnet',), emotion_index=-1, + transforms=None, save_pt=False, use_position=True, dataset='eev', drop_perc=0.3): + + self.dataset_name = dataset + self.save_pt = save_pt + self.features = features + self.emotion_index = emotion_index + self.root_path = root_path + self.root_path_npy = self.root_path + self.split = split + self.transforms = transforms + self.use_position = use_position + self.drop_perc = drop_perc + if not self.use_position: + print('Do not use position encoding.') + if split not in ['train', 'val', 'test']: + raise ValueError('Do not support {} split for EEV dataset'.format(split)) + + if self.dataset_name == 'eev': + data_csv = pd.read_csv('{}/features_v2/{}.csv'.format(self.root_path, self.split)) + + id_header = 'Video ID' if split == 'test' else 'YouTube ID' + + excluded_ids = np.loadtxt('excluded_files.txt', dtype=str) + self.video_ids = list(set(data_csv[id_header].unique()) - set(excluded_ids)) + else: + video_ids = np.loadtxt('{}features_v2/{}.txt'.format(self.root_path, self.split), dtype=str) + self.video_ids = [x.replace('.mp4', '') for x in video_ids] + self.video_feats = [] + for vid_id in self.video_ids: + sample = torch.load('{}features_v2/{}/{}.pt'.format(self.root_path_npy, self.split, vid_id)) + self.video_feats.append(sample) + + def __len__(self): + return len(self.video_ids) + + def __getitem__(self, index): + current_id = self.video_ids[index] + if 'mediaeval' in self.dataset_name: + sample = deepcopy(self.video_feats[index]) + else: + sample = torch.load('{}features_v2/{}/{}.pt'.format(self.root_path_npy, self.split, current_id)) + + # Check and do drop positions + if self.split in ['train', ]: # 'train', 'val' + if self.dataset_name == 'eev': + mask = np.sum(sample['scores'], axis=-1) > 1e-6 + else: + mask = np.ones(sample['scores'].shape[0], dtype=np.bool) + else: + mask = np.ones(sample['scores'].shape[0], dtype=np.bool) + + if self.emotion_index > -1: + scores = np.reshape(sample['scores'][mask, self.emotion_index], (-1, 1)) + else: + smooth_scores = np.zeros_like(sample['scores'][mask, :]) + + scores = sample['scores'][mask, :] + smooth_scores + + 
use_sample = {} + if self.use_position: + norm_eff = 1e6 if self.dataset_name == 'eev' else 1e0 + position_info = sample['timestamps'][mask].reshape(-1, 1) / norm_eff + for feat_indx in self.features: + use_sample[feat_indx] = np.hstack([sample[feat_indx][mask, :], position_info]) + else: + for feat_indx in self.features: + use_sample[feat_indx] = sample[feat_indx][mask, :] + + use_sample.update({'timestamps': sample['timestamps'][mask], 'scores': scores, 'file_id': sample['file_id']}) + + if self.transforms is not None: + use_sample = self.transforms(use_sample) + + return use_sample + + +class EEVDataModule(pl.LightningDataModule): + def __init__(self, data_dir, features, dataset_name='eev', emotion_index=-1, drop_perc=0.3): + super(EEVDataModule, self).__init__() + self.data_dir = data_dir + self.features = features + self.transforms = transforms.Compose([ToTensor()]) + self.use_position = cfg.MODEL.USE_POSITION + self.dataset_name = dataset_name + self.emotion_index = emotion_index + self.drop_perc = drop_perc + + def setup(self, stage=None): + if stage == 'fit' or stage is None: + if self.drop_perc > 0.: + train_transforms = transforms.Compose([TimeDropout(drop_perc=self.drop_perc), ToTensor()]) + else: + train_transforms = self.transforms + + self.train_set = EEVDataset(self.data_dir, split='train', features=self.features, + transforms=train_transforms, + use_position=self.use_position, dataset=self.dataset_name, + emotion_index=self.emotion_index) + self.val_set = EEVDataset(self.data_dir, split='val', features=self.features, transforms=self.transforms, + use_position=self.use_position, dataset=self.dataset_name, + emotion_index=self.emotion_index) + + if stage == 'test' or stage is None: + self.test_set = EEVDataset(self.data_dir, split='test', features=self.features, transforms=self.transforms, + use_position=self.use_position, dataset=self.dataset_name, + emotion_index=self.emotion_index) + + def train_dataloader(self): + return data.DataLoader(self.train_set, batch_size=cfg.TRAIN.BATCH_SIZE, num_workers=cfg.DATA_LOADER.NUM_WORKERS, + shuffle=True, prefetch_factor=2) + + def val_dataloader(self): + return data.DataLoader(self.val_set, batch_size=cfg.TEST.BATCH_SIZE, num_workers=cfg.DATA_LOADER.NUM_WORKERS, + shuffle=False, prefetch_factor=2) + + def test_dataloader(self): + return data.DataLoader(self.test_set, batch_size=cfg.TEST.BATCH_SIZE, num_workers=cfg.DATA_LOADER.NUM_WORKERS, + shuffle=False, prefetch_factor=2) diff --git a/src/core/io.py b/src/core/io.py new file mode 100755 index 0000000..efb584e --- /dev/null +++ b/src/core/io.py @@ -0,0 +1,83 @@ +""" +Original source: https://github.com/facebookresearch/pycls/blob/master/pycls/core/io.py +Latest commit 4fab913 on Mar 18, 2021 +""" + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +"""IO utilities (adapted from Detectron)""" + +import logging +import os +import re +import sys +from urllib import request as urlrequest + +from iopath.common.file_io import PathManagerFactory + +# instantiate global path manager for pycls +pathmgr = PathManagerFactory.get() + +logger = logging.getLogger(__name__) + +_PYCLS_BASE_URL = "https://dl.fbaipublicfiles.com/pycls" + + +def cache_url(url_or_file, cache_dir, base_url=_PYCLS_BASE_URL): + """Download the file specified by the URL to the cache_dir and return the path to + the cached file. If the argument is not a URL, simply return it as is. 
+ """ + is_url = re.match(r"^(?:http)s?://", url_or_file, re.IGNORECASE) is not None + if not is_url: + return url_or_file + url = url_or_file + assert url.startswith(base_url), "url must start with: {}".format(base_url) + cache_file_path = url.replace(base_url, cache_dir) + if pathmgr.exists(cache_file_path): + return cache_file_path + cache_file_dir = os.path.dirname(cache_file_path) + if not pathmgr.exists(cache_file_dir): + pathmgr.mkdirs(cache_file_dir) + logger.info("Downloading remote file {} to {}".format(url, cache_file_path)) + download_url(url, cache_file_path) + return cache_file_path + + +def _progress_bar(count, total): + """Report download progress. Credit: + https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113 + """ + bar_len = 60 + filled_len = int(round(bar_len * count / float(total))) + percents = round(100.0 * count / float(total), 1) + bar = "=" * filled_len + "-" * (bar_len - filled_len) + sys.stdout.write( + " [{}] {}% of {:.1f}MB file \r".format(bar, percents, total / 1024 / 1024) + ) + sys.stdout.flush() + if count >= total: + sys.stdout.write("\n") + + +def download_url(url, dst_file_path, chunk_size=8192, progress_hook=_progress_bar): + """Download url and write it to dst_file_path. Credit: + https://stackoverflow.com/questions/2028517/python-urllib2-progress-hook + """ + req = urlrequest.Request(url) + response = urlrequest.urlopen(req) + total_size = response.info().get("Content-Length").strip() + total_size = int(total_size) + bytes_so_far = 0 + with pathmgr.open(dst_file_path, "wb") as f: + while 1: + chunk = response.read(chunk_size) + bytes_so_far += len(chunk) + if not chunk: + break + if progress_hook: + progress_hook(bytes_so_far, total_size) + f.write(chunk) + return bytes_so_far diff --git a/src/core/loss.py b/src/core/loss.py new file mode 100755 index 0000000..508c80f --- /dev/null +++ b/src/core/loss.py @@ -0,0 +1,44 @@ +""" +Author: HuynhVanThong +Department of AI Convergence, Chonnam Natl. Univ. +""" +import sys + +import torch + + +def EEVMSELoss(targets, preds, scale_factor=1.0): + mse_exp = torch.squeeze( + torch.mean((targets * scale_factor - preds) ** 2, dim=1)) + + # if torch.isnan(mse_exp): + # print('Check MSE: ', targets, preds, mse_exp) + # sys.exit(0) + return torch.mean(mse_exp) + + +def EEVPearsonLoss(targets, preds, scale_factor=1.0, ): + x_mean = torch.mean(targets * scale_factor, dim=1, keepdim=True) + xhat_mean = torch.mean(preds, dim=1, keepdim=True) + + numerator = torch.sum(torch.mul(targets * scale_factor - x_mean, preds - xhat_mean), dim=1) + denominator = torch.sqrt( + torch.sum((targets * scale_factor - x_mean) ** 2, dim=1) * torch.sum((preds - xhat_mean) ** 2, dim=1)) + + pearsonr_score = numerator / denominator + + return 1.0 - torch.mean(pearsonr_score) + + +def EEVMSEPCCLoss(targets, preds, scale_factor=1.0, alpha=0.5): + pcc_loss = EEVPearsonLoss(targets, preds, scale_factor) + mse_loss = EEVMSELoss(targets, preds, scale_factor) + # print('PCC loss: ', pcc_loss, torch.isnan(pcc_loss)) + # if torch.isnan(pcc_loss) or torch.isinf(pcc_loss): + # print('Loss is NAN ', pcc_loss, mse_loss) + # sys.exit(0) + # pcc_loss = 2 + # alpha = 0.0 + + loss = alpha * pcc_loss + (1 - alpha) * mse_loss + return loss diff --git a/src/core/metrics.py b/src/core/metrics.py new file mode 100755 index 0000000..326d76a --- /dev/null +++ b/src/core/metrics.py @@ -0,0 +1,42 @@ +""" +Author: HuynhVanThong +Department of AI Convergence, Chonnam Natl. Univ. 
+""" +import sys + +import torchmetrics +import torch + + +class EEVPearsonr(torchmetrics.Metric): + def __init__(self, dist_sync_on_step=False): + super(EEVPearsonr, self).__init__(dist_sync_on_step=dist_sync_on_step) + self.add_state("sum", default=torch.tensor(0.0), dist_reduce_fx="sum") + self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") + + def get_score(self, x, x_hat): + # print(x.shape, x_hat.shape) + # sys.exit(0) + x_mean = torch.mean(x, dim=0, keepdim=True) + x_hat_mean = torch.mean(x_hat, dim=0, keepdim=True) + + numerator = torch.sum(torch.mul(x - x_mean, x_hat - x_hat_mean), dim=0) + denominator = torch.sqrt(torch.sum((x - x_mean) ** 2, dim=0) * torch.sum((x_hat - x_hat_mean) ** 2, dim=0)) + pearsonr_score = numerator / denominator + pearsonr_score[pearsonr_score != pearsonr_score] = -1 + return torch.mean(pearsonr_score) + + def update(self, preds, target): + # Update metric states + update_scores = 0. + + # update_scores = update_scores + self.get_score(target[0, :, :], preds[0, :, :]) + for idx in range(target.shape[0]): + update_scores = update_scores + self.get_score(target[idx, :,], preds[idx, :,]) + + self.sum += update_scores + + self.total = self.total + target.shape[0] + + def compute(self): + return self.sum / self.total diff --git a/src/core/models.py b/src/core/models.py new file mode 100755 index 0000000..636e287 --- /dev/null +++ b/src/core/models.py @@ -0,0 +1,373 @@ +""" +Author: HuynhVanThong +Department of AI Convergence, Chonnam Natl. Univ. +""" +import itertools +import math +import sys + +import torch +from torch import nn +import torch.nn.functional as F +import pytorch_lightning as pl + +from .tcn import TemporalConvNet +from .loss import EEVMSELoss, EEVPearsonLoss, EEVMSEPCCLoss +from .metrics import EEVPearsonr +import os +from collections import ChainMap +import pandas as pd +import numpy as np +from .config import cfg +from functools import partial + + +class EEVModel(pl.LightningModule): + @staticmethod + def get_params(): + if cfg.MODEL.TEMPORAL_TYPE == 'tcn': + # get tcn params + return { + "num_channels": cfg.TCN.NUM_CHANNELS, + "num_stack": cfg.TCN.NUM_STACK, + "dilation": cfg.TCN.DILATIONS, + "kernel_size": cfg.TCN.K_SIZE, + "dropout": cfg.TCN.DROPOUT, + "use_norm": cfg.TCN.NORM, + "fc_head": cfg.MODEL.FC_HIDDEN, + "learning_rate": cfg.OPTIM.BASE_LR + } + elif cfg.MODEL.TEMPORAL_TYPE == 'lstm': + # get lstm params + return { + "num_hidden": cfg.LSTM.HIDDEN_SIZE, + "num_layers": cfg.LSTM.NUM_LAYERS, + "bidirec": cfg.LSTM.BIDIREC, + "dropout": cfg.LSTM.DROPOUT, + "fc_head": cfg.MODEL.FC_HIDDEN, + "learning_rate": cfg.OPTIM.BASE_LR + } + else: + raise ValueError("Do not support temporal type of {}".format(cfg.MODEL.TEMPORAL_TYPE)) + + num_features = {'resnet': 2048, 'audio': 2048, 'effb0': 1280} + + def __init__(self, params, num_outputs=15, features=('resnet50',), result_dir='', dataset_name='eev', + emotion_index=-1): + super(EEVModel, self).__init__() + self.emotion_index = emotion_index + self.dataset_name = dataset_name + self.result_dir = result_dir + self.use_position = cfg.MODEL.USE_POSITION + self.num_outputs = num_outputs + + self.features = features + + for feat_idx in self.features: + cur_num_features = self.num_features[feat_idx] + cfg.MODEL.USE_POSITION + if cfg.MODEL.TEMPORAL_TYPE == 'tcn': + cur_temporal, num_temporal_out, fc_head = self.get_tcn_layers(params, cur_num_features) + else: + cur_temporal, num_temporal_out, fc_head = self.get_lstm_layers(params, cur_num_features) + + 
self.add_module('temporal_{}'.format(feat_idx), cur_temporal) + + cur_regression = nn.Sequential(nn.Linear(num_temporal_out, fc_head, bias=False), nn.ReLU(), + nn.Linear(fc_head, num_outputs, bias=False)) + self.add_module('regression_{}'.format(feat_idx), cur_regression) + + if len(self.features) > 1: + # Add some layer for fusion module at uni-modal level + pass + + if len(self.features) > 1: + # Add some layer for fusion module at uni-modal level + self.fusion_layer = nn.Sequential(nn.Linear(len(self.features), 1, bias=False), + ) + + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + self.pearsonr = EEVPearsonr() + + self.scale_factor = 10.0 if self.dataset_name == 'eev' else 10.0 + + if self.dataset_name == 'eev': + self.loss_func = EEVMSELoss + else: + self.loss_func = partial(EEVMSEPCCLoss, alpha=0.) + + def get_lstm_layers(self, params, embed_dim): + if params is None: + params = self.get_params() + + vs = ["num_hidden", "num_layers", "bidirec", 'dropout', "fc_head", "learning_rate"] + num_hidden, num_layers, bidirec, dropout, fc_head, learning_rate = [params[v] for v in vs] + temporal_layers = nn.LSTM(input_size=embed_dim, num_layers=num_layers, hidden_size=num_hidden, dropout=dropout, + bidirectional=bidirec, batch_first=True) + self.learning_rate = learning_rate + return temporal_layers, num_hidden * (1 + bidirec), fc_head + + def get_tcn_layers(self, params, tcn_in): + if params is None: + params = self.get_params() + + vs = ["num_channels", "num_stack", "dilation", "kernel_size", "dropout", "use_norm", "fc_head", "learning_rate"] + num_channels, num_stack, dilation, kernel_size, dropout, use_norm, fc_head, learning_rate = [params[v] for v in + vs] + + # input of TCN should have dimension (N, C, L) + if num_stack == 1: + temporal_layers = TemporalConvNet(tcn_in, (num_channels,) * dilation, kernel_size, dropout, + use_norm=use_norm) + else: + list_layers = [] + for idx in range(num_stack): + tcn_in_index = tcn_in if idx == 0 else num_channels + list_layers.append( + TemporalConvNet(tcn_in_index, (num_channels,) * dilation, kernel_size, dropout, use_norm=use_norm)) + temporal_layers = nn.Sequential(*list_layers) + + self.learning_rate = learning_rate + return temporal_layers, num_channels, fc_head + + def forwardx(self, x, feat_idx): + # Input has size batch_size x sequence_length x num_channels (N x L x C) + + if cfg.MODEL.TEMPORAL_TYPE == 'tcn': + # Transform to (N, C, L) first + x = x.permute(0, 2, 1) + x = self._modules['temporal_{}'.format(feat_idx)](x) + # Transform back to (N, L, C) + x = x.permute(0, 2, 1) + else: + x, _ = self._modules['temporal_{}'.format(feat_idx)](x) + x = self._modules['regression_{}'.format(feat_idx)](x) + + if len(self.features) > 1: + # return something for fusion + return x + pass + else: + return x + + def forward(self, batch): + pred_scores = [] + for feat_idx in self.features: + if len(self.features) == 1: + # if len(batch[feat_idx].shape) > 3: + # print(batch[feat_idx].shape, batch['file_id']) + # pass + pred_scores = self.forwardx(batch[feat_idx], feat_idx) # 1 x k x 15 + # if self.use_position and self.dataset_name != 'eev': + # pred_scores = pred_scores / 1e0 + # TODO: Moving average smoothing + # pred_scores = pred_scores.permute(0, 2, 1) + # w_size = pred_scores.shape[2] // 4 + # if w_size % 2 == 0: + # w_size -= 1 
+ # pad1d = (w_size-1) // 2 + # w_f = torch.ones((pred_scores.shape[1], pred_scores.shape[1], w_size), device=pred_scores.device) + # pred_scores = F.conv1d(pred_scores, w_f, padding=pad1d) / w_size + # pred_scores = pred_scores.permute(0, 2, 1) + + else: + + feat_scores = self.forwardx(batch[feat_idx], feat_idx) + + # TODO: Moving average smoothing + # Do something for fusion + pred_scores.append(feat_scores) + + if len(self.features) > 1: + # Do something for fusion and return final score on pred_scores + pred_scores = torch.stack(pred_scores, dim=-1) + # print(pred_scores.shape) + pred_scores = self.fusion_layer(pred_scores) + pred_scores = torch.squeeze(pred_scores, dim=-1) + # print(pred_scores.shape) + # sys.exit(0) + pass + + return pred_scores + + def training_step(self, batch, batch_idx): + + out = self._shared_eval(batch, batch_idx) + + scores = batch['scores'] + + loss = self.loss_func(scores, out, scale_factor=self.scale_factor) + self.pearsonr.update(preds=out / self.scale_factor, target=scores) + + return {'loss': loss} + + def validation_step(self, batch, batch_idx): + out = self._shared_eval(batch, batch_idx) + scores = batch['scores'] + + loss = self.loss_func(scores, out, scale_factor=self.scale_factor) + self.pearsonr.update(preds=out / self.scale_factor, target=scores) + + return {'val_loss': loss, + 'file_id': (out.data.cpu().numpy()[0, :, :] / self.scale_factor, batch['timestamps'], batch['scores'])} + + def predict(self, batch, batch_idx, dataloader_idx=None): + out = self._shared_eval(batch, batch_idx) + + # return {'file_id': (out.data.cpu().numpy()[0, :, :] / self.scale_factor, batch['timestamps'], batch['scores'])} + return {batch['file_id'][0]: out.data.cpu().numpy()[0, :, :] / self.scale_factor} + + def _shared_eval(self, batch, batch_idx): + out = self(batch) + return out + + def test_step(self, batch, batch_idx): + out = self._shared_eval(batch, batch_idx) + + if torch.sum(torch.abs(batch['scores'])) > 0: + self.pearsonr.update(preds=out / self.scale_factor, target=batch['scores']) + + return {batch['file_id'][0]: out.data.cpu().numpy()[0, :, :] / self.scale_factor} + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate) + + if cfg.OPTIM.LR_POLICY == 'none': + return optimizer + else: + # Return lr scheduler policy, not implemented yet + raise ValueError('Return lr scheduler policy, not implemented yet') + + def training_epoch_end(self, training_step_outputs): + train_pearsonr = self.pearsonr.compute() + loss_mean = torch.tensor([x['loss'] for x in training_step_outputs]).mean() / (self.scale_factor**2) # .data.cpu().numpy() + # self.log('loss', loss_mean.item(), prog_bar=True, logger=True, on_epoch=True, on_step=False) + # self.log('pearsonr', train_pearsonr.detach().cpu().item(), prog_bar=True, logger=True, on_epoch=True, + # on_step=False) + self.log('loss', loss_mean, prog_bar=True, logger=True, on_epoch=True, on_step=False) + self.log('pearsonr', train_pearsonr, prog_bar=True, logger=True, on_epoch=True, + on_step=False) + # print('Step {}. 
Learning rate: {}'.format(self.trainer.global_step, self.current_lr)) + + self.pearsonr.reset() + + # def training_epoch_end(self): + # self.pearsonr.reset() + + def validation_epoch_end(self, validation_step_outputs): + val_pearsonr = self.pearsonr.compute() + loss_mean = torch.tensor([x['val_loss'] for x in validation_step_outputs]).mean() / (self.scale_factor**2) # .data.cpu().numpy() + # self.log('val_pearsonr', val_pearsonr.detach().cpu().item(), prog_bar=True, logger=True, on_epoch=True) + # self.log('val_loss', loss_mean.item() / self.scale_factor , prog_bar=True, logger=True, on_epoch=True, on_step=False) + + self.log('val_pearsonr', val_pearsonr, prog_bar=True, logger=True, on_epoch=True) + self.log('val_loss', loss_mean, prog_bar=True, logger=True, on_epoch=True, + on_step=False) + + print_str = 'Epoch: {:5d} | Val-PCC {:10.5f} | Loss{:10.5f}'.format(self.current_epoch, + val_pearsonr.detach().cpu().item(), + loss_mean.item()) + print(print_str) + + with open(os.path.join(self.logger.log_dir, 'run_logs.txt'), 'a') as flog: + flog.write(print_str) + flog.write('\n') + + self.pearsonr.reset() + + # self.loss_func = partial(EEVMSEPCCLoss, alpha=0.5 - 0.5*(self.current_epoch / self.trainer.max_epochs)) + + def test_epoch_end(self, test_step_outputs): + if self.pearsonr.total > 0: + print('Test PCC scores: ', self.pearsonr.compute()) + self.pearsonr.reset() + + if isinstance(test_step_outputs[0], list): + test_results = list(itertools.chain.from_iterable(test_step_outputs)) + else: + test_results = test_step_outputs + test_write = dict(ChainMap(*test_results)) + if self.result_dir == '': + self.result_dir = self.logger.log_dir + + print('Test end, saving to {}'.format(os.path.join(self.result_dir, 'test_results.pt'))) + torch.save(test_write, os.path.join(self.result_dir, 'test_results.pt')) + + if self.dataset_name == 'eev': + self.test2csv(test_write) + elif self.dataset_name == 'mediaeval18': + write_path = os.path.join(self.result_dir, 'test_results') + os.makedirs(write_path, exist_ok=True) + for vid in test_write.keys(): + write_data = np.hstack([np.arange(test_write[vid].shape[0]).reshape(-1, 1), test_write[vid]]) + if self.emotion_index == -1: + pd.DataFrame(write_data, columns=['Time', 'Valence', 'Arousal']).to_csv( + os.path.join(write_path, vid + '.txt'), sep='\t', index=False) + else: + emotion_name = 'Valence' if self.emotion_index == 0 else 'Arousal' + pd.DataFrame(write_data, columns=['Time', emotion_name]).to_csv( + os.path.join(write_path, vid + '.txt'), sep='\t', index=False) + else: + raise ValueError('Do not support {} dataset'.format(self.dataset_name)) + + self.pearsonr.reset() + + def test2csv(self, test_prediction): + dataset_root_path = '/mnt/sXProject/EvokedExpression/' + emotions = ['amusement', 'anger', 'awe', 'concentration', + 'confusion', 'contempt', 'contentment', 'disappointment', 'doubt', 'elation', 'interest', + 'pain', 'sadness', 'surprise', 'triumph'] + + test_csv = pd.read_csv('{}/eev/{}.csv'.format(dataset_root_path, 'test')) + val_csv = pd.read_csv('{}/eev/{}.csv'.format(dataset_root_path, 'val')) + + list_ids = test_csv['Video ID'].unique() + if list_ids[0] not in test_prediction: + print('Use val set') + use_set_csv = val_csv + use_key = 'YouTube ID' + list_ids = val_csv['YouTube ID'].unique() + else: + use_set_csv = test_csv + use_key = 'Video ID' + + list_scores = [] + # cnt = 0 + for id in list_ids: + current_id_times = use_set_csv.loc[use_set_csv[use_key] == id].values[:, :2] # k x 2 + if id not in test_prediction: + current_scores = 
np.zeros((current_id_times.shape[0], self.num_outputs)) + else: + current_scores = test_prediction[id] # k x 15 + + current_data = np.hstack([current_id_times, current_scores]) + list_scores.append(current_data) + # cnt += 1 + # if cnt > 4: + # break + if self.num_outputs == 1: + columns_name = ['Video ID', 'Timestamp (milliseconds)', emotions[self.emotion_index]] + else: + columns_name = ['Video ID', 'Timestamp (milliseconds)', ] + emotions + + if isinstance(self.features, (list, tuple)): + postfix = '_'.join(self.features) + else: + postfix = self.features + + idx = 0 + while os.path.isfile('{}/{}.csv'.format(self.result_dir, 'test_results_{}_{}'.format(postfix, idx))): + idx += 1 + + list_scores = np.vstack(list_scores) + pd.DataFrame(data=list_scores, + columns=columns_name, ).to_csv( + '{}/{}.csv'.format(self.result_dir, 'test_results_{}_{}'.format(postfix, idx)), index=False) diff --git a/src/tcn.py b/src/core/tcn.py old mode 100644 new mode 100755 similarity index 95% rename from src/tcn.py rename to src/core/tcn.py index ca9612a..93c9aeb --- a/src/tcn.py +++ b/src/core/tcn.py @@ -1,3 +1,6 @@ +""" +Original source https://github.com/locuslab/TCN/blob/master/TCN/tcn.py +""" import torch import torch.nn as nn from torch.nn.utils import weight_norm @@ -82,9 +85,9 @@ def __init__(self, num_inputs, num_channels, kernel_size=3, dropout=0.2, use_nor in_channels = num_inputs if i == 0 else num_channels[i - 1] out_channels = num_channels[i] layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size, - padding=(kernel_size - 1) * dilation_size, dropout=dropout, use_norm=False)] + padding=(kernel_size - 1) * dilation_size, dropout=dropout, use_norm=use_norm)] self.network = nn.Sequential(*layers) def forward(self, x): - return self.network(x) + return self.network(x) \ No newline at end of file diff --git a/src/feats/Dockerfile b/src/feats/Dockerfile new file mode 100755 index 0000000..8f688e7 --- /dev/null +++ b/src/feats/Dockerfile @@ -0,0 +1,9 @@ +FROM nvcr.io/nvidia/tensorflow:21.03-tf2-py3 +RUN apt-get update && apt-get -y install apt-utils gcc libpq-dev libsndfile-dev libgl1-mesa-glx ffmpeg graphviz \ + && rm -rf /var/lib/apt/lists/* +RUN pip install --no-cache-dir librosa pandas opencv-python tabulate moviepy pydot \ + --index-url=http://ftp.daumkakao.com/pypi/simple \ + --trusted-host=ftp.daumkakao.com +RUN pip install --no-cache-dir tensorflow-hub tensorflow-io==0.17.0 tensorflow-addons --no-deps \ + --index-url=http://ftp.daumkakao.com/pypi/simple \ + --trusted-host=ftp.daumkakao.com diff --git a/src/feats/feature_extractor.py b/src/feats/feature_extractor.py new file mode 100755 index 0000000..30d8196 --- /dev/null +++ b/src/feats/feature_extractor.py @@ -0,0 +1,286 @@ +""" +Author: Huynh Van Thong +Department of AI Convergence, Chonnam Natl. Univ. 
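Usage sketch (paths are placeholders; the flags come from the argparse block at the
bottom of this file):

    python feature_extractor.py --dataset eev --dataset_root /path/to/EEV/ --visual
    python feature_extractor.py --dataset mediaeval18 --dataset_root /path/to/mediaeval18/

Omitting --visual extracts the TRILL-distilled audio embeddings instead of the
image-backbone features.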
+""" +import gc +import os +import sys + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +import tensorflow as tf +from tensorflow.keras import layers +import tensorflow_io as tfio +import tensorflow_hub as hub +import librosa +import pandas as pd +import cv2 +import numpy as np +import pathlib +import argparse + +DEFAULT_SR = 16000 + +model_link_dict = {'efficientnet-b0': ("https://tfhub.dev/tensorflow/efficientnet/b0/feature-vector/1", 224), + 'resnetv2-50': ("https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/4", 224)} + + +def create_feature_extractor_model(modelname='efficientnet-b0'): + target_size = model_link_dict[modelname][1] + target_link = model_link_dict[modelname][0] + + model = tf.keras.Sequential([ # layers.experimental.preprocessing.Resizing(target_size, target_size), + layers.experimental.preprocessing.Rescaling(1. / 255), + hub.KerasLayer(target_link, trainable=False)]) + model.build([None, 224, 224, 3]) + return model + + +def resize_pad(img, target=224): + h, w = img.shape[:2] + if h < w: + pad_h = w - h + pad_w = 0 + else: + pad_w = h - w + pad_h = 0 + + img = np.pad(img, ((pad_h // 2, pad_h - pad_h // 2), (pad_w // 2, pad_w - pad_w // 2), (0, 0)), + mode='constant') + if img.shape[0] != img.shape[1] or img.shape[2] != 3: + print('Error in here, please stop ', img.shape) + sys.exit(0) + img = cv2.resize(img, (target, target)) + return img + + +def read_video(path, num_segments=-1, unlabelled_rows=None, get_audio=False): + print(path) + if not get_audio: + cap = cv2.VideoCapture(path) + vd_frames = [] + while True: + ret, frame = cap.read() + if not ret: + break + + frame = cv2.cvtColor(resize_pad(frame, 224), cv2.COLOR_BGR2RGB) + vd_frames.append(frame) + cap.release() + + vd_frames = np.array(vd_frames) + if num_segments > 0: + segment_len = int(vd_frames.shape[0] / num_segments) + use_indexes = np.linspace(segment_len // 2, vd_frames.shape[0], num=num_segments, dtype=int, endpoint=False) + vd_frames = vd_frames[use_indexes, :, :, :] + # if unlabelled_rows is not None: + # vd_frames = vd_frames[np.logical_not(unlabelled_rows), :, :, :] + + # h, w = vd_frames.shape[1: 3] + # + # + # vd_frames = np.pad(vd_frames, ((0, 0), (pad_h // 2, pad_h - pad_h // 2), (pad_w // 2, pad_w - pad_w // 2), (0, 0)), + # mode='constant') + return vd_frames + + else: + audio, sr = librosa.load(path) + if sr != DEFAULT_SR: + audio = librosa.resample(audio, sr, DEFAULT_SR) + if num_segments > 0 and audio.shape[0] % num_segments > 0: + num_pad = num_segments - (audio.shape[0] % num_segments) + audio = np.pad(audio, ((0, num_pad)), mode='constant') + + audio = audio.reshape(num_segments, -1) + # if unlabelled_rows is not None: + # audio = audio[np.logical_not(unlabelled_rows), :] + + return audio + + +def mediaeval_feature_extractor(split, visual=True): + with open(os.path.join(dataset_root_path, '{}.txt'.format(split)), 'r') as fd: + list_files = fd.readlines() + + list_files = [x.replace('\n', '') for x in list_files] + model_created = {} + prev_shape = {} + + if visual: + use_model = ['resnetv2-50', 'efficientnet-b0'] # list(model_link_dict.keys()) # + for model_id in use_model: + model_created[model_id] = create_feature_extractor_model(model_id) + prev_shape[model_id] = None + else: + module = tf.keras.Sequential([hub.KerasLayer('https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/3', + arguments={'sample_rate': tf.constant(DEFAULT_SR, tf.int32)}, + trainable=False, output_key='embedding', + output_shape=[None, 2048])]) + + folder_write = 
pathlib.Path('{}/dataset/features_v2/{}'.format(dataset_root_path, split)) + folder_write.mkdir(parents=True, exist_ok=True) + + count = 0 + for vid in list_files: + count += 1 + if count % 10 == 0 : + print(count, '**', len(list_files)) + + video_path = os.path.join(dataset_root_path, 'raw/{}/{}'.format(split, vid)) + video_anno_path = video_path.replace('.mp4', '_Valence-Arousal.txt') + video_anno = pd.read_csv(video_anno_path, sep='\t').values + + feature_dict = {} + + if visual: + vd_frames = read_video(video_path, num_segments=video_anno.shape[0]) + + feature_dict.update({ + 'file_id': vid, + 'timestamps': video_anno[:, 0].astype(np.int64), + 'scores': video_anno[:, 1:]}) + + for model_id in use_model: + feature_dict['feature'] = model_created[model_id].predict(vd_frames, batch_size=128) + np.save('{}/{}_{}.npy'.format(folder_write.__str__(), vid, model_id), feature_dict) + _ = feature_dict.pop('feature') + + del vd_frames + else: + audio = read_video(video_path, num_segments=video_anno.shape[0], get_audio=True) + # Audio embedding extraction + # audio_emb = module(samples=audio, sample_rate=DEFAULT_SR)['embedding'] + audio_emb = module.predict(audio, batch_size=2048) + + # `emb` is a [batch_size, time, feature_dim] Tensor. In EvokedExpression, time=1 + # audio_emb.shape.assert_is_compatible_with([None, 512]) + audio_emb = np.squeeze(audio_emb) + + feature_dict['feature'] = audio_emb + + np.save('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio'), feature_dict) + + del audio_emb + del feature_dict + del audio + + print('Number of files: ', len(list_files)) + tf.keras.backend.clear_session() + pass + +def eev_feature_extractor(split, visual=True): + data_csv = pd.read_csv('{}/eev/{}.csv'.format(dataset_root_path, split)) + id_header = 'Video ID' if split == 'test' else 'YouTube ID' + video_ids = data_csv[id_header].unique() + + if not visual: + module = tf.keras.Sequential([hub.KerasLayer('https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/3', + arguments={'sample_rate': tf.constant(DEFAULT_SR, tf.int32)}, + trainable=False, output_key='embedding', + output_shape=[None, 2048])]) + else: + model_created = {} + prev_shape = {} + use_model = ['resnetv2-50', 'efficientnet-b0'] # list(model_link_dict.keys()) # + for model_id in use_model: + model_created[model_id] = create_feature_extractor_model(model_id) + prev_shape[model_id] = None + + count = 0 + num_vids = len(video_ids) + excluded_ids = np.loadtxt('excluded_files.txt', dtype=str) + is_continue=True + for vid in video_ids: + gc.collect() + if count % 10 == 0: + print(count, "/", num_vids) + count += 1 + + if vid in excluded_ids: + continue + folder_write = pathlib.Path('{}/dataset/features_v2/{}'.format(dataset_root_path, split)) + folder_write.mkdir(parents=True, exist_ok=True) + if os.path.isfile('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio')): + tmp = np.load('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio'), allow_pickle=True) + if tmp.item()['feature'].shape[-1] != 2048: + print(tmp.item()['feature'].shape) + else: + continue + + feature_dict = {} + current_frames = data_csv.loc[data_csv[id_header] == vid] + if split in ['train', 'val']: + scores = current_frames.values[:, 2:].astype(np.float32) + unlabelled_rows = np.sum(scores, axis=-1) <= 1e-6 + else: + scores = -1 * np.ones((current_frames.shape[0], 15)).astype(np.float32) + unlabelled_rows = np.zeros(current_frames.shape[0], dtype=np.bool) + + unlabelled_rows = np.zeros(current_frames.shape[0], dtype=np.bool) + if visual: + 
try: + vd_frames = read_video("{}/dataset/{}/{}.mp4".format(dataset_root_path, split, vid), + num_segments=current_frames.shape[0], unlabelled_rows=unlabelled_rows) + feature_dict.update({ + 'file_id': current_frames.values[:, 0][np.logical_not(unlabelled_rows)].astype(np.str), + 'timestamps': current_frames.values[:, 1][np.logical_not(unlabelled_rows)].astype(np.int64), + 'scores': scores[np.logical_not(unlabelled_rows), :]}) + + for model_id in use_model: + feature_dict['feature'] = model_created[model_id].predict(vd_frames, batch_size=128) + np.save('{}/{}_{}.npy'.format(folder_write.__str__(), vid, model_id), feature_dict) + _ = feature_dict.pop('feature') + except: + with open('excluded_files_v3.txt', 'a') as fd: + fd.write('{} {}\n'.format(split, vid)) + break + continue + + del vd_frames + + else: + audio = read_video("{}/dataset/{}/{}.mp4".format(dataset_root_path, split, vid), + num_segments=current_frames.shape[0], unlabelled_rows=unlabelled_rows, get_audio=True) + # Audio embedding extraction + # audio_emb = module(samples=audio, sample_rate=DEFAULT_SR)['embedding'] + audio_emb = module.predict(audio, batch_size=2048) + + # `emb` is a [batch_size, time, feature_dim] Tensor. In EvokedExpression, time=1 + # audio_emb.shape.assert_is_compatible_with([None, 512]) + audio_emb = np.squeeze(audio_emb) + + feature_dict['feature'] = audio_emb + + np.save('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio'), feature_dict) + + del audio_emb + del feature_dict + del audio + scores = None + unlabelled_rows = None + + tf.keras.backend.clear_session() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Video feature extractor') + parser.add_argument('--dataset', type=str, default='eev', help='Dataset eev or mediaeval18 (default: eev)') + parser.add_argument('--dataset_root', type=str, default='/mnt/Work/Dataset/EEV/', help='Dataset root path (default: /mnt/Work/Dataset/EEV/)') + parser.add_argument('--visual', action='store_true', help='Extract visual or audio (default: audio)') + + args = parser.parse_args() + + print(args.dataset, args.dataset_root, args.visual) + + dataset_root_path = args.dataset_root + + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + + for split in ['train', 'val', 'test']: + if args.dataset == 'eev': + eev_feature_extractor(split, args.visual) + elif args.dataset == 'mediaeval18': + mediaeval_feature_extractor(split, args.visual) + else: + raise ValueError('Do not support {} dataset'.format(args.dataset)) diff --git a/src/feature_extractor.py b/src/feature_extractor.py deleted file mode 100644 index eb2a48a..0000000 --- a/src/feature_extractor.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -Author: Huynh Van Thong -Department of AI Convergence, Chonnam Natl. Univ. 
-""" -import gc -import os -import sys - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -import tensorflow as tf -from tensorflow.keras import layers -import tensorflow_io as tfio -import tensorflow_hub as hub -import librosa -import pandas as pd -import cv2 -import numpy as np -import pathlib - -dataset_root_path = '/mnt/Work/Dataset/EEV/' -DEFAULT_SR = 16000 - -model_link_dict = {'efficientnet-b0': ("https://tfhub.dev/tensorflow/efficientnet/b0/feature-vector/1", 224), - 'efficientnet-b1': ("https://tfhub.dev/tensorflow/efficientnet/b1/feature-vector/1", 240), - 'efficientnet-b2': ("https://tfhub.dev/tensorflow/efficientnet/b2/feature-vector/1", 260), - 'efficientnet-b3': ("https://tfhub.dev/tensorflow/efficientnet/b3/feature-vector/1", 300), - 'resnetv2-50': ("https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/4", 224)} - - -def create_feature_extractor_model(modelname='efficientnet-b0'): - target_size = model_link_dict[modelname][1] - target_link = model_link_dict[modelname][0] - - model = tf.keras.Sequential([ # layers.experimental.preprocessing.Resizing(target_size, target_size), - layers.experimental.preprocessing.Rescaling(1. / 255), - hub.KerasLayer(target_link, trainable=False)]) - model.build([None, 224, 224, 3]) - return model - - -def resize_pad(img, target=224): - h, w = img.shape[:2] - if h < w: - pad_h = w - h - pad_w = 0 - else: - pad_w = h - w - pad_h = 0 - - img = np.pad(img, ((pad_h // 2, pad_h - pad_h // 2), (pad_w // 2, pad_w - pad_w // 2), (0, 0)), - mode='constant') - if img.shape[0] != img.shape[1] or img.shape[2] != 3: - print('Error in here, please stop ', img.shape) - sys.exit(0) - img = cv2.resize(img, (target, target)) - return img - - -def read_video(path, num_segments=-1, unlabelled_rows=None, get_audio=False): - print(path) - if not get_audio: - cap = cv2.VideoCapture(path) - vd_frames = [] - while True: - ret, frame = cap.read() - if not ret: - break - - frame = cv2.cvtColor(resize_pad(frame, 224), cv2.COLOR_BGR2RGB) - vd_frames.append(frame) - cap.release() - - vd_frames = np.array(vd_frames) - if num_segments > 0: - segment_len = int(vd_frames.shape[0] / num_segments) - use_indexes = np.linspace(segment_len // 2, vd_frames.shape[0], num=num_segments, dtype=int, endpoint=False) - vd_frames = vd_frames[use_indexes, :, :, :] - # if unlabelled_rows is not None: - # vd_frames = vd_frames[np.logical_not(unlabelled_rows), :, :, :] - - # h, w = vd_frames.shape[1: 3] - # - # - # vd_frames = np.pad(vd_frames, ((0, 0), (pad_h // 2, pad_h - pad_h // 2), (pad_w // 2, pad_w - pad_w // 2), (0, 0)), - # mode='constant') - return vd_frames - - else: - audio, sr = librosa.load(path) - if sr != DEFAULT_SR: - audio = librosa.resample(audio, sr, DEFAULT_SR) - if num_segments > 0 and audio.shape[0] % num_segments > 0: - num_pad = num_segments - (audio.shape[0] % num_segments) - audio = np.pad(audio, ((0, num_pad)), mode='constant') - - audio = audio.reshape(num_segments, -1) - # if unlabelled_rows is not None: - # audio = audio[np.logical_not(unlabelled_rows), :] - - return audio - - -def feature_extractor(split): - data_csv = pd.read_csv('{}/eev/{}.csv'.format(dataset_root_path, split)) - id_header = 'Video ID' if split == 'test' else 'YouTube ID' - video_ids = data_csv[id_header].unique() - - module = tf.keras.Sequential([hub.KerasLayer('https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/3', - arguments={'sample_rate': tf.constant(DEFAULT_SR, tf.int32)}, - trainable=False, output_key='embedding', - output_shape=[None, 2048])]) - - model_created = {} 
- prev_shape = {} - use_model = ['resnetv2-50', 'efficientnet-b0'] # list(model_link_dict.keys()) # - for model_id in use_model: - model_created[model_id] = create_feature_extractor_model(model_id) - prev_shape[model_id] = None - - count = 0 - num_vids = len(video_ids) - excluded_ids = np.loadtxt('excluded_files.txt', dtype=str) - is_continue=True - for vid in video_ids: - gc.collect() - if count % 10 == 0: - print(count, "/", num_vids) - count += 1 - # if vid not in ['zy6jKmYv0LM', 'zwmJl7OXvg0']: - # if is_continue: - # continue - # else: - # is_continue = False - - if vid in excluded_ids: - continue - folder_write = pathlib.Path('{}/dataset/features_v2/{}'.format(dataset_root_path, split)) - folder_write.mkdir(parents=True, exist_ok=True) - if os.path.isfile('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio')): - tmp = np.load('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio'), allow_pickle=True) - if tmp.item()['feature'].shape[-1] != 2048: - print(tmp.item()['feature'].shape) - else: - continue - # tmp2 = np.load('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'resnetv2-50'), allow_pickle=True) - # continue - # else: - # continue - - feature_dict = {} - current_frames = data_csv.loc[data_csv[id_header] == vid] - if split in ['train', 'val']: - scores = current_frames.values[:, 2:].astype(np.float32) - unlabelled_rows = np.sum(scores, axis=-1) <= 1e-6 - else: - scores = -1 * np.ones((current_frames.shape[0], 15)).astype(np.float32) - unlabelled_rows = np.zeros(current_frames.shape[0], dtype=np.bool) - - unlabelled_rows = np.zeros(current_frames.shape[0], dtype=np.bool) - # try: - # vd_frames = read_video("{}/dataset/{}/{}.mp4".format(dataset_root_path, split, vid), - # num_segments=current_frames.shape[0], unlabelled_rows=unlabelled_rows) - # feature_dict.update({ - # 'file_id': current_frames.values[:, 0][np.logical_not(unlabelled_rows)].astype(np.str), - # 'timestamps': current_frames.values[:, 1][np.logical_not(unlabelled_rows)].astype(np.int64), - # 'scores': scores[np.logical_not(unlabelled_rows), :]}) - # - # for model_id in use_model: - # feature_dict['feature'] = model_created[model_id].predict(vd_frames, batch_size=128) - # np.save('{}/{}_{}.npy'.format(folder_write.__str__(), vid, model_id), feature_dict) - # _ = feature_dict.pop('feature') - # except: - # with open('excluded_files_v3.txt', 'a') as fd: - # fd.write('{} {}\n'.format(split, vid)) - # break - # continue - # - # del vd_frames - - audio = read_video("{}/dataset/{}/{}.mp4".format(dataset_root_path, split, vid), - num_segments=current_frames.shape[0], unlabelled_rows=unlabelled_rows, get_audio=True) - # Audio embedding extraction - # audio_emb = module(samples=audio, sample_rate=DEFAULT_SR)['embedding'] - audio_emb = module.predict(audio, batch_size=2048) - - # `emb` is a [batch_size, time, feature_dim] Tensor. 
In EvokedExpression, time=1 - # audio_emb.shape.assert_is_compatible_with([None, 512]) - audio_emb = np.squeeze(audio_emb) - - feature_dict['feature'] = audio_emb - np.save('{}/{}_{}.npy'.format(folder_write.__str__(), vid, 'audio'), feature_dict) - # print("Writing to tfrecords") - - # count += 1 - - del audio_emb - del feature_dict - del audio - scores = None - unlabelled_rows = None - - tf.keras.backend.clear_session() - - -if __name__ == '__main__': - physical_devices = tf.config.list_physical_devices('GPU') - tf.config.experimental.set_memory_growth(physical_devices[0], True) - for split in ['train', 'val', 'test']: - feature_extractor(split) - pass diff --git a/src/main.py b/src/main.py old mode 100644 new mode 100755 index 89fe7f9..188d9f6 --- a/src/main.py +++ b/src/main.py @@ -2,109 +2,113 @@ Author: Huynh Van Thong Department of AI Convergence, Chonnam Natl. Univ. """ -import argparse + +import os.path as osp import glob -import os.path -import pathlib import shutil -import sys +import time import torch from pytorch_lightning.callbacks import ModelCheckpoint -from torchvision import transforms -from torch.utils.data import DataLoader -from utils import EEVdataset, ToTensor, eev_collatefn import pytorch_lightning as pl -from models import EEVModel from pytorch_lightning.loggers import TensorBoardLogger - -def get_dataloader(emotion_index=-1, feature='resnet'): - loaders = {} - for split in ['train', 'val', 'test']: - current_split = EEVdataset(root_path='/mnt/sXProject/EvokedExpression/', split=split, - feature=feature, emotion_index=emotion_index, - transforms=transforms.Compose([ToTensor()])) - shuffle = (split == 'train') - loaders[split] = DataLoader(current_split, batch_size=1, shuffle=shuffle, num_workers=20, # pin_memory=True, - prefetch_factor=2, collate_fn=None) - # for b in loaders[split]: - # # print(b['effb0'].shape, b['resnet'].shape, b['audio'].shape, b['mask'].shape) - # tmp = b['audio'] - # # # if tmp.shape[1] == 1: - # # # print(split, ' ', b['file_id']) - # # # - # sys.exit(0) - return loaders +from core import EEVModel, EEVDataModule, config +from core.config import cfg +from core.io import pathmgr def copyfiles(source_dir, dest_dir, ext='*.py'): - files = glob.iglob(os.path.join(source_dir, ext)) + # Copy source files or compress to zip + files = glob.iglob(osp.join(source_dir, ext)) for file in files: - if os.path.isfile(file): + if osp.isfile(file): shutil.copy2(file, dest_dir) + if osp.isdir(osp.join(source_dir, 'core')) and not osp.isdir(osp.join(dest_dir, 'core')): + shutil.copytree(osp.join(source_dir, 'core'), osp.join(dest_dir, 'core'), copy_function=shutil.copy2) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Evoked Expression') - parser.add_argument('--dir', type=str, default='./trial', help="Training directory") - parser.add_argument('--lr_init', type=float, default=1e-3, help="Initial learning rate") - parser.add_argument('--batch_size', type=int, default=32, help="Batch size") - parser.add_argument('--seed', type=int, default=1, help="Random seed") - parser.add_argument('--epoch', type=int, default=5, help="Number of epochs") - parser.add_argument('--opt', type=str, default='sgd', help="Optimizer") - parser.add_argument('--feature', type=str, default='resnet', help="Feature type (audio, resnet, effb0") - parser.add_argument('--emotion', type=int, default=0, help="Emotion index to be learned (0-14) or all (-1)") - - args = parser.parse_args() - - if args.feature not in ['resnet', 'audio', 'effb0']: - raise ValueError('Do 
not support {} at this time.'.format(args.feature)) - if args.emotion not in range(-1, 15): - raise ValueError('Do not support emotion {} at this time.'.format(args.emotion)) - - tcn_in = {'resnet': 2048, 'audio': 2048, 'effb0': 1280} - if args.emotion == -1: - num_outputs = 15 - emotion_index = -1 + config.load_cfg_fom_args("EEV 2021 Challenges") + config.assert_and_infer_cfg() + cfg.freeze() + + pl.seed_everything(2) # cfg.RNG_SEED + + # torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK + st_time = time.time() + # Ensure that the output dir exists + pathmgr.mkdirs(cfg.OUT_DIR) + cfg_file = config.dump_cfg() + copyfiles(source_dir='./', dest_dir=cfg.OUT_DIR) + copyfiles(source_dir='./', dest_dir=cfg.OUT_DIR, ext='.sh') + copyfiles(source_dir='./', dest_dir=cfg.OUT_DIR, ext='.txt') + + if cfg.LOGGER == 'TensorBoard': + logger = TensorBoardLogger(cfg.OUT_DIR, name='{}_emo'.format('full'), version='_'.join(cfg.MODEL.FEATURES)) else: + raise ValueError('Do not implement with {} logger yet.'.format(cfg.LOGGER)) + + params = None # Default is None + num_outputs = 15 if cfg.DATA_NAME == 'eev' else 2 # TODO + if cfg.DATA_LOADER.EMO_INDEX > -1: num_outputs = 1 - emotion_index = args.emotion - - pl.seed_everything(args.seed) - - emotions = ['amusement', 'anger', 'awe', 'concentration', 'confusion', 'contempt', 'contentment', 'disappointment', - 'doubt', 'elation', 'interest', 'pain', 'sadness', 'surprise', 'triumph'] - - if num_outputs == 15: - save_dir = args.dir # os.path.join(args.dir, 'emo_{}'.format(emotions[emo_ind])) - pathlib.Path(save_dir).mkdir(exist_ok=True, parents=True) - logger = TensorBoardLogger(save_dir, name='{}_emo'.format('full'), version=args.feature) - copyfiles('./', save_dir, ext='*.txt') - copyfiles('./', save_dir, ext='*.sh') - copyfiles('./', save_dir, ext='*.py') - - if args.feature == 'effb0': - tcn_channels = (512, ) - elif args.feature == 'audio': - tcn_channels = (512, 512, ) - else: - tcn_channels = (128, ) - - model = EEVModel(num_outputs=15, tcn_in=tcn_in[args.feature] + 0, tcn_channels=tcn_channels, tcn_kernel_size=3, - dropout=0.3, mtloss=False, - opt=args.opt, lr=args.lr_init, use_norm=True, features_dropout=0., temporal_size=-1, - num_dilations=4, features=args.feature, emotion_index=-1, warmup_steps=200, - accum_grad=args.batch_size) - fast_dev_run = False - loaders = get_dataloader(emotion_index=-1, feature=args.feature) - checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode="min", save_top_k=1, save_last=True) - trainer = pl.Trainer(gpus=1, accumulate_grad_batches=args.batch_size, max_epochs=args.epoch, - fast_dev_run=fast_dev_run, - deterministic=True, callbacks=checkpoint_callback, num_sanity_val_steps=0, - progress_bar_refresh_rate=0, logger=logger) - trainer.fit(model, loaders['train'], loaders['val']) - print('Best model scores: ', checkpoint_callback.best_model_score) + + if cfg.TEST.WEIGHTS != '': + result_dir = cfg.OUT_DIR + else: + result_dir = '' + + eev_model = EEVModel(params=params, num_outputs=num_outputs, features=cfg.MODEL.FEATURES, result_dir=result_dir, + dataset_name=cfg.DATA_NAME, emotion_index=cfg.DATA_LOADER.EMO_INDEX) + eev_data = EEVDataModule(cfg.DATA_LOADER.DATA_DIR, features=cfg.MODEL.FEATURES, dataset_name=cfg.DATA_NAME, + emotion_index=cfg.DATA_LOADER.EMO_INDEX, drop_perc=cfg.TRAIN.DROP_PERC) + + fast_dev_run = cfg.FAST_DEV_RUN + max_epochs = cfg.OPTIM.MAX_EPOCH if cfg.TEST.WEIGHTS == '' else 1 + if cfg.DATA_NAME == 'eev': + check_val_every_n_epoch = 1 + elif 'mediaeval' in cfg.DATA_NAME: + check_val_every_n_epoch = 
cfg.OPTIM.MAX_EPOCH + else: + check_val_every_n_epoch = 1 + + ckpt_callbacks = ModelCheckpoint(monitor='val_loss', mode="min", save_top_k=1, save_last=True) + trainer = pl.Trainer(gpus=1, fast_dev_run=fast_dev_run, accumulate_grad_batches=cfg.TRAIN.ACCUM_GRAD_BATCHES, + max_epochs=max_epochs, deterministic=True, callbacks=ckpt_callbacks, + num_sanity_val_steps=0, progress_bar_refresh_rate=0, logger=logger, + stochastic_weight_avg=cfg.OPTIM.USE_SWA, weights_summary=None, + check_val_every_n_epoch=check_val_every_n_epoch, gradient_clip_val=10. if num_outputs<15 else 0) + + if cfg.TEST.WEIGHTS == '': + trainer.fit(eev_model, datamodule=eev_data) if not fast_dev_run: - ckpt_path = None # checkpoint_callback.best_model_path - trainer.test(test_dataloaders=loaders['test'], ckpt_path=ckpt_path) + print('Best scores: ', ckpt_callbacks.best_model_score) + + ckpt_path = None # None # ckpt_callbacks.best_model_path + print('Generate test predictions 1') + trainer.test(datamodule=eev_data, ckpt_path=ckpt_path) + + if cfg.OPTIM.USE_SWA: + ckpt_path = ckpt_callbacks.last_model_path.replace('last', 'swa_last') + trainer.save_checkpoint(ckpt_path) + else: + ckpt_path = ckpt_callbacks.last_model_path + + else: + # Do for testing + eev_data.setup() + # Load pre-trained weights + pretrained_weights = torch.load(cfg.TEST.WEIGHTS)['state_dict'] + eev_model.load_state_dict(pretrained_weights, strict=True) + + # trainer.setup(eev_model, stage='test') + print('Do testing ', cfg.TEST.WEIGHTS) + + print('Generate validation prediction') + trainer.test(model=eev_model, test_dataloaders=eev_data.val_dataloader(), ckpt_path=None) + # trainer.test(datamodule=eev_data, ckpt_path=cfg.TEST.WEIGHTS) + print('Generate testing prediction') + trainer.test(test_dataloaders=eev_data.test_dataloader(), ckpt_path=cfg.TEST.WEIGHTS) + + print('Finished. 
Total time: {} minutes.'.format((time.time() - st_time) / 60)) diff --git a/src/models.py b/src/models.py deleted file mode 100644 index c71dea7..0000000 --- a/src/models.py +++ /dev/null @@ -1,215 +0,0 @@ -import itertools -import math - -import torch -from torch import nn -import torch.nn.functional as F -from torch.utils.data import ConcatDataset, DataLoader -import pytorch_lightning as pl -from tcn import TemporalConvNet -from utils import EEVMSELoss, EEVPersonr, EEVPearsonLoss -from scipy import stats -import os -from collections import ChainMap -import pandas as pd -import numpy as np - - -class EEVModel(pl.LightningModule): - def __init__(self, num_outputs=15, tcn_in=2048, tcn_channels=(512, 512), num_dilations=4, tcn_kernel_size=3, - dropout=0.2, - mtloss=False, opt=None, lr=1e-3, use_norm=False, features_dropout=0., temporal_size=-1, - num_last_regress=128, features='resnet', emotion_index=-1, warmup_steps=500, accum_grad=1): - super(EEVModel, self).__init__() - self.accum_grad = accum_grad - self.warmup_steps = warmup_steps - self.learning_rate = lr - self.args = {'opt': opt, 'lr_init': lr} - self.num_outputs = num_outputs - self.emotion_index = emotion_index - self.temporal_size = temporal_size - self.num_stacks_tcn = len(tcn_channels) - - if features_dropout > 0: - self._dropout = nn.Dropout(p=features_dropout) - else: - self._dropout = None - - self.features = features - - self._temporal = self.get_temporal_layers(tcn_in, tcn_channels, num_dilations, tcn_kernel_size, dropout, - use_norm) - self._regression = nn.Sequential(nn.Linear(tcn_channels[-1], num_last_regress, bias=False), nn.ReLU(), - nn.Linear(num_last_regress, num_outputs, bias=False)) - # weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out') - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - - self.pearsonr = EEVPersonr() - self.loss_func = EEVMSELoss - self.scale_factor = 1.0 - # self.automatic_optimization = False - self.current_lr = None - - def get_temporal_layers(self, tcn_in, tcn_channels, num_dilations, tcn_kernel_size, dropout, use_norm): - # input of TCN should have dimension (N, C, L) - if self.num_stacks_tcn == 1: - temporal_layers = TemporalConvNet(tcn_in, (tcn_channels[0],) * num_dilations, tcn_kernel_size, dropout, - use_norm=use_norm) - else: - list_layers = [] - for idx in range(self.num_stacks_tcn): - tcn_in_index = tcn_in if idx == 0 else tcn_channels[idx - 1] - list_layers.append( - TemporalConvNet(tcn_in_index, (tcn_channels[idx],) * num_dilations, tcn_kernel_size, dropout, - use_norm=use_norm)) - temporal_layers = nn.Sequential(*list_layers) - - return temporal_layers - - def forwardx(self, x, temporal_module, regression_module, feat_dropout): - # Input has size batch_size x sequence_length x num_channels (N x L x C) - - # print("Before: ", x.shape) - if self.temporal_size > 0: - # Resize to L / temporal_size x temporal_size C - L_size = x.shape[1] - if L_size % self.temporal_size == 0: - n_pad = 0 - x = torch.reshape(x, (L_size // self.temporal_size, self.temporal_size, -1)) - else: - n_pad = self.temporal_size - L_size % self.temporal_size - x = F.pad(x, (0, 0, n_pad, 0), "constant", 0) - x = torch.reshape(x, (L_size // self.temporal_size + 1, self.temporal_size, -1)) - else: - n_pad = 0 - - if feat_dropout is not None: - x = feat_dropout(x) - - # Transform to (N, C, L) first - x = x.permute(0, 2, 1) - x = temporal_module(x) - # 
Transform back to (N, L, C) - - x = x.permute(0, 2, 1) - x = regression_module(x) - - if self.temporal_size > 0: - x = torch.reshape(x, (1, -1, self.num_outputs)) - if n_pad > 0: - end_index = x.shape[1] - n_pad - x = x[:, n_pad:, :] - - return x - - def forward(self, x): - pred_scores = self.forwardx(x, self._temporal, self._regression, - self._dropout) # 1 x k x 15 - - return pred_scores - - def training_step(self, batch, batch_idx): - - out = self._shared_eval(batch, batch_idx) - - scores = batch['scores'] - - loss = self.loss_func(scores, out, scale_factor=self.scale_factor) - self.pearsonr.update(preds=out / self.scale_factor, target=scores) - - return {'loss': loss} - - def validation_step(self, batch, batch_idx): - out = self._shared_eval(batch, batch_idx) - scores = batch['scores'] - - loss = self.loss_func(scores, out, scale_factor=self.scale_factor) - self.pearsonr.update(preds=out / self.scale_factor, target=scores) - - return {'val_loss': loss, - 'file_id': (out.data.cpu().numpy()[0, :, :] / self.scale_factor, batch['timestamps'], batch['scores'])} - - def _shared_eval(self, batch, batch_idx): - out = self(batch['feature']) - return out - - def test_step(self, batch, batch_idx): - out = self._shared_eval(batch, batch_idx) - - return {batch['file_id'][0]: out.data.cpu().numpy()[0, :, :] / self.scale_factor} - - def configure_optimizers(self): - if self.args['opt'] == 'adam': - optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) - else: - optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate) - - return optimizer - - def training_epoch_end(self, training_step_outputs): - train_pearsonr = self.pearsonr.compute() - loss_mean = torch.tensor([x['loss'] for x in training_step_outputs]).mean() # .data.cpu().numpy() - self.log('loss', loss_mean, prog_bar=True, logger=True, on_epoch=True, on_step=False) - self.log('pearsonr', train_pearsonr, prog_bar=True, logger=True, on_epoch=True, on_step=False) - print('Step {}. 
Learning rate: {}'.format(self.trainer.global_step, self.current_lr)) - self.pearsonr.reset() - - def validation_epoch_end(self, validation_step_outputs): - val_pearsonr = self.pearsonr.compute() - loss_mean = torch.tensor([x['val_loss'] for x in validation_step_outputs]).mean() # .data.cpu().numpy() - self.log('val_pearsonr', val_pearsonr, prog_bar=True, logger=True, on_epoch=True) - self.log('val_loss', loss_mean, prog_bar=True, logger=True, on_epoch=True, on_step=False) - print(self.current_epoch, val_pearsonr, loss_mean) - self.pearsonr.reset() - - def test_epoch_end(self, test_step_outputs): - if isinstance(test_step_outputs[0], list): - test_results = list(itertools.chain.from_iterable(test_step_outputs)) - else: - test_results = test_step_outputs - test_write = dict(ChainMap(*test_results)) - torch.save(test_write, os.path.join(self.logger.log_dir, 'test_results.pt')) - self.test2csv(test_write) - - self.pearsonr.reset() - - def test2csv(self, test_prediction): - dataset_root_path = '/mnt/sXProject/EvokedExpression/' - emotions = ['amusement', 'anger', 'awe', 'concentration', - 'confusion', 'contempt', 'contentment', 'disappointment', 'doubt', 'elation', 'interest', - 'pain', 'sadness', 'surprise', 'triumph'] - result_dir = self.logger.log_dir - test_csv = pd.read_csv('{}/eev/{}.csv'.format(dataset_root_path, 'test')) - - list_ids = test_csv['Video ID'].unique() - - list_scores = [] - # cnt = 0 - for id in list_ids: - current_id_times = test_csv.loc[test_csv['Video ID'] == id].values # k x 2 - if id not in test_prediction: - current_scores = np.zeros((current_id_times.shape[0], self.num_outputs)) - else: - current_scores = test_prediction[id] # k x 15 - - current_data = np.hstack([current_id_times, current_scores]) - list_scores.append(current_data) - # cnt += 1 - # if cnt > 4: - # break - if self.num_outputs == 1: - columns_name = ['Video ID', 'Timestamp (milliseconds)', emotions[self.emotion_index]] - else: - columns_name = ['Video ID', 'Timestamp (milliseconds)', ] + emotions - - list_scores = np.vstack(list_scores) - pd.DataFrame(data=list_scores, - columns=columns_name, ).to_csv( - '{}/{}.csv'.format(result_dir, 'test_results_{}'.format(self.features)), index=False) diff --git a/src/run_dockerfile.sh b/src/run_dockerfile.sh new file mode 100755 index 0000000..e080cf3 --- /dev/null +++ b/src/run_dockerfile.sh @@ -0,0 +1,21 @@ +# Run EEV +docker run --gpus all --ipc=host -it --rm \ + --user $UID:$GID \ + --volume="/etc/group:/etc/group:ro" \ + --volume="/etc/passwd:/etc/passwd:ro" \ + --volume="/etc/shadow:/etc/shadow:ro" \ + -v /home/hvthong/sXProject/EvokedExpression/dataset/eev2021:/mnt/sXProject/EvokedExpression \ + -v /mnt/XProject/EvokedExpression:/mnt/XProject/EvokedExpression \ + -w /mnt/XProject/EvokedExpression \ + eev:pytorch1.8.1 bash testing.sh + +# Run MediaEval +#docker run --gpus all --ipc=host -it --rm \ +# --user $UID:$GID \ +# --volume="/etc/group:/etc/group:ro" \ +# --volume="/etc/passwd:/etc/passwd:ro" \ +# --volume="/etc/shadow:/etc/shadow:ro" \ +# -v /home/hvthong/sXProject/EvokedExpression/dataset:/mnt/sXProject/EvokedExpression/dataset \ +# -v /mnt/XProject/EvokedExpression:/mnt/XProject/EvokedExpression \ +# -w /mnt/XProject/EvokedExpression \ +# eev:pytorch1.8.1 bash scripts/mediaeval18_train.sh diff --git a/src/scripts/eev_train.sh b/src/scripts/eev_train.sh new file mode 100755 index 0000000..95934f8 --- /dev/null +++ b/src/scripts/eev_train.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +trap "exit" INT + +lr_init=0.005 +batch_size=32 +dropout=0.0 +max_epoch=20 
+#test_weights='' #'train_logs/best_checkpoints/train_logs_v9/effb0/checkpoints/epoch=19-step=1859.ckpt' + +train_dir='/home/hvarch/media/Work/Dataset/EvokedExpression/train_logs/check_train_v5_noTimePos_noDropout/audio' +#train_dir='./train_logs/lstm_withTimePos/audio' +python main.py --cfg conf/eev_audio.yaml \ + FAST_DEV_RUN 0 \ + OUT_DIR $train_dir \ + OPTIM.MAX_EPOCH $max_epoch \ + OPTIM.BASE_LR $lr_init \ + OPTIM.USE_SWA True \ + TRAIN.ACCUM_GRAD_BATCHES $batch_size \ + TCN.DROPOUT $dropout \ + MODEL.USE_POSITION False + +sleep 1 +train_dir='/home/hvarch/media/Work/Dataset/EvokedExpression/train_logs/check_train_v5_noTimePos_noDropout/effb0' +python main.py --cfg conf/eev_effb0.yaml \ + FAST_DEV_RUN 0 \ + OUT_DIR $train_dir \ + OPTIM.MAX_EPOCH $max_epoch \ + OPTIM.BASE_LR $lr_init \ + OPTIM.USE_SWA True \ + TRAIN.ACCUM_GRAD_BATCHES $batch_size \ + TCN.DROPOUT $dropout \ + MODEL.USE_POSITION False + +echo "Use position" +train_dir='/home/hvarch/media/Work/Dataset/EvokedExpression/train_logs/check_train_v5_withTimePos_noDropout/audio' +#train_dir='./train_logs/lstm_withTimePos/audio' +python main.py --cfg conf/eev_audio.yaml \ + FAST_DEV_RUN 0 \ + OUT_DIR $train_dir \ + OPTIM.MAX_EPOCH $max_epoch \ + OPTIM.BASE_LR $lr_init \ + OPTIM.USE_SWA True \ + TRAIN.ACCUM_GRAD_BATCHES $batch_size \ + TCN.DROPOUT $dropout \ + MODEL.USE_POSITION True + +sleep 1 +train_dir='/home/hvarch/media/Work/Dataset/EvokedExpression/train_logs/check_train_v5_withTimePos_noDropout/effb0' +python main.py --cfg conf/eev_effb0.yaml \ + FAST_DEV_RUN 0 \ + OUT_DIR $train_dir \ + OPTIM.MAX_EPOCH $max_epoch \ + OPTIM.BASE_LR $lr_init \ + OPTIM.USE_SWA True \ + TRAIN.ACCUM_GRAD_BATCHES $batch_size \ + TCN.DROPOUT $dropout \ + MODEL.USE_POSITION True diff --git a/src/scripts/mediaeval18_train.sh b/src/scripts/mediaeval18_train.sh new file mode 100755 index 0000000..88470d0 --- /dev/null +++ b/src/scripts/mediaeval18_train.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +trap "exit" INT + +lr_init=0.005 +batch_size=32 +dropout=0.3 +max_epoch=20 + +run_ver="v17tmp" + +for train_drop_perc in 0.0 0.5 +do + for use_position in 'True' 'False' + do + if [ "$use_position" = 'True' ] && [ "$train_drop_perc" = '0.0' ]; then + continue + fi + if [ "$use_position" = 'False' ]; then + continue + fi + for feat in 'audio' 'effb0' + do + for emo in 'valence' 'arousal' 'full' + do + if [ "$emo" = "valence" ]; then + emo_index=0 + continue + elif [ "$emo" = "arousal" ]; then + emo_index=1 + continue + else + emo_index=-1 + fi + + if [ "$use_position" = 'True' ]; then + prefix='time_pos_' + else + prefix='no_time_pos_' + fi + + if [ "$use_position" = 'False' ] && [ "$train_drop_perc" = '0.5' ]; then + postfix='_time_dropout_aug' + fi + + train_dir='train_logs_mediaeval18_v2/'$prefix'epochs_full_'$run_ver$postfix'/'$feat'_'$emo + echo $train_dir + sleep 3 + python -W ignore main.py --cfg conf/eev_${feat}_mediaeval18.yaml \ + FAST_DEV_RUN 0 \ + OUT_DIR $train_dir \ + OPTIM.MAX_EPOCH $max_epoch \ + OPTIM.BASE_LR $lr_init \ + OPTIM.USE_SWA True \ + TRAIN.ACCUM_GRAD_BATCHES $batch_size \ + TRAIN.DROP_PERC $train_drop_perc \ + TCN.DROPOUT $dropout \ + DATA_LOADER.EMO_INDEX $emo_index \ + DATA_LOADER.NUM_WORKERS 16 \ + MODEL.USE_POSITION $use_position + done + done + done +done diff --git a/src/scripts/testing.sh b/src/scripts/testing.sh new file mode 100755 index 0000000..5bb2331 --- /dev/null +++ b/src/scripts/testing.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +trap "exit" INT + + +test_weights=PATH_TO_CKPT_FILE # e.g. 
'./train_logs/audio/checkpoints/swa_last.ckpt' +test_dir=PATH_TO_OUT_DIR # folder to write output, e.g. './train_logs/tmp/testing1' +config_path=PATH_TO_CONFIG_FILE # e.g., ./train_logs/audio/config_audio.yaml + +python main.py --cfg $config_path \ + OUT_DIR $test_dir \ + TEST.WEIGHTS $test_weights diff --git a/src/train.sh b/src/train.sh deleted file mode 100644 index e50a6e8..0000000 --- a/src/train.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -trap "exit" INT - -train_dir=./train_logs/ablation_bestAudioEff_notime -lr_val=5e-3 -num_epoch=20 -batch_size=32 - -feature_name=audio -echo $feature_name -sleep 1 -python -W ignore main.py --epoch $num_epoch --dir $train_dir --emotion -1 --lr_init $lr_val --feature $feature_name --batch_size $batch_size - -feature_name=effb0 -echo $feature_name -sleep 1 -python -W ignore main.py --epoch $num_epoch --dir $train_dir --emotion -1 --lr_init $lr_val --feature $feature_name --batch_size $batch_size - diff --git a/src/utils.py b/src/utils.py deleted file mode 100644 index 4d193cb..0000000 --- a/src/utils.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Author: Huynh Van Thong -Department of AI Convergence, Chonnam Natl. Univ. -""" - -import os -import sys - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -import pandas as pd -from torch.utils import data -import numpy as np -import torch -from torchvision import transforms -import torch.nn.functional as F -import torchmetrics -from torch.nn.utils.rnn import pad_sequence - -dataset_root_path = '/mnt/Work/Dataset/EEV/' - - -class ToTensor(object): - """ Convert ndarrays in sample to Tensors""" - - def __call__(self, sample): - feature = sample['feature'] - scores = sample['scores'] - - return {'feature': torch.from_numpy(feature).type(torch.FloatTensor), - 'timestamps': torch.from_numpy(sample['timestamps']).type(torch.LongTensor), - 'scores': torch.from_numpy(scores).type(torch.FloatTensor), - 'file_id': sample['file_id']} - - -def eev_collatefn(batch): - batch_sample = {'audio': [], 'resnet': [], 'timestamps': [], 'scores': [], 'file_id': [], 'length': []} - max_dim = -1 - for item in batch: - max_dim = max(item['resnet'].shape[0], max_dim) - for item in batch: - current_len = -1 - for ky in item.keys(): - current_val = item[ky] - if ky == 'file_id': - batch_sample[ky].append(current_val) - else: - batch_sample[ky].append(current_val) - current_len = current_val.shape[0] - batch_sample['length'].append(current_len) - for ky in batch_sample: - if ky in ['file_id', 'length']: - pass - # batch_sample[ky] = torch.stack(batch_sample[ky], dim=0) - else: - batch_sample[ky] = pad_sequence(batch_sample[ky], batch_first=True, padding_value=0) - return batch_sample - - -class EEVdataset(data.Dataset): - def __init__(self, root_path='/mnt/Work/Dataset/EEV/', split='train', feature='resnet', emotion_index=-1, - transforms=None, save_pt=False): - self.save_pt = save_pt - self.feature = feature - self.emotion_index = emotion_index - self.root_path = root_path - self.root_path_npy = self.root_path # '.' 
- self.split = split - self.transforms = transforms - if split not in ['train', 'val', 'test']: - raise ValueError('Do not support {} split for EEV dataset'.format(split)) - data_csv = pd.read_csv('{}/eev/{}.csv'.format(self.root_path, self.split)) - - id_header = 'Video ID' if split == 'test' else 'YouTube ID' - - excluded_ids = np.loadtxt('excluded_files.txt', dtype=str) - excluded_ids_single = np.loadtxt('check_1.txt', dtype=str) # - set(excluded_ids_single) - self.video_ids = list(set(data_csv[id_header].unique()) - set(excluded_ids)) - - def __len__(self): - return len(self.video_ids) - - def __getitem__(self, index): - current_id = self.video_ids[index] - if self.save_pt: - resnet_npy = np.load( - '{}/dataset/features_v2/{}/{}_{}.npy'.format(self.root_path_npy, self.split, current_id, 'resnetv2-50'), - allow_pickle=True) - - audio_npy = np.load( - '{}/dataset/features_v2/{}/{}_{}.npy'.format(self.root_path_npy, self.split, current_id, 'audio'), - allow_pickle=True) - - effb0_npy = np.load( - '{}/dataset/features_v2/{}/{}_{}.npy'.format(self.root_path_npy, self.split, current_id, 'efficientnet-b0'), - allow_pickle=True) - - audio_features = audio_npy.item()['feature'] - resnet_features = resnet_npy.item()['feature'] - effb0_features = effb0_npy.item()['feature'] - timestamps = resnet_npy.item()['timestamps'] - scores = resnet_npy.item()['scores'] - mask = np.sum(scores, axis=-1) > 0 - file_id = resnet_npy.item()['file_id'][0] - - assert mask.shape[0] == scores.shape[0] - assert audio_features.shape[0] == resnet_features.shape[0] - if audio_features.ndim == 1: - audio_features = audio_features.reshape(1, -1) - print(current_id) - if audio_features.shape[-1] != 2048: - print('Check: ', current_id) - - sample = {'resnet': resnet_features, 'audio': audio_features, 'effb0': effb0_features, 'timestamps': timestamps, - 'scores': scores, 'mask': mask, - 'file_id': file_id} - torch.save(sample, '{}dataset/features_pt/{}/{}.pt'.format(self.root_path_npy, self.split, current_id)) - return sample - else: - sample = torch.load('{}dataset/features_v2/{}/{}.pt'.format(self.root_path_npy, self.split, current_id)) - if self.split in ['train', 'val']: - mask = np.sum(sample['scores'], axis=-1) > 1e-6 - else: - mask = np.ones(sample['scores'].shape[0], dtype=np.bool) - - num_timestamps = np.sum(mask) - if self.emotion_index > -1: - scores = np.reshape(sample['scores'][mask, self.emotion_index], (-1, 1)) - else: - smooth_scores = np.zeros_like(sample['scores'][mask, :]) - # smooth_scores[:num_timestamps//2, :] = smooth_scores[:num_timestamps//2, :] + 1e-8 - scores = sample['scores'][mask, :] + smooth_scores - # if scores.shape[0] < 2: - # print('Check ', current_id) - position_info = sample['timestamps'][mask].reshape(-1, 1) / 1e6 - features = sample[self.feature][mask, :] #np.hstack([sample[self.feature][mask, :], position_info]) - use_sample = {'feature': features, 'timestamps': sample['timestamps'][mask], - 'scores': scores, 'file_id': sample['file_id']} - - if self.transforms is not None: - use_sample = self.transforms(use_sample) - - return use_sample - - -class EEVPersonr(torchmetrics.Metric): - def __init__(self, dist_sync_on_step=False): - super(EEVPersonr, self).__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state("sum", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def get_score(self, x, x_hat): - x_mean = torch.mean(x, dim=0, keepdim=True) - x_hat_mean = torch.mean(x_hat, dim=0, keepdim=True) - - 
numerator = torch.sum(torch.mul(x - x_mean, x_hat - x_hat_mean), dim=0) - denominator = torch.sqrt(torch.sum((x - x_mean) ** 2, dim=0) * torch.sum((x_hat - x_hat_mean) ** 2, dim=0)) - pearsonr_score = numerator / denominator - pearsonr_score[pearsonr_score != pearsonr_score] = -1 - return torch.mean(pearsonr_score) - - def update(self, preds, target): - # Update metric states - update_scores = 0. - # print(target.shape, preds.shape) - update_scores = update_scores + self.get_score(target[0, :, :], preds[0, :, :]) - - self.sum += update_scores - - self.total = self.total + 1 - - def compute(self): - return self.sum / self.total - - -def EEVMSELoss(targets, preds, scale_factor=1.0): - mse_exp = torch.squeeze( - torch.mean((targets * scale_factor - preds) ** 2, dim=1)) - return torch.mean(mse_exp) - - -def EEVPearsonLoss(targets, preds, scale_factor=1.0, ): - x_mean = torch.mean(targets * scale_factor, dim=1, keepdim=True) - xhat_mean = torch.mean(preds, dim=1, keepdim=True) - - numerator = torch.sum(torch.mul(targets * scale_factor - x_mean, preds - xhat_mean), dim=1) - denominator = torch.sqrt( - torch.sum((targets * scale_factor - x_mean) ** 2, dim=1) * torch.sum((preds - xhat_mean) ** 2, dim=1)) - - pearsonr_score = numerator / denominator - if torch.sum(torch.isnan(pearsonr_score)) > 0: - print(numerator, denominator) - print('Stop here') - sys.exit(0) - return 1.0 - torch.mean(pearsonr_score) - - -if __name__ == '__main__': - tmp = EEVdataset(split='test', transforms=transforms.Compose([ToTensor()])) - tmp_loader = data.DataLoader(tmp, batch_size=1) - - max_size = 0 - for i, b in enumerate(tmp_loader): - max_size = max(b['resnet'].shape[1], max_size) - print(i, max_size) - - print(max_size) - pass
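The removed feature-extraction script maps a variable-length clip onto one frame per labelled timestamp: read_video() splits the decoded frames into num_segments equal windows and keeps the centre frame of each. A minimal sketch of that index selection, with an illustrative helper name and made-up frame counts:

import numpy as np

def centre_frame_indexes(num_frames, num_segments):
    # Mirror the removed read_video(): one index per segment, starting from
    # the middle of the first window, evenly spaced over the clip.
    segment_len = num_frames // num_segments
    return np.linspace(segment_len // 2, num_frames, num=num_segments,
                       dtype=int, endpoint=False)

# A clip decoded to 300 frames, labelled at 60 timestamps -> 60 roughly evenly spaced indexes
print(centre_frame_indexes(300, 60))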
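For the audio branch, the same script zero-pads the 16 kHz waveform so it reshapes cleanly into one row per labelled timestamp before the TRILL-distilled hub module embeds each row. A sketch of that chunking step on synthetic audio:

import numpy as np

def chunk_audio(audio, num_segments):
    # Pad the tail with zeros so the 1-D waveform reshapes into
    # (num_segments, samples_per_segment), as in the removed script.
    remainder = audio.shape[0] % num_segments
    if remainder > 0:
        audio = np.pad(audio, (0, num_segments - remainder), mode='constant')
    return audio.reshape(num_segments, -1)

waveform = np.random.randn(16000 * 10 + 7).astype(np.float32)  # ~10 s at 16 kHz
print(chunk_audio(waveform, 60).shape)  # (60, 2667)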
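The new main.py drives everything from a YACS config: --cfg points at one of the YAML files in conf/, and any trailing KEY VALUE pairs on the command line override it, which is how the shell scripts set OPTIM.BASE_LR, OUT_DIR and so on per run. A self-contained sketch of that override pattern using a small stand-in config rather than the project's core/config.py:

from yacs.config import CfgNode

cfg = CfgNode()
cfg.OUT_DIR = './tmp'
cfg.OPTIM = CfgNode()
cfg.OPTIM.BASE_LR = 0.005
cfg.OPTIM.MAX_EPOCH = 20

# Roughly what "--cfg conf/eev_audio.yaml OPTIM.BASE_LR 0.001 OUT_DIR ./run1" does:
# cfg.merge_from_file('conf/eev_audio.yaml')   # YAML file overrides the defaults
cfg.merge_from_list(['OPTIM.BASE_LR', 0.001, 'OUT_DIR', './run1'])  # CLI pairs override YAML
cfg.freeze()                                   # later assignments now raise
print(cfg.OPTIM.BASE_LR, cfg.OUT_DIR)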
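In the deleted models.py, forwardx() permutes the features from (batch, length, channels) to (batch, channels, length) because the TemporalConvNet is built from 1-D convolutions, then permutes back so the regression head can run per timestep. A minimal shape-handling sketch with a plain Conv1d standing in for the TCN stack:

import torch
from torch import nn

class TinyTemporalHead(nn.Module):
    # Stand-in for TemporalConvNet + the two-layer regression head:
    # Conv1d expects (N, C, L), the Linear head expects (N, L, C).
    def __init__(self, in_ch=2048, hidden=512, num_outputs=15):
        super().__init__()
        self.temporal = nn.Conv1d(in_ch, hidden, kernel_size=3, padding=1)
        self.regress = nn.Sequential(nn.Linear(hidden, 128, bias=False), nn.ReLU(),
                                     nn.Linear(128, num_outputs, bias=False))

    def forward(self, x):                        # x: (N, L, C)
        x = self.temporal(x.permute(0, 2, 1))    # -> (N, hidden, L)
        return self.regress(x.permute(0, 2, 1))  # -> (N, L, num_outputs)

feats = torch.randn(1, 600, 2048)                # one video, 600 timestamps
print(TinyTemporalHead()(feats).shape)           # torch.Size([1, 600, 15])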
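The removed EEVPersonr metric and EEVPearsonLoss both rely on the column-wise Pearson correlation, averaged over the emotion columns, with NaN columns (constant signals) mapped to -1. A small sketch of that computation, spot-checked against scipy on made-up curves:

import torch
from scipy import stats

def mean_columnwise_pearsonr(x, x_hat):
    # x, x_hat: (timesteps, emotions); same formula as the removed get_score().
    x_mean = x.mean(dim=0, keepdim=True)
    x_hat_mean = x_hat.mean(dim=0, keepdim=True)
    num = ((x - x_mean) * (x_hat - x_hat_mean)).sum(dim=0)
    den = torch.sqrt(((x - x_mean) ** 2).sum(dim=0) * ((x_hat - x_hat_mean) ** 2).sum(dim=0))
    r = num / den
    r[r != r] = -1.0   # NaN (zero-variance column) -> -1, as in the original
    return r.mean()

t = torch.linspace(0, 1, 100)
target = torch.stack([t, 1 - t], dim=1)
pred = torch.stack([t + 0.1 * torch.randn(100), 0.5 * t], dim=1)
print(mean_columnwise_pearsonr(target, pred))
print(stats.pearsonr(target[:, 0].numpy(), pred[:, 0].numpy())[0])  # column 0 only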
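Finally, test2csv() in the deleted models.py writes the challenge submission by stacking, per video, the Video ID and Timestamp columns from test.csv next to the 15 predicted emotion columns, falling back to zeros for videos without a prediction. A sketch of that assembly with two fabricated videos (column names follow the patch, the data is made up):

import numpy as np
import pandas as pd

emotions = ['amusement', 'anger', 'awe', 'concentration', 'confusion', 'contempt',
            'contentment', 'disappointment', 'doubt', 'elation', 'interest',
            'pain', 'sadness', 'surprise', 'triumph']

test_csv = pd.DataFrame({'Video ID': ['vid_a'] * 3 + ['vid_b'] * 2,
                         'Timestamp (milliseconds)': [0, 166, 333, 0, 166]})
predictions = {'vid_a': np.random.rand(3, 15)}   # vid_b deliberately missing

rows = []
for vid in test_csv['Video ID'].unique():
    id_times = test_csv.loc[test_csv['Video ID'] == vid].values      # (k, 2)
    scores = predictions.get(vid, np.zeros((id_times.shape[0], 15))) # (k, 15) or zeros
    rows.append(np.hstack([id_times, scores]))

submission = pd.DataFrame(np.vstack(rows),
                          columns=['Video ID', 'Timestamp (milliseconds)'] + emotions)
print(submission.head())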