# extract_BCF.py

import os, cv2
import torch,time
import clip
from PIL import Image
import numpy as np
from torchvision.transforms import Compose, ToTensor, Normalize
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Only the CLIP network is kept; the transform returned alongside it by
# clip.load is discarded.
model, _ = clip.load("ViT-B/32", device=device)

# Per-channel normalisation applied to image tensors before they reach CLIP.
preprocess = Compose(
    [
        Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ]
)

# Earlier three-level prompt set, kept for reference:
#     'an underexposed photo', 'a well-exposed photo', 'a overexposed photo'

# Five exposure levels, ordered from darkest to brightest. The wording is part
# of the model input, so it must not be edited casually.
prompts = [
    'an underexposed photo',
    'a slightly underexposed photo',
    'a well-exposed photo',
    'a slightly overexposed photo',
    'a overexposed photo',
]

# Tokenise the prompts once; the same token tensor is reused for every batch.
text = clip.tokenize(prompts).to(device)

# Converts a PIL image into a float tensor.
to_tensor = ToTensor()

# Frames are resized to this (width, height) so that they split evenly into a
# 3 x 5 grid of square patches of side _PATCH_SIZE.
_FRAME_SIZE = (1120, 672)
_PATCH_SIZE = 224
# Number of frames pushed through the model in one forward pass.
_BATCH_SIZE = 10


def _read_frames(filename):
    """Decode a video file into a list of resized BGR frames.

    At most ``CAP_PROP_FRAME_COUNT`` reads are attempted; reads that fail are
    skipped, so the returned list may be shorter than the reported count.

    Raises:
        RuntimeError: if OpenCV cannot open ``filename``.
    """
    capture = cv2.VideoCapture(filename)
    try:
        if not capture.isOpened():
            raise RuntimeError(f'cannot open video: {filename}')
        reported_length = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        frames = []
        for _ in range(reported_length):
            has_frame, frame = capture.read()
            if has_frame:
                frames.append(cv2.resize(frame, _FRAME_SIZE))
        return frames
    finally:
        # Always hand the decoder back, even when opening or reading failed.
        capture.release()


def _frames_to_patches(frames):
    """Cut BGR frames into normalised RGB patches ready for the model.

    Returns a tensor of shape ``[len(frames) * 15, 3, 224, 224]`` for frames
    of size ``_FRAME_SIZE``; the 15 patches of a frame are stored contiguously.
    """
    tensors = [
        to_tensor(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))).to(device)
        for frame in frames
    ]
    image = torch.stack(tensors, dim=0)
    # [B, 3, 672, 1120] -> [B, 3, 3, 5, 224, 224]
    # (batch, channel, patch row, patch column, patch height, patch width)
    image = image.unfold(2, _PATCH_SIZE, _PATCH_SIZE).unfold(3, _PATCH_SIZE, _PATCH_SIZE)
    # Reorder to [B, 5, 3, 3, 224, 224]
    # (batch, patch column, patch row, channel, patch height, patch width)
    image = image.permute(0, 3, 2, 1, 4, 5).contiguous()
    # Merge batch, column and row into one patch axis: [B * 15, 3, 224, 224]
    image = image.reshape(-1, 3, _PATCH_SIZE, _PATCH_SIZE)
    return preprocess(image)


def _score_frames(frames):
    """Return one row of scaled prompt probabilities per frame.

    Each patch gets a softmax over the prompts; the per-patch distributions of
    a frame are concatenated into a single row and multiplied by 10.
    """
    patches = _frames_to_patches(frames)
    with torch.no_grad():
        logits_per_image, _ = model(patches, text)
    scores = logits_per_image.softmax(dim=-1)
    scores = scores.view(len(frames), -1)
    scores *= 10
    return scores


def _temporal_variance_features(scores):
    """Summarise per-frame scores as multi-scale temporal variances.

    For each stride ``step`` in (1, 2, 4, 8) the frame axis is split into
    ``8 // step`` consecutive chunks (the last chunk absorbs the remainder).
    Inside a chunk, every phase ``j`` of the stride (``chunk[j::step]``) yields
    one variance vector. The result has 8 rows; row ``i`` concatenates the
    stride-1 variance of chunk ``i`` with the phase-averaged variances of the
    stride-2, -4 and -8 chunks that contain it.
    """
    length = scores.shape[0]
    per_step = []
    for step in (1, 2, 4, 8):
        chunk_number = 8 // step
        chunk_size = length // chunk_number
        chunks = []
        for i in range(chunk_number):
            start = i * chunk_size
            # The final chunk runs to the end so no trailing frame is dropped.
            stop = (i + 1) * chunk_size if i < chunk_number - 1 else None
            chunk = scores[start:stop, :]
            chunks.append(
                [torch.var(chunk[j::step, :].float(), dim=0) for j in range(step)]
            )
        per_step.append(chunks)

    rows = []
    for i in range(8):
        rows.append(torch.cat(
            per_step[0][i]
            + [torch.mean(torch.stack(per_step[1][i // 2], dim=0), dim=0)]
            + [torch.mean(torch.stack(per_step[2][i // 4], dim=0), dim=0)]
            + [torch.mean(torch.stack(per_step[3][i // 8], dim=0), dim=0)],
            dim=0,
        ))
    return torch.stack(rows, dim=0)


def GET_BC(videos_dir, video_name):
    """Extract the BC feature matrix of one video and save it as ``.npy``.

    The video is decoded and resized, every frame is cut into patches that are
    scored against the module-level prompts, and the per-frame scores are
    reduced to multi-scale temporal variances. The result is written to
    ``BC_Features/<video name without extension>_BC.npy``. It has 8 rows; with
    the five prompts and 15 patches per frame used in this file each row holds
    4 * 75 = 300 values.

    A line reporting ``live`` or ``dead`` is printed depending on whether the
    features contain NaN/Inf. This typically happens for very short clips,
    where a chunk holds fewer than two frames. The file is saved in both cases.

    Args:
        videos_dir: Directory that contains the video.
        video_name: File name of the video inside ``videos_dir``.

    Raises:
        RuntimeError: if the video cannot be opened or yields no frames.
    """
    save_folder = 'BC_Features'
    os.makedirs(save_folder, exist_ok=True)

    filename = os.path.join(videos_dir, video_name)
    # splitext copes with extensions of any length, unlike slicing off 4 chars.
    video_name_str = os.path.splitext(video_name)[0]

    frames = _read_frames(filename)
    length = len(frames)
    print(length)
    if length == 0:
        # Fail with a clear message instead of an opaque torch.cat error below.
        raise RuntimeError(f'no frames could be decoded from: {filename}')

    # Score the frames in fixed-size batches; the last batch may be shorter.
    res = torch.cat(
        [
            _score_frames(frames[start:start + _BATCH_SIZE])
            for start in range(0, length, _BATCH_SIZE)
        ],
        dim=0,
    )

    final_res = _temporal_variance_features(res)
    print(final_res.shape)

    if torch.isnan(final_res).any() or torch.isinf(final_res).any():
        print(f'{video_name_str}: dead.')
    else:
        print(f'{video_name_str}: live.')
    np.save(os.path.join(save_folder, f'{video_name_str}_BC.npy'), final_res.cpu().numpy())
    
        
def run(lb,ub):
    """Extract features for a slice of the videos in the ``final`` directory.

    The directory listing is enumerated from 0 and ``GET_BC`` is called for
    every entry whose position lies in ``range(lb, ub)``, so ``lb`` is
    inclusive and ``ub`` exclusive. Positions follow the order returned by
    ``os.listdir``.
    """
    videos_dir = 'final'
    entries = os.listdir(videos_dir)
    wanted = range(lb, ub)
    for position, video_name in enumerate(entries):
        if position in wanted:
            GET_BC(videos_dir, video_name)

# Start the batch job only when this file is executed as a script, so that
# importing it does not immediately process the whole video directory.
if __name__ == "__main__":
    run(0, 4518)