# test.py

import torch
from PIL import Image
import PIL
import random
import threading
import os
from pyramid_dit import PyramidDiTForVideoGeneration
import cv2
import numpy as np
from diffusers.utils import export_to_video
from brisque import BRISQUE

from pyramid_dit.flux_modules.modeling_pyramid_flux import PyramidFluxTransformer

# Process-wide cache of already loaded models
model_cache = dict()

# Serializes cache population when several threads ask for a model at once
model_cache_lock = threading.Lock()

cpu_offloading = True

# Configuration
model_name = "pyramid_flux"    # or pyramid_mmdit
if model_name == "pyramid_mmdit":
    model_repo = "rain1011/pyramid-flow-sd3"
else:
    model_repo = "rain1011/pyramid-flow-miniflux"

model_dtype = "bf16"  # "bf16" and "fp32" are supported
variants = dict(
    high='diffusion_transformer_768p',  # high-resolution checkpoint directory
    low='diffusion_transformer_384p',   # low-resolution checkpoint directory
)
required_file = 'config.json'  # config file that must be present

current_directory = "/mnt/iliad/models/huggingface/hub/"
# Directory that holds the downloaded model files
model_path = os.path.join(current_directory, "pyramid_flow_model")

# Function to initialize the model based on user options
def initialize_model(variant):
    """Build a ``PyramidDiTForVideoGeneration`` model for a resolution variant.

    Args:
        variant: Resolution selector. ``'768p'`` selects ``variants['high']``;
            every other value falls back to ``variants['low']``.

    Returns:
        A ``(model, torch_dtype)`` tuple. ``torch_dtype`` is ``torch.bfloat16``
        when ``model_dtype == "bf16"`` and ``torch.float32`` otherwise.

    Raises:
        FileNotFoundError: If ``required_file`` is missing from the variant
            directory below ``model_path``.
        Exception: Any error raised while constructing or placing the model
            is logged and re-raised unchanged.
    """
    # Report the configured precision instead of a hard-coded "bf16"
    print(f"[INFO] Initializing model with variant='{variant}', using {model_dtype} precision...")

    # Determine the correct variant directory
    variant_dir = variants['high'] if variant == '768p' else variants['low']

    print(f"[DEBUG] Model base path: {model_path}")

    # Fail early with a clear message when the variant's config file is absent
    variant_path = os.path.join(model_path, variant_dir)
    if not os.path.exists(os.path.join(variant_path, required_file)):
        message = f"{required_file} not found in '{variant_path}'."
        print(f"[ERROR] {message}")
        raise FileNotFoundError(message)

    torch_dtype_selected = torch.bfloat16 if model_dtype == "bf16" else torch.float32

    try:
        model = PyramidDiTForVideoGeneration(
            model_path,                     # base model path
            model_name=model_name,          # pyramid_flux or pyramid_mmdit
            model_dtype=model_dtype,
            model_variant=variant_dir,      # variant directory name
            cpu_offloading=cpu_offloading,
        )

        # Always enable tiling for the VAE
        model.vae.enable_tiling()

        if torch.cuda.is_available():
            torch.cuda.set_device(0)
            # Components are moved to the GPU by hand only when CPU offloading
            # is off; with offloading on, placement is presumably handled by
            # the model itself (NOTE(review): confirm against pyramid_dit).
            if not cpu_offloading:
                model.vae.to("cuda")
                model.dit.to("cuda")
                model.text_encoder.to("cuda")
        else:
            print("[WARNING] CUDA is not available. Proceeding without GPU.")

        print("[INFO] Model initialized successfully.")
        return model, torch_dtype_selected
    except Exception as e:
        print(f"[ERROR] Error initializing model: {e}")
        # Bare raise keeps the original traceback intact
        raise


# Function to get the model from cache or initialize it
def initialize_model_cached(variant, seed):
    """Seed torch's RNGs and return the cached ``(model, dtype)`` for ``variant``.

    A ``seed`` of 0 is replaced by a random integer in ``[0, 255]``. The model
    is created through ``initialize_model`` on first use of a variant and
    stored in ``model_cache``; later calls reuse the stored entry.
    """
    chosen_seed = random.randint(0, 2**8 - 1) if seed == 0 else seed
    torch.manual_seed(chosen_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(chosen_seed)
        torch.cuda.manual_seed_all(chosen_seed)

    # Fast path: the variant has already been loaded
    if variant in model_cache:
        return model_cache[variant]

    with model_cache_lock:
        # Re-check under the lock; another thread may have loaded it meanwhile
        if variant not in model_cache:
            loaded_model, loaded_dtype = initialize_model(variant)
            model_cache[variant] = (loaded_model, loaded_dtype)
        return model_cache[variant]

def resize_crop_image(img: PIL.Image.Image, tgt_width, tgt_height):
    """Scale ``img`` to cover the target size, then center-crop it.

    The image is resized with LANCZOS resampling by the larger of the two
    width/height ratios, so both resized dimensions reach the target. The
    excess is then cropped away evenly around the center.

    Args:
        img: Source image.
        tgt_width: Width of the returned image in pixels.
        tgt_height: Height of the returned image in pixels.

    Returns:
        A new image of size ``(tgt_width, tgt_height)``.
    """
    ori_width, ori_height = img.width, img.height
    scale = max(tgt_width / ori_width, tgt_height / ori_height)
    resized_width = round(ori_width * scale)
    resized_height = round(ori_height * scale)
    img = img.resize((resized_width, resized_height), resample=PIL.Image.LANCZOS)

    # Pillow rounds each crop-box edge independently. With fractional edges
    # (odd excess) left and right may round in opposite directions, which
    # yields a crop one pixel off the target for odd target sizes. Rounding
    # the origin once and adding the target size guarantees the exact size.
    left = round((resized_width - tgt_width) / 2)
    top = round((resized_height - tgt_height) / 2)

    # Crop the center of the image
    return img.crop((left, top, left + tgt_width, top + tgt_height))

def generate_image_to_video(image, prompt, temp, video_guidance_scale, k=0):
    """Generate video frames from a start image with the cached '384p' model.

    Args:
        image: Start image; it is resized and center-cropped to 640x384 first.
        prompt: Text prompt forwarded to ``model.generate_i2v``.
        temp: Forwarded to ``model.generate_i2v`` as ``temp``.
        video_guidance_scale: Forwarded to ``model.generate_i2v``.
        k: Forwarded to ``model.generate_i2v`` as ``keep_last_n_latents``.

    Returns:
        The result of ``model.generate_i2v`` with ``output_type="pil"``
        (presumably a list of PIL images). On failure a ``str`` describing
        the error is returned instead of raising, so callers must check the
        type of the result.
    """
    image = resize_crop_image(image, 640, 384)

    # seed=0 makes initialize_model_cached pick a random seed for this call
    try:
        model, torch_dtype_selected = initialize_model_cached('384p', seed=0)
    except Exception as e:
        print(f"[ERROR] Model initialization failed: {e}")
        return f"Model initialization failed: {e}"

    try:
        print("[INFO] Starting image-to-video generation...")
        with torch.no_grad(), torch.autocast('cuda', dtype=torch_dtype_selected):
            frames = model.generate_i2v(
                prompt=prompt,
                input_image=image,
                num_inference_steps=[10, 10, 10],
                temp=temp,
                video_guidance_scale=video_guidance_scale,
                output_type="pil",
                # Use the same flag the model was constructed with instead of
                # a hard-coded True, so both stay in sync if it is changed.
                cpu_offloading=cpu_offloading,
                save_memory=True,
                callback=None,
                keep_last_n_latents=k,
            )
        print("[INFO] Image-to-video generation completed.")
    except Exception as e:
        print(f"[ERROR] Error during image-to-video generation: {e}")
        return f"Error during video generation: {e}"

    return frames

# Use CLIP to measure similarity between frames
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt

def brisque_score(frames):
    """Return the BRISQUE score of every frame, in input order.

    Args:
        frames: Iterable of images accepted by ``BRISQUE.score``.

    Returns:
        A list with one score per frame; empty when ``frames`` is empty.
    """
    # Nothing to score: skip constructing the scorer altogether
    if not frames:
        return []

    # The scorer is created here; the former reference to a module-level
    # ``brisque`` object pointed at a name that was never defined.
    scorer = BRISQUE(url=False)
    return [scorer.score(frame) for frame in frames]

def plot_similarity(frames, filename):
    """Plot frame-to-frame CLIP feature change and per-frame BRISQUE scores.

    For each pair of consecutive frames the L2 norm of the difference of
    their CLIP image features is computed. Note that this is a distance:
    larger values mean the frames are less alike, even though the legend
    labels the curve 'Similarity'. BRISQUE scores are drawn in red on a
    second y-axis that shares the x-axis.

    Args:
        frames: Sequence of images accepted by ``CLIPProcessor`` and
            ``BRISQUE.score``.
        filename: Path the figure is saved to.
    """
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

    # Inference only: no_grad avoids building an autograd graph per frame
    with torch.no_grad():
        frames_features = [
            model.get_image_features(**processor(images=frame, return_tensors="pt"))
            for frame in frames
        ]

    # Feature distance between each frame and its successor
    similarity = [
        np.linalg.norm((current - following).detach().numpy())
        for current, following in zip(frames_features, frames_features[1:])
    ]

    obj = BRISQUE(url=False)
    brisque_scores = [obj.score(frame) for frame in frames]

    # Two y-axes on one shared x-axis: distance on the left, BRISQUE on the right
    fig, ax1 = plt.subplots()
    try:
        ax1.plot(similarity)
        ax2 = ax1.twinx()
        ax2.plot(brisque_scores, color='red')
        # Add legend with colors
        ax1.legend(['Similarity'], loc='upper left')
        ax2.legend(['Brisque Scores'], loc='upper center')
        fig.savefig(filename)
    finally:
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)


# Driver: for every parameter set, chain `iters` image-to-video generations
# (each run starts from the last frame of the previous one), save the
# concatenated frames as a video and plot the per-frame metrics.
all_images = []
# Alternative input: Image.open("road.jpg")
image_name = "wall"
image = Image.open("assets/the_great_wall.jpg")
image = resize_crop_image(image, 640, 384)

# exist_ok avoids the check-then-create race and is a no-op if it exists
os.makedirs("outputs", exist_ok=True)

params = [
    {
        "k": 0,
        "temp": 10,
        "iters": 4,
    },
    {
        "k": 0,
        "temp": 40,
        "iters": 1,
    },
]
for param in params:
    k = param["k"]
    temp = param["temp"]
    iters = param["iters"]
    frames = [image]
    all_frames = []
    for i in range(iters):
        print(f"Generating video {i+1} of {iters}")
        result = generate_image_to_video(
            frames[-1],
            prompt="A car driving on a road",
            temp=temp,
            video_guidance_scale=4.0,
            k=k,
        )
        # generate_image_to_video reports failures as a string; without this
        # check the message would be extended into all_frames character by
        # character and the run would fail later with an unrelated error.
        if isinstance(result, str):
            raise RuntimeError(result)
        frames = result
        all_frames.extend(frames)

    file_name = f"outputs/{image_name}_{iters}x{temp}_k{k}"
    export_to_video(all_frames, f"{file_name}.mp4", fps=24)
    print(f"Saved video to {file_name}.mp4")
    print("Measuring similarity...")
    plot_similarity(all_frames, f"{file_name}_similarity.png")
    print(f"Saved similarity to {file_name}_similarity.png")