MM-MVR committed on
Commit 97bc03d · verified · 1 Parent(s): a503028

Upload files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/editing.png filter=lfs diff=lfs merge=lfs -text
+ assets/understand.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,13 @@
  ---
  title: STAR
- emoji: 💻
- colorFrom: blue
- colorTo: red
+ emoji: 👁
+ colorFrom: green
+ colorTo: yellow
  sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: STAR Demo
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,763 @@
1
+ import os
2
+ import sys
3
+ import spaces
4
+ import gradio as gr
5
+ import numpy as np
6
+ import torch
7
+ import random
8
+ import time
9
+ from PIL import Image
10
+ from huggingface_hub import hf_hub_download
11
+ import subprocess
12
+ subprocess.run(
13
+ "pip install flash-attn==2.7.3 --no-build-isolation",
14
+ shell=True
15
+ )
16
+
17
+ from star.models.config import load_config_from_json, STARMultiModalConfig
18
+ from star.models.model import STARMultiModal
19
+
20
+
21
+ TEXTS = {
22
+ "zh": {
23
+ "title": "🌟 STAR 多模态演示",
24
+ "description": "基于STAR模型的多模态AI演示系统,支持文本生成图像、图像编辑和图像理解功能。",
25
+ "please_load_model": "请先加载模型!",
26
+ "please_upload_image": "请上传图像!",
27
+ "generation_failed": "生成失败!",
28
+ "generation_success_diffusion": "生成成功!",
29
+ "generation_success_vq": "生成成功!",
30
+ "edit_failed": "编辑失败!",
31
+ "edit_success_diffusion": "编辑成功!",
32
+ "edit_success_vq": "编辑成功!",
33
+ "understanding_failed": "理解失败!",
34
+ "generation_error": "生成过程中出错: ",
35
+ "edit_error": "编辑过程中出错: ",
36
+ "understanding_error": "理解过程中出错: ",
37
+ "tab_text_to_image": "🖼️ 文本生成图像",
38
+ "tab_image_edit": "🖌️ 图像编辑",
39
+ "tab_image_understanding": "📝 图像理解",
40
+ "text_prompt": "文本提示",
41
+ "text_prompt_placeholder": "A whimsical scene featuring a small elf with pointed ears and a green hat, sipping orange juice through a long straw from a disproportionately large orange. Next to the elf, a curious squirrel perches on its hind legs, while an owl with wide, observant eyes watches intently from a branch overhead. The orange's vibrant color contrasts with the muted browns and greens of the surrounding forest foliage.",
42
+ "advanced_params": "高级参数",
43
+ "cfg_scale": "CFG Scale",
44
+ "cfg_scale_info": "控制生成图像与文本的匹配程度",
45
+ "top_k": "Top-K",
46
+ "top_k_info": "采样时考虑的token数量",
47
+ "top_p": "Top-P",
48
+ "top_p_info": "核采样参数",
49
+ "generate_image": "🎨 生成图像",
50
+ "generated_image": "生成的图像",
51
+ "generation_status": "生成状态",
52
+ "input_image": "输入图像",
53
+ "edit_instruction": "编辑指令",
54
+ "edit_instruction_placeholder": "Remove the tiger in the water.",
55
+ "edit_image": "✏️ 编辑图像",
56
+ "edited_image": "编辑后的图像",
57
+ "edit_status": "编辑状态",
58
+ "question": "问题",
59
+ "question_placeholder": "Please describe the content of this image",
60
+ "max_generation_length": "最大生成长度",
61
+ "understand_image": "🔍 理解图像",
62
+ "understanding_result": "理解结果",
63
+ "usage_instructions": "使用说明",
64
+ "usage_step1": "1. **文本生成图像**: 输入文本描述,调整参数后点击生成",
65
+ "usage_step2": "2. **图像编辑**: 上传图像并输入编辑指令",
66
+ "usage_step3": "3. **图像理解**: 上传图像并提出问题",
67
+ "language": "语言 / Language"
68
+ },
69
+ "en": {
70
+ "title": "🌟 STAR Multi-Modal Demo",
71
+ "description": "A multi-modal AI demonstration system based on STAR model, supporting text-to-image generation, image editing, and image understanding.",
72
+ "please_load_model": "Please load the model first!",
73
+ "please_upload_image": "Please upload an image!",
74
+ "generation_failed": "Generation failed!",
75
+ "generation_success_diffusion": "Generation successful! ",
76
+ "generation_success_vq": "Generation successful! Using VQ decoder",
77
+ "edit_failed": "Editing failed!",
78
+ "edit_success_diffusion": "Editing successful! ",
79
+ "edit_success_vq": "Editing successful! Using VQ decoder",
80
+ "understanding_failed": "Understanding failed!",
81
+ "generation_error": "Error during generation: ",
82
+ "edit_error": "Error during editing: ",
83
+ "understanding_error": "Error during understanding: ",
84
+ "tab_text_to_image": "🖼️ Text to Image",
85
+ "tab_image_edit": "🖌️ Image Editing",
86
+ "tab_image_understanding": "📝 Image Understanding",
87
+ "text_prompt": "Text Prompt",
88
+ "text_prompt_placeholder": "A whimsical scene featuring a small elf with pointed ears and a green hat, sipping orange juice through a long straw from a disproportionately large orange. Next to the elf, a curious squirrel perches on its hind legs, while an owl with wide, observant eyes watches intently from a branch overhead. The orange's vibrant color contrasts with the muted browns and greens of the surrounding forest foliage.",
89
+ "advanced_params": "Advanced Parameters",
90
+ "cfg_scale": "CFG Scale",
91
+ "cfg_scale_info": "Controls how closely the generated image matches the text",
92
+ "top_k": "Top-K",
93
+ "top_k_info": "Number of tokens to consider during sampling",
94
+ "top_p": "Top-P",
95
+ "top_p_info": "Nucleus sampling parameter",
96
+ "generate_image": "🎨 Generate Image",
97
+ "generated_image": "Generated Image",
98
+ "generation_status": "Generation Status",
99
+ "input_image": "Input Image",
100
+ "edit_instruction": "Edit Instruction",
101
+ "edit_instruction_placeholder": "Remove the tiger in the water.",
102
+ "edit_image": "✏️ Edit Image",
103
+ "edited_image": "Edited Image",
104
+ "edit_status": "Edit Status",
105
+ "question": "Question",
106
+ "question_placeholder": "Please describe the content of this image",
107
+ "max_generation_length": "Max Generation Length",
108
+ "understand_image": "🔍 Understand Image",
109
+ "understanding_result": "Understanding Result",
110
+ "usage_instructions": "Usage Instructions",
111
+ "usage_step1": "1. **Text to Image**: Enter text description, adjust parameters and click generate",
112
+ "usage_step2": "2. **Image Editing**: Upload an image and enter editing instructions",
113
+ "usage_step3": "3. **Image Understanding**: Upload an image and ask questions",
114
+ "language": "语言 / Language"
115
+ }
116
+ }
117
+
118
+ class MockArgs:
119
+ def __init__(self):
120
+ self.data_type = "generation"
121
+ self.diffusion_as_decoder = True
122
+ self.ori_inp_dit = "seq"
123
+ self.grad_ckpt = False
124
+ self.diffusion_resolution = 1024
125
+ self.max_diff_seq_length = 256
126
+ self.max_seq_length = 8192
127
+ self.max_text_tokens = 512
128
+ self.max_pixels = 28 * 28 * 576
129
+ self.min_pixels = 28 * 28 * 16
130
+ self.vq_image_size = 384
131
+ self.vq_tokens = 576
132
+
133
+
134
+ def set_seed(seed=100):
135
+ if seed > 0:
136
+ random.seed(seed)
137
+ np.random.seed(seed)
138
+ torch.manual_seed(seed)
139
+ if torch.cuda.is_available():
140
+ torch.cuda.manual_seed(seed)
141
+ torch.cuda.manual_seed_all(seed)
142
+ torch.backends.cudnn.deterministic = True
143
+ torch.backends.cudnn.benchmark = False
144
+ return seed
145
+
146
+
147
+ def print_with_time(msg):
148
+ print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}: {msg}")
149
+
150
+
151
+ class STARInferencer:
152
+
153
+ def __init__(self, model_config_path, checkpoint_path, vq_checkpoint, device="cpu"):
154
+ self.device = device
155
+ self.model_config_path = model_config_path
156
+ self.checkpoint_path = checkpoint_path
157
+ self.vq_checkpoint_path = vq_checkpoint
158
+ self.model = None
159
+ self._load_model()
160
+
161
+ def _create_mock_args(self):
162
+
163
+ return MockArgs()
164
+
165
+ def _load_model(self):
166
+ try:
167
+ print_with_time("Loading model configuration...")
168
+ config_data = load_config_from_json(self.model_config_path)
169
+ model_config = STARMultiModalConfig(**config_data)
170
+
171
+ model_config.language_model.model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
172
+ model_config.pixel_encoder.model_path = self.vq_checkpoint_path
173
+ model_config.pixel_decoder.model_path = "Alpha-VLLM/Lumina-Image-2.0"
174
+
175
+ args = self._create_mock_args()
176
+
177
+ print_with_time("Initializing model...")
178
+ self.model = STARMultiModal(model_config, args)
179
+
180
+ if os.path.exists(self.checkpoint_path):
181
+ print_with_time(f"Loading checkpoint from {self.checkpoint_path}")
182
+ with torch.no_grad():
183
+ checkpoint = torch.load(self.checkpoint_path, map_location='cpu', weights_only=False)
184
+ if 'state_dict' in checkpoint:
185
+ state_dict = checkpoint['state_dict']
186
+ else:
187
+ state_dict = checkpoint
188
+
189
+ if not isinstance(state_dict, dict):
190
+ raise ValueError("Invalid checkpoint format")
191
+
192
+ print_with_time(f"Checkpoint contains {len(state_dict)} parameters")
193
+ self.model.load_state_dict(state_dict, strict=False)
194
+
195
+ print_with_time(f"Moving model to device: {self.device}")
196
+ self.model.to(self.device)
197
+
198
+ print_with_time("Setting model to eval mode...")
199
+ self.model.eval()
200
+
201
+ if torch.cuda.is_available():
202
+ print_with_time(f"GPU memory after model loading: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
203
+
204
+ print_with_time("Model loaded successfully!")
205
+
206
+ except Exception as e:
207
+ print_with_time(f"Error loading model: {str(e)}")
208
+ import traceback
209
+ traceback.print_exc()
210
+ raise e
211
+
212
+ @spaces.GPU(duration=210)
213
+ def generate_image(self, prompt, num_images=1, cfg=20.0, topk=2000, topp=1.0, seed=0):
214
+
215
+ if self.model.device.type == 'cpu':
216
+ print_with_time("Moving model to GPU...")
217
+ self.model.to('cuda')
218
+ self.model.to(torch.bfloat16)
219
+ print_with_time("Model moved to GPU")
220
+
221
+ set_seed(seed)
222
+
223
+ print_with_time(f"Generating image for prompt: {prompt}")
224
+
225
+ cfg = max(1.0, min(20.0, float(cfg)))
226
+ topk = max(100, min(2000, int(topk)))
227
+ topp = max(0.1, min(1.0, float(topp)))
228
+
229
+ print_with_time(f"Using validated params: cfg={cfg}, topk={topk}, topp={topp}")
230
+
231
+ if not (torch.isfinite(torch.tensor(cfg)) and torch.isfinite(torch.tensor(topk)) and torch.isfinite(torch.tensor(topp))):
232
+ print_with_time("Warning: Non-finite parameters detected")
233
+ return None
234
+
235
+ try:
236
+ with torch.no_grad():
237
+ if torch.cuda.is_available():
238
+ torch.cuda.empty_cache()
239
+ print_with_time(f"GPU memory before generation: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
240
+
241
+ if not isinstance(prompt, str) or len(prompt.strip()) == 0:
242
+ print_with_time("Warning: Invalid prompt")
243
+ return None
244
+
245
+ if not (0 < cfg <= 20 and 0 < topk <= 5000 and 0 < topp <= 1):
246
+ print_with_time(f"Warning: Invalid parameters - cfg={cfg}, topk={topk}, topp={topp}")
247
+ return None
248
+
249
+ print_with_time("Calling model.generate_images...")
250
+
251
+ safe_max_tokens = 576
252
+
253
+ output = self.model.generate_images(
254
+ prompt,
255
+ max_new_tokens=safe_max_tokens,
256
+ num_return_sequences=num_images,
257
+ cfg_weight=cfg,
258
+ topk_sample=topk,
259
+ topp_sample=topp,
260
+ reasoning=False,
261
+ return_dict=True
262
+ )
263
+ print_with_time("Model generation completed")
264
+
265
+ if output is None:
266
+ print_with_time("Warning: Model returned None output")
267
+ return None
268
+
269
+ print_with_time("Processing output images...")
270
+ result = self._process_output_images(output, num_images)
271
+ print_with_time("Image processing completed")
272
+ return result
273
+ except Exception as e:
274
+ print_with_time(f"Error during image generation: {str(e)}")
275
+ import traceback
276
+ traceback.print_exc()
277
+ if torch.cuda.is_available():
278
+ torch.cuda.empty_cache()
279
+ raise e
280
+
281
+ @spaces.GPU(duration=210)
282
+ def edit_image(self, image, instruction, num_images=1, cfg=20.0, topk=2000, topp=1.0, seed=0):
283
+
284
+ if self.model.device.type == 'cpu':
285
+ print_with_time("Moving model to GPU...")
286
+ self.model.to('cuda')
287
+ self.model.to(torch.bfloat16)
288
+ print_with_time("Model moved to GPU")
289
+
290
+ set_seed(seed)
291
+
292
+ if isinstance(image, np.ndarray):
293
+ image = Image.fromarray(image)
294
+
295
+ print_with_time(f"Editing image with instruction: {instruction}")
296
+
297
+ with torch.no_grad():
298
+ output = self.model.generate_images_edit(
299
+ [image],
300
+ instruction,
301
+ max_new_tokens=576,
302
+ num_return_sequences=num_images,
303
+ cfg_weight=cfg,
304
+ topk_sample=topk,
305
+ topp_sample=topp,
306
+ return_dict=True
307
+ )
308
+
309
+ if output is None:
310
+ return None
311
+
312
+ return self._process_output_images(output, num_images)
313
+
314
+ @spaces.GPU(duration=180)
315
+ def understand_image(self, image, question, max_new_tokens=256):
316
+
317
+ if self.model.device.type == 'cpu':
318
+ print_with_time("Moving model to GPU...")
319
+ self.model.to('cuda')
320
+ self.model.to(torch.bfloat16)
321
+ print_with_time("Model moved to GPU")
322
+
323
+ if isinstance(image, np.ndarray):
324
+ image = Image.fromarray(image)
325
+
326
+ print_with_time(f"Understanding image with question: {question}")
327
+
328
+ with torch.no_grad():
329
+ answer = self.model.inference_understand(
330
+ image=image,
331
+ question=question,
332
+ max_new_tokens=max_new_tokens
333
+ )
334
+
335
+ return answer
336
+
337
+ def _process_output_images(self, output, num_images):
338
+ image_size = 384
339
+
340
+ try:
341
+ if isinstance(output, dict):
342
+ output_images = output.get("output_images")
343
+ diff_images = output.get("diff_images")
344
+
345
+ results = {}
346
+
347
+ if output_images is not None:
348
+ if isinstance(output_images, torch.Tensor):
349
+ output_images = output_images.detach().cpu().numpy()
350
+
351
+ if output_images.size == 0:
352
+ print_with_time("Warning: Empty output_images array")
353
+ results["vq_images"] = None
354
+ else:
355
+ output_images = np.nan_to_num(output_images, nan=0.0, posinf=1.0, neginf=-1.0)
356
+ dec_vq = np.clip((output_images + 1) / 2 * 255, 0, 255)
357
+
358
+ if len(dec_vq.shape) == 3:
359
+ dec_vq = dec_vq.reshape(num_images, image_size, image_size, 3)
360
+
361
+ visual_img_vq = np.zeros((num_images, image_size, image_size, 3), dtype=np.uint8)
362
+ visual_img_vq[:, :, :] = dec_vq
363
+ imgs_vq = [Image.fromarray(visual_img_vq[j].astype(np.uint8)) for j in range(visual_img_vq.shape[0])]
364
+ results["vq_images"] = imgs_vq
365
+
366
+ if diff_images is not None:
367
+ results["diff_images"] = diff_images
368
+ else:
369
+ results["diff_images"] = None
370
+
371
+ return results
372
+ else:
373
+ if isinstance(output, torch.Tensor):
374
+ output = output.detach().cpu().numpy()
375
+
376
+ output = np.nan_to_num(output, nan=0.0, posinf=1.0, neginf=-1.0)
377
+ dec = np.clip((output + 1) / 2 * 255, 0, 255)
378
+
379
+ if len(dec.shape) == 3:
380
+ dec = dec.reshape(num_images, image_size, image_size, 3)
381
+
382
+ visual_img = np.zeros((num_images, image_size, image_size, 3), dtype=np.uint8)
383
+ visual_img[:, :, :] = dec
384
+ imgs = [Image.fromarray(visual_img[j].astype(np.uint8)) for j in range(visual_img.shape[0])]
385
+ return {"vq_images": imgs, "diff_images": None}
386
+
387
+ except Exception as e:
388
+ print_with_time(f"Error in _process_output_images: {str(e)}")
389
+ return {"vq_images": None, "diff_images": None}
390
+
391
+
392
+ inferencer = None
393
+
394
+
395
+
396
+ def save_language_setting(language):
397
+ try:
398
+ with open('.language_setting', 'w') as f:
399
+ f.write(language)
400
+ except:
401
+ pass
402
+
403
+ def update_interface_language(language):
404
+ global current_language
405
+ current_language = language
406
+
407
+ save_language_setting(language)
408
+
409
+ return [
410
+ language,
411
+ f"# {get_text('title')}",
412
+ get_text("description"),
413
+ get_text("text_prompt_placeholder"),
414
+ get_text("edit_instruction_placeholder"),
415
+ get_text("question_placeholder"),
416
+ f"""
417
+ ---
418
+ ### {get_text("usage_instructions")}
419
+ {get_text("usage_step1")}
420
+ {get_text("usage_step2")}
421
+ {get_text("usage_step3")}
422
+ """,
423
+ f"✅ Language switched to {language.upper()} successfully! / 语言已成功切换为{language.upper()}!" # 状态消息
424
+ ]
425
+
426
+ current_language = "en"
427
+
428
+ def get_text(key):
429
+ return TEXTS[current_language].get(key, key)
430
+
431
+
432
+ def auto_detect_device():
433
+ if torch.cuda.is_available():
434
+ device = f"cuda:{torch.cuda.current_device()}"
435
+ print_with_time(f"Detected CUDA device: {device}")
436
+ print_with_time(f"GPU name: {torch.cuda.get_device_name()}")
437
+ print_with_time(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
438
+ else:
439
+ device = "cpu"
440
+ print_with_time("No CUDA device detected, using CPU")
441
+ return device
442
+
443
+
444
+ def initialize_model_on_startup():
445
+ global inferencer
446
+
447
+ default_checkpoint = hf_hub_download(
448
+ repo_id="MM-MVR/STAR-7B",
449
+ filename="STAR-7B.pt"
450
+ )
451
+
452
+ default_config = "star/configs/STAR_Qwen2.5-VL-7B.json"
453
+
454
+ vq_checkpoint = hf_hub_download(
455
+ repo_id="MM-MVR/STAR-VQ",
456
+ filename="VQ-Model.pt"
457
+ )
458
+
459
+
460
+ if not os.path.exists(default_config):
461
+ print_with_time(f"⚠️ Model config file not found: {default_config}")
462
+ return False, f"Model config file not found: {default_config}"
463
+
464
+ if not os.path.exists(default_checkpoint):
465
+ print_with_time(f"⚠️ Model checkpoint file not found: {default_checkpoint}")
466
+ return False, f"Model checkpoint file not found: {default_checkpoint}"
467
+
468
+ try:
469
+ device = 'cpu'
470
+ print_with_time("Starting to load STAR model...")
471
+
472
+ inferencer = STARInferencer(default_config, default_checkpoint, vq_checkpoint, device)
473
+
474
+ print_with_time("✅ STAR model loaded successfully!")
475
+ return True, "✅ STAR model loaded successfully!"
476
+
477
+ except Exception as e:
478
+ error_msg = f"❌ Model loading failed: {str(e)}"
479
+ print_with_time(error_msg)
480
+ return False, error_msg
481
+
482
+
483
+
484
+
485
+ def text_to_image(prompt, cfg_scale=1.0, topk=1000, topp=0.8):
486
+ if inferencer is None:
487
+ return None, get_text("please_load_model")
488
+
489
+ cfg_scale = max(1.0, min(20.0, cfg_scale))
490
+ topk = max(100, min(2000, int(topk)))
491
+ topp = max(0.1, min(1.0, topp))
492
+ seed = 100
493
+
494
+ try:
495
+ print_with_time(f"Starting generation with params: cfg={cfg_scale}, topk={topk}, topp={topp}, seed={seed}")
496
+ result = inferencer.generate_image(prompt, cfg=cfg_scale, topk=topk, topp=topp, seed=seed)
497
+
498
+ if result is None:
499
+ return None, get_text("generation_failed")
500
+
501
+ if result.get("diff_images") and len(result["diff_images"]) > 0:
502
+ return result["diff_images"][0], get_text("generation_success_diffusion")
503
+ elif result.get("vq_images") and len(result["vq_images"]) > 0:
504
+ return result["vq_images"][0], get_text("generation_success_vq")
505
+ else:
506
+ return None, get_text("generation_failed")
507
+
508
+ except Exception as e:
509
+ return None, get_text("generation_error") + str(e)
510
+
511
+
512
+ def image_editing(image, instruction, cfg_scale=1.0, topk=1000, topp=0.8):
513
+ if inferencer is None:
514
+ return None, get_text("please_load_model")
515
+
516
+ if image is None:
517
+ return None, get_text("please_upload_image")
518
+
519
+
520
+ cfg_scale = max(1.0, min(20.0, cfg_scale))
521
+ topk = max(100, min(2000, int(topk)))
522
+ topp = max(0.1, min(1.0, topp))
523
+ seed = 100
524
+
525
+ try:
526
+ print_with_time(f"Starting image editing with params: cfg={cfg_scale}, topk={topk}, topp={topp}, seed={seed}")
527
+ result = inferencer.edit_image(image, instruction, cfg=cfg_scale, topk=topk, topp=topp, seed=seed)
528
+
529
+ if result is None:
530
+ return None, get_text("edit_failed")
531
+
532
+ if result.get("diff_images") and len(result["diff_images"]) > 0:
533
+ return result["diff_images"][0], get_text("edit_success_diffusion")
534
+ elif result.get("vq_images") and len(result["vq_images"]) > 0:
535
+ return result["vq_images"][0], get_text("edit_success_vq")
536
+ else:
537
+ return None, get_text("edit_failed")
538
+
539
+ except Exception as e:
540
+ return None, get_text("edit_error") + str(e)
541
+
542
+
543
+ def image_understanding(image, question, max_new_tokens=256):
544
+ if inferencer is None:
545
+ return get_text("please_load_model")
546
+
547
+ if image is None:
548
+ return get_text("please_upload_image")
549
+
550
+ try:
551
+ answer = inferencer.understand_image(image, question, max_new_tokens)
552
+ return answer if answer else get_text("understanding_failed")
553
+
554
+ except Exception as e:
555
+ return get_text("understanding_error") + str(e)
556
+
557
+
558
+ def change_language(language):
559
+ global current_language
560
+ current_language = language
561
+
562
+ return (
563
+ get_text("title"),
564
+ get_text("description"),
565
+ get_text("tab_text_to_image"),
566
+ get_text("text_prompt"),
567
+ get_text("text_prompt_placeholder"),
568
+ get_text("advanced_params"),
569
+ get_text("cfg_scale"),
570
+ get_text("cfg_scale_info"),
571
+ get_text("top_k"),
572
+ get_text("top_k_info"),
573
+ get_text("top_p"),
574
+ get_text("top_p_info"),
575
+ get_text("random_seed"),
576
+ get_text("random_seed_info"),
577
+ get_text("generate_image"),
578
+ get_text("generated_image"),
579
+ get_text("generation_status"),
580
+ get_text("tab_image_edit"),
581
+ get_text("input_image"),
582
+ get_text("edit_instruction"),
583
+ get_text("edit_instruction_placeholder"),
584
+ get_text("edit_image"),
585
+ get_text("edited_image"),
586
+ get_text("edit_status"),
587
+ get_text("tab_image_understanding"),
588
+ get_text("question"),
589
+ get_text("question_placeholder"),
590
+ get_text("max_generation_length"),
591
+ get_text("understand_image"),
592
+ get_text("understanding_result"),
593
+ get_text("usage_instructions"),
594
+ get_text("usage_step1"),
595
+ get_text("usage_step2"),
596
+ get_text("usage_step3")
597
+ )
598
+
599
+
600
+ def load_example_image(image_path):
601
+ try:
602
+ if os.path.exists(image_path):
603
+ return Image.open(image_path)
604
+ except Exception as e:
605
+ print(f"Error loading example image: {e}")
606
+ return None
607
+
608
+
609
+
610
+ def create_interface():
611
+
612
+ print_with_time("Initializing STAR demo system...")
613
+ model_loaded, status_message = initialize_model_on_startup()
614
+
615
+ with gr.Blocks(title="🌟 STAR Multi-Modal Demo", theme=gr.themes.Soft()) as demo:
616
+
617
+ language_state = gr.State(value=current_language)
618
+ title_md = gr.Markdown(f"# {get_text('title')}")
619
+ desc_md = gr.Markdown(get_text("description"))
620
+
621
+ with gr.Row():
622
+ with gr.Column():
623
+ language_dropdown = gr.Dropdown(
624
+ choices=[("English", "en"), ("中文", "zh")],
625
+ value=current_language,
626
+ label="Language / 语言",
627
+ interactive=True
628
+ )
629
+
630
+ with gr.Tabs():
631
+ with gr.Tab(get_text("tab_text_to_image")) as txt_tab:
632
+ with gr.Row():
633
+ with gr.Column():
634
+ txt_prompt = gr.Textbox(
635
+ label=get_text("text_prompt"),
636
+ value=get_text("text_prompt_placeholder"),
637
+ lines=3
638
+ )
639
+
640
+ with gr.Accordion(get_text("advanced_params"), open=False):
641
+ txt_cfg_scale = gr.Slider(
642
+ minimum=1.0, maximum=20.0, value=1.1, step=0.1,
643
+ label=get_text("cfg_scale"), info=get_text("cfg_scale_info")
644
+ )
645
+ txt_topk = gr.Slider(
646
+ minimum=100, maximum=2000, value=1000, step=50,
647
+ label=get_text("top_k"), info=get_text("top_k_info")
648
+ )
649
+ txt_topp = gr.Slider(
650
+ minimum=0.1, maximum=1.0, value=0.8, step=0.05,
651
+ label=get_text("top_p"), info=get_text("top_p_info")
652
+ )
653
+
654
+ txt_generate_btn = gr.Button(get_text("generate_image"), variant="primary")
655
+
656
+ with gr.Column():
657
+ txt_output_image = gr.Image(label=get_text("generated_image"))
658
+ txt_status = gr.Textbox(label=get_text("generation_status"), interactive=False)
659
+
660
+
661
+ with gr.Tab(get_text("tab_image_edit")) as edit_tab:
662
+ with gr.Row():
663
+ with gr.Column():
664
+ edit_input_image = gr.Image(
665
+ label=get_text("input_image"),
666
+ value=load_example_image('assets/editing.png')
667
+ )
668
+ edit_instruction = gr.Textbox(
669
+ label=get_text("edit_instruction"),
670
+ value=get_text("edit_instruction_placeholder"),
671
+ lines=2
672
+ )
673
+
674
+ with gr.Accordion(get_text("advanced_params"), open=False):
675
+ edit_cfg_scale = gr.Slider(
676
+ minimum=1.0, maximum=20.0, value=1.1, step=0.1,
677
+ label=get_text("cfg_scale")
678
+ )
679
+ edit_topk = gr.Slider(
680
+ minimum=100, maximum=2000, value=1000, step=50,
681
+ label=get_text("top_k")
682
+ )
683
+ edit_topp = gr.Slider(
684
+ minimum=0.1, maximum=1.0, value=0.8, step=0.05,
685
+ label=get_text("top_p")
686
+ )
687
+
688
+ edit_btn = gr.Button(get_text("edit_image"), variant="primary")
689
+
690
+ with gr.Column():
691
+ edit_output_image = gr.Image(label=get_text("edited_image"))
692
+ edit_status = gr.Textbox(label=get_text("edit_status"), interactive=False)
693
+
694
+
695
+ with gr.Tab(get_text("tab_image_understanding")) as understand_tab:
696
+ with gr.Row():
697
+ with gr.Column():
698
+ understand_input_image = gr.Image(
699
+ label=get_text("input_image"),
700
+ value=load_example_image('assets/understand.png')
701
+ )
702
+ understand_question = gr.Textbox(
703
+ label=get_text("question"),
704
+ value=get_text("question_placeholder"),
705
+ lines=2
706
+ )
707
+
708
+ with gr.Accordion(get_text("advanced_params"), open=False):
709
+ understand_max_tokens = gr.Slider(
710
+ minimum=64, maximum=1024, value=256, step=64,
711
+ label=get_text("max_generation_length")
712
+ )
713
+
714
+ understand_btn = gr.Button(get_text("understand_image"), variant="primary")
715
+
716
+ with gr.Column():
717
+ understand_output = gr.Textbox(
718
+ label=get_text("understanding_result"),
719
+ lines=15,
720
+ interactive=False
721
+ )
722
+
723
+ usage_md = gr.Markdown(
724
+ f"""
725
+ ---
726
+ ### {get_text("usage_instructions")}
727
+ {get_text("usage_step1")}
728
+ {get_text("usage_step2")}
729
+ {get_text("usage_step3")}
730
+ """
731
+ )
732
+
733
+ txt_generate_btn.click(
734
+ fn=text_to_image,
735
+ inputs=[txt_prompt, txt_cfg_scale, txt_topk, txt_topp],
736
+ outputs=[txt_output_image, txt_status]
737
+ )
738
+
739
+ edit_btn.click(
740
+ fn=image_editing,
741
+ inputs=[edit_input_image, edit_instruction, edit_cfg_scale, edit_topk, edit_topp],
742
+ outputs=[edit_output_image, edit_status]
743
+ )
744
+
745
+ understand_btn.click(
746
+ fn=image_understanding,
747
+ inputs=[understand_input_image, understand_question, understand_max_tokens],
748
+ outputs=understand_output
749
+ )
750
+
751
+
752
+ language_dropdown.change(
753
+ fn=update_interface_language,
754
+ inputs=[language_dropdown],
755
+ outputs=[language_state, title_md, desc_md, txt_prompt, edit_instruction, understand_question, usage_md, txt_status]
756
+ )
757
+
758
+ return demo
759
+
760
+ demo = create_interface()
761
+
762
+ demo.launch(share=True, show_error=True)
763
+
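For quick checks outside the Gradio UI, the sketch below drives STARInferencer directly, mirroring initialize_model_on_startup() and text_to_image() above. It assumes the classes defined in app.py are in scope on a machine with a GPU; the prompt and output filename are illustrative.

from huggingface_hub import hf_hub_download

ckpt = hf_hub_download(repo_id="MM-MVR/STAR-7B", filename="STAR-7B.pt")
vq_ckpt = hf_hub_download(repo_id="MM-MVR/STAR-VQ", filename="VQ-Model.pt")
inferencer = STARInferencer("star/configs/STAR_Qwen2.5-VL-7B.json", ckpt, vq_ckpt, device="cpu")

# generate_image moves the model to the GPU on first call and returns a dict with
# "diff_images" (Lumina diffusion decoder) and "vq_images" (VQ decoder) image lists.
result = inferencer.generate_image("A watercolor fox in a snowy forest",
                                   cfg=1.1, topk=1000, topp=0.8, seed=100)
if result:
    images = result.get("diff_images") or result.get("vq_images")
    if images:
        images[0].save("generated.png")  # illustrative output path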
assets/editing.png ADDED

Git LFS Details

  • SHA256: 725278dda08a4ce97589396aac69bb0c703b05d9de861fb9b278444f5b936af5
  • Pointer size: 131 Bytes
  • Size of remote file: 591 kB
assets/understand.png ADDED

Git LFS Details

  • SHA256: cb0fe61f3b81bc2ffbc0f5838d745881f0e77b95c7e3fd96bf43e7715dfd7fd8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.73 MB
requirements.txt ADDED
@@ -0,0 +1,30 @@
+
+ torch>=2.2.2
+ torchvision>=0.17.2
+ torchaudio>=2.2.2
+ transformers==4.51.0
+ diffusers==0.33.0
+ decord>=0.6.0
+ attrdict2
+ accelerate>=0.32.0
+ timm>=1.0.15
+ opencv-python>=4.10.0
+ pillow>=10.4.0
+ einops>=0.8.0
+ xformers>=0.0.28
+ numpy>=1.26.0
+ pandas>=2.2.0
+ datasets>=3.0.0
+ tokenizers>=0.21.0
+ sentencepiece>=0.1.99
+ torchmetrics>=1.4.0
+ tqdm>=4.66.0
+ pyyaml>=6.0.0
+ requests>=2.32.0
+ packaging>=24.1
+ ipython>=8.26.0
+ matplotlib>=3.9.0
+ deepspeed>=0.14.4
+ wandb>=0.16.3
+ gradio>=5.34.0
+ qwen-vl-utils
star/.DS_Store ADDED
Binary file (6.15 kB)
 
star/configs/STAR_Qwen2.5-VL-3B.json ADDED
@@ -0,0 +1,35 @@
+ {
+     "model_name": "STAR_Qwen2.5-3B_VQGAN",
+     "model_type": "STARMultiModalityConfig",
+     "language_model": {
+         "model_name": "Qwen2.5-VL",
+         "model_path": "checkpoints/Qwen2.5-VL-3B-Instruct"
+     },
+     "pixel_encoder": {
+         "model_name": "VQ_Model",
+         "model_path": "checkpoints/VQ-Model.pt",
+         "image_token_size": 65536,
+         "n_embed": 512,
+         "num_tokens": 576,
+         "num_heads": 8
+     },
+     "pixel_adapter": {
+         "model_name": "MLP_GELU",
+         "depth": 2,
+         "input_dim": 512,
+         "n_embed": 2048
+     },
+     "stacked_ar": {
+         "num_layers": 16
+     },
+     "pixel_output_head": {
+         "image_token_embed": 4096,
+         "image_token_size": 65536,
+         "n_embed": 2048
+     },
+     "pixel_decoder": {
+         "model_name": "LUMINA2",
+         "model_path": "checkpoints/lumina-image2"
+     },
+     "torch_dtype": "bfloat16"
+ }
star/configs/STAR_Qwen2.5-VL-7B.json ADDED
@@ -0,0 +1,35 @@
+ {
+     "model_name": "STAR_Qwen2.5-7B_VQGAN",
+     "model_type": "STARMultiModalityConfig",
+     "language_model": {
+         "model_name": "Qwen2.5-VL",
+         "model_path": "checkpoints/Qwen2.5-VL-7B-Instruct"
+     },
+     "pixel_encoder": {
+         "model_name": "VQ_Model",
+         "model_path": "checkpoints/VQ-Model.pt",
+         "image_token_size": 65536,
+         "n_embed": 512,
+         "num_tokens": 576,
+         "num_heads": 8
+     },
+     "pixel_adapter": {
+         "model_name": "MLP_GELU",
+         "depth": 4,
+         "input_dim": 512,
+         "n_embed": 3584
+     },
+     "stacked_ar": {
+         "num_layers": 14
+     },
+     "pixel_output_head": {
+         "image_token_embed": 4096,
+         "image_token_size": 65536,
+         "n_embed": 3584
+     },
+     "pixel_decoder": {
+         "model_name": "LUMINA2",
+         "model_path": "checkpoints/lumina-image2"
+     },
+     "torch_dtype": "bfloat16"
+ }
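The checkpoints/... paths above are local placeholders; at runtime app.py loads this JSON and overrides them with hosted weights. A minimal sketch of that flow, following STARInferencer._load_model (the local VQ path stands in for whatever hf_hub_download returns):

from star.models.config import load_config_from_json, STARMultiModalConfig

config_data = load_config_from_json("star/configs/STAR_Qwen2.5-VL-7B.json")
model_config = STARMultiModalConfig(**config_data)

# Same overrides as in app.py
model_config.language_model.model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
model_config.pixel_encoder.model_path = "VQ-Model.pt"  # local file downloaded from MM-MVR/STAR-VQ
model_config.pixel_decoder.model_path = "Alpha-VLLM/Lumina-Image-2.0"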
star/models/adapter/projector.py ADDED
@@ -0,0 +1,26 @@
+
+ import torch
+ import torch.nn as nn
+
+ class MlpProjector(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+
+         self.cfg = cfg
+
+         if cfg.model_name == "MLP_GELU":
+             mlp_depth = cfg.get("depth", 1)
+             modules = [nn.Linear(cfg.input_dim, cfg.n_embed)]
+             for _ in range(1, mlp_depth):
+                 modules.append(nn.GELU())
+                 modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
+             modules = nn.Sequential(*modules)
+
+         else:
+             raise ValueError(f"Unknown projector type: {cfg.model_name}")
+
+         self.layers = modules
+
+     def forward(self, x):
+
+         return self.layers(x)
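A shape sketch for this adapter: with the 7B config above (depth 4, input_dim 512, n_embed 3584, 576 VQ tokens per image) it lifts VQ codebook entries to the LLM hidden size. The random tensor is illustrative.

import torch
from attrdict2 import AttrDict
from star.models.adapter.projector import MlpProjector

cfg = AttrDict({"model_name": "MLP_GELU", "depth": 4, "input_dim": 512, "n_embed": 3584})
projector = MlpProjector(cfg)
codebook_entries = torch.randn(1, 576, 512)   # one image worth of VQ embeddings
print(projector(codebook_entries).shape)      # torch.Size([1, 576, 3584])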
star/models/config.py ADDED
@@ -0,0 +1,23 @@
+ import json
+ from attrdict2 import AttrDict
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ def load_config_from_json(json_path):
+     with open(json_path, "r") as f:
+         config_data = json.load(f)
+     return config_data
+
+ class STARMultiModalConfig(PretrainedConfig):
+     model_type = "STARMultiModal"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.pixel_encoder = AttrDict(kwargs.get("pixel_encoder", {}))
+         self.pixel_adapter = AttrDict(kwargs.get("pixel_adapter", {}))
+         self.pixel_output_head = AttrDict(kwargs.get("pixel_output_head", {}))
+         self.language_model = AttrDict(kwargs.get("language_model", {}))
+         self.stacked_ar = AttrDict(kwargs.get("stacked_ar", {}))
+         self.pixel_decoder = AttrDict(kwargs.get("pixel_decoder", {}))
+
star/models/data_process_utils.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+ from PIL import Image
+ import torch
+ import torchvision
+ from torchvision import transforms
+
+ BACKGROUND_COLOR = (127, 127, 127)
+
+ from torchvision.transforms import InterpolationMode
+
+ def preprocess_image_with_min_size(image, min_factor=28):
+     width, height = image.size
+     if height < min_factor or width < min_factor:
+         scale_factor = max(min_factor / height, min_factor / width)
+         new_width = int(width * scale_factor)
+         new_height = int(height * scale_factor)
+
+         image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+     return image
+
+ def preprocess_image_gen(images, processor, vq_transform):
+
+     image_list = []
+     grid_thw_list = []
+     vq_image_list = []
+     for image in images:
+         image = preprocess_image_with_min_size(image)
+
+         visual_processed = processor.preprocess(image, return_tensors="pt")
+         image_tensor = visual_processed["pixel_values"]
+         if isinstance(image_tensor, list):
+             image_tensor = image_tensor[0]
+         image_list.append(image_tensor)
+
+         grid_thw = visual_processed["image_grid_thw"][0]
+         grid_thw_list.append(grid_thw)
+
+         vq_image = vq_transform(image)
+         vq_image_list.append(vq_image)
+
+     image_tensor = torch.stack(image_list, dim=0)
+     grid_thw = torch.stack(grid_thw_list, dim=0)
+     vq_image = torch.stack(vq_image_list, dim=0)
+
+     return {
+         "pixel_values": image_tensor,
+         "image_grid_thw": grid_thw,
+         "vq_pixel_values": vq_image
+     }
+
+
+
+ def get_vq_transform(args):
+     return transforms.Compose([
+         transforms.Resize((args.vq_image_size, args.vq_image_size), interpolation=InterpolationMode.BILINEAR),
+         transforms.ToTensor(),  # [0, 255] -> [0, 1]
+         transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),  # [0, 1] -> [-1, 1]
+     ])
+
+ def get_full_transform(args):
+     return transforms.Compose([
+         transforms.Resize((1024, 1024), interpolation=InterpolationMode.BILINEAR),
+         transforms.ToTensor(),  # [0, 255] -> [0, 1]
+         transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),  # [0, 1] -> [-1, 1]
+     ])
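A small usage sketch for the VQ transform: it resizes to vq_image_size × vq_image_size (384 in MockArgs from app.py) and maps pixel values from [0, 255] to [-1, 1]. The SimpleNamespace stands in for the real args object, and the image path points at the asset added in this commit.

from types import SimpleNamespace
from PIL import Image
from star.models.data_process_utils import get_vq_transform

args = SimpleNamespace(vq_image_size=384)
vq_transform = get_vq_transform(args)
tensor = vq_transform(Image.open("assets/editing.png").convert("RGB"))
print(tensor.shape)                              # torch.Size([3, 384, 384])
print(tensor.min().item(), tensor.max().item())  # roughly within [-1.0, 1.0]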
star/models/model.py ADDED
@@ -0,0 +1,587 @@
1
+ import os
2
+ import math
3
+ import torch
4
+ import requests
5
+ from io import BytesIO
6
+ from PIL import Image
7
+ from tqdm import tqdm
8
+ import torch.nn.functional as F
9
+ import torchvision.transforms as T
10
+ from torchvision.transforms.functional import InterpolationMode
11
+ from torch.nn import CrossEntropyLoss
12
+ from transformers import (
13
+ AutoConfig,
14
+ AutoTokenizer,
15
+ AutoModelForCausalLM,
16
+ PreTrainedModel
17
+ )
18
+
19
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, Qwen2VLProcessor
20
+
21
+ from star.models.config import STARMultiModalConfig
22
+ from star.models.pixel_encoder.vq_model import VQ_Model
23
+ from star.models.adapter.projector import MlpProjector
24
+ from star.models.pixel_decoder.lumina2_decoder import Lumina2Decoder
25
+ from star.models.data_process_utils import get_full_transform, get_vq_transform, preprocess_image_gen
26
+ from star.models.rope_2d import get_rope_index_25
27
+
28
+ class STARMultiModal(PreTrainedModel):
29
+ def __init__(self, config: STARMultiModalConfig, args=None, **kwargs):
30
+ super().__init__(config)
31
+
32
+ self.config = config
33
+ self.args = args if args is not None else kwargs.get("args", None)
34
+
35
+ # Pixel Encoder Generation
36
+ model_name = config.pixel_encoder.model_name
37
+ if model_name == "VQ_Model":
38
+ self.pixel_encoder = VQ_Model(config.pixel_encoder)
39
+ else:
40
+ assert None, f"Unsupported {model_name}"
41
+ self.pixel_encoder.eval()
42
+
43
+
44
+ # Pixel Adapter Generation
45
+ model_name = config.pixel_adapter.model_name
46
+ if model_name == "MLP_GELU":
47
+ self.pixel_adapter = MlpProjector(config.pixel_adapter)
48
+ else:
49
+ assert None, f"Unsupported {model_name}"
50
+
51
+ # Pixel Output Head Generation
52
+ self.pixel_output_head = torch.nn.Linear(config.pixel_output_head.n_embed, config.pixel_output_head.image_token_size)
53
+
54
+ if getattr(args, "diffusion_as_decoder") and args.diffusion_as_decoder:
55
+ self.diffusion_decoder = Lumina2Decoder(config.pixel_decoder, args)
56
+ else:
57
+ self.diffusion_decoder = None
58
+
59
+ # Large Language Model
60
+ model_name, model_path = config.language_model.model_name, config.language_model.model_path
61
+
62
+ if model_name == "Qwen2.5-VL":
63
+ self.llm = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map="cuda")
64
+ self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
65
+ self.tokenizer = self.processor.tokenizer
66
+
67
+ self.image_processor = self.processor.image_processor
68
+ self.image_processor.max_pixels = self.args.max_pixels
69
+ self.image_processor.min_pixels = self.args.min_pixels
70
+ self.image_processor.size["longest_edge"] = self.args.max_pixels
71
+ self.image_processor.size["shortest_edge"] = self.args.min_pixels
72
+
73
+ special_token_tags = ["<|vision_start|>", "<|vision_pad|>", "<|image_pad|>", "<|vision_end|>", "<|fim_pad|>"]
74
+ self.special_tokens = {tag: self.tokenizer.vocab.get(tag, None) for tag in special_token_tags}
75
+
76
+ else:
77
+ assert None, f"unsupported {model_name}: {model_path}"
78
+ self.llm.generation_config.pad_token_id = self.tokenizer.encode(self.tokenizer.pad_token)[0]
79
+
80
+ if self.args.grad_ckpt:
81
+ self.llm.gradient_checkpointing_enable()
82
+ self.llm.visual.gradient_checkpointing_enable()
83
+
84
+
85
+ stacked_ar_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
86
+ num_layers_to_extract = config.stacked_ar.num_layers
87
+ stacked_ar_config.num_hidden_layers = num_layers_to_extract
88
+
89
+ self.stacked_ar = Qwen2_5_VLForConditionalGeneration(stacked_ar_config)
90
+
91
+ temp_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")
92
+ total_layers = len(temp_model.model.layers)
93
+ start_layer = max(0, total_layers - num_layers_to_extract)
94
+ temp_model.model.layers = temp_model.model.layers[start_layer:]
95
+ self.stacked_ar.load_state_dict(temp_model.state_dict(), strict=False)
96
+
97
+ self.stacked_ar = self.stacked_ar.to("cuda")
98
+ del self.stacked_ar.visual, self.stacked_ar.model.embed_tokens, self.stacked_ar.lm_head
99
+
100
+
101
+ # For Inference Generation
102
+ def generate_images(self, prompt, max_new_tokens=256, num_return_sequences=1, cfg_weight=5.0, topk_sample=1000, topp_sample=1.0, temperature=1.0, reasoning=False, return_dict=False):
103
+
104
+ if reasoning:
105
+ return self.generate_images_reasoning(prompt, max_new_tokens, num_return_sequences, cfg_weight, topk_sample, topp_sample, temperature, return_dict)
106
+
107
+ messages = [{'role': 'user', 'content': prompt}]
108
+ text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
109
+ text_token = self.tokenizer.encode(text)
110
+ text_token = torch.tensor(text_token).long().to(self.device)
111
+
112
+ keys = list(self.special_tokens.keys())
113
+ start_token = (torch.ones(1) * self.special_tokens.get(keys[0])).long().to(self.device)
114
+
115
+ input_ids = torch.cat((text_token, start_token)).long().to(self.device)
116
+ tokens = torch.zeros((num_return_sequences*2, len(input_ids)), dtype=torch.int).cuda()
117
+ assistant_tokens = input_ids[-4:]
118
+
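+ # Even-indexed rows keep the conditional prompt; odd rows are masked to <|fim_pad|>
+ # (chat-template head and tail preserved) to form the unconditional branch used by
+ # the classifier-free guidance mix of logit_cond and logit_uncond below.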
119
+ for i in range(num_return_sequences*2):
120
+ tokens[i, :] = input_ids
121
+ if i % 2 != 0:
122
+ tokens[i, 1:-1] = self.special_tokens.get(keys[4])
123
+ tokens[i, -4:] = assistant_tokens
124
+
125
+ inputs_embeds = self.llm.model.embed_tokens(tokens).to(self.device)
126
+ generated_tokens = torch.zeros((num_return_sequences, max_new_tokens), dtype=torch.int).cuda()
127
+
128
+ for i in range(max_new_tokens):
129
+ outputs = self.llm.model(
130
+ inputs_embeds=inputs_embeds,
131
+ use_cache=True,
132
+ past_key_values=outputs.past_key_values if i != 0 else None,
133
+ output_hidden_states=True)
134
+ last_hidden_states = outputs[0]
135
+
136
+ output_states = self.stacked_ar.model(
137
+ inputs_embeds=last_hidden_states,
138
+ past_key_values=output_states.past_key_values if i != 0 else None,
139
+ output_hidden_states=True,
140
+ use_cache=True)
141
+
142
+ last_hidden_states = output_states.hidden_states[-1]
143
+
144
+ logits = self.pixel_output_head(last_hidden_states[:, -1, :])
145
+ logit_cond = logits[0::2, :]
146
+ logit_uncond = logits[1::2, :]
147
+ logits = logit_uncond + cfg_weight * (logit_cond-logit_uncond)
148
+ next_token, _ = self.sample(logits, temperature=1.0, top_k=topk_sample, top_p=topp_sample)
149
+ generated_tokens[:, i] = next_token.squeeze(dim=-1)
150
+ next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
151
+
152
+ vqgan_embeds = self.pixel_encoder.get_codebook_entry(next_token)
153
+ img_embeds = self.pixel_adapter(vqgan_embeds)
154
+ inputs_embeds = img_embeds.unsqueeze(dim=1)
155
+
156
+ latent_size = int(math.sqrt(max_new_tokens))
157
+ output_images = self.pixel_encoder.decode_code(generated_tokens.to(dtype=torch.int), shape=[num_return_sequences, self.pixel_encoder.config.codebook_embed_dim, latent_size, latent_size])
158
+ output_images = output_images.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
159
+
160
+ diff_images = None
161
+ if self.diffusion_decoder is not None:
162
+ gen_image_embeds = self.pixel_encoder.get_codebook_entry(generated_tokens)
163
+
164
+ if self.args.diffusion_resolution==512:
165
+ self.diffusion_decoder.pipe.transformer.config.sample_size=16
166
+ elif self.args.diffusion_resolution==1024:
167
+ self.diffusion_decoder.pipe.transformer.config.sample_size=32
168
+ diff_images = self.diffusion_decoder.pipe(
169
+ prompt,
170
+ num_inference_steps=40,
171
+ guidance_scale=4.5,
172
+ gen_image_embeds=gen_image_embeds, #gen_image_embeds,
173
+ control_emd="text",
174
+ ori_inp_way=self.diffusion_decoder.transformer.ori_inp_dit,
175
+ only_t2i="vqconcat",
176
+ img_guidance_scale=1.05,
177
+ height=self.args.diffusion_resolution,
178
+ width=self.args.diffusion_resolution
179
+ ).images
180
+ if return_dict:
181
+ return {"output_images": output_images, "generated_tokens": generated_tokens, "diff_images": diff_images}
182
+ return output_images
183
+
184
+ def answer_text_qwen_vl(self, question, max_new_tokens=256, do_sample=True):
185
+
186
+ messages = [
187
+ {
188
+ "role": "user",
189
+ "content": [
190
+ {"type": "text", "text": question},
191
+ ],
192
+ }
193
+ ]
194
+
195
+ # Preparation for inference
196
+ text = self.processor.apply_chat_template(
197
+ messages, tokenize=False, add_generation_prompt=True
198
+ )
199
+ # image_inputs, video_inputs = process_vision_info(messages)
200
+ inputs = self.processor(
201
+ text=[text],
202
+ images=None,
203
+ videos=None,
204
+ padding=True,
205
+ return_tensors="pt",
206
+ )
207
+ inputs = inputs.to(self.llm.device)
208
+
209
+ # Inference: Generation of the output
210
+ generated_ids = self.llm.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=do_sample)
211
+ generated_ids_trimmed = [
212
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
213
+ ]
214
+ output_text = self.processor.batch_decode(
215
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
216
+ )
217
+
218
+ return output_text[0] if output_text else ""
219
+
220
+ def generate_images_reasoning(self, prompt, max_new_tokens=256, num_return_sequences=1, cfg_weight=5.0, topk_sample=1000, topp_sample=1.0, temperature=1.0, return_dict=False):
221
+
222
+ messages = [{'role': 'user', 'content': prompt}]
223
+ text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
224
+ text_token = self.tokenizer.encode(text)
225
+ text_token = torch.tensor(text_token).long().to(self.device)
226
+
227
+ keys = list(self.special_tokens.keys())
228
+ start_token = (torch.ones(1) * self.special_tokens.get(keys[0])).long().to(self.device)
229
+
230
+ input_ids = torch.cat((text_token, start_token)).long().to(self.device)
231
+ tokens = torch.zeros((num_return_sequences*2, len(input_ids)), dtype=torch.int).cuda()
232
+ assistant_tokens = input_ids[-4:]
233
+
234
+ for i in range(num_return_sequences*2):
235
+ tokens[i, :] = input_ids
236
+ if i % 2 != 0:
237
+ tokens[i, 1:-1] = self.special_tokens.get(keys[4])
238
+ tokens[i, -4:] = assistant_tokens
239
+
240
+ generated_tokens = torch.zeros((num_return_sequences, max_new_tokens), dtype=torch.int).cuda()
241
+ answer_tokens_list = self.answer_text_qwen_vl(prompt, do_sample=False)
242
+
243
+ if answer_tokens_list:
244
+ answer_tokens_list = self.tokenizer.encode(answer_tokens_list, add_special_tokens=False)
245
+ answer_tokens = torch.tensor([answer_tokens_list], device=self.device) # [1, seq_len]
246
+ magic_prompt = " Ultra HD, 4K, cinematic composition"
247
+
248
+
249
+ magic_prompt_tokens = self.tokenizer.encode(magic_prompt, add_special_tokens=False)
250
+ magic_prompt_tensor = torch.tensor([magic_prompt_tokens], device=self.device) # [1, magic_seq_len]
251
+
252
+ answer_tokens = torch.cat([answer_tokens, magic_prompt_tensor], dim=1) # [1, seq_len + magic_seq_len]
253
+ answer_prompt = self.tokenizer.decode(answer_tokens[0]).split("assistant\n")[-1]
254
+
255
+ special_token = self.special_tokens.get(keys[4])
256
+ special_token_tensor = torch.tensor([[special_token]], device=self.device)
257
+ special_token_expanded = special_token_tensor.expand(-1, answer_tokens.size(1))
258
+
259
+ answer_tokens_with_special = torch.cat([answer_tokens, special_token_expanded], dim=0)
260
+
261
+ batch_size = tokens.size(0) # num_return_sequences*2
262
+ answer_tokens_expanded = answer_tokens_with_special.repeat(batch_size // 2, 1)
263
+
264
+ input_tokens = torch.cat((tokens[:, :14], answer_tokens_expanded, tokens[:, -6:]), dim=1)
265
+
266
+ else:
267
+ input_tokens = tokens
268
+ answer_prompt = None
269
+
270
+ inputs_embeds = self.llm.model.embed_tokens(input_tokens).to(self.device)
271
+
272
+ for i in range(max_new_tokens):
273
+ outputs = self.llm.model(
274
+ inputs_embeds=inputs_embeds,
275
+ use_cache=True,
276
+ past_key_values=outputs.past_key_values if i != 0 else None,
277
+ output_hidden_states=True)
278
+ last_hidden_states = outputs[0]
279
+
280
+ output_states = self.stacked_ar.model(
281
+ inputs_embeds=last_hidden_states,
282
+ past_key_values=output_states.past_key_values if i != 0 else None,
283
+ output_hidden_states=True,
284
+ use_cache=True)
285
+
286
+ last_hidden_states = output_states.hidden_states[-1]
287
+
288
+ logits = self.pixel_output_head(last_hidden_states[:, -1, :])
289
+ logit_cond = logits[0::2, :]
290
+ logit_uncond = logits[1::2, :]
291
+ logits = logit_uncond + cfg_weight * (logit_cond-logit_uncond)
292
+ next_token, _ = self.sample(logits, temperature=1.0, top_k=topk_sample, top_p=topp_sample)
293
+ generated_tokens[:, i] = next_token.squeeze(dim=-1)
294
+ next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
295
+
296
+ vqgan_embeds = self.pixel_encoder.get_codebook_entry(next_token)
297
+ img_embeds = self.pixel_adapter(vqgan_embeds)
298
+ inputs_embeds = img_embeds.unsqueeze(dim=1)
299
+
300
+ latent_size = int(math.sqrt(max_new_tokens))
301
+ output_images = self.pixel_encoder.decode_code(generated_tokens.to(dtype=torch.int), shape=[num_return_sequences, self.pixel_encoder.config.codebook_embed_dim, latent_size, latent_size])
302
+ output_images = output_images.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
303
+
304
+ diff_images = None
305
+ if self.diffusion_decoder is not None:
306
+ gen_image_embeds = self.pixel_encoder.get_codebook_entry(generated_tokens)
307
+ diff_prompt = answer_prompt if answer_prompt else prompt
308
+ if self.args.diffusion_resolution==512:
309
+ self.diffusion_decoder.pipe.transformer.config.sample_size=16
310
+ elif self.args.diffusion_resolution==1024:
311
+ self.diffusion_decoder.pipe.transformer.config.sample_size=32
312
+ diff_images = self.diffusion_decoder.pipe(
313
+ diff_prompt,
314
+ num_inference_steps=40,
315
+ guidance_scale=4.5,
316
+ gen_image_embeds=gen_image_embeds, #gen_image_embeds,
317
+ control_emd="text",
318
+ ori_inp_way=self.diffusion_decoder.transformer.ori_inp_dit,
319
+ only_t2i="vqconcat",
320
+ img_guidance_scale=1.05,
321
+ height=self.args.diffusion_resolution,
322
+ width=self.args.diffusion_resolution
323
+ ).images
324
+ if return_dict:
325
+ return {"output_images":output_images,"generated_tokens":generated_tokens,"diff_images":diff_images,"answer_prompt":answer_prompt}
326
+ return output_images
327
+
328
+ def generate_images_edit(self, image, prompt, max_new_tokens=256, num_return_sequences=1, cfg_weight=5.0, topk_sample=1000, topp_sample=1.0, temperature=1.0,return_dict=False):
329
+
330
+ vq_image_transform = get_vq_transform(self.args)
331
+ full_image_transform = get_full_transform(self.args)
332
+
333
+ if isinstance(image, str):
334
+ image = Image.open(image).convert('RGB')
335
+ elif isinstance(image, list):
336
+ image = [each_image.convert('RGB') for each_image in image]
337
+ else:
338
+ image = image.convert('RGB')
339
+
340
+ messages = [{'role': 'user', 'content': prompt}]
341
+ text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
342
+ text_token = self.tokenizer.encode(text)
343
+ text_token = torch.tensor(text_token).long().to(self.device)
344
+
345
+ keys = list(self.special_tokens.keys())
346
+ start_token = (torch.ones(1) * self.special_tokens.get(keys[0])).long().to(self.device)
347
+ user_prompt = "<|im_start|>user\n"
348
+ user_prompt_token = self.tokenizer.encode(user_prompt, add_special_tokens=False)
349
+ user_prompt_tensor = torch.tensor(user_prompt_token).long().to(self.device)
350
+ windows = text_token.unfold(0, len(user_prompt_tensor), 1)
351
+ matches = (windows == user_prompt_tensor).all(dim=1)
352
+ image_position = torch.where(matches)[0][0].item() + len(user_prompt_tensor)
353
+
354
+ input_ids = torch.cat((text_token, start_token)).long().to(self.device)
355
+ tokens = torch.zeros((num_return_sequences*2, len(input_ids)), dtype=torch.int).cuda()
356
+ assistant_tokens = input_ids[-4:]
357
+
358
+ for i in range(num_return_sequences*2):
359
+ tokens[i, :] = input_ids
360
+ if i % 2 != 0:
361
+ tokens[i, 1:-1] = self.special_tokens.get(keys[4])
362
+ tokens[i, -4:] = assistant_tokens
363
+
364
+ inputs_embeds = self.llm.model.embed_tokens(tokens).to(self.device)
365
+ position_ids = None
366
+
367
+ if image is not None:
368
+ image_info = preprocess_image_gen(image, self.image_processor, vq_image_transform)
369
+ image_embeds = self.llm.visual(image_info["pixel_values"].to(inputs_embeds.device,self.llm.visual.dtype), grid_thw=image_info["image_grid_thw"].to(inputs_embeds.device))
370
+ image_embeds = image_embeds[None,:].repeat(2, 1, 1).to(inputs_embeds.device, inputs_embeds.dtype)
371
+
372
+ vq_pixel_values = image_info["vq_pixel_values"].to(inputs_embeds.device)
373
+ B = inputs_embeds.size(0)
374
+ if len(vq_pixel_values.shape)==4:
375
+ vq_pixel_values = vq_pixel_values[:,None]
376
+ N = vq_pixel_values.size(1)
377
+ _, _, [_, _, vq_indices] = self.pixel_encoder.encode(vq_pixel_values.flatten(0, 1).bfloat16())
378
+ batch_size = vq_pixel_values.shape[0]
379
+ vq_indices = vq_indices.reshape(batch_size, N, vq_indices.shape[-1])
380
+ vqgan_dec_embeds = self.pixel_encoder.get_codebook_entry(vq_indices)
381
+ vq_embeds = self.pixel_adapter(vqgan_dec_embeds)
382
+ vq_embeds = vq_embeds.repeat(B, 1, 1, 1).to(inputs_embeds.device, inputs_embeds.dtype).flatten(1, 2)
383
+
384
+ vision_start_embeds = self.llm.model.embed_tokens(torch.tensor(self.tokenizer.encode("<|vision_start|>")).long().to(self.device))
385
+ vision_end_embeds = self.llm.model.embed_tokens(torch.tensor(self.tokenizer.encode("<|vision_end|>")).long().to(self.device))
386
+ newline_embeds = self.llm.model.embed_tokens(torch.tensor(self.tokenizer.encode("\n")).long().to(self.device))
387
+ vision_start_embeds = vision_start_embeds.unsqueeze(0).repeat(B, 1, 1)
388
+ vision_end_embeds = vision_end_embeds.unsqueeze(0).repeat(B, 1, 1)
389
+ newline_embeds = newline_embeds.unsqueeze(0).repeat(B, 1, 1)
390
+
391
+ inputs_embeds = torch.cat((inputs_embeds[:, :image_position],
392
+ vision_start_embeds, vq_embeds, vision_end_embeds,
393
+ vision_start_embeds, image_embeds, vision_end_embeds, newline_embeds,
394
+ inputs_embeds[:, image_position:]), dim=1)
395
+
396
+ SPECIAL_VQ_TOKEN = '<|vision_pad|>'
397
+ SPECIAL_VIT_TOKEN = '<|image_pad|>'
398
+ SPECIAL_VQ_TOKEN_ID = self.tokenizer.encode(SPECIAL_VQ_TOKEN)[0]
399
+ SPECIAL_VIT_TOKEN_ID = self.tokenizer.encode(SPECIAL_VIT_TOKEN)[0]
400
+ input_ids_for_position = torch.cat([input_ids[:image_position],
401
+ torch.tensor(self.tokenizer.encode("<|vision_start|>")).to(vq_embeds.device), torch.full((vq_embeds.shape[-2],), SPECIAL_VQ_TOKEN_ID, device=vq_embeds.device), torch.tensor(self.tokenizer.encode("<|vision_end|>")).to(vq_embeds.device),
402
+ torch.tensor(self.tokenizer.encode("<|vision_start|>")).to(vq_embeds.device), torch.full((image_embeds.shape[-2],), SPECIAL_VIT_TOKEN_ID, device=vq_embeds.device), torch.tensor(self.tokenizer.encode("<|vision_end|>")).to(vq_embeds.device), torch.tensor(self.tokenizer.encode("\n")).to(vq_embeds.device),
403
+ input_ids[image_position:],torch.full((vq_embeds.shape[-2],), SPECIAL_VQ_TOKEN_ID, device=vq_embeds.device)], dim=0)
404
+ position_ids, _ = get_rope_index_25(
405
+ self.image_processor.merge_size,
406
+ input_ids_for_position[None],
407
+ image_grid_thw=image_info["image_grid_thw"],
408
+ video_grid_thw=None,
409
+ second_per_grid_ts=None,
410
+ )
411
+
412
+ generated_tokens = torch.zeros((num_return_sequences, max_new_tokens), dtype=torch.int).cuda()
413
+
414
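+ # Autoregressive image-token decoding: the base LLM and the stacked AR head keep separate KV caches,
+ # and after the first step only the embedding of the newly sampled token is fed back in.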
+ for i in range(max_new_tokens):
415
+ if i != 0:
416
+ real_position = position_ids[:,:,outputs.past_key_values.seen_tokens:(outputs.past_key_values.seen_tokens+inputs_embeds.shape[1])].to(inputs_embeds.device)
417
+ else:
418
+ real_position = position_ids[:,:,:inputs_embeds.shape[1]].to(inputs_embeds.device)
419
+ outputs = self.llm.model(
420
+ inputs_embeds=inputs_embeds,
421
+ use_cache=True,
422
+ position_ids = real_position,
423
+ past_key_values=outputs.past_key_values if i != 0 else None,
424
+ output_hidden_states=True)
425
+ last_hidden_states = outputs[0]
426
+
427
+ output_states = self.stacked_ar.model(
428
+ inputs_embeds=last_hidden_states,
429
+ past_key_values=output_states.past_key_values if i != 0 else None,
430
+ output_hidden_states=True,
431
+ position_ids = real_position,
432
+ use_cache=True)
433
+
434
+ last_hidden_states = output_states.hidden_states[-1]
435
+
436
+ logits = self.pixel_output_head(last_hidden_states[:, -1, :])
437
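+ # Rows alternate conditional (even) / unconditional (odd); blend their logits with the CFG weight.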
+ logit_cond = logits[0::2, :]
438
+ logit_uncond = logits[1::2, :]
439
+ logits = logit_uncond + cfg_weight * (logit_cond-logit_uncond)
440
+ next_token, _ = self.sample(logits, temperature=1.0, top_k=topk_sample, top_p=topp_sample)
441
+ generated_tokens[:, i] = next_token.squeeze(dim=-1)
442
+ next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
443
+
444
+
445
+ vqgan_embeds = self.pixel_encoder.get_codebook_entry(next_token)
446
+ img_embeds = self.pixel_adapter(vqgan_embeds)
447
+ inputs_embeds = img_embeds.unsqueeze(dim=1)
448
+
449
+ latent_size = int(math.sqrt(max_new_tokens))
450
+ output_images = self.pixel_encoder.decode_code(generated_tokens.to(dtype=torch.int), shape=[num_return_sequences, self.pixel_encoder.config.codebook_embed_dim, latent_size, latent_size])
451
+ output_images = output_images.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
452
+
453
+ diff_images = None
454
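+ # Optionally refine the result with the diffusion decoder, conditioned on the generated VQ
+ # codebook embeddings (and, for editing, the original input image).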
+ if self.diffusion_decoder is not None:
455
+
456
+ gen_image_embeds = self.pixel_encoder.get_codebook_entry(generated_tokens)
457
+
458
+ if isinstance(image, list):
459
+ processed_img = [full_image_transform(each_image) for each_image in image]
460
+ else:
461
+ processed_img = [full_image_transform(image)]
462
+ if self.args.diffusion_resolution==512:
463
+ self.diffusion_decoder.pipe.transformer.config.sample_size=16
464
+ elif self.args.diffusion_resolution==1024:
465
+ self.diffusion_decoder.pipe.transformer.config.sample_size=32
466
+ diff_images = self.diffusion_decoder.pipe(
467
+ prompt,
468
+ num_inference_steps=50,
469
+ guidance_scale=3.0,
470
+ gen_image_embeds=gen_image_embeds,
471
+ control_emd="text", ori_inp_img=processed_img[0], ori_inp_way="seq",
472
+ only_t2i="vqconcat", img_guidance_scale=1.8, vq_guidance_scale=1, height=self.args.diffusion_resolution, width=self.args.diffusion_resolution
473
+ ).images
474
+ if return_dict:
475
+ return {"output_images": output_images, "generated_tokens": None, "diff_images": diff_images}
476
+ return None
477
+
478
+ def sample(self, logits, temperature: float=1.0, top_k: int=0, top_p: float=1.0, sample_logits=True):
479
+
480
+ logits = logits / max(temperature, 1e-5)
481
+ if top_k > 0 or top_p < 1.0:
482
+ logits = self.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
483
+ probs = F.softmax(logits, dim=-1)
484
+ if sample_logits:
485
+ idx = torch.multinomial(probs, num_samples=1)
486
+ else:
487
+ _, idx = torch.topk(probs, k=1, dim=-1)
488
+ return idx, probs
489
+
490
+ def top_k_top_p_filtering(
491
+ self,
492
+ logits,
493
+ top_k: int = 0,
494
+ top_p: float = 1.0,
495
+ filter_value: float = -float("Inf"),
496
+ min_tokens_to_keep: int = 1,
497
+ ):
498
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
499
+ """
500
+ if top_k > 0:
501
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
502
+ # Remove all tokens with a probability less than the last token of the top-k
503
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
504
+ logits[indices_to_remove] = filter_value
505
+
506
+ if top_p < 1.0:
507
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
508
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
509
+
510
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
511
+ sorted_indices_to_remove = cumulative_probs > top_p
512
+ if min_tokens_to_keep > 1:
513
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
514
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
515
+ # Shift the indices to the right to keep also the first token above the threshold
516
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
517
+ sorted_indices_to_remove[..., 0] = 0
518
+
519
+ # scatter sorted tensors to original indexing
520
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
521
+ logits[indices_to_remove] = filter_value
522
+ return logits
523
+
524
+ # For Inference Understand
525
+ def preprocess_image(self, image):
526
+ if image is None:
527
+ return None
528
+ if isinstance(image, str):
529
+ if os.path.exists(image):
530
+ pil_image = Image.open(image).convert('RGB')
531
+ else:
532
+ response = requests.get(image)
533
+ if response.status_code == 200:
534
+ image_bytes = BytesIO(response.content)
535
+ pil_image = Image.open(image_bytes).convert('RGB')
536
+ else:
537
+ raise ValueError(f"Failed to load image from url {image}")
538
+ elif isinstance(image, Image.Image):
539
+ pil_image = image.convert('RGB')
540
+ elif isinstance(image, list):
541
+ return self.preprocess_image(image[0])
542
+ else:
543
+ raise ValueError("Unsupported image type")
544
+
545
+ return pil_image
546
+
547
+ def inference_understand(self, image, question, max_new_tokens=256):
548
+ pil_image = self.preprocess_image(image)
549
+
550
+ messages = [
551
+ {
552
+ "role": "user",
553
+ "content": [
554
+ {
555
+ "type": "image",
556
+ "image": pil_image,
557
+ },
558
+ {"type": "text", "text": question},
559
+ ],
560
+ }
561
+ ]
562
+
563
+ from qwen_vl_utils import process_vision_info
564
+ # Preparation for inference
565
+ text = self.processor.apply_chat_template(
566
+ messages, tokenize=False, add_generation_prompt=True
567
+ )
568
+ image_inputs, video_inputs = process_vision_info(messages)
569
+ inputs = self.processor(
570
+ text=[text],
571
+ images=image_inputs,
572
+ videos=video_inputs,
573
+ padding=True,
574
+ return_tensors="pt",
575
+ )
576
+ inputs = inputs.to(self.llm.device)
577
+
578
+ # Inference: Generation of the output
579
+ generated_ids = self.llm.generate(**inputs, max_new_tokens=max_new_tokens)
580
+ generated_ids_trimmed = [
581
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
582
+ ]
583
+ output_text = self.processor.batch_decode(
584
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
585
+ )
586
+
587
+ return output_text[0] if output_text else ""
star/models/pixel_decoder/lumina2_decoder.py ADDED
@@ -0,0 +1,563 @@
1
+ import torch
2
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, Lumina2Pipeline
3
+ from transformers import AutoTokenizer, Gemma2Model
4
+ import copy
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from diffusers.training_utils import (
8
+ cast_training_params,
9
+ compute_density_for_timestep_sampling,
10
+ compute_loss_weighting_for_sd3,
11
+ free_memory,
12
+ )
13
+ from diffusers.pipelines.lumina2.pipeline_lumina2 import *
14
+
15
+ class Lumina2Decoder(torch.nn.Module):
16
+ def __init__(self, config, args):
17
+ super().__init__()
18
+ self.diffusion_model_path = config.model_path
19
+
20
+ if not hasattr(args, "revision"):
21
+ args.revision = None
22
+ if not hasattr(args, "variant"):
23
+ args.variant = None
24
+
25
+ self.tokenizer_one = AutoTokenizer.from_pretrained(
26
+ self.diffusion_model_path,
27
+ subfolder="tokenizer",
28
+ revision=args.revision,
29
+ )
30
+ self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
31
+ self.diffusion_model_path, subfolder="scheduler"
32
+ )
33
+ self.noise_scheduler_copy = copy.deepcopy(self.noise_scheduler)
34
+ self.text_encoder_one = Gemma2Model.from_pretrained(
35
+ self.diffusion_model_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
36
+ )
37
+ self.text_encoding_pipeline = Lumina2Pipeline.from_pretrained(
38
+ self.diffusion_model_path,
39
+ vae=None,
40
+ transformer=None,
41
+ text_encoder=self.text_encoder_one,
42
+ tokenizer=self.tokenizer_one,
43
+ )
44
+ self.vae = AutoencoderKL.from_pretrained(
45
+ self.diffusion_model_path,
46
+ subfolder="vae",
47
+ revision=args.revision,
48
+ variant=args.variant,
49
+ )
50
+ if args.ori_inp_dit=="seq":
51
+ from star.models.pixel_decoder.transformer_lumina2_seq import Lumina2Transformer2DModel
52
+ elif args.ori_inp_dit=="ref":
53
+ from star.models.pixel_decoder.transformer_lumina2 import Lumina2Transformer2DModel
54
+
55
+ self.transformer = Lumina2Transformer2DModel.from_pretrained(
56
+ self.diffusion_model_path, subfolder="transformer", revision=args.revision, variant=args.variant
57
+ )
58
+
59
+ vq_dim = 512
60
+ patch_size = self.transformer.config.patch_size
61
+ in_channels = vq_dim + self.transformer.config.in_channels # 48 for mask
62
+ out_channels = self.transformer.x_embedder.out_features
63
+
64
+ load_num_channel = self.transformer.config.in_channels * patch_size * patch_size
65
+ self.transformer.register_to_config(in_channels=in_channels)
66
+ transformer = self.transformer
67
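+ # Widen the patch-embedding projection to accept the extra VQ channels: zero-init the new weight,
+ # then copy the pretrained weights into the original channel slots so training starts from the unmodified behaviour.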
+ with torch.no_grad():
68
+ new_proj = nn.Linear(
69
+ in_channels * patch_size * patch_size, out_channels, bias=True
70
+ )
71
+
72
+ new_proj.weight.zero_()
73
+
74
+ new_proj = new_proj.to(transformer.x_embedder.weight.dtype)
75
+ new_proj.weight[:, :load_num_channel].copy_(transformer.x_embedder.weight)
76
+ new_proj.bias.copy_(transformer.x_embedder.bias)
77
+ transformer.x_embedder = new_proj
78
+
79
+ self.ori_inp_dit = args.ori_inp_dit
80
+ if args.ori_inp_dit=="seq":
81
+ refiner_channels = transformer.noise_refiner[-1].dim
82
+ with torch.no_grad():
83
+ vae2cond_proj1 = nn.Linear(refiner_channels, refiner_channels, bias=True)
84
+ vae2cond_act = nn.GELU(approximate='tanh')
85
+ vae2cond_proj2 = nn.Linear(refiner_channels, refiner_channels, bias=False)
86
+ vae2cond_proj2.weight.zero_()
87
+
88
+ ori_inp_refiner = nn.Sequential(
89
+ vae2cond_proj1,
90
+ vae2cond_act,
91
+ vae2cond_proj2
92
+ )
93
+ transformer.ori_inp_refiner = ori_inp_refiner
94
+ transformer.ori_inp_dit = self.ori_inp_dit
95
+ elif args.ori_inp_dit=="ref":
96
+ transformer.initialize_ref_weights()
97
+ transformer.ori_inp_dit = self.ori_inp_dit
98
+
99
+ transformer.requires_grad_(True)
100
+
101
+ if args.grad_ckpt and args.diffusion_resolution==1024:
102
+ transformer.gradient_checkpointing = args.grad_ckpt
103
+ transformer.enable_gradient_checkpointing()
104
+
105
+ self.vae.requires_grad_(False)
106
+ self.vae.to(dtype=torch.float32)
107
+ self.args = args
108
+
109
+ self.pipe = Lumina2InstructPix2PixPipeline.from_pretrained(self.diffusion_model_path,
110
+ transformer=transformer,
111
+ text_encoder=self.text_encoder_one,
112
+ vae=self.vae,
113
+ torch_dtype=torch.bfloat16)
114
+
115
+
116
+ with torch.no_grad():
117
+ _, _, self.uncond_prompt_embeds, self.uncond_prompt_attention_mask = self.text_encoding_pipeline.encode_prompt(
118
+ "",
119
+ max_sequence_length=self.args.max_diff_seq_length,
120
+ )
121
+
122
+ def compute_text_embeddings(self,prompt, text_encoding_pipeline):
123
+ with torch.no_grad():
124
+ prompt_embeds, prompt_attention_mask, _, _ = text_encoding_pipeline.encode_prompt(
125
+ prompt,
126
+ max_sequence_length=self.args.max_diff_seq_length,
127
+ )
128
+ return prompt_embeds, prompt_attention_mask
129
+
130
+ def get_sigmas(self, timesteps, n_dim=4, dtype=torch.float32):
131
+ sigmas = self.noise_scheduler_copy.sigmas.to(dtype=dtype)
132
+ schedule_timesteps = self.noise_scheduler_copy.timesteps.to(device=timesteps.device)
133
+ timesteps = timesteps
134
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
135
+
136
+ sigma = sigmas[step_indices].flatten()
137
+ while len(sigma.shape) < n_dim:
138
+ sigma = sigma.unsqueeze(-1)
139
+ return sigma
140
+
141
+ def forward(self, batch_gpu,batch, image_embeds):
142
+ args = self.args
143
+ pixel_values = batch_gpu["full_pixel_values"].to(dtype=self.vae.dtype) #aux_image
144
+ data_type = "t2i"
145
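+ # Five-dimensional pixel values carry stacked frames; two frames mark an editing sample
+ # (source first, edited target last), otherwise the batch is plain text-to-image.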
+ if len(pixel_values.shape)==5:
146
+ bs,num_img,c,h,w = pixel_values.shape
147
+ if num_img==2:
148
+ data_type = "edit"
149
+ pixel_values_ori_img = pixel_values[:,0]
150
+ pixel_values = pixel_values[:,-1]
151
+ pixel_values = F.interpolate(pixel_values, size=(self.args.diffusion_resolution, self.args.diffusion_resolution), mode='bilinear',align_corners=False)
152
+ if data_type=="edit" and self.ori_inp_dit!="none":
153
+ pixel_values_ori_img = F.interpolate(pixel_values_ori_img, size=(self.args.diffusion_resolution, self.args.diffusion_resolution), mode='bilinear', align_corners=False)
154
+ prompt = batch["prompts"]
155
+ bs,_,_,_ = pixel_values.shape
156
+ image_prompt_embeds = None
157
+ image_embeds_2d = image_embeds.reshape(bs, 24, 24, image_embeds.shape[-1]).permute(0, 3, 1, 2)
158
+ image_embeds_2d = F.interpolate(image_embeds_2d, size=(args.diffusion_resolution//8, args.diffusion_resolution//8), mode='bilinear', align_corners=False)
159
+
160
+ control_emd = args.control_emd
161
+ prompt_embeds, prompt_attention_mask = self.compute_text_embeddings(prompt, self.text_encoding_pipeline)
162
+ if control_emd=="mix":
163
+ prompt_embeds=torch.cat([prompt_embeds, image_prompt_embeds], dim=1) #use mix
164
+ elif control_emd=="null":
165
+ prompt_embeds = torch.zeros_like(prompt_embeds)
166
+ prompt_attention_mask = torch.ones_like(prompt_attention_mask)
167
+ elif control_emd=="text":
168
+ pass
169
+ elif control_emd=="vit" or control_emd=="vq" or control_emd=="vqvae" or control_emd=="vqconcat" or control_emd=="vqconcatvit":
170
+ prompt_embeds=image_prompt_embeds
171
+
172
+
173
+ latents = self.vae.encode(pixel_values).latent_dist.sample()
174
+ latents = (latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
175
+ latents = latents.to(dtype=image_embeds.dtype)
176
+
177
+ latents_ori_img = torch.zeros_like(latents)
178
+ if data_type=="edit" and self.ori_inp_dit!="none":
179
+ latents_ori_img = self.vae.encode(pixel_values_ori_img).latent_dist.sample()
180
+ latents_ori_img = (latents_ori_img - self.vae.config.shift_factor) * self.vae.config.scaling_factor
181
+ latents_ori_img = latents_ori_img.to(dtype=image_embeds.dtype)
182
+
183
+ # Sample noise that we'll add to the latents
184
+ noise = torch.randn_like(latents)
185
+ bsz = latents.shape[0]
186
+ # Sample a random timestep for each image
187
+ # for weighting schemes where we sample timesteps non-uniformly
188
+ u = compute_density_for_timestep_sampling(
189
+ weighting_scheme=args.weighting_scheme,
190
+ batch_size=bsz,
191
+ logit_mean=args.logit_mean,
192
+ logit_std=args.logit_std,
193
+ mode_scale=args.mode_scale,
194
+ )
195
+
196
+ indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
197
+ timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=latents.device)
198
+
199
+ # Add noise to the latents according to the noise magnitude at each timestep
200
+ # (this is the forward diffusion process)
201
+ sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype).to(device=noise.device)
202
+ #noisy_model_input = (1.0 - sigmas) * noise + sigmas * latents
203
+ noisy_model_input = sigmas * noise + (1-sigmas) * latents
204
+ #noisy_model_input + (1-sigmas)*(latents - noise) = latents
205
+ # Get the additional image embedding for conditioning.
206
+ # Instead of getting a diagonal Gaussian here, we simply take the mode.
207
+ original_image_embeds = image_embeds_2d
208
+
209
+ if args.conditioning_dropout_prob is not None:
210
+ random_p = torch.rand(bsz, device=latents.device)
211
+ # Sample masks for the edit prompts.
212
+ prompt_mask = random_p < 2 * args.uncondition_prob
213
+ prompt_mask = prompt_mask.reshape(bsz, 1, 1)
214
+ # Final text conditioning.
215
+ #prompt_embeds = torch.where(prompt_mask, torch.zeros_like(prompt_embeds), prompt_embeds)
216
+ prompt_embeds = torch.where(prompt_mask, self.uncond_prompt_embeds.repeat(prompt_embeds.shape[0],1,1).to(prompt_embeds.device), prompt_embeds)
217
+ prompt_attention_mask = torch.where(prompt_mask[:,0], self.uncond_prompt_attention_mask.repeat(prompt_embeds.shape[0],1).to(prompt_embeds.device), prompt_attention_mask)
218
+
219
+ # Sample masks for the original images.
220
+ #random_p_vq = torch.rand(bsz, device=latents.device)
221
+ image_mask_dtype = original_image_embeds.dtype
222
+ image_mask = 1 - (
223
+ (random_p <= args.conditioning_dropout_prob).to(image_mask_dtype)
224
+ )
225
+ image_mask = image_mask.reshape(bsz, 1, 1, 1)
226
+
227
+ if data_type=="edit":
228
+ image_mask=0
229
+ # Final image conditioning.
230
+ original_image_embeds = image_mask * original_image_embeds
231
+
232
+ ori_latent_mask = 1 - (
233
+ (random_p >= args.uncondition_prob).to(image_mask_dtype)
234
+ * (random_p < 3 * args.uncondition_prob).to(image_mask_dtype)
235
+ )
236
+ ori_latent_mask = ori_latent_mask.reshape(bsz, 1, 1, 1)
237
+ latents_ori_img = ori_latent_mask * latents_ori_img
238
+
239
+ concatenated_noisy_latents = torch.cat([noisy_model_input, original_image_embeds], dim=1)
240
+
241
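+ # Three ways to inject the source image into the DiT: extra channels ("dim"),
+ # concatenation along the spatial (height) axis ("seq"), or reference hidden states ("ref").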
+ ref_image_hidden_states = None
242
+ if self.ori_inp_dit=="dim":
243
+ concatenated_noisy_latents = torch.cat([concatenated_noisy_latents, latents_ori_img], dim=1)
244
+ elif self.ori_inp_dit=="seq":
245
+ latents_ori_img = torch.cat([latents_ori_img, original_image_embeds], dim=1)
246
+ concatenated_noisy_latents = torch.cat([concatenated_noisy_latents, latents_ori_img], dim=2)
247
+ elif self.ori_inp_dit=="ref":
248
+ latents_ori_img = torch.cat([latents_ori_img, original_image_embeds], dim=1)
249
+ ref_image_hidden_states = latents_ori_img[:,None]
250
+ # Predict the noise residual
251
+ # Scale the timesteps to [0, 1] and reverse them, since the transformer treats t=1 as the clean image.
252
+ timesteps = 1 - timesteps / self.noise_scheduler.config.num_train_timesteps
253
+ model_pred = self.transformer(
254
+ hidden_states=concatenated_noisy_latents,
255
+ timestep=timesteps,
256
+ encoder_hidden_states=prompt_embeds,
257
+ encoder_attention_mask=prompt_attention_mask,
258
+ # ref_image_hidden_states = ref_image_hidden_states,
259
+ return_dict=False,
260
+ )[0]
261
+ if self.ori_inp_dit=="seq":
262
+ model_pred = model_pred[:, :, :args.diffusion_resolution//8, :]
263
+
264
+ weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
265
+ target = latents - noise
266
+ # Conditioning dropout to support classifier-free guidance during inference. For more details
267
+ # check out the section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.
268
+
269
+ # Concatenate the `original_image_embeds` with the `noisy_latents`.
270
+
271
+ # Get the target for loss depending on the prediction type
272
+ loss = torch.mean(
273
+ (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
274
+ 1,
275
+ )
276
+ loss = loss.mean()
277
+
278
+ loss_value = loss.item()
279
+
280
+ return loss
281
+
282
+
283
+ class Lumina2InstructPix2PixPipeline(Lumina2Pipeline):
284
+
285
+ @torch.no_grad()
286
+ def __call__(
287
+ self,
288
+ prompt: Union[str, List[str]] = None,
289
+ width: Optional[int] = None,
290
+ height: Optional[int] = None,
291
+ num_inference_steps: int = 30,
292
+ guidance_scale: float = 4.0,
293
+ negative_prompt: Union[str, List[str]] = None,
294
+ sigmas: List[float] = None,
295
+ num_images_per_prompt: Optional[int] = 1,
296
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
297
+ latents: Optional[torch.Tensor] = None,
298
+ prompt_embeds: Optional[torch.Tensor] = None,
299
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
300
+ prompt_attention_mask: Optional[torch.Tensor] = None,
301
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
302
+ output_type: Optional[str] = "pil",
303
+ return_dict: bool = True,
304
+ attention_kwargs: Optional[Dict[str, Any]] = None,
305
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
306
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
307
+ system_prompt: Optional[str] = None,
308
+ cfg_trunc_ratio=[0.0,1.0],
309
+ cfg_normalization: bool = False,
310
+ max_sequence_length: int = 256,
311
+ control_emd="text",
312
+ img_cfg_trunc_ratio =[0.0,1.0],
313
+ gen_image_embeds=None,only_t2i="vqconcat",image_prompt_embeds=None,ori_inp_img=None,img_guidance_scale=1.5,vq_guidance_scale=0,ori_inp_way="none",
314
+ ) -> Union[ImagePipelineOutput, Tuple]:
315
+
316
+ height = height or self.default_sample_size * self.vae_scale_factor
317
+ width = width or self.default_sample_size * self.vae_scale_factor
318
+ self._guidance_scale = guidance_scale
319
+ self._attention_kwargs = attention_kwargs
320
+
321
+ num_images_per_prompt = gen_image_embeds.shape[0] if gen_image_embeds is not None else image_prompt_embeds.shape[0]
322
+ # 1. Check inputs. Raise error if not correct
323
+ self.check_inputs(
324
+ prompt,
325
+ height,
326
+ width,
327
+ negative_prompt,
328
+ prompt_embeds=prompt_embeds,
329
+ negative_prompt_embeds=negative_prompt_embeds,
330
+ prompt_attention_mask=prompt_attention_mask,
331
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
332
+ max_sequence_length=max_sequence_length,
333
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
334
+ )
335
+
336
+ # 2. Define call parameters
337
+ if prompt is not None and isinstance(prompt, str):
338
+ batch_size = 1
339
+ elif prompt is not None and isinstance(prompt, list):
340
+ batch_size = len(prompt)
341
+ else:
342
+ batch_size = prompt_embeds.shape[0]
343
+
344
+ device = self._execution_device
345
+
346
+ # 3. Encode input prompt
347
+ (
348
+ prompt_embeds,
349
+ prompt_attention_mask,
350
+ negative_prompt_embeds,
351
+ negative_prompt_attention_mask,
352
+ ) = self.encode_prompt(
353
+ prompt,
354
+ self.do_classifier_free_guidance,
355
+ negative_prompt=negative_prompt,
356
+ num_images_per_prompt=num_images_per_prompt,
357
+ device=device,
358
+ prompt_embeds=prompt_embeds,
359
+ negative_prompt_embeds=negative_prompt_embeds,
360
+ prompt_attention_mask=prompt_attention_mask,
361
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
362
+ max_sequence_length=max_sequence_length,
363
+ system_prompt=system_prompt,
364
+ )
365
+
366
+
367
+ if gen_image_embeds is not None:
368
+ image_embeds_8=gen_image_embeds
369
+
370
+ if control_emd=="text":
371
+ pass
372
+ elif control_emd=="null":
373
+ prompt_embeds = torch.zeros_like(prompt_embeds)
374
+ prompt_attention_mask = torch.zeros_like(prompt_attention_mask)
375
+ negative_prompt_embeds = prompt_embeds
376
+ negative_prompt_attention_mask = prompt_attention_mask
377
+
378
+ if self.do_classifier_free_guidance:
379
+ prompt_embeds = torch.cat([negative_prompt_embeds,negative_prompt_embeds, prompt_embeds], dim=0)
380
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask,negative_prompt_attention_mask, prompt_attention_mask], dim=0)
381
+ # 4. Prepare latents.
382
+ latent_channels = self.vae.config.latent_channels #self.transformer.config.in_channels
383
+ latents = self.prepare_latents(
384
+ batch_size * num_images_per_prompt,
385
+ latent_channels,
386
+ height,
387
+ width,
388
+ prompt_embeds.dtype,
389
+ device,
390
+ generator,
391
+ latents,
392
+ )
393
+
394
+ latents_ori_img = torch.zeros_like(latents)
395
+ if ori_inp_img is not None and ori_inp_way !="none":
396
397
+ ori_inp_img = F.interpolate(ori_inp_img[None].to(latents.device,latents.dtype), size=(height,width), mode='bilinear',align_corners=False)
398
+ latents_ori_img = self.vae.encode(ori_inp_img).latent_dist.sample()
399
+ latents_ori_img = (latents_ori_img- self.vae.config.shift_factor) * self.vae.config.scaling_factor
400
+ latents_ori_img = latents_ori_img.to(dtype=latents.dtype)
401
+ if ori_inp_way !="none":
402
+ negative_latents_ori_img = torch.zeros_like(latents_ori_img).to(prompt_embeds.dtype)
403
+ latents_ori_img = torch.cat([negative_latents_ori_img,latents_ori_img, latents_ori_img], dim=0) if self.do_classifier_free_guidance else latents_ori_img
404
+
405
+ vq_in_edit = False
406
+ if only_t2i==True:
407
+ image_latents = torch.zeros_like(latents)[:,:8]
408
+ elif only_t2i=="vqconcat":
409
+ image_embeds_2d = image_embeds_8.reshape(batch_size* num_images_per_prompt,24,24,image_embeds_8.shape[-1]).permute(0,3,1,2)
410
+ if ori_inp_img is not None and image_embeds_8.mean()!=0:
411
+ vq_in_edit = True
412
+ image_vq_latents = F.interpolate(image_embeds_2d, size=(height//8,width//8), mode='bilinear',align_corners=False).to(latents.device,latents.dtype)
413
+ image_latents = torch.zeros_like(image_vq_latents)
414
+ else:
415
+ image_latents = F.interpolate(image_embeds_2d, size=(height//8,width//8), mode='bilinear',align_corners=False).to(latents.device,latents.dtype)
416
+
417
+ negative_image_latents = torch.zeros_like(image_latents).to(prompt_embeds.dtype)
418
+ image_latents = torch.cat([negative_image_latents,image_latents, image_latents], dim=0) if self.do_classifier_free_guidance else image_latents
419
+
420
+ # 5. Prepare timesteps
421
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
422
+ image_seq_len = latents.shape[1]
423
+ mu = calculate_shift(
424
+ image_seq_len,
425
+ self.scheduler.config.get("base_image_seq_len", 256),
426
+ self.scheduler.config.get("max_image_seq_len", 4096),
427
+ self.scheduler.config.get("base_shift", 0.5),
428
+ self.scheduler.config.get("max_shift", 1.15),
429
+ )
430
+ timesteps, num_inference_steps = retrieve_timesteps(
431
+ self.scheduler,
432
+ num_inference_steps,
433
+ device,
434
+ sigmas=sigmas,
435
+ mu=mu,
436
+ )
437
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
438
+ self._num_timesteps = len(timesteps)
439
+
440
+ self.scheduler.sigmas = self.scheduler.sigmas.to(latents.dtype)  # keep sigmas in the latent dtype to avoid a dtype mismatch in the scheduler step
441
+ # 6. Denoising loop
442
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
443
+ for i, t in enumerate(timesteps):
444
+ # compute whether apply classifier-free truncation on this timestep
445
+ do_classifier_free_truncation = not ((i + 1) / num_inference_steps > cfg_trunc_ratio[0] and (i + 1) / num_inference_steps < cfg_trunc_ratio[1])
446
+ img_do_classifier_free_truncation = not ((i + 1) / num_inference_steps > img_cfg_trunc_ratio[0] and (i + 1) / num_inference_steps < img_cfg_trunc_ratio[1])
447
+
448
+ # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image
449
+ current_timestep = 1 - t / self.scheduler.config.num_train_timesteps
450
+
451
+ latent_model_input = torch.cat([latents] * 3) if self.do_classifier_free_guidance else latents
452
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
453
+ current_timestep = current_timestep.expand(latent_model_input.shape[0])
454
+
455
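+ # InstructPix2Pix-style conditioning: concatenate the resized VQ feature map with the noisy latents along the channel dimension.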
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
456
+
457
+ ref_image_hidden_states = None
458
+ if ori_inp_way=="seq":
459
+ latents_ori_img_cat = torch.cat([latents_ori_img, image_latents], dim=1)
460
+ latent_model_input = torch.cat([latent_model_input, latents_ori_img_cat], dim=2)
461
+ elif ori_inp_way=="ref":
462
+ latents_ori_img_cat = torch.cat([latents_ori_img, image_latents], dim=1)
463
+ ref_image_hidden_states = latents_ori_img_cat[:,None]
464
+
465
+ if ori_inp_way=="ref":
466
+ noise_pred = self.transformer(
467
+ hidden_states=latent_model_input,
468
+ timestep=current_timestep,
469
+ encoder_hidden_states=prompt_embeds,
470
+ encoder_attention_mask=prompt_attention_mask,
471
+ return_dict=False,ref_image_hidden_states=ref_image_hidden_states,
472
+ attention_kwargs=self.attention_kwargs,
473
+ )[0]
474
+ else:
475
+ noise_pred = self.transformer(
476
+ hidden_states=latent_model_input,
477
+ timestep=current_timestep,
478
+ encoder_hidden_states=prompt_embeds,
479
+ encoder_attention_mask=prompt_attention_mask,
480
+ return_dict=False,
481
+ attention_kwargs=self.attention_kwargs,
482
+ )[0]
483
+ if ori_inp_way=="seq":
484
+ noise_pred = noise_pred[:,:,:height//8,:]
485
+
486
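+ # For editing with non-zero VQ features, run an extra forward pass conditioned only on the VQ map;
+ # its prediction is blended in below with vq_guidance_scale.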
+ if vq_in_edit:
487
+ latent_model_vq_input = torch.cat([latents, image_vq_latents], dim=1)
488
+ if ori_inp_way=="seq":
489
+ latents_ori_img_cat_vq = torch.cat([torch.zeros_like(latents), image_vq_latents], dim=1)
490
+ latent_model_vq_input = torch.cat([latent_model_vq_input, latents_ori_img_cat_vq], dim=2)
491
+
492
+ noise_vq_pred = self.transformer(
493
+ hidden_states=latent_model_vq_input,
494
+ timestep=current_timestep[-1:],
495
+ encoder_hidden_states=prompt_embeds[-1:],
496
+ encoder_attention_mask=prompt_attention_mask[-1:],
497
+ return_dict=False,
498
+ attention_kwargs=self.attention_kwargs,
499
+ )[0]
500
+ if ori_inp_way=="seq":
501
+ noise_vq_pred = noise_vq_pred[:,:,:height//8,:]
502
+ # perform normalization-based guidance scale on a truncated timestep interval
503
+ if self.do_classifier_free_guidance:
504
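+ # Three predictions per step (unconditional / image-conditioned / text+image-conditioned);
+ # text and image guidance use separate scales and truncation intervals.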
+ noise_pred_uncond,noise_pred_img, noise_pred_text = noise_pred.chunk(3)
505
+ if not do_classifier_free_truncation and not img_do_classifier_free_truncation:
506
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_img)+ img_guidance_scale * (noise_pred_img - noise_pred_uncond)
507
+ elif not do_classifier_free_truncation and img_do_classifier_free_truncation:
508
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_img)+ 1 * (noise_pred_img - noise_pred_uncond)
509
+ elif do_classifier_free_truncation and not img_do_classifier_free_truncation:
510
+ noise_pred = noise_pred_uncond + 1 * (noise_pred_text - noise_pred_img)+ img_guidance_scale * (noise_pred_img - noise_pred_uncond)
511
+ else:
512
+ noise_pred = noise_pred_text
513
+ if vq_in_edit:
514
+ noise_pred = noise_pred +vq_guidance_scale*(noise_vq_pred-noise_pred_uncond)
515
+ # apply normalization after classifier-free guidance
516
+ if cfg_normalization:
517
+ cond_norm = torch.norm(noise_pred_text, dim=-1, keepdim=True)
518
+ noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
519
+ noise_pred = noise_pred * (cond_norm / noise_norm)
520
+ else:
521
+ noise_pred = noise_pred
522
+
523
+ # compute the previous noisy sample x_t -> x_t-1
524
+ latents_dtype = latents.dtype
525
+ noise_pred = -noise_pred
526
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
527
+
528
+ if latents.dtype != latents_dtype:
529
+ if torch.backends.mps.is_available():
530
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
531
+ latents = latents.to(latents_dtype)
532
+
533
+ if callback_on_step_end is not None:
534
+ callback_kwargs = {}
535
+ for k in callback_on_step_end_tensor_inputs:
536
+ callback_kwargs[k] = locals()[k]
537
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
538
+
539
+ latents = callback_outputs.pop("latents", latents)
540
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
541
+
542
+ # call the callback, if provided
543
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
544
+ progress_bar.update()
545
+
546
+ if XLA_AVAILABLE:
547
+ xm.mark_step()
548
+
549
+ if not output_type == "latent":
550
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
551
+ image = self.vae.decode(latents, return_dict=False)[0]
552
+ image = self.image_processor.postprocess(image, output_type=output_type)
553
+ else:
554
+ image = latents
555
+
556
+ # Offload all models
557
+ self.maybe_free_model_hooks()
558
+
559
+ if not return_dict:
560
+ return (image,)
561
+
562
+ return ImagePipelineOutput(images=image)
563
+
star/models/pixel_decoder/transformer_lumina2.py ADDED
@@ -0,0 +1,770 @@
1
+ # Copyright 2025 Alpha-VLLM Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Any, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from einops import rearrange
22
+ from diffusers.models.transformers.transformer_lumina2 import *
23
+ from einops import repeat
24
+ from diffusers.models.embeddings import get_1d_rotary_pos_embed
25
+ import itertools
26
+
27
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+
30
+ class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
31
+ def __init__(
32
+ self,
33
+ hidden_size: int = 4096,
34
+ cap_feat_dim: int = 2048,
35
+ frequency_embedding_size: int = 256,
36
+ norm_eps: float = 1e-5,
37
+ ) -> None:
38
+ super().__init__()
39
+
40
+ self.time_proj = Timesteps(
41
+ num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0.0
42
+ )
43
+
44
+ self.timestep_embedder = TimestepEmbedding(
45
+ in_channels=frequency_embedding_size, time_embed_dim=min(hidden_size, 1024)
46
+ )
47
+
48
+ self.caption_embedder = nn.Sequential(
49
+ RMSNorm(cap_feat_dim, eps=norm_eps), nn.Linear(cap_feat_dim, hidden_size, bias=True)
50
+ )
51
+
52
+ def forward(
53
+ self, hidden_states: torch.Tensor, timestep: torch.Tensor, encoder_hidden_states: torch.Tensor
54
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
55
+ timestep_proj = self.time_proj(timestep).type_as(hidden_states[0])
56
+ time_embed = self.timestep_embedder(timestep_proj)
57
+ caption_embed = self.caption_embedder(encoder_hidden_states)
58
+ return time_embed, caption_embed
59
+
60
+
61
+ class Lumina2AttnProcessor2_0:
62
+ r"""
63
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
64
+ used in the Lumina2Transformer2DModel model. It applies normalization and RoPE on query and key vectors.
65
+ """
66
+
67
+ def __init__(self):
68
+ if not hasattr(F, "scaled_dot_product_attention"):
69
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
70
+
71
+ def __call__(
72
+ self,
73
+ attn: Attention,
74
+ hidden_states: torch.Tensor,
75
+ encoder_hidden_states: torch.Tensor,
76
+ attention_mask: Optional[torch.Tensor] = None,
77
+ image_rotary_emb: Optional[torch.Tensor] = None,
78
+ base_sequence_length: Optional[int] = None,
79
+ ) -> torch.Tensor:
80
+ batch_size, sequence_length, _ = hidden_states.shape
81
+
82
+ # Get Query-Key-Value Pair
83
+ query = attn.to_q(hidden_states)
84
+ key = attn.to_k(encoder_hidden_states)
85
+ value = attn.to_v(encoder_hidden_states)
86
+
87
+ query_dim = query.shape[-1]
88
+ inner_dim = key.shape[-1]
89
+ head_dim = query_dim // attn.heads
90
+ dtype = query.dtype
91
+
92
+ # Get key-value heads
93
+ kv_heads = inner_dim // head_dim
94
+
95
+ query = query.view(batch_size, -1, attn.heads, head_dim)
96
+ key = key.view(batch_size, -1, kv_heads, head_dim)
97
+ value = value.view(batch_size, -1, kv_heads, head_dim)
98
+
99
+ # Apply Query-Key Norm if needed
100
+ if attn.norm_q is not None:
101
+ query = attn.norm_q(query)
102
+ if attn.norm_k is not None:
103
+ key = attn.norm_k(key)
104
+
105
+ # Apply RoPE if needed
106
+ if image_rotary_emb is not None:
107
+ query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
108
+ key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
109
+
110
+ query, key = query.to(dtype), key.to(dtype)
111
+
112
+ # Apply proportional attention if true
113
+ if base_sequence_length is not None:
114
+ softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
115
+ else:
116
+ softmax_scale = attn.scale
117
+
118
+ # perform Grouped-Query Attention (GQA)
119
+ n_rep = attn.heads // kv_heads
120
+ if n_rep >= 1:
121
+ key = key.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
122
+ value = value.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
123
+
124
+ # scaled_dot_product_attention expects attention_mask shape to be
125
+ # (batch, heads, source_length, target_length)
126
+ if attention_mask is not None:
127
+ attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
128
+
129
+ query = query.transpose(1, 2)
130
+ key = key.transpose(1, 2)
131
+ value = value.transpose(1, 2)
132
+
133
+ hidden_states = F.scaled_dot_product_attention(
134
+ query, key, value, attn_mask=attention_mask, scale=softmax_scale
135
+ )
136
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
137
+ hidden_states = hidden_states.type_as(query)
138
+
139
+ # linear proj
140
+ hidden_states = attn.to_out[0](hidden_states)
141
+ hidden_states = attn.to_out[1](hidden_states)
142
+ return hidden_states
143
+
144
+
145
+ class Lumina2TransformerBlock(nn.Module):
146
+ def __init__(
147
+ self,
148
+ dim: int,
149
+ num_attention_heads: int,
150
+ num_kv_heads: int,
151
+ multiple_of: int,
152
+ ffn_dim_multiplier: float,
153
+ norm_eps: float,
154
+ modulation: bool = True,
155
+ ) -> None:
156
+ super().__init__()
157
+ self.head_dim = dim // num_attention_heads
158
+ self.dim = dim
159
+ self.modulation = modulation
160
+
161
+ self.attn = Attention(
162
+ query_dim=dim,
163
+ cross_attention_dim=None,
164
+ dim_head=dim // num_attention_heads,
165
+ qk_norm="rms_norm",
166
+ heads=num_attention_heads,
167
+ kv_heads=num_kv_heads,
168
+ eps=1e-5,
169
+ bias=False,
170
+ out_bias=False,
171
+ processor=Lumina2AttnProcessor2_0(),
172
+ )
173
+
174
+ self.feed_forward = LuminaFeedForward(
175
+ dim=dim,
176
+ inner_dim=4 * dim,
177
+ multiple_of=multiple_of,
178
+ ffn_dim_multiplier=ffn_dim_multiplier,
179
+ )
180
+
181
+ if modulation:
182
+ self.norm1 = LuminaRMSNormZero(
183
+ embedding_dim=dim,
184
+ norm_eps=norm_eps,
185
+ norm_elementwise_affine=True,
186
+ )
187
+ else:
188
+ self.norm1 = RMSNorm(dim, eps=norm_eps)
189
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
190
+
191
+ self.norm2 = RMSNorm(dim, eps=norm_eps)
192
+ self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
193
+
194
+ def forward(
195
+ self,
196
+ hidden_states: torch.Tensor,
197
+ attention_mask: torch.Tensor,
198
+ image_rotary_emb: torch.Tensor,
199
+ temb: Optional[torch.Tensor] = None,
200
+ ) -> torch.Tensor:
201
+ if self.modulation:
202
+ norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
203
+ attn_output = self.attn(
204
+ hidden_states=norm_hidden_states,
205
+ encoder_hidden_states=norm_hidden_states,
206
+ attention_mask=attention_mask,
207
+ image_rotary_emb=image_rotary_emb,
208
+ )
209
+ hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
210
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
211
+ hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
212
+ else:
213
+ norm_hidden_states = self.norm1(hidden_states)
214
+ attn_output = self.attn(
215
+ hidden_states=norm_hidden_states,
216
+ encoder_hidden_states=norm_hidden_states,
217
+ attention_mask=attention_mask,
218
+ image_rotary_emb=image_rotary_emb,
219
+ )
220
+ hidden_states = hidden_states + self.norm2(attn_output)
221
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
222
+ hidden_states = hidden_states + self.ffn_norm2(mlp_output)
223
+
224
+ return hidden_states
225
+
226
+
227
+ class Lumina2RotaryPosEmbed(nn.Module):
228
+ def __init__(self, theta: int, axes_dim: List[int], axes_lens: List[int] = (300, 512, 512), patch_size: int = 2):
229
+ super().__init__()
230
+ self.theta = theta
231
+ self.axes_dim = axes_dim
232
+ self.axes_lens = axes_lens
233
+ self.patch_size = patch_size
234
+
235
+ self.freqs_cis = self._precompute_freqs_cis(axes_dim, axes_lens, theta)
236
+
237
+ def _precompute_freqs_cis(self, axes_dim: List[int], axes_lens: List[int], theta: int) -> List[torch.Tensor]:
238
+ freqs_cis = []
239
+ freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
240
+ for i, (d, e) in enumerate(zip(axes_dim, axes_lens)):
241
+ emb = get_1d_rotary_pos_embed(d, e, theta=self.theta, freqs_dtype=freqs_dtype)
242
+ freqs_cis.append(emb)
243
+ return freqs_cis
244
+
245
+ def _get_freqs_cis(self, ids: torch.Tensor) -> torch.Tensor:
246
+ device = ids.device
247
+ if ids.device.type == "mps":
248
+ ids = ids.to("cpu")
249
+
250
+ result = []
251
+ for i in range(len(self.axes_dim)):
252
+ freqs = self.freqs_cis[i].to(ids.device)
253
+ index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64)
254
+ result.append(torch.gather(freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index))
255
+ return torch.cat(result, dim=-1).to(device)
256
+
257
+ def forward(
258
+ self,
259
+ attention_mask,
260
+ l_effective_ref_img_len,
261
+ l_effective_img_len,
262
+ ref_img_sizes,
263
+ img_sizes,
264
+ device
265
+ ):
266
+
267
+ batch_size = len(attention_mask)
268
+ p = self.patch_size
269
+
270
+ encoder_seq_len = attention_mask.shape[1]
271
+ l_effective_cap_len = attention_mask.sum(dim=1).tolist()
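+ # Each sample's sequence is laid out as caption tokens, then all reference-image tokens,
+ # then target-image tokens; the position ids below follow the same order.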
272
+
273
+ seq_lengths = [cap_len + sum(ref_img_len) + img_len for cap_len, ref_img_len, img_len in zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len)]
274
+
275
+ max_seq_len = max(seq_lengths)
276
+ max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
277
+ max_img_len = max(l_effective_img_len)
278
+
279
+ # Create position IDs
280
+ position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)
281
+
282
+ for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
283
+ # add text position ids
284
+ position_ids[i, :cap_seq_len] = repeat(torch.arange(cap_seq_len, dtype=torch.int32, device=device), "l -> l 3")
285
+
286
+ pe_shift = cap_seq_len
287
+ pe_shift_len = cap_seq_len
288
+
289
+ if ref_img_sizes[i] is not None:
290
+ for ref_img_size, ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
291
+ H, W = ref_img_size
292
+ ref_H_tokens, ref_W_tokens = H // p, W // p
293
+ assert ref_H_tokens * ref_W_tokens == ref_img_len
294
+ # add image position ids
295
+
296
+ row_ids = repeat(torch.arange(ref_H_tokens, dtype=torch.int32, device=device), "h -> h w", w=ref_W_tokens).flatten()
297
+ col_ids = repeat(torch.arange(ref_W_tokens, dtype=torch.int32, device=device), "w -> h w", h=ref_H_tokens).flatten()
298
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 0] = pe_shift
299
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 1] = row_ids
300
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 2] = col_ids
301
+
302
+ pe_shift += max(ref_H_tokens, ref_W_tokens)
303
+ pe_shift_len += ref_img_len
304
+
305
+ H, W = img_sizes[i]
306
+ H_tokens, W_tokens = H // p, W // p
307
+ assert H_tokens * W_tokens == l_effective_img_len[i]
308
+
309
+ row_ids = repeat(torch.arange(H_tokens, dtype=torch.int32, device=device), "h -> h w", w=W_tokens).flatten()
310
+ col_ids = repeat(torch.arange(W_tokens, dtype=torch.int32, device=device), "w -> h w", h=H_tokens).flatten()
311
+
312
+ assert pe_shift_len + l_effective_img_len[i] == seq_len
313
+ position_ids[i, pe_shift_len: seq_len, 0] = pe_shift
314
+ position_ids[i, pe_shift_len: seq_len, 1] = row_ids
315
+ position_ids[i, pe_shift_len: seq_len, 2] = col_ids
316
+
317
+ # Get combined rotary embeddings
318
+ freqs_cis = self._get_freqs_cis(position_ids)
319
+
320
+ # create separate rotary embeddings for captions and images
321
+ cap_freqs_cis = torch.zeros(
322
+ batch_size, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
323
+ )
324
+ ref_img_freqs_cis = torch.zeros(
325
+ batch_size, max_ref_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
326
+ )
327
+ img_freqs_cis = torch.zeros(
328
+ batch_size, max_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
329
+ )
330
+
331
+ for i, (cap_seq_len, ref_img_len, img_len, seq_len) in enumerate(zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len, seq_lengths)):
332
+ cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
333
+ ref_img_freqs_cis[i, :sum(ref_img_len)] = freqs_cis[i, cap_seq_len:cap_seq_len + sum(ref_img_len)]
334
+ img_freqs_cis[i, :img_len] = freqs_cis[i, cap_seq_len + sum(ref_img_len):cap_seq_len + sum(ref_img_len) + img_len]
335
+
336
+ return (
337
+ cap_freqs_cis,
338
+ ref_img_freqs_cis,
339
+ img_freqs_cis,
340
+ freqs_cis,
341
+ l_effective_cap_len,
342
+ seq_lengths,
343
+ )
344
+
345
+
346
+ class Lumina2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
347
+ r"""
348
+ Lumina2NextDiT: Diffusion model with a Transformer backbone.
349
+
350
+ Parameters:
351
+ sample_size (`int`): The width of the latent images. This is fixed during training since
352
+ it is used to learn a number of position embeddings.
353
+ patch_size (`int`, *optional*, defaults to 2):
354
+ The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
355
+ in_channels (`int`, *optional*, defaults to 4):
356
+ The number of input channels for the model. Typically, this matches the number of channels in the input
357
+ images.
358
+ hidden_size (`int`, *optional*, defaults to 4096):
359
+ The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
360
+ hidden representations.
361
+ num_layers (`int`, *optional*, default to 32):
362
+ The number of layers in the model. This defines the depth of the neural network.
363
+ num_attention_heads (`int`, *optional*, defaults to 32):
364
+ The number of attention heads in each attention layer. This parameter specifies how many separate attention
365
+ mechanisms are used.
366
+ num_kv_heads (`int`, *optional*, defaults to 8):
367
+ The number of key-value heads in the attention mechanism, if different from the number of attention heads.
368
+ If None, it defaults to num_attention_heads.
369
+ multiple_of (`int`, *optional*, defaults to 256):
370
+ A factor that the hidden size should be a multiple of. This can help optimize certain hardware
371
+ configurations.
372
+ ffn_dim_multiplier (`float`, *optional*):
373
+ A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
374
+ the model configuration.
375
+ norm_eps (`float`, *optional*, defaults to 1e-5):
376
+ A small value added to the denominator for numerical stability in normalization layers.
377
+ scaling_factor (`float`, *optional*, defaults to 1.0):
378
+ A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
379
+ overall scale of the model's operations.
380
+ """
381
+
382
+ _supports_gradient_checkpointing = True
383
+ _no_split_modules = ["Lumina2TransformerBlock"]
384
+ _skip_layerwise_casting_patterns = ["x_embedder", "norm"]
385
+
386
+ @register_to_config
387
+ def __init__(
388
+ self,
389
+ sample_size: int = 128,
390
+ patch_size: int = 2,
391
+ in_channels: int = 16,
392
+ out_channels: Optional[int] = None,
393
+ hidden_size: int = 2304,
394
+ num_layers: int = 26,
395
+ num_refiner_layers: int = 2,
396
+ num_attention_heads: int = 24,
397
+ num_kv_heads: int = 8,
398
+ multiple_of: int = 256,
399
+ ffn_dim_multiplier: Optional[float] = None,
400
+ norm_eps: float = 1e-5,
401
+ scaling_factor: float = 1.0,
402
+ axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
403
+ axes_lens: Tuple[int, int, int] = (300, 512, 512),
404
+ cap_feat_dim: int = 1024,
405
+ ) -> None:
406
+ super().__init__()
407
+ self.out_channels = out_channels or in_channels
408
+
409
+ # 1. Positional, patch & conditional embeddings
410
+ self.rope_embedder = Lumina2RotaryPosEmbed(
411
+ theta=10000, axes_dim=axes_dim_rope, axes_lens=axes_lens, patch_size=patch_size
412
+ )
413
+
414
+ self.x_embedder = nn.Linear(in_features=patch_size * patch_size * in_channels, out_features=hidden_size)
415
+
416
+ self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
417
+ hidden_size=hidden_size, cap_feat_dim=cap_feat_dim, norm_eps=norm_eps
418
+ )
419
+
420
+ # 2. Noise and context refinement blocks
421
+ self.noise_refiner = nn.ModuleList(
422
+ [
423
+ Lumina2TransformerBlock(
424
+ hidden_size,
425
+ num_attention_heads,
426
+ num_kv_heads,
427
+ multiple_of,
428
+ ffn_dim_multiplier,
429
+ norm_eps,
430
+ modulation=True,
431
+ )
432
+ for _ in range(num_refiner_layers)
433
+ ]
434
+ )
435
+
436
+ self.context_refiner = nn.ModuleList(
437
+ [
438
+ Lumina2TransformerBlock(
439
+ hidden_size,
440
+ num_attention_heads,
441
+ num_kv_heads,
442
+ multiple_of,
443
+ ffn_dim_multiplier,
444
+ norm_eps,
445
+ modulation=False,
446
+ )
447
+ for _ in range(num_refiner_layers)
448
+ ]
449
+ )
450
+
451
+ # 3. Transformer blocks
452
+ self.layers = nn.ModuleList(
453
+ [
454
+ Lumina2TransformerBlock(
455
+ hidden_size,
456
+ num_attention_heads,
457
+ num_kv_heads,
458
+ multiple_of,
459
+ ffn_dim_multiplier,
460
+ norm_eps,
461
+ modulation=True,
462
+ )
463
+ for _ in range(num_layers)
464
+ ]
465
+ )
466
+
467
+ # 4. Output norm & projection
468
+ self.norm_out = LuminaLayerNormContinuous(
469
+ embedding_dim=hidden_size,
470
+ conditioning_embedding_dim=min(hidden_size, 1024),
471
+ elementwise_affine=False,
472
+ eps=1e-6,
473
+ bias=True,
474
+ out_dim=patch_size * patch_size * self.out_channels,
475
+ )
476
+
477
+ self.gradient_checkpointing = False
478
+
479
+ self.args_dict = {"patch_size":patch_size,"in_channels":in_channels,"hidden_size":hidden_size,
480
+ "num_attention_heads":num_attention_heads,"num_kv_heads":num_kv_heads,
481
+ "multiple_of":multiple_of,"ffn_dim_multiplier":ffn_dim_multiplier,
482
+ "norm_eps":norm_eps,"num_refiner_layers":num_refiner_layers}
483
+
484
+ def initialize_ref_weights(self) -> None:
485
+ """
486
+ Initialize the weights of the model.
487
+
488
+ Uses Xavier uniform initialization for linear layers.
489
+ """
490
+ patch_size, in_channels, hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, num_refiner_layers = \
491
+ (self.args_dict[k] for k in ["patch_size","in_channels","hidden_size","num_attention_heads","num_kv_heads",
492
+ "multiple_of","ffn_dim_multiplier","norm_eps","num_refiner_layers"])
493
+ with torch.no_grad():
494
+ self.ref_image_patch_embedder = nn.Linear(
495
+ in_features=self.x_embedder.in_features,
496
+ out_features=hidden_size,
497
+ )
498
+ self.ref_image_refiner = nn.ModuleList([
499
+ Lumina2TransformerBlock(
500
+ hidden_size,
501
+ num_attention_heads,
502
+ num_kv_heads,
503
+ multiple_of,
504
+ ffn_dim_multiplier,
505
+ norm_eps,
506
+ modulation=True
507
+ )
508
+ for _ in range(num_refiner_layers)
509
+ ])
510
+ nn.init.xavier_uniform_(self.ref_image_patch_embedder.weight)
511
+ nn.init.constant_(self.ref_image_patch_embedder.bias, 0.0)
512
+
513
+ # Add learnable embeddings to distinguish different images
514
+ self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size)) # support max 5 ref images
515
+ nn.init.normal_(self.image_index_embedding, std=0.02)
516
+
517
+ def img_patch_embed_and_refine(
518
+ self,
519
+ hidden_states,
520
+ ref_image_hidden_states,
521
+ padded_img_mask,
522
+ padded_ref_img_mask,
523
+ noise_rotary_emb,
524
+ ref_img_rotary_emb,
525
+ l_effective_ref_img_len,
526
+ l_effective_img_len,
527
+ temb
528
+ ):
529
+ batch_size = len(hidden_states)
530
+ max_combined_img_len = max([img_len + sum(ref_img_len) for img_len, ref_img_len in zip(l_effective_img_len, l_effective_ref_img_len)])
531
+
532
+ hidden_states = self.x_embedder(hidden_states)
533
+ ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)
534
+
535
+ for i in range(batch_size):
536
+ shift = 0
537
+ for j, ref_img_len in enumerate(l_effective_ref_img_len[i]):
538
+ ref_image_hidden_states[i, shift:shift + ref_img_len, :] = ref_image_hidden_states[i, shift:shift + ref_img_len, :] + self.image_index_embedding[j]
539
+ shift += ref_img_len
540
+
541
+ for layer in self.noise_refiner:
542
+ hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)
543
+
544
+ flat_l_effective_ref_img_len = list(itertools.chain(*l_effective_ref_img_len))
545
+ num_ref_images = len(flat_l_effective_ref_img_len)
546
+ max_ref_img_len = max(flat_l_effective_ref_img_len)
547
+
548
+ batch_ref_img_mask = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, dtype=torch.bool)
549
+ batch_ref_image_hidden_states = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, self.config.hidden_size)
550
+ batch_ref_img_rotary_emb = hidden_states.new_zeros(num_ref_images, max_ref_img_len, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype)
551
+ batch_temb = temb.new_zeros(num_ref_images, *temb.shape[1:], dtype=temb.dtype)
552
+
553
+ # sequence of ref imgs to batch
554
+ idx = 0
555
+ for i in range(batch_size):
556
+ shift = 0
557
+ for ref_img_len in l_effective_ref_img_len[i]:
558
+ batch_ref_img_mask[idx, :ref_img_len] = True
559
+ batch_ref_image_hidden_states[idx, :ref_img_len] = ref_image_hidden_states[i, shift:shift + ref_img_len]
560
+ batch_ref_img_rotary_emb[idx, :ref_img_len] = ref_img_rotary_emb[i, shift:shift + ref_img_len]
561
+ batch_temb[idx] = temb[i]
562
+ shift += ref_img_len
563
+ idx += 1
564
+
565
+ # refine ref imgs separately
566
+ for layer in self.ref_image_refiner:
567
+ batch_ref_image_hidden_states = layer(batch_ref_image_hidden_states, batch_ref_img_mask, batch_ref_img_rotary_emb, batch_temb)
568
+
569
+ # batch of ref imgs to sequence
570
+ idx = 0
571
+ for i in range(batch_size):
572
+ shift = 0
573
+ for ref_img_len in l_effective_ref_img_len[i]:
574
+ ref_image_hidden_states[i, shift:shift + ref_img_len] = batch_ref_image_hidden_states[idx, :ref_img_len]
575
+ shift += ref_img_len
576
+ idx += 1
577
+
578
+ combined_img_hidden_states = hidden_states.new_zeros(batch_size, max_combined_img_len, self.config.hidden_size)
579
+ for i, (ref_img_len, img_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
580
+ combined_img_hidden_states[i, :sum(ref_img_len)] = ref_image_hidden_states[i, :sum(ref_img_len)]
581
+ combined_img_hidden_states[i, sum(ref_img_len):sum(ref_img_len) + img_len] = hidden_states[i, :img_len]
582
+
583
+ return combined_img_hidden_states
584
+
585
+ def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
586
+ batch_size = len(hidden_states)
587
+ p = self.config.patch_size
588
+ device = hidden_states[0].device
589
+
590
+ img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
591
+ l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]
592
+
593
+ if ref_image_hidden_states is not None:
594
+ ref_img_sizes = [[(img.size(1), img.size(2)) for img in imgs] if imgs is not None else None for imgs in ref_image_hidden_states]
595
+ l_effective_ref_img_len = [[(ref_img_size[0] // p) * (ref_img_size[1] // p) for ref_img_size in _ref_img_sizes] if _ref_img_sizes is not None else [0] for _ref_img_sizes in ref_img_sizes]
596
+ else:
597
+ ref_img_sizes = [None for _ in range(batch_size)]
598
+ l_effective_ref_img_len = [[0] for _ in range(batch_size)]
599
+
600
+ max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
601
+ max_img_len = max(l_effective_img_len)
602
+
603
+ # ref image patch embeddings
604
+ flat_ref_img_hidden_states = []
605
+ for i in range(batch_size):
606
+ if ref_img_sizes[i] is not None:
607
+ imgs = []
608
+ for ref_img in ref_image_hidden_states[i]:
609
+ C, H, W = ref_img.size()
610
+ ref_img = rearrange(ref_img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
611
+ imgs.append(ref_img)
612
+
613
+ img = torch.cat(imgs, dim=0)
614
+ flat_ref_img_hidden_states.append(img)
615
+ else:
616
+ flat_ref_img_hidden_states.append(None)
617
+
618
+ # image patch embeddings
619
+ flat_hidden_states = []
620
+ for i in range(batch_size):
621
+ img = hidden_states[i]
622
+ C, H, W = img.size()
623
+
624
+ img = rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
625
+ flat_hidden_states.append(img)
626
+
627
+ padded_ref_img_hidden_states = torch.zeros(batch_size, max_ref_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
628
+ padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
629
+ for i in range(batch_size):
630
+ if ref_img_sizes[i] is not None:
631
+ padded_ref_img_hidden_states[i, :sum(l_effective_ref_img_len[i])] = flat_ref_img_hidden_states[i]
632
+ padded_ref_img_mask[i, :sum(l_effective_ref_img_len[i])] = True
633
+
634
+ padded_hidden_states = torch.zeros(batch_size, max_img_len, flat_hidden_states[0].shape[-1], device=device, dtype=flat_hidden_states[0].dtype)
635
+ padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
636
+ for i in range(batch_size):
637
+ padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
638
+ padded_img_mask[i, :l_effective_img_len[i]] = True
639
+
640
+ return (
641
+ padded_hidden_states,
642
+ padded_ref_img_hidden_states,
643
+ padded_img_mask,
644
+ padded_ref_img_mask,
645
+ l_effective_ref_img_len,
646
+ l_effective_img_len,
647
+ ref_img_sizes,
648
+ img_sizes,
649
+ )
650
+
651
+ def forward(
652
+ self,
653
+ hidden_states: torch.Tensor,
654
+ timestep: torch.Tensor,
655
+ encoder_hidden_states: torch.Tensor,
656
+ encoder_attention_mask: torch.Tensor,
657
+ ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
658
+ attention_kwargs: Optional[Dict[str, Any]] = None,
659
+ return_dict: bool = True,
660
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
661
+ if attention_kwargs is not None:
662
+ attention_kwargs = attention_kwargs.copy()
663
+ lora_scale = attention_kwargs.pop("scale", 1.0)
664
+ else:
665
+ lora_scale = 1.0
666
+
667
+ if USE_PEFT_BACKEND:
668
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
669
+ scale_lora_layers(self, lora_scale)
670
+ else:
671
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
672
+ logger.warning(
673
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
674
+ )
675
+
676
+ # 1. Condition, positional & patch embedding
677
+ batch_size = len(hidden_states)
678
+ is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)
679
+
680
+ if is_hidden_states_tensor:
681
+ assert hidden_states.ndim == 4
682
+ hidden_states = [_hidden_states for _hidden_states in hidden_states]
683
+
684
+ device = hidden_states[0].device
685
+
686
+ temb, encoder_hidden_states = self.time_caption_embed(hidden_states, timestep, encoder_hidden_states)
687
+
688
+ (
689
+ hidden_states,
690
+ ref_image_hidden_states,
691
+ img_mask,
692
+ ref_img_mask,
693
+ l_effective_ref_img_len,
694
+ l_effective_img_len,
695
+ ref_img_sizes,
696
+ img_sizes,
697
+ ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
698
+
699
+ (
700
+ context_rotary_emb,
701
+ ref_img_rotary_emb,
702
+ noise_rotary_emb,
703
+ rotary_emb,
704
+ encoder_seq_lengths,
705
+ seq_lengths,
706
+ ) = self.rope_embedder(
707
+ encoder_attention_mask,
708
+ l_effective_ref_img_len,
709
+ l_effective_img_len,
710
+ ref_img_sizes,
711
+ img_sizes,
712
+ device,
713
+ )
714
+
715
+ # 2. Context & noise refinement
716
+ for layer in self.context_refiner:
717
+ encoder_hidden_states = layer(encoder_hidden_states, encoder_attention_mask, context_rotary_emb)
718
+
719
+ combined_img_hidden_states = self.img_patch_embed_and_refine(
720
+ hidden_states,
721
+ ref_image_hidden_states,
722
+ img_mask,
723
+ ref_img_mask,
724
+ noise_rotary_emb,
725
+ ref_img_rotary_emb,
726
+ l_effective_ref_img_len,
727
+ l_effective_img_len,
728
+ temb,
729
+ )
730
+
731
+ # 3. Joint Transformer blocks
732
+ max_seq_len = max(seq_lengths)
733
+ use_mask = len(set(seq_lengths)) > 1
734
+
735
+ attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
736
+ joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
737
+ for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
738
+ attention_mask[i, :seq_len] = True
739
+ joint_hidden_states[i, :encoder_seq_len] = encoder_hidden_states[i, :encoder_seq_len]
740
+ joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]
741
+
742
+ hidden_states = joint_hidden_states
743
+
744
+ for layer in self.layers:
745
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
746
+ hidden_states = self._gradient_checkpointing_func(
747
+ layer, hidden_states, attention_mask if use_mask else None, rotary_emb, temb
748
+ )
749
+ else:
750
+ hidden_states = layer(hidden_states, attention_mask if use_mask else None, rotary_emb, temb)
751
+
752
+ # 4. Output norm & projection
753
+ hidden_states = self.norm_out(hidden_states, temb)
754
+
755
+ # 5. Unpatchify
756
+ p = self.config.patch_size
757
+ output = []
758
+ for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
759
+ height, width = img_size
760
+ output.append(rearrange(hidden_states[i][seq_len - img_len:seq_len], '(h w) (p1 p2 c) -> c (h p1) (w p2)', h=height // p, w=width // p, p1=p, p2=p))
761
+ if is_hidden_states_tensor:
762
+ output = torch.stack(output, dim=0)
763
+
764
+ if USE_PEFT_BACKEND:
765
+ # remove `lora_scale` from each PEFT layer
766
+ unscale_lora_layers(self, lora_scale)
767
+
768
+ if not return_dict:
769
+ return (output,)
770
+ return Transformer2DModelOutput(sample=output)
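A minimal standalone sketch (not part of the committed files) of the einops patchify/unpatchify convention that `flat_and_pad_to_seq` and the final unpatchify step above rely on; the channel count, spatial size, and patch size below are illustrative assumptions.
import torch
from einops import rearrange

p = 2                                              # patch_size
latent = torch.randn(16, 32, 32)                   # (C, H, W) with H and W divisible by p

# (C, H, W) -> (num_patches, p*p*C): one flattened patch per token
tokens = rearrange(latent, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
print(tokens.shape)                                # torch.Size([256, 64])

# inverse mapping, as used in the unpatchify step of forward()
restored = rearrange(tokens, '(h w) (p1 p2 c) -> c (h p1) (w p2)',
                     h=latent.shape[1] // p, w=latent.shape[2] // p, p1=p, p2=p)
assert torch.equal(latent, restored)               # the round trip is lossless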
star/models/pixel_decoder/transformer_lumina2_seq.py ADDED
@@ -0,0 +1,551 @@
1
+ # Copyright 2025 Alpha-VLLM Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Any, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from einops import rearrange
22
+ from diffusers.models.transformers.transformer_lumina2 import *
23
+ from einops import repeat
24
+ from diffusers.models.embeddings import get_1d_rotary_pos_embed
25
+ import itertools
26
+
27
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+
30
+ class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
31
+ def __init__(
32
+ self,
33
+ hidden_size: int = 4096,
34
+ cap_feat_dim: int = 2048,
35
+ frequency_embedding_size: int = 256,
36
+ norm_eps: float = 1e-5,
37
+ ) -> None:
38
+ super().__init__()
39
+
40
+ self.time_proj = Timesteps(
41
+ num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0.0
42
+ )
43
+
44
+ self.timestep_embedder = TimestepEmbedding(
45
+ in_channels=frequency_embedding_size, time_embed_dim=min(hidden_size, 1024)
46
+ )
47
+
48
+ self.caption_embedder = nn.Sequential(
49
+ RMSNorm(cap_feat_dim, eps=norm_eps), nn.Linear(cap_feat_dim, hidden_size, bias=True)
50
+ )
51
+
52
+ def forward(
53
+ self, hidden_states: torch.Tensor, timestep: torch.Tensor, encoder_hidden_states: torch.Tensor
54
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
55
+ timestep_proj = self.time_proj(timestep).type_as(hidden_states)
56
+ time_embed = self.timestep_embedder(timestep_proj)
57
+ caption_embed = self.caption_embedder(encoder_hidden_states)
58
+ return time_embed, caption_embed
59
+
60
+
61
+ class Lumina2AttnProcessor2_0:
62
+ r"""
63
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
64
+ used in the Lumina2Transformer2DModel model. It applies normalization and RoPE on query and key vectors.
65
+ """
66
+
67
+ def __init__(self):
68
+ if not hasattr(F, "scaled_dot_product_attention"):
69
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
70
+
71
+ def __call__(
72
+ self,
73
+ attn: Attention,
74
+ hidden_states: torch.Tensor,
75
+ encoder_hidden_states: torch.Tensor,
76
+ attention_mask: Optional[torch.Tensor] = None,
77
+ image_rotary_emb: Optional[torch.Tensor] = None,
78
+ base_sequence_length: Optional[int] = None,
79
+ ) -> torch.Tensor:
80
+ batch_size, sequence_length, _ = hidden_states.shape
81
+
82
+ # Get Query-Key-Value Pair
83
+ query = attn.to_q(hidden_states)
84
+ key = attn.to_k(encoder_hidden_states)
85
+ value = attn.to_v(encoder_hidden_states)
86
+
87
+ query_dim = query.shape[-1]
88
+ inner_dim = key.shape[-1]
89
+ head_dim = query_dim // attn.heads
90
+ dtype = query.dtype
91
+
92
+ # Get key-value heads
93
+ kv_heads = inner_dim // head_dim
94
+
95
+ query = query.view(batch_size, -1, attn.heads, head_dim)
96
+ key = key.view(batch_size, -1, kv_heads, head_dim)
97
+ value = value.view(batch_size, -1, kv_heads, head_dim)
98
+
99
+ # Apply Query-Key Norm if needed
100
+ if attn.norm_q is not None:
101
+ query = attn.norm_q(query)
102
+ if attn.norm_k is not None:
103
+ key = attn.norm_k(key)
104
+
105
+ # Apply RoPE if needed
106
+ if image_rotary_emb is not None:
107
+ query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
108
+ key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
109
+
110
+ query, key = query.to(dtype), key.to(dtype)
111
+
112
+ # Apply proportional attention if true
113
+ if base_sequence_length is not None:
114
+ softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
115
+ else:
116
+ softmax_scale = attn.scale
117
+
118
+ # perform grouped-query attention (GQA)
119
+ n_rep = attn.heads // kv_heads
120
+ if n_rep >= 1:
121
+ key = key.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
122
+ value = value.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
123
+
124
+ # scaled_dot_product_attention expects attention_mask shape to be
125
+ # (batch, heads, source_length, target_length)
126
+ if attention_mask is not None:
127
+ attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
128
+
129
+ query = query.transpose(1, 2)
130
+ key = key.transpose(1, 2)
131
+ value = value.transpose(1, 2)
132
+
133
+ hidden_states = F.scaled_dot_product_attention(
134
+ query, key, value, attn_mask=attention_mask, scale=softmax_scale
135
+ )
136
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
137
+ hidden_states = hidden_states.type_as(query)
138
+
139
+ # linear proj
140
+ hidden_states = attn.to_out[0](hidden_states)
141
+ hidden_states = attn.to_out[1](hidden_states)
142
+ return hidden_states
143
+
144
+
145
+ class Lumina2TransformerBlock(nn.Module):
146
+ def __init__(
147
+ self,
148
+ dim: int,
149
+ num_attention_heads: int,
150
+ num_kv_heads: int,
151
+ multiple_of: int,
152
+ ffn_dim_multiplier: float,
153
+ norm_eps: float,
154
+ modulation: bool = True,
155
+ ) -> None:
156
+ super().__init__()
157
+ self.head_dim = dim // num_attention_heads
158
+ self.dim = dim
159
+ self.modulation = modulation
160
+
161
+ self.attn = Attention(
162
+ query_dim=dim,
163
+ cross_attention_dim=None,
164
+ dim_head=dim // num_attention_heads,
165
+ qk_norm="rms_norm",
166
+ heads=num_attention_heads,
167
+ kv_heads=num_kv_heads,
168
+ eps=1e-5,
169
+ bias=False,
170
+ out_bias=False,
171
+ processor=Lumina2AttnProcessor2_0(),
172
+ )
173
+
174
+ self.feed_forward = LuminaFeedForward(
175
+ dim=dim,
176
+ inner_dim=4 * dim,
177
+ multiple_of=multiple_of,
178
+ ffn_dim_multiplier=ffn_dim_multiplier,
179
+ )
180
+
181
+ if modulation:
182
+ self.norm1 = LuminaRMSNormZero(
183
+ embedding_dim=dim,
184
+ norm_eps=norm_eps,
185
+ norm_elementwise_affine=True,
186
+ )
187
+ else:
188
+ self.norm1 = RMSNorm(dim, eps=norm_eps)
189
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
190
+
191
+ self.norm2 = RMSNorm(dim, eps=norm_eps)
192
+ self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
193
+
194
+ def forward(
195
+ self,
196
+ hidden_states: torch.Tensor,
197
+ attention_mask: torch.Tensor,
198
+ image_rotary_emb: torch.Tensor,
199
+ temb: Optional[torch.Tensor] = None,
200
+ ) -> torch.Tensor:
201
+ if self.modulation:
202
+ norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
203
+ attn_output = self.attn(
204
+ hidden_states=norm_hidden_states,
205
+ encoder_hidden_states=norm_hidden_states,
206
+ attention_mask=attention_mask,
207
+ image_rotary_emb=image_rotary_emb,
208
+ )
209
+ hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
210
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
211
+ hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
212
+ else:
213
+ norm_hidden_states = self.norm1(hidden_states)
214
+ attn_output = self.attn(
215
+ hidden_states=norm_hidden_states,
216
+ encoder_hidden_states=norm_hidden_states,
217
+ attention_mask=attention_mask,
218
+ image_rotary_emb=image_rotary_emb,
219
+ )
220
+ hidden_states = hidden_states + self.norm2(attn_output)
221
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
222
+ hidden_states = hidden_states + self.ffn_norm2(mlp_output)
223
+
224
+ return hidden_states
225
+
226
+
227
+ class Lumina2RotaryPosEmbed(nn.Module):
228
+ def __init__(self, theta: int, axes_dim: List[int], axes_lens: List[int] = (300, 512, 512), patch_size: int = 2):
229
+ super().__init__()
230
+ self.theta = theta
231
+ self.axes_dim = axes_dim
232
+ self.axes_lens = axes_lens
233
+ self.patch_size = patch_size
234
+
235
+ self.freqs_cis = self._precompute_freqs_cis(axes_dim, axes_lens, theta)
236
+
237
+ def _precompute_freqs_cis(self, axes_dim: List[int], axes_lens: List[int], theta: int) -> List[torch.Tensor]:
238
+ freqs_cis = []
239
+ freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
240
+ for i, (d, e) in enumerate(zip(axes_dim, axes_lens)):
241
+ emb = get_1d_rotary_pos_embed(d, e, theta=self.theta, freqs_dtype=freqs_dtype)
242
+ freqs_cis.append(emb)
243
+ return freqs_cis
244
+
245
+ def _get_freqs_cis(self, ids: torch.Tensor) -> torch.Tensor:
246
+ device = ids.device
247
+ if ids.device.type == "mps":
248
+ ids = ids.to("cpu")
249
+
250
+ result = []
251
+ for i in range(len(self.axes_dim)):
252
+ freqs = self.freqs_cis[i].to(ids.device)
253
+ index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64)
254
+ result.append(torch.gather(freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index))
255
+ return torch.cat(result, dim=-1).to(device)
256
+
257
+ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor):
258
+ batch_size, channels, height, width = hidden_states.shape
259
+ p = self.patch_size
260
+ post_patch_height, post_patch_width = height // p, width // p
261
+ image_seq_len = post_patch_height * post_patch_width
262
+ device = hidden_states.device
263
+
264
+ encoder_seq_len = attention_mask.shape[1]
265
+ l_effective_cap_len = attention_mask.sum(dim=1).tolist()
266
+ seq_lengths = [cap_seq_len + image_seq_len for cap_seq_len in l_effective_cap_len]
267
+ max_seq_len = max(seq_lengths)
268
+
269
+ # Create position IDs
270
+ position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)
271
+
272
+ for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
273
+ # add caption position ids
274
+ position_ids[i, :cap_seq_len, 0] = torch.arange(cap_seq_len, dtype=torch.int32, device=device)
275
+ position_ids[i, cap_seq_len:seq_len, 0] = cap_seq_len
276
+
277
+ # add image position ids
278
+ row_ids = (
279
+ torch.arange(post_patch_height, dtype=torch.int32, device=device)
280
+ .view(-1, 1)
281
+ .repeat(1, post_patch_width)
282
+ .flatten()
283
+ )
284
+ col_ids = (
285
+ torch.arange(post_patch_width, dtype=torch.int32, device=device)
286
+ .view(1, -1)
287
+ .repeat(post_patch_height, 1)
288
+ .flatten()
289
+ )
290
+ position_ids[i, cap_seq_len:seq_len, 1] = row_ids
291
+ position_ids[i, cap_seq_len:seq_len, 2] = col_ids
292
+
293
+ # Get combined rotary embeddings
294
+ freqs_cis = self._get_freqs_cis(position_ids)
295
+
296
+ # create separate rotary embeddings for captions and images
297
+ cap_freqs_cis = torch.zeros(
298
+ batch_size, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
299
+ )
300
+ img_freqs_cis = torch.zeros(
301
+ batch_size, image_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
302
+ )
303
+
304
+ for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
305
+ cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
306
+ img_freqs_cis[i, :image_seq_len] = freqs_cis[i, cap_seq_len:seq_len]
307
+
308
+ # image patch embeddings
309
+ hidden_states = (
310
+ hidden_states.view(batch_size, channels, post_patch_height, p, post_patch_width, p)
311
+ .permute(0, 2, 4, 3, 5, 1)
312
+ .flatten(3)
313
+ .flatten(1, 2)
314
+ )
315
+
316
+ return hidden_states, cap_freqs_cis, img_freqs_cis, freqs_cis, l_effective_cap_len, seq_lengths
317
+
318
+
319
+ class Lumina2Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
320
+ r"""
321
+ Lumina2NextDiT: Diffusion model with a Transformer backbone.
322
+
323
+ Parameters:
324
+ sample_size (`int`): The width of the latent images. This is fixed during training since
325
+ it is used to learn a number of position embeddings.
326
+ patch_size (`int`, *optional*, defaults to 2):
327
+ The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
328
+ in_channels (`int`, *optional*, defaults to 4):
329
+ The number of input channels for the model. Typically, this matches the number of channels in the input
330
+ images.
331
+ hidden_size (`int`, *optional*, defaults to 4096):
332
+ The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
333
+ hidden representations.
334
+ num_layers (`int`, *optional*, defaults to 32):
335
+ The number of layers in the model. This defines the depth of the neural network.
336
+ num_attention_heads (`int`, *optional*, defaults to 32):
337
+ The number of attention heads in each attention layer. This parameter specifies how many separate attention
338
+ mechanisms are used.
339
+ num_kv_heads (`int`, *optional*, defaults to 8):
340
+ The number of key-value heads in the attention mechanism, if different from the number of attention heads.
341
+ If None, it defaults to num_attention_heads.
342
+ multiple_of (`int`, *optional*, defaults to 256):
343
+ A factor that the hidden size should be a multiple of. This can help optimize certain hardware
344
+ configurations.
345
+ ffn_dim_multiplier (`float`, *optional*):
346
+ A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
347
+ the model configuration.
348
+ norm_eps (`float`, *optional*, defaults to 1e-5):
349
+ A small value added to the denominator for numerical stability in normalization layers.
350
+ scaling_factor (`float`, *optional*, defaults to 1.0):
351
+ A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
352
+ overall scale of the model's operations.
353
+ """
354
+
355
+ _supports_gradient_checkpointing = True
356
+ _no_split_modules = ["Lumina2TransformerBlock"]
357
+ _skip_layerwise_casting_patterns = ["x_embedder", "norm"]
358
+
359
+ @register_to_config
360
+ def __init__(
361
+ self,
362
+ sample_size: int = 128,
363
+ patch_size: int = 2,
364
+ in_channels: int = 16,
365
+ out_channels: Optional[int] = None,
366
+ hidden_size: int = 2304,
367
+ num_layers: int = 26,
368
+ num_refiner_layers: int = 2,
369
+ num_attention_heads: int = 24,
370
+ num_kv_heads: int = 8,
371
+ multiple_of: int = 256,
372
+ ffn_dim_multiplier: Optional[float] = None,
373
+ norm_eps: float = 1e-5,
374
+ scaling_factor: float = 1.0,
375
+ axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
376
+ axes_lens: Tuple[int, int, int] = (300, 512, 512),
377
+ cap_feat_dim: int = 1024,
378
+ ) -> None:
379
+ super().__init__()
380
+ self.out_channels = out_channels or in_channels
381
+
382
+ # 1. Positional, patch & conditional embeddings
383
+ self.rope_embedder = Lumina2RotaryPosEmbed(
384
+ theta=10000, axes_dim=axes_dim_rope, axes_lens=axes_lens, patch_size=patch_size
385
+ )
386
+
387
+ self.x_embedder = nn.Linear(in_features=patch_size * patch_size * in_channels, out_features=hidden_size)
388
+
389
+ self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
390
+ hidden_size=hidden_size, cap_feat_dim=cap_feat_dim, norm_eps=norm_eps
391
+ )
392
+
393
+ # 2. Noise and context refinement blocks
394
+ self.noise_refiner = nn.ModuleList(
395
+ [
396
+ Lumina2TransformerBlock(
397
+ hidden_size,
398
+ num_attention_heads,
399
+ num_kv_heads,
400
+ multiple_of,
401
+ ffn_dim_multiplier,
402
+ norm_eps,
403
+ modulation=True,
404
+ )
405
+ for _ in range(num_refiner_layers)
406
+ ]
407
+ )
408
+
409
+ self.context_refiner = nn.ModuleList(
410
+ [
411
+ Lumina2TransformerBlock(
412
+ hidden_size,
413
+ num_attention_heads,
414
+ num_kv_heads,
415
+ multiple_of,
416
+ ffn_dim_multiplier,
417
+ norm_eps,
418
+ modulation=False,
419
+ )
420
+ for _ in range(num_refiner_layers)
421
+ ]
422
+ )
423
+ self.ori_inp_dit = "none"
424
+ self.ori_inp_refiner = None
425
+
426
+ # 3. Transformer blocks
427
+ self.layers = nn.ModuleList(
428
+ [
429
+ Lumina2TransformerBlock(
430
+ hidden_size,
431
+ num_attention_heads,
432
+ num_kv_heads,
433
+ multiple_of,
434
+ ffn_dim_multiplier,
435
+ norm_eps,
436
+ modulation=True,
437
+ )
438
+ for _ in range(num_layers)
439
+ ]
440
+ )
441
+
442
+ # 4. Output norm & projection
443
+ self.norm_out = LuminaLayerNormContinuous(
444
+ embedding_dim=hidden_size,
445
+ conditioning_embedding_dim=min(hidden_size, 1024),
446
+ elementwise_affine=False,
447
+ eps=1e-6,
448
+ bias=True,
449
+ out_dim=patch_size * patch_size * self.out_channels,
450
+ )
451
+
452
+ self.gradient_checkpointing = False
453
+
454
+ def forward(
455
+ self,
456
+ hidden_states: torch.Tensor,
457
+ timestep: torch.Tensor,
458
+ encoder_hidden_states: torch.Tensor,
459
+ encoder_attention_mask: torch.Tensor,
460
+ attention_kwargs: Optional[Dict[str, Any]] = None,
461
+ return_dict: bool = True,
462
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
463
+ if attention_kwargs is not None:
464
+ attention_kwargs = attention_kwargs.copy()
465
+ lora_scale = attention_kwargs.pop("scale", 1.0)
466
+ else:
467
+ lora_scale = 1.0
468
+
469
+ if USE_PEFT_BACKEND:
470
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
471
+ scale_lora_layers(self, lora_scale)
472
+ else:
473
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
474
+ logger.warning(
475
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
476
+ )
477
+
478
+ # 1. Condition, positional & patch embedding
479
+ batch_size, _, height, width = hidden_states.shape
480
+
481
+ temb, encoder_hidden_states = self.time_caption_embed(hidden_states, timestep, encoder_hidden_states)
482
+
483
+ (
484
+ hidden_states,
485
+ context_rotary_emb,
486
+ noise_rotary_emb,
487
+ rotary_emb,
488
+ encoder_seq_lengths,
489
+ seq_lengths,
490
+ ) = self.rope_embedder(hidden_states, encoder_attention_mask)
491
+
492
+ hidden_states = self.x_embedder(hidden_states)
493
+
494
+ # 2. Context & noise refinement
495
+ for layer in self.context_refiner:
496
+ encoder_hidden_states = layer(encoder_hidden_states, encoder_attention_mask, context_rotary_emb)
497
+
498
+ for layer in self.noise_refiner:
499
+ hidden_states = layer(hidden_states, None, noise_rotary_emb, temb)
500
+
501
+ if self.ori_inp_dit!="none" and self.ori_inp_refiner is not None:
502
+ single_img_length = hidden_states.shape[1]//2
503
+ initial_part = hidden_states[:, :single_img_length]
504
+ refined_part = self.ori_inp_refiner(hidden_states[:, single_img_length:])
505
+ updated_hidden_states = torch.cat((initial_part, refined_part), dim=1)
506
+ hidden_states = updated_hidden_states
507
+
508
+ # 3. Joint Transformer blocks
509
+ max_seq_len = max(seq_lengths)
510
+ use_mask = len(set(seq_lengths)) > 1
511
+
512
+ attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
513
+ joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
514
+ for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
515
+ attention_mask[i, :seq_len] = True
516
+ joint_hidden_states[i, :encoder_seq_len] = encoder_hidden_states[i, :encoder_seq_len]
517
+ joint_hidden_states[i, encoder_seq_len:seq_len] = hidden_states[i]
518
+
519
+ hidden_states = joint_hidden_states
520
+
521
+ for layer in self.layers:
522
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
523
+ hidden_states = self._gradient_checkpointing_func(
524
+ layer, hidden_states, attention_mask if use_mask else None, rotary_emb, temb
525
+ )
526
+ else:
527
+ hidden_states = layer(hidden_states, attention_mask if use_mask else None, rotary_emb, temb)
528
+
529
+ # 4. Output norm & projection
530
+ hidden_states = self.norm_out(hidden_states, temb)
531
+
532
+ # 5. Unpatchify
533
+ p = self.config.patch_size
534
+ output = []
535
+ for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
536
+ output.append(
537
+ hidden_states[i][encoder_seq_len:seq_len]
538
+ .view(height // p, width // p, p, p, self.out_channels)
539
+ .permute(4, 0, 2, 1, 3)
540
+ .flatten(3, 4)
541
+ .flatten(1, 2)
542
+ )
543
+ output = torch.stack(output, dim=0)
544
+
545
+ if USE_PEFT_BACKEND:
546
+ # remove `lora_scale` from each PEFT layer
547
+ unscale_lora_layers(self, lora_scale)
548
+
549
+ if not return_dict:
550
+ return (output,)
551
+ return Transformer2DModelOutput(sample=output)
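A hypothetical smoke test for the `Lumina2Transformer2DModel` defined above, assuming the wildcard imports from `diffusers.models.transformers.transformer_lumina2` resolve in your diffusers version; the tiny hyperparameters are illustrative and are not the values STAR ships with. Note that the head dimension (`hidden_size / num_attention_heads`) must equal `sum(axes_dim_rope)`, which is 96 with the defaults.
import torch

# Toy configuration: head_dim = 384 / 4 = 96 matches the default axes_dim_rope (32, 32, 32).
model = Lumina2Transformer2DModel(
    patch_size=2, in_channels=4, hidden_size=384, num_layers=2,
    num_refiner_layers=1, num_attention_heads=4, num_kv_heads=2, cap_feat_dim=64,
)
latents = torch.randn(1, 4, 32, 32)              # (B, C, H, W) latent batch
timestep = torch.tensor([0.5])                   # diffusion time in [0, 1]
cap_feats = torch.randn(1, 12, 64)               # (B, L_text, cap_feat_dim) caption features
cap_mask = torch.ones(1, 12, dtype=torch.bool)   # all caption tokens are valid

with torch.no_grad():
    sample = model(latents, timestep, cap_feats, cap_mask, return_dict=False)[0]
print(sample.shape)                              # (1, 4, 32, 32): same shape as the input latents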
star/models/pixel_encoder/vq_model.py ADDED
@@ -0,0 +1,510 @@
1
+
2
+ from dataclasses import dataclass, field
3
+ from typing import List
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from deepspeed.utils import logger
9
+
10
+ @dataclass
11
+ class ModelArgs:
12
+ codebook_size: int = 16384
13
+ codebook_embed_dim: int = 8
14
+ codebook_l2_norm: bool = False
15
+ codebook_show_usage: bool = True
16
+ commit_loss_beta: float = 0.25
17
+ entropy_loss_ratio: float = 0.0
18
+ encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
19
+ decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
20
+ z_channels: int = 256
21
+ dropout_p: float = 0.0
22
+ num_res_blocks: int = 2
23
+ ch: int=128
24
+ attn_num_heads: int = 1
25
+
26
+
27
+ class VQModel(nn.Module):
28
+ def __init__(self, config: ModelArgs):
29
+ super().__init__()
30
+ self.config = config
31
+ self.encoder = Encoder(ch_mult=config.encoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p, num_res_blocks=config.num_res_blocks, ch=config.ch, attn_num_heads=config.attn_num_heads)
32
+ self.decoder = Decoder(ch_mult=config.decoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p, num_res_blocks=config.num_res_blocks, ch=config.ch, attn_num_heads=config.attn_num_heads)
33
+
34
+ self.quantize = VectorQuantizer(config.codebook_size, config.codebook_embed_dim,
35
+ config.commit_loss_beta, config.entropy_loss_ratio,
36
+ config.codebook_l2_norm, config.codebook_show_usage)
37
+ self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1)
38
+ self.post_quant_conv = nn.Conv2d(config.codebook_embed_dim, config.z_channels, 1)
39
+
40
+
41
+ def encode(self, x):
42
+ h = self.encoder(x)
43
+ h = self.quant_conv(h)
44
+
45
+ quant, emb_loss, info = self.quantize(h)
46
+ return quant, emb_loss, info
47
+
48
+ def decode(self, quant):
49
+ quant = self.post_quant_conv(quant)
50
+ dec = self.decoder(quant)
51
+ return dec
52
+
53
+ def decode_code(self, code_b, shape=None, channel_first=True):
54
+ quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first)
55
+ dec = self.decode(quant_b)
56
+ return dec # [B, C, H, W]
57
+
58
+ def forward(self, input):
59
+ quant, diff, _ = self.encode(input)
60
+ dec = self.decode(quant)
61
+ return dec, diff
62
+
63
+ def get_codebook_entry(self, code_b, shape=None, channel_first=True):
64
+ quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first)
65
+ return quant_b
66
+
67
+ def image_to_seq(self, image):
68
+ quant, _, [_, _, indices] = self.encode(image)
69
+ batch_size = image.shape[0]
70
+ return indices.reshape(batch_size, -1)
71
+
72
+ def seq_to_image(self, tokens):
73
+ tokens = torch.clamp(tokens, min=0)
74
+ assert tokens.size(-1) == self.config.num_tokens, (
75
+ f"can not generate the image as the token length is {tokens.size(-1)} != {self.config.num_tokens}"
76
+ )
77
+ bs, HW = tokens.shape
78
+ H = W = int(math.sqrt(HW))
79
+ images = self.decode_code(tokens, shape=[bs, self.config.codebook_embed_dim, H, W])
80
+ images = torch.clip((images+1)/2, 0, 1)
81
+ images = torch.permute(images, [0, 2, 3, 1])
82
+
83
+ return images
84
+
85
+ def load_trained_weights(self, pretrained=None):
86
+ device_index = torch.cuda.current_device()
87
+ device = torch.device(f'cuda:{device_index}')
88
+ weights = torch.load(pretrained, map_location=device)
89
+ self.load_state_dict(weights, strict=True)
90
+
91
+
92
+ class Encoder(nn.Module):
93
+ def __init__(self, in_channels=3, ch=128, ch_mult=(1,1,2,2,4), num_res_blocks=2,
94
+ norm_type='group', dropout=0.0, resamp_with_conv=True, z_channels=256, attn_num_heads=1):
95
+ super().__init__()
96
+ self.num_resolutions = len(ch_mult)
97
+ self.num_res_blocks = num_res_blocks
98
+ self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1)
99
+
100
+ # downsampling
101
+ in_ch_mult = (1,) + tuple(ch_mult)
102
+ self.conv_blocks = nn.ModuleList()
103
+ for i_level in range(self.num_resolutions):
104
+ conv_block = nn.Module()
105
+ # res & attn
106
+ res_block = nn.ModuleList()
107
+ attn_block = nn.ModuleList()
108
+ block_in = ch*in_ch_mult[i_level]
109
+ block_out = ch*ch_mult[i_level]
110
+ for _ in range(self.num_res_blocks):
111
+ res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type))
112
+ block_in = block_out
113
+ if i_level == self.num_resolutions - 1:
114
+ attn_block.append(AttnBlock(block_in, norm_type, attn_num_heads))
115
+ conv_block.res = res_block
116
+ conv_block.attn = attn_block
117
+ # downsample
118
+ if i_level != self.num_resolutions-1:
119
+ conv_block.downsample = Downsample(block_in, resamp_with_conv)
120
+ self.conv_blocks.append(conv_block)
121
+
122
+ # middle
123
+ self.mid = nn.ModuleList()
124
+ self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
125
+ self.mid.append(AttnBlock(block_in, norm_type=norm_type, num_heads=attn_num_heads))
126
+ self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
127
+
128
+ # end
129
+ self.norm_out = Normalize(block_in, norm_type)
130
+ self.conv_out = nn.Conv2d(block_in, z_channels, kernel_size=3, stride=1, padding=1)
131
+
132
+
133
+ def forward(self, x):
134
+ h = self.conv_in(x)
135
+ # downsampling
136
+ for i_level, block in enumerate(self.conv_blocks):
137
+ for i_block in range(self.num_res_blocks):
138
+ h = block.res[i_block](h)
139
+ if len(block.attn) > 0:
140
+ h = block.attn[i_block](h)
141
+ if i_level != self.num_resolutions - 1:
142
+ h = block.downsample(h)
143
+
144
+ # middle
145
+ for mid_block in self.mid:
146
+ h = mid_block(h)
147
+
148
+ # end
149
+ h = self.norm_out(h)
150
+ h = nonlinearity(h)
151
+ h = self.conv_out(h)
152
+ return h
153
+
154
+
155
+
156
+ class Decoder(nn.Module):
157
+ def __init__(self, z_channels=256, ch=128, ch_mult=(1,1,2,2,4), num_res_blocks=2, norm_type="group",
158
+ dropout=0.0, resamp_with_conv=True, out_channels=3, attn_num_heads=1):
159
+ super().__init__()
160
+ self.num_resolutions = len(ch_mult)
161
+ self.num_res_blocks = num_res_blocks
162
+
163
+ block_in = ch*ch_mult[self.num_resolutions-1]
164
+ # z to block_in
165
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
166
+
167
+ # middle
168
+ self.mid = nn.ModuleList()
169
+ self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
170
+ self.mid.append(AttnBlock(block_in, norm_type=norm_type, num_heads=attn_num_heads))
171
+ self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
172
+
173
+ # upsampling
174
+ self.conv_blocks = nn.ModuleList()
175
+ for i_level in reversed(range(self.num_resolutions)):
176
+ conv_block = nn.Module()
177
+ # res & attn
178
+ res_block = nn.ModuleList()
179
+ attn_block = nn.ModuleList()
180
+ block_out = ch*ch_mult[i_level]
181
+ for _ in range(self.num_res_blocks + 1):
182
+ res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type))
183
+ block_in = block_out
184
+ if i_level == self.num_resolutions - 1:
185
+ attn_block.append(AttnBlock(block_in, norm_type, attn_num_heads))
186
+ conv_block.res = res_block
187
+ conv_block.attn = attn_block
188
+ # upsample
189
+ if i_level != 0:
190
+ conv_block.upsample = Upsample(block_in, resamp_with_conv)
191
+ self.conv_blocks.append(conv_block)
192
+
193
+ # end
194
+ self.norm_out = Normalize(block_in, norm_type)
195
+ self.conv_out = nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1)
196
+
197
+ @property
198
+ def last_layer(self):
199
+ return self.conv_out.weight
200
+
201
+ def forward(self, z):
202
+ # z to block_in
203
+ h = self.conv_in(z)
204
+
205
+ # middle
206
+ for mid_block in self.mid:
207
+ h = mid_block(h)
208
+
209
+ # upsampling
210
+ for i_level, block in enumerate(self.conv_blocks):
211
+ for i_block in range(self.num_res_blocks + 1):
212
+ h = block.res[i_block](h)
213
+ if len(block.attn) > 0:
214
+ h = block.attn[i_block](h)
215
+ if i_level != self.num_resolutions - 1:
216
+ h = block.upsample(h)
217
+
218
+ # end
219
+ h = self.norm_out(h)
220
+ h = nonlinearity(h)
221
+ h = self.conv_out(h)
222
+ return h
223
+
224
+
225
+ class VectorQuantizer(nn.Module):
226
+ def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage=False):
227
+ super().__init__()
228
+ self.n_e = n_e
229
+ self.e_dim = e_dim
230
+ self.beta = beta
231
+ self.entropy_loss_ratio = entropy_loss_ratio
232
+ self.l2_norm = l2_norm
233
+ self.show_usage = show_usage
234
+
235
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
236
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
237
+
238
+ if self.l2_norm:
239
+ self.embedding.weight.data = F.normalize(self.embedding.weight.data, p=2, dim=-1)
240
+ if self.show_usage:
241
+ if self.n_e < 65536:
242
+ self.register_buffer("codebook_used", nn.Parameter(torch.zeros(65536)))
243
+ else:
244
+ self.register_buffer("codebook_used", nn.Parameter(torch.zeros(self.n_e+1)))
245
+ # self.register_buffer("codebook_used", nn.Parameter(torch.zeros(196608)))
246
+
247
+
248
+ # self.h_, self.w_ = int(self.n_e ** 0.5), int(self.n_e ** 0.5)
249
+ if int(self.n_e ** 0.5) ** 2 == self.n_e:
250
+ self.h_, self.w_ = int(self.n_e ** 0.5), int(self.n_e ** 0.5)
251
+ else:
252
+ self.h_ = int((self.n_e * 2) ** 0.5)
253
+ self.w_ = self.n_e // self.h_
254
+
255
+ def forward(self, z):
256
+ # reshape z -> (batch, height, width, channel) and flatten
257
+ z = torch.einsum('b c h w -> b h w c', z).contiguous()
258
+ z_flattened = z.view(z.shape[0], -1, self.e_dim) # [b, h*w, e_dim]
259
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
260
+
261
+ emb_weights = self.embedding.weight[None].repeat(z.shape[0], 1, 1)
262
+
263
+ if self.l2_norm:
264
+ z = F.normalize(z, p=2, dim=-1)
265
+ z_flattened = F.normalize(z_flattened, p=2, dim=-1)
266
+ embedding = F.normalize(emb_weights, p=2, dim=-1) # [b, n_e, e_dim]
267
+ else:
268
+ embedding = emb_weights
269
+
270
+ d = torch.sum(z_flattened ** 2, dim=2, keepdim=True) + \
271
+ torch.sum(embedding**2, dim=2).unsqueeze(1) - 2 * \
272
+ torch.einsum('bld,bnd->bln', z_flattened, embedding) # [n, h*w, n_e]
273
+
274
+ min_encoding_indices = torch.argmin(d, dim=2) # [n, h*w]
275
+ z_q = torch.stack([embedding[b, min_encoding_indices[b]] for b in range(z.shape[0])]) # [n, h*w, e_dim]
276
+ z_q = z_q.view(z.shape)
277
+ perplexity = None
278
+ min_encodings = None
279
+ vq_loss = None
280
+ commit_loss = None
281
+ entropy_loss = None
282
+ codebook_usage = 0
283
+
284
+ if self.show_usage and self.training:
285
+ self.codebook_used = self.codebook_used.long()
286
+ cur_len = min_encoding_indices.shape.numel()
287
+ self.codebook_used[:-cur_len] = self.codebook_used[cur_len:].clone()
288
+ self.codebook_used[-cur_len:] = min_encoding_indices.view(-1)
289
+ codebook_usage = len(torch.unique(self.codebook_used)) / self.n_e
290
+
291
+
292
+ # compute loss for embedding
293
+ if self.training:
294
+ vq_loss = torch.mean((z_q - z.detach()) ** 2)
295
+ commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2)
296
+ entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d.view(-1, d.shape[-1]))
297
+
298
+ # preserve gradients
299
+ z_q = z + (z_q - z).detach()
300
+
301
+ # reshape back to match original input shape
302
+ z_q = torch.einsum('b h w c -> b c h w', z_q)
303
+
304
+ return z_q, (vq_loss, commit_loss, entropy_loss, codebook_usage), (perplexity, min_encodings, min_encoding_indices)
305
+
306
+ def get_codebook_entry(self, indices, shape=None, channel_first=True):
307
+
308
+ if self.l2_norm:
309
+ embedding = F.normalize(self.embedding.weight, p=2, dim=-1) # [n, n_e, e_dim]
310
+ else:
311
+ embedding = self.embedding.weight
312
+
313
+ z_q = embedding[indices]
314
+
315
+ if shape is not None:
316
+ if channel_first:
317
+ z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1]) # [B, H, W, D]
318
+ # reshape back to match original input shape
319
+ z_q = z_q.permute(0, 3, 1, 2).contiguous() # [B, D, H, W]
320
+ else:
321
+ z_q = z_q.view(shape)
322
+ return z_q
323
+
324
+
325
+ class ResnetBlock(nn.Module):
326
+ def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, norm_type='group'):
327
+ super().__init__()
328
+ self.in_channels = in_channels
329
+ out_channels = in_channels if out_channels is None else out_channels
330
+ self.out_channels = out_channels
331
+ self.use_conv_shortcut = conv_shortcut
332
+
333
+ self.norm1 = Normalize(in_channels, norm_type)
334
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
335
+ self.norm2 = Normalize(out_channels, norm_type)
336
+ self.dropout = nn.Dropout(dropout)
337
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
338
+
339
+ if self.in_channels != self.out_channels:
340
+ if self.use_conv_shortcut:
341
+ self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
342
+ else:
343
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
344
+
345
+ def forward(self, x):
346
+ h = x
347
+ h = self.norm1(h)
348
+ h = nonlinearity(h)
349
+ h = self.conv1(h)
350
+ h = self.norm2(h)
351
+ h = nonlinearity(h)
352
+ h = self.dropout(h)
353
+ h = self.conv2(h)
354
+ if self.in_channels != self.out_channels:
355
+ if self.use_conv_shortcut:
356
+ x = self.conv_shortcut(x)
357
+ else:
358
+ x = self.nin_shortcut(x)
359
+ return x+h
360
+
361
+
362
+
363
+ class AttnBlock(nn.Module):
364
+ def __init__(self, in_channels, norm_type='group', num_heads=1):
365
+ super().__init__()
366
+ self.num_heads = num_heads
367
+ assert in_channels % self.num_heads == 0
368
+
369
+ self.norm = Normalize(in_channels, norm_type)
370
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
371
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
372
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
373
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
374
+
375
+
376
+ def forward_single_head(self, x):
377
+ h_ = x
378
+ h_ = self.norm(h_)
379
+ q = self.q(h_)
380
+ k = self.k(h_)
381
+ v = self.v(h_)
382
+
383
+ # compute attention
384
+ b,c,h,w = q.shape
385
+ q = q.reshape(b,c,h*w)
386
+ q = q.permute(0,2,1) # b,hw,c
387
+ k = k.reshape(b,c,h*w) # b,c,hw
388
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
389
+ w_ = w_ * (int(c)**(-0.5))
390
+ w_ = F.softmax(w_, dim=2)
391
+
392
+ # attend to values
393
+ v = v.reshape(b,c,h*w)
394
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
395
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
396
+ h_ = h_.reshape(b,c,h,w)
397
+
398
+ h_ = self.proj_out(h_)
399
+
400
+ return x+h_
401
+
402
+ def forwar_multi_head(self, x):
403
+ h_ = x
404
+ h_ = self.norm(h_)
405
+ q = self.q(h_)
406
+ k = self.k(h_)
407
+ v = self.v(h_)
408
+
409
+ # compute attention
410
+ b, c, h, w = q.shape
411
+ q = q.reshape(b, self.num_heads, c//self.num_heads, h * w) # b, head, c, hw
412
+ q = q.permute(0, 1, 3, 2) # b, head, hw, c
413
+ k = k.reshape(b, self.num_heads, c//self.num_heads, h * w) # b, head, c, hw
414
+
415
+ # w_ = torch.bmm(q,k) # b,head,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
416
+ w_ = q @ k # b,head,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
417
+ w_ = w_ * (int(c // self.num_heads) ** (-0.5))
418
+ w_ = torch.nn.functional.softmax(w_, dim=3)
419
+
420
+ # attend to values
421
+ v = v.reshape(b, self.num_heads, c//self.num_heads, h * w) # b, head, c, hw
422
+
423
+ w_ = w_.permute(0, 1, 3, 2) # b,head,hw,hw (first hw of k, second of q)
424
+ h_ = v @ w_ # b, head,c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
425
+ h_ = h_.reshape(b, c, h, w)
426
+
427
+ h_ = self.proj_out(h_)
428
+
429
+ return x + h_
430
+
431
+ def forward(self, x):
432
+ if self.num_heads > 1:
433
+ return self.forward_multi_head(x)
434
+ else:
435
+ return self.forward_single_head(x)
436
+
437
+
438
+ def nonlinearity(x):
439
+ # swish
440
+ return x*torch.sigmoid(x)
441
+
442
+
443
+ def Normalize(in_channels, norm_type='group'):
444
+ assert norm_type in ['group', 'batch']
445
+ if norm_type == 'group':
446
+ return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
447
+ elif norm_type == 'batch':
448
+ return nn.SyncBatchNorm(in_channels)
449
+
450
+
451
+ class Upsample(nn.Module):
452
+ def __init__(self, in_channels, with_conv):
453
+ super().__init__()
454
+ self.with_conv = with_conv
455
+ if self.with_conv:
456
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
457
+
458
+ def forward(self, x):
459
+ x = F.interpolate(x, scale_factor=2.0, mode="nearest")
460
+ if self.with_conv:
461
+ x = self.conv(x)
462
+ return x
463
+
464
+
465
+ class Downsample(nn.Module):
466
+ def __init__(self, in_channels, with_conv):
467
+ super().__init__()
468
+ self.with_conv = with_conv
469
+ if self.with_conv:
470
+ # no asymmetric padding in torch conv, must do it ourselves
471
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
472
+
473
+ def forward(self, x):
474
+ if self.with_conv:
475
+ pad = (0,1,0,1)
476
+ x = F.pad(x, pad, mode="constant", value=0)
477
+ x = self.conv(x)
478
+ else:
479
+ x = F.avg_pool2d(x, kernel_size=2, stride=2)
480
+ return x
481
+
482
+
483
+ def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01):
484
+ flat_affinity = affinity.reshape(-1, affinity.shape[-1])
485
+ flat_affinity /= temperature
486
+ probs = F.softmax(flat_affinity, dim=-1)
487
+ log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1)
488
+ if loss_type == "softmax":
489
+ target_probs = probs
490
+ else:
491
+ raise ValueError("Entropy loss {} not supported".format(loss_type))
492
+ avg_probs = torch.mean(target_probs, dim=0)
493
+ avg_entropy = - torch.sum(avg_probs * torch.log(avg_probs + 1e-5))
494
+ sample_entropy = - torch.mean(torch.sum(target_probs * log_probs, dim=-1))
495
+ loss = sample_entropy - avg_entropy
496
+ return loss
497
+
498
+
499
+ #################################################################################
500
+ # VQ Model Configs #
501
+ #################################################################################
502
+
503
+
504
+ def VQ_Model(config, **kwargs):
505
+ model = VQModel(ModelArgs(encoder_ch_mult=[1, 2, 2, 4, 8], decoder_ch_mult=[1, 2, 2, 4, 8], codebook_size=config.image_token_size, codebook_embed_dim=config.n_embed, z_channels=512, ch=256, attn_num_heads=config.num_heads, **kwargs))
506
+
507
+ pretrained = config.model_path
508
+ if pretrained:
509
+ model.load_trained_weights(pretrained)
510
+ return model
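An illustrative round trip through the `VQModel` above with randomly initialised weights (so the reconstruction is meaningless); it assumes `deepspeed` is importable, since the module imports its logger, and the 256-pixel input and 16x16 token grid follow the default `ModelArgs`. It decodes via `decode_code` directly because `seq_to_image` expects a `num_tokens` attribute that the dataclass above does not define.
import torch

args = ModelArgs()                                 # defaults: 16384 codes, 8-dim codebook, 16x downsampling
vq = VQModel(args).eval()

with torch.no_grad():
    image = torch.rand(1, 3, 256, 256) * 2 - 1     # inputs roughly in [-1, 1]
    tokens = vq.image_to_seq(image)                # (1, 256): one code index per 16x16 latent position
    recon = vq.decode_code(tokens, shape=[1, args.codebook_embed_dim, 16, 16])
print(tokens.shape, recon.shape)                   # torch.Size([1, 256]) torch.Size([1, 3, 256, 256])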
star/models/rope_2d.py ADDED
@@ -0,0 +1,232 @@
1
+ import os
2
+ import copy
3
+ import json
4
+ import random
5
+ import logging
6
+ import re
7
+ import time
8
+ import math
9
+ import ast
10
+ from dataclasses import dataclass, field
11
+ from typing import Dict, Optional, Sequence, List, Tuple
12
+ from io import BytesIO
13
+ import base64
14
+
15
+ import numpy as np
16
+ import torch
17
+ from torch.utils.data import Dataset
18
+ from PIL import Image
19
+ from decord import VideoReader
20
+ import transformers
21
+
22
+
23
+ def get_rope_index_25(
24
+ spatial_merge_size: Optional[int] = 2,
25
+ input_ids: Optional[torch.LongTensor] = None,
26
+ image_grid_thw: Optional[torch.LongTensor] = None,
27
+ video_grid_thw: Optional[torch.LongTensor] = None,
28
+ second_per_grid_ts: Optional[torch.Tensor] = None,
29
+ attention_mask: Optional[torch.Tensor] = None,
30
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
31
+ """
32
+ Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
33
+
34
+ Explanation:
35
+ Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
36
+
37
+ For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
38
+ Examples:
39
+ input_ids: [T T T T T], here T is for text.
40
+ temporal position_ids: [0, 1, 2, 3, 4]
41
+ height position_ids: [0, 1, 2, 3, 4]
42
+ width position_ids: [0, 1, 2, 3, 4]
43
+
44
+ For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
45
+ and 1D rotary position embedding for text part.
46
+ Examples:
47
+ Temporal (Time): 3 patches, representing different segments of the video in time.
48
+ Height: 2 patches, dividing each frame vertically.
49
+ Width: 2 patches, dividing each frame horizontally.
50
+ We also have some important parameters:
51
+ fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
52
+ tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
53
+ temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
54
+ interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
55
+ input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
56
+ vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
57
+ vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
58
+ vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
59
+ text temporal position_ids: [101, 102, 103, 104, 105]
60
+ text height position_ids: [101, 102, 103, 104, 105]
61
+ text width position_ids: [101, 102, 103, 104, 105]
62
+ Here we calculate the text start position_ids as the max vision position_ids plus 1.
63
+
64
+ Args:
65
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
66
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
67
+ it.
68
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
69
+ The temporal, height and width of feature shape of each image in LLM.
70
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
71
+ The temporal, height and width of feature shape of each video in LLM.
72
+ second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
73
+ The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
74
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
75
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
76
+
77
+ - 1 for tokens that are **not masked**,
78
+ - 0 for tokens that are **masked**.
79
+
80
+ Returns:
81
+ position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
82
+ mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
83
+ """
84
+ image_token_id = 151655
85
+ video_token_id = 151656
86
+ vision_start_token_id = 151652
87
+ mrope_position_deltas = []
88
+ if input_ids is not None and (
89
+ image_grid_thw is not None or video_grid_thw is not None
90
+ ):
91
+ total_input_ids = input_ids
92
+ if attention_mask is None:
93
+ attention_mask = torch.ones_like(total_input_ids)
94
+ position_ids = torch.ones(
95
+ 3,
96
+ input_ids.shape[0],
97
+ input_ids.shape[1],
98
+ dtype=input_ids.dtype,
99
+ device=input_ids.device,
100
+ )
101
+ image_index, video_index = 0, 0
102
+ attention_mask = attention_mask.to(total_input_ids.device)
103
+ for i, input_ids in enumerate(total_input_ids):
104
+ input_ids = input_ids[attention_mask[i] == 1]
105
+ image_nums, video_nums = 0, 0
106
+ vision_start_indices = torch.argwhere(
107
+ input_ids == vision_start_token_id
108
+ ).squeeze(1)
109
+ vision_tokens = input_ids[vision_start_indices + 1]
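# The token immediately after <|vision_start|> tells whether the block is an image or a video.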
110
+ image_nums = (vision_tokens == image_token_id).sum()
111
+ video_nums = (vision_tokens == video_token_id).sum()
112
+ input_tokens = input_ids.tolist()
113
+ llm_pos_ids_list: list = []
114
+ st = 0
115
+ remain_images, remain_videos = image_nums, video_nums
116
+ for _ in range(image_nums + video_nums):
117
+ if image_token_id in input_tokens and remain_images > 0:
118
+ ed_image = input_tokens.index(image_token_id, st)
119
+ else:
120
+ ed_image = len(input_tokens) + 1
121
+ if video_token_id in input_tokens and remain_videos > 0:
122
+ ed_video = input_tokens.index(video_token_id, st)
123
+ else:
124
+ ed_video = len(input_tokens) + 1
125
+ if ed_image < ed_video:
126
+ t, h, w = (
127
+ image_grid_thw[image_index][0],
128
+ image_grid_thw[image_index][1],
129
+ image_grid_thw[image_index][2],
130
+ )
131
+ second_per_grid_t = 0
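# Images span a single temporal grid, so their per-grid time interval is set to zero.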
132
+ image_index += 1
133
+ remain_images -= 1
134
+ ed = ed_image
135
+
136
+ else:
137
+ t, h, w = (
138
+ video_grid_thw[video_index][0],
139
+ video_grid_thw[video_index][1],
140
+ video_grid_thw[video_index][2],
141
+ )
142
+ if second_per_grid_ts is not None:
143
+ second_per_grid_t = second_per_grid_ts[video_index]
144
+ else:
145
+ second_per_grid_t = 1.0
146
+ video_index += 1
147
+ remain_videos -= 1
148
+ ed = ed_video
149
+ llm_grid_t, llm_grid_h, llm_grid_w = (
150
+ t.item(),
151
+ h.item() // spatial_merge_size,
152
+ w.item() // spatial_merge_size,
153
+ )
154
+ text_len = ed - st
155
+
156
+ st_idx = (
157
+ llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
158
+ )
159
+ llm_pos_ids_list.append(
160
+ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
161
+ )
162
+
163
+ range_tensor = torch.arange(llm_grid_t).view(-1, 1)
164
+ expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
165
+
166
+ time_tensor = expanded_range * second_per_grid_t * 2
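# The literal 2 appears to play the role of tokens_per_second (the upstream Qwen2.5-VL
# implementation reads this value from the vision config), so each temporal grid step
# advances the temporal position id by second_per_grid_t * 2 (truncated to an integer below).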
167
+
168
+ time_tensor_long = time_tensor.long()
169
+ t_index = time_tensor_long.flatten()
170
+
171
+ h_index = (
172
+ torch.arange(llm_grid_h)
173
+ .view(1, -1, 1)
174
+ .expand(llm_grid_t, -1, llm_grid_w)
175
+ .flatten()
176
+ )
177
+ w_index = (
178
+ torch.arange(llm_grid_w)
179
+ .view(1, 1, -1)
180
+ .expand(llm_grid_t, llm_grid_h, -1)
181
+ .flatten()
182
+ )
183
+ llm_pos_ids_list.append(
184
+ torch.stack([t_index, h_index, w_index]) + text_len + st_idx
185
+ )
186
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
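# Advance the cursor past the vision block just handled: ed points at its first
# placeholder token, and the grid contributes t * h * w placeholder tokens.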
187
+
188
+ if st < len(input_tokens):
189
+ st_idx = (
190
+ llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
191
+ )
192
+ text_len = len(input_tokens) - st
193
+ llm_pos_ids_list.append(
194
+ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
195
+ )
196
+
197
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
198
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(
199
+ position_ids.device
200
+ )
201
+ mrope_position_deltas.append(
202
+ llm_positions.max() + 1 - len(total_input_ids[i])
203
+ )
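# The delta records how far the largest multimodal position id runs ahead of (or behind)
# the plain sequence length; it can be negative because a t x h x w vision grid spans
# fewer distinct position steps than it has tokens.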
204
+ mrope_position_deltas = torch.tensor(
205
+ mrope_position_deltas, device=input_ids.device
206
+ ).unsqueeze(1)
207
+ return position_ids, mrope_position_deltas
208
+ else:
209
+ if attention_mask is not None:
210
+ position_ids = attention_mask.long().cumsum(-1) - 1
211
+ position_ids.masked_fill_(attention_mask == 0, 1)
212
+ position_ids = (
213
+ position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
214
+ )
215
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(
216
+ -1, keepdim=True
217
+ )[0]
218
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
219
+ else:
220
+ position_ids = (
221
+ torch.arange(input_ids.shape[1], device=input_ids.device)
222
+ .view(1, 1, -1)
223
+ .expand(3, input_ids.shape[0], -1)
224
+ )
225
+ mrope_position_deltas = torch.zeros(
226
+ [input_ids.shape[0], 1],
227
+ device=input_ids.device,
228
+ dtype=input_ids.dtype,
229
+ )
230
+
231
+ return position_ids, mrope_position_deltas
232
+
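A minimal usage sketch under stated assumptions: the helper shown above is called here as `get_rope_index` with `spatial_merge_size = 2`, but its actual name, full signature, and how `spatial_merge_size` is supplied are not visible in this excerpt, and the text token ids (100, 101, 102) are arbitrary placeholders.

    import torch

    vision_start, image_pad = 151652, 151655       # assumed Qwen2-VL special token ids
    image_grid_thw = torch.tensor([[1, 4, 4]])     # one image: 1 x 4 x 4 ViT patches
    # With spatial_merge_size = 2 the image occupies (4 // 2) * (4 // 2) = 4 LLM tokens.
    input_ids = torch.tensor([[100, 101, vision_start] + [image_pad] * 4 + [102]])

    position_ids, deltas = get_rope_index(
        input_ids=input_ids,
        image_grid_thw=image_grid_thw,
        attention_mask=torch.ones_like(input_ids),
    )
    print(position_ids.shape)   # torch.Size([3, 1, 8])
    print(deltas)               # tensor([[-2]]): max position id 5, plus 1, minus length 8

In the upstream Qwen2-VL generation loop the returned deltas are cached and added to the cache position at each decoding step so that newly generated text tokens continue the 3D position sequence; this excerpt does not show how STAR consumes them.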