Spaces:

MM-MVR
/

STAR

Sleeping

App Files Files Community

STAR / star /models /pixel_decoder /lumina2_decoder.py

MM-MVR

Upload files

97bc03d verified about 1 month ago

raw

history blame

28.6 kB

	import torch
	from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, Lumina2Pipeline
	from transformers import AutoTokenizer, Gemma2Model
	import copy
	import torch.nn as nn
	import torch.nn.functional as F
	from diffusers.training_utils import (
	cast_training_params,
	compute_density_for_timestep_sampling,
	compute_loss_weighting_for_sd3,
	free_memory,
	)
	from diffusers.pipelines.lumina2.pipeline_lumina2 import *

	class Lumina2Decoder(torch.nn.Module):
	def __init__(self, config, args):
	super().__init__()
	self.diffusion_model_path = config.model_path

	if not hasattr(args, "revision"):
	args.revision = None
	if not hasattr(args, "variant"):
	args.variant = None

	self.tokenizer_one = AutoTokenizer.from_pretrained(
	self.diffusion_model_path,
	subfolder="tokenizer",
	revision=args.revision,
	)
	self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
	self.diffusion_model_path, subfolder="scheduler"
	)
	self.noise_scheduler_copy = copy.deepcopy(self.noise_scheduler)
	self.text_encoder_one = Gemma2Model.from_pretrained(
	self.diffusion_model_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
	)
	self.text_encoding_pipeline = Lumina2Pipeline.from_pretrained(
	self.diffusion_model_path,
	vae=None,
	transformer=None,
	text_encoder=self.text_encoder_one,
	tokenizer=self.tokenizer_one,
	)
	self.vae = AutoencoderKL.from_pretrained(
	self.diffusion_model_path,
	subfolder="vae",
	revision=args.revision,
	variant=args.variant,
	)
	if args.ori_inp_dit=="seq":
	from star.models.pixel_decoder.transformer_lumina2_seq import Lumina2Transformer2DModel
	elif args.ori_inp_dit=="ref":
	from star.models.pixel_decoder.transformer_lumina2 import Lumina2Transformer2DModel

	self.transformer = Lumina2Transformer2DModel.from_pretrained(
	self.diffusion_model_path, subfolder="transformer", revision=args.revision, variant=args.variant
	)

	vq_dim = 512
	patch_size = self.transformer.config.patch_size
	in_channels = vq_dim + self.transformer.config.in_channels # 48 for mask
	out_channels = self.transformer.x_embedder.out_features

	load_num_channel = self.transformer.config.in_channels * patch_size * patch_size
	self.transformer.register_to_config(in_channels=in_channels)
	transformer = self.transformer
	with torch.no_grad():
	new_proj = nn.Linear(
	in_channels * patch_size * patch_size, out_channels, bias=True
	)

	new_proj.weight.zero_()

	new_proj = new_proj.to(transformer.x_embedder.weight.dtype)
	new_proj.weight[:, :load_num_channel].copy_(transformer.x_embedder.weight)
	new_proj.bias.copy_(transformer.x_embedder.bias)
	transformer.x_embedder = new_proj

	self.ori_inp_dit = args.ori_inp_dit
	if args.ori_inp_dit=="seq":
	refiner_channels = transformer.noise_refiner[-1].dim
	with torch.no_grad():
	vae2cond_proj1 = nn.Linear(refiner_channels, refiner_channels, bias=True)
	vae2cond_act = nn.GELU(approximate='tanh')
	vae2cond_proj2 = nn.Linear(refiner_channels, refiner_channels, bias=False)
	vae2cond_proj2.weight.zero_()

	ori_inp_refiner = nn.Sequential(
	vae2cond_proj1,
	vae2cond_act,
	vae2cond_proj2
	)
	transformer.ori_inp_refiner = ori_inp_refiner
	transformer.ori_inp_dit = self.ori_inp_dit
	elif args.ori_inp_dit=="ref":
	transformer.initialize_ref_weights()
	transformer.ori_inp_dit = self.ori_inp_dit

	transformer.requires_grad_(True)

	if args.grad_ckpt and args.diffusion_resolution==1024:
	transformer.gradient_checkpointing = args.grad_ckpt
	transformer.enable_gradient_checkpointing()

	self.vae.requires_grad_(False)
	self.vae.to(dtype=torch.float32)
	self.args = args

	self.pipe = Lumina2InstructPix2PixPipeline.from_pretrained(self.diffusion_model_path,
	transformer=transformer,
	text_encoder=self.text_encoder_one,
	vae=self.vae,
	torch_dtype=torch.bfloat16)


	with torch.no_grad():
	_, _, self.uncond_prompt_embeds, self.uncond_prompt_attention_mask = self.text_encoding_pipeline.encode_prompt(
	"",
	max_sequence_length=self.args.max_diff_seq_length,
	)

	def compute_text_embeddings(self,prompt, text_encoding_pipeline):
	with torch.no_grad():
	prompt_embeds, prompt_attention_mask, _, _ = text_encoding_pipeline.encode_prompt(
	prompt,
	max_sequence_length=self.args.max_diff_seq_length,
	)
	return prompt_embeds, prompt_attention_mask

	def get_sigmas(self, timesteps, n_dim=4, dtype=torch.float32):
	sigmas = self.noise_scheduler_copy.sigmas.to(dtype=dtype)
	schedule_timesteps = self.noise_scheduler_copy.timesteps.to(device=timesteps.device)
	timesteps = timesteps
	step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]

	sigma = sigmas[step_indices].flatten()
	while len(sigma.shape) < n_dim:
	sigma = sigma.unsqueeze(-1)
	return sigma

	def forward(self, batch_gpu,batch, image_embeds):
	args = self.args
	pixel_values = batch_gpu["full_pixel_values"].to(dtype=self.vae.dtype) #aux_image
	data_type = "t2i"
	if len(pixel_values.shape)==5:
	bs,num_img,c,h,w = pixel_values.shape
	if num_img==2:
	data_type = "edit"
	pixel_values_ori_img = pixel_values[:,0]
	pixel_values = pixel_values[:,-1]
	pixel_values = F.interpolate(pixel_values, size=(self.args.diffusion_resolution, self.args.diffusion_resolution), mode='bilinear',align_corners=False)
	if data_type=="edit" and self.ori_inp_dit!="none":
	pixel_values_ori_img = F.interpolate(pixel_values_ori_img, size=(self.args.diffusion_resolution, self.args.diffusion_resolution), mode='bilinear', align_corners=False)
	prompt = batch["prompts"]
	bs,_,_,_ = pixel_values.shape
	image_prompt_embeds = None
	image_embeds_2d = image_embeds.reshape(bs, 24, 24, image_embeds.shape[-1]).permute(0, 3, 1, 2)
	image_embeds_2d = F.interpolate(image_embeds_2d, size=(args.diffusion_resolution//8, args.diffusion_resolution//8), mode='bilinear', align_corners=False)

	control_emd = args.control_emd
	prompt_embeds, prompt_attention_mask = self.compute_text_embeddings(prompt, self.text_encoding_pipeline)
	if control_emd=="mix":
	prompt_embeds=torch.cat([prompt_embeds, image_prompt_embeds], dim=1) #use mix
	elif control_emd=="null":
	prompt_embeds = torch.zeros_like(prompt_embeds)
	prompt_attention_mask = torch.ones_like(prompt_attention_mask)
	elif control_emd=="text":
	pass
	elif control_emd=="vit" or control_emd=="vq" or control_emd=="vqvae" or control_emd=="vqconcat" or control_emd=="vqconcatvit":
	prompt_embeds=image_prompt_embeds


	latents = self.vae.encode(pixel_values).latent_dist.sample()
	latents = (latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
	latents = latents.to(dtype=image_embeds.dtype)

	latents_ori_img = torch.zeros_like(latents)
	if data_type=="edit" and self.ori_inp_dit!="none":
	latents_ori_img = self.vae.encode(pixel_values_ori_img).latent_dist.sample()
	latents_ori_img = (latents_ori_img - self.vae.config.shift_factor) * self.vae.config.scaling_factor
	latents_ori_img = latents_ori_img.to(dtype=image_embeds.dtype)

	# Sample noise that we'll add to the latents
	noise = torch.randn_like(latents)
	bsz = latents.shape[0]
	# Sample a random timestep for each image
	# for weighting schemes where we sample timesteps non-uniformly
	u = compute_density_for_timestep_sampling(
	weighting_scheme=args.weighting_scheme,
	batch_size=bsz,
	logit_mean=args.logit_mean,
	logit_std=args.logit_std,
	mode_scale=args.mode_scale,
	)

	indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
	timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=latents.device)

	# Add noise to the latents according to the noise magnitude at each timestep
	# (this is the forward diffusion process)
	sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype).to(device=noise.device)
	#noisy_model_input = (1.0 - sigmas) * noise + sigmas * latents
	noisy_model_input = sigmas * noise + (1-sigmas) * latents
	#noisy_model_input + (1-sigmas)*(latents - noise) = latents
	# Get the additional image embedding for conditioning.
	# Instead of getting a diagonal Gaussian here, we simply take the mode.
	original_image_embeds = image_embeds_2d

	if args.conditioning_dropout_prob is not None:
	random_p = torch.rand(bsz, device=latents.device)
	# Sample masks for the edit prompts.
	prompt_mask = random_p < 2 * args.uncondition_prob
	prompt_mask = prompt_mask.reshape(bsz, 1, 1)
	# Final text conditioning.
	#prompt_embeds = torch.where(prompt_mask, torch.zeros_like(prompt_embeds), prompt_embeds)
	prompt_embeds = torch.where(prompt_mask, self.uncond_prompt_embeds.repeat(prompt_embeds.shape[0],1,1).to(prompt_embeds.device), prompt_embeds)
	prompt_attention_mask = torch.where(prompt_mask[:,0], self.uncond_prompt_attention_mask.repeat(prompt_embeds.shape[0],1).to(prompt_embeds.device), prompt_attention_mask)

	# Sample masks for the original images.
	#random_p_vq = torch.rand(bsz, device=latents.device)
	image_mask_dtype = original_image_embeds.dtype
	image_mask = 1 - (
	(random_p <= args.conditioning_dropout_prob).to(image_mask_dtype)
	)
	image_mask = image_mask.reshape(bsz, 1, 1, 1)

	if data_type=="edit":
	image_mask=0
	# Final image conditioning.
	original_image_embeds = image_mask * original_image_embeds

	ori_latent_mask = 1 - (
	(random_p >= args.uncondition_prob).to(image_mask_dtype)
	* (random_p < 3 * args.uncondition_prob).to(image_mask_dtype)
	)
	ori_latent_mask = ori_latent_mask.reshape(bsz, 1, 1, 1)
	latents_ori_img = ori_latent_mask * latents_ori_img

	concatenated_noisy_latents = torch.cat([noisy_model_input, original_image_embeds], dim=1)

	ref_image_hidden_states = None
	if self.ori_inp_dit=="dim":
	concatenated_noisy_latents = torch.cat([concatenated_noisy_latents, latents_ori_img], dim=1)
	elif self.ori_inp_dit=="seq":
	latents_ori_img = torch.cat([latents_ori_img, original_image_embeds], dim=1)
	concatenated_noisy_latents = torch.cat([concatenated_noisy_latents, latents_ori_img], dim=2)
	elif self.ori_inp_dit=="ref":
	latents_ori_img = torch.cat([latents_ori_img, original_image_embeds], dim=1)
	ref_image_hidden_states = latents_ori_img[:,None]
	# Predict the noise residual
	# scale the timesteps (reversal not needed as we used a reverse lerp above already)
	timesteps = 1-timesteps / self.noise_scheduler.config.num_train_timesteps #timesteps / self.noise_scheduler.config.num_train_timesteps
	model_pred = self.transformer(
	hidden_states=concatenated_noisy_latents,
	timestep=timesteps,
	encoder_hidden_states=prompt_embeds,
	encoder_attention_mask=prompt_attention_mask,
	# ref_image_hidden_states = ref_image_hidden_states,
	return_dict=False,
	)[0]
	if self.ori_inp_dit=="seq":
	model_pred = model_pred[:, :, :args.diffusion_resolution//8, :]

	weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
	target = latents - noise
	# Conditioning dropout to support classifier-free guidance during inference. For more details
	# check out the section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.

	# Concatenate the `original_image_embeds` with the `noisy_latents`.

	# Get the target for loss depending on the prediction type
	loss = torch.mean(
	(weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
	1,
	)
	loss = loss.mean()

	loss_value = loss.item()

	return loss


	class Lumina2InstructPix2PixPipeline(Lumina2Pipeline):

	@torch.no_grad()
	def __call__(
	self,
	prompt: Union[str, List[str]] = None,
	width: Optional[int] = None,
	height: Optional[int] = None,
	num_inference_steps: int = 30,
	guidance_scale: float = 4.0,
	negative_prompt: Union[str, List[str]] = None,
	sigmas: List[float] = None,
	num_images_per_prompt: Optional[int] = 1,
	generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	latents: Optional[torch.Tensor] = None,
	prompt_embeds: Optional[torch.Tensor] = None,
	negative_prompt_embeds: Optional[torch.Tensor] = None,
	prompt_attention_mask: Optional[torch.Tensor] = None,
	negative_prompt_attention_mask: Optional[torch.Tensor] = None,
	output_type: Optional[str] = "pil",
	return_dict: bool = True,
	attention_kwargs: Optional[Dict[str, Any]] = None,
	callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
	callback_on_step_end_tensor_inputs: List[str] = ["latents"],
	system_prompt: Optional[str] = None,
	cfg_trunc_ratio=[0.0,1.0],
	cfg_normalization: bool = False,
	max_sequence_length: int = 256,
	control_emd="text",
	img_cfg_trunc_ratio =[0.0,1.0],
	gen_image_embeds=None,only_t2i="vqconcat",image_prompt_embeds=None,ori_inp_img=None,img_guidance_scale=1.5,vq_guidance_scale=0,ori_inp_way="none",
	) -> Union[ImagePipelineOutput, Tuple]:

	height = height or self.default_sample_size * self.vae_scale_factor
	width = width or self.default_sample_size * self.vae_scale_factor
	self._guidance_scale = guidance_scale
	self._attention_kwargs = attention_kwargs

	num_images_per_prompt = gen_image_embeds.shape[0] if gen_image_embeds is not None else image_prompt_embeds.shape[0]
	# 1. Check inputs. Raise error if not correct
	self.check_inputs(
	prompt,
	height,
	width,
	negative_prompt,
	prompt_embeds=prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	prompt_attention_mask=prompt_attention_mask,
	negative_prompt_attention_mask=negative_prompt_attention_mask,
	max_sequence_length=max_sequence_length,
	callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
	)

	# 2. Define call parameters
	if prompt is not None and isinstance(prompt, str):
	batch_size = 1
	elif prompt is not None and isinstance(prompt, list):
	batch_size = len(prompt)
	else:
	batch_size = prompt_embeds.shape[0]

	device = self._execution_device

	# 3. Encode input prompt
	(
	prompt_embeds,
	prompt_attention_mask,
	negative_prompt_embeds,
	negative_prompt_attention_mask,
	) = self.encode_prompt(
	prompt,
	self.do_classifier_free_guidance,
	negative_prompt=negative_prompt,
	num_images_per_prompt=num_images_per_prompt,
	device=device,
	prompt_embeds=prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	prompt_attention_mask=prompt_attention_mask,
	negative_prompt_attention_mask=negative_prompt_attention_mask,
	max_sequence_length=max_sequence_length,
	system_prompt=system_prompt,
	)


	if gen_image_embeds is not None:
	image_embeds_8=gen_image_embeds

	if control_emd=="text":
	pass
	elif control_emd=="null":
	prompt_embeds = torch.zeros_like(prompt_embeds)
	prompt_attention_mask = torch.zeros_like(prompt_attention_mask)
	negative_prompt_embeds = prompt_embeds
	negative_prompt_attention_mask = prompt_attention_mask

	if self.do_classifier_free_guidance:
	prompt_embeds = torch.cat([negative_prompt_embeds,negative_prompt_embeds, prompt_embeds], dim=0)
	prompt_attention_mask = torch.cat([negative_prompt_attention_mask,negative_prompt_attention_mask, prompt_attention_mask], dim=0)
	# 4. Prepare latents.
	latent_channels = self.vae.config.latent_channels #self.transformer.config.in_channels
	latents = self.prepare_latents(
	batch_size * num_images_per_prompt,
	latent_channels,
	height,
	width,
	prompt_embeds.dtype,
	device,
	generator,
	latents,
	)

	latents_ori_img = torch.zeros_like(latents)
	if ori_inp_img is not None and ori_inp_way !="none":
	#fuck = torch.load(ori_inp_img).to(latents.device)
	ori_inp_img = F.interpolate(ori_inp_img[None].to(latents.device,latents.dtype), size=(height,width), mode='bilinear',align_corners=False)
	latents_ori_img = self.vae.encode(ori_inp_img).latent_dist.sample()
	latents_ori_img = (latents_ori_img- self.vae.config.shift_factor) * self.vae.config.scaling_factor
	latents_ori_img = latents_ori_img.to(dtype=latents.dtype)
	if ori_inp_way !="none":
	negative_latents_ori_img = torch.zeros_like(latents_ori_img).to(prompt_embeds.dtype)
	latents_ori_img = torch.cat([negative_latents_ori_img,latents_ori_img, latents_ori_img], dim=0) if self.do_classifier_free_guidance else latents_ori_img

	vq_in_edit = False
	if only_t2i==True:
	image_latents = torch.zeros_like(latents)[:,:8]
	elif only_t2i=="vqconcat":
	image_embeds_2d = image_embeds_8.reshape(batch_size* num_images_per_prompt,24,24,image_embeds_8.shape[-1]).permute(0,3,1,2)
	if ori_inp_img is not None and image_embeds_8.mean()!=0:
	vq_in_edit = True
	image_vq_latents = F.interpolate(image_embeds_2d, size=(height//8,width//8), mode='bilinear',align_corners=False).to(latents.device,latents.dtype)
	image_latents = torch.zeros_like(image_vq_latents)
	else:
	image_latents = F.interpolate(image_embeds_2d, size=(height//8,width//8), mode='bilinear',align_corners=False).to(latents.device,latents.dtype)

	negative_image_latents = torch.zeros_like(image_latents).to(prompt_embeds.dtype)
	image_latents = torch.cat([negative_image_latents,image_latents, image_latents], dim=0) if self.do_classifier_free_guidance else image_latents

	# 5. Prepare timesteps
	sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
	image_seq_len = latents.shape[1]
	mu = calculate_shift(
	image_seq_len,
	self.scheduler.config.get("base_image_seq_len", 256),
	self.scheduler.config.get("max_image_seq_len", 4096),
	self.scheduler.config.get("base_shift", 0.5),
	self.scheduler.config.get("max_shift", 1.15),
	)
	timesteps, num_inference_steps = retrieve_timesteps(
	self.scheduler,
	num_inference_steps,
	device,
	sigmas=sigmas,
	mu=mu,
	)
	num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
	self._num_timesteps = len(timesteps)

	self.scheduler.sigmas=self.scheduler.sigmas.to(latents.dtype) #hjc find bug
	# 6. Denoising loop
	with self.progress_bar(total=num_inference_steps) as progress_bar:
	for i, t in enumerate(timesteps):
	# compute whether apply classifier-free truncation on this timestep
	do_classifier_free_truncation = not ((i + 1) / num_inference_steps > cfg_trunc_ratio[0] and (i + 1) / num_inference_steps < cfg_trunc_ratio[1])
	img_do_classifier_free_truncation = not ((i + 1) / num_inference_steps > img_cfg_trunc_ratio[0] and (i + 1) / num_inference_steps < img_cfg_trunc_ratio[1])

	# reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image
	current_timestep = 1 - t / self.scheduler.config.num_train_timesteps

	latent_model_input = torch.cat([latents] * 3) if self.do_classifier_free_guidance else latents
	# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	current_timestep = current_timestep.expand(latent_model_input.shape[0])

	latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)

	ref_image_hidden_states = None
	if ori_inp_way=="seq":
	latents_ori_img_cat = torch.cat([latents_ori_img, image_latents], dim=1)
	latent_model_input = torch.cat([latent_model_input, latents_ori_img_cat], dim=2)
	elif ori_inp_way=="ref":
	latents_ori_img_cat = torch.cat([latents_ori_img, image_latents], dim=1)
	ref_image_hidden_states = latents_ori_img_cat[:,None]

	if ori_inp_way=="ref":
	noise_pred = self.transformer(
	hidden_states=latent_model_input,
	timestep=current_timestep,
	encoder_hidden_states=prompt_embeds,
	encoder_attention_mask=prompt_attention_mask,
	return_dict=False,ref_image_hidden_states=ref_image_hidden_states,
	attention_kwargs=self.attention_kwargs,
	)[0]
	else:
	noise_pred = self.transformer(
	hidden_states=latent_model_input,
	timestep=current_timestep,
	encoder_hidden_states=prompt_embeds,
	encoder_attention_mask=prompt_attention_mask,
	return_dict=False,
	attention_kwargs=self.attention_kwargs,
	)[0]
	if ori_inp_way=="seq":
	noise_pred = noise_pred[:,:,:height//8,:]

	if vq_in_edit:
	latent_model_vq_input = torch.cat([latents, image_vq_latents], dim=1)
	if ori_inp_way=="seq":
	latents_ori_img_cat_vq = torch.cat([torch.zeros_like(latents), image_vq_latents], dim=1)
	latent_model_vq_input = torch.cat([latent_model_vq_input, latents_ori_img_cat_vq], dim=2)

	noise_vq_pred = self.transformer(
	hidden_states=latent_model_vq_input,
	timestep=current_timestep[-1:],
	encoder_hidden_states=prompt_embeds[-1:],
	encoder_attention_mask=prompt_attention_mask[-1:],
	return_dict=False,
	attention_kwargs=self.attention_kwargs,
	)[0]
	if ori_inp_way=="seq":
	noise_vq_pred = noise_vq_pred[:,:,:height//8,:]
	# perform normalization-based guidance scale on a truncated timestep interval
	if self.do_classifier_free_guidance:
	noise_pred_uncond,noise_pred_img, noise_pred_text = noise_pred.chunk(3)
	if not do_classifier_free_truncation and not img_do_classifier_free_truncation:
	noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_img)+ img_guidance_scale * (noise_pred_img - noise_pred_uncond)
	elif not do_classifier_free_truncation and img_do_classifier_free_truncation:
	noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_img)+ 1 * (noise_pred_img - noise_pred_uncond)
	elif do_classifier_free_truncation and not img_do_classifier_free_truncation:
	noise_pred = noise_pred_uncond + 1 * (noise_pred_text - noise_pred_img)+ img_guidance_scale * (noise_pred_img - noise_pred_uncond)
	else:
	noise_pred = noise_pred_text
	if vq_in_edit:
	noise_pred = noise_pred +vq_guidance_scale*(noise_vq_pred-noise_pred_uncond)
	# apply normalization after classifier-free guidance
	if cfg_normalization:
	cond_norm = torch.norm(noise_pred_text, dim=-1, keepdim=True)
	noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
	noise_pred = noise_pred * (cond_norm / noise_norm)
	else:
	noise_pred = noise_pred

	# compute the previous noisy sample x_t -> x_t-1
	latents_dtype = latents.dtype
	noise_pred = -noise_pred
	latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

	if latents.dtype != latents_dtype:
	if torch.backends.mps.is_available():
	# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
	latents = latents.to(latents_dtype)

	if callback_on_step_end is not None:
	callback_kwargs = {}
	for k in callback_on_step_end_tensor_inputs:
	callback_kwargs[k] = locals()[k]
	callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

	latents = callback_outputs.pop("latents", latents)
	prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

	# call the callback, if provided
	if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
	progress_bar.update()

	if XLA_AVAILABLE:
	xm.mark_step()

	if not output_type == "latent":
	latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
	image = self.vae.decode(latents, return_dict=False)[0]
	image = self.image_processor.postprocess(image, output_type=output_type)
	else:
	image = latents

	# Offload all models
	self.maybe_free_model_hooks()

	if not return_dict:
	return (image,)

	return ImagePipelineOutput(images=image)