import torch
from PIL import Image
import librosa
from diffsynth import VideoData, save_video_with_audio
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig, WanVideoUnit_S2V
from modelscope import dataset_snapshot_download


def speech_to_video(
    prompt,
    input_image,
    audio_path,
    negative_prompt="",
    num_clip=None,
    audio_sample_rate=16000,
    pose_video_path=None,
    infer_frames=80,
    height=448,
    width=832,
    num_inference_steps=40,
    fps=16,  # fixing fps at 16 is recommended for S2V
    motion_frames=73,  # motion-context length, a Wan2.2-S2V hyperparameter
    save_path=None,
):
    """Generate a video clip-by-clip from an image, an audio track, and an optional pose video."""
    # S2V audio input; a 16 kHz sampling rate is recommended.
    input_audio, sample_rate = librosa.load(audio_path, sr=audio_sample_rate)
    # S2V uses the first num_frames frames of the pose video as reference.
    # Its height and width must match input_image, and its fps should be 16,
    # matching the output video fps.
    pose_video = VideoData(pose_video_path, height=height, width=width) if pose_video_path is not None else None
    audio_embeds, pose_latents, num_repeat = WanVideoUnit_S2V.pre_calculate_audio_pose(
        pipe=pipe,  # module-level pipeline defined below
        input_audio=input_audio,
        audio_sample_rate=sample_rate,
        s2v_pose_video=pose_video,
        num_frames=infer_frames + 1,
        height=height,
        width=width,
        fps=fps,
    )
    num_repeat = min(num_repeat, num_clip) if num_clip is not None else num_repeat
    print(f"Generating {num_repeat} video clips...")
    motion_videos = []
    video = []
    for r in range(num_repeat):
        s2v_pose_latents = pose_latents[r] if pose_latents is not None else None
        current_clip = pipe(
            prompt=prompt,
            input_image=input_image,
            negative_prompt=negative_prompt,
            seed=0,
            num_frames=infer_frames + 1,
            height=height,
            width=width,
            audio_embeds=audio_embeds[r],
            s2v_pose_latents=s2v_pose_latents,
            motion_video=motion_videos,
            num_inference_steps=num_inference_steps,
        )
        current_clip = current_clip[-infer_frames:]
        if r == 0:
            current_clip = current_clip[3:]
        # Keep the last motion_frames frames as motion context for the next clip.
        overlap_frames_num = min(motion_frames, len(current_clip))
        motion_videos = motion_videos[overlap_frames_num:] + current_clip[-overlap_frames_num:]
        video.extend(current_clip)
        # Progressively overwrite the output file with all frames generated so far.
        save_video_with_audio(video, save_path, audio_path, fps=fps, quality=5)
        print(f"Processed clip {r + 1} of {num_repeat}.")
    return video


pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/model.safetensors"),
        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="Wan2.1_VAE.pth"),
    ],
    audio_processor_config=ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/"),
)

dataset_snapshot_download(
    dataset_id="DiffSynth-Studio/example_video_dataset",
    local_dir="./data/example_video_dataset",
    allow_file_pattern="wans2v/*",
)

infer_frames = 80  # must be a multiple of 4
height = 448
width = 832
prompt = "a person is singing"
# Standard Wan negative prompt, kept in Chinese (the model's training language).
# English gloss: "blurry, worst quality, unclear details, violent agitation,
# rapidly shaking hands, subtitles, ugly, mutilated, extra fingers, poorly drawn
# hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers,
# static frame, cluttered background, three legs, many people in the background,
# walking backwards"
negative_prompt = "画面模糊,最差质量,画面模糊,细节模糊不清,情绪激动剧烈,手快速抖动,字幕,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
input_image = Image.open("data/example_video_dataset/wans2v/pose.png").convert("RGB").resize((width, height))
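# Optional sketch, not part of the original script: on GPUs with limited VRAM,
# DiffSynth-Studio pipelines offer VRAM management / offloading. The call below
# is an assumption based on other DiffSynth-Studio examples; uncomment it only
# if you hit out-of-memory errors on your hardware.
# pipe.enable_vram_management()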
save_path="video_with_audio_full.mp4", infer_frames=infer_frames, height=height, width=width, ) # num_clip means generating only the first n clips with n * infer_frames frames. video_with_audio_pose = speech_to_video( prompt=prompt, input_image=input_image, audio_path='data/example_video_dataset/wans2v/sing.MP3', negative_prompt=negative_prompt, pose_video_path='data/example_video_dataset/wans2v/pose.mp4', save_path="video_with_audio_pose_clip_2.mp4", num_clip=2 )