# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md
import os
import subprocess
import time
import json

import torch
from tqdm import tqdm
import soundfile as sf
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from cog import BasePredictor, Input, Path

MODEL_URL = "https://weights.replicate.delivery/default/declare-lab/tango.tar"
MODEL_CACHE = "tango_weights"

def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    # pget -x downloads the tar archive and extracts it into `dest` in one step.
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)

class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)
        # Keep both variants resident so `model` can be switched per request
        # without reloading weights.
        self.models = {k: Tango(name=k) for k in ["tango2", "tango2-full"]}

    def predict(
        self,
        prompt: str = Input(
            description="Input prompt",
            default="Quiet speech and then an airplane flying away",
        ),
        model: str = Input(
            description="Choose a model",
            choices=[
                "tango2",
                "tango2-full",
            ],
            default="tango2",
        ),
        steps: int = Input(description="Number of inference steps", default=100),
        guidance: float = Input(description="Guidance scale", default=3),
    ) -> Path:
| """Run a single prediction on the model""" | |
| tango = self.models[model] | |
| audio = tango.generate(prompt, steps, guidance) | |
| out = "/tmp/output.wav" | |
| sf.write(out, audio, samplerate=16000) | |
| return Path(out) | |
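
# A hedged usage sketch (assumptions: the Cog CLI is installed and this file is
# wired up as the predictor in cog.yaml; the prompt below is illustrative only):
#
#   cog predict -i prompt="A dog barking in the distance" -i model=tango2 \
#       -i steps=100 -i guidance=3
#
# The generated waveform is written to /tmp/output.wav and returned as a Path.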

class Tango:
    def __init__(self, name="tango2", path=MODEL_CACHE, device="cuda:0"):
        # Weights are downloaded from https://huggingface.co/declare-lab/{name}/tree/main
        # and saved to MODEL_CACHE.
        vae_config = json.load(open(f"{path}/{name}/vae_config.json"))
        stft_config = json.load(open(f"{path}/{name}/stft_config.json"))
        main_config = json.load(open(f"{path}/{name}/main_config.json"))

        # Instantiate the VAE, the STFT front-end, and the latent diffusion model
        # from their configs, then load the matching pretrained weights.
        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        vae_weights = torch.load(
            f"{path}/{name}/pytorch_model_vae.bin", map_location=device
        )
        stft_weights = torch.load(
            f"{path}/{name}/pytorch_model_stft.bin", map_location=device
        )
        main_weights = torch.load(
            f"{path}/{name}/pytorch_model_main.bin", map_location=device
        )

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)

        # Inference only.
        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(
            main_config["scheduler_name"], subfolder="scheduler"
        )

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from a list."""
        for i in range(0, len(lst), n):
            yield lst[i : i + n]

    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
        """Generate audio for a single prompt string."""
        with torch.no_grad():
            latents = self.model.inference(
                [prompt],
                self.scheduler,
                steps,
                guidance,
                samples,
                disable_progress=disable_progress,
            )
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave[0]

    def generate_for_batch(
        self,
        prompts,
        steps=100,
        guidance=3,
        samples=1,
        batch_size=8,
        disable_progress=True,
    ):
        """Generate audio for a list of prompt strings."""
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k : k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(
                    batch,
                    self.scheduler,
                    steps,
                    guidance,
                    samples,
                    disable_progress=disable_progress,
                )
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
                outputs += [item for item in wave]
        if samples == 1:
            return outputs
        else:
            return list(self.chunks(outputs, samples))
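

# A minimal local-usage sketch, assuming the weights have already been downloaded
# to MODEL_CACHE and a CUDA device is available; the prompt and output filename
# are illustrative only.
if __name__ == "__main__":
    tango = Tango(name="tango2")
    wav = tango.generate("Rolling thunder with some lightning strikes", steps=100, guidance=3)
    sf.write("sample.wav", wav, samplerate=16000)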