Spaces:

Chrishugs
/

dia-tts-nari

Sleeping

App Files Files Community

dia-tts-nari / app.py

Chrishugs

Upload 10 files

164e442 verified 4 months ago

raw

history blame

5.88 kB

	import gradio as gr
	import torch
	import numpy as np
	import tempfile
	import os
	import logging
	from typing import Optional, Tuple

	# Configure logging: install a root handler at INFO level and create this
	# module's logger, which the functions below use for progress and errors.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Shared Dia model instance. It starts as None; load_dia_model() assigns it
	# on success, and generate_speech() returns an error message while it is
	# still None.
	model = None

	def load_dia_model():
	    """Load the Dia text-to-speech model into the module-level ``model``.

	    Uses the CUDA device with ``float16`` when ``torch.cuda.is_available()``
	    is true, and the CPU with ``float32`` otherwise. ``dia`` is imported
	    inside the ``try`` so that a missing package is reported as a load
	    failure instead of raising out of this function.

	    Returns:
	        True if the model was loaded and stored in ``model``; False if any
	        exception occurred (it is logged with its traceback and ``model``
	        keeps its previous value).
	    """
	    global model
	    try:
	        logger.info("Loading Dia model...")
	        from dia import Dia

	        # Query CUDA once so device and dtype are always chosen consistently.
	        use_cuda = torch.cuda.is_available()
	        device = torch.device("cuda" if use_cuda else "cpu")
	        compute_dtype = "float16" if use_cuda else "float32"

	        model = Dia.from_pretrained(
	            "nari-labs/Dia-1.6B-0626",
	            device=device,
	            compute_dtype=compute_dtype
	        )
	        logger.info(f"Dia model loaded successfully on {device}")
	        return True
	    except Exception as e:
	        # Startup boundary: the app must still come up without a model, so
	        # the error is swallowed here -- but keep the traceback in the log,
	        # otherwise a failed load is very hard to diagnose.
	        logger.exception(f"Failed to load Dia model: {e}")
	        return False

	def generate_speech(
	    text: str,
	    max_tokens: int = 3072,
	    temperature: float = 0.7,
	    top_p: float = 0.9
	) -> Tuple[Optional[str], str]:
	    """Generate speech from text with the loaded Dia model.

	    Args:
	        text: Text to synthesize; surrounding whitespace is stripped before
	            it is passed to the model.
	        max_tokens: Forwarded to ``model.generate`` as ``max_tokens``.
	        temperature: Forwarded to ``model.generate`` as ``temperature``.
	        top_p: Forwarded to ``model.generate`` as ``top_p``.

	    Returns:
	        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
	        of a newly written ``.wav`` temp file on success, and ``None`` when
	        the text is empty, the model is not loaded, or generation fails.
	        This function never raises for those cases; the reason is carried in
	        ``status_message``.
	    """

	    if not text or not text.strip():
	        return None, "❌ Please enter some text to convert to speech"

	    if model is None:
	        return None, "❌ Model not loaded. Please refresh the page and try again."

	    try:
	        logger.info(f"Generating speech for text: {text[:50]}...")

	        # Generate audio using Dia model
	        audio_array = model.generate(
	            text=text.strip(),
	            max_tokens=max_tokens,
	            temperature=temperature,
	            top_p=top_p
	        )

	        # Reserve a unique .wav path and close our own handle *before* the
	        # model writes to it: writing by name while the handle is still open
	        # fails on platforms that do not allow reopening an open temp file.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
	            wav_path = temp_file.name

	        try:
	            model.save_audio(wav_path, audio_array)
	        except Exception:
	            # delete=False means nothing else removes the file, so clean up
	            # the empty/partial output before reporting the failure.
	            try:
	                os.remove(wav_path)
	            except OSError:
	                pass  # best-effort cleanup; the original error matters more
	            raise

	        logger.info("Speech generation completed successfully")
	        return wav_path, f"✅ Generated speech for: '{text[:50]}{'...' if len(text) > 50 else ''}'"

	    except Exception as e:
	        # UI boundary: turn any failure into a status message for the user,
	        # while logging the full traceback for the operator.
	        error_msg = f"❌ Error generating speech: {str(e)}"
	        logger.exception(error_msg)
	        return None, error_msg

	# Load the model once at import time. load_dia_model() returns True on
	# success and False on any failure; the flag is used below to decide
	# whether the UI shows a "model failed to load" warning.
	model_loaded = load_dia_model()

	# ---------------------------------------------------------------------------
	# Gradio interface
	#
	# The long string literals are named up front so the layout code below reads
	# as a plain outline of the page.
	# ---------------------------------------------------------------------------

	# Page-level CSS passed to gr.Blocks.
	_PAGE_CSS = """
	.gradio-container {
	max-width: 800px !important;
	margin: auto !important;
	}
	"""

	# Heading shown at the top of the page.
	_INTRO_MD = """
	# 🎙️ Dia TTS - Nari Voice Generator

	Convert your text into natural, human-like speech using the advanced Dia text-to-speech model.

	Model: `nari-labs/Dia-1.6B-0626`
	"""

	# Footer with usage tips and technical details.
	_FOOTER_MD = """
	---

	### 📚 Usage Tips:
	- Max Tokens: Controls the length of generated audio (higher = longer)
	- Temperature: Controls randomness (0.1 = conservative, 1.0 = creative)
	- Top P: Controls diversity of word selection (0.1 = focused, 1.0 = diverse)

	### ⚙️ Technical Details:
	- Model: Dia-1.6B-0626 by Nari Labs
	- Output Format: WAV audio
	- Recommended Text Length: 50-500 characters for best results
	"""

	with gr.Blocks(title="Dia TTS - Nari Voice Generator", theme=gr.themes.Soft(), css=_PAGE_CSS) as demo:

	    gr.Markdown(_INTRO_MD)

	    # Surface a startup failure in the page itself, not only in the logs.
	    if not model_loaded:
	        gr.Markdown("⚠️ Warning: Model failed to load. Some functionality may not work.")

	    with gr.Row():
	        # Left column: text box, generation settings and the trigger button.
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="📝 Text Input",
	                placeholder="Enter the text you want to convert to speech...",
	                lines=4,
	                max_lines=10,
	            )

	            with gr.Row():
	                max_tokens = gr.Slider(minimum=512, maximum=4096, value=3072, step=128, label="🎯 Max Tokens")
	                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="🌡️ Temperature")
	                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="🎲 Top P")

	            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

	        # Right column: generated audio plus the status message.
	        with gr.Column():
	            audio_output = gr.Audio(label="🔊 Generated Speech", type="filepath")
	            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)

	    # Clicking the button runs generate_speech on the four inputs and routes
	    # its (audio path, status) result to the two output components.
	    generate_btn.click(
	        fn=generate_speech,
	        inputs=[text_input, max_tokens, temperature, top_p],
	        outputs=[audio_output, status_output],
	        show_progress=True,
	    )

	    # Example rows: text, max tokens, temperature, top-p (same order as inputs).
	    gr.Examples(
	        examples=[
	            ["Transform your text into natural, human-like speech with our advanced AI technology.", 3072, 0.7, 0.9],
	            ["The quick brown fox jumps over the lazy dog. This is a test of the Dia text-to-speech system.", 2048, 0.8, 0.9],
	            ["Welcome to the future of voice synthesis. Experience the power of AI-generated speech.", 3072, 0.6, 0.8],
	        ],
	        inputs=[text_input, max_tokens, temperature, top_p],
	        outputs=[audio_output, status_output],
	        fn=generate_speech,
	        cache_examples=False,
	    )

	    gr.Markdown(_FOOTER_MD)

	if __name__ == "__main__":
	    # Launch settings gathered in one place: fixed host/port, no public
	    # share link, errors shown to the user, normal console output.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	        "quiet": False,
	    }
	    demo.launch(**launch_options)