The running demo Space is driven by the Gradio app below:

```python
import os
import re

import gradio as gr
import soundfile as sf
import torch

from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # model repo on the HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the processor, acoustic model, and HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load the x-vector speaker embedding model
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)

# Load pre-computed speaker embeddings; fall back to random placeholders
# (a random embedding will not produce a consistent, natural voice)
male_embedding = torch.load("male_embedding.pt") if os.path.exists("male_embedding.pt") else torch.randn(1, 512)
female_embedding = torch.load("female_embedding.pt") if os.path.exists("female_embedding.pt") else torch.randn(1, 512)
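# A minimal sketch (an assumption, not part of the original app) of how the
# .pt embedding files could be produced from a reference recording using the
# x-vector model loaded above; "reference_male.wav" is a hypothetical
# 16 kHz mono file:
#
#   import torch.nn.functional as F
#   wav, _ = sf.read("reference_male.wav")
#   with torch.no_grad():
#       emb = speaker_model.encode_batch(torch.tensor(wav, dtype=torch.float32).unsqueeze(0))
#       emb = F.normalize(emb, dim=2).squeeze(1)  # shape (1, 512)
#   torch.save(emb.cpu(), "male_embedding.pt")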
# Text normalization
def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep word characters, whitespace, apostrophes, and Arabic-range characters
    text = re.sub(r"[^\w\s'\u0600-\u06FF]", "", text)
    # Collapse repeated whitespace
    text = " ".join(text.split())
    return text
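# Example: normalize_text("Salam, Kifach nta?") -> "salam kifach nta"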
# Synthesize speech from text
def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select the speaker embedding for the requested voice
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize the input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate the waveform (mel spectrogram decoded by the vocoder)
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )

        # Convert to a NumPy array and adjust speed if requested
        speech_np = speech.cpu().numpy()
        if speed != 1.0:
            # Naive resampling: changes duration and pitch together.
            # For production, use a proper resampling/time-stretch library.
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)
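            # A pitch-preserving alternative would be a time stretch, e.g.
            # (assuming librosa is available):
            #   speech_np = librosa.effects.time_stretch(speech_np, rate=speed)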
        # Write the waveform at the model's 16 kHz sample rate
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)
        return output_file, None
    except Exception as e:
        return None, f"Error generating speech: {e}"
# Custom CSS for the Gradio interface
custom_css = """
body, html {
    margin: 0;
    padding: 0;
    height: 100%;
    width: 100%;
    overflow-x: hidden;
}
.gradio-container {
    font-family: 'Montserrat', 'Arial', sans-serif !important;
    height: 100vh;
    width: 100vw;
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    display: flex;
    flex-direction: column;
    padding: 0;
    margin: 0;
    overflow-y: auto;
}
.main-header {
    background: linear-gradient(90deg, #d32f2f, #1976d2);
    color: white;
    padding: 2em;
    text-align: center;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    border-bottom: 4px solid #ffffff33;
}
.main-header h1 {
    font-size: 2.8em;
    margin: 0;
    font-weight: 700;
    letter-spacing: 1px;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
.main-header p {
    font-size: 1.2em;
    margin: 0.5em 0 0;
    opacity: 0.9;
    font-weight: 300;
}
.container {
    max-width: 1200px;
    margin: 2em auto;
    padding: 0 1em;
    flex: 1;
}
.row {
    display: flex;
    gap: 2em;
    background: white;
    border-radius: 15px;
    padding: 2em;
    box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
    margin-bottom: 2em;
}
.column {
    flex: 1;
    padding: 1em;
}
.info-box {
    background: #fef6f6;
    border-left: 5px solid #d32f2f;
    padding: 1.5em;
    border-radius: 8px;
    margin-bottom: 1.5em;
    font-size: 1em;
    line-height: 1.6;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.textbox textarea {
    border: 2px solid #e0e0e0 !important;
    border-radius: 10px !important;
    padding: 1em !important;
    font-size: 1.1em !important;
    transition: border-color 0.3s ease !important;
}
.textbox textarea:focus {
    border-color: #d32f2f !important;
    box-shadow: 0 0 8px rgba(211, 47, 47, 0.2) !important;
}
.radio {
    display: flex;
    justify-content: center;
    gap: 1.5em;
    margin: 1em 0;
}
.radio label {
    background: #f5f5f5;
    padding: 0.8em 1.5em;
    border-radius: 25px;
    border: 2px solid #e0e0e0;
    cursor: pointer;
    transition: all 0.3s ease;
}
.radio input:checked + label {
    background: #d32f2f;
    color: white;
    border-color: #d32f2f;
    box-shadow: 0 4px 8px rgba(211, 47, 47, 0.2);
}
.slider {
    margin: 1.5em 0;
}
.slider input {
    accent-color: #d32f2f !important;
}
.button {
    background: linear-gradient(90deg, #d32f2f, #1976d2) !important;
    color: white !important;
    padding: 1em 2em !important;
    border-radius: 25px !important;
    border: none !important;
    font-size: 1.1em !important;
    font-weight: 600 !important;
    transition: transform 0.2s ease, box-shadow 0.3s ease !important;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important;
}
.button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.25) !important;
}
.audio {
    margin-top: 1em;
}
.audio audio {
    width: 100%;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.example-header {
    font-weight: 600;
    color: #d32f2f;
    margin: 1.5em 0 0.5em;
    font-size: 1.2em;
}
ul {
    padding-left: 1.5em;
    color: #333;
}
li {
    margin: 0.5em 0;
    font-size: 1em;
}
.examples {
    margin-top: 1.5em;
    padding: 1em;
    background: #f9f9f9;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05);
}
footer {
    text-align: center;
    padding: 1.5em;
    background: #ffffff;
    color: #666;
    font-size: 0.95em;
    border-top: 1px solid #e0e0e0;
    margin-top: auto;
}
.flag-icon {
    width: 30px;
    height: 30px;
    vertical-align: middle;
    margin-right: 10px;
}
"""
# Create the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <style>
        .flag {
            display: inline-block;
            width: 40px;
            height: 30px;
            background-image: url('https://flagcdn.com/w40/ma.png');
            background-size: cover;
            border-radius: 4px;
            margin-right: 8px;
            vertical-align: middle;
        }
        </style>
        <div class="main-header">
            <h1><span class="flag"></span>Moroccan Darija Text-to-Speech 🎙️</h1>
            <p>Transform your Darija text into lifelike speech with ease</p>
        </div>
        """
    )
    with gr.Row(elem_classes="row"):
        with gr.Column(elem_classes="column"):
            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna, bhal 'Salam, kifach nta?'...",
                lines=3,
                elem_classes="textbox",
            )
            with gr.Row(elem_classes="radio"):
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male",
                )
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speech Speed",
                elem_classes="slider",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary", elem_classes="button")
        with gr.Column(elem_classes="column"):
            audio_output = gr.Audio(label="Generated Speech", elem_classes="audio")
            # Hidden textbox that receives any error message from synthesize_speech
            error_output = gr.Textbox(label="Error (if any)", visible=False)

    gr.Examples(
        examples=[
            ["Ana Nadi Bezzaaf hhh", "male", 1.0],
            ["Lyoum ajwaa zwina bezzaaf.", "female", 1.0],
            ["Lmaghrib ahssan blad fi l3alam", "male", 1.0],
            ["Filistine horaa mina lbari ila lbarri", "female", 0.8],
        ],
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output],
        fn=synthesize_speech,
    )
    gr.HTML(
        """
        <footer>
            <p>Developed by HAMMALE | Data: DODa Audio Dataset | AtlasAI</p>
        </footer>
        """
    )
    # Wire the button to the synthesis function (must be inside the Blocks context)
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output],
    )
# Launch the demo
if __name__ == "__main__":
    demo.launch()
```
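
To run the app locally, a plausible `requirements.txt`, inferred from the imports above (an assumption, not the Space's actual pinned file), would be:

```text
torch
transformers
soundfile
speechbrain
scipy
gradio
```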