The running demo Space is driven by the Gradio app below:

```python
import os
import re

import gradio as gr
import soundfile as sf
import torch

from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # model repo on the HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the processor, acoustic model, and HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load the x-vector speaker embedding model
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)

# Load pre-computed speaker embeddings; fall back to random placeholders
# (a random embedding will not produce a consistent, natural voice)
male_embedding = torch.load("male_embedding.pt") if os.path.exists("male_embedding.pt") else torch.randn(1, 512)
female_embedding = torch.load("female_embedding.pt") if os.path.exists("female_embedding.pt") else torch.randn(1, 512)
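# A minimal sketch (an assumption, not part of the original app) of how the
# .pt embedding files could be produced from a reference recording using the
# x-vector model loaded above; "reference_male.wav" is a hypothetical
# 16 kHz mono file:
#
#   import torch.nn.functional as F
#   wav, _ = sf.read("reference_male.wav")
#   with torch.no_grad():
#       emb = speaker_model.encode_batch(torch.tensor(wav, dtype=torch.float32).unsqueeze(0))
#       emb = F.normalize(emb, dim=2).squeeze(1)  # shape (1, 512)
#   torch.save(emb.cpu(), "male_embedding.pt")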
# Text normalization
def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep word characters, whitespace, apostrophes, and Arabic-range characters
    text = re.sub(r"[^\w\s'\u0600-\u06FF]", "", text)
    # Collapse repeated whitespace
    text = " ".join(text.split())
    return text
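# Example: normalize_text("Salam, Kifach nta?") -> "salam kifach nta"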
# Synthesize speech from text
def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select the speaker embedding for the requested voice
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize the input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate the waveform (mel spectrogram decoded by the vocoder)
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )

        # Convert to a NumPy array and adjust speed if requested
        speech_np = speech.cpu().numpy()
        if speed != 1.0:
            # Naive resampling: changes duration and pitch together.
            # For production, use a proper resampling/time-stretch library.
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)
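            # A pitch-preserving alternative would be a time stretch, e.g.
            # (assuming librosa is available):
            #   speech_np = librosa.effects.time_stretch(speech_np, rate=speed)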
        # Write the waveform at the model's 16 kHz sample rate
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)
        return output_file, None
    except Exception as e:
        return None, f"Error generating speech: {e}"
# Custom CSS for the Gradio interface
custom_css = """
body, html {
    margin: 0;
    padding: 0;
    height: 100%;
    width: 100%;
    overflow-x: hidden;
}
.gradio-container {
    font-family: 'Montserrat', 'Arial', sans-serif !important;
    height: 100vh;
    width: 100vw;
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    display: flex;
    flex-direction: column;
    padding: 0;
    margin: 0;
    overflow-y: auto;
}
.main-header {
    background: linear-gradient(90deg, #d32f2f, #1976d2);
    color: white;
    padding: 2em;
    text-align: center;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15);
    border-bottom: 4px solid #ffffff33;
}
.main-header h1 {
    font-size: 2.8em;
    margin: 0;
    font-weight: 700;
    letter-spacing: 1px;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
.main-header p {
    font-size: 1.2em;
    margin: 0.5em 0 0;
    opacity: 0.9;
    font-weight: 300;
}
.container {
    max-width: 1200px;
    margin: 2em auto;
    padding: 0 1em;
    flex: 1;
}
.row {
    display: flex;
    gap: 2em;
    background: white;
    border-radius: 15px;
    padding: 2em;
    box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
    margin-bottom: 2em;
}
.column {
    flex: 1;
    padding: 1em;
}
.info-box {
    background: #fef6f6;
    border-left: 5px solid #d32f2f;
    padding: 1.5em;
    border-radius: 8px;
    margin-bottom: 1.5em;
    font-size: 1em;
    line-height: 1.6;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.textbox textarea {
    border: 2px solid #e0e0e0 !important;
    border-radius: 10px !important;
    padding: 1em !important;
    font-size: 1.1em !important;
    transition: border-color 0.3s ease !important;
}
.textbox textarea:focus {
    border-color: #d32f2f !important;
    box-shadow: 0 0 8px rgba(211, 47, 47, 0.2) !important;
}
.radio {
    display: flex;
    justify-content: center;
    gap: 1.5em;
    margin: 1em 0;
}
.radio label {
    background: #f5f5f5;
    padding: 0.8em 1.5em;
    border-radius: 25px;
    border: 2px solid #e0e0e0;
    cursor: pointer;
    transition: all 0.3s ease;
}
.radio input:checked + label {
    background: #d32f2f;
    color: white;
    border-color: #d32f2f;
    box-shadow: 0 4px 8px rgba(211, 47, 47, 0.2);
}
.slider {
    margin: 1.5em 0;
}
.slider input {
    accent-color: #d32f2f !important;
}
.button {
    background: linear-gradient(90deg, #d32f2f, #1976d2) !important;
    color: white !important;
    padding: 1em 2em !important;
    border-radius: 25px !important;
    border: none !important;
    font-size: 1.1em !important;
    font-weight: 600 !important;
    transition: transform 0.2s ease, box-shadow 0.3s ease !important;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15) !important;
}
.button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.25) !important;
}
.audio {
    margin-top: 1em;
}
.audio audio {
    width: 100%;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.example-header {
    font-weight: 600;
    color: #d32f2f;
    margin: 1.5em 0 0.5em;
    font-size: 1.2em;
}
ul {
    padding-left: 1.5em;
    color: #333;
}
li {
    margin: 0.5em 0;
    font-size: 1em;
}
.examples {
    margin-top: 1.5em;
    padding: 1em;
    background: #f9f9f9;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.05);
}
footer {
    text-align: center;
    padding: 1.5em;
    background: #ffffff;
    color: #666;
    font-size: 0.95em;
    border-top: 1px solid #e0e0e0;
    margin-top: auto;
}
.flag-icon {
    width: 30px;
    height: 30px;
    vertical-align: middle;
    margin-right: 10px;
}
"""
# Create the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <style>
        .flag {
            display: inline-block;
            width: 40px;
            height: 30px;
            background-image: url('https://flagcdn.com/w40/ma.png');
            background-size: cover;
            border-radius: 4px;
            margin-right: 8px;
            vertical-align: middle;
        }
        </style>
        <div class="main-header">
            <h1><span class="flag"></span>Moroccan Darija Text-to-Speech 🎙️</h1>
            <p>Transform your Darija text into lifelike speech with ease</p>
        </div>
        """
    )
    with gr.Row(elem_classes="row"):
        with gr.Column(elem_classes="column"):
            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna, bhal 'Salam, kifach nta?'...",
                lines=3,
                elem_classes="textbox",
            )
            with gr.Row(elem_classes="radio"):
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male",
                )
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speech Speed",
                elem_classes="slider",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary", elem_classes="button")
        with gr.Column(elem_classes="column"):
            audio_output = gr.Audio(label="Generated Speech", elem_classes="audio")
            # Hidden textbox that receives any error message from synthesize_speech
            error_output = gr.Textbox(label="Error (if any)", visible=False)

    gr.Examples(
        examples=[
            ["Ana Nadi Bezzaaf hhh", "male", 1.0],
            ["Lyoum ajwaa zwina bezzaaf.", "female", 1.0],
            ["Lmaghrib ahssan blad fi l3alam", "male", 1.0],
            ["Filistine horaa mina lbari ila lbarri", "female", 0.8],
        ],
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output],
        fn=synthesize_speech,
    )
    gr.HTML(
        """
        <footer>
            <p>Developed by HAMMALE | Data: DODa Audio Dataset | AtlasAI</p>
        </footer>
        """
    )
    # Wire the button to the synthesis function (must be inside the Blocks context)
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output],
    )
# Launch the demo
if __name__ == "__main__":
    demo.launch()
```
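
To run the app locally, a plausible `requirements.txt`, inferred from the imports above (an assumption, not the Space's actual pinned file), would be:

```text
torch
transformers
soundfile
speechbrain
scipy
gradio
```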