""" REST API Server for Multi-lingual TTS FastAPI-based server with OpenAPI documentation Hackathon API Specification: - GET /Get_Inference with text, lang, speaker_wav parameters """ import os import io import time import logging import tempfile from typing import Optional, List from pathlib import Path import numpy as np from fastapi import ( FastAPI, HTTPException, Query, Response, BackgroundTasks, UploadFile, File, ) from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse, FileResponse, JSONResponse from pydantic import BaseModel, Field import soundfile as sf from .engine import TTSEngine, TTSOutput from .config import ( LANGUAGE_CONFIGS, get_available_languages, get_available_voices, STYLE_PRESETS, ) # Language name to voice key mapping (for hackathon API) LANG_TO_VOICE = { "hindi": "hi_female", "bengali": "bn_female", "marathi": "mr_female", "telugu": "te_female", "kannada": "kn_female", "bhojpuri": "bho_female", "chhattisgarhi": "hne_female", "maithili": "mai_female", "magahi": "mag_female", "english": "en_female", "gujarati": "gu_mms", } # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Initialize FastAPI app app = FastAPI( title="Voice Tech for All - Multi-lingual TTS API", description=""" A multi-lingual Text-to-Speech API supporting 10+ Indian languages. ## Features - 10 Indian languages with male/female voices - Real-time speech synthesis - Text normalization for Indian languages - Speed control - Multiple audio formats (WAV, MP3) ## Supported Languages Hindi, Bengali, Marathi, Telugu, Kannada, Bhojpuri, Chhattisgarhi, Maithili, Magahi, English ## Use Case Built for an LLM-based healthcare assistant for pregnant mothers in low-income communities. """, version="1.0.0", contact={ "name": "Voice Tech for All Hackathon", "url": "https://huggingface.co/SYSPIN", }, license_info={ "name": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/", }, ) # CORS middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Initialize TTS Engine (lazy loading) _engine: Optional[TTSEngine] = None def get_engine() -> TTSEngine: """Get or create TTS engine instance""" global _engine if _engine is None: _engine = TTSEngine(device="auto") return _engine # Request/Response Models class SynthesizeRequest(BaseModel): """Request body for text synthesis""" text: str = Field( ..., description="Text to synthesize", min_length=1, max_length=5000 ) voice: str = Field( "hi_male", description="Voice key (e.g., hi_male, bn_female, gu_mms)" ) speed: float = Field(1.0, description="Speech speed (0.5-2.0)", ge=0.5, le=2.0) pitch: float = Field(1.0, description="Pitch multiplier (0.5-2.0)", ge=0.5, le=2.0) energy: float = Field(1.0, description="Energy/volume (0.5-2.0)", ge=0.5, le=2.0) style: Optional[str] = Field( None, description="Style preset (happy, sad, calm, excited, etc.)" ) normalize: bool = Field(True, description="Apply text normalization") class Config: schema_extra = { "example": { "text": "નમસ્તે, હું તમારી કેવી રીતે મદદ કરી શકું?", "voice": "gu_mms", "speed": 1.0, "pitch": 1.0, "energy": 1.0, "style": "calm", "normalize": True, } } class SynthesizeResponse(BaseModel): """Response metadata for synthesis""" success: bool duration: float sample_rate: int voice: str text: str inference_time: float class VoiceInfo(BaseModel): """Information about a voice""" key: str name: str language_code: str gender: str loaded: bool downloaded: bool model_type: str = "vits" class HealthResponse(BaseModel): """Health check response""" status: str device: str loaded_voices: List[str] available_voices: int style_presets: List[str] # API Endpoints @app.get("/", response_class=JSONResponse) async def root(): """API root - welcome message""" return { "message": "Voice Tech for All - Multi-lingual TTS API", "docs": "/docs", "health": "/health", "synthesize": "/synthesize", } @app.get("/health", response_model=HealthResponse) async def health_check(): """Health check endpoint""" engine = get_engine() return HealthResponse( status="healthy", device=str(engine.device), loaded_voices=engine.get_loaded_voices(), available_voices=len(LANGUAGE_CONFIGS), style_presets=list(STYLE_PRESETS.keys()), ) @app.get("/voices", response_model=List[VoiceInfo]) async def list_voices(): """List all available voices""" engine = get_engine() voices = engine.get_available_voices() return [ VoiceInfo( key=key, name=info["name"], language_code=info["code"], gender=info["gender"], loaded=info["loaded"], downloaded=info["downloaded"], model_type=info.get("type", "vits"), ) for key, info in voices.items() ] @app.get("/styles") async def list_styles(): """List available style presets for prosody control""" return { "presets": STYLE_PRESETS, "description": { "speed": "Speech rate multiplier (0.5-2.0)", "pitch": "Pitch multiplier (0.5-2.0), >1 = higher", "energy": "Volume/energy multiplier (0.5-2.0)", }, } @app.get("/languages") async def list_languages(): """List supported languages""" return get_available_languages() @app.post("/synthesize", response_class=Response) async def synthesize_audio(request: SynthesizeRequest): """ Synthesize speech from text Returns WAV audio file directly """ engine = get_engine() # Validate voice if request.voice not in LANGUAGE_CONFIGS: raise HTTPException( status_code=400, detail=f"Unknown voice: {request.voice}. Use /voices to see available options.", ) try: start_time = time.time() # Synthesize output = engine.synthesize( text=request.text, voice=request.voice, speed=request.speed, pitch=request.pitch, energy=request.energy, style=request.style, normalize_text=request.normalize, ) inference_time = time.time() - start_time # Convert to WAV bytes buffer = io.BytesIO() sf.write(buffer, output.audio, output.sample_rate, format="WAV") buffer.seek(0) # Return audio with metadata headers return Response( content=buffer.read(), media_type="audio/wav", headers={ "X-Duration": str(output.duration), "X-Sample-Rate": str(output.sample_rate), "X-Voice": output.voice, "X-Style": output.style or "default", "X-Inference-Time": str(inference_time), }, ) except Exception as e: logger.error(f"Synthesis error: {e}") raise HTTPException(status_code=500, detail=str(e)) @app.post("/synthesize/stream") async def synthesize_stream(request: SynthesizeRequest): """ Synthesize speech and stream the audio Returns streaming WAV audio """ engine = get_engine() if request.voice not in LANGUAGE_CONFIGS: raise HTTPException(status_code=400, detail=f"Unknown voice: {request.voice}") try: output = engine.synthesize( text=request.text, voice=request.voice, speed=request.speed, pitch=request.pitch, energy=request.energy, style=request.style, normalize_text=request.normalize, ) # Create streaming response buffer = io.BytesIO() sf.write(buffer, output.audio, output.sample_rate, format="WAV") buffer.seek(0) return StreamingResponse( buffer, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=speech.wav"}, ) except Exception as e: logger.error(f"Streaming error: {e}") raise HTTPException(status_code=500, detail=str(e)) @app.get("/synthesize/get") async def synthesize_get( text: str = Query( ..., description="Text to synthesize", min_length=1, max_length=1000 ), voice: str = Query("hi_male", description="Voice key"), speed: float = Query(1.0, description="Speech speed", ge=0.5, le=2.0), pitch: float = Query(1.0, description="Pitch", ge=0.5, le=2.0), energy: float = Query(1.0, description="Energy", ge=0.5, le=2.0), style: Optional[str] = Query(None, description="Style preset"), ): """ GET endpoint for simple synthesis Useful for testing and simple integrations """ request = SynthesizeRequest( text=text, voice=voice, speed=speed, pitch=pitch, energy=energy, style=style ) return await synthesize_audio(request) @app.api_route("/Get_Inference", methods=["GET", "POST"]) async def get_inference( text: str = Query( ..., description="The input text to be converted into speech. For English, text must be lowercase.", ), lang: str = Query( ..., description="Language of input text. Supported: bhojpuri, bengali, english, gujarati, hindi, chhattisgarhi, kannada, magahi, maithili, marathi, telugu", ), speaker_wav: UploadFile = File( ..., description="A reference WAV file representing the speaker's voice (mandatory per hackathon spec).", ), ): """ Hackathon API - Generate speech audio from text This endpoint follows the Voice Tech for All hackathon specification. Supports both GET and POST methods with multipart form data. Parameters: - text: Input text to synthesize (query param) - lang: Language (query param) - bhojpuri, bengali, english, gujarati, hindi, chhattisgarhi, kannada, magahi, maithili, marathi, telugu - speaker_wav: Reference WAV file (multipart file upload, mandatory) Returns: - 200 OK: WAV audio file as streaming response """ engine = get_engine() # Normalize language name lang_lower = lang.lower().strip() # Enforce lowercase for English text (per spec) if lang_lower == "english": text = text.lower() # Map language to voice if lang_lower not in LANG_TO_VOICE: supported = list(LANG_TO_VOICE.keys()) raise HTTPException( status_code=400, detail=f"Unsupported language: {lang}. Supported languages: {', '.join(supported)}", ) voice = LANG_TO_VOICE[lang_lower] # Read speaker_wav (mandatory per spec) # Note: Current VITS models don't support voice cloning, but we accept the file # for API compatibility and validation. In future, this could be used for voice adaptation. try: speaker_audio_bytes = await speaker_wav.read() logger.info( f"Received speaker reference WAV: {len(speaker_audio_bytes)} bytes, filename: {speaker_wav.filename}" ) # Validate it's a valid audio file (basic check) if len(speaker_audio_bytes) < 44: # Minimum WAV header size raise HTTPException( status_code=400, detail="Invalid speaker_wav: file too small to be a valid WAV", ) except HTTPException: raise except Exception as e: logger.error(f"Could not read speaker_wav: {e}") raise HTTPException( status_code=400, detail=f"Failed to read speaker_wav file: {str(e)}" ) try: # Synthesize audio output = engine.synthesize( text=text, voice=voice, speed=1.0, normalize_text=True, ) # Convert to WAV bytes buffer = io.BytesIO() sf.write(buffer, output.audio, output.sample_rate, format="WAV") buffer.seek(0) # Return as streaming response (per spec) return StreamingResponse( buffer, media_type="audio/wav", headers={ "Content-Disposition": "attachment; filename=output.wav", "X-Duration": str(output.duration), "X-Sample-Rate": str(output.sample_rate), "X-Language": lang, "X-Voice": voice, }, ) except Exception as e: logger.error(f"Synthesis error: {e}") raise HTTPException(status_code=500, detail=str(e)) @app.post("/preload") async def preload_voice(voice: str): """Preload a voice model into memory""" engine = get_engine() if voice not in LANGUAGE_CONFIGS: raise HTTPException(status_code=400, detail=f"Unknown voice: {voice}") try: engine.load_voice(voice) return {"message": f"Voice {voice} loaded successfully"} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @app.post("/unload") async def unload_voice(voice: str): """Unload a voice model from memory""" engine = get_engine() engine.unload_voice(voice) return {"message": f"Voice {voice} unloaded"} @app.post("/batch") async def batch_synthesize( texts: List[str], voice: str = "hi_male", speed: float = 1.0 ): """ Synthesize multiple texts Returns a list of base64-encoded audio """ import base64 engine = get_engine() if voice not in LANGUAGE_CONFIGS: raise HTTPException(status_code=400, detail=f"Unknown voice: {voice}") results = [] for text in texts: output = engine.synthesize(text, voice, speed) buffer = io.BytesIO() sf.write(buffer, output.audio, output.sample_rate, format="WAV") buffer.seek(0) results.append( { "text": text, "audio_base64": base64.b64encode(buffer.read()).decode(), "duration": output.duration, } ) return results # Startup/Shutdown events @app.on_event("startup") async def startup_event(): """Initialize on startup""" logger.info("Starting TTS API server...") # Optionally preload default voice # get_engine().load_voice("hi_male") @app.on_event("shutdown") async def shutdown_event(): """Cleanup on shutdown""" logger.info("Shutting down TTS API server...") def start_server(host: str = "0.0.0.0", port: int = 8000, reload: bool = False): """Start the API server""" import uvicorn uvicorn.run("src.api:app", host=host, port=port, reload=reload, log_level="info") if __name__ == "__main__": start_server()