# VoiceAPI/src/tokenizer.py
"""
TTS Tokenizer for VITS models
Adapted from Coqui TTS for SYSPIN models
CRITICAL: The vocabulary MUST be built as:
[<PAD>] + list(punctuations) + list(characters) + [<BLNK>]
Where:
- punctuations = "!¡'(),-.:;¿? " (standard VITS punctuations)
- characters = content of chars.txt file
"""
import re
from typing import Dict, List, Optional
from dataclasses import dataclass
# Standard VITS punctuations used by SYSPIN models
VITS_PUNCTUATIONS = "!¡'(),-.:;¿? "
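
# Illustrative vocab layout (hypothetical two-character chars.txt containing "ab"):
#   ["<PAD>", "!", "¡", "'", "(", ")", ",", "-", ".", ":", ";", "¿", "?", " ", "a", "b", "<BLNK>"]
#   vocab_size = 1 + 13 + 2 + 1 = 17, pad_id = 0, blank_id = 16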
@dataclass
class CharactersConfig:
"""Character configuration for tokenizer"""
characters: str = ""
punctuations: str = VITS_PUNCTUATIONS
pad: str = "<PAD>"
    eos: Optional[str] = None
    bos: Optional[str] = None
blank: str = "<BLNK>"
phonemes: Optional[str] = None
class TTSTokenizer:
"""
Tokenizer for TTS models - Compatible with SYSPIN VITS models.
The vocabulary is built EXACTLY as VitsCharacters._create_vocab():
vocab = [<PAD>] + list(punctuations) + list(characters) + [<BLNK>]
For SYSPIN models:
- punctuations = "!¡'(),-.:;¿? " (13 chars)
- characters = content from chars.txt
- Total vocab = 1 + 13 + len(chars.txt) + 1
"""
def __init__(
self,
characters: str,
punctuations: str = VITS_PUNCTUATIONS,
pad: str = "<PAD>",
blank: str = "<BLNK>",
):
"""
Initialize tokenizer.
Args:
characters: The characters string (from chars.txt)
punctuations: Punctuation characters (default: VITS standard)
pad: Padding token
blank: Blank token for CTC
"""
self.characters = characters
self.punctuations = punctuations
self.pad = pad
self.blank = blank
# Build vocabulary: [PAD] + punctuations + characters + [BLANK]
self._build_vocab()
def _build_vocab(self):
"""
Build vocabulary EXACTLY matching VitsCharacters._create_vocab():
self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank]
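
        Example (hypothetical two-character chars string "ab"):
            >>> tok = TTSTokenizer(characters="ab")
            >>> tok.vocab_size
            17
            >>> (tok.pad_id, tok.blank_id)
            (0, 16)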
"""
self.vocab: List[str] = []
self.char_to_id: Dict[str, int] = {}
self.id_to_char: Dict[int, str] = {}
# Build vocab in exact order
# 1. PAD token
self.vocab.append(self.pad)
# 2. Punctuations
for char in self.punctuations:
self.vocab.append(char)
# 3. Characters from chars.txt
for char in self.characters:
self.vocab.append(char)
# 4. BLANK token
self.vocab.append(self.blank)
# Build mappings
for idx, char in enumerate(self.vocab):
self.char_to_id[char] = idx
self.id_to_char[idx] = char
self.vocab_size = len(self.vocab)
self.pad_id = self.char_to_id[self.pad]
self.blank_id = self.char_to_id[self.blank]
def text_to_ids(self, text: str, add_blank: bool = True) -> List[int]:
"""
Convert text to token IDs with interspersed blanks.
Matches TTSTokenizer.text_to_ids() from extra.py:
1. Clean text with multilingual_cleaners
2. Encode to IDs
3. Intersperse blank tokens
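
        Example (hypothetical chars string "ab"; the blank token gets id 16):
            >>> tok = TTSTokenizer(characters="ab")
            >>> tok.text_to_ids("ab", add_blank=False)
            [14, 15]
            >>> tok.text_to_ids("ab")
            [16, 14, 16, 15, 16]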
"""
# Apply multilingual_cleaners
text = self._clean_text(text)
# Encode characters to IDs
char_ids = []
for char in text:
if char in self.char_to_id:
char_ids.append(self.char_to_id[char])
# Skip unknown characters (matching original behavior)
# Intersperse blank tokens
if add_blank:
result = [self.blank_id] * (len(char_ids) * 2 + 1)
result[1::2] = char_ids
return result
return char_ids
def ids_to_text(self, ids: List[int]) -> str:
"""Convert token IDs back to text"""
chars = []
for idx in ids:
if idx in self.id_to_char:
char = self.id_to_char[idx]
if char not in [self.pad, self.blank]:
chars.append(char)
return "".join(chars)
def _clean_text(self, text: str) -> str:
"""
Text cleaning matching multilingual_cleaners from extra.py:
1. lowercase
2. replace_symbols
3. remove_aux_symbols
4. collapse_whitespace
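
        Example (the characters string does not affect cleaning):
            >>> TTSTokenizer(characters="abc")._clean_text("Hello; World - (Test)")
            'hello, world test'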
"""
text = text.lower()
text = self._replace_symbols(text)
text = self._remove_aux_symbols(text)
text = re.sub(r"\s+", " ", text).strip()
return text
def _replace_symbols(self, text: str) -> str:
"""Replace symbols matching extra.py replace_symbols()"""
text = text.replace(";", ",")
text = text.replace("-", " ")
text = text.replace(":", ",")
return text
def _remove_aux_symbols(self, text: str) -> str:
"""Remove auxiliary symbols matching extra.py remove_aux_symbols()"""
text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
return text
@classmethod
def from_chars_file(cls, chars_file: str) -> "TTSTokenizer":
"""
Create tokenizer from chars.txt file.
This matches the jit_infer.py setup:
- characters = content of chars.txt
- punctuations = "!¡'(),-.:;¿? " (standard VITS punctuations)
Vocab will be: [<PAD>] + punctuations + characters + [<BLNK>]
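
        Example (hypothetical path; no such file ships with this module):
            tok = TTSTokenizer.from_chars_file("models/hi_female/chars.txt")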
"""
with open(chars_file, "r", encoding="utf-8") as f:
characters = f.read().strip("\n")
return cls(
characters=characters,
punctuations=VITS_PUNCTUATIONS,
pad="<PAD>",
blank="<BLNK>",
)
class TextNormalizer:
"""Text normalizer for Indian languages"""
@staticmethod
def normalize_numbers(text: str, lang: str = "hi") -> str:
"""Convert numbers to words"""
pattern = r"\{(\d+)\}\{([^}]+)\}"
text = re.sub(pattern, r"\2", text)
return text
@staticmethod
def normalize_punctuation(text: str) -> str:
"""Normalize punctuation marks"""
        text = re.sub(r'["“”]', '"', text)
        text = re.sub(r"['‘’]", "'", text)
text = re.sub(r"[–—]", "-", text)
return text
@staticmethod
def clean_text(text: str, lang: str = "hi") -> str:
"""Full text cleaning pipeline"""
text = TextNormalizer.normalize_numbers(text, lang)
text = TextNormalizer.normalize_punctuation(text)
text = re.sub(r"\s+", " ", text).strip()
return text