Spaces:
Build error
Build error
update_gtts for Amharic
Browse files- soni_translate/text_to_speech.py +52 -11
soni_translate/text_to_speech.py
CHANGED
|
@@ -963,6 +963,45 @@ def filter_by_speaker(speakers, segments):
|
|
| 963 |
}
|
| 964 |
|
| 965 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
def audio_segmentation_to_voice(
|
| 967 |
result_diarize,
|
| 968 |
TRANSLATE_AUDIO_TO,
|
|
@@ -1021,6 +1060,7 @@ def audio_segmentation_to_voice(
|
|
| 1021 |
pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
|
| 1022 |
pattern_vits_onnx = re.compile(r".* VITS-onnx$")
|
| 1023 |
pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
|
|
|
|
| 1024 |
|
| 1025 |
all_segments = result_diarize["segments"]
|
| 1026 |
|
|
@@ -1028,12 +1068,9 @@ def audio_segmentation_to_voice(
|
|
| 1028 |
speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments)
|
| 1029 |
speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments)
|
| 1030 |
speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments)
|
| 1031 |
-
speakers_vits_onnx = find_spkr(
|
| 1032 |
-
|
| 1033 |
-
)
|
| 1034 |
-
speakers_openai_tts = find_spkr(
|
| 1035 |
-
pattern_openai_tts, speaker_to_voice, all_segments
|
| 1036 |
-
)
|
| 1037 |
|
| 1038 |
# Filter method in segments
|
| 1039 |
filtered_edge = filter_by_speaker(speakers_edge, all_segments)
|
|
@@ -1042,6 +1079,7 @@ def audio_segmentation_to_voice(
|
|
| 1042 |
filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
|
| 1043 |
filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
|
| 1044 |
filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
|
|
|
|
| 1045 |
|
| 1046 |
# Infer
|
| 1047 |
if filtered_edge["segments"]:
|
|
@@ -1049,9 +1087,7 @@ def audio_segmentation_to_voice(
|
|
| 1049 |
segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui) # mp3
|
| 1050 |
if filtered_bark["segments"]:
|
| 1051 |
logger.info(f"BARK TTS: {speakers_bark}")
|
| 1052 |
-
segments_bark_tts(
|
| 1053 |
-
filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark
|
| 1054 |
-
) # wav
|
| 1055 |
if filtered_vits["segments"]:
|
| 1056 |
logger.info(f"VITS TTS: {speakers_vits}")
|
| 1057 |
segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO) # wav
|
|
@@ -1071,6 +1107,9 @@ def audio_segmentation_to_voice(
|
|
| 1071 |
if filtered_openai_tts["segments"]:
|
| 1072 |
logger.info(f"OpenAI TTS: {speakers_openai_tts}")
|
| 1073 |
segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
|
|
|
|
|
|
|
|
|
|
| 1074 |
|
| 1075 |
[result.pop("tts_name", None) for result in result_diarize["segments"]]
|
| 1076 |
return [
|
|
@@ -1079,7 +1118,8 @@ def audio_segmentation_to_voice(
|
|
| 1079 |
speakers_vits,
|
| 1080 |
speakers_coqui,
|
| 1081 |
speakers_vits_onnx,
|
| 1082 |
-
speakers_openai_tts
|
|
|
|
| 1083 |
]
|
| 1084 |
|
| 1085 |
|
|
@@ -1098,7 +1138,8 @@ def accelerate_segments(
|
|
| 1098 |
speakers_vits,
|
| 1099 |
speakers_coqui,
|
| 1100 |
speakers_vits_onnx,
|
| 1101 |
-
speakers_openai_tts
|
|
|
|
| 1102 |
) = valid_speakers
|
| 1103 |
|
| 1104 |
create_directories(f"{folder_output}/audio/")
|
|
|
|
| 963 |
}
|
| 964 |
|
| 965 |
|
| 966 |
+
def segments_gtts_tts(filtered_gtts_segments, TRANSLATE_AUDIO_TO):
    """Synthesize every segment with Google TTS (gTTS) and save it as OGG.

    For each entry in ``filtered_gtts_segments["segments"]`` the text is
    sent to gTTS, the returned audio is decoded with ``sf.read``, passed
    through ``pad_array`` and written to ``audio/{start}.ogg`` as
    OGG/Vorbis.

    Args:
        filtered_gtts_segments: Mapping with a ``"segments"`` list. Each
            segment must provide the keys ``"text"``, ``"start"`` and
            ``"tts_name"``.
        TRANSLATE_AUDIO_TO: Forwarded unchanged to
            ``error_handling_in_tts`` when synthesis of a segment fails.

    Returns:
        None. Output is written to disk, one file per segment.

    Any exception raised while synthesizing, decoding or writing a
    segment is caught and delegated to ``error_handling_in_tts``; the
    loop then continues with the next segment.
    """
    # Kept as a function-scope import (the file-level import block is not
    # guaranteed to provide it), but hoisted out of the per-segment loop.
    from tempfile import TemporaryFile

    for segment in tqdm(filtered_gtts_segments["segments"]):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Get language code from TTS name (e.g. "am-GTTS" -> "am").
            # NOTE(review): only the part before the first "-" is used, so
            # a name such as "zh-CN-GTTS" would yield "zh" -- confirm this
            # matches the voice names actually registered for gTTS.
            lang_code = tts_name.split('-')[0]
            tts = gTTS(text=text, lang=lang_code)

            # Buffer the synthesized audio in a temporary file. The context
            # manager guarantees the file is closed even when gTTS or the
            # decoder raises, which the manual open/close did not.
            # NOTE(review): gTTS presumably emits MP3 data, so sf.read needs
            # a libsndfile build with MP3 support -- confirm on the target.
            with TemporaryFile() as f:
                tts.write_to_fp(f)
                f.seek(0)
                audio_data, samplerate = sf.read(f)

            # Read back, pad and save as ogg
            data_tts = pad_array(audio_data, samplerate)
            sf.write(
                file=filename,
                samplerate=samplerate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
|
| 1003 |
+
|
| 1004 |
+
|
| 1005 |
def audio_segmentation_to_voice(
|
| 1006 |
result_diarize,
|
| 1007 |
TRANSLATE_AUDIO_TO,
|
|
|
|
| 1060 |
pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
|
| 1061 |
pattern_vits_onnx = re.compile(r".* VITS-onnx$")
|
| 1062 |
pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
|
| 1063 |
+
pattern_gtts = re.compile(r".*-GTTS$") # New pattern for gTTS
|
| 1064 |
|
| 1065 |
all_segments = result_diarize["segments"]
|
| 1066 |
|
|
|
|
| 1068 |
speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments)
|
| 1069 |
speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments)
|
| 1070 |
speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments)
|
| 1071 |
+
speakers_vits_onnx = find_spkr(pattern_vits_onnx, speaker_to_voice, all_segments)
|
| 1072 |
+
speakers_openai_tts = find_spkr(pattern_openai_tts, speaker_to_voice, all_segments)
|
| 1073 |
+
speakers_gtts = find_spkr(pattern_gtts, speaker_to_voice, all_segments) # New gTTS speakers
|
|
|
|
|
|
|
|
|
|
| 1074 |
|
| 1075 |
# Filter method in segments
|
| 1076 |
filtered_edge = filter_by_speaker(speakers_edge, all_segments)
|
|
|
|
| 1079 |
filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
|
| 1080 |
filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
|
| 1081 |
filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
|
| 1082 |
+
filtered_gtts = filter_by_speaker(speakers_gtts, all_segments) # New gTTS filter
|
| 1083 |
|
| 1084 |
# Infer
|
| 1085 |
if filtered_edge["segments"]:
|
|
|
|
| 1087 |
segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui) # mp3
|
| 1088 |
if filtered_bark["segments"]:
|
| 1089 |
logger.info(f"BARK TTS: {speakers_bark}")
|
| 1090 |
+
segments_bark_tts(filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark) # wav
|
|
|
|
|
|
|
| 1091 |
if filtered_vits["segments"]:
|
| 1092 |
logger.info(f"VITS TTS: {speakers_vits}")
|
| 1093 |
segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO) # wav
|
|
|
|
| 1107 |
if filtered_openai_tts["segments"]:
|
| 1108 |
logger.info(f"OpenAI TTS: {speakers_openai_tts}")
|
| 1109 |
segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
|
| 1110 |
+
if filtered_gtts["segments"]: # New gTTS condition
|
| 1111 |
+
logger.info(f"Google TTS: {speakers_gtts}")
|
| 1112 |
+
segments_gtts_tts(filtered_gtts, TRANSLATE_AUDIO_TO) # ogg
|
| 1113 |
|
| 1114 |
[result.pop("tts_name", None) for result in result_diarize["segments"]]
|
| 1115 |
return [
|
|
|
|
| 1118 |
speakers_vits,
|
| 1119 |
speakers_coqui,
|
| 1120 |
speakers_vits_onnx,
|
| 1121 |
+
speakers_openai_tts,
|
| 1122 |
+
speakers_gtts # Add gTTS to returned speakers list
|
| 1123 |
]
|
| 1124 |
|
| 1125 |
|
|
|
|
| 1138 |
speakers_vits,
|
| 1139 |
speakers_coqui,
|
| 1140 |
speakers_vits_onnx,
|
| 1141 |
+
speakers_openai_tts,
|
| 1142 |
+
speakers_gtts
|
| 1143 |
) = valid_speakers
|
| 1144 |
|
| 1145 |
create_directories(f"{folder_output}/audio/")
|