# src/preprocess.py
import os
import json
import string
from collections import Counter, defaultdict

import pandas as pd

from .config import DATA_PATH, VOCAB_PATH, MAX_SRC_LEN, MAX_TGT_LEN


def normalize_teluguish(w: str) -> str:
    """Lowercase a romanized (Teluguish) word and strip surrounding punctuation."""
    w = w.strip().lower()
    w = w.strip(string.punctuation)
    return w


def build_word_pairs(df: pd.DataFrame):
    """Align whitespace-tokenized rows into (teluguish, telugu) word pairs.

    Rows whose token counts differ are skipped, since a one-to-one word
    alignment cannot be recovered from them.
    """
    src_words = []  # teluguish
    tgt_words = []  # telugu
    n = df.shape[0]
    for i in range(n):
        # positional indexing, so a non-default DataFrame index cannot break the lookup
        src_tokens = str(df["translit"].iloc[i]).split()
        tgt_tokens = str(df["text"].iloc[i]).split()
        if len(src_tokens) != len(tgt_tokens):
            continue
        for sw, tw in zip(src_tokens, tgt_tokens):
            sw_norm = normalize_teluguish(sw)
            if not sw_norm:
                continue
            src_words.append(sw_norm)
            tgt_words.append(tw.strip())
    return src_words, tgt_words


def build_char_vocabs(src_words, tgt_words):
    """Build char-to-index maps for both sides, reserving ids 0-3 for specials."""
    src_chars = sorted(Counter("".join(src_words)).keys())
    tgt_chars = sorted(Counter("".join(tgt_words)).keys())

    # special tokens occupy ids 0-3 in both vocabularies
    specials = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}

    src_char2idx = {c: i + 4 for i, c in enumerate(src_chars)}
    src_char2idx.update(specials)

    tgt_char2idx = {c: i + 4 for i, c in enumerate(tgt_chars)}
    tgt_char2idx.update(specials)

    return src_char2idx, tgt_char2idx


def build_word_dictionary(src_words, tgt_words):
    """Map each Teluguish word to its most frequent Telugu translation."""
    word_map_counts = defaultdict(Counter)
    for sw, tw in zip(src_words, tgt_words):
        word_map_counts[sw][tw] += 1

    word_map = {}
    for sw, counter in word_map_counts.items():
        most_common_tgt, _ = counter.most_common(1)[0]
        word_map[sw] = most_common_tgt
    return word_map


def main():
    os.makedirs(os.path.dirname(VOCAB_PATH), exist_ok=True)
    df = pd.read_parquet(DATA_PATH)

    print("Building word pairs...")
    src_words, tgt_words = build_word_pairs(df)
    print(f"Total word pairs: {len(src_words)}")

    print("Building char vocabs...")
    src_char2idx, tgt_char2idx = build_char_vocabs(src_words, tgt_words)

    print("Building word dictionary...")
    word_map = build_word_dictionary(src_words, tgt_words)
    print(f"Unique Teluguish words in dict: {len(word_map)}")

    vocab = {
        "src_char2idx": src_char2idx,
        "tgt_char2idx": tgt_char2idx,
        "word_map": word_map,
        "max_src_len": MAX_SRC_LEN,
        "max_tgt_len": MAX_TGT_LEN,
    }
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)
    print(f"Saved vocab + dictionary to {VOCAB_PATH}")


if __name__ == "__main__":
    main()
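

# ---------------------------------------------------------------------------
# Illustrative sketch only: shows how a downstream dataset might consume the
# char vocab saved above (wrap a word in <sos>/<eos>, map unknown characters
# to <unk>, pad to a fixed length). The name `encode_word` and this exact
# padding scheme are assumptions for illustration, not part of the pipeline.
def encode_word(word: str, char2idx: dict, max_len: int) -> list:
    """Encode a word as <sos> + char ids + <eos>, padded/truncated to max_len."""
    ids = [char2idx["<sos>"]]
    ids.extend(char2idx.get(c, char2idx["<unk>"]) for c in word)
    ids.append(char2idx["<eos>"])
    ids.extend([char2idx["<pad>"]] * (max_len - len(ids)))
    return ids[:max_len]
# Example: encode_word("namaste", vocab["src_char2idx"], vocab["max_src_len"])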