# src/preprocess.py
import os
import json
import string
from collections import Counter, defaultdict

import pandas as pd

from .config import DATA_PATH, VOCAB_PATH, MAX_SRC_LEN, MAX_TGT_LEN


def normalize_teluguish(w: str) -> str:
    """Lowercase a romanized (Teluguish) word and strip surrounding punctuation."""
    w = w.strip().lower()
    w = w.strip(string.punctuation)
    return w


def build_word_pairs(df: pd.DataFrame):
    """Align whitespace-tokenized rows into (teluguish, telugu) word pairs.

    Rows whose token counts differ are skipped, since a one-to-one word
    alignment cannot be recovered from them.
    """
    src_words = []  # teluguish
    tgt_words = []  # telugu
    n = df.shape[0]
    for i in range(n):
        # positional indexing, so a non-default DataFrame index cannot break the lookup
        src_tokens = str(df["translit"].iloc[i]).split()
        tgt_tokens = str(df["text"].iloc[i]).split()
        if len(src_tokens) != len(tgt_tokens):
            continue
        for sw, tw in zip(src_tokens, tgt_tokens):
            sw_norm = normalize_teluguish(sw)
            if not sw_norm:
                continue
            src_words.append(sw_norm)
            tgt_words.append(tw.strip())
    return src_words, tgt_words


def build_char_vocabs(src_words, tgt_words):
    """Build char-to-index maps for both sides, reserving ids 0-3 for specials."""
    src_chars = sorted(Counter("".join(src_words)).keys())
    tgt_chars = sorted(Counter("".join(tgt_words)).keys())

    # special tokens occupy ids 0-3 in both vocabularies
    specials = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}

    src_char2idx = {c: i + 4 for i, c in enumerate(src_chars)}
    src_char2idx.update(specials)

    tgt_char2idx = {c: i + 4 for i, c in enumerate(tgt_chars)}
    tgt_char2idx.update(specials)

    return src_char2idx, tgt_char2idx


def build_word_dictionary(src_words, tgt_words):
    """Map each Teluguish word to its most frequent Telugu translation."""
    word_map_counts = defaultdict(Counter)
    for sw, tw in zip(src_words, tgt_words):
        word_map_counts[sw][tw] += 1

    word_map = {}
    for sw, counter in word_map_counts.items():
        most_common_tgt, _ = counter.most_common(1)[0]
        word_map[sw] = most_common_tgt
    return word_map


def main():
    os.makedirs(os.path.dirname(VOCAB_PATH), exist_ok=True)
    df = pd.read_parquet(DATA_PATH)

    print("Building word pairs...")
    src_words, tgt_words = build_word_pairs(df)
    print(f"Total word pairs: {len(src_words)}")

    print("Building char vocabs...")
    src_char2idx, tgt_char2idx = build_char_vocabs(src_words, tgt_words)

    print("Building word dictionary...")
    word_map = build_word_dictionary(src_words, tgt_words)
    print(f"Unique Teluguish words in dict: {len(word_map)}")

    vocab = {
        "src_char2idx": src_char2idx,
        "tgt_char2idx": tgt_char2idx,
        "word_map": word_map,
        "max_src_len": MAX_SRC_LEN,
        "max_tgt_len": MAX_TGT_LEN,
    }
    with open(VOCAB_PATH, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)
    print(f"Saved vocab + dictionary to {VOCAB_PATH}")


if __name__ == "__main__":
    main()
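

# ---------------------------------------------------------------------------
# Illustrative sketch only: shows how a downstream dataset might consume the
# char vocab saved above (wrap a word in <sos>/<eos>, map unknown characters
# to <unk>, pad to a fixed length). The name `encode_word` and this exact
# padding scheme are assumptions for illustration, not part of the pipeline.
def encode_word(word: str, char2idx: dict, max_len: int) -> list:
    """Encode a word as <sos> + char ids + <eos>, padded/truncated to max_len."""
    ids = [char2idx["<sos>"]]
    ids.extend(char2idx.get(c, char2idx["<unk>"]) for c in word)
    ids.append(char2idx["<eos>"])
    ids.extend([char2idx["<pad>"]] * (max_len - len(ids)))
    return ids[:max_len]
# Example: encode_word("namaste", vocab["src_char2idx"], vocab["max_src_len"])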