"""Preprocess the transliteration dataset into graph objects.

Pipeline (runs at import/execution time, top to bottom):

1. Load the parquet file named by ``setup.DATA_FILE`` and clean both text
   columns.
2. Build one character vocabulary per column.
3. Turn every source string into a ``torch_geometric`` ``Data`` object: one
   node per character, with bidirectional edges between adjacent characters,
   plus the target string encoded as a sequence of ids.
4. Save the list of graphs and both vocabularies to ``OUTPUT_FILE``.
"""

from collections import Counter

import pandas as pd
import torch
from torch_geometric.data import Data
from tqdm import tqdm

import setup
from dataset import Vocabulary  # Vocabulary class from our slimmed-down dataset.py

# Destination of the serialized graphs and vocabularies.
OUTPUT_FILE = 'preprocessed_data.pt'


def _chain_edge_index(num_nodes):
    """Return the edge index of a bidirectional chain over ``num_nodes`` nodes.

    Node ``i`` is linked to node ``i + 1`` in both directions. Edges are
    emitted in the order ``(0, 1), (1, 0), (1, 2), (2, 1), ...``.

    Args:
        num_nodes: Number of nodes in the chain.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)``. For
        fewer than two nodes the result is an empty ``(2, 0)`` tensor, so the
        first dimension is always 2 even when there are no edges.
    """
    edge_pairs = [
        pair
        for i in range(num_nodes - 1)
        for pair in ((i, i + 1), (i + 1, i))
    ]
    if not edge_pairs:
        # torch.tensor([]) would be 1-D with shape (0,); build the 2-row
        # empty tensor explicitly so every graph has a (2, E) edge index.
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()


print("Step 1: Loading and Cleaning Raw Data...")
df = pd.read_parquet(setup.DATA_FILE)
df.dropna(inplace=True)

# Basic cleaning: the source column is lower-cased and reduced to the letters
# a-z plus whitespace; both columns are stripped of surrounding whitespace.
df[setup.TELUGUISH_COL] = (
    df[setup.TELUGUISH_COL]
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
    .str.strip()
)
df[setup.TELUGU_COL] = df[setup.TELUGU_COL].str.strip()

# Filter out any rows that became empty after cleaning.
initial_rows = len(df)
df = df[df[setup.TELUGUISH_COL].str.len() > 0]
df = df[df[setup.TELUGU_COL].str.len() > 0]
print(f"Data Cleaning: Kept {len(df)} rows, filtered out {initial_rows - len(df)} empty rows.")

print("\nStep 2: Creating Vocabularies...")
# NOTE(review): all three special tokens are empty strings in this copy of the
# file. They look like placeholder tokens whose text was lost — confirm the
# intended values against dataset.Vocabulary and the setup.*_TOKEN ids.
special_tokens = ['', '', '']
src_vocab = Vocabulary(df[setup.TELUGUISH_COL], special_tokens)
tgt_vocab = Vocabulary(df[setup.TELUGU_COL], special_tokens)
print("Vocabularies created.")

print("\nStep 3: Converting Sentences to Graph Objects...")
data_list = []
text_pairs = zip(df[setup.TELUGUISH_COL], df[setup.TELUGU_COL])
for src_text, tgt_text in tqdm(text_pairs, total=len(df), desc="Preprocessing Data into Graphs"):
    # One node per source character; characters missing from the source
    # vocabulary fall back to setup.PAD_TOKEN.
    node_features = torch.tensor(
        [src_vocab.stoi.get(token, setup.PAD_TOKEN) for token in src_text],
        dtype=torch.long,
    )
    edge_index = _chain_edge_index(len(src_text))

    # Target characters are looked up strictly (KeyError if absent) and
    # wrapped in setup.SOS_TOKEN / setup.EOS_TOKEN.
    target_seq_tensor = torch.tensor(
        [setup.SOS_TOKEN, *(tgt_vocab.stoi[c] for c in tgt_text), setup.EOS_TOKEN],
        dtype=torch.long,
    )

    data_list.append(
        Data(x=node_features, edge_index=edge_index, target_seq=target_seq_tensor)
    )

print("\nStep 4: Saving Preprocessed Data to File...")
torch.save(
    {
        'data_list': data_list,
        'src_vocab': src_vocab,
        'tgt_vocab': tgt_vocab,
    },
    OUTPUT_FILE,
)

print(f"\nPreprocessing complete. Data saved to '{OUTPUT_FILE}'")