import torch
import pandas as pd
from torch_geometric.data import Data
from tqdm import tqdm

import setup
from dataset import Vocabulary  # the Vocabulary class from our slimmed-down dataset.py
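
# Assumed interfaces (inferred from usage below; adjust if your setup.py / dataset.py differ):
#   setup.DATA_FILE                                   - path to the parquet file of sentence pairs
#   setup.TELUGUISH_COL, setup.TELUGU_COL             - column names for romanized and native-script text
#   setup.PAD_TOKEN, setup.SOS_TOKEN, setup.EOS_TOKEN - integer ids of '<pad>', '<sos>', '<eos>'
#   Vocabulary(texts, special_tokens)                 - builds character-level stoi/itos mappings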
| print("Step 1: Loading and Cleaning Raw Data...") | |
| df = pd.read_parquet(setup.DATA_FILE) | |
| df.dropna(inplace=True) | |
| # Basic cleaning | |
| df[setup.TELUGUISH_COL] = df[setup.TELUGUISH_COL].str.lower().str.replace(r'[^a-z\s]', '', regex=True).str.strip() | |
| df[setup.TELUGU_COL] = df[setup.TELUGU_COL].str.strip() | |
| # Filter out any rows that became empty after cleaning | |
| initial_rows = len(df) | |
| df = df[df[setup.TELUGUISH_COL].str.len() > 0] | |
| df = df[df[setup.TELUGU_COL].str.len() > 0] | |
| print(f"Data Cleaning: Kept {len(df)} rows, filtered out {initial_rows - len(df)} empty rows.") | |
| print("\nStep 2: Creating Vocabularies...") | |
| special_tokens = ['<pad>', '<sos>', '<eos>'] | |
| src_vocab = Vocabulary(df[setup.TELUGUISH_COL], special_tokens) | |
| tgt_vocab = Vocabulary(df[setup.TELUGU_COL], special_tokens) | |
| print("Vocabularies created.") | |
| print("\nStep 3: Converting Sentences to Graph Objects...") | |
| data_list = [] | |
| for _, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing Data into Graphs"): | |
| src_text = row[setup.TELUGUISH_COL] | |
| tgt_text = row[setup.TELUGU_COL] | |
| tokens = list(src_text) | |
| node_features = torch.tensor([src_vocab.stoi.get(token, setup.PAD_TOKEN) for token in tokens], dtype=torch.long) | |
| edge_list = [] | |
| if len(tokens) > 1: | |
| for i in range(len(tokens) - 1): | |
| edge_list.append([i, i + 1]) | |
| edge_list.append([i + 1, i]) | |
| edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous() | |
| target_seq_tensor = torch.tensor( | |
| [setup.SOS_TOKEN] + [tgt_vocab.stoi[c] for c in tgt_text] + [setup.EOS_TOKEN], | |
| dtype=torch.long | |
| ) | |
| data = Data(x=node_features, edge_index=edge_index, target_seq=target_seq_tensor) | |
| data_list.append(data) | |
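
# Spot-check one converted example (a minimal sketch): x should be a 1-D tensor of
# character ids, edge_index shaped [2, num_edges], target_seq bracketed by <sos>/<eos>.
if data_list:
    sample = data_list[0]
    print(f"Sample graph: x={tuple(sample.x.shape)}, "
          f"edge_index={tuple(sample.edge_index.shape)}, "
          f"target_seq={tuple(sample.target_seq.shape)}")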
| print("\nStep 4: Saving Preprocessed Data to File...") | |
| torch.save({ | |
| 'data_list': data_list, | |
| 'src_vocab': src_vocab, | |
| 'tgt_vocab': tgt_vocab | |
| }, 'preprocessed_data.pt') | |
| print("\nPreprocessing complete. Data saved to 'preprocessed_data.pt'") |