"""Create master CNEC list with 176 unique CNECs (168 physical + 8 Alegro). This script: 1. Deduplicates physical CNECs from critical_cnecs_all.csv (200 → 168 unique) 2. Extracts 8 Alegro CNECs from tier1_with_alegro.csv 3. Combines into master list (176 unique) 4. Validates uniqueness and saves Usage: python scripts/create_master_cnec_list.py """ import sys from pathlib import Path import polars as pl # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) def deduplicate_physical_cnecs(input_path: Path, output_path: Path) -> pl.DataFrame: """Deduplicate physical CNECs keeping highest importance score per EIC. Args: input_path: Path to critical_cnecs_all.csv (200 rows) output_path: Path to save deduplicated list Returns: DataFrame with 168 unique physical CNECs """ print("=" * 80) print("STEP 1: DEDUPLICATE PHYSICAL CNECs") print("=" * 80) # Load all CNECs all_cnecs = pl.read_csv(input_path) print(f"\n[INPUT] Loaded {len(all_cnecs)} CNECs from {input_path.name}") print(f" Unique EICs: {all_cnecs['cnec_eic'].n_unique()}") # Find duplicates duplicates = all_cnecs.filter(pl.col('cnec_eic').is_duplicated()) dup_eics = duplicates['cnec_eic'].unique() print(f"\n[DUPLICATES] Found {len(dup_eics)} EICs appearing multiple times:") print(f" Total duplicate rows: {len(duplicates)}") # Show first 5 duplicate examples print("\n[EXAMPLES] First 5 duplicate EICs:") for i, eic in enumerate(dup_eics.head(5), 1): dup_rows = all_cnecs.filter(pl.col('cnec_eic') == eic) print(f"\n {i}. {eic} ({len(dup_rows)} occurrences):") for row in dup_rows.iter_rows(named=True): print(f" - {row['cnec_name'][:60]:<60s} (TSO: {row['tso']:<10s}, Score: {row['importance_score']:.2f})") # Deduplicate: Keep highest importance score per EIC deduped = ( all_cnecs .sort('importance_score', descending=True) # Highest score first .unique(subset=['cnec_eic'], keep='first') # Keep first (highest score) .sort('importance_score', descending=True) # Re-sort by score ) print(f"\n[DEDUPLICATION] Kept highest importance score per EIC") print(f" Before: {len(all_cnecs)} rows, {all_cnecs['cnec_eic'].n_unique()} unique") print(f" After: {len(deduped)} rows, {deduped['cnec_eic'].n_unique()} unique") print(f" Removed: {len(all_cnecs) - len(deduped)} duplicate rows") # Validate assert deduped['cnec_eic'].n_unique() == len(deduped), "Deduplication failed - still have duplicates!" assert len(deduped) == 168, f"Expected 168 unique CNECs, got {len(deduped)}" # Add flags deduped = deduped.with_columns([ pl.lit(False).alias('is_alegro'), pl.lit(True).alias('is_physical') ]) # Save output_path.parent.mkdir(parents=True, exist_ok=True) deduped.write_csv(output_path) print(f"\n[SAVED] {len(deduped)} unique physical CNECs to {output_path.name}") print("=" * 80) return deduped def extract_alegro_cnecs(input_path: Path, output_path: Path) -> pl.DataFrame: """Extract 8 Alegro custom CNECs from tier1_with_alegro.csv. Args: input_path: Path to critical_cnecs_tier1_with_alegro.csv output_path: Path to save Alegro CNECs Returns: DataFrame with 8 Alegro CNECs """ print("\nSTEP 2: EXTRACT ALEGRO CNECs") print("=" * 80) # Load tier1 with Alegro tier1 = pl.read_csv(input_path) print(f"\n[INPUT] Loaded {len(tier1)} Tier-1 CNECs from {input_path.name}") # Filter Alegro CNECs (rows where tier contains "Alegro") alegro = tier1.filter(pl.col('tier').str.contains('(?i)alegro')) print(f"\n[ALEGRO] Found {len(alegro)} Alegro CNECs:") for i, row in enumerate(alegro.iter_rows(named=True), 1): print(f" {i}. {row['cnec_eic']:<30s} | {row['cnec_name'][:50]}") # Validate assert len(alegro) == 8, f"Expected 8 Alegro CNECs, found {len(alegro)}" # Add flags alegro = alegro.with_columns([ pl.lit(True).alias('is_alegro'), pl.lit(False).alias('is_physical') ]) # Save output_path.parent.mkdir(parents=True, exist_ok=True) alegro.write_csv(output_path) print(f"\n[SAVED] {len(alegro)} Alegro CNECs to {output_path.name}") print("=" * 80) return alegro def create_master_list( physical_path: Path, alegro_path: Path, output_path: Path ) -> pl.DataFrame: """Combine physical and Alegro CNECs into master list. Args: physical_path: Path to deduplicated physical CNECs (168) alegro_path: Path to Alegro CNECs (8) output_path: Path to save master list (176) Returns: DataFrame with 176 unique CNECs """ print("\nSTEP 3: CREATE MASTER CNEC LIST") print("=" * 80) # Load both physical = pl.read_csv(physical_path) alegro = pl.read_csv(alegro_path) print(f"\n[INPUTS]") print(f" Physical CNECs: {len(physical)}") print(f" Alegro CNECs: {len(alegro)}") print(f" Total: {len(physical) + len(alegro)}") # Combine master = pl.concat([physical, alegro]) # Validate uniqueness assert master['cnec_eic'].n_unique() == len(master), "Master list has duplicate EICs!" assert len(master) == 176, f"Expected 176 total CNECs, got {len(master)}" # Sort by importance score master = master.sort('importance_score', descending=True) # Summary statistics print(f"\n[MASTER LIST] Created {len(master)} unique CNECs") print(f" Physical: {master['is_physical'].sum()} CNECs") print(f" Alegro: {master['is_alegro'].sum()} CNECs") print(f" Tier 1: {master.filter(pl.col('tier').str.contains('Tier 1')).shape[0]} CNECs") print(f" Tier 2: {master.filter(pl.col('tier').str.contains('Tier 2')).shape[0]} CNECs") # TSO distribution print(f"\n[TSO DISTRIBUTION]") tso_dist = ( master .group_by('tso') .agg(pl.len().alias('count')) .sort('count', descending=True) .head(10) ) for row in tso_dist.iter_rows(named=True): tso_name = row['tso'] if row['tso'] else '(Empty)' print(f" {tso_name:<20s}: {row['count']:>3d} CNECs") # Save output_path.parent.mkdir(parents=True, exist_ok=True) master.write_csv(output_path) print(f"\n[SAVED] Master CNEC list to {output_path}") print("=" * 80) return master def main(): """Create master CNEC list (176 unique).""" print("\n") print("=" * 80) print("CREATE MASTER CNEC LIST (176 UNIQUE)") print("=" * 80) print() # Paths base_dir = Path(__file__).parent.parent data_dir = base_dir / 'data' / 'processed' input_all = data_dir / 'critical_cnecs_all.csv' input_alegro = data_dir / 'critical_cnecs_tier1_with_alegro.csv' output_physical = data_dir / 'cnecs_physical_168.csv' output_alegro = data_dir / 'cnecs_alegro_8.csv' output_master = data_dir / 'cnecs_master_176.csv' # Validate inputs exist if not input_all.exists(): print(f"[ERROR] Input file not found: {input_all}") print(" Please ensure data collection and CNEC identification are complete.") sys.exit(1) if not input_alegro.exists(): print(f"[ERROR] Input file not found: {input_alegro}") print(" Please ensure Alegro CNEC list exists.") sys.exit(1) # Execute steps physical_cnecs = deduplicate_physical_cnecs(input_all, output_physical) alegro_cnecs = extract_alegro_cnecs(input_alegro, output_alegro) master_cnecs = create_master_list(output_physical, output_alegro, output_master) # Final summary print("\n") print("=" * 80) print("SUMMARY") print("=" * 80) print(f"\nMaster CNEC List Created: {len(master_cnecs)} unique CNECs") print(f" - Physical (deduplicated): {len(physical_cnecs)} CNECs") print(f" - Alegro (custom): {len(alegro_cnecs)} CNECs") print(f"\nOutput Files:") print(f" 1. {output_physical.name}") print(f" 2. {output_alegro.name}") print(f" 3. {output_master.name} ⭐ PRIMARY") print(f"\nThis master list is the SINGLE SOURCE OF TRUTH for all feature engineering.") print("All JAO and ENTSO-E feature processing MUST use this exact list.") print("=" * 80) print() if __name__ == "__main__": main()