Spaces:
Sleeping
Sleeping
Evgueni Poloukarov
feat: Phase 1 complete - Master CNEC list + synchronized feature engineering
d4939ce
| """Create master CNEC list with 176 unique CNECs (168 physical + 8 Alegro). | |
| This script: | |
| 1. Deduplicates physical CNECs from critical_cnecs_all.csv (200 → 168 unique) | |
| 2. Extracts 8 Alegro CNECs from tier1_with_alegro.csv | |
| 3. Combines into master list (176 unique) | |
| 4. Validates uniqueness and saves | |
| Usage: | |
| python scripts/create_master_cnec_list.py | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import polars as pl | |
| # Add src to path | |
| sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) | |
def deduplicate_physical_cnecs(input_path: Path, output_path: Path) -> pl.DataFrame:
    """Deduplicate physical CNECs, keeping the highest importance score per EIC.

    Loads the CNEC table, reports which ``cnec_eic`` values occur more than
    once (with up to five examples), keeps a single row per ``cnec_eic``,
    adds the ``is_alegro`` / ``is_physical`` flag columns and writes the
    result as CSV.

    Args:
        input_path: Path to critical_cnecs_all.csv (200 rows). The columns
            ``cnec_eic``, ``cnec_name``, ``tso`` and ``importance_score``
            are read.
        output_path: Path to save deduplicated list. Missing parent
            directories are created.

    Returns:
        DataFrame with 168 unique physical CNECs, ordered by descending
        ``importance_score``.

    Raises:
        AssertionError: If duplicate EICs remain after deduplication or the
            number of unique CNECs is not 168.
    """

    def _as_text(value) -> str:
        """Render a possibly-null CSV cell so fixed-width formatting cannot fail."""
        if value is None or value == '':
            return '(Empty)'
        return str(value)

    print("=" * 80)
    print("STEP 1: DEDUPLICATE PHYSICAL CNECs")
    print("=" * 80)

    # Load all CNECs
    all_cnecs = pl.read_csv(input_path)
    print(f"\n[INPUT] Loaded {len(all_cnecs)} CNECs from {input_path.name}")
    print(f" Unique EICs: {all_cnecs['cnec_eic'].n_unique()}")

    # Find duplicates. maintain_order=True makes the examples shown below
    # the same on every run.
    duplicates = all_cnecs.filter(pl.col('cnec_eic').is_duplicated())
    dup_eics = duplicates['cnec_eic'].unique(maintain_order=True)
    print(f"\n[DUPLICATES] Found {len(dup_eics)} EICs appearing multiple times:")
    print(f" Total duplicate rows: {len(duplicates)}")

    # Show first 5 duplicate examples
    print("\n[EXAMPLES] First 5 duplicate EICs:")
    for i, eic in enumerate(dup_eics.head(5), 1):
        dup_rows = all_cnecs.filter(pl.col('cnec_eic') == eic)
        print(f"\n {i}. {eic} ({len(dup_rows)} occurrences):")
        for row in dup_rows.iter_rows(named=True):
            # Empty CSV cells are read as null; formatting None with a
            # string/float format spec would raise TypeError, so every
            # field is made printable first.
            name = _as_text(row['cnec_name'])[:60]
            tso = _as_text(row['tso'])
            score = row['importance_score']
            score_text = f"{score:.2f}" if score is not None else "n/a"
            print(f" - {name:<60s} (TSO: {tso:<10s}, Score: {score_text})")

    # Deduplicate: keep highest importance score per EIC.
    # - nulls_last=True: a row without a score must never win over a scored
    #   duplicate (the default places nulls first, even when descending).
    # - maintain_order=True on the sort: ties keep their file order, so the
    #   surviving row is deterministic.
    # - maintain_order=True on unique: the result stays in score order, so
    #   no second sort is needed.
    deduped = (
        all_cnecs
        .sort('importance_score', descending=True, nulls_last=True, maintain_order=True)
        .unique(subset=['cnec_eic'], keep='first', maintain_order=True)
    )
    print(f"\n[DEDUPLICATION] Kept highest importance score per EIC")
    print(f" Before: {len(all_cnecs)} rows, {all_cnecs['cnec_eic'].n_unique()} unique")
    print(f" After: {len(deduped)} rows, {deduped['cnec_eic'].n_unique()} unique")
    print(f" Removed: {len(all_cnecs) - len(deduped)} duplicate rows")

    # Validate
    assert deduped['cnec_eic'].n_unique() == len(deduped), "Deduplication failed - still have duplicates!"
    assert len(deduped) == 168, f"Expected 168 unique CNECs, got {len(deduped)}"

    # Add flags
    deduped = deduped.with_columns([
        pl.lit(False).alias('is_alegro'),
        pl.lit(True).alias('is_physical')
    ])

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    deduped.write_csv(output_path)
    print(f"\n[SAVED] {len(deduped)} unique physical CNECs to {output_path.name}")
    print("=" * 80)
    return deduped
def extract_alegro_cnecs(input_path: Path, output_path: Path) -> pl.DataFrame:
    """Pull the 8 Alegro custom CNECs out of the Tier-1 list and save them.

    Rows are selected when their ``tier`` value contains "alegro"
    (case-insensitive). The selection is flagged with ``is_alegro=True`` /
    ``is_physical=False`` and written as CSV.

    Args:
        input_path: Path to critical_cnecs_tier1_with_alegro.csv
        output_path: Path to save Alegro CNECs; missing parent directories
            are created.

    Returns:
        DataFrame with 8 Alegro CNECs

    Raises:
        AssertionError: If the number of matching rows is not 8.
    """
    separator = "=" * 80
    print("\nSTEP 2: EXTRACT ALEGRO CNECs")
    print(separator)

    # Read the Tier-1 table that also carries the Alegro entries
    tier1_frame = pl.read_csv(input_path)
    print(f"\n[INPUT] Loaded {tier1_frame.height} Tier-1 CNECs from {input_path.name}")

    # Select the rows whose tier label mentions Alegro (any letter case)
    mentions_alegro = pl.col('tier').str.contains('(?i)alegro')
    alegro_rows = tier1_frame.filter(mentions_alegro)
    found = alegro_rows.height
    print(f"\n[ALEGRO] Found {found} Alegro CNECs:")
    for position, record in enumerate(alegro_rows.iter_rows(named=True), start=1):
        print(f" {position}. {record['cnec_eic']:<30s} | {record['cnec_name'][:50]}")

    # Sanity check on the expected count
    assert found == 8, f"Expected 8 Alegro CNECs, found {found}"

    # Tag the rows so they stay distinguishable from physical CNECs
    flagged = alegro_rows.with_columns(
        pl.lit(True).alias('is_alegro'),
        pl.lit(False).alias('is_physical'),
    )

    # Persist the result
    output_path.parent.mkdir(parents=True, exist_ok=True)
    flagged.write_csv(output_path)
    print(f"\n[SAVED] {flagged.height} Alegro CNECs to {output_path.name}")
    print(separator)
    return flagged
def create_master_list(
    physical_path: Path,
    alegro_path: Path,
    output_path: Path
) -> pl.DataFrame:
    """Merge the physical and Alegro CNEC files into one master list.

    Both CSV files are read and stacked, then checked for unique
    ``cnec_eic`` values and a total of 176 rows. The result is sorted by
    descending ``importance_score``, summarised on stdout and written as CSV.

    Args:
        physical_path: Path to deduplicated physical CNECs (168)
        alegro_path: Path to Alegro CNECs (8)
        output_path: Path to save master list (176); missing parent
            directories are created.

    Returns:
        DataFrame with 176 unique CNECs

    Raises:
        AssertionError: If the combined list contains duplicate EICs or does
            not have exactly 176 rows.
    """
    rule = "=" * 80
    print("\nSTEP 3: CREATE MASTER CNEC LIST")
    print(rule)

    # Read the two partial lists
    physical_df = pl.read_csv(physical_path)
    alegro_df = pl.read_csv(alegro_path)
    n_physical = physical_df.height
    n_alegro = alegro_df.height
    print(f"\n[INPUTS]")
    print(f" Physical CNECs: {n_physical}")
    print(f" Alegro CNECs: {n_alegro}")
    print(f" Total: {n_physical + n_alegro}")

    # Stack them into a single frame
    combined = pl.concat([physical_df, alegro_df])
    n_rows = combined.height

    # The master list must contain every EIC exactly once
    assert combined['cnec_eic'].n_unique() == n_rows, "Master list has duplicate EICs!"
    assert n_rows == 176, f"Expected 176 total CNECs, got {n_rows}"

    # Most important CNECs first
    master = combined.sort('importance_score', descending=True)

    # Summary statistics
    print(f"\n[MASTER LIST] Created {master.height} unique CNECs")
    print(f" Physical: {master['is_physical'].sum()} CNECs")
    print(f" Alegro: {master['is_alegro'].sum()} CNECs")
    for tier_label in ('Tier 1', 'Tier 2'):
        in_tier = master.filter(pl.col('tier').str.contains(tier_label))
        print(f" {tier_label}: {in_tier.height} CNECs")

    # Ten most frequent TSOs
    print(f"\n[TSO DISTRIBUTION]")
    per_tso = master.group_by('tso').agg(pl.len().alias('count'))
    top_tsos = per_tso.sort('count', descending=True).head(10)
    for entry in top_tsos.iter_rows(named=True):
        # A missing or empty TSO gets a visible placeholder
        label = entry['tso'] or '(Empty)'
        print(f" {label:<20s}: {entry['count']:>3d} CNECs")

    # Persist the master list
    output_path.parent.mkdir(parents=True, exist_ok=True)
    master.write_csv(output_path)
    print(f"\n[SAVED] Master CNEC list to {output_path}")
    print(rule)
    return master
def main():
    """Build the master CNEC list (176 unique) from the processed-data inputs.

    Resolves the input and output files under ``<repo>/data/processed``,
    exits with status 1 if an input file is missing, then runs the three
    steps (deduplicate, extract Alegro, combine) and prints a summary.
    """
    banner = "=" * 80
    print("\n")
    print(banner)
    print("CREATE MASTER CNEC LIST (176 UNIQUE)")
    print(banner)
    print()

    # All files live under data/processed, one level above this script's folder
    processed_dir = Path(__file__).parent.parent / 'data' / 'processed'
    input_all = processed_dir / 'critical_cnecs_all.csv'
    input_alegro = processed_dir / 'critical_cnecs_tier1_with_alegro.csv'
    output_physical = processed_dir / 'cnecs_physical_168.csv'
    output_alegro = processed_dir / 'cnecs_alegro_8.csv'
    output_master = processed_dir / 'cnecs_master_176.csv'

    # Stop early, with a hint, when a required input is missing
    required_inputs = (
        (input_all, " Please ensure data collection and CNEC identification are complete."),
        (input_alegro, " Please ensure Alegro CNEC list exists."),
    )
    for required_path, hint in required_inputs:
        if required_path.exists():
            continue
        print(f"[ERROR] Input file not found: {required_path}")
        print(hint)
        sys.exit(1)

    # Run the pipeline; step 3 reads the files written by steps 1 and 2
    physical_cnecs = deduplicate_physical_cnecs(input_all, output_physical)
    alegro_cnecs = extract_alegro_cnecs(input_alegro, output_alegro)
    master_cnecs = create_master_list(output_physical, output_alegro, output_master)

    # Final summary
    print("\n")
    print(banner)
    print("SUMMARY")
    print(banner)
    print(f"\nMaster CNEC List Created: {len(master_cnecs)} unique CNECs")
    print(f" - Physical (deduplicated): {len(physical_cnecs)} CNECs")
    print(f" - Alegro (custom): {len(alegro_cnecs)} CNECs")
    print(f"\nOutput Files:")
    for number, produced in enumerate((output_physical, output_alegro), start=1):
        print(f" {number}. {produced.name}")
    print(f" 3. {output_master.name} ⭐ PRIMARY")
    print(f"\nThis master list is the SINGLE SOURCE OF TRUTH for all feature engineering.")
    print("All JAO and ENTSO-E feature processing MUST use this exact list.")
    print(banner)
    print()


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()