fbmc-chronos2/scripts/create_master_cnec_list.py
Evgueni Poloukarov
feat: Phase 1 complete - Master CNEC list + synchronized feature engineering
d4939ce
"""Create master CNEC list with 176 unique CNECs (168 physical + 8 Alegro).
This script:
1. Deduplicates physical CNECs from critical_cnecs_all.csv (200 → 168 unique)
2. Extracts 8 Alegro CNECs from tier1_with_alegro.csv
3. Combines into master list (176 unique)
4. Validates uniqueness and saves
Usage:
python scripts/create_master_cnec_list.py
"""
import sys
from pathlib import Path
import polars as pl
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
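# Input schema note (inferred from the column accesses below, not from a data
# dictionary): both input CSVs are expected to provide at least the columns
# 'cnec_eic', 'cnec_name', 'tso', 'importance_score', and 'tier'.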
def deduplicate_physical_cnecs(input_path: Path, output_path: Path) -> pl.DataFrame:
"""Deduplicate physical CNECs keeping highest importance score per EIC.
Args:
input_path: Path to critical_cnecs_all.csv (200 rows)
output_path: Path to save deduplicated list
Returns:
DataFrame with 168 unique physical CNECs
"""
print("=" * 80)
print("STEP 1: DEDUPLICATE PHYSICAL CNECs")
print("=" * 80)
# Load all CNECs
all_cnecs = pl.read_csv(input_path)
print(f"\n[INPUT] Loaded {len(all_cnecs)} CNECs from {input_path.name}")
print(f" Unique EICs: {all_cnecs['cnec_eic'].n_unique()}")
# Find duplicates
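    # is_duplicated() flags every row whose cnec_eic occurs more than once,
    # so the duplicate count below includes all occurrences, not just the surplus rows.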
duplicates = all_cnecs.filter(pl.col('cnec_eic').is_duplicated())
dup_eics = duplicates['cnec_eic'].unique()
print(f"\n[DUPLICATES] Found {len(dup_eics)} EICs appearing multiple times:")
print(f" Total duplicate rows: {len(duplicates)}")
# Show first 5 duplicate examples
print("\n[EXAMPLES] First 5 duplicate EICs:")
for i, eic in enumerate(dup_eics.head(5), 1):
dup_rows = all_cnecs.filter(pl.col('cnec_eic') == eic)
print(f"\n {i}. {eic} ({len(dup_rows)} occurrences):")
for row in dup_rows.iter_rows(named=True):
print(f" - {row['cnec_name'][:60]:<60s} (TSO: {row['tso']:<10s}, Score: {row['importance_score']:.2f})")
# Deduplicate: Keep highest importance score per EIC
deduped = (
all_cnecs
.sort('importance_score', descending=True) # Highest score first
.unique(subset=['cnec_eic'], keep='first') # Keep first (highest score)
.sort('importance_score', descending=True) # Re-sort by score
)
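    # The descending sort puts the highest-scoring row first for each EIC, and
    # unique(keep='first') retains that first occurrence. A sketch of an
    # equivalent formulation (assuming the same polars group_by API):
    #   all_cnecs.sort('importance_score', descending=True)
    #            .group_by('cnec_eic', maintain_order=True)
    #            .first()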
print(f"\n[DEDUPLICATION] Kept highest importance score per EIC")
print(f" Before: {len(all_cnecs)} rows, {all_cnecs['cnec_eic'].n_unique()} unique")
print(f" After: {len(deduped)} rows, {deduped['cnec_eic'].n_unique()} unique")
print(f" Removed: {len(all_cnecs) - len(deduped)} duplicate rows")
# Validate
assert deduped['cnec_eic'].n_unique() == len(deduped), "Deduplication failed - still have duplicates!"
assert len(deduped) == 168, f"Expected 168 unique CNECs, got {len(deduped)}"
# Add flags
deduped = deduped.with_columns([
pl.lit(False).alias('is_alegro'),
pl.lit(True).alias('is_physical')
])
# Save
output_path.parent.mkdir(parents=True, exist_ok=True)
deduped.write_csv(output_path)
print(f"\n[SAVED] {len(deduped)} unique physical CNECs to {output_path.name}")
print("=" * 80)
return deduped
def extract_alegro_cnecs(input_path: Path, output_path: Path) -> pl.DataFrame:
"""Extract 8 Alegro custom CNECs from tier1_with_alegro.csv.
Args:
input_path: Path to critical_cnecs_tier1_with_alegro.csv
output_path: Path to save Alegro CNECs
Returns:
DataFrame with 8 Alegro CNECs
"""
print("\nSTEP 2: EXTRACT ALEGRO CNECs")
print("=" * 80)
# Load tier1 with Alegro
tier1 = pl.read_csv(input_path)
print(f"\n[INPUT] Loaded {len(tier1)} Tier-1 CNECs from {input_path.name}")
# Filter Alegro CNECs (rows where tier contains "Alegro")
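    # str.contains() interprets the pattern as a regex by default; the (?i)
    # flag makes the match case-insensitive, so 'Alegro', 'ALEGRO', etc. all match.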
alegro = tier1.filter(pl.col('tier').str.contains('(?i)alegro'))
print(f"\n[ALEGRO] Found {len(alegro)} Alegro CNECs:")
for i, row in enumerate(alegro.iter_rows(named=True), 1):
print(f" {i}. {row['cnec_eic']:<30s} | {row['cnec_name'][:50]}")
# Validate
assert len(alegro) == 8, f"Expected 8 Alegro CNECs, found {len(alegro)}"
# Add flags
alegro = alegro.with_columns([
pl.lit(True).alias('is_alegro'),
pl.lit(False).alias('is_physical')
])
# Save
output_path.parent.mkdir(parents=True, exist_ok=True)
alegro.write_csv(output_path)
print(f"\n[SAVED] {len(alegro)} Alegro CNECs to {output_path.name}")
print("=" * 80)
return alegro
def create_master_list(
physical_path: Path,
alegro_path: Path,
output_path: Path
) -> pl.DataFrame:
"""Combine physical and Alegro CNECs into master list.
Args:
physical_path: Path to deduplicated physical CNECs (168)
alegro_path: Path to Alegro CNECs (8)
output_path: Path to save master list (176)
Returns:
DataFrame with 176 unique CNECs
"""
print("\nSTEP 3: CREATE MASTER CNEC LIST")
print("=" * 80)
# Load both
physical = pl.read_csv(physical_path)
alegro = pl.read_csv(alegro_path)
print(f"\n[INPUTS]")
print(f" Physical CNECs: {len(physical)}")
print(f" Alegro CNECs: {len(alegro)}")
print(f" Total: {len(physical) + len(alegro)}")
# Combine
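    # pl.concat() defaults to a vertical concatenation, which requires both
    # frames to share the same schema; that is assumed (not verified here) to
    # hold for the two intermediate CSVs. If the column sets ever diverge,
    # how='diagonal' would align them by filling missing columns with nulls.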
master = pl.concat([physical, alegro])
# Validate uniqueness
assert master['cnec_eic'].n_unique() == len(master), "Master list has duplicate EICs!"
assert len(master) == 176, f"Expected 176 total CNECs, got {len(master)}"
# Sort by importance score
master = master.sort('importance_score', descending=True)
# Summary statistics
print(f"\n[MASTER LIST] Created {len(master)} unique CNECs")
print(f" Physical: {master['is_physical'].sum()} CNECs")
print(f" Alegro: {master['is_alegro'].sum()} CNECs")
print(f" Tier 1: {master.filter(pl.col('tier').str.contains('Tier 1')).shape[0]} CNECs")
print(f" Tier 2: {master.filter(pl.col('tier').str.contains('Tier 2')).shape[0]} CNECs")
# TSO distribution
print(f"\n[TSO DISTRIBUTION]")
tso_dist = (
master
.group_by('tso')
.agg(pl.len().alias('count'))
.sort('count', descending=True)
.head(10)
)
for row in tso_dist.iter_rows(named=True):
tso_name = row['tso'] if row['tso'] else '(Empty)'
print(f" {tso_name:<20s}: {row['count']:>3d} CNECs")
# Save
output_path.parent.mkdir(parents=True, exist_ok=True)
master.write_csv(output_path)
print(f"\n[SAVED] Master CNEC list to {output_path}")
print("=" * 80)
return master
def main():
"""Create master CNEC list (176 unique)."""
print("\n")
print("=" * 80)
print("CREATE MASTER CNEC LIST (176 UNIQUE)")
print("=" * 80)
print()
# Paths
base_dir = Path(__file__).parent.parent
data_dir = base_dir / 'data' / 'processed'
input_all = data_dir / 'critical_cnecs_all.csv'
input_alegro = data_dir / 'critical_cnecs_tier1_with_alegro.csv'
output_physical = data_dir / 'cnecs_physical_168.csv'
output_alegro = data_dir / 'cnecs_alegro_8.csv'
output_master = data_dir / 'cnecs_master_176.csv'
# Validate inputs exist
if not input_all.exists():
print(f"[ERROR] Input file not found: {input_all}")
print(" Please ensure data collection and CNEC identification are complete.")
sys.exit(1)
if not input_alegro.exists():
print(f"[ERROR] Input file not found: {input_alegro}")
print(" Please ensure Alegro CNEC list exists.")
sys.exit(1)
# Execute steps
physical_cnecs = deduplicate_physical_cnecs(input_all, output_physical)
alegro_cnecs = extract_alegro_cnecs(input_alegro, output_alegro)
master_cnecs = create_master_list(output_physical, output_alegro, output_master)
# Final summary
print("\n")
print("=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"\nMaster CNEC List Created: {len(master_cnecs)} unique CNECs")
print(f" - Physical (deduplicated): {len(physical_cnecs)} CNECs")
print(f" - Alegro (custom): {len(alegro_cnecs)} CNECs")
print(f"\nOutput Files:")
print(f" 1. {output_physical.name}")
print(f" 2. {output_alegro.name}")
print(f" 3. {output_master.name} ⭐ PRIMARY")
print(f"\nThis master list is the SINGLE SOURCE OF TRUTH for all feature engineering.")
print("All JAO and ENTSO-E feature processing MUST use this exact list.")
print("=" * 80)
print()
if __name__ == "__main__":
main()