File size: 8,516 Bytes
d4939ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
"""Create master CNEC list with 176 unique CNECs (168 physical + 8 Alegro).

This script:
1. Deduplicates physical CNECs from critical_cnecs_all.csv (200 → 168 unique)
2. Extracts 8 Alegro CNECs from tier1_with_alegro.csv
3. Combines into master list (176 unique)
4. Validates uniqueness and saves

Usage:
    python scripts/create_master_cnec_list.py
"""

import sys
from pathlib import Path
import polars as pl

# Put the ``src`` directory (two levels up from this file, then ``src``) at
# the front of sys.path so modules located there can be imported.
# NOTE(review): nothing visible in this script imports from ``src`` —
# confirm whether this bootstrap is still needed.
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))


def deduplicate_physical_cnecs(
    input_path: Path,
    output_path: Path,
    *,
    expected_count: int = 168,
) -> pl.DataFrame:
    """Deduplicate physical CNECs keeping highest importance score per EIC.

    Reads the CSV at ``input_path``, prints a short duplicate report, keeps
    exactly one row per ``cnec_eic`` (the row with the highest
    ``importance_score``; on a score tie the row that appears first in the
    input wins), tags the result with ``is_alegro=False`` /
    ``is_physical=True`` and writes it to ``output_path``. Parent
    directories of ``output_path`` are created when missing.

    Args:
        input_path: Path to critical_cnecs_all.csv (200 rows)
        output_path: Path to save deduplicated list
        expected_count: Number of unique CNECs the deduplicated list must
            contain. Defaults to 168.

    Returns:
        DataFrame with ``expected_count`` unique physical CNECs, ordered by
        descending importance score.

    Raises:
        AssertionError: If duplicate EICs remain after deduplication or the
            number of rows differs from ``expected_count``. Nothing is
            written to ``output_path`` in that case.
    """
    print("=" * 80)
    print("STEP 1: DEDUPLICATE PHYSICAL CNECs")
    print("=" * 80)

    # Load all CNECs
    all_cnecs = pl.read_csv(input_path)
    print(f"\n[INPUT] Loaded {len(all_cnecs)} CNECs from {input_path.name}")
    print(f"        Unique EICs: {all_cnecs['cnec_eic'].n_unique()}")

    # Find duplicates; keep first-seen order so the examples printed below
    # are the same on every run.
    duplicates = all_cnecs.filter(pl.col('cnec_eic').is_duplicated())
    dup_eics = duplicates['cnec_eic'].unique(maintain_order=True)

    print(f"\n[DUPLICATES] Found {len(dup_eics)} EICs appearing multiple times:")
    print(f"             Total duplicate rows: {len(duplicates)}")

    # Show first 5 duplicate examples
    print("\n[EXAMPLES] First 5 duplicate EICs:")
    for i, eic in enumerate(dup_eics.head(5), 1):
        dup_rows = all_cnecs.filter(pl.col('cnec_eic') == eic)
        print(f"\n  {i}. {eic} ({len(dup_rows)} occurrences):")
        for row in dup_rows.iter_rows(named=True):
            # Empty CSV cells may arrive as None; formatting None with a
            # string format spec (or slicing it) would raise TypeError.
            name = (row['cnec_name'] or '')[:60]
            tso = row['tso'] or '(Empty)'
            print(f"     - {name:<60s} (TSO: {tso:<10s}, Score: {row['importance_score']:.2f})")

    # Deduplicate: keep highest importance score per EIC.
    # maintain_order=True on both steps makes the tie-break and the output
    # row order reproducible; the frame stays sorted by descending score, so
    # no second sort is needed.
    deduped = (
        all_cnecs
        .sort('importance_score', descending=True, maintain_order=True)
        .unique(subset=['cnec_eic'], keep='first', maintain_order=True)
    )

    print("\n[DEDUPLICATION] Kept highest importance score per EIC")
    print(f"                Before: {len(all_cnecs)} rows, {all_cnecs['cnec_eic'].n_unique()} unique")
    print(f"                After:  {len(deduped)} rows, {deduped['cnec_eic'].n_unique()} unique")
    print(f"                Removed: {len(all_cnecs) - len(deduped)} duplicate rows")

    # Validate with explicit raises: ``assert`` statements are removed under
    # ``python -O`` and would let an invalid list be written silently.
    if deduped['cnec_eic'].n_unique() != len(deduped):
        raise AssertionError("Deduplication failed - still have duplicates!")
    if len(deduped) != expected_count:
        raise AssertionError(
            f"Expected {expected_count} unique CNECs, got {len(deduped)}"
        )

    # Add flags
    deduped = deduped.with_columns([
        pl.lit(False).alias('is_alegro'),
        pl.lit(True).alias('is_physical'),
    ])

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    deduped.write_csv(output_path)

    print(f"\n[SAVED] {len(deduped)} unique physical CNECs to {output_path.name}")
    print("=" * 80)

    return deduped


def extract_alegro_cnecs(
    input_path: Path,
    output_path: Path,
    *,
    expected_count: int = 8,
) -> pl.DataFrame:
    """Extract 8 Alegro custom CNECs from tier1_with_alegro.csv.

    Reads the CSV at ``input_path``, keeps the rows whose ``tier`` column
    contains "alegro" (case-insensitive), tags them with
    ``is_alegro=True`` / ``is_physical=False`` and writes them to
    ``output_path``. Parent directories of ``output_path`` are created when
    missing.

    Args:
        input_path: Path to critical_cnecs_tier1_with_alegro.csv
        output_path: Path to save Alegro CNECs
        expected_count: Number of Alegro CNECs that must be found.
            Defaults to 8.

    Returns:
        DataFrame with ``expected_count`` Alegro CNECs

    Raises:
        AssertionError: If the number of Alegro rows differs from
            ``expected_count``. Nothing is written to ``output_path`` in
            that case.
    """
    print("\nSTEP 2: EXTRACT ALEGRO CNECs")
    print("=" * 80)

    # Load tier1 with Alegro
    tier1 = pl.read_csv(input_path)
    print(f"\n[INPUT] Loaded {len(tier1)} Tier-1 CNECs from {input_path.name}")

    # Filter Alegro CNECs (rows where tier contains "Alegro"; the inline
    # (?i) flag makes the match case-insensitive)
    alegro = tier1.filter(pl.col('tier').str.contains('(?i)alegro'))

    print(f"\n[ALEGRO] Found {len(alegro)} Alegro CNECs:")
    for i, row in enumerate(alegro.iter_rows(named=True), 1):
        # Empty CSV cells may arrive as None; formatting None with a string
        # format spec (or slicing it) would raise TypeError.
        eic = row['cnec_eic'] or '(Empty)'
        name = (row['cnec_name'] or '')[:50]
        print(f"  {i}. {eic:<30s} | {name}")

    # Validate with an explicit raise: ``assert`` statements are removed
    # under ``python -O`` and would let a wrong list be written silently.
    if len(alegro) != expected_count:
        raise AssertionError(
            f"Expected {expected_count} Alegro CNECs, found {len(alegro)}"
        )

    # Add flags
    alegro = alegro.with_columns([
        pl.lit(True).alias('is_alegro'),
        pl.lit(False).alias('is_physical'),
    ])

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    alegro.write_csv(output_path)

    print(f"\n[SAVED] {len(alegro)} Alegro CNECs to {output_path.name}")
    print("=" * 80)

    return alegro


def create_master_list(
    physical_path: Path,
    alegro_path: Path,
    output_path: Path,
    *,
    expected_count: int = 176,
) -> pl.DataFrame:
    """Combine physical and Alegro CNECs into master list.

    Reads both CSVs, stacks them, checks that every ``cnec_eic`` occurs once
    and that the total row count matches ``expected_count``, sorts by
    descending ``importance_score``, prints summary statistics and writes
    the result to ``output_path``. Parent directories of ``output_path`` are
    created when missing.

    Args:
        physical_path: Path to deduplicated physical CNECs (168)
        alegro_path: Path to Alegro CNECs (8)
        output_path: Path to save master list (176)
        expected_count: Total number of CNECs the master list must contain.
            Defaults to 176.

    Returns:
        DataFrame with ``expected_count`` unique CNECs

    Raises:
        AssertionError: If the combined list contains duplicate EICs or its
            length differs from ``expected_count``. Nothing is written to
            ``output_path`` in that case.
    """
    print("\nSTEP 3: CREATE MASTER CNEC LIST")
    print("=" * 80)

    # Load both
    physical = pl.read_csv(physical_path)
    alegro = pl.read_csv(alegro_path)

    print("\n[INPUTS]")
    print(f"  Physical CNECs: {len(physical)}")
    print(f"  Alegro CNECs:   {len(alegro)}")
    print(f"  Total:          {len(physical) + len(alegro)}")

    # Combine
    master = pl.concat([physical, alegro])

    # Validate uniqueness with explicit raises: ``assert`` statements are
    # removed under ``python -O`` and would let an invalid master list be
    # written silently.
    if master['cnec_eic'].n_unique() != len(master):
        raise AssertionError("Master list has duplicate EICs!")
    if len(master) != expected_count:
        raise AssertionError(
            f"Expected {expected_count} total CNECs, got {len(master)}"
        )

    # Sort by importance score; maintain_order=True keeps rows with equal
    # scores in their concatenation order so the saved file is reproducible.
    master = master.sort('importance_score', descending=True, maintain_order=True)

    # Summary statistics
    print(f"\n[MASTER LIST] Created {len(master)} unique CNECs")
    print(f"  Physical: {master['is_physical'].sum()} CNECs")
    print(f"  Alegro:   {master['is_alegro'].sum()} CNECs")
    print(f"  Tier 1:   {master.filter(pl.col('tier').str.contains('Tier 1')).shape[0]} CNECs")
    print(f"  Tier 2:   {master.filter(pl.col('tier').str.contains('Tier 2')).shape[0]} CNECs")

    # TSO distribution (ten most frequent TSOs)
    print("\n[TSO DISTRIBUTION]")
    tso_dist = (
        master
        .group_by('tso')
        .agg(pl.len().alias('count'))
        .sort('count', descending=True)
        .head(10)
    )
    for row in tso_dist.iter_rows(named=True):
        # A missing/empty TSO is shown with a placeholder label.
        tso_name = row['tso'] if row['tso'] else '(Empty)'
        print(f"  {tso_name:<20s}: {row['count']:>3d} CNECs")

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    master.write_csv(output_path)

    print(f"\n[SAVED] Master CNEC list to {output_path}")
    print("=" * 80)

    return master


def main():
    """Run the three pipeline steps and print a closing summary.

    Resolves the input and output CSV locations under ``data/processed``
    (relative to the directory two levels above this file), exits with
    status 1 if either input file is missing, then deduplicates the
    physical CNECs, extracts the Alegro CNECs and combines both into the
    master list.
    """
    rule = "=" * 80

    for line in ("\n", rule, "CREATE MASTER CNEC LIST (176 UNIQUE)", rule, ""):
        print(line)

    # Locations of inputs and outputs
    processed_dir = Path(__file__).parent.parent / 'data' / 'processed'

    all_cnecs_csv = processed_dir / 'critical_cnecs_all.csv'
    tier1_alegro_csv = processed_dir / 'critical_cnecs_tier1_with_alegro.csv'

    output_physical = processed_dir / 'cnecs_physical_168.csv'
    output_alegro = processed_dir / 'cnecs_alegro_8.csv'
    output_master = processed_dir / 'cnecs_master_176.csv'

    # Each required input is paired with the hint shown when it is missing;
    # the first missing file aborts the run.
    required_inputs = (
        (
            all_cnecs_csv,
            "        Please ensure data collection and CNEC identification are complete.",
        ),
        (
            tier1_alegro_csv,
            "        Please ensure Alegro CNEC list exists.",
        ),
    )
    for csv_path, hint in required_inputs:
        if csv_path.exists():
            continue
        print(f"[ERROR] Input file not found: {csv_path}")
        print(hint)
        sys.exit(1)

    # Steps 1-3; step 3 re-reads the files written by steps 1 and 2.
    physical_cnecs = deduplicate_physical_cnecs(all_cnecs_csv, output_physical)
    alegro_cnecs = extract_alegro_cnecs(tier1_alegro_csv, output_alegro)
    master_cnecs = create_master_list(output_physical, output_alegro, output_master)

    # Closing summary
    summary_lines = (
        "\n",
        rule,
        "SUMMARY",
        rule,
        f"\nMaster CNEC List Created: {len(master_cnecs)} unique CNECs",
        f"  - Physical (deduplicated): {len(physical_cnecs)} CNECs",
        f"  - Alegro (custom):         {len(alegro_cnecs)} CNECs",
        "\nOutput Files:",
        f"  1. {output_physical.name}",
        f"  2. {output_alegro.name}",
        f"  3. {output_master.name} ⭐ PRIMARY",
        "\nThis master list is the SINGLE SOURCE OF TRUTH for all feature engineering.",
        "All JAO and ENTSO-E feature processing MUST use this exact list.",
        rule,
        "",
    )
    for line in summary_lines:
        print(line)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()