# fbmc-chronos2 / scripts/final_validation.py
# Evgueni Poloukarov
# feat: complete Phase 1 ENTSO-E asset-specific outage validation
# 27cb60a
# raw
# history blame
# 3.43 kB
"""Final validation of complete 24-month LTA + Net Positions datasets."""
import polars as pl
from pathlib import Path
# Opening banner: a blank line, then the report title framed by two
# 80-character rules. One print call with a newline separator emits the same
# bytes as three consecutive prints.
print(
    "\n" + "=" * 80,
    "FINAL DATA COLLECTION VALIDATION",
    "=" * 80,
    sep="\n",
)
# =========================================================================
# LTA Dataset
# =========================================================================
# Load the LTA parquet file and print its size, time span and October
# coverage. `lta`, `days_2023`, `days_2024` and `masked_2023` are read again
# by the summary section at the end of the script, so their names are fixed.
lta_path = Path('data/raw/phase1_24month/jao_lta.parquet')
lta = pl.read_parquet(lta_path)
lta_mtu = lta['mtu']

print("\n[1/2] LTA (Long Term Allocations)")
print("-" * 80)
print(f" Records: {len(lta):,}")
# The breakdown assumes one timestamp column plus two masking-flag columns;
# every other column is counted as a border.
print(f" Columns: {len(lta.columns)} (1 timestamp + {len(lta.columns)-3} borders + 2 masking flags)")
print(f" File size: {lta_path.stat().st_size / (1024**2):.2f} MB")
print(f" Date range: {lta_mtu.min()} to {lta_mtu.max()}")
print(f" Unique timestamps: {lta_mtu.n_unique():,}")

# Check October 2023
oct_2023 = lta.filter((pl.col('mtu').dt.year() == 2023) & (pl.col('mtu').dt.month() == 10))
days_2023 = sorted(oct_2023['mtu'].dt.day().unique().to_list())
masked_2023 = oct_2023.filter(pl.col('is_masked') == True)
# The percentage is taken over the whole dataset, not just October. An empty
# file would otherwise raise ZeroDivisionError here and abort the report
# before the summary can flag the data as incomplete, so report 0% instead.
masked_pct = len(masked_2023) / len(lta) * 100 if len(lta) else 0.0
print("\n October 2023:")
print(f" Days present: {days_2023}")
print(f" Total records: {len(oct_2023)}")
print(f" Masked records: {len(masked_2023)} ({masked_pct:.3f}%)")

# Check October 2024
oct_2024 = lta.filter((pl.col('mtu').dt.year() == 2024) & (pl.col('mtu').dt.month() == 10))
days_2024 = sorted(oct_2024['mtu'].dt.day().unique().to_list())
print("\n October 2024:")
print(f" Days present: {days_2024}")
print(f" Total records: {len(oct_2024)}")
# =========================================================================
# Net Positions Dataset
# =========================================================================
# Load the net-positions parquet file and print its size, time span and
# day-level coverage. `np_df` and `expected_days` are read again by the
# summary section at the end of the script, so their names are fixed.
np_path = Path('data/raw/phase1_24month/jao_net_positions.parquet')
np_df = pl.read_parquet(np_path)

print("\n[2/2] Net Positions (Domain Boundaries)", "-" * 80, sep="\n")
print(f" Records: {len(np_df):,}")
np_column_count = len(np_df.columns)
print(f" Columns: {np_column_count} (1 timestamp + 28 zones + 1 collection_date)")
np_size_mb = np_path.stat().st_size / 1024**2
print(f" File size: {np_size_mb:.2f} MB")
np_mtu = np_df['mtu']
print(f" Date range: {np_mtu.min()} to {np_mtu.max()}")
# Number of distinct calendar days covered by the timestamps; used for both
# the raw count and the coverage ratio below.
np_day_count = np_mtu.dt.date().n_unique()
print(f" Unique dates: {np_day_count}")

# Expected: Oct 1, 2023 to Sep 30, 2025 inclusive = 366 + 365 = 731 days
expected_days = 731
print(f" Expected days: {expected_days}")
np_coverage_pct = np_day_count / expected_days * 100
print(f" Coverage: {np_coverage_pct:.1f}%")
# =========================================================================
# Summary
# =========================================================================
# Final verdict: LTA must contain every day of October (1..31) in both 2023
# and 2024, and Net Positions must cover the expected number of days.
print("\n" + "=" * 80, "COLLECTION STATUS", "=" * 80, sep="\n")

full_october = list(range(1, 32))
lta_complete = all(days == full_october for days in (days_2023, days_2024))

np_days_covered = np_df['mtu'].dt.date().n_unique()
np_complete = np_days_covered >= expected_days - 1  # Allow 1 day variance

if not (lta_complete and np_complete):
    print("[WARNING] Data collection incomplete:")
    # Report each failing dataset separately; both may fail at once.
    if not lta_complete:
        print(" - LTA October coverage issue")
    if not np_complete:
        print(f" - Net Positions has {np_days_covered}/{expected_days} expected days")
else:
    print("[SUCCESS] Data collection complete!")
    print(f" ✓ LTA: {len(lta):,} records with {len(masked_2023)} masked (Oct 27-31, 2023)")
    print(f" ✓ Net Positions: {len(np_df):,} records covering {np_days_covered} days")

print("=" * 80)
print()