"""Validate unified JAO data and engineered features.
Checks:
1. Timeline: hourly, no gaps, sorted
2. Feature completeness: null percentages
3. Data leakage: future data not in historical features
4. Summary statistics
Author: Claude
Date: 2025-11-06
"""
import polars as pl
from pathlib import Path
banner = "=" * 80
print("\n" + banner)
print("JAO DATA VALIDATION")
print(banner)

# =========================================================================
# 1. Load datasets
# =========================================================================
print("\nLoading datasets...")

# All inputs come from the processed-data directory produced upstream.
_processed_dir = Path('data/processed')
unified_path = _processed_dir / 'unified_jao_24month.parquet'
cnec_path = _processed_dir / 'cnec_hourly_24month.parquet'
features_path = _processed_dir / 'features_jao_24month.parquet'

unified = pl.read_parquet(unified_path)
cnec = pl.read_parquet(cnec_path)
features = pl.read_parquet(features_path)

for _label, _frame in (("Unified JAO", unified), ("CNEC hourly", cnec), ("Features", features)):
    print(f" {_label}: {_frame.shape}")
# =========================================================================
# 2. Timeline Validation
# =========================================================================
rule = "-" * 80
print("\n" + rule)
print("[1/4] TIMELINE VALIDATION")
print(rule)

# Timestamps must be monotonically increasing.
is_sorted = unified['mtu'].is_sorted()
sorted_tag = '[PASS]' if is_sorted else '[FAIL]'
print(f" Timeline sorted: {sorted_tag}")

# The dominant step between consecutive timestamps should be exactly one hour.
deltas = unified['mtu'].diff().drop_nulls()
most_common_diff = deltas.mode()[0]
hourly_expected = most_common_diff.total_seconds() == 3600
print(f" Most common time diff: {most_common_diff}")
print(f" Hourly intervals: {'[PASS]' if hourly_expected else '[FAIL]'}")

# Coverage of the expected period.
min_date, max_date = unified['mtu'].min(), unified['mtu'].max()
print(f" Date range: {min_date} to {max_date}")
print(f" Total hours: {len(unified):,}")
# Expected: Oct 2023 to Sept 2025 = ~24 months
# After deduplication: 17,544 hours (729.75 days = ~24 months)
expected_days = (max_date - min_date).days + 1
print(f" Days covered: {expected_days} (~{expected_days / 30:.1f} months)")
# =========================================================================
# 3. Feature Completeness
# =========================================================================
print("\n" + "-" * 80)
print("[2/4] FEATURE COMPLETENESS")
print("-" * 80)


def _category_null_pct(df, cols):
    """Return the percentage of null cells across *cols* of *df*.

    Returns None when *cols* is empty — the original code divided by
    ``len(df) * len(cols)`` and would raise ZeroDivisionError for an
    empty category.
    """
    if not cols:
        return None
    nulls = df.select(cols).null_count().sum_horizontal()[0]
    return nulls / (len(df) * len(cols)) * 100


# Count features by category (prefix conventions from the feature-engineering step).
cnec_t1_cols = [c for c in features.columns if c.startswith('cnec_t1_')]
cnec_t2_cols = [c for c in features.columns if c.startswith('cnec_t2_')]
lta_cols = [c for c in features.columns if c.startswith('lta_')]
temporal_cols = [c for c in features.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
target_cols = [c for c in features.columns if c.startswith('target_')]
print(f" Tier-1 CNEC features: {len(cnec_t1_cols)}")
print(f" Tier-2 CNEC features: {len(cnec_t2_cols)}")
print(f" LTA features: {len(lta_cols)}")
print(f" Temporal features: {len(temporal_cols)}")
print(f" Target variables: {len(target_cols)}")
print(f" Total features: {features.shape[1] - 1} (excluding mtu)")

# Null percentages per category, guarded against empty categories.
print("\n Null percentages:")
for _label, _cols in [("Tier-1 CNEC", cnec_t1_cols),
                      ("Tier-2 CNEC", cnec_t2_cols),
                      ("LTA", lta_cols),
                      ("Temporal", temporal_cols),
                      ("Targets", target_cols)]:
    _pct = _category_null_pct(features, _cols)
    if _pct is None:
        print(f" {_label}: n/a (no columns found)")
    else:
        print(f" {_label}: {_pct:.2f}% nulls")

# Overall null percentage across the whole feature frame.
total_nulls = features.null_count().sum_horizontal()[0]
total_cells = len(features) * len(features.columns)
overall_null_pct = total_nulls / total_cells * 100
print(f"\n Overall null percentage: {overall_null_pct:.2f}%")
if overall_null_pct < 60:
    print(" Completeness: [PASS] (<60% nulls)")
else:
    # Original message rendered as "(>61.2% nulls)" — the '>' made the actual
    # value read as a threshold. Report the value against the threshold instead.
    print(f" Completeness: [WARNING] ({overall_null_pct:.1f}% >= 60% nulls)")
# =========================================================================
# 4. Data Leakage Check
# =========================================================================
print("\n" + "-" * 80)
print("[3/4] DATA LEAKAGE CHECK")
print("-" * 80)

# LTA values are future covariates (known in advance) and should have NO nulls.
# NOTE(review): the previous filter counted every 'border_*' column, but the
# summary section ([4/4]) defines MaxBEX borders as the 'border_' columns
# WITHOUT 'lta' in the name — so that filter was checking MaxBEX, not LTA.
# Prefer the 'lta'-containing border columns; fall back to the old behaviour
# if none exist. TODO: confirm against the unified parquet schema.
_lta_cols = [c for c in unified.columns if 'border_' in c and 'lta' in c.lower()]
if _lta_cols:
    lta_null_count = unified.select(_lta_cols).null_count().sum_horizontal()[0]
else:
    # Fallback: original (likely mislabeled) selection, kept so the check
    # still runs on schemas without explicit LTA-tagged columns.
    lta_null_count = unified.select([c for c in unified.columns if c.startswith('border_')]).null_count().sum_horizontal()[0]
print(f" LTA nulls: {lta_null_count}")
if lta_null_count == 0:
    print(" LTA future covariates: [PASS] (no nulls)")
else:
    print(f" LTA future covariates: [WARNING] ({lta_null_count} nulls)")

# Historical features should have lags (shift creates nulls at start).
# Check that lag features are present at all; their leading nulls are expected.
has_lag_features = any('_L' in c for c in features.columns)
if has_lag_features:
    print(" Historical lag features: [PRESENT] (nulls expected at start)")
else:
    print(" Historical lag features: [WARNING] (no lag features found)")
# =========================================================================
# 5. Summary Statistics
# =========================================================================
print("\n" + "-" * 80)
print("[4/4] SUMMARY STATISTICS")
print("-" * 80)

print("\nUnified JAO Data:")
print(f" Rows: {len(unified):,}")
print(f" Columns: {len(unified.columns)}")
# MaxBEX borders: 'border_' columns without 'lta' in the name.
maxbex_borders = [c for c in unified.columns if 'border_' in c and 'lta' not in c.lower()]
# LTA borders: the complementary 'border_' columns WITH 'lta' in the name.
# The previous filter (startswith('border_')) re-counted the MaxBEX set under
# the "LTA borders" label, contradicting the MaxBEX definition one line above.
lta_borders = [c for c in unified.columns if 'border_' in c and 'lta' in c.lower()]
print(f" MaxBEX borders: {len(maxbex_borders)}")
print(f" LTA borders: {len(lta_borders)}")
print(f" Net Positions: {len([c for c in unified.columns if c.startswith('netpos_')])}")

print("\nCNEC Hourly Data:")
print(f" Total CNEC records: {len(cnec):,}")
print(f" Unique CNECs: {cnec['cnec_eic'].n_unique()}")
# Hoist the unique-timestamp count: it was computed twice before.
_unique_mtus = cnec['mtu'].n_unique()
print(f" Unique timestamps: {_unique_mtus:,}")
print(f" CNECs per timestamp: {len(cnec) / _unique_mtus:.1f} avg")

print("\nFeature Engineering:")
print(f" Total features: {features.shape[1] - 1}")
print(f" Feature rows: {len(features):,}")
print(f" File size: {features_path.stat().st_size / (1024**2):.2f} MB")
# =========================================================================
# Validation Summary
# =========================================================================
print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)

# Table of (check result, message when passed, message when not passed).
# Failed checks print either a [FAIL] or a [WARNING] line, matching the
# severity each check carries.
_checks = [
    (is_sorted and hourly_expected,
     " [PASS] Timeline validation PASSED",
     " [FAIL] Timeline validation FAILED"),
    (overall_null_pct < 60,
     " [PASS] Feature completeness PASSED",
     " [WARNING] Feature completeness WARNING (high nulls)"),
    (lta_null_count == 0 and has_lag_features,
     " [PASS] Data leakage check PASSED",
     " [WARNING] Data leakage check WARNING"),
    (len(unified) == len(features),
     " [PASS] Data consistency PASSED",
     " [FAIL] Data consistency FAILED (row mismatch)"),
]

total_checks = len(_checks)
checks_passed = 0
for _ok, _pass_msg, _fail_msg in _checks:
    print(_pass_msg if _ok else _fail_msg)
    checks_passed += int(_ok)

print(f"\nChecks passed: {checks_passed}/{total_checks}")
if checks_passed == total_checks:
    print("\n[SUCCESS] All validation checks PASSED")
elif checks_passed >= total_checks - 1:
    print("\n[WARNING] Minor issues detected")
else:
    print("\n[FAILURE] Critical issues detected")
print("=" * 80)
print()