"""Validate unified JAO data and engineered features.
Checks:
1. Timeline: hourly, no gaps, sorted
2. Feature completeness: null percentages
3. Data leakage: future data not in historical features
4. Summary statistics
Author: Claude
Date: 2025-11-06
"""
import polars as pl
from pathlib import Path
banner = "=" * 80
print("\n" + banner)
print("JAO DATA VALIDATION")
print(banner)

# =========================================================================
# 1. Load datasets
# =========================================================================
print("\nLoading datasets...")

# All inputs come from the processed-data directory produced upstream.
_processed_dir = Path('data/processed')
unified_path = _processed_dir / 'unified_jao_24month.parquet'
cnec_path = _processed_dir / 'cnec_hourly_24month.parquet'
features_path = _processed_dir / 'features_jao_24month.parquet'

unified = pl.read_parquet(unified_path)
cnec = pl.read_parquet(cnec_path)
features = pl.read_parquet(features_path)

for _label, _frame in (("Unified JAO", unified), ("CNEC hourly", cnec), ("Features", features)):
    print(f" {_label}: {_frame.shape}")
# =========================================================================
# 2. Timeline Validation
# =========================================================================
rule = "-" * 80
print("\n" + rule)
print("[1/4] TIMELINE VALIDATION")
print(rule)

# Timestamps must be monotonically increasing.
is_sorted = unified['mtu'].is_sorted()
sorted_tag = '[PASS]' if is_sorted else '[FAIL]'
print(f" Timeline sorted: {sorted_tag}")

# The dominant step between consecutive timestamps should be exactly one hour.
deltas = unified['mtu'].diff().drop_nulls()
most_common_diff = deltas.mode()[0]
hourly_expected = most_common_diff.total_seconds() == 3600
print(f" Most common time diff: {most_common_diff}")
print(f" Hourly intervals: {'[PASS]' if hourly_expected else '[FAIL]'}")

# Coverage of the expected period.
min_date, max_date = unified['mtu'].min(), unified['mtu'].max()
print(f" Date range: {min_date} to {max_date}")
print(f" Total hours: {len(unified):,}")
# Expected: Oct 2023 to Sept 2025 = ~24 months
# After deduplication: 17,544 hours (729.75 days = ~24 months)
expected_days = (max_date - min_date).days + 1
print(f" Days covered: {expected_days} (~{expected_days / 30:.1f} months)")
# =========================================================================
# 3. Feature Completeness
# =========================================================================
print("\n" + "-" * 80)
print("[2/4] FEATURE COMPLETENESS")
print("-" * 80)


def _category_null_pct(df, cols):
    """Return the percentage of null cells across *cols* of *df*.

    Returns None when *cols* is empty — the original code divided by
    ``len(df) * len(cols)`` and would raise ZeroDivisionError for an
    empty category.
    """
    if not cols:
        return None
    nulls = df.select(cols).null_count().sum_horizontal()[0]
    return nulls / (len(df) * len(cols)) * 100


# Count features by category (prefix conventions from the feature-engineering step).
cnec_t1_cols = [c for c in features.columns if c.startswith('cnec_t1_')]
cnec_t2_cols = [c for c in features.columns if c.startswith('cnec_t2_')]
lta_cols = [c for c in features.columns if c.startswith('lta_')]
temporal_cols = [c for c in features.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
target_cols = [c for c in features.columns if c.startswith('target_')]
print(f" Tier-1 CNEC features: {len(cnec_t1_cols)}")
print(f" Tier-2 CNEC features: {len(cnec_t2_cols)}")
print(f" LTA features: {len(lta_cols)}")
print(f" Temporal features: {len(temporal_cols)}")
print(f" Target variables: {len(target_cols)}")
print(f" Total features: {features.shape[1] - 1} (excluding mtu)")

# Null percentages per category, guarded against empty categories.
print("\n Null percentages:")
for _label, _cols in [("Tier-1 CNEC", cnec_t1_cols),
                      ("Tier-2 CNEC", cnec_t2_cols),
                      ("LTA", lta_cols),
                      ("Temporal", temporal_cols),
                      ("Targets", target_cols)]:
    _pct = _category_null_pct(features, _cols)
    if _pct is None:
        print(f" {_label}: n/a (no columns found)")
    else:
        print(f" {_label}: {_pct:.2f}% nulls")

# Overall null percentage across the whole feature frame.
total_nulls = features.null_count().sum_horizontal()[0]
total_cells = len(features) * len(features.columns)
overall_null_pct = total_nulls / total_cells * 100
print(f"\n Overall null percentage: {overall_null_pct:.2f}%")
if overall_null_pct < 60:
    print(" Completeness: [PASS] (<60% nulls)")
else:
    # Original message rendered as "(>61.2% nulls)" — the '>' made the actual
    # value read as a threshold. Report the value against the threshold instead.
    print(f" Completeness: [WARNING] ({overall_null_pct:.1f}% >= 60% nulls)")
# =========================================================================
# 4. Data Leakage Check
# =========================================================================
print("\n" + "-" * 80)
print("[3/4] DATA LEAKAGE CHECK")
print("-" * 80)

# LTA values are future covariates (known in advance) and should have NO nulls.
# NOTE(review): the previous filter counted every 'border_*' column, but the
# summary section ([4/4]) defines MaxBEX borders as the 'border_' columns
# WITHOUT 'lta' in the name — so that filter was checking MaxBEX, not LTA.
# Prefer the 'lta'-containing border columns; fall back to the old behaviour
# if none exist. TODO: confirm against the unified parquet schema.
_lta_cols = [c for c in unified.columns if 'border_' in c and 'lta' in c.lower()]
if _lta_cols:
    lta_null_count = unified.select(_lta_cols).null_count().sum_horizontal()[0]
else:
    # Fallback: original (likely mislabeled) selection, kept so the check
    # still runs on schemas without explicit LTA-tagged columns.
    lta_null_count = unified.select([c for c in unified.columns if c.startswith('border_')]).null_count().sum_horizontal()[0]
print(f" LTA nulls: {lta_null_count}")
if lta_null_count == 0:
    print(" LTA future covariates: [PASS] (no nulls)")
else:
    print(f" LTA future covariates: [WARNING] ({lta_null_count} nulls)")

# Historical features should have lags (shift creates nulls at start).
# Check that lag features are present at all; their leading nulls are expected.
has_lag_features = any('_L' in c for c in features.columns)
if has_lag_features:
    print(" Historical lag features: [PRESENT] (nulls expected at start)")
else:
    print(" Historical lag features: [WARNING] (no lag features found)")
# =========================================================================
# 5. Summary Statistics
# =========================================================================
print("\n" + "-" * 80)
print("[4/4] SUMMARY STATISTICS")
print("-" * 80)

print("\nUnified JAO Data:")
print(f" Rows: {len(unified):,}")
print(f" Columns: {len(unified.columns)}")
# MaxBEX borders: 'border_' columns without 'lta' in the name.
maxbex_borders = [c for c in unified.columns if 'border_' in c and 'lta' not in c.lower()]
# LTA borders: the complementary 'border_' columns WITH 'lta' in the name.
# The previous filter (startswith('border_')) re-counted the MaxBEX set under
# the "LTA borders" label, contradicting the MaxBEX definition one line above.
lta_borders = [c for c in unified.columns if 'border_' in c and 'lta' in c.lower()]
print(f" MaxBEX borders: {len(maxbex_borders)}")
print(f" LTA borders: {len(lta_borders)}")
print(f" Net Positions: {len([c for c in unified.columns if c.startswith('netpos_')])}")

print("\nCNEC Hourly Data:")
print(f" Total CNEC records: {len(cnec):,}")
print(f" Unique CNECs: {cnec['cnec_eic'].n_unique()}")
# Hoist the unique-timestamp count: it was computed twice before.
_unique_mtus = cnec['mtu'].n_unique()
print(f" Unique timestamps: {_unique_mtus:,}")
print(f" CNECs per timestamp: {len(cnec) / _unique_mtus:.1f} avg")

print("\nFeature Engineering:")
print(f" Total features: {features.shape[1] - 1}")
print(f" Feature rows: {len(features):,}")
print(f" File size: {features_path.stat().st_size / (1024**2):.2f} MB")
# =========================================================================
# Validation Summary
# =========================================================================
print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)

# Table of (check result, message when passed, message when not passed).
# Failed checks print either a [FAIL] or a [WARNING] line, matching the
# severity each check carries.
_checks = [
    (is_sorted and hourly_expected,
     " [PASS] Timeline validation PASSED",
     " [FAIL] Timeline validation FAILED"),
    (overall_null_pct < 60,
     " [PASS] Feature completeness PASSED",
     " [WARNING] Feature completeness WARNING (high nulls)"),
    (lta_null_count == 0 and has_lag_features,
     " [PASS] Data leakage check PASSED",
     " [WARNING] Data leakage check WARNING"),
    (len(unified) == len(features),
     " [PASS] Data consistency PASSED",
     " [FAIL] Data consistency FAILED (row mismatch)"),
]

total_checks = len(_checks)
checks_passed = 0
for _ok, _pass_msg, _fail_msg in _checks:
    print(_pass_msg if _ok else _fail_msg)
    checks_passed += int(_ok)

print(f"\nChecks passed: {checks_passed}/{total_checks}")
if checks_passed == total_checks:
    print("\n[SUCCESS] All validation checks PASSED")
elif checks_passed >= total_checks - 1:
    print("\n[WARNING] Minor issues detected")
else:
    print("\n[FAILURE] Critical issues detected")
print("=" * 80)
print()