Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Feature Availability Module | |
| Categorizes 2,514 features by their availability windows for forecasting. | |
| Purpose: Prevent data leakage by clearly defining what features are available | |
| at run time for different forecast horizons. | |
| Categories: | |
| 1. Full-horizon D+14 (always known): temporal, weather, CNEC outages, LTA | |
| 2. Partial D+1 only (masked D+2-D+14): load forecasts | |
| 3. Historical only (not available): prices, generation, demand, lags, etc. | |
| """ | |
| from typing import Dict, List, Tuple, Set | |
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime, timedelta | |
class FeatureAvailability:
    """
    Defines availability windows for all features in the dataset.

    Each entry of ``AVAILABILITY_WINDOWS`` describes one feature category:
    how many hours into the future its values are known (``horizon_hours``)
    and the name patterns / prefixes / suffixes that identify its columns.

    Availability Horizons:
    - D+14: Available for full 14-day forecast (temporal, weather, outages)
    - D+1: Available for day-ahead only (load forecasts)
    - D+0: Current value only, forward-filled across the horizon (LTA)
    - Historical: Not available for future (prices, generation, demand, lags)

    All methods are classmethods; the class holds no instance state.
    """

    # Length of the full forecast horizon in hours (14 days x 24 hours).
    FULL_HORIZON_HOURS = 336

    # Feature categories with their availability windows.
    # NOTE: insertion order matters - a column is assigned to the FIRST
    # category it matches (see _find_config).
    AVAILABILITY_WINDOWS = {
        # FULL HORIZON - D+14 (336 hours)
        'temporal': {
            'horizon_hours': float('inf'),  # Always computable
            'description': 'Time-based features (hour, day, month, weekday, etc.)',
            'patterns': ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend'],
            'suffixes': ['_sin', '_cos'],
            'expected_count': 12,
        },
        'weather': {
            'horizon_hours': 336,  # D+14 weather forecasts
            'description': 'Weather forecasts (temp, wind, solar, cloud, pressure)',
            'prefixes': ['temp_', 'wind_', 'wind10m_', 'wind100m_', 'winddir_', 'solar_', 'cloud_', 'pressure_'],
            'expected_count': 375,  # Approximate (52 grid points × ~7 variables)
        },
        'cnec_outages': {
            'horizon_hours': 336,  # D+14+ planned transmission outages
            'description': 'Planned CNEC transmission outages (published weeks ahead)',
            'prefixes': ['outage_cnec_'],
            'expected_count': 176,
        },
        'lta': {
            'horizon_hours': 0,  # D+0 only (current value)
            'description': 'Long-term allocations (forward-filled from D+0)',
            'prefixes': ['lta_'],
            'expected_count': 40,
            'forward_fill': True,  # Special handling: forward-fill current value
        },
        # PARTIAL HORIZON - D+1 only (24 hours)
        'load_forecast': {
            'horizon_hours': 24,  # D+1 only, masked D+2-D+14
            'description': 'Day-ahead load forecasts (published D-1)',
            'prefixes': ['load_forecast_'],
            'expected_count': 12,
            'requires_masking': True,  # Mask hours 25-336
        },
        # HISTORICAL ONLY - Not available for forecasting
        'prices': {
            'horizon_hours': -1,  # Historical only
            'description': 'Day-ahead electricity prices (determined D-1)',
            'prefixes': ['price_'],
            'expected_count': 24,
        },
        'generation': {
            'horizon_hours': -1,
            'description': 'Actual generation by fuel type',
            'prefixes': ['gen_'],
            'expected_count': 183,  # 12 zones × ~15 fuel types
        },
        'demand': {
            'horizon_hours': -1,
            'description': 'Actual electricity demand',
            'prefixes': ['demand_'],
            'expected_count': 24,  # 12 zones + aggregates
        },
        'border_lags': {
            'horizon_hours': -1,
            'description': 'Lagged cross-border flows',
            'patterns': ['_lag_', '_L', 'border_'],
            'expected_count': 264,  # 38 borders × 7 lags (1h, 3h, 6h, 12h, 24h, 168h, 720h)
        },
        'cnec_flows': {
            'horizon_hours': -1,
            'description': 'Historical CNEC flows and constraints',
            'prefixes': ['cnec_'],
            'patterns': ['_flow', '_binding', '_margin', '_ram'],
            'expected_count': 1000,  # Tier-1 CNECs with multiple metrics
        },
        'netpos': {
            'horizon_hours': -1,
            'description': 'Historical net positions',
            'prefixes': ['netpos_'],
            'expected_count': 48,  # 12 zones × 4 metrics
        },
        'system_agg': {
            'horizon_hours': -1,
            'description': 'System-level aggregates',
            'prefixes': ['total_', 'avg_', 'max', 'min', 'std_', 'mean_', 'sum_'],
            'expected_count': 353,  # Various aggregations
        },
        'pumped_storage': {
            'horizon_hours': -1,
            'description': 'Pumped hydro storage generation',
            'prefixes': ['pumped_'],
            'expected_count': 7,  # Countries with pumped storage
        },
        'hydro_storage': {
            'horizon_hours': -1,
            'description': 'Hydro reservoir levels (weekly data)',
            'prefixes': ['hydro_storage_'],
            'expected_count': 7,
        },
    }

    @classmethod
    def categorize_features(cls, columns: List[str]) -> Dict[str, List[str]]:
        """
        Categorize all features by their availability windows.

        The column named ``timestamp`` and columns starting with
        ``target_border_`` are treated as metadata and skipped entirely.
        Every other column is assigned to the first matching category in
        ``AVAILABILITY_WINDOWS`` and bucketed by that category's horizon.

        Args:
            columns: All column names from dataset

        Returns:
            Dictionary with categories (input order is preserved in each list):
            - full_horizon_d14: Available for full 14-day forecast
              (includes forward-filled D+0 categories such as LTA)
            - partial_d1: Available for only part of the horizon
              (requires masking)
            - historical: Not available for forecasting
            - uncategorized: Features that don't match any pattern
        """
        buckets: Dict[str, List[str]] = {
            'full_horizon_d14': [],
            'partial_d1': [],
            'historical': [],
            'uncategorized': [],
        }

        for col in columns:
            # Skip metadata columns
            if col == 'timestamp' or col.startswith('target_border_'):
                continue

            config = cls._find_config(col)
            if config is None:
                buckets['uncategorized'].append(col)
                continue

            buckets[cls._bucket_for_horizon(config['horizon_hours'])].append(col)

        return buckets

    @classmethod
    def _bucket_for_horizon(cls, horizon_hours: float) -> str:
        """Map a category's ``horizon_hours`` to an output bucket name."""
        if horizon_hours < 0:
            return 'historical'
        # horizon 0 = forward-filled current value (LTA): usable everywhere.
        # float('inf') also satisfies the >= comparison.
        if horizon_hours == 0 or horizon_hours >= cls.FULL_HORIZON_HOURS:
            return 'full_horizon_d14'
        # Any strictly partial horizon must be masked; previously only the
        # exact value 24 was handled and other values were silently dropped.
        return 'partial_d1'

    @classmethod
    def _find_config(cls, col: str):
        """Return the config of the first category matching ``col``, or None."""
        for config in cls.AVAILABILITY_WINDOWS.values():
            if cls._matches_category(col, config):
                return config
        return None

    @classmethod
    def _matches_category(cls, col: str, config: Dict) -> bool:
        """
        Check if column matches category patterns.

        A column matches when ANY of the following holds:
        - one of ``config['patterns']`` occurs anywhere in the name (substring)
        - the name starts with one of ``config['prefixes']``
        - the name ends with one of ``config['suffixes']``

        NOTE(review): patterns are substring matches, so short patterns such
        as 'day', 'hour' or '_L' match very broadly. Combined with
        first-match-wins ordering, a historical column whose name contains
        such a token could be classified as full-horizon. Verify against the
        real column list.
        """
        # Substring match (also covers the exact-name case).
        if any(pattern in col for pattern in config.get('patterns', ())):
            return True

        prefixes = tuple(config.get('prefixes', ()))
        if prefixes and col.startswith(prefixes):
            return True

        suffixes = tuple(config.get('suffixes', ()))
        return bool(suffixes) and col.endswith(suffixes)

    @classmethod
    def create_availability_mask(
        cls,
        feature_name: str,
        forecast_horizon_hours: int = 336
    ) -> np.ndarray:
        """
        Create binary availability mask for a feature across forecast horizon.

        Args:
            feature_name: Name of the feature
            forecast_horizon_hours: Length of forecast (default 336 = 14 days)

        Returns:
            float32 array of length ``forecast_horizon_hours``:
            1 = available, 0 = masked/unavailable

        Raises:
            ValueError: If ``forecast_horizon_hours`` is negative.
        """
        if forecast_horizon_hours < 0:
            raise ValueError(
                f"forecast_horizon_hours must be >= 0, got {forecast_horizon_hours}"
            )

        config = cls._find_config(feature_name)

        # Unknown feature: assume historical (conservative)
        if config is None:
            return np.zeros(forecast_horizon_hours, dtype=np.float32)

        horizon = config['horizon_hours']

        # Historical only
        if horizon < 0:
            return np.zeros(forecast_horizon_hours, dtype=np.float32)

        # Forward-fill (LTA: D+0), full horizon, or infinite (temporal)
        if horizon == 0 or horizon >= forecast_horizon_hours:
            return np.ones(forecast_horizon_hours, dtype=np.float32)

        # Partial horizon (e.g., D+1 = 24 hours): first `horizon` hours only
        mask = np.zeros(forecast_horizon_hours, dtype=np.float32)
        mask[:int(horizon)] = 1.0
        return mask

    @classmethod
    def validate_categorization(
        cls,
        categories: Dict[str, List[str]],
        verbose: bool = True
    ) -> Tuple[bool, List[str]]:
        """
        Validate feature categorization against expected counts.

        Checks the total feature count, the plausibility of the full-horizon
        count, the exact partial D+1 count, and that nothing is uncategorized.

        Args:
            categories: Output from categorize_features()
            verbose: Print validation details

        Returns:
            (is_valid, warnings) - ``is_valid`` is True only when no warning
            was produced.
        """
        issues: List[str] = []

        # Total feature count (excl. timestamp + 38 targets)
        total_features = sum(len(v) for v in categories.values())
        expected_total = 2514  # 2,553 columns - 1 timestamp - 38 targets
        if total_features != expected_total:
            issues.append(
                f"Feature count mismatch: {total_features} vs expected {expected_total}"
            )

        # Check full-horizon D+14 features
        # Expected: temporal (12) + weather (~375) + outages (176) + LTA (40) = ~603
        full_d14 = len(categories['full_horizon_d14'])
        full_d14_min, full_d14_max = 200, 700
        if full_d14 < full_d14_min or full_d14 > full_d14_max:
            # Report the bounds actually enforced so the message cannot drift.
            issues.append(
                f"Full-horizon D+14 count unusual: {full_d14} "
                f"(expected {full_d14_min}-{full_d14_max})"
            )

        # Check partial D+1 features
        partial_d1 = len(categories['partial_d1'])
        if partial_d1 != 12:
            issues.append(
                f"Partial D+1 count: {partial_d1} (expected 12 load forecasts)"
            )

        # Check uncategorized
        uncategorized = categories['uncategorized']
        if uncategorized:
            issues.append(
                f"Uncategorized features: {len(uncategorized)} "
                f"(first 5: {uncategorized[:5]})"
            )

        if verbose:
            print("="*60)
            print("FEATURE CATEGORIZATION VALIDATION")
            print("="*60)
            print(f"Full-horizon D+14: {len(categories['full_horizon_d14']):4d} features")
            print(f"Partial D+1: {len(categories['partial_d1']):4d} features")
            print(f"Historical only: {len(categories['historical']):4d} features")
            print(f"Uncategorized: {len(categories['uncategorized']):4d} features")
            print(f"Total: {total_features:4d} features")
            if issues:
                print("\n[!] WARNINGS:")
                for w in issues:
                    print(f" - {w}")
            else:
                print("\n[OK] Validation passed!")
            print("="*60)

        return len(issues) == 0, issues

    @classmethod
    def get_category_summary(cls, categories: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Generate summary table of feature categorization.

        Args:
            categories: Output from categorize_features()

        Returns:
            DataFrame with one row per bucket and the columns Category,
            Count, Availability, Masking and Sample Features (first three
            names). The Uncategorized row is only present when that bucket
            is non-empty.
        """
        # (bucket key, label, availability text, masking text)
        layout = [
            ('full_horizon_d14', 'Full-horizon D+14',
             'D+1 to D+14 (336 hours)', 'None'),
            ('partial_d1', 'Partial D+1',
             'D+1 only (24 hours)', 'Mask D+2 to D+14'),
            ('historical', 'Historical only',
             'Not available for forecasting', 'All zeros'),
        ]
        if categories['uncategorized']:
            layout.append(
                ('uncategorized', 'Uncategorized',
                 'Unknown (conservative: historical)', 'All zeros (conservative)')
            )

        summary = [
            {
                'Category': label,
                'Count': len(categories[key]),
                'Availability': availability,
                'Masking': masking,
                'Sample Features': ', '.join(categories[key][:3]),
            }
            for key, label, availability, masking in layout
        ]
        return pd.DataFrame(summary)