Spaces:

evgueni-p
/

fbmc-chronos2

Sleeping

fbmc-chronos2 / src /forecasting /feature_availability.py

Evgueni Poloukarov

feat: add dynamic forecast system to prevent data leakage

f4be780 about 1 month ago

13.6 kB

	#!/usr/bin/env python3
	"""
	Feature Availability Module
	Categorizes 2,514 features by their availability windows for forecasting.

	Purpose: Prevent data leakage by clearly defining what features are available
	at run time for different forecast horizons.

	Categories:
	1. Full-horizon D+14 (always known): temporal, weather, CNEC outages, LTA
	2. Partial D+1 only (masked D+2-D+14): load forecasts
	3. Historical only (not available): prices, generation, demand, lags, etc.
	"""

	from typing import Dict, List, Tuple, Set
	import pandas as pd
	import numpy as np
	from datetime import datetime, timedelta


	class FeatureAvailability:
	    """
	    Defines availability windows for all features in the dataset.

	    Every entry of ``AVAILABILITY_WINDOWS`` declares through ``horizon_hours``
	    how far into the forecast horizon a family of features can be used:

	    - ``float('inf')`` or >= 336: full 14-day horizon (temporal, weather,
	      CNEC outages)
	    - 0 < hours < 336: partial horizon that must be masked afterwards
	      (load forecasts, 24 hours)
	    - 0: only the current value is known; it is forward-filled and therefore
	      treated as full horizon (LTA)
	    - negative: historical only, not available for future timesteps
	      (prices, generation, demand, lags, ...)

	    Categories are tested in dictionary order and the first match wins.
	    """

	    # Length of the full forecast horizon in hours (14 days).
	    _FULL_HORIZON_HOURS = 336

	    # Feature categories with their availability windows.
	    # NOTE(review): 'patterns' are matched as substrings anywhere in the
	    # column name and the first matching category wins. Short tokens such as
	    # 'day', 'hour' or '_L' could therefore claim columns that belong to a
	    # later (historical) category - verify against the real column list.
	    AVAILABILITY_WINDOWS = {
	        # FULL HORIZON - D+14 (336 hours)
	        'temporal': {
	            'horizon_hours': float('inf'),  # Always computable
	            'description': 'Time-based features (hour, day, month, weekday, etc.)',
	            'patterns': ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend'],
	            'suffixes': ['_sin', '_cos'],
	            'expected_count': 12,
	        },
	        'weather': {
	            'horizon_hours': 336,  # D+14 weather forecasts
	            'description': 'Weather forecasts (temp, wind, solar, cloud, pressure)',
	            'prefixes': ['temp_', 'wind_', 'wind10m_', 'wind100m_', 'winddir_', 'solar_', 'cloud_', 'pressure_'],
	            'expected_count': 375,  # Approximate (52 grid points × ~7 variables)
	        },
	        'cnec_outages': {
	            'horizon_hours': 336,  # D+14+ planned transmission outages
	            'description': 'Planned CNEC transmission outages (published weeks ahead)',
	            'prefixes': ['outage_cnec_'],
	            'expected_count': 176,
	        },
	        'lta': {
	            'horizon_hours': 0,  # D+0 only (current value)
	            'description': 'Long-term allocations (forward-filled from D+0)',
	            'prefixes': ['lta_'],
	            'expected_count': 40,
	            'forward_fill': True,  # Special handling: forward-fill current value
	        },

	        # PARTIAL HORIZON - D+1 only (24 hours)
	        'load_forecast': {
	            'horizon_hours': 24,  # D+1 only, masked D+2-D+14
	            'description': 'Day-ahead load forecasts (published D-1)',
	            'prefixes': ['load_forecast_'],
	            'expected_count': 12,
	            'requires_masking': True,  # Mask hours 25-336
	        },

	        # HISTORICAL ONLY - Not available for forecasting
	        'prices': {
	            'horizon_hours': -1,  # Historical only
	            'description': 'Day-ahead electricity prices (determined D-1)',
	            'prefixes': ['price_'],
	            'expected_count': 24,
	        },
	        'generation': {
	            'horizon_hours': -1,
	            'description': 'Actual generation by fuel type',
	            'prefixes': ['gen_'],
	            'expected_count': 183,  # 12 zones × ~15 fuel types
	        },
	        'demand': {
	            'horizon_hours': -1,
	            'description': 'Actual electricity demand',
	            'prefixes': ['demand_'],
	            'expected_count': 24,  # 12 zones + aggregates
	        },
	        'border_lags': {
	            'horizon_hours': -1,
	            'description': 'Lagged cross-border flows',
	            'patterns': ['_lag_', '_L', 'border_'],
	            'expected_count': 264,  # 38 borders × 7 lags (1h, 3h, 6h, 12h, 24h, 168h, 720h)
	        },
	        'cnec_flows': {
	            'horizon_hours': -1,
	            'description': 'Historical CNEC flows and constraints',
	            'prefixes': ['cnec_'],
	            'patterns': ['_flow', '_binding', '_margin', '_ram'],
	            'expected_count': 1000,  # Tier-1 CNECs with multiple metrics
	        },
	        'netpos': {
	            'horizon_hours': -1,
	            'description': 'Historical net positions',
	            'prefixes': ['netpos_'],
	            'expected_count': 48,  # 12 zones × 4 metrics
	        },
	        'system_agg': {
	            'horizon_hours': -1,
	            'description': 'System-level aggregates',
	            # NOTE(review): 'max' and 'min' carry no trailing underscore,
	            # unlike their siblings - confirm this is intentional.
	            'prefixes': ['total_', 'avg_', 'max', 'min', 'std_', 'mean_', 'sum_'],
	            'expected_count': 353,  # Various aggregations
	        },
	        'pumped_storage': {
	            'horizon_hours': -1,
	            'description': 'Pumped hydro storage generation',
	            'prefixes': ['pumped_'],
	            'expected_count': 7,  # Countries with pumped storage
	        },
	        'hydro_storage': {
	            'horizon_hours': -1,
	            'description': 'Hydro reservoir levels (weekly data)',
	            'prefixes': ['hydro_storage_'],
	            'expected_count': 7,
	        },
	    }

	    @classmethod
	    def _bucket_for_horizon(cls, horizon: float) -> str:
	        """Map a category's ``horizon_hours`` to its result bucket name."""
	        if horizon >= cls._FULL_HORIZON_HOURS:
	            # Covers float('inf') as well.
	            return 'full_horizon_d14'
	        if horizon > 0:
	            # Any partial horizon needs masking, not only exactly 24 hours.
	            # This mirrors the `horizon > 0` branch of
	            # create_availability_mask() and prevents such columns from
	            # being silently dropped from every bucket.
	            return 'partial_d1'
	        if horizon == 0:
	            # LTA: forward-filled, treat as full horizon.
	            return 'full_horizon_d14'
	        return 'historical'

	    @classmethod
	    def categorize_features(cls, columns: List[str]) -> Dict[str, List[str]]:
	        """
	        Categorize all features by their availability windows.

	        The column ``'timestamp'`` and every column starting with
	        ``'target_border_'`` are skipped. Each remaining column is assigned
	        to exactly one bucket, decided by the first category in
	        ``AVAILABILITY_WINDOWS`` that matches it.

	        Args:
	            columns: All column names from dataset

	        Returns:
	            Dictionary with categories:
	            - full_horizon_d14: Available for full 14-day forecast
	              (including forward-filled features with horizon 0)
	            - partial_d1: Available for a partial horizon only
	              (requires masking)
	            - historical: Not available for forecasting
	            - uncategorized: Features that don't match any pattern
	        """
	        buckets: Dict[str, List[str]] = {
	            'full_horizon_d14': [],
	            'partial_d1': [],
	            'historical': [],
	            'uncategorized': [],
	        }

	        for col in columns:
	            # Skip metadata columns
	            if col == 'timestamp' or col.startswith('target_border_'):
	                continue

	            bucket = 'uncategorized'
	            for config in cls.AVAILABILITY_WINDOWS.values():
	                if cls._matches_category(col, config):
	                    bucket = cls._bucket_for_horizon(config['horizon_hours'])
	                    break  # first matching category wins

	            buckets[bucket].append(col)

	        return buckets

	    @classmethod
	    def _matches_category(cls, col: str, config: Dict) -> bool:
	        """
	        Check if column matches category patterns.

	        A column matches when any ``'patterns'`` entry occurs anywhere in the
	        name, or the name starts with any ``'prefixes'`` entry, or ends with
	        any ``'suffixes'`` entry. Keys missing from ``config`` are ignored.
	        """
	        # Substring test; an exact match is a special case of it.
	        if any(pattern in col for pattern in config.get('patterns', ())):
	            return True

	        # str.startswith / str.endswith accept a tuple of alternatives and
	        # return False for an empty tuple.
	        if col.startswith(tuple(config.get('prefixes', ()))):
	            return True

	        return col.endswith(tuple(config.get('suffixes', ())))

	    @classmethod
	    def create_availability_mask(
	        cls,
	        feature_name: str,
	        forecast_horizon_hours: int = 336
	    ) -> np.ndarray:
	        """
	        Create binary availability mask for a feature across forecast horizon.

	        The first category in ``AVAILABILITY_WINDOWS`` that matches
	        ``feature_name`` decides the mask.

	        Args:
	            feature_name: Name of the feature
	            forecast_horizon_hours: Length of forecast (default 336 = 14 days)

	        Returns:
	            float32 array of length ``forecast_horizon_hours``:
	            1 = available, 0 = masked/unavailable
	        """
	        for config in cls.AVAILABILITY_WINDOWS.values():
	            if not cls._matches_category(feature_name, config):
	                continue

	            horizon = config['horizon_hours']

	            # Full horizon (including infinite) or forward-filled (D+0, LTA)
	            if horizon >= forecast_horizon_hours or horizon == 0:
	                return np.ones(forecast_horizon_hours, dtype=np.float32)

	            # Partial horizon (e.g., D+1 = 24 hours)
	            if horizon > 0:
	                mask = np.zeros(forecast_horizon_hours, dtype=np.float32)
	                mask[:int(horizon)] = 1.0
	                return mask

	            # Historical only
	            return np.zeros(forecast_horizon_hours, dtype=np.float32)

	        # Unknown feature: assume historical (conservative)
	        return np.zeros(forecast_horizon_hours, dtype=np.float32)

	    @classmethod
	    def validate_categorization(
	        cls,
	        categories: Dict[str, List[str]],
	        verbose: bool = True
	    ) -> Tuple[bool, List[str]]:
	        """
	        Validate feature categorization against expected counts.

	        Checks the total feature count, the size of the full-horizon and
	        partial buckets, and whether any feature is left uncategorized.

	        Args:
	            categories: Output from categorize_features()
	            verbose: Print validation details

	        Returns:
	            (is_valid, warnings) - ``is_valid`` is True only when no warning
	            was produced
	        """
	        expected_total = 2514  # 2,553 columns - 1 timestamp - 38 targets
	        # Expected: temporal (12) + weather (~375) + outages (176) + LTA (40) = ~603
	        full_d14_min, full_d14_max = 200, 700
	        expected_partial = 12

	        issues: List[str] = []
	        counts = {name: len(cols) for name, cols in categories.items()}

	        # Total feature count (excl. timestamp + 38 targets)
	        total_features = sum(counts.values())
	        if total_features != expected_total:
	            issues.append(
	                f"Feature count mismatch: {total_features} vs expected {expected_total}"
	            )

	        # Check full-horizon D+14 features; the message reports the bounds
	        # that are actually enforced.
	        full_d14 = counts['full_horizon_d14']
	        if full_d14 < full_d14_min or full_d14 > full_d14_max:
	            issues.append(
	                f"Full-horizon D+14 count unusual: {full_d14} "
	                f"(expected {full_d14_min}-{full_d14_max})"
	            )

	        # Check partial D+1 features
	        partial_d1 = counts['partial_d1']
	        if partial_d1 != expected_partial:
	            issues.append(
	                f"Partial D+1 count: {partial_d1} (expected 12 load forecasts)"
	            )

	        # Check uncategorized
	        leftovers = categories['uncategorized']
	        if leftovers:
	            issues.append(
	                f"Uncategorized features: {len(leftovers)} "
	                f"(first 5: {leftovers[:5]})"
	            )

	        if verbose:
	            print("="*60)
	            print("FEATURE CATEGORIZATION VALIDATION")
	            print("="*60)
	            print(f"Full-horizon D+14: {counts['full_horizon_d14']:4d} features")
	            print(f"Partial D+1: {counts['partial_d1']:4d} features")
	            print(f"Historical only: {counts['historical']:4d} features")
	            print(f"Uncategorized: {counts['uncategorized']:4d} features")
	            print(f"Total: {total_features:4d} features")

	            if issues:
	                print("\n[!] WARNINGS:")
	                for w in issues:
	                    print(f" - {w}")
	            else:
	                print("\n[OK] Validation passed!")
	            print("="*60)

	        return len(issues) == 0, issues

	    @classmethod
	    def get_category_summary(cls, categories: Dict[str, List[str]]) -> pd.DataFrame:
	        """
	        Generate summary table of feature categorization.

	        Args:
	            categories: Output from categorize_features()

	        Returns:
	            DataFrame with one row per bucket and the columns Category,
	            Count, Availability, Masking and Sample Features (the first
	            three feature names). The "Uncategorized" row is only present
	            when that bucket is non-empty.
	        """
	        # (label, bucket key, availability text, masking text)
	        layout = [
	            ('Full-horizon D+14', 'full_horizon_d14',
	             'D+1 to D+14 (336 hours)', 'None'),
	            ('Partial D+1', 'partial_d1',
	             'D+1 only (24 hours)', 'Mask D+2 to D+14'),
	            ('Historical only', 'historical',
	             'Not available for forecasting', 'All zeros'),
	        ]
	        if categories['uncategorized']:
	            layout.append(
	                ('Uncategorized', 'uncategorized',
	                 'Unknown (conservative: historical)', 'All zeros (conservative)')
	            )

	        summary = [
	            {
	                'Category': label,
	                'Count': len(categories[key]),
	                'Availability': availability,
	                'Masking': masking,
	                'Sample Features': ', '.join(categories[key][:3]),
	            }
	            for label, key, availability, masking in layout
	        ]

	        return pd.DataFrame(summary)