Spaces:
Sleeping
Sleeping
"""Final validation of complete 24-month LTA + Net Positions datasets."""
from pathlib import Path

import polars as pl

# Opening banner: a blank line, then the title framed by two 80-column rules.
banner_rule = "=" * 80
print("\n" + banner_rule, "FINAL DATA COLLECTION VALIDATION", banner_rule, sep="\n")
# =========================================================================
# LTA Dataset
# =========================================================================
lta_path = Path('data/raw/phase1_24month/jao_lta.parquet')
lta = pl.read_parquet(lta_path)


def _october_rows(frame: pl.DataFrame, year: int) -> pl.DataFrame:
    """Return the rows of *frame* whose 'mtu' timestamp falls in October of *year*."""
    return frame.filter((pl.col('mtu').dt.year() == year) & (pl.col('mtu').dt.month() == 10))


def _days_present(frame: pl.DataFrame) -> list:
    """Return the sorted distinct day-of-month values found in *frame*'s 'mtu' column."""
    return sorted(frame['mtu'].dt.day().unique().to_list())


print("\n[1/2] LTA (Long Term Allocations)")
print("-" * 80)
print(f" Records: {len(lta):,}")
# The border count follows the layout named in the label: every column other
# than the timestamp and the two masking flags is counted as a border.
print(f" Columns: {len(lta.columns)} (1 timestamp + {len(lta.columns) - 3} borders + 2 masking flags)")
print(f" File size: {lta_path.stat().st_size / (1024**2):.2f} MB")
print(f" Date range: {lta['mtu'].min()} to {lta['mtu'].max()}")
print(f" Unique timestamps: {lta['mtu'].n_unique():,}")

# Check October 2023
oct_2023 = _october_rows(lta, 2023)
days_2023 = _days_present(oct_2023)
# The comparison builds a polars filter expression; it is kept explicit because
# the dtype of 'is_masked' is not visible from this script.
masked_2023 = oct_2023.filter(pl.col('is_masked') == True)
# Share of the WHOLE dataset (not of October) that is masked. Guarded so an
# empty parquet file is reported instead of raising ZeroDivisionError.
masked_pct = len(masked_2023) / len(lta) * 100 if len(lta) else 0.0
print("\n October 2023:")
print(f" Days present: {days_2023}")
print(f" Total records: {len(oct_2023)}")
print(f" Masked records: {len(masked_2023)} ({masked_pct:.3f}%)")

# Check October 2024
oct_2024 = _october_rows(lta, 2024)
days_2024 = _days_present(oct_2024)
print("\n October 2024:")
print(f" Days present: {days_2024}")
print(f" Total records: {len(oct_2024)}")
# =========================================================================
# Net Positions Dataset
# =========================================================================
np_path = Path('data/raw/phase1_24month/jao_net_positions.parquet')
np_df = pl.read_parquet(np_path)

print("\n[2/2] Net Positions (Domain Boundaries)")
print("-" * 80)
print(f" Records: {len(np_df):,}")
# Zone count is derived the same way the LTA border count is: every column
# other than the timestamp and collection_date is counted as a zone, so the
# breakdown can never contradict the printed total.
print(f" Columns: {len(np_df.columns)} (1 timestamp + {len(np_df.columns) - 2} zones + 1 collection_date)")
print(f" File size: {np_path.stat().st_size / (1024**2):.2f} MB")
print(f" Date range: {np_df['mtu'].min()} to {np_df['mtu'].max()}")

# Distinct calendar dates covered; computed once and reused for the coverage ratio.
np_unique_days = np_df['mtu'].dt.date().n_unique()
print(f" Unique dates: {np_unique_days}")

# Expected: Oct 1, 2023 to Sep 30, 2025 = 731 days
# (366 days for Oct 2023 - Sep 2024, which contains Feb 29, 2024, plus 365).
expected_days = 731
print(f" Expected days: {expected_days}")
print(f" Coverage: {np_unique_days / expected_days * 100:.1f}%")
# =========================================================================
# Summary
# =========================================================================
status_rule = "=" * 80
print("\n" + status_rule)
print("COLLECTION STATUS")
print(status_rule)

# LTA passes when both Octobers contain every day of the month, 1 through 31.
full_october = list(range(1, 32))
lta_complete = days_2023 == full_october and days_2024 == full_october

# Net Positions passes when the distinct-date count reaches the expected total.
np_day_count = np_df['mtu'].dt.date().n_unique()
np_complete = np_day_count >= expected_days - 1  # Allow 1 day variance

if not (lta_complete and np_complete):
    # Report each failing dataset on its own line.
    print("[WARNING] Data collection incomplete:")
    if not lta_complete:
        print(" - LTA October coverage issue")
    if not np_complete:
        print(f" - Net Positions has {np_day_count}/{expected_days} expected days")
else:
    print("[SUCCESS] Data collection complete!")
    print(f" ✓ LTA: {len(lta):,} records with {len(masked_2023)} masked (Oct 27-31, 2023)")
    print(f" ✓ Net Positions: {len(np_df):,} records covering {np_day_count} days")

print(status_rule)
print()