#!/usr/bin/env python3
"""
Combine all raw CSV files into a single master file
This creates one comprehensive CSV with all team activity data
"""
import csv
import os
from pathlib import Path
from datetime import datetime
from collections import defaultdict

# Configuration
RAW_DATA_DIR = Path("data/raw")
OUTPUT_DIR = Path("data/summaries")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def create_master_file():
    """Combine all raw CSV files into a single master file"""
    
    print("="*60)
    print("CREATING MASTER DATA FILE")
    print("="*60)
    
    # Find all CSV files in raw directory
    csv_files = list(RAW_DATA_DIR.glob("*.csv"))
    
    if not csv_files:
        print(f"❌ No CSV files found in {RAW_DATA_DIR}")
        return False
    
    print(f"Found {len(csv_files)} CSV files to process")
    
    # Dictionary to store all data AND track daily first/last times
    all_data = []
    daily_first_last = defaultdict(lambda: defaultdict(lambda: {'first': None, 'last': None}))
    users_seen = set()
    dates_seen = set()
    
    # First pass: read all data and track first/last active times per user per day
    for csv_file in sorted(csv_files):
        # Extract date from filename (format: name_YYYY-MM-DD.csv)
        filename = csv_file.stem
        if '_' in filename:
            date_str = filename.split('_')[-1]
            try:
                file_date = datetime.strptime(date_str, '%Y-%m-%d').date()
            except:
                print(f"⚠️ Skipping {csv_file.name} - cannot parse date")
                continue
        else:
            print(f"⚠️ Skipping {csv_file.name} - invalid filename format")
            continue
        
        # Read the CSV file
        try:
            with open(csv_file, 'r') as f:
                reader = csv.DictReader(f)
                row_count = 0
                
                for row in reader:
                    # Add the date to each row
                    row['date'] = date_str
                    row['day_of_week'] = file_date.strftime('%A')
                    
                    # Extract time from timestamp if present
                    time_str = None
                    if 'timestamp' in row and row['timestamp']:
                        try:
                            timestamp = row['timestamp']
                            # Handle different timestamp formats
                            if 'T' in timestamp:
                                # ISO format: 2024-01-20T09:30:45.123+00:00
                                time_part = timestamp.split('T')[1]
                            elif ' ' in timestamp:
                                # Space-separated: 2024-01-20 09:30:45
                                parts = timestamp.split(' ')
                                time_part = parts[1] if len(parts) > 1 else timestamp
                            else:
                                # Might just be time: 09:30:45
                                time_part = timestamp
                            
                            # Remove timezone and microseconds
                            time_clean = time_part.split('+')[0].split('-')[0].split('.')[0].split('Z')[0]
                            
                            # Extract HH:MM
                            if ':' in time_clean:
                                parts = time_clean.split(':')
                                if len(parts) >= 2:
                                    time_str = f"{parts[0].zfill(2)}:{parts[1].zfill(2)}"
                                    row['time'] = time_str
                                    
                                    # Track first/last active times for this user on this date
                                    if row.get('presence') == 'active':
                                        user_id = row.get('user_id', '')
                                        if user_id:
                                            if not daily_first_last[date_str][user_id]['first'] or time_str < daily_first_last[date_str][user_id]['first']:
                                                daily_first_last[date_str][user_id]['first'] = time_str
                                            if not daily_first_last[date_str][user_id]['last'] or time_str > daily_first_last[date_str][user_id]['last']:
                                                daily_first_last[date_str][user_id]['last'] = time_str
                        except Exception as e:
                            if row_count == 0:  # Only show error once per file
                                print(f"    ⚠️ Could not parse timestamp format in {csv_file.name}: {row.get('timestamp', '')[:30]}")
                    
                    if 'time' not in row:
                        row['time'] = ''
                    
                    # Track unique users and dates
                    if 'user_id' in row:
                        users_seen.add(row['user_id'])
                    if 'user_name' in row:
                        users_seen.add(row['user_name'])
                    dates_seen.add(date_str)
                    
                    all_data.append(row)
                    row_count += 1
                
                print(f"  ✅ {csv_file.name}: {row_count} rows")
                
        except Exception as e:
            print(f"  ❌ Error reading {csv_file.name}: {e}")
            continue
    
    if not all_data:
        print("❌ No data extracted from files")
        return False
    
    # Second pass: add first_active and last_active columns to each row
    print("\nAdding first_active and last_active columns...")
    for row in all_data:
        user_id = row.get('user_id', '')
        date = row.get('date', '')
        
        if user_id and date in daily_first_last and user_id in daily_first_last[date]:
            row['first_active'] = daily_first_last[date][user_id]['first'] or ''
            row['last_active'] = daily_first_last[date][user_id]['last'] or ''
        else:
            row['first_active'] = ''
            row['last_active'] = ''
    
    print(f"✅ Added first/last active times to all rows")
    
    print(f"\n📊 Total records: {len(all_data)}")
    print(f"👥 Unique users: {len(users_seen)}")
    print(f"📅 Unique dates: {len(dates_seen)}")
    
    # Sort data by date, user, and time
    all_data.sort(key=lambda x: (x.get('date', ''), x.get('user_id', ''), x.get('time', '')))
    
    # Write master file
    master_file = OUTPUT_DIR / "master_data.csv"
    
    # EXPLICITLY define the columns we want, including first_active and last_active
    essential_columns = ['date', 'day_of_week', 'time', 'timestamp', 'user_id', 'user_name', 
                        'presence', 'first_active', 'last_active']
    
    # Get all other columns from the data
    all_fields = set()
    for row in all_data:
        all_fields.update(row.keys())
    
    # Build final column list: essential columns first, then others
    fieldnames = essential_columns.copy()
    for field in sorted(all_fields):
        if field not in fieldnames:
            fieldnames.append(field)
    
    # Ensure every row has all fields (fill missing with empty string)
    for row in all_data:
        for field in fieldnames:
            if field not in row:
                row[field] = ''
    
    # Write the master CSV
    with open(master_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)
    
    print(f"\n✅ Master file created: {master_file}")
    print(f"   Size: {master_file.stat().st_size:,} bytes")
    print(f"   Total columns: {len(fieldnames)}")
    
    # VERIFY the columns were written correctly
    print(f"\n📝 Verifying columns in master file:")
    print(f"   Essential columns present:")
    for col in essential_columns:
        if col in fieldnames:
            print(f"     ✅ {col}")
        else:
            print(f"     ❌ MISSING: {col}")
    
    # Verify first/last active times were added
    rows_with_first_active = sum(1 for r in all_data if r.get('first_active'))
    rows_with_last_active = sum(1 for r in all_data if r.get('last_active'))
    unique_users_with_times = len(set(r['user_id'] for r in all_data if r.get('first_active')))
    
    print(f"\n📊 Data verification:")
    print(f"   Rows with first_active: {rows_with_first_active}/{len(all_data)}")
    print(f"   Rows with last_active: {rows_with_last_active}/{len(all_data)}")
    print(f"   Users with time data: {unique_users_with_times}/{len(users_seen)}")
    
    if rows_with_first_active == 0:
        print(f"\n⚠️ WARNING: No first/last active times were extracted!")
        print(f"   This usually means the timestamp format couldn't be parsed.")
        print(f"   Please check that your raw CSV files have a 'timestamp' column")
        print(f"   with format like: '2024-01-20T09:30:45' or '2024-01-20 09:30:45'")
        
        # Show a sample of what we found
        sample_row = all_data[0] if all_data else {}
        print(f"\n   Sample row from data:")
        for key in ['timestamp', 'date', 'user_id', 'presence']:
            if key in sample_row:
                print(f"     {key}: {sample_row[key]}")
    
    # Create daily summary for quick reference
    create_daily_summary(all_data)
    
    return True

def create_daily_summary(all_data):
    """Create a daily summary showing activity per user per day"""
    
    print("\n" + "="*60)
    print("CREATING DAILY SUMMARY")
    print("="*60)
    
    # Group data by date and user
    daily_stats = defaultdict(lambda: defaultdict(lambda: {
        'user_name': '',
        'total_minutes': 0,
        'active_minutes': 0,
        'first_seen': None,
        'last_seen': None,
        'presence_log': []
    }))
    
    for row in all_data:
        date = row.get('date', '')
        user_id = row.get('user_id', '')
        user_name = row.get('user_name', '')
        presence = row.get('presence', '')
        timestamp = row.get('timestamp', '')
        
        if not date or not user_id:
            continue
        
        # Update user stats
        stats = daily_stats[date][user_id]
        stats['user_name'] = user_name
        stats['total_minutes'] += 1
        
        if presence == 'active':
            stats['active_minutes'] += 1
        
        # Extract time from timestamp - handle various formats
        if timestamp:
            try:
                # Handle different timestamp formats
                if 'T' in timestamp:
                    # ISO format: 2024-01-20T09:30:45.123+00:00
                    time_part = timestamp.split('T')[1]
                elif ' ' in timestamp:
                    # Space-separated: 2024-01-20 09:30:45
                    time_part = timestamp.split(' ')[1] if len(timestamp.split(' ')) > 1 else timestamp
                else:
                    # Might just be time: 09:30:45
                    time_part = timestamp
                
                # Remove timezone and microseconds
                time_str = time_part.split('+')[0].split('-')[0].split('.')[0].split('Z')[0]
                
                # Extract HH:MM
                if ':' in time_str:
                    parts = time_str.split(':')
                    if len(parts) >= 2:
                        hour_min = f"{parts[0].zfill(2)}:{parts[1].zfill(2)}"
                        
                        if presence == 'active':
                            if not stats['first_seen']:
                                stats['first_seen'] = hour_min
                            stats['last_seen'] = hour_min
                
                # Track presence changes for pattern analysis
                if not stats['presence_log'] or stats['presence_log'][-1] != presence:
                    stats['presence_log'].append(presence)
                    
            except Exception as e:
                # Debug: show what timestamp format is causing issues (only first occurrence per user)
                if not stats.get('timestamp_error_shown'):
                    print(f"    ⚠️ Could not parse timestamp for {user_name}: {timestamp}")
                    stats['timestamp_error_shown'] = True
    
    # Create summary CSV
    summary_file = OUTPUT_DIR / "daily_summary.csv"
    summary_rows = []
    
    for date in sorted(daily_stats.keys()):
        for user_id, stats in daily_stats[date].items():
            summary_rows.append({
                'date': date,
                'user_id': user_id,
                'user_name': stats['user_name'],
                'total_minutes_tracked': stats['total_minutes'],
                'active_minutes': stats['active_minutes'],
                'active_hours': round(stats['active_minutes'] / 60, 2),
                'first_active': stats['first_seen'] or '',
                'last_active': stats['last_seen'] or '',
                'activity_rate': round((stats['active_minutes'] / stats['total_minutes'] * 100) if stats['total_minutes'] > 0 else 0, 1),
                'presence_changes': len(stats['presence_log'])
            })
    
    # Write summary CSV
    if summary_rows:
        fieldnames = ['date', 'user_id', 'user_name', 'total_minutes_tracked', 
                     'active_minutes', 'active_hours', 'first_active', 'last_active', 
                     'activity_rate', 'presence_changes']
        
        with open(summary_file, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(summary_rows)
        
        print(f"✅ Daily summary created: {summary_file}")
        print(f"   {len(summary_rows)} user-day records")
    
    # Print sample statistics
    if summary_rows:
        print("\n📊 Sample Statistics:")
        # Get unique dates and users
        unique_dates = set(row['date'] for row in summary_rows)
        unique_users = set(row['user_name'] for row in summary_rows)
        
        print(f"  • Date range: {min(unique_dates)} to {max(unique_dates)}")
        print(f"  • Total users: {len(unique_users)}")
        print(f"  • Total days: {len(unique_dates)}")
        
        # Calculate average hours per day
        total_active_hours = sum(row['active_hours'] for row in summary_rows)
        avg_hours_per_user_day = total_active_hours / len(summary_rows) if summary_rows else 0
        print(f"  • Average hours per user per day: {avg_hours_per_user_day:.2f}")

def main():
    """Main function"""
    success = create_master_file()
    
    if success:
        print("\n" + "="*60)
        print("✅ SUCCESS!")
        print("="*60)
        print("\nGenerated files:")
        print("  1. data/summaries/master_data.csv - All raw data in one file")
        print("  2. data/summaries/daily_summary.csv - Daily statistics per user")
        print("\nNext step: Open team_dashboard_simple.html in your browser")
    else:
        print("\n❌ Failed to create master file")
        return 1
    
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())