#!/usr/bin/env python3 """ Combine all raw CSV files into a single master file This creates one comprehensive CSV with all team activity data """ import csv import os from pathlib import Path from datetime import datetime from collections import defaultdict # Configuration RAW_DATA_DIR = Path("data/raw") OUTPUT_DIR = Path("data/summaries") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) def create_master_file(): """Combine all raw CSV files into a single master file""" print("="*60) print("CREATING MASTER DATA FILE") print("="*60) # Find all CSV files in raw directory csv_files = list(RAW_DATA_DIR.glob("*.csv")) if not csv_files: print(f"❌ No CSV files found in {RAW_DATA_DIR}") return False print(f"Found {len(csv_files)} CSV files to process") # Dictionary to store all data AND track daily first/last times all_data = [] daily_first_last = defaultdict(lambda: defaultdict(lambda: {'first': None, 'last': None})) users_seen = set() dates_seen = set() # First pass: read all data and track first/last active times per user per day for csv_file in sorted(csv_files): # Extract date from filename (format: name_YYYY-MM-DD.csv) filename = csv_file.stem if '_' in filename: date_str = filename.split('_')[-1] try: file_date = datetime.strptime(date_str, '%Y-%m-%d').date() except: print(f"⚠️ Skipping {csv_file.name} - cannot parse date") continue else: print(f"⚠️ Skipping {csv_file.name} - invalid filename format") continue # Read the CSV file try: with open(csv_file, 'r') as f: reader = csv.DictReader(f) row_count = 0 for row in reader: # Add the date to each row row['date'] = date_str row['day_of_week'] = file_date.strftime('%A') # Extract time from timestamp if present time_str = None if 'timestamp' in row and row['timestamp']: try: timestamp = row['timestamp'] # Handle different timestamp formats if 'T' in timestamp: # ISO format: 2024-01-20T09:30:45.123+00:00 time_part = timestamp.split('T')[1] elif ' ' in timestamp: # Space-separated: 2024-01-20 09:30:45 parts = timestamp.split(' ') time_part = parts[1] if len(parts) > 1 else timestamp else: # Might just be time: 09:30:45 time_part = timestamp # Remove timezone and microseconds time_clean = time_part.split('+')[0].split('-')[0].split('.')[0].split('Z')[0] # Extract HH:MM if ':' in time_clean: parts = time_clean.split(':') if len(parts) >= 2: time_str = f"{parts[0].zfill(2)}:{parts[1].zfill(2)}" row['time'] = time_str # Track first/last active times for this user on this date if row.get('presence') == 'active': user_id = row.get('user_id', '') if user_id: if not daily_first_last[date_str][user_id]['first'] or time_str < daily_first_last[date_str][user_id]['first']: daily_first_last[date_str][user_id]['first'] = time_str if not daily_first_last[date_str][user_id]['last'] or time_str > daily_first_last[date_str][user_id]['last']: daily_first_last[date_str][user_id]['last'] = time_str except Exception as e: if row_count == 0: # Only show error once per file print(f" ⚠️ Could not parse timestamp format in {csv_file.name}: {row.get('timestamp', '')[:30]}") if 'time' not in row: row['time'] = '' # Track unique users and dates if 'user_id' in row: users_seen.add(row['user_id']) if 'user_name' in row: users_seen.add(row['user_name']) dates_seen.add(date_str) all_data.append(row) row_count += 1 print(f" ✅ {csv_file.name}: {row_count} rows") except Exception as e: print(f" ❌ Error reading {csv_file.name}: {e}") continue if not all_data: print("❌ No data extracted from files") return False # Second pass: add first_active and last_active columns to each row print("\nAdding first_active and last_active columns...") for row in all_data: user_id = row.get('user_id', '') date = row.get('date', '') if user_id and date in daily_first_last and user_id in daily_first_last[date]: row['first_active'] = daily_first_last[date][user_id]['first'] or '' row['last_active'] = daily_first_last[date][user_id]['last'] or '' else: row['first_active'] = '' row['last_active'] = '' print(f"✅ Added first/last active times to all rows") print(f"\n📊 Total records: {len(all_data)}") print(f"👥 Unique users: {len(users_seen)}") print(f"📅 Unique dates: {len(dates_seen)}") # Sort data by date, user, and time all_data.sort(key=lambda x: (x.get('date', ''), x.get('user_id', ''), x.get('time', ''))) # Write master file master_file = OUTPUT_DIR / "master_data.csv" # EXPLICITLY define the columns we want, including first_active and last_active essential_columns = ['date', 'day_of_week', 'time', 'timestamp', 'user_id', 'user_name', 'presence', 'first_active', 'last_active'] # Get all other columns from the data all_fields = set() for row in all_data: all_fields.update(row.keys()) # Build final column list: essential columns first, then others fieldnames = essential_columns.copy() for field in sorted(all_fields): if field not in fieldnames: fieldnames.append(field) # Ensure every row has all fields (fill missing with empty string) for row in all_data: for field in fieldnames: if field not in row: row[field] = '' # Write the master CSV with open(master_file, 'w', newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(all_data) print(f"\n✅ Master file created: {master_file}") print(f" Size: {master_file.stat().st_size:,} bytes") print(f" Total columns: {len(fieldnames)}") # VERIFY the columns were written correctly print(f"\n📝 Verifying columns in master file:") print(f" Essential columns present:") for col in essential_columns: if col in fieldnames: print(f" ✅ {col}") else: print(f" ❌ MISSING: {col}") # Verify first/last active times were added rows_with_first_active = sum(1 for r in all_data if r.get('first_active')) rows_with_last_active = sum(1 for r in all_data if r.get('last_active')) unique_users_with_times = len(set(r['user_id'] for r in all_data if r.get('first_active'))) print(f"\n📊 Data verification:") print(f" Rows with first_active: {rows_with_first_active}/{len(all_data)}") print(f" Rows with last_active: {rows_with_last_active}/{len(all_data)}") print(f" Users with time data: {unique_users_with_times}/{len(users_seen)}") if rows_with_first_active == 0: print(f"\n⚠️ WARNING: No first/last active times were extracted!") print(f" This usually means the timestamp format couldn't be parsed.") print(f" Please check that your raw CSV files have a 'timestamp' column") print(f" with format like: '2024-01-20T09:30:45' or '2024-01-20 09:30:45'") # Show a sample of what we found sample_row = all_data[0] if all_data else {} print(f"\n Sample row from data:") for key in ['timestamp', 'date', 'user_id', 'presence']: if key in sample_row: print(f" {key}: {sample_row[key]}") # Create daily summary for quick reference create_daily_summary(all_data) return True def create_daily_summary(all_data): """Create a daily summary showing activity per user per day""" print("\n" + "="*60) print("CREATING DAILY SUMMARY") print("="*60) # Group data by date and user daily_stats = defaultdict(lambda: defaultdict(lambda: { 'user_name': '', 'total_minutes': 0, 'active_minutes': 0, 'first_seen': None, 'last_seen': None, 'presence_log': [] })) for row in all_data: date = row.get('date', '') user_id = row.get('user_id', '') user_name = row.get('user_name', '') presence = row.get('presence', '') timestamp = row.get('timestamp', '') if not date or not user_id: continue # Update user stats stats = daily_stats[date][user_id] stats['user_name'] = user_name stats['total_minutes'] += 1 if presence == 'active': stats['active_minutes'] += 1 # Extract time from timestamp - handle various formats if timestamp: try: # Handle different timestamp formats if 'T' in timestamp: # ISO format: 2024-01-20T09:30:45.123+00:00 time_part = timestamp.split('T')[1] elif ' ' in timestamp: # Space-separated: 2024-01-20 09:30:45 time_part = timestamp.split(' ')[1] if len(timestamp.split(' ')) > 1 else timestamp else: # Might just be time: 09:30:45 time_part = timestamp # Remove timezone and microseconds time_str = time_part.split('+')[0].split('-')[0].split('.')[0].split('Z')[0] # Extract HH:MM if ':' in time_str: parts = time_str.split(':') if len(parts) >= 2: hour_min = f"{parts[0].zfill(2)}:{parts[1].zfill(2)}" if presence == 'active': if not stats['first_seen']: stats['first_seen'] = hour_min stats['last_seen'] = hour_min # Track presence changes for pattern analysis if not stats['presence_log'] or stats['presence_log'][-1] != presence: stats['presence_log'].append(presence) except Exception as e: # Debug: show what timestamp format is causing issues (only first occurrence per user) if not stats.get('timestamp_error_shown'): print(f" ⚠️ Could not parse timestamp for {user_name}: {timestamp}") stats['timestamp_error_shown'] = True # Create summary CSV summary_file = OUTPUT_DIR / "daily_summary.csv" summary_rows = [] for date in sorted(daily_stats.keys()): for user_id, stats in daily_stats[date].items(): summary_rows.append({ 'date': date, 'user_id': user_id, 'user_name': stats['user_name'], 'total_minutes_tracked': stats['total_minutes'], 'active_minutes': stats['active_minutes'], 'active_hours': round(stats['active_minutes'] / 60, 2), 'first_active': stats['first_seen'] or '', 'last_active': stats['last_seen'] or '', 'activity_rate': round((stats['active_minutes'] / stats['total_minutes'] * 100) if stats['total_minutes'] > 0 else 0, 1), 'presence_changes': len(stats['presence_log']) }) # Write summary CSV if summary_rows: fieldnames = ['date', 'user_id', 'user_name', 'total_minutes_tracked', 'active_minutes', 'active_hours', 'first_active', 'last_active', 'activity_rate', 'presence_changes'] with open(summary_file, 'w', newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(summary_rows) print(f"✅ Daily summary created: {summary_file}") print(f" {len(summary_rows)} user-day records") # Print sample statistics if summary_rows: print("\n📊 Sample Statistics:") # Get unique dates and users unique_dates = set(row['date'] for row in summary_rows) unique_users = set(row['user_name'] for row in summary_rows) print(f" • Date range: {min(unique_dates)} to {max(unique_dates)}") print(f" • Total users: {len(unique_users)}") print(f" • Total days: {len(unique_dates)}") # Calculate average hours per day total_active_hours = sum(row['active_hours'] for row in summary_rows) avg_hours_per_user_day = total_active_hours / len(summary_rows) if summary_rows else 0 print(f" • Average hours per user per day: {avg_hours_per_user_day:.2f}") def main(): """Main function""" success = create_master_file() if success: print("\n" + "="*60) print("✅ SUCCESS!") print("="*60) print("\nGenerated files:") print(" 1. data/summaries/master_data.csv - All raw data in one file") print(" 2. data/summaries/daily_summary.csv - Daily statistics per user") print("\nNext step: Open team_dashboard_simple.html in your browser") else: print("\n❌ Failed to create master file") return 1 return 0 if __name__ == "__main__": import sys sys.exit(main())