#!/usr/bin/env python3
"""
Script to clean Excel files for Google Sheets compatibility.
Fixes common issues that cause upload errors.
"""

import openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.datavalidation import DataValidation
import sys
import os

def clean_workbook(input_file, output_file):
    """Clean Excel workbook for Google Sheets compatibility."""
    
    print(f"Loading workbook: {input_file}")
    try:
        wb = openpyxl.load_workbook(input_file, data_only=False)
    except Exception as e:
        print(f"Error loading workbook: {e}")
        return False
    
    print(f"Found {len(wb.sheetnames)} sheets: {wb.sheetnames}")
    
    # Clean each sheet
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        print(f"\nCleaning sheet: {sheet_name}")
        
        # 1. Remove data validation (can cause issues)
        try:
            if hasattr(ws, 'data_validations') and ws.data_validations:
                count = len(list(ws.data_validations)) if hasattr(ws.data_validations, '__iter__') else 0
                if count > 0:
                    print(f"  - Removing {count} data validations")
                    # Clear data validations
                    ws.data_validations = openpyxl.worksheet.datavalidation.DataValidationList()
        except Exception as e:
            print(f"  - Warning: Could not remove data validations: {e}")
        
        # 2. Remove conditional formatting (can cause issues)
        if ws.conditional_formatting:
            print(f"  - Removing conditional formatting")
            ws.conditional_formatting = {}
        
        # 3. Clean up formulas - check for invalid formulas
        invalid_formulas = []
        for row in ws.iter_rows():
            for cell in row:
                if cell.data_type == 'f':  # Formula
                    try:
                        # Try to evaluate if it's valid
                        if cell.value and isinstance(cell.value, str):
                            # Check for common problematic patterns
                            if cell.value.startswith('='):
                                # Check for array formulas (not supported in Google Sheets)
                                if '{' in cell.value and '}' in cell.value:
                                    print(f"  - Removing array formula at {cell.coordinate}: {cell.value[:50]}")
                                    cell.value = cell.value  # Keep as value if possible
                                    cell.data_type = 'n' if isinstance(cell.value, (int, float)) else 's'
                                # Check for volatile functions that might cause issues
                                elif any(func in cell.value.upper() for func in ['INDIRECT', 'OFFSET', 'NOW()', 'TODAY()', 'RAND()', 'RANDBETWEEN()']):
                                    # These are usually OK, but log them
                                    pass
                    except Exception as e:
                        print(f"  - Warning: Formula issue at {cell.coordinate}: {e}")
                        invalid_formulas.append(cell.coordinate)
        
        # 4. Remove empty rows and columns at the end
        max_row = ws.max_row
        max_col = ws.max_column
        
        # Find actual last row with data
        actual_max_row = 0
        actual_max_col = 0
        for row in ws.iter_rows():
            for cell in row:
                if cell.value is not None:
                    actual_max_row = max(actual_max_row, cell.row)
                    actual_max_col = max(actual_max_col, cell.column)
        
        if actual_max_row < max_row:
            print(f"  - Removing empty rows: {actual_max_row + 1} to {max_row}")
            ws.delete_rows(actual_max_row + 1, max_row - actual_max_row)
        
        if actual_max_col < max_col:
            print(f"  - Removing empty columns: {get_column_letter(actual_max_col + 1)} to {get_column_letter(max_col)}")
            ws.delete_cols(actual_max_col + 1, max_col - actual_max_col)
        
        # 5. Remove comments/notes (can cause issues)
        if hasattr(ws, '_comments') and ws._comments:
            print(f"  - Removing comments")
            for cell_coord in list(ws._comments.keys()):
                ws._comments.pop(cell_coord, None)
        
        # 5b. Remove hyperlinks (can sometimes cause issues)
        if hasattr(ws, '_hyperlinks') and ws._hyperlinks:
            print(f"  - Found {len(ws._hyperlinks)} hyperlinks (keeping them)")
            # Hyperlinks are usually OK, but we'll keep them for now
        
        # 5c. Remove images/charts (can cause issues)
        if hasattr(ws, '_images') and ws._images:
            print(f"  - Found {len(ws._images)} images (keeping them)")
            # Images are usually OK, but may not transfer perfectly
        
        # 6. Clean up merged cells (keep them but ensure they're valid)
        if ws.merged_cells:
            print(f"  - Found {len(ws.merged_cells.ranges)} merged cell ranges")
            # Check for out-of-bounds merges and remove them
            merges_to_remove = []
            for merge_range in list(ws.merged_cells.ranges):
                if merge_range.min_row > ws.max_row or merge_range.min_col > ws.max_column:
                    print(f"  - Removing out-of-bounds merge: {merge_range}")
                    merges_to_remove.append(str(merge_range))
            # Remove invalid merges
            for merge_str in merges_to_remove:
                try:
                    ws.unmerge_cells(merge_str)
                except Exception as e:
                    print(f"  - Warning: Could not unmerge {merge_str}: {e}")
        
        # 7. Fix sheet name if it has invalid characters
        new_sheet_name = sheet_name
        invalid_chars = ['\\', '/', '?', '*', '[', ']', ':']
        for char in invalid_chars:
            if char in new_sheet_name:
                new_sheet_name = new_sheet_name.replace(char, '_')
                print(f"  - Replacing invalid character '{char}' in sheet name")
        
        if new_sheet_name != sheet_name:
            ws.title = new_sheet_name[:31]  # Google Sheets limit is 31 chars
            print(f"  - Renamed sheet to: {new_sheet_name[:31]}")
        elif len(sheet_name) > 31:
            ws.title = sheet_name[:31]
            print(f"  - Truncated sheet name to 31 characters")
    
    # 8. Remove problematic named ranges
    if wb.defined_names:
        print(f"\nFound {len(wb.defined_names)} named ranges")
        # Keep only simple named ranges, remove complex ones
        names_to_remove = []
        for name, definition in wb.defined_names.items():
            if definition and isinstance(definition, str):
                # Check for problematic patterns
                if '!' in definition or '#' in definition or '$' in definition:
                    # These might be OK, but log them
                    pass
        
        if names_to_remove:
            print(f"  - Removing {len(names_to_remove)} problematic named ranges")
            for name in names_to_remove:
                del wb.defined_names[name]
    
    # 9. Save the cleaned workbook
    print(f"\nSaving cleaned workbook to: {output_file}")
    try:
        wb.save(output_file)
        print("✓ Workbook cleaned and saved successfully!")
        return True
    except Exception as e:
        print(f"Error saving workbook: {e}")
        return False

if __name__ == "__main__":
    input_file = "Finance Board Report Dec 2025 p.xlsx"
    output_file = "Finance Board Report Dec 2025 p_cleaned.xlsx"
    
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    
    if not os.path.exists(input_file):
        print(f"Error: File not found: {input_file}")
        sys.exit(1)
    
    success = clean_workbook(input_file, output_file)
    sys.exit(0 if success else 1)

