#!/usr/bin/env python3
"""
PDF Translation Script
Translates a Spanish PDF document to English and creates a new PDF file.
"""

import os
import sys
import glob
from pathlib import Path

# Third-party dependencies. Each import is attempted first; if it fails, the
# script shells out to pip3 to install the package and then retries the import.
# NOTE(review): the exit status of os.system() is not checked, so a failed
# install only surfaces as an ImportError on the retry. `pip3` may also belong
# to a different interpreter than the one running this script - confirm.

# PyPDF2: used to read the source PDF and extract its text.
try:
    import PyPDF2
except ImportError:
    print("Installing PyPDF2...")
    os.system("pip3 install PyPDF2")
    import PyPDF2

# fpdf (installed from the "fpdf2" distribution): used to write the output PDF.
try:
    from fpdf import FPDF
except ImportError:
    print("Installing fpdf2...")
    os.system("pip3 install fpdf2")
    from fpdf import FPDF

# deep-translator: provides the GoogleTranslator client used for translation.
try:
    from deep_translator import GoogleTranslator
except ImportError:
    print("Installing deep-translator...")
    os.system("pip3 install deep-translator")
    from deep_translator import GoogleTranslator

import re


def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the text of every page that contains any.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``{'page_num': int, 'text': str}`` dicts (page numbers are
        1-based; pages whose text is blank are omitted), or ``None`` when the
        file cannot be opened or parsed.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            total = len(reader.pages)
            print(f"Found {total} pages in {pdf_path}")

            collected = []
            for number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                # Pages without any visible text are skipped entirely.
                if not text.strip():
                    continue
                collected.append({'page_num': number, 'text': text})
                print(f"Extracted {len(text)} characters from page {number}")

            return collected

    except Exception as e:
        # Any failure (missing file, corrupt PDF, ...) is reported and
        # signalled to the caller with None.
        print(f"Error reading PDF: {e}")
        return None


def detect_spanish(text):
    """Heuristically decide whether *text* is written in Spanish.

    The text is considered Spanish when it contains a Spanish-specific
    character (accented vowel, ``ñ``, ``ü``, ``¿`` or ``¡``, in either case),
    or when more than ten distinct common Spanish words occur in it as whole
    words.

    Indicator words are matched against whole words, not substrings, so that
    short indicators such as ``a``, ``y`` or ``el`` do not match inside
    unrelated English words (e.g. ``el`` in "believes").

    Args:
        text: The text to inspect.

    Returns:
        ``True`` if the text looks Spanish, ``False`` otherwise (including
        for empty input).
    """
    if not text:
        return False

    spanish_indicators = {
        'el', 'la', 'los', 'las', 'de', 'que', 'y', 'a', 'en', 'un', 'una',
        'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con',
        'para', 'del', 'más', 'sus', 'ser', 'tiene', 'haber',
        'este', 'desde', 'está', 'mismo', 'bajo', 'según', 'mejor', 'después',
        'donde', 'mientras', 'estado', 'parte', 'estos', 'también', 'entre',
        'tanto', 'durante', 'están', 'siempre', 'quien', 'todas', 'puede',
        'hacer', 'hemos', 'hasta', 'año', 'años', 'día', 'días',
    }

    text_lower = text.lower()

    # Spanish-specific characters; checked on the lower-cased text so that
    # upper-case forms such as 'Ñ' or 'Á' are recognised too.
    spanish_chars = ('ñ', 'á', 'é', 'í', 'ó', 'ú', 'ü', '¿', '¡')
    if any(char in text_lower for char in spanish_chars):
        return True

    # Tokenise into runs of letters (Unicode-aware) and count how many
    # distinct indicator words appear as whole words.
    words = set(re.findall(r"[^\W\d_]+", text_lower))
    return len(spanish_indicators & words) > 10


def _split_text_into_chunks(text, chunk_size):
    """Split *text* into newline-joined chunks of at most *chunk_size* characters."""
    chunks = []
    current = ""

    for line in text.split('\n'):
        # A single line longer than the limit can never fit in a chunk, so
        # break it up, preferably at a space to avoid cutting words in half.
        while len(line) > chunk_size:
            if current:
                chunks.append(current)
                current = ""
            cut = line.rfind(' ', 0, chunk_size + 1)
            if cut <= 0:
                cut = chunk_size
            chunks.append(line[:cut])
            line = line[cut:].lstrip(' ')

        candidate = f"{current}\n{line}" if current else line
        if len(candidate) <= chunk_size:
            current = candidate
        else:
            chunks.append(current)
            current = line

    if current:
        chunks.append(current)

    return chunks


def _translate_or_keep(translator, chunk, label):
    """Translate *chunk*; on failure or an empty result return it unchanged."""
    try:
        translated = translator.translate(chunk)
    except Exception as e:
        # Best effort: a failed request must not abort the whole document.
        print(f"Error translating {label}: {e}")
        return chunk
    # Guard against a translator that hands back None instead of a string,
    # which would otherwise crash later when the text is split into lines.
    return chunk if translated is None else translated


def translate_text_pages(pages_content, source_lang='es', target_lang='en',
                         chunk_size=4500):
    """Translate extracted pages, preserving the page structure.

    Each page is translated with ``GoogleTranslator``. Pages longer than
    *chunk_size* characters are split on line boundaries (and, for a single
    over-long line, on spaces or at the limit) and translated chunk by chunk;
    the translated chunks are re-joined with newlines. If translating a page
    or chunk fails, the original text of that page or chunk is kept.

    Args:
        pages_content: List of ``{'page_num': ..., 'text': ...}`` dicts.
        source_lang: Language code of the source text.
        target_lang: Language code to translate into.
        chunk_size: Maximum number of characters sent per translation request.

    Returns:
        A list of dicts with keys ``'page_num'``, ``'text'`` (translated text)
        and ``'original_lines'`` (the untranslated text split on newlines).
        An empty list is returned for empty or ``None`` input.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if not pages_content:
        return []

    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    print(f"\nTranslating {len(pages_content)} pages...")

    translated_pages = []
    translator = GoogleTranslator(source=source_lang, target=target_lang)

    for page_info in pages_content:
        page_num = page_info['page_num']
        text = page_info['text']

        print(f"Translating page {page_num} ({len(text)} characters)...")

        if len(text) <= chunk_size:
            # The whole page fits in one request; send it verbatim.
            translated_text = _translate_or_keep(
                translator, text, f"page {page_num}"
            )
        else:
            chunks = _split_text_into_chunks(text, chunk_size)
            translated_text = '\n'.join(
                _translate_or_keep(
                    translator, chunk, f"chunk {i} of page {page_num}"
                )
                for i, chunk in enumerate(chunks, 1)
            )

        translated_pages.append({
            'page_num': page_num,
            'text': translated_text,
            'original_lines': text.split('\n'),
        })

    return translated_pages


def detect_line_type(line):
    """Classify a line of text for layout purposes.

    Returns one of:
        'empty'          - the line is blank after stripping whitespace.
        'header'         - the line is upper-case and shorter than 80 chars.
        'label_value'    - the line contains a colon preceded by a non-empty
                           label shorter than 60 characters.
        'section_header' - a colon-free line shorter than 50 characters that
                           does not end with '.' and mentions one of a few
                           known keywords.
        'regular'        - anything else.
    """
    content = line.strip()
    if not content:
        return 'empty'

    # Short ALL-CAPS lines take precedence over every other category.
    if content.isupper() and len(content) < 80:
        return 'header'

    has_colon = ':' in content
    if has_colon:
        # Only the text before the first colon decides; the value may be empty.
        label = content.split(':', 1)[0].strip()
        if 0 < len(label) < 60:
            return 'label_value'

    section_keywords = ('model', 'registration', 'presenter', 'request')
    title_like = len(content) < 50 and not content.endswith('.') and not has_colon
    if title_like:
        lowered = content.lower()
        for keyword in section_keywords:
            if keyword in lowered:
                return 'section_header'

    return 'regular'


def safe_encode(text):
    """Return *text* restricted to the Latin-1 character set for FPDF.

    Characters that cannot be represented in Latin-1 are replaced with ``?``.
    Because the ``'replace'`` error handler is used, encoding a string never
    raises, so no fallback path is needed.

    Args:
        text: The string to sanitise; ``None`` is treated as an empty string.

    Returns:
        A string containing only Latin-1 characters.
    """
    if text is None:
        return ''
    return text.encode('latin-1', 'replace').decode('latin-1')


def create_english_pdf(translated_pages, output_path, title="Translated Document"):
    """Render the translated pages into a new PDF file.

    One PDF page is started per entry in *translated_pages*. Each line of the
    page text is classified with ``detect_line_type`` and rendered as:

    * header          - bold 14pt, with extra spacing above and below;
    * section header  - bold 11pt;
    * label/value     - label (up to and including the first colon) in bold
                        10pt, followed by the value in regular 10pt;
    * regular text    - regular 10pt, wrapped to the page width.

    All text is passed through ``safe_encode`` before being written.

    Args:
        translated_pages: Iterable of dicts with a ``'text'`` key holding the
            page text (a ``None`` text is treated as empty).
        output_path: Destination path of the PDF file.
        title: Title stored in the PDF document metadata; skipped when empty.
    """
    print(f"\nCreating English PDF: {output_path}")

    # Create a PDF document with proper margins
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=20)
    pdf.set_margins(left=20, top=20, right=20)
    if title:
        pdf.set_title(title)

    for page_info in translated_pages:
        pdf.add_page()

        # A failed translation may leave no text; render an empty page then.
        text = page_info['text'] or ''

        for line in text.split('\n'):
            line_stripped = line.strip()
            if not line_stripped:
                # Blank source line: keep it as vertical spacing.
                pdf.ln(5)
                continue

            line_type = detect_line_type(line_stripped)

            if line_type == 'header':
                # Headers: bold, larger font
                pdf.ln(4)
                pdf.set_font("Helvetica", "B", 14)
                pdf.cell(0, 10, safe_encode(line_stripped), 0, 1, 'L')
                pdf.ln(3)

            elif line_type == 'section_header':
                # Section headers: bold, medium font
                pdf.ln(3)
                pdf.set_font("Helvetica", "B", 11)
                pdf.cell(0, 8, safe_encode(line_stripped), 0, 1, 'L')
                pdf.ln(2)

            elif line_type == 'label_value':
                # detect_line_type only reports 'label_value' for lines that
                # contain a colon, so the split below always finds one.
                label_text, _, value = line_stripped.partition(':')
                label = label_text.strip() + ':'
                value = value.strip()

                # Measure the label in the bold font it is printed with.
                pdf.set_font("Helvetica", "B", 10)
                safe_label = safe_encode(label)
                label_width = pdf.get_string_width(safe_label) + 3
                label_width = min(label_width, 85)  # Cap at 85mm

                # Label stays on the current line (no line break after it).
                pdf.cell(label_width, 6, safe_label, 0, 0, 'L')

                # Value in regular font, wrapped across the remaining width.
                pdf.set_font("Helvetica", "", 10)
                if value:
                    pdf.multi_cell(0, 6, safe_encode(value), 0, 'L')
                else:
                    pdf.ln(6)
                pdf.ln(2)

            else:  # regular text
                pdf.set_font("Helvetica", "", 10)
                pdf.multi_cell(0, 5, safe_encode(line_stripped), 0, 'L')
                pdf.ln(2)

    # Save PDF
    pdf.output(output_path)
    print(f"English PDF created successfully: {output_path}")


def find_spanish_pdfs(directory="."):
    """Find PDF files under *directory* whose text appears to be Spanish.

    The directory is searched recursively for ``*.pdf`` files. Candidates are
    processed in sorted path order so that the result (and therefore which
    document callers treat as "the first one") is the same on every run.

    Args:
        directory: Root directory to search; defaults to the current one.

    Returns:
        A list of ``(pdf_path, pages_content)`` tuples, one per PDF that
        ``detect_spanish`` accepts, where ``pages_content`` is the value
        returned by ``extract_text_from_pdf`` for that file.
    """
    # With recursive=True, "**" also matches zero directories, so this single
    # pattern covers files directly inside `directory` as well as nested ones.
    pattern = os.path.join(directory, "**", "*.pdf")
    pdf_files = sorted(set(glob.glob(pattern, recursive=True)))

    print(f"Found {len(pdf_files)} PDF files")

    spanish_pdfs = []

    for pdf_path in pdf_files:
        print(f"\nChecking: {pdf_path}")
        pages_content = extract_text_from_pdf(pdf_path)

        if not pages_content:
            print("  (Could not extract text)")
            continue

        # Combine all pages' text so detection sees the whole document.
        all_text = '\n\n'.join(page['text'] for page in pages_content)
        if detect_spanish(all_text):
            print(f"✓ This appears to be a Spanish document: {pdf_path}")
            spanish_pdfs.append((pdf_path, pages_content))
        else:
            print("  (Does not appear to be Spanish)")

    return spanish_pdfs


def main():
    """Command-line entry point: translate one Spanish PDF into English.

    With a path argument, that file is processed. Without arguments, the
    current directory is searched for Spanish PDFs and the first one found is
    processed. The result is written to ``<original name>_English.pdf`` in the
    current working directory. Exits with status 1 when the file is missing,
    no text can be extracted, or no Spanish PDF is found.
    """
    banner = "=" * 60
    print(banner)
    print("PDF Translation Tool - Spanish to English")
    print(banner)

    cli_args = sys.argv[1:]
    if cli_args:
        # Explicit file given on the command line.
        pdf_path = cli_args[0]
        if not os.path.exists(pdf_path):
            print(f"Error: File not found: {pdf_path}")
            sys.exit(1)

        print(f"\nProcessing specified file: {pdf_path}")
        pages_content = extract_text_from_pdf(pdf_path)

        if not pages_content:
            print("Error: Could not extract text from PDF")
            sys.exit(1)
    else:
        # No argument: look for Spanish PDFs automatically.
        print("\nSearching for Spanish PDF files...")
        candidates = find_spanish_pdfs()

        if not candidates:
            print("\nNo Spanish PDF files found. Please specify a PDF file:")
            print("Usage: python3 translate_spanish_pdf.py <path_to_pdf>")
            sys.exit(1)

        # The first candidate is always the one processed; when there are
        # several, list them all so the user can see what was skipped.
        pdf_path, pages_content = candidates[0]
        if len(candidates) != 1:
            print(f"\nFound {len(candidates)} Spanish PDF files:")
            for index, (candidate_path, _) in enumerate(candidates, 1):
                print(f"  {index}. {candidate_path}")
            print(f"\nProcessing: {pdf_path}")

    # Shared tail: translate, derive the output name, write the PDF, report.
    translated_pages = translate_text_pages(pages_content)

    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    output_path = f"{stem}_English.pdf"

    create_english_pdf(translated_pages, output_path, title="Translated Document")

    print("\n✓ Translation complete!")
    print(f"  Original: {pdf_path}")
    print(f"  English version: {output_path}")


if __name__ == "__main__":
    main()

