#!/usr/bin/env python3
"""
Job Description Analyzer
Extracts tasks, responsibilities, and decisions from job description documents
and creates a structured overview for management reorganization.

Usage:
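    python analyze_job_descriptions.py
    python analyze_job_descriptions.py --folder path/to/job_descriptions
    python analyze_job_descriptions.py --output excel
    python analyze_job_descriptions.py --output summary
    python analyze_job_descriptions.py --output word

Note: ``--output word`` is accepted by the argument parser, but no Word
writer exists in this script, so that choice does not produce a file.
The default (``both``) writes the Excel workbook and the text summary.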
"""

import argparse
import re
from pathlib import Path
from docx import Document
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
from datetime import datetime

class JobDescriptionAnalyzer:
    """Analyzes job descriptions and extracts structured information."""
    
    def __init__(self, job_descriptions_folder):
        """Initialize with path to job descriptions folder."""
        self.folder = Path(job_descriptions_folder)
        self.roles = []
        
    def extract_text_from_docx(self, file_path):
        """Return the non-blank paragraph text of a Word document.

        Every entry of the document's ``paragraphs`` collection is stripped of
        surrounding whitespace; entries that are empty after stripping are
        dropped and the remainder is joined with newlines.

        Args:
            file_path: Document location, handed directly to ``Document``.

        Returns:
            The joined paragraph text, or ``""`` when anything raised while
            opening or reading the document (the error is printed, not
            propagated).
        """
        try:
            # The generator is consumed by join() inside the try block, so
            # read errors raised lazily are still caught below.
            stripped = (para.text.strip() for para in Document(file_path).paragraphs)
            return '\n'.join(chunk for chunk in stripped if chunk)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return ""
    
    def identify_role_name(self, text, filename):
        """Extract role name from text or filename."""
        # Try to find common patterns
        patterns = [
            r'Job Description[:\s]+([A-Z][^\.\n]+)',
            r'Position[:\s]+([A-Z][^\.\n]+)',
            r'Role[:\s]+([A-Z][^\.\n]+)',
            r'Title[:\s]+([A-Z][^\.\n]+)',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        
        # Fallback to filename
        name = filename.stem
        # Remove common prefixes
        name = re.sub(r'^\d+\s*', '', name)  # Remove leading numbers
        name = re.sub(r'Job Description\s*', '', name, flags=re.IGNORECASE)
        name = re.sub(r'TERMS OF REFERENCE\s*', '', name, flags=re.IGNORECASE)
        return name.strip()
    
    def extract_tasks(self, text):
        """Extract task statements from job description text.

        Two sources are combined:

        1. Lists that directly follow a "Task(s)" / "Duty" / "Duties" heading
           (which also covers "Key Tasks" and "Main Tasks"). Both bulleted
           (``-``, ``•``, ``*``) and numbered (``1.`` / ``1)``) lists are
           recognised, and each list line becomes one task.
        2. Every numbered line anywhere in the text, but only when there are
           more than two of them (taken as a sign of a task list).

        Args:
            text: Full text of the job description, one paragraph per line.

        Returns:
            De-duplicated list of task strings in order of first appearance.
        """
        heading = r'(?:Tasks?|Dut(?:y|ies))'
        list_markers = (r'[-•*]', r'\d+[.)]')  # bulleted, then numbered

        tasks = []
        for marker in list_markers:
            section_re = heading + r'[:\s]*\n((?:' + marker + r'\s*.+\n?)+)'
            # Items are split with the same marker that delimited the section
            # and anchored at line start. The previous unanchored bullet regex
            # dropped numbered items and cut "Co-ordinate" at its hyphen.
            item_re = r'^[ \t]*' + marker + r'\s*(.+)$'
            for section in re.finditer(section_re, text, re.IGNORECASE):
                items = re.findall(item_re, section.group(1), re.MULTILINE)
                tasks.extend(item.strip() for item in items if item.strip())

        # Also look for numbered lists anywhere in the document.
        numbered_tasks = re.findall(r'^\d+[\.\)]\s*(.+?)$', text, re.MULTILINE)
        if len(numbered_tasks) > 2:  # Likely a task list
            tasks.extend(t.strip() for t in numbered_tasks if t.strip())

        # dict.fromkeys removes duplicates while keeping document order;
        # set() made the output order vary from run to run.
        return list(dict.fromkeys(tasks))
    
    def extract_responsibilities(self, text):
        """Extract responsibility statements from job description text.

        Looks for lists that directly follow a "Responsibility" /
        "Responsibilities" heading (which also covers "Key ..." and
        "Main ..." variants). Both bulleted (``-``, ``•``, ``*``) and
        numbered (``1.`` / ``1)``) lists are recognised, and each list line
        becomes one responsibility.

        Args:
            text: Full text of the job description, one paragraph per line.

        Returns:
            De-duplicated list of responsibility strings in order of first
            appearance.
        """
        heading = r'(?:Responsibilit(?:y|ies))'
        list_markers = (r'[-•*]', r'\d+[.)]')  # bulleted, then numbered

        responsibilities = []
        for marker in list_markers:
            section_re = heading + r'[:\s]*\n((?:' + marker + r'\s*.+\n?)+)'
            # Items are split with the same marker that delimited the section
            # and anchored at line start. The previous unanchored bullet regex
            # dropped numbered items and cut hyphenated words in two.
            item_re = r'^[ \t]*' + marker + r'\s*(.+)$'
            for section in re.finditer(section_re, text, re.IGNORECASE):
                items = re.findall(item_re, section.group(1), re.MULTILINE)
                responsibilities.extend(item.strip() for item in items if item.strip())

        # Ordered de-duplication; set() made the output order vary per run.
        return list(dict.fromkeys(responsibilities))
    
    def extract_decisions(self, text):
        """Extract statements about decision-making authority.

        Searches for decision-related keywords (authority, authorize(d),
        decide, decision, approve, approval, can, may, "has authority to",
        "responsible for deciding") and captures the text that follows each
        keyword up to the next full stop or line break.

        Keywords are matched case-insensitively as whole words. Common
        inflections ("decisions", "approved", "authorised") are accepted so
        that the capture starts after the complete word.

        Args:
            text: Full text of the job description.

        Returns:
            De-duplicated list of captured fragments longer than 10
            characters, in order of first appearance (first keyword group,
            then the second).
        """
        decision_patterns = (
            r'\b(?:authority|authori[sz]e[sd]?|decide[sd]?|decisions?'
            r'|approve[sd]?|approvals?)\b[:\s]*([^.\n]+)',
            r'\b(?:can|may|has authority to|responsible for deciding)\b'
            r'[:\s]*([^.\n]+)',
        )

        decisions = []
        for pattern in decision_patterns:
            # Without the \b anchors "can" fired inside "candidate" or
            # "significant" and "decision" left a stray "s ..." capture.
            for match in re.finditer(pattern, text, re.IGNORECASE):
                decision = match.group(1).strip()
                if len(decision) > 10:  # Filter out very short matches
                    decisions.append(decision)

        # Ordered de-duplication; set() made the output order vary per run.
        return list(dict.fromkeys(decisions))
    
    def extract_reporting(self, text):
        """Extract reporting relationships from job description text.

        Upward relationships are taken from the text following "report(s) to",
        "reporting to", "accountable to" and "line manager". Downward
        relationships are taken from the text following "direct report(s)",
        "manage(s)" and "supervise(s)". Each capture runs up to the next full
        stop or line break.

        Args:
            text: Full text of the job description.

        Returns:
            A ``(reporting_to, direct_reports)`` tuple of de-duplicated lists.
            Each list is ordered by pattern, then by position in the text.
        """
        # NOTE: matching is case-insensitive, so the leading [A-Z] accepts
        # lower-case letters too; "reports to the Finance Director" is
        # therefore captured as "the Finance Director".
        report_patterns = (
            r'\breports? to[:\s]+([A-Z][^\.\n]+)',
            r'\breporting to[:\s]+([A-Z][^\.\n]+)',
            r'\baccountable to[:\s]+([A-Z][^\.\n]+)',
            r'\bline manager[:\s]+([A-Z][^\.\n]+)',
        )
        direct_report_patterns = (
            r'\bdirect reports?[:\s]+([^\.\n]+)',
            r'\bmanages?[:\s]+([^\.\n]+)',
            r'\bsupervises?[:\s]+([^\.\n]+)',
        )

        reporting_to = [
            match.group(1).strip()
            for pattern in report_patterns
            for match in re.finditer(pattern, text, re.IGNORECASE)
        ]
        reports = [
            match.group(1).strip()
            for pattern in direct_report_patterns
            for match in re.finditer(pattern, text, re.IGNORECASE)
        ]

        # Ordered de-duplication; set() made the output order vary per run.
        return list(dict.fromkeys(reporting_to)), list(dict.fromkeys(reports))
    
    def analyze_document(self, file_path):
        """Analyze a single job description document."""
        print(f"Analyzing: {file_path.name}")
        
        text = self.extract_text_from_docx(file_path)
        if not text:
            print(f"  Warning: Could not extract text from {file_path.name}")
            return None
        
        role_name = self.identify_role_name(text, file_path)
        tasks = self.extract_tasks(text)
        responsibilities = self.extract_responsibilities(text)
        decisions = self.extract_decisions(text)
        reporting_to, direct_reports = self.extract_reporting(text)
        
        # If no tasks/responsibilities found, try to extract from full text
        if not tasks and not responsibilities:
            # Look for bullet points or numbered items in general
            all_items = re.findall(r'[-•*]\s*(.+?)(?=\n[-•*]|\n\n|$)', text, re.MULTILINE)
            if all_items:
                # Use first few as responsibilities if no specific section found
                responsibilities = [item.strip() for item in all_items[:10] if len(item.strip()) > 20]
        
        role_data = {
            'role_name': role_name,
            'filename': file_path.name,
            'tasks': tasks,
            'responsibilities': responsibilities,
            'decisions': decisions,
            'reporting_to': reporting_to,
            'direct_reports': direct_reports,
            'full_text': text[:500]  # First 500 chars for reference
        }
        
        return role_data
    
    def analyze_all_documents(self):
        """Analyze every job description document in ``self.folder``.

        Regular files with a ``.docx`` suffix (any letter case) are processed
        in sorted order. Files whose name starts with ``~$`` are skipped:
        Word creates such lock files next to a document that is open, and
        they are not readable documents. Each successfully analyzed document
        appends one dict to ``self.roles``.

        Returns:
            None. Progress and a final count are printed.
        """
        if self.folder.is_dir():
            docx_files = sorted(
                path for path in self.folder.iterdir()
                if path.is_file()
                and path.suffix.lower() == '.docx'
                and not path.name.startswith('~$')
            )
        else:
            # A missing folder is reported as "nothing found" rather than
            # raising from iterdir().
            docx_files = []

        if not docx_files:
            print(f"No .docx files found in {self.folder}")
            return

        print(f"Found {len(docx_files)} job description files\n")

        for file_path in docx_files:
            role_data = self.analyze_document(file_path)
            if role_data:
                self.roles.append(role_data)

        print(f"\nAnalyzed {len(self.roles)} job descriptions")
    
    def create_excel_output(self, output_path):
        """Write the analyzed roles to an Excel workbook.

        The workbook has a single sheet, "Tasks & Responsibilities", with a
        styled, frozen header row and one row per entry of ``self.roles``.
        Tasks, responsibilities and decisions are capped at ten entries per
        cell, one entry per line. The "Department/Area" and "Notes" columns
        are written empty so they can be filled in manually.

        Args:
            output_path: Destination passed to ``Workbook.save``.
        """
        # (header text, column width) for columns A..I, in sheet order.
        column_specs = (
            ('Role/Position', 25),
            ('Department/Area', 20),
            ('Key Tasks', 40),
            ('Key Responsibilities', 40),
            ('Decision-Making Authority', 40),
            ('Reports To', 25),
            ('Direct Reports', 25),
            ('Source Document', 30),
            ('Notes', 30),
        )

        wb = Workbook()
        ws = wb.active
        ws.title = "Tasks & Responsibilities"

        # Shared styles.
        border = Border(
            **{edge: Side(style='thin') for edge in ('left', 'right', 'top', 'bottom')}
        )
        header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
        header_font = Font(bold=True, color="FFFFFF", size=11)
        header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
        body_alignment = Alignment(vertical='top', wrap_text=True)

        # Header row.
        for col_num, (header, _width) in enumerate(column_specs, 1):
            header_cell = ws.cell(row=1, column=col_num, value=header)
            header_cell.fill = header_fill
            header_cell.font = header_font
            header_cell.alignment = header_alignment
            header_cell.border = border

        # One data row per role, starting below the header.
        for row_num, role in enumerate(self.roles, 2):
            row_values = (
                role['role_name'],
                "",  # Department - to be filled manually
                '\n'.join(role['tasks'][:10]),
                '\n'.join(role['responsibilities'][:10]),
                '\n'.join(role['decisions'][:10]),
                '\n'.join(role['reporting_to']),
                '\n'.join(role['direct_reports']),
                role['filename'],
                "",  # Notes - to be filled manually
            )
            for col_num, value in enumerate(row_values, 1):
                body_cell = ws.cell(row=row_num, column=col_num, value=value)
                body_cell.border = border
                body_cell.alignment = body_alignment
            # Fixed height so wrapped multi-line cells are visible.
            ws.row_dimensions[row_num].height = 100

        # Column widths.
        for letter, (_header, width) in zip('ABCDEFGHI', column_specs):
            ws.column_dimensions[letter].width = width

        # Keep the header visible while scrolling.
        ws.freeze_panes = 'A2'

        wb.save(output_path)
        print(f"\nExcel file created: {output_path}")
    
    def create_summary_report(self, output_path):
        """Write a plain-text summary of all analyzed roles.

        The report starts with a title, the generation time and the number of
        roles. Each role then gets a ruled heading, its reporting lines (when
        present) and numbered lists of tasks and responsibilities (at most 15
        each) and decisions (at most 10). Empty lists are left out.

        Args:
            output_path: File to create or overwrite, written as UTF-8.
        """
        rule = "=" * 60
        # (section title, key in the role dict, maximum entries written)
        numbered_sections = (
            ("KEY TASKS:\n", 'tasks', 15),
            ("KEY RESPONSIBILITIES:\n", 'responsibilities', 15),
            ("DECISION-MAKING AUTHORITY:\n", 'decisions', 10),
        )
        reporting_lines = (
            ("Reports To", 'reporting_to'),
            ("Direct Reports", 'direct_reports'),
        )

        with open(output_path, 'w', encoding='utf-8') as f:
            generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            f.write("JOB DESCRIPTION ANALYSIS SUMMARY\n")
            f.write(rule + "\n\n")
            f.write(f"Generated: {generated_at}\n")
            f.write(f"Total Roles Analyzed: {len(self.roles)}\n\n")

            for role in self.roles:
                # Ruled heading for the role.
                f.write(f"\n{rule}\n")
                f.write(f"ROLE: {role['role_name']}\n")
                f.write(f"Source: {role['filename']}\n")
                f.write(f"{rule}\n\n")

                for label, key in reporting_lines:
                    names = role[key]
                    if names:
                        f.write(f"{label}: {', '.join(names)}\n")
                f.write("\n")

                for title, key, limit in numbered_sections:
                    entries = role[key]
                    if not entries:
                        continue
                    f.write(title)
                    f.writelines(
                        f"  {position}. {entry}\n"
                        for position, entry in enumerate(entries[:limit], 1)
                    )
                    f.write("\n")

        print(f"Summary report created: {output_path}")


def main(argv=None):
    """Command-line entry point: analyze a folder and write the reports.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv``, which is the behaviour when the
            script is run directly.

    Returns:
        None. Problems (missing folder, no usable documents, unsupported
        output format) are reported on stdout.
    """
    parser = argparse.ArgumentParser(description='Analyze job descriptions and extract tasks, responsibilities, and decisions')
    parser.add_argument('--folder', default='job_descriptions', help='Folder containing job description files')
    parser.add_argument('--output', choices=['excel', 'word', 'both', 'summary'], default='both',
                       help='Output format (default: both = excel + summary; '
                            'word is accepted but not implemented)')

    args = parser.parse_args(argv)

    folder_path = Path(args.folder)
    # is_dir() also rejects a path that exists but is a regular file.
    if not folder_path.is_dir():
        print(f"Error: Folder '{folder_path}' does not exist or is not a directory")
        return

    analyzer = JobDescriptionAnalyzer(folder_path)
    analyzer.analyze_all_documents()

    if not analyzer.roles:
        print("No roles found to analyze")
        return

    # Reports are written next to the source documents.
    output_folder = folder_path
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    if args.output in ['excel', 'both']:
        excel_path = output_folder / f'Task_Responsibility_Decision_Analysis_{timestamp}.xlsx'
        analyzer.create_excel_output(excel_path)

    if args.output in ['summary', 'both']:
        summary_path = output_folder / f'Analysis_Summary_{timestamp}.txt'
        analyzer.create_summary_report(summary_path)

    if args.output == 'word':
        # Previously this choice silently produced nothing.
        print("Warning: Word output is not implemented; no file was written. "
              "Use --output excel, summary or both.")

    print("\nAnalysis complete!")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

