#!/usr/bin/env python3
"""
Split customer statement PDF by detecting customer account numbers.
Reads the account number from each page's text and writes each run of consecutive pages with the same account to its own PDF.
"""
import sys
import json
import os
import re

# PyMuPDF is imported inside a guard so that a missing dependency is reported
# as a JSON error object on stdout (exit status 1) instead of a traceback.
try:
    import fitz  # PyMuPDF
except Exception as e:
    print(json.dumps({
        "ok": False,
        "error": "PyMuPDF is not available. Install with: pip3 install pymupdf",
        "details": str(e)
    }))
    sys.exit(1)

# Same guard for the OCR stack: if any of these imports fails, print a JSON
# error object and exit with status 1.
try:
    import pytesseract
    from PIL import Image
    import io
except Exception as e:
    print(json.dumps({
        "ok": False,
        "error": "OCR libraries not available. Install with: pip3 install pytesseract pillow",
        "details": str(e)
    }))
    sys.exit(1)


def extract_text_from_page(page):
    """
    Return the text of a PDF page, falling back to OCR for image-only pages.

    The embedded text layer is read first with ``page.get_text()`` because it
    is fast and exact. Only when that yields nothing but whitespace (or raises)
    is the page rendered to an image and passed through Tesseract.

    Args:
        page: Object exposing ``get_text()`` and ``get_pixmap()`` (a PyMuPDF
            page when called from this script).

    Returns:
        The extracted text. "" when the text layer could not be read and OCR
        produced nothing. Errors from either step are swallowed: extraction is
        best-effort so one bad page does not abort the whole split.
    """
    try:
        text = page.get_text()
    except Exception:
        text = ""

    # A usable text layer wins; no need to pay for rendering + OCR.
    if text and text.strip():
        return text

    try:
        # PDF user space is 72 units per inch; scale so the render is ~300 DPI,
        # which gives Tesseract enough pixels to work with.
        zoom = 300 / 72
        pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        image = Image.open(io.BytesIO(pixmap.tobytes("png")))
        ocr_text = pytesseract.image_to_string(image)
    except Exception:
        # OCR unavailable or failed: behave as if there were no fallback.
        return text

    if ocr_text and ocr_text.strip():
        return ocr_text
    return text


def extract_account_number(text):
    """
    Extract a customer account identifier from page text.

    Detection runs in three stages and returns the first hit:

    1. Labelled values: one of the keywords "Account", "Acc", "A/C",
       "Customer" or "Cust", optionally followed by a qualifier
       ("Number", "Num", "No", with optional trailing dot), then one or more
       of ``:``, ``#`` or whitespace, then a run of letters, digits and
       hyphens. The qualifier is skipped so that "Account Number: 12345"
       yields "12345" rather than "Number".
    2. A line starting with 1-4 letters, an optional hyphen and 1-6 digits
       (e.g. "ABC-12345" or "ABC 12345").
    3. Fallback: among the first 10 lines of the first 500 characters, the
       first line made only of upper-case letters, digits, hyphens and
       whitespace (3-20 characters) that contains at least one digit.

    Stages 1 and 2 are case-insensitive; stage 3 is case-sensitive.

    Args:
        text: Page text; may be empty or None.

    Returns:
        The detected identifier as a string, or None when nothing matched.
    """
    if not text:
        return None

    flags = re.IGNORECASE | re.MULTILINE

    # Optional qualifier between the keyword and the separator. Non-capturing,
    # so the identifier stays in group 1. The \b stops "No" from matching the
    # start of a longer word such as "Note".
    qualifier = r'(?:\s*(?:Number|Num|No)\b\.?)?'
    keywords = ('Account', 'Acc', 'A/C', 'Customer', 'Cust')
    patterns = [kw + qualifier + r'[:#\s]+([A-Z0-9\-]+)' for kw in keywords]
    # Unlabelled code at the start of a line, like "ABC-12345" or "ABC 12345".
    patterns.append(r'^[A-Z]{1,4}\s*-?\s*\d{1,6}')

    for pattern in patterns:
        match = re.search(pattern, text, flags)
        if match:
            # Labelled patterns carry the identifier in group 1; the
            # unlabelled pattern has no group, so the whole match is the value.
            account = match.group(1) if match.lastindex else match.group(0)
            return account.strip()

    # Nothing matched: look near the top of the page (first 500 characters,
    # at most 10 lines) for a line that looks like a bare identifier.
    first_section = text[:500].strip()
    for line in first_section.split('\n')[:10]:
        line = line.strip()
        if re.match(r'^[A-Z0-9\-\s]{3,20}$', line) and any(c.isdigit() for c in line):
            return line

    return None


def _fail(message, details=None):
    """Print a JSON failure object to stdout and exit with status 1."""
    payload = {"ok": False, "error": message}
    if details is not None:
        payload["details"] = details
    print(json.dumps(payload))
    sys.exit(1)


def _group_consecutive_pages(page_accounts):
    """Collapse per-page records into runs of adjacent pages sharing an account."""
    groups = []
    for info in page_accounts:
        page_num = info['page_num']
        if groups and groups[-1]['account'] == info['account']:
            # Same account as the previous page (None == None also extends).
            groups[-1]['end_page'] = page_num
        else:
            groups.append({
                'account': info['account'],
                'start_page': page_num,
                'end_page': page_num
            })
    return groups


def _write_group(doc, group, idx, output_dir):
    """Save one page group as its own PDF and return its summary record."""
    # Groups with no detected account get a unique placeholder name.
    account = group['account'] or f"UNKNOWN_{idx}"
    start = group['start_page']
    end = group['end_page']
    # Page numbers are 0-based internally, 1-based in names and reports.
    page_range = f"{start + 1}-{end + 1}"

    # Keep only word characters, hyphens and dots so the name is filesystem-safe.
    safe_account = re.sub(r'[^\w\-\.]', '_', str(account))
    filename = f"statement_{safe_account}_pages_{page_range}.pdf"
    output_path = os.path.join(output_dir, filename)

    new_doc = fitz.open()
    try:
        new_doc.insert_pdf(doc, from_page=start, to_page=end)
        new_doc.save(output_path)
    finally:
        # Release the document even when copying or saving fails.
        new_doc.close()

    return {
        'account': str(account),
        'page_count': end - start + 1,
        'page_range': page_range,
        'filename': filename,
        'path': output_path
    }


def main():
    """
    Split the input PDF into one PDF per run of pages sharing an account.

    Command line: ``split_statement_pdf.py input.pdf output_dir``.

    Each page's text is scanned for an account identifier, consecutive pages
    with the same identifier (including "none found") are grouped, and each
    group is written to ``output_dir`` as
    ``statement_<account>_pages_<first>-<last>.pdf``.

    Exactly one JSON object is printed to stdout. On success it has
    ``ok: true`` plus ``statement_count``, ``page_count``, ``statements`` and
    ``page_details``. On any failure (bad usage, missing input, empty PDF,
    output directory that cannot be created, processing error) it has
    ``ok: false`` with an ``error`` message and the process exits with
    status 1.
    """
    if len(sys.argv) < 3:
        _fail("Usage: split_statement_pdf.py input.pdf output_dir")

    input_pdf = sys.argv[1]
    output_dir = sys.argv[2]

    if not os.path.isfile(input_pdf):
        _fail(f"Input PDF not found: {input_pdf}")

    try:
        # Inside the try so an uncreatable directory is reported as JSON too.
        os.makedirs(output_dir, exist_ok=True)

        doc = fitz.open(input_pdf)
        try:
            total_pages = len(doc)
            if total_pages == 0:
                # SystemExit is not an Exception, so it bypasses the handler
                # below while still running the finally that closes the doc.
                _fail("PDF has no pages")

            # Detect the account on every page.
            page_accounts = []
            for page_num in range(total_pages):
                text = extract_text_from_page(doc[page_num])
                page_accounts.append({
                    'page_num': page_num,
                    'account': extract_account_number(text),
                    'text_sample': text[:200] if text else ''
                })

            groups = _group_consecutive_pages(page_accounts)
            statements = [
                _write_group(doc, group, idx, output_dir)
                for idx, group in enumerate(groups)
            ]
        finally:
            # Always release the source document, on success and on error.
            doc.close()

        print(json.dumps({
            "ok": True,
            "statement_count": len(statements),
            "page_count": total_pages,
            "statements": statements,
            "page_details": page_accounts
        }))

    except Exception as e:
        _fail("Failed to split PDF", str(e))


# Run the splitter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
