#!/usr/bin/env python3
"""
Data Anonymizer — Reversible PII Scrubbing

Swaps real PII with structurally equivalent fakes before data leaves
the device. Reverses the swap when responses come back. The cloud LLM
never sees real names, addresses, or credentials.

Designed as a standalone module that can become an MCP server.

Usage:
    from anonymizer import Anonymizer

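    anon = Anonymizer()
    anon.load_profile("~/.hermes/user_profile.md")  # registers emails/addresses found in the profile
    anon.add_entity("John Smith", "name")  # names and cities must be registered explicitly
    anon.add_entity("Surrey", "city")

    scrubbed, swap_id = anon.anonymize("John Smith lives in Surrey")
    # "David Chen lives in Portland"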

    original = anon.deanonymize(scrubbed, swap_id)
    # "John Smith lives in Surrey"
"""

import re
import secrets
from pathlib import Path
from typing import Optional

# ---------------------------------------------------------------------------
# Default substitution pools (plausible fake data)
# ---------------------------------------------------------------------------

# Each pool is an ordered list of stand-in values for one PII category;
# Anonymizer._get_fake picks from them by index.

# Full names in "First Last" form.
FAKE_NAMES = [
    "David Chen", "Sarah Miller", "James Wilson", "Emily Park",
    "Michael Brown", "Lisa Zhang", "Robert Taylor", "Maria Garcia",
    "Thomas Lee", "Jennifer Adams", "William Kim", "Rachel Green",
]

# City names.
FAKE_CITIES = [
    "Portland", "Austin", "Denver", "Seattle", "Phoenix",
    "Nashville", "Charlotte", "Minneapolis", "Orlando", "Atlanta",
]

# Street addresses in "<number> <street name> <street type>" form.
FAKE_STREETS = [
    "123 Oak Avenue", "456 Pine Street", "789 Maple Drive",
    "321 Cedar Lane", "654 Birch Road", "987 Elm Boulevard",
]

# Email addresses.
FAKE_EMAILS = [
    "dchen@oakridge.com", "smiller@pinecrest.net", "jwilson@maplewood.org",
    "epark@cedarhill.io", "mbrown@birchfield.co", "lzhang@elmwood.dev",
]

# Company names.
FAKE_COMPANIES = [
    "Oakridge Solutions", "Pinecrest Industries", "Maplewood Consulting",
    "Cedarhill Tech", "Birchfield Group", "Elmwood Partners",
]

# Phone numbers in "NNN-555-NNNN" form.
FAKE_PHONES = [
    "503-555-0142", "512-555-0187", "303-555-0234",
    "206-555-0156", "602-555-0198", "615-555-0167",
]


class Anonymizer:
    """Reversible PII anonymization engine.

    Real values are registered with :meth:`load_profile` or
    :meth:`add_entity`; emails and phone numbers are additionally detected
    by pattern. :meth:`anonymize` swaps them for fakes and records a
    ``fake -> real`` table under a fresh swap id, which :meth:`deanonymize`
    uses to swap them back.

    Attributes:
        substitution_tables: swap id -> ``{fake: real}`` table, one entry per
            :meth:`anonymize` call. Held in memory only.
        known_entities: registered real value -> category name.
    """

    # Patterns for PII that is detected without prior registration.
    _EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
    _PHONE_RE = re.compile(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b")
    # Rough heuristic for address lines found in a profile file.
    _ADDRESS_RE = re.compile(
        r"\d{1,5}[-\s]\d{1,5}\s+\d{1,5}\s+\w+(?:\s+\w+)*,?\s+\w+\s+\w+\s+\w+"
    )

    def __init__(self):
        self.substitution_tables: dict[str, dict] = {}
        self.known_entities: dict[str, str] = {}  # real → category
        # One rotating cursor per fake-data pool.
        self._name_idx = 0
        self._city_idx = 0
        self._email_idx = 0
        self._company_idx = 0
        self._phone_idx = 0
        self._address_idx = 0

    def load_profile(self, profile_path: str):
        """Load known PII from a user profile file.

        Every email address and every address-like span (rough heuristic)
        found in the file is registered in ``known_entities``. A path that
        does not exist is ignored.

        Args:
            profile_path: Path to the profile; ``~`` is expanded.
        """
        path = Path(profile_path).expanduser()
        if not path.exists():
            return

        # Explicit encoding so the result does not depend on the locale.
        content = path.read_text(encoding="utf-8")

        for email in self._EMAIL_RE.findall(content):
            self.known_entities[email] = "email"

        for address in self._ADDRESS_RE.findall(content):
            self.known_entities[address] = "address"

    def add_entity(self, real: str, category: str):
        """Manually register a PII entity.

        Args:
            real: The literal text to scrub.
            category: Which fake pool to draw from: ``"name"``, ``"city"``,
                ``"email"``, ``"company"``, ``"phone"`` or ``"address"``.
                Any other category is replaced by a generic placeholder.
        """
        self.known_entities[real] = category

    def _get_fake(self, category: str) -> str:
        """Return the next fake value for *category*, rotating through its pool.

        Categories without a pool yield the placeholder ``"[ANONYMIZED]"``.
        """
        if category == "name":
            pool, cursor = FAKE_NAMES, "_name_idx"
        elif category == "city":
            pool, cursor = FAKE_CITIES, "_city_idx"
        elif category == "email":
            pool, cursor = FAKE_EMAILS, "_email_idx"
        elif category == "company":
            pool, cursor = FAKE_COMPANIES, "_company_idx"
        elif category == "phone":
            # Rotated like every other pool: indexing by hash(category)
            # returned one and the same fake for every phone number.
            pool, cursor = FAKE_PHONES, "_phone_idx"
        elif category == "address":
            pool, cursor = FAKE_STREETS, "_address_idx"
        else:
            return "[ANONYMIZED]"

        position = getattr(self, cursor, 0)
        setattr(self, cursor, position + 1)
        return pool[position % len(pool)]

    def _unique_fake(self, category: str, table: dict, *texts: str) -> str:
        """Return a fake for *category* that can be reversed unambiguously.

        A candidate is rejected if it is already a key of *table* (it would
        overwrite an earlier mapping) or already occurs in any of *texts*
        (restoring would rewrite text that was never substituted). Once the
        pool has cycled without a usable value, numbered placeholders
        ``"[ANONYMIZED-2]"``, ``"[ANONYMIZED-3]"``, ... are used instead.
        """

        def is_free(candidate: str) -> bool:
            return candidate not in table and not any(candidate in t for t in texts)

        tried: set[str] = set()
        candidate = self._get_fake(category)
        while candidate not in tried:  # a repeat means the pool has cycled
            if is_free(candidate):
                return candidate
            tried.add(candidate)
            candidate = self._get_fake(category)

        suffix = 2
        while True:
            candidate = f"[ANONYMIZED-{suffix}]"
            if is_free(candidate):
                return candidate
            suffix += 1

    def anonymize(self, text: str) -> tuple[str, str]:
        """Anonymize PII in text.

        Registered entities are replaced first, then emails and phone
        numbers detected by pattern. Every distinct real value gets its own
        fake so the substitution can be reversed.

        Args:
            text: The text to scrub.

        Returns:
            ``(anonymized_text, swap_id)``. Pass ``swap_id`` to
            :meth:`deanonymize` to reverse the anonymization later.
        """
        swap_id = secrets.token_hex(8)
        table: dict[str, str] = {}  # fake → real
        result = text

        # Registered entities, longest first so a short entity never
        # replaces part of a longer one.
        by_length = sorted(self.known_entities.items(), key=lambda item: -len(item[0]))
        for real, category in by_length:
            # An empty string is "in" every text and would be replaced
            # between every character.
            if not real or real not in result:
                continue
            fake = self._unique_fake(category, table, text, result)
            table[fake] = real
            result = result.replace(real, fake)

        # Auto-detected emails. dict.fromkeys de-duplicates while keeping
        # first-seen order; fakes inserted above are skipped.
        for email in dict.fromkeys(self._EMAIL_RE.findall(result)):
            if email in table or email not in result:
                continue
            fake = self._unique_fake("email", table, text, result)
            table[fake] = email
            result = result.replace(email, fake)

        # Auto-detected phone numbers. Fake phones inserted for registered
        # entities match the pattern too and must not be mapped again.
        for phone in dict.fromkeys(self._PHONE_RE.findall(result)):
            if phone in table or phone not in result:
                continue
            fake = self._unique_fake("phone", table, text, result)
            table[fake] = phone
            result = result.replace(phone, fake)

        # Store table for reversal
        self.substitution_tables[swap_id] = table
        return result, swap_id

    def deanonymize(self, text: str, swap_id: str) -> str:
        """Reverse anonymization using the swap_id.

        Args:
            text: Text containing fakes produced by :meth:`anonymize`.
            swap_id: The id returned by that :meth:`anonymize` call.

        Returns:
            *text* with every recorded fake replaced by its real value, or
            *text* unchanged if the swap id is unknown or its table is empty.
        """
        table = self.substitution_tables.get(swap_id)
        if not table:
            return text

        result = text
        # Replace fakes back to real (longest first to avoid partial matches)
        for fake, real in sorted(table.items(), key=lambda item: -len(item[0])):
            result = result.replace(fake, real)

        return result

    def get_table(self, swap_id: str) -> Optional[dict]:
        """Get the ``{fake: real}`` substitution table for debugging.

        Returns ``None`` if *swap_id* is unknown.
        """
        return self.substitution_tables.get(swap_id)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main():
    """Command-line entry point.

    Subcommands:
        test       Anonymize and restore a sample sentence and verify that
                   nothing leaked and the round trip is exact.
        anonymize  Scrub the given text, using entities loaded from
                   ``~/.hermes/user_profile.md``.

    With no subcommand the help text is printed.

    Raises:
        AssertionError: If the self-test finds leaked PII or a failed restore.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Data Anonymizer")
    sub = parser.add_subparsers(dest="command")

    sub.add_parser("test", help="Run self-tests")

    anon_cmd = sub.add_parser("anonymize", help="Anonymize text")
    anon_cmd.add_argument("text")

    args = parser.parse_args()

    if args.command == "test":
        print("Running Anonymizer self-tests...\n")

        anon = Anonymizer()
        anon.add_entity("John Smith", "name")
        anon.add_entity("jsmith@example.com", "email")
        anon.add_entity("Surrey", "city")
        anon.add_entity("Acme Corp", "company")

        text = "John Smith from Acme Corp in Surrey wants to invest $2M. Email: jsmith@example.com"
        scrubbed, swap_id = anon.anonymize(text)
        restored = anon.deanonymize(scrubbed, swap_id)

        print(f"  Original:  {text}")
        print(f"  Scrubbed:  {scrubbed}")
        print(f"  Restored:  {restored}")
        print(f"  Table:     {anon.get_table(swap_id)}")
        print()

        # Fragments of the registered entities that must not survive
        # scrubbing, each with the failure message to report.
        leak_checks = [
            ("John", "Name leaked"),
            ("Smith", "Name leaked"),
            ("Acme", "Company leaked"),
            ("Surrey", "City leaked"),
            ("jsmith", "Email leaked"),
        ]
        # Raised explicitly rather than via ``assert`` so the checks still
        # run under ``python -O``.
        for fragment, message in leak_checks:
            if fragment in scrubbed:
                raise AssertionError(message)
        if restored != text:
            raise AssertionError(f"Restoration failed: {restored}")

        print("  All tests passed!")

    elif args.command == "anonymize":
        anon = Anonymizer()
        anon.load_profile("~/.hermes/user_profile.md")
        scrubbed, swap_id = anon.anonymize(args.text)
        print(f"Scrubbed: {scrubbed}")
        # NOTE: the substitution table lives only in this Anonymizer
        # instance, so the swap ID is informational here.
        print(f"Swap ID:  {swap_id}")

    else:
        parser.print_help()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
