#!/usr/bin/env python3
"""
Knowledge Ingestion Pipeline for Local Timmy
Issue #87 — Auto-ingest Intelligence

Automatically ingest papers, docs, and techniques into retrievable
knowledge items backed by a SQLite store.

Usage:
    python ingest.py FILE             # ingest a single file
    python ingest.py DIR --batch      # ingest a directory tree
    python ingest.py --search QUERY   # keyword search
    python ingest.py --tag TAG        # list items by tag
    python ingest.py --stats          # show store statistics
"""

import argparse
import sqlite3
import hashlib
import json
import os
import re
from collections import Counter
from pathlib import Path
from typing import Optional, List, Dict, Any
from dataclasses import dataclass
from datetime import datetime

# Single source of truth for the file types the pipeline can process.
# (ingest_file and ingest_batch previously disagreed about '.js'.)
SUPPORTED_SUFFIXES = ('.md', '.txt', '.py', '.js', '.sh')


@dataclass
class KnowledgeItem:
    """A piece of ingested knowledge."""
    name: str        # human-readable title
    summary: str     # short abstract (~200 chars)
    source: str      # originating file path
    actions: List[str]   # actionable bullet points extracted from the text
    tags: List[str]      # topical tags used for retrieval
    full_text: str       # complete original text (also used for dedup hashing)
    embedding: Optional[List[float]] = None  # optional vector; not populated yet


class KnowledgeStore:
    """SQLite-backed knowledge storage with content-hash de-duplication."""

    def __init__(self, db_path: str = "~/.timmy/data/knowledge.db"):
        self.db_path = Path(db_path).expanduser()
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _init_db(self) -> None:
        """Create the schema and indexes on first use (idempotent)."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS knowledge (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    summary TEXT NOT NULL,
                    source TEXT NOT NULL,
                    actions TEXT,      -- JSON list
                    tags TEXT,         -- JSON list
                    full_text TEXT,
                    embedding BLOB,
                    hash TEXT UNIQUE,
                    ingested_at TEXT,
                    applied INTEGER DEFAULT 0,
                    access_count INTEGER DEFAULT 0
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_tags ON knowledge(tags)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_source ON knowledge(source)
            """)

    def _compute_hash(self, text: str) -> str:
        """Return a stable, truncated SHA-256 hex digest used for dedup."""
        return hashlib.sha256(text.encode()).hexdigest()[:32]

    def add(self, item: KnowledgeItem) -> bool:
        """Add a knowledge item.

        Returns False if an item with the same full_text hash already exists.
        """
        item_hash = self._compute_hash(item.full_text)
        with sqlite3.connect(self.db_path) as conn:
            # Check for duplicate before inserting.
            existing = conn.execute(
                "SELECT id FROM knowledge WHERE hash = ?", (item_hash,)
            ).fetchone()
            if existing:
                return False

            conn.execute(
                """INSERT INTO knowledge
                   (name, summary, source, actions, tags, full_text,
                    embedding, hash, ingested_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    item.name,
                    item.summary,
                    item.source,
                    json.dumps(item.actions),
                    json.dumps(item.tags),
                    item.full_text,
                    json.dumps(item.embedding) if item.embedding else None,
                    item_hash,
                    datetime.now().isoformat(),
                ),
            )
            return True

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """Search knowledge items by keyword (simple LIKE match for now)."""
        pattern = f"%{query}%"
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                """SELECT name, summary, source, tags, actions, ingested_at
                   FROM knowledge
                   WHERE name LIKE ? OR summary LIKE ? OR full_text LIKE ?
                   ORDER BY ingested_at DESC LIMIT ?""",
                (pattern, pattern, pattern, limit),
            )
            return [
                {
                    "name": row[0],
                    "summary": row[1],
                    "source": row[2],
                    "tags": json.loads(row[3]) if row[3] else [],
                    "actions": json.loads(row[4]) if row[4] else [],
                    "ingested_at": row[5],
                }
                for row in cursor
            ]

    def get_by_tag(self, tag: str) -> List[Dict]:
        """Get all items with a specific tag (substring match on the JSON column)."""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT name, summary, tags, actions FROM knowledge WHERE tags LIKE ?",
                (f"%{tag}%",),
            )
            return [
                {
                    "name": row[0],
                    "summary": row[1],
                    "tags": json.loads(row[2]) if row[2] else [],
                    "actions": json.loads(row[3]) if row[3] else [],
                }
                for row in cursor
            ]

    def get_stats(self) -> Dict:
        """Get ingestion statistics: counts plus the ten most common tags."""
        with sqlite3.connect(self.db_path) as conn:
            total = conn.execute("SELECT COUNT(*) FROM knowledge").fetchone()[0]
            applied = conn.execute(
                "SELECT COUNT(*) FROM knowledge WHERE applied = 1"
            ).fetchone()[0]

            # Tally tags across all rows (tags column holds a JSON list).
            tag_counts: Counter = Counter()
            for (tags_json,) in conn.execute("SELECT tags FROM knowledge"):
                if tags_json:
                    tag_counts.update(json.loads(tags_json))

            return {
                "total_items": total,
                "applied": applied,
                "not_applied": total - applied,
                "top_tags": tag_counts.most_common(10),
            }


class IngestionPipeline:
    """Pipeline for turning documents into KnowledgeItems."""

    def __init__(self, store: Optional[KnowledgeStore] = None):
        self.store = store or KnowledgeStore()

    def ingest_file(self, file_path: str) -> Optional[KnowledgeItem]:
        """Ingest a single file; returns None for missing/unsupported files."""
        path = Path(file_path).expanduser()
        if not path.exists():
            print(f"File not found: {path}")
            return None

        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Dispatch on file type.
        suffix = path.suffix.lower()
        if suffix == '.md':
            return self._process_markdown(path.name, content, str(path))
        elif suffix == '.txt':
            return self._process_text(path.name, content, str(path))
        elif suffix in ('.py', '.js', '.sh'):
            return self._process_code(path.name, content, str(path))
        else:
            print(f"Unsupported file type: {suffix}")
            return None

    def _process_markdown(self, name: str, content: str, source: str) -> KnowledgeItem:
        """Process a markdown file into title, summary, actions and tags."""
        # Title: first '# ' header, falling back to the file name.
        title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
        title = title_match.group(1) if title_match else name

        # Summary: first non-header paragraph, truncated to ~200 chars.
        summary = ""
        for p in content.split('\n\n'):
            p = p.strip()
            if p and not p.startswith('#'):
                summary = p[:200] + "..." if len(p) > 200 else p
                break

        # Actions: bullet ('- ') or numbered ('1.') list entries.
        # Strip only the leading marker — a blind lstrip over '0123456789. '
        # would also eat digits that start the action text (e.g. "10x speedup").
        actions = []
        for line in content.split('\n'):
            line = line.strip()
            if line.startswith('- ') or re.match(r'^\d+\.', line):
                action = re.sub(r'^(?:-\s+|\d+\.\s*)', '', line)
                if len(action) > 10:  # Minimum action length
                    actions.append(action)

        # Tags: keyword heuristics over the lowercased content.
        tag_keywords = {
            "inference": ["llm", "model", "inference", "sampling", "token"],
            "training": ["train", "fine-tune", "dataset", "gradient"],
            "optimization": ["speed", "fast", "cache", "optimize", "performance"],
            "architecture": ["design", "pattern", "structure", "component"],
            "tools": ["tool", "command", "script", "automation"],
            "deployment": ["deploy", "service", "systemd", "production"],
        }
        content_lower = content.lower()
        tags = [
            tag for tag, keywords in tag_keywords.items()
            if any(kw in content_lower for kw in keywords)
        ]
        if not tags:
            tags.append("general")

        return KnowledgeItem(
            name=title,
            summary=summary,
            source=source,
            actions=actions[:10],  # Limit to 10 actions
            tags=tags,
            full_text=content,
        )

    def _process_text(self, name: str, content: str, source: str) -> KnowledgeItem:
        """Process a plain text file: first line is the title, next two the summary."""
        lines = content.split('\n')
        title = lines[0][:50] if lines else name
        summary = ' '.join(lines[1:3])[:200] if len(lines) > 1 else "Text document"

        return KnowledgeItem(
            name=title,
            summary=summary,
            source=source,
            actions=[],
            tags=["documentation"],
            full_text=content,
        )

    def _process_code(self, name: str, content: str, source: str) -> KnowledgeItem:
        """Process a code file: summary from docstring/comment, actions from defs."""
        # Docstring: require matching triple-quote fences via a backreference
        # (the old pattern accepted mismatched quote styles).
        docstring_match = re.search(r'("""|\'\'\')(.+?)\1', content, re.DOTALL)
        if docstring_match:
            summary = docstring_match.group(2)[:200]
        else:
            # Fall back to the first '#' comment line.
            comment_match = re.search(r'^#\s*(.+)$', content, re.MULTILINE)
            summary = comment_match.group(1) if comment_match else f"Code: {name}"

        # Actions: the first five top-level-looking function/class definitions.
        actions = [
            f"{kind} {ident}"
            for kind, ident in re.findall(r'^(def|class)\s+(\w+)', content, re.MULTILINE)[:5]
        ]

        return KnowledgeItem(
            name=name,
            summary=summary,
            source=source,
            actions=actions,
            tags=["code", "implementation"],
            full_text=content,
        )

    def ingest_batch(self, directory: str) -> Dict[str, int]:
        """Ingest all supported files in a directory tree.

        Returns counters: processed / added / duplicates / errors.
        """
        path = Path(directory).expanduser()
        stats = {"processed": 0, "added": 0, "duplicates": 0, "errors": 0}

        for file_path in path.rglob('*'):
            if not (file_path.is_file() and file_path.suffix in SUPPORTED_SUFFIXES):
                continue
            print(f"Processing: {file_path}")
            stats["processed"] += 1
            try:
                item = self.ingest_file(str(file_path))
                if item:
                    if self.store.add(item):
                        print(f"  ✓ Added: {item.name}")
                        stats["added"] += 1
                    else:
                        print(f"  ○ Duplicate: {item.name}")
                        stats["duplicates"] += 1
                else:
                    stats["errors"] += 1
            except Exception as e:
                # Best-effort batch: record the failure and keep going.
                print(f"  ✗ Error: {e}")
                stats["errors"] += 1

        return stats


def main():
    """CLI entry point: ingest, search, tag-lookup, or show statistics."""
    parser = argparse.ArgumentParser(description="Knowledge Ingestion Pipeline")
    parser.add_argument("input", nargs="?", help="File or directory to ingest")
    parser.add_argument("--batch", action="store_true", help="Batch ingest directory")
    parser.add_argument("--search", help="Search knowledge base")
    parser.add_argument("--tag", help="Search by tag")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--db", default="~/.timmy/data/knowledge.db", help="Database path")

    args = parser.parse_args()

    store = KnowledgeStore(args.db)
    pipeline = IngestionPipeline(store)

    if args.stats:
        stats = store.get_stats()
        print("Knowledge Store Statistics:")
        print(f"  Total items: {stats['total_items']}")
        print(f"  Applied: {stats['applied']}")
        print(f"  Not applied: {stats['not_applied']}")
        print("\nTop tags:")
        for tag, count in stats['top_tags']:
            print(f"  {tag}: {count}")

    elif args.search:
        results = store.search(args.search)
        print(f"Search results for '{args.search}':")
        for item in results:
            print(f"\n  {item['name']}")
            print(f"  {item['summary'][:100]}...")
            print(f"  Tags: {', '.join(item['tags'])}")

    elif args.tag:
        results = store.get_by_tag(args.tag)
        print(f"Items with tag '{args.tag}':")
        for item in results:
            print(f"\n  {item['name']}")
            print(f"  {item['summary'][:100]}...")

    elif args.input:
        path = Path(args.input)
        if args.batch or path.is_dir():
            print(f"Batch ingesting: {path}")
            stats = pipeline.ingest_batch(str(path))
            print("\nResults:")
            for key, value in stats.items():
                print(f"  {key}: {value}")
        else:
            item = pipeline.ingest_file(str(path))
            if item:
                if store.add(item):
                    print(f"Added: {item.name}")
                    print(f"Summary: {item.summary}")
                    # item is a dataclass — attribute access, not subscripting
                    # (item['tags'] raised TypeError here).
                    print(f"Tags: {', '.join(item.tags)}")
                    print(f"Actions ({len(item.actions)}):")
                    for action in item.actions[:5]:
                        print(f"  - {action}")
                else:
                    print(f"Already exists: {item.name}")
            else:
                print("Failed to process file")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()