#!/usr/bin/env python3
"""
Knowledge Ingestion Pipeline for Local Timmy
Issue #87 — Auto-ingest Intelligence

Automatically ingest papers, docs, and techniques into retrievable
knowledge items backed by a SQLite store.

Usage:
    python ingest.py FILE             # ingest a single file
    python ingest.py DIR --batch      # ingest a directory tree
    python ingest.py --search QUERY   # keyword search
    python ingest.py --tag TAG        # list items by tag
    python ingest.py --stats          # show store statistics
"""

import argparse
import sqlite3
import hashlib
import json
import os
import re
from collections import Counter
from pathlib import Path
from typing import Optional, List, Dict, Any
from dataclasses import dataclass
from datetime import datetime

# Single source of truth for the file types the pipeline can process.
# (ingest_file and ingest_batch previously disagreed about '.js'.)
SUPPORTED_SUFFIXES = ('.md', '.txt', '.py', '.js', '.sh')


@dataclass
class KnowledgeItem:
    """A piece of ingested knowledge."""
    name: str        # human-readable title
    summary: str     # short abstract (~200 chars)
    source: str      # originating file path
    actions: List[str]   # actionable bullet points extracted from the text
    tags: List[str]      # topical tags used for retrieval
    full_text: str       # complete original text (also used for dedup hashing)
    embedding: Optional[List[float]] = None  # optional vector; not populated yet


class KnowledgeStore:
    """SQLite-backed knowledge storage with content-hash de-duplication."""

    def __init__(self, db_path: str = "~/.timmy/data/knowledge.db"):
        self.db_path = Path(db_path).expanduser()
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _init_db(self) -> None:
        """Create the schema and indexes on first use (idempotent)."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS knowledge (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    summary TEXT NOT NULL,
                    source TEXT NOT NULL,
                    actions TEXT,      -- JSON list
                    tags TEXT,         -- JSON list
                    full_text TEXT,
                    embedding BLOB,
                    hash TEXT UNIQUE,
                    ingested_at TEXT,
                    applied INTEGER DEFAULT 0,
                    access_count INTEGER DEFAULT 0
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_tags ON knowledge(tags)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_source ON knowledge(source)
            """)

    def _compute_hash(self, text: str) -> str:
        """Return a stable, truncated SHA-256 hex digest used for dedup."""
        return hashlib.sha256(text.encode()).hexdigest()[:32]

    def add(self, item: KnowledgeItem) -> bool:
        """Add a knowledge item.

        Returns False if an item with the same full_text hash already exists.
        """
        item_hash = self._compute_hash(item.full_text)
        with sqlite3.connect(self.db_path) as conn:
            # Check for duplicate before inserting.
            existing = conn.execute(
                "SELECT id FROM knowledge WHERE hash = ?", (item_hash,)
            ).fetchone()
            if existing:
                return False

            conn.execute(
                """INSERT INTO knowledge
                   (name, summary, source, actions, tags, full_text,
                    embedding, hash, ingested_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    item.name,
                    item.summary,
                    item.source,
                    json.dumps(item.actions),
                    json.dumps(item.tags),
                    item.full_text,
                    json.dumps(item.embedding) if item.embedding else None,
                    item_hash,
                    datetime.now().isoformat(),
                ),
            )
            return True

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """Search knowledge items by keyword (simple LIKE match for now)."""
        pattern = f"%{query}%"
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                """SELECT name, summary, source, tags, actions, ingested_at
                   FROM knowledge
                   WHERE name LIKE ? OR summary LIKE ? OR full_text LIKE ?
                   ORDER BY ingested_at DESC LIMIT ?""",
                (pattern, pattern, pattern, limit),
            )
            return [
                {
                    "name": row[0],
                    "summary": row[1],
                    "source": row[2],
                    "tags": json.loads(row[3]) if row[3] else [],
                    "actions": json.loads(row[4]) if row[4] else [],
                    "ingested_at": row[5],
                }
                for row in cursor
            ]

    def get_by_tag(self, tag: str) -> List[Dict]:
        """Get all items with a specific tag (substring match on the JSON column)."""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT name, summary, tags, actions FROM knowledge WHERE tags LIKE ?",
                (f"%{tag}%",),
            )
            return [
                {
                    "name": row[0],
                    "summary": row[1],
                    "tags": json.loads(row[2]) if row[2] else [],
                    "actions": json.loads(row[3]) if row[3] else [],
                }
                for row in cursor
            ]

    def get_stats(self) -> Dict:
        """Get ingestion statistics: counts plus the ten most common tags."""
        with sqlite3.connect(self.db_path) as conn:
            total = conn.execute("SELECT COUNT(*) FROM knowledge").fetchone()[0]
            applied = conn.execute(
                "SELECT COUNT(*) FROM knowledge WHERE applied = 1"
            ).fetchone()[0]

            # Tally tags across all rows (tags column holds a JSON list).
            tag_counts: Counter = Counter()
            for (tags_json,) in conn.execute("SELECT tags FROM knowledge"):
                if tags_json:
                    tag_counts.update(json.loads(tags_json))

            return {
                "total_items": total,
                "applied": applied,
                "not_applied": total - applied,
                "top_tags": tag_counts.most_common(10),
            }


class IngestionPipeline:
    """Pipeline for turning documents into KnowledgeItems."""

    def __init__(self, store: Optional[KnowledgeStore] = None):
        self.store = store or KnowledgeStore()

    def ingest_file(self, file_path: str) -> Optional[KnowledgeItem]:
        """Ingest a single file; returns None for missing/unsupported files."""
        path = Path(file_path).expanduser()
        if not path.exists():
            print(f"File not found: {path}")
            return None

        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Dispatch on file type.
        suffix = path.suffix.lower()
        if suffix == '.md':
            return self._process_markdown(path.name, content, str(path))
        elif suffix == '.txt':
            return self._process_text(path.name, content, str(path))
        elif suffix in ('.py', '.js', '.sh'):
            return self._process_code(path.name, content, str(path))
        else:
            print(f"Unsupported file type: {suffix}")
            return None

    def _process_markdown(self, name: str, content: str, source: str) -> KnowledgeItem:
        """Process a markdown file into title, summary, actions and tags."""
        # Title: first '# ' header, falling back to the file name.
        title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
        title = title_match.group(1) if title_match else name

        # Summary: first non-header paragraph, truncated to ~200 chars.
        summary = ""
        for p in content.split('\n\n'):
            p = p.strip()
            if p and not p.startswith('#'):
                summary = p[:200] + "..." if len(p) > 200 else p
                break

        # Actions: bullet ('- ') or numbered ('1.') list entries.
        # Strip only the leading marker — a blind lstrip over '0123456789. '
        # would also eat digits that start the action text (e.g. "10x speedup").
        actions = []
        for line in content.split('\n'):
            line = line.strip()
            if line.startswith('- ') or re.match(r'^\d+\.', line):
                action = re.sub(r'^(?:-\s+|\d+\.\s*)', '', line)
                if len(action) > 10:  # Minimum action length
                    actions.append(action)

        # Tags: keyword heuristics over the lowercased content.
        tag_keywords = {
            "inference": ["llm", "model", "inference", "sampling", "token"],
            "training": ["train", "fine-tune", "dataset", "gradient"],
            "optimization": ["speed", "fast", "cache", "optimize", "performance"],
            "architecture": ["design", "pattern", "structure", "component"],
            "tools": ["tool", "command", "script", "automation"],
            "deployment": ["deploy", "service", "systemd", "production"],
        }
        content_lower = content.lower()
        tags = [
            tag for tag, keywords in tag_keywords.items()
            if any(kw in content_lower for kw in keywords)
        ]
        if not tags:
            tags.append("general")

        return KnowledgeItem(
            name=title,
            summary=summary,
            source=source,
            actions=actions[:10],  # Limit to 10 actions
            tags=tags,
            full_text=content,
        )

    def _process_text(self, name: str, content: str, source: str) -> KnowledgeItem:
        """Process a plain text file: first line is the title, next two the summary."""
        lines = content.split('\n')
        title = lines[0][:50] if lines else name
        summary = ' '.join(lines[1:3])[:200] if len(lines) > 1 else "Text document"

        return KnowledgeItem(
            name=title,
            summary=summary,
            source=source,
            actions=[],
            tags=["documentation"],
            full_text=content,
        )

    def _process_code(self, name: str, content: str, source: str) -> KnowledgeItem:
        """Process a code file: summary from docstring/comment, actions from defs."""
        # Docstring: require matching triple-quote fences via a backreference
        # (the old pattern accepted mismatched quote styles).
        docstring_match = re.search(r'("""|\'\'\')(.+?)\1', content, re.DOTALL)
        if docstring_match:
            summary = docstring_match.group(2)[:200]
        else:
            # Fall back to the first '#' comment line.
            comment_match = re.search(r'^#\s*(.+)$', content, re.MULTILINE)
            summary = comment_match.group(1) if comment_match else f"Code: {name}"

        # Actions: the first five top-level-looking function/class definitions.
        actions = [
            f"{kind} {ident}"
            for kind, ident in re.findall(r'^(def|class)\s+(\w+)', content, re.MULTILINE)[:5]
        ]

        return KnowledgeItem(
            name=name,
            summary=summary,
            source=source,
            actions=actions,
            tags=["code", "implementation"],
            full_text=content,
        )

    def ingest_batch(self, directory: str) -> Dict[str, int]:
        """Ingest all supported files in a directory tree.

        Returns counters: processed / added / duplicates / errors.
        """
        path = Path(directory).expanduser()
        stats = {"processed": 0, "added": 0, "duplicates": 0, "errors": 0}

        for file_path in path.rglob('*'):
            if not (file_path.is_file() and file_path.suffix in SUPPORTED_SUFFIXES):
                continue
            print(f"Processing: {file_path}")
            stats["processed"] += 1
            try:
                item = self.ingest_file(str(file_path))
                if item:
                    if self.store.add(item):
                        print(f"  ✓ Added: {item.name}")
                        stats["added"] += 1
                    else:
                        print(f"  ○ Duplicate: {item.name}")
                        stats["duplicates"] += 1
                else:
                    stats["errors"] += 1
            except Exception as e:
                # Best-effort batch: record the failure and keep going.
                print(f"  ✗ Error: {e}")
                stats["errors"] += 1

        return stats


def main():
    """CLI entry point: ingest, search, tag-lookup, or show statistics."""
    parser = argparse.ArgumentParser(description="Knowledge Ingestion Pipeline")
    parser.add_argument("input", nargs="?", help="File or directory to ingest")
    parser.add_argument("--batch", action="store_true", help="Batch ingest directory")
    parser.add_argument("--search", help="Search knowledge base")
    parser.add_argument("--tag", help="Search by tag")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--db", default="~/.timmy/data/knowledge.db", help="Database path")

    args = parser.parse_args()

    store = KnowledgeStore(args.db)
    pipeline = IngestionPipeline(store)

    if args.stats:
        stats = store.get_stats()
        print("Knowledge Store Statistics:")
        print(f"  Total items: {stats['total_items']}")
        print(f"  Applied: {stats['applied']}")
        print(f"  Not applied: {stats['not_applied']}")
        print("\nTop tags:")
        for tag, count in stats['top_tags']:
            print(f"  {tag}: {count}")

    elif args.search:
        results = store.search(args.search)
        print(f"Search results for '{args.search}':")
        for item in results:
            print(f"\n  {item['name']}")
            print(f"  {item['summary'][:100]}...")
            print(f"  Tags: {', '.join(item['tags'])}")

    elif args.tag:
        results = store.get_by_tag(args.tag)
        print(f"Items with tag '{args.tag}':")
        for item in results:
            print(f"\n  {item['name']}")
            print(f"  {item['summary'][:100]}...")

    elif args.input:
        path = Path(args.input)
        if args.batch or path.is_dir():
            print(f"Batch ingesting: {path}")
            stats = pipeline.ingest_batch(str(path))
            print("\nResults:")
            for key, value in stats.items():
                print(f"  {key}: {value}")
        else:
            item = pipeline.ingest_file(str(path))
            if item:
                if store.add(item):
                    print(f"Added: {item.name}")
                    print(f"Summary: {item.summary}")
                    # item is a dataclass — attribute access, not subscripting
                    # (item['tags'] raised TypeError here).
                    print(f"Tags: {', '.join(item.tags)}")
                    print(f"Actions ({len(item.actions)}):")
                    for action in item.actions[:5]:
                        print(f"  - {action}")
                else:
                    print(f"Already exists: {item.name}")
            else:
                print("Failed to process file")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()