395 lines
14 KiB
Python
395 lines
14 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Knowledge Ingestion Pipeline for Local Timmy
|
||
|
|
Issue #87 — Auto-ingest Intelligence
|
||
|
|
|
||
|
|
Automatically ingest papers, docs, and techniques into
|
||
|
|
retrievable knowledge items.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
    python ingest.py <file_or_path>
    python ingest.py --batch <directory>
    python ingest.py --search <query>
    python ingest.py --tag <tag>
    python ingest.py --stats
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sqlite3
|
||
|
|
import hashlib
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional, List, Dict, Any
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class KnowledgeItem:
    """A piece of ingested knowledge.

    Produced by the ingestion pipeline from a markdown, text, or code file
    and persisted (JSON-encoded where needed) in the SQLite knowledge store.
    """

    name: str  # display title (markdown H1, first text line, or filename)
    summary: str  # short abstract, truncated to ~200 characters
    source: str  # path of the ingested file
    actions: List[str]  # actionable bullet/numbered-list items pulled from the text
    tags: List[str]  # topic tags inferred by keyword matching ("general" fallback)
    full_text: str  # complete original document text; also used for dedup hashing
    embedding: Optional[List[float]] = None  # optional vector embedding; not populated yet
|
||
|
|
|
||
|
|
|
||
|
|
class KnowledgeStore:
    """SQLite-backed knowledge storage.

    Items are deduplicated on a truncated SHA-256 hash of their full text,
    so re-adding an identical document is a no-op.
    """

    def __init__(self, db_path: str = "~/.timmy/data/knowledge.db"):
        """Open the database at *db_path*, creating the file and schema if needed."""
        self.db_path = Path(db_path).expanduser()
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _connect(self) -> sqlite3.Connection:
        """Return a fresh connection; the caller is responsible for closing it."""
        return sqlite3.connect(self.db_path)

    def _init_db(self):
        """Create the knowledge table and its indexes if they do not exist."""
        conn = self._connect()
        try:
            with conn:  # commit on success, roll back on error
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS knowledge (
                        id INTEGER PRIMARY KEY,
                        name TEXT NOT NULL,
                        summary TEXT NOT NULL,
                        source TEXT NOT NULL,
                        actions TEXT,  -- JSON list
                        tags TEXT,     -- JSON list
                        full_text TEXT,
                        embedding BLOB,
                        hash TEXT UNIQUE,
                        ingested_at TEXT,
                        applied INTEGER DEFAULT 0,
                        access_count INTEGER DEFAULT 0
                    )
                """)
                conn.execute("CREATE INDEX IF NOT EXISTS idx_tags ON knowledge(tags)")
                conn.execute("CREATE INDEX IF NOT EXISTS idx_source ON knowledge(source)")
        finally:
            # `with conn:` commits but never closes; close explicitly so each
            # call does not leak a connection / open file handle.
            conn.close()

    def _compute_hash(self, text: str) -> str:
        """Return a truncated (128-bit hex) SHA-256 digest used for dedup."""
        return hashlib.sha256(text.encode()).hexdigest()[:32]

    def add(self, item: "KnowledgeItem") -> bool:
        """Add a knowledge item.

        Returns:
            True if the item was inserted; False if an item with identical
            full_text was already present (duplicate).
        """
        item_hash = self._compute_hash(item.full_text)

        conn = self._connect()
        try:
            with conn:
                # Check for duplicate by content hash.
                existing = conn.execute(
                    "SELECT id FROM knowledge WHERE hash = ?", (item_hash,)
                ).fetchone()
                if existing:
                    return False

                conn.execute(
                    """INSERT INTO knowledge
                    (name, summary, source, actions, tags, full_text, embedding, hash, ingested_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (
                        item.name,
                        item.summary,
                        item.source,
                        json.dumps(item.actions),
                        json.dumps(item.tags),
                        item.full_text,
                        # `is not None` so an explicitly empty embedding list is
                        # stored rather than silently dropped (empty list is falsy).
                        json.dumps(item.embedding) if item.embedding is not None else None,
                        item_hash,
                        datetime.now().isoformat(),
                    ),
                )
                return True
        finally:
            conn.close()

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """Keyword (substring) search over name, summary, and full text.

        NOTE(review): *query* is embedded in a LIKE pattern unescaped, so
        '%' or '_' in the query act as wildcards — confirm this is intended
        before exposing it to untrusted input.
        """
        pattern = f"%{query}%"
        conn = self._connect()
        try:
            cursor = conn.execute(
                """SELECT name, summary, source, tags, actions, ingested_at
                FROM knowledge
                WHERE name LIKE ? OR summary LIKE ? OR full_text LIKE ?
                ORDER BY ingested_at DESC
                LIMIT ?""",
                (pattern, pattern, pattern, limit),
            )
            return [
                {
                    "name": row[0],
                    "summary": row[1],
                    "source": row[2],
                    "tags": json.loads(row[3]) if row[3] else [],
                    "actions": json.loads(row[4]) if row[4] else [],
                    "ingested_at": row[5],
                }
                for row in cursor
            ]
        finally:
            conn.close()

    def get_by_tag(self, tag: str) -> List[Dict]:
        """Get all items whose tags JSON contains *tag* as a substring."""
        conn = self._connect()
        try:
            cursor = conn.execute(
                "SELECT name, summary, tags, actions FROM knowledge WHERE tags LIKE ?",
                (f"%{tag}%",),
            )
            return [
                {
                    "name": row[0],
                    "summary": row[1],
                    "tags": json.loads(row[2]) if row[2] else [],
                    "actions": json.loads(row[3]) if row[3] else [],
                }
                for row in cursor
            ]
        finally:
            conn.close()

    def get_stats(self) -> Dict:
        """Return ingestion statistics: totals, applied counts, and top tags."""
        conn = self._connect()
        try:
            total = conn.execute("SELECT COUNT(*) FROM knowledge").fetchone()[0]
            applied = conn.execute(
                "SELECT COUNT(*) FROM knowledge WHERE applied = 1"
            ).fetchone()[0]

            # Tally tag frequencies across all rows (tags are stored as JSON lists).
            tag_counts: Dict[str, int] = {}
            for (tags_json,) in conn.execute("SELECT tags FROM knowledge"):
                if tags_json:
                    for tag in json.loads(tags_json):
                        tag_counts[tag] = tag_counts.get(tag, 0) + 1

            return {
                "total_items": total,
                "applied": applied,
                "not_applied": total - applied,
                "top_tags": sorted(tag_counts.items(), key=lambda x: -x[1])[:10],
            }
        finally:
            conn.close()
|
||
|
|
|
||
|
|
|
||
|
|
class IngestionPipeline:
    """Pipeline for ingesting documents into a KnowledgeStore."""

    # File types ingest_file knows how to process; ingest_batch uses the same
    # set so the two stay consistent (the old batch filter omitted '.js').
    SUPPORTED_SUFFIXES = ('.md', '.txt', '.py', '.js', '.sh')

    def __init__(self, store: "Optional[KnowledgeStore]" = None):
        """Use *store* if given; otherwise open the default KnowledgeStore."""
        self.store = store or KnowledgeStore()

    def ingest_file(self, file_path: str) -> "Optional[KnowledgeItem]":
        """Ingest a single file; return the extracted item, or None on failure."""
        path = Path(file_path).expanduser()

        if not path.exists():
            print(f"File not found: {path}")
            return None

        # Decode as UTF-8, replacing bad bytes so one malformed file does not
        # abort ingestion (the old platform-default encoding could raise).
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        # Dispatch on file extension.
        suffix = path.suffix.lower()

        if suffix == '.md':
            return self._process_markdown(path.name, content, str(path))
        elif suffix == '.txt':
            return self._process_text(path.name, content, str(path))
        elif suffix in ('.py', '.js', '.sh'):
            return self._process_code(path.name, content, str(path))
        else:
            print(f"Unsupported file type: {suffix}")
            return None

    def _process_markdown(self, name: str, content: str, source: str) -> "KnowledgeItem":
        """Process a markdown file into a KnowledgeItem."""
        # Extract title from the first '# ' header, falling back to the filename.
        title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
        title = title_match.group(1) if title_match else name

        # Summary: first non-header paragraph, truncated to ~200 chars.
        summary = ""
        for p in content.split('\n\n'):
            p = p.strip()
            if p and not p.startswith('#'):
                summary = p[:200] + "..." if len(p) > 200 else p
                break

        # Extract action items (bullet '- ' lines or numbered-list lines).
        actions = []
        for line in content.split('\n'):
            line = line.strip()
            if line.startswith('- ') or re.match(r'^\d+\.', line):
                # Strip only the bullet/number prefix. The old
                # lstrip('0123456789. ') also ate leading digits of the action
                # text itself (e.g. "- 2nd item" became "nd item").
                action = re.sub(r'^(?:-\s+|\d+\.\s*)', '', line)
                if len(action) > 10:  # Minimum action length
                    actions.append(action)

        # Infer topic tags by keyword matching against the lowercased content.
        tags = []
        tag_keywords = {
            "inference": ["llm", "model", "inference", "sampling", "token"],
            "training": ["train", "fine-tune", "dataset", "gradient"],
            "optimization": ["speed", "fast", "cache", "optimize", "performance"],
            "architecture": ["design", "pattern", "structure", "component"],
            "tools": ["tool", "command", "script", "automation"],
            "deployment": ["deploy", "service", "systemd", "production"],
        }

        content_lower = content.lower()
        for tag, keywords in tag_keywords.items():
            if any(kw in content_lower for kw in keywords):
                tags.append(tag)

        if not tags:
            tags.append("general")

        return KnowledgeItem(
            name=title,
            summary=summary,
            source=source,
            actions=actions[:10],  # Limit to 10 actions
            tags=tags,
            full_text=content
        )

    def _process_text(self, name: str, content: str, source: str) -> "KnowledgeItem":
        """Process a plain text file: first line is the title, next two the summary."""
        lines = content.split('\n')
        title = lines[0][:50] if lines else name
        summary = ' '.join(lines[1:3])[:200] if len(lines) > 1 else "Text document"

        return KnowledgeItem(
            name=title,
            summary=summary,
            source=source,
            actions=[],
            tags=["documentation"],
            full_text=content
        )

    def _process_code(self, name: str, content: str, source: str) -> "KnowledgeItem":
        """Process a code file: summary from docstring/comment, defs as actions."""
        # Prefer a triple-quoted docstring; fall back to the first '#' comment.
        docstring_match = re.search(r'["\']{3}(.+?)["\']{3}', content, re.DOTALL)
        if docstring_match:
            summary = docstring_match.group(1)[:200]
        else:
            comment_match = re.search(r'^#\s*(.+)$', content, re.MULTILINE)
            summary = comment_match.group(1) if comment_match else f"Code: {name}"

        # List the first few function/class definitions as "actions".
        actions = []
        for kind, ident in re.findall(r'^(def|class)\s+(\w+)', content, re.MULTILINE)[:5]:
            actions.append(f"{kind} {ident}")

        return KnowledgeItem(
            name=name,
            summary=summary,
            source=source,
            actions=actions,
            tags=["code", "implementation"],
            full_text=content
        )

    def ingest_batch(self, directory: str) -> Dict[str, int]:
        """Ingest all supported files under *directory* (recursive).

        Returns:
            Counters: processed / added / duplicates / errors.
        """
        path = Path(directory).expanduser()

        stats = {"processed": 0, "added": 0, "duplicates": 0, "errors": 0}

        for file_path in path.rglob('*'):
            # Use the shared suffix set so batch and single-file ingestion agree.
            if file_path.is_file() and file_path.suffix in self.SUPPORTED_SUFFIXES:
                print(f"Processing: {file_path}")
                stats["processed"] += 1

                try:
                    item = self.ingest_file(str(file_path))
                    if item:
                        if self.store.add(item):
                            print(f"  ✓ Added: {item.name}")
                            stats["added"] += 1
                        else:
                            print(f"  ○ Duplicate: {item.name}")
                            stats["duplicates"] += 1
                    else:
                        stats["errors"] += 1
                except Exception as e:
                    # Best-effort batch: report and keep going.
                    print(f"  ✗ Error: {e}")
                    stats["errors"] += 1

        return stats
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: ingest files/directories, or query the knowledge base."""
    parser = argparse.ArgumentParser(description="Knowledge Ingestion Pipeline")
    parser.add_argument("input", nargs="?", help="File or directory to ingest")
    parser.add_argument("--batch", action="store_true", help="Batch ingest directory")
    parser.add_argument("--search", help="Search knowledge base")
    parser.add_argument("--tag", help="Search by tag")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--db", default="~/.timmy/data/knowledge.db", help="Database path")

    args = parser.parse_args()

    store = KnowledgeStore(args.db)
    pipeline = IngestionPipeline(store)

    if args.stats:
        stats = store.get_stats()
        print("Knowledge Store Statistics:")
        print(f"  Total items: {stats['total_items']}")
        print(f"  Applied: {stats['applied']}")
        print(f"  Not applied: {stats['not_applied']}")
        print("\nTop tags:")
        for tag, count in stats['top_tags']:
            print(f"  {tag}: {count}")

    elif args.search:
        results = store.search(args.search)
        print(f"Search results for '{args.search}':")
        for item in results:
            print(f"\n  {item['name']}")
            print(f"  {item['summary'][:100]}...")
            print(f"  Tags: {', '.join(item['tags'])}")

    elif args.tag:
        results = store.get_by_tag(args.tag)
        print(f"Items with tag '{args.tag}':")
        for item in results:
            print(f"\n  {item['name']}")
            print(f"  {item['summary'][:100]}...")

    elif args.input:
        path = Path(args.input)

        if args.batch or path.is_dir():
            print(f"Batch ingesting: {path}")
            stats = pipeline.ingest_batch(str(path))
            print("\nResults:")
            for key, value in stats.items():
                print(f"  {key}: {value}")
        else:
            item = pipeline.ingest_file(str(path))
            if item:
                if store.add(item):
                    print(f"Added: {item.name}")
                    print(f"Summary: {item.summary}")
                    # BUGFIX: `item` is a KnowledgeItem dataclass, not a dict —
                    # `item['tags']` raised TypeError; use attribute access.
                    print(f"Tags: {', '.join(item.tags)}")
                    print(f"Actions ({len(item.actions)}):")
                    for action in item.actions[:5]:
                        print(f"  - {action}")
                else:
                    print(f"Already exists: {item.name}")
            else:
                print("Failed to process file")

    else:
        parser.print_help()
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: `python ingest.py ...`
if __name__ == "__main__":
    main()
|