"""
|
|
AP Knowledge Base - Memory Directory Core
|
|
|
|
Hierarchical memory storage with aging, relevance scoring, and consolidation.
|
|
Based on the memdir pattern from Claude Code.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import shutil
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional, Tuple, Set, Callable
|
|
from collections import defaultdict
|
|
import fnmatch
|
|
|
|
from memory_types import (
|
|
MemoryEntry, MemoryQuery, MemoryType, MemoryScope,
|
|
MemoryIndex, RelationshipType
|
|
)
|
|
|
|
|
|
class MemoryDirectory:
    """
    Core memory directory for hierarchical memory storage.

    Memory layout:

        <base_dir>/
            MEMORY.md           # Index file
            facts/              # Fact memories
            procedures/         # Procedure memories
            observations/       # Observation memories
            lessons/            # Lesson memories
            archive/            # Archived memories
            relationships.json  # Memory relationship graph
    """

    # Hard limits on the generated MEMORY.md index.
    MAX_INDEX_LINES = 200
    MAX_INDEX_BYTES = 25000
    # Cap on stored memory files (not enforced anywhere in this class).
    MAX_MEMORY_FILES = 200
    # Similarity threshold for merging near-duplicate memories.
    CONSOLIDATION_THRESHOLD = 0.85
|
|
|
|
def __init__(self, base_dir: Path):
    """Initialize the store at *base_dir*, creating subdirectories and loading state."""
    self.base_dir = Path(base_dir)
    self._ensure_directories()
    # In-memory caches; populated from disk by _load_all() below.
    self._memories: Dict[str, MemoryEntry] = {}
    self._relationships: Dict[str, List[Tuple[str, RelationshipType]]] = defaultdict(list)
    self._load_all()
|
|
|
|
def _ensure_directories(self):
    """Create the per-type and archive subdirectories if they don't exist."""
    for name in ('facts', 'procedures', 'observations', 'lessons', 'archive'):
        (self.base_dir / name).mkdir(parents=True, exist_ok=True)
|
|
|
|
def _type_dir(self, mem_type: MemoryType) -> Path:
    """Return the storage directory for *mem_type* (pluralized, e.g. 'fact' -> facts/)."""
    return self.base_dir / (mem_type.value + "s")
|
|
|
|
def _memory_path(self, entry: MemoryEntry) -> Path:
    """Build the on-disk path for *entry*: <type dir>/<id>_<sanitized name>.md."""
    # Replace anything that is not alphanumeric, '-' or '_', then cap the length at 50.
    sanitized = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in entry.name)[:50]
    return self._type_dir(entry.type) / f"{entry.id}_{sanitized}.md"
|
|
|
|
def _load_all(self):
    """Load all memories and the relationship graph from disk.

    Clears the in-memory cache, re-reads every ``*.md`` file under each
    per-type directory (the ``archive/`` directory is NOT scanned), then
    loads ``relationships.json``.  Files that fail to parse are skipped
    with a warning rather than aborting the load.
    """
    self._memories.clear()

    for mem_type in MemoryType:
        type_dir = self._type_dir(mem_type)
        if not type_dir.exists():
            continue

        for file_path in type_dir.glob("*.md"):
            try:
                content = file_path.read_text(encoding='utf-8')
                # Entries are stored as markdown with frontmatter metadata.
                entry = MemoryEntry.from_frontmatter(content, file_path)
                self._memories[entry.id] = entry
            except Exception as e:
                # Best-effort load: one corrupt file must not block startup.
                print(f"Warning: Failed to load {file_path}: {e}")

    # Load relationships: a flat list of {source, target, type} records,
    # the shape written by _save_relationships().
    rel_path = self.base_dir / "relationships.json"
    if rel_path.exists():
        try:
            data = json.loads(rel_path.read_text())
            for rel in data:
                self._relationships[rel['source']].append(
                    (rel['target'], RelationshipType(rel['type']))
                )
        except Exception as e:
            print(f"Warning: Failed to load relationships: {e}")
|
|
|
|
def save(self, entry: MemoryEntry) -> Path:
    """
    Save a memory entry to disk.

    Updates the modification timestamp, (re)computes the target path,
    writes the entry as frontmatter markdown, then refreshes the
    in-memory cache and the MEMORY.md index.

    Returns:
        Path to the saved file
    """
    entry.modified_at = datetime.utcnow()

    # The path depends on the entry's type and name, so a rename/retype
    # moves the entry to a new file.
    old_path = entry.file_path
    entry.file_path = self._memory_path(entry)

    # Write to disk
    entry.file_path.parent.mkdir(parents=True, exist_ok=True)
    entry.file_path.write_text(entry.to_frontmatter(), encoding='utf-8')

    # Fix: remove the previous file when the path changed; otherwise a
    # rename leaves a stale duplicate behind that _load_all() would
    # re-load as a second copy of this memory.
    if old_path and old_path != entry.file_path and old_path.exists():
        old_path.unlink()

    # Update cache
    self._memories[entry.id] = entry

    # Update index
    self._update_index()

    return entry.file_path
|
|
|
|
def get(self, memory_id: str) -> Optional[MemoryEntry]:
    """Look up a memory by ID, recording the access; returns None if absent."""
    entry = self._memories.get(memory_id)
    if entry:
        # Bump access bookkeeping so usage-based scoring sees this hit.
        entry.touch()
    return entry
|
|
|
|
def delete(self, memory_id: str) -> bool:
    """Delete a memory by ID: its file, cache entry, and all relationship
    edges that mention it.  Returns True when something was removed."""
    entry = self._memories.get(memory_id)
    if not entry:
        return False

    # Remove the backing file, if one was ever written.
    if entry.file_path and entry.file_path.exists():
        entry.file_path.unlink()

    # Drop from the in-memory cache.
    self._memories.pop(memory_id)

    # Purge the relationship graph: outgoing edges (keyed by this id)
    # and incoming edges (this id as the target of another memory).
    self._relationships.pop(memory_id, None)
    for src in list(self._relationships):
        self._relationships[src] = [
            (target, rel) for target, rel in self._relationships[src]
            if target != memory_id
        ]

    self._save_relationships()
    self._update_index()

    return True
|
|
|
|
def archive(self, memory_id: str) -> bool:
    """
    Move a memory to the archive.

    The entry's file is moved into ``archive/``, tagged with archival
    metadata, and re-persisted there.  The memory is then removed from
    the active cache and the index is refreshed — this mirrors what a
    restart produces, since _load_all() only scans the per-type
    directories, never ``archive/``.

    Returns:
        True on success; False when the id is unknown or the entry has
        no backing file.
    """
    entry = self._memories.get(memory_id)
    if not entry or not entry.file_path:
        return False

    archive_dir = self.base_dir / "archive"
    archive_dir.mkdir(exist_ok=True)

    new_path = archive_dir / entry.file_path.name
    shutil.move(str(entry.file_path), str(new_path))

    entry.file_path = new_path
    entry.metadata['archived'] = True
    entry.metadata['archived_at'] = datetime.utcnow().isoformat()

    # Fix: persist the archival metadata. Previously it was only set on
    # the in-memory object and therefore lost on restart.
    new_path.write_text(entry.to_frontmatter(), encoding='utf-8')

    # Fix: keep cache/index consistent with on-disk state. Previously
    # the archived entry stayed in the active cache (and index) until
    # the next restart silently dropped it.
    del self._memories[memory_id]
    self._update_index()

    return True
|
|
|
|
def search(self, query: MemoryQuery) -> List[Tuple[MemoryEntry, float]]:
    """
    Search memories matching the query.

    Matching entries are scored according to ``query.sort_by``:
    "relevance" (text relevance blended 70/30 with freshness),
    "freshness", "access" (access count, capped at 10 accesses = 1.0),
    or recency of creation otherwise.

    Returns:
        List of (entry, score) tuples, best first, truncated to
        ``query.limit``.
    """
    def score_for(entry) -> float:
        # The scoring strategy is selected by the query's sort criterion.
        if query.sort_by == "relevance":
            # Blend text relevance with freshness (70% / 30%).
            return entry.relevance_score(query.text) * 0.7 + entry.freshness_score() * 0.3
        if query.sort_by == "freshness":
            return entry.freshness_score()
        if query.sort_by == "access":
            return min(1.0, entry.access_count / 10.0)
        # Default ("date"): newer memories score higher.
        return 1.0 / (1 + entry.age_days())

    results = [
        (entry, score_for(entry))
        for entry in self._memories.values()
        if query.matches(entry)
    ]

    # Best score first.
    results.sort(key=lambda item: item[1], reverse=True)
    return results[:query.limit]
|
|
|
|
def find_by_tags(self, tags: List[str], match_all: bool = False) -> List[MemoryEntry]:
    """Return memories carrying the given tags (case-insensitive).

    With ``match_all`` True an entry must carry every requested tag;
    otherwise a single shared tag is enough.
    """
    wanted = {t.lower() for t in tags}
    matches = []

    for entry in self._memories.values():
        have = {t.lower() for t in entry.tags}
        hit = wanted <= have if match_all else bool(wanted & have)
        if hit:
            matches.append(entry)

    return matches
|
|
|
|
def find_similar(self, entry: MemoryEntry, threshold: float = 0.8) -> List[MemoryEntry]:
    """
    Find memories similar to the given entry.

    Compares *entry* against every other cached memory with the
    word-overlap metric from _similarity(); candidates scoring at or
    above *threshold* are returned.
    """
    return [
        candidate
        for candidate in self._memories.values()
        if candidate.id != entry.id
        and self._similarity(entry, candidate) >= threshold
    ]
|
|
|
|
def _similarity(self, a: MemoryEntry, b: MemoryEntry) -> float:
    """Jaccard similarity (0.0-1.0) over lower-cased words drawn from each
    entry's name, description, and first 500 characters of content.
    Words of length <= 2 are ignored."""
    def words_of(entry) -> Set[str]:
        text = entry.name + " " + entry.description + " " + entry.content[:500]
        return {w.lower() for w in text.split() if len(w) > 2}

    wa = words_of(a)
    wb = words_of(b)

    if not wa or not wb:
        return 0.0

    union = len(wa | wb)
    return len(wa & wb) / union if union > 0 else 0.0
|
|
|
|
def consolidate(self, dry_run: bool = False) -> List[Tuple[MemoryEntry, MemoryEntry]]:
    """
    Consolidate similar memories by merging them.

    Performs a pairwise similarity scan (O(n^2)); when a pair scores at
    or above CONSOLIDATION_THRESHOLD, the later entry is merged into the
    earlier one and deleted.  With ``dry_run`` True nothing is modified;
    the would-be merges are only reported.

    Returns:
        List of merged (old, new) pairs — (duplicate that was removed,
        surviving entry it was folded into)
    """
    merged = []
    to_delete = set()  # ids already consumed by a merge in this pass

    entries = list(self._memories.values())

    for i, entry in enumerate(entries):
        if entry.id in to_delete:
            continue

        # Only compare against later entries so each pair is seen once.
        for other in entries[i+1:]:
            if other.id in to_delete:
                continue

            similarity = self._similarity(entry, other)

            if similarity >= self.CONSOLIDATION_THRESHOLD:
                # Merge other into entry
                if not dry_run:
                    # Combine content
                    entry.content = self._merge_content(entry, other)
                    # Combine tags (deduplicated; order not preserved)
                    entry.tags = list(set(entry.tags + other.tags))
                    # Update description: keep the longer of the two
                    if len(other.description) > len(entry.description):
                        entry.description = other.description
                    # Boost confidence
                    entry.confidence = max(entry.confidence, other.confidence)
                    entry.modified_at = datetime.utcnow()

                    # Save merged entry
                    self.save(entry)

                    # Delete duplicate
                    self.delete(other.id)

                merged.append((other, entry))
                to_delete.add(other.id)

    return merged
|
|
|
|
def _merge_content(self, a: MemoryEntry, b: MemoryEntry) -> str:
    """Merge content from two memories.

    Identical content is returned as-is.  Otherwise the result is the
    unique lines of a followed by b, in first-seen order.

    Fix: the previous implementation joined a *set* of lines, which
    scrambled line order nondeterministically (str hashing is salted per
    process), so merged memories changed from run to run.
    """
    if a.content == b.content:
        return a.content

    # Order-preserving de-duplication of the combined lines.
    merged = []
    seen = set()
    for line in a.content.strip().split('\n') + b.content.strip().split('\n'):
        if line not in seen:
            seen.add(line)
            merged.append(line)

    return '\n'.join(merged)
|
|
|
|
def get_stale_memories(self, max_age_days: int = 90) -> List[MemoryEntry]:
    """Return memories created more than *max_age_days* days ago."""
    cutoff = datetime.utcnow().timestamp() - max_age_days * 86400
    return [
        entry for entry in self._memories.values()
        if entry.created_at.timestamp() < cutoff
    ]
|
|
|
|
def cleanup_archive(self, max_age_days: int = 365) -> int:
    """
    Permanently delete archived memories older than max_age_days.

    Age is judged by the file's modification time; deletion failures are
    reported as warnings and skipped.

    Returns:
        Number of memories deleted
    """
    archive_dir = self.base_dir / "archive"
    if not archive_dir.exists():
        return 0

    cutoff = datetime.utcnow().timestamp() - max_age_days * 86400
    removed = 0

    for file_path in archive_dir.glob("*.md"):
        try:
            if file_path.stat().st_mtime < cutoff:
                file_path.unlink()
                removed += 1
        except Exception as e:
            print(f"Warning: Failed to delete {file_path}: {e}")

    return removed
|
|
|
|
def _update_index(self):
    """Regenerate the MEMORY.md index file from the in-memory cache.

    Lists the newest MAX_INDEX_LINES memories plus per-type counts, and
    truncates the file body to MAX_INDEX_BYTES (a warning note is
    appended after the cut).
    """
    index_path = self.base_dir / "MEMORY.md"

    # Build one index record per memory that has been written to disk.
    entries = []
    for entry in self._memories.values():
        if entry.file_path:
            rel_path = entry.file_path.relative_to(self.base_dir)
            entries.append(MemoryIndex(
                name=entry.name,
                file_path=rel_path,
                description=entry.description,
                type=entry.type,
                created_at=entry.created_at
            ))

    # Newest first.
    entries.sort(key=lambda x: x.created_at, reverse=True)

    # Build content
    lines = ["# AP Knowledge Base - Memory Index", ""]
    lines.append("This is an auto-generated index of memories. Do not edit manually.")
    lines.append("")
    lines.append("## Recent Memories")
    lines.append("")

    for idx in entries[:self.MAX_INDEX_LINES]:
        lines.append(idx.to_line())

    if len(entries) > self.MAX_INDEX_LINES:
        lines.append(f"\n... and {len(entries) - self.MAX_INDEX_LINES} more memories")

    lines.append("")
    lines.append("## Memory Types")
    lines.append("")
    for mem_type in MemoryType:
        count = sum(1 for e in self._memories.values() if e.type == mem_type)
        lines.append(f"- **{mem_type.value}**: {count} memories")

    content = '\n'.join(lines)

    # Truncate if too large.  Fix: report the pre-truncation size — the
    # old code measured len(content) *after* slicing, so the warning
    # always printed the limit value instead of the real size.
    if len(content) > self.MAX_INDEX_BYTES:
        original_size = len(content)
        content = content[:self.MAX_INDEX_BYTES]
        content += f"\n\n> WARNING: Index truncated ({original_size} bytes, limit: {self.MAX_INDEX_BYTES})"

    index_path.write_text(content, encoding='utf-8')
|
|
|
|
def _save_relationships(self):
    """Write the relationship graph to relationships.json as a flat list
    of {source, target, type} records (the format _load_all reads back)."""
    records = [
        {'source': source, 'target': target, 'type': rel_type.value}
        for source, targets in self._relationships.items()
        for target, rel_type in targets
    ]

    out_path = self.base_dir / "relationships.json"
    out_path.write_text(json.dumps(records, indent=2), encoding='utf-8')
|
|
|
|
def get_stats(self) -> Dict:
    """Summarize the store: counts by type/scope/source, total relationship
    edges, and average age (days) / confidence across all memories."""
    by_type: Dict[str, int] = {}
    by_scope: Dict[str, int] = {}
    by_source: Dict[str, int] = {}
    total_age = 0
    total_confidence = 0.0

    for entry in self._memories.values():
        by_type[entry.type.value] = by_type.get(entry.type.value, 0) + 1
        by_scope[entry.scope.value] = by_scope.get(entry.scope.value, 0) + 1
        by_source[entry.source] = by_source.get(entry.source, 0) + 1
        total_age += entry.age_days()
        total_confidence += entry.confidence

    count = len(self._memories)
    return {
        'total_memories': count,
        'by_type': by_type,
        'by_scope': by_scope,
        'by_source': by_source,
        'total_relationships': sum(len(r) for r in self._relationships.values()),
        # Averages fall back to zero for an empty store.
        'avg_age_days': (total_age / count) if count else 0,
        'avg_confidence': (total_confidence / count) if count else 0.0,
    }
|
|
|
|
def export_all(self) -> Dict:
    """Serialize every memory (as frontmatter text) plus the relationship
    graph into a plain dict, suitable for feeding back to import_data()."""
    memories = [entry.to_frontmatter() for entry in self._memories.values()]

    relationships = []
    for src, rels in self._relationships.items():
        for tgt, rel in rels:
            relationships.append({
                'source': src,
                'target': tgt,
                'type': rel.value
            })

    return {
        'memories': memories,
        'relationships': relationships,
        'exported_at': datetime.utcnow().isoformat(),
        'version': '1.0'
    }
|
|
|
|
def import_data(self, data: Dict, overwrite: bool = False) -> Tuple[int, int]:
    """
    Import memories from a dictionary (the shape produced by export_all).

    Existing memories are kept unless *overwrite* is True.  Entries that
    fail to parse or save are counted as skipped with a warning.
    Relationship records are appended to the graph and persisted.

    Returns:
        (imported_count, skipped_count)
    """
    imported = 0
    skipped = 0

    for mem_text in data.get('memories', []):
        try:
            entry = MemoryEntry.from_frontmatter(mem_text)

            if entry.id in self._memories and not overwrite:
                skipped += 1
                continue

            self.save(entry)
            imported += 1
        except Exception as e:
            # Best-effort import: a bad record must not abort the batch.
            print(f"Warning: Failed to import memory: {e}")
            skipped += 1

    # Import relationships
    for rel in data.get('relationships', []):
        self._relationships[rel['source']].append(
            (rel['target'], RelationshipType(rel['type']))
        )

    self._save_relationships()

    return imported, skipped
|