Compare commits

..

1 Commit

Author SHA1 Message Date
Alexander Whitestone
0a814f5bef fix: vendor vision benchmark fixtures (#868)
All checks were successful
Lint / lint (pull_request) Successful in 11s
2026-04-22 11:37:04 -04:00
32 changed files with 677 additions and 983 deletions

View File

@@ -1,194 +1,354 @@
[
{
"id": "screenshot_github_home",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"url": "test_images/screenshot_github_home.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"url": "test_images/diagram_mermaid_flow.png",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"expected_keywords": [
"flow",
"diagram",
"process"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"url": "test_images/photo_random_1.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"url": "test_images/photo_random_2.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"url": "test_images/chart_simple_bar.png",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"url": "test_images/chart_pie.png",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_org_chart.png",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"expected_keywords": [
"organization",
"hierarchy",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"url": "test_images/screenshot_terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"url": "test_images/photo_random_3.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_line",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"url": "test_images/chart_line.png",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_sequence.png",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"expected_keywords": [
"sequence",
"interaction",
"message"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"url": "test_images/photo_random_4.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"url": "test_images/screenshot_webpage.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"url": "test_images/chart_radar.png",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"url": "test_images/photo_random_5.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_class.png",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"expected_keywords": [
"class",
"object",
"attribute"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"url": "test_images/chart_doughnut.png",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"url": "test_images/photo_random_6.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"url": "test_images/screenshot_error.png",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_network.png",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"expected_keywords": [
"network",
"node",
"connection"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"url": "test_images/photo_random_7.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"url": "test_images/chart_stacked_bar.png",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"url": "test_images/screenshot_dashboard.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"url": "test_images/photo_random_8.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
}
]

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

View File

@@ -11,17 +11,19 @@ Usage:
# Single image test
python benchmarks/vision_benchmark.py --url https://example.com/image.png
python benchmarks/vision_benchmark.py --url benchmarks/test_images/photo_random_1.png
# Generate test report
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
Test image dataset: benchmarks/test_images.json (committed local fixtures under benchmarks/test_images/)
"""
import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
@@ -67,6 +69,28 @@ EVAL_PROMPTS = {
# ---------------------------------------------------------------------------
def _is_remote_image_source(image_source: str) -> bool:
    """Return True when *image_source* starts with an explicit URI scheme.

    The recognised prefixes are ``http://``, ``https://``, ``data:`` and
    ``file://``. Despite the name, ``data:`` and ``file://`` sources count
    as well: ``load_dataset`` uses this check to decide which entries must
    be left untouched instead of being resolved as dataset-relative paths.

    URI schemes are case-insensitive (RFC 3986, section 3.1), so
    ``HTTPS://...`` is recognised exactly like ``https://...``.

    Args:
        image_source: URL, URI, or filesystem path of an image.

    Returns:
        True if one of the recognised scheme prefixes is present.
    """
    # The longest prefix ("https://") is 8 characters. Lower-casing only that
    # slice avoids copying a potentially very large data: URL.
    scheme_prefix = image_source[:8].lower()
    return scheme_prefix.startswith(("http://", "https://", "data:", "file://"))
def _image_source_to_payload_url(image_source: str) -> str:
    """Convert local image paths into data URLs; keep remote URLs unchanged.

    Args:
        image_source: An ``http(s)://`` or ``data:`` URL, a ``file://`` URI,
            or a filesystem path (a leading ``~`` is expanded).

    Returns:
        *image_source* itself for ``http(s)://`` and ``data:`` inputs, and
        also when no regular file exists at the local path (the value is
        passed through untouched). Otherwise a
        ``data:<mime>;base64,<payload>`` URL holding the file's bytes, with
        ``application/octet-stream`` used when the MIME type cannot be
        guessed from the file name.
    """
    # Function-scope import: only the file:// branch below needs it.
    from urllib.parse import unquote

    if image_source.startswith(("http://", "https://", "data:")):
        return image_source

    if image_source.startswith("file://"):
        raw = image_source[len("file://"):]
        # file:// URIs percent-encode characters such as spaces. Try the
        # literal text first so file names that really contain '%' keep
        # working, then fall back to the decoded form.
        candidates = [raw]
        decoded = unquote(raw)
        if decoded != raw:
            candidates.append(decoded)
    else:
        candidates = [image_source]

    local_path = None
    for candidate in candidates:
        resolved = Path(os.path.expanduser(candidate)).resolve()
        if resolved.is_file():
            local_path = resolved
            break
    if local_path is None:
        # Nothing readable on disk: hand the original value back unchanged.
        return image_source

    mime_type, _ = mimetypes.guess_type(str(local_path))
    if not mime_type:
        mime_type = "application/octet-stream"
    encoded = base64.b64encode(local_path.read_bytes()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
async def analyze_with_model(
image_url: str,
prompt: str,
@@ -84,6 +108,8 @@ async def analyze_with_model(
"""
import httpx
image_payload_url = _image_source_to_payload_url(image_url)
provider = model_config["provider"]
model_id = model_config["model_id"]
@@ -93,7 +119,7 @@ async def analyze_with_model(
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "image_url", "image_url": {"url": image_payload_url}},
],
}
]
@@ -570,8 +596,18 @@ def generate_sample_dataset() -> List[dict]:
def load_dataset(path: str) -> List[dict]:
    """Load the test dataset from a JSON file and absolutise local image paths.

    Every entry whose ``url`` is neither empty nor recognised by
    ``_is_remote_image_source`` is treated as a filesystem path. Relative
    paths are resolved against the directory containing the dataset file,
    so the benchmark can be started from any working directory. Entries are
    updated in place.

    Args:
        path: Location of the dataset JSON file.

    Returns:
        The decoded list of dataset entries, with local ``url`` values
        rewritten to absolute paths.
    """
    dataset_path = Path(path).resolve()
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)

    base_dir = dataset_path.parent
    for image in dataset:
        image_url = image.get("url")
        if not image_url or _is_remote_image_source(image_url):
            continue
        # Expand "~" before joining: an expanded (absolute) path replaces
        # base_dir in the join, whereas a literal "~" would be treated as a
        # directory name below base_dir.
        image["url"] = str((base_dir / os.path.expanduser(image_url)).resolve())
    return dataset
# ---------------------------------------------------------------------------
@@ -582,7 +618,7 @@ def load_dataset(path: str) -> List[dict]:
async def main():
parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
parser.add_argument("--images", help="Path to test images JSON file")
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--url", help="Single image URL or local file path to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")

View File

@@ -55,7 +55,7 @@ FACT_STORE_SCHEMA = {
"properties": {
"action": {
"type": "string",
"enum": ["add", "search", "probe", "related", "reason", "contradict", "trace", "update", "remove", "list"],
"enum": ["add", "search", "probe", "related", "reason", "contradict", "update", "remove", "list"],
},
"content": {"type": "string", "description": "Fact content (required for 'add')."},
"query": {"type": "string", "description": "Search query (required for 'search')."},
@@ -67,13 +67,6 @@ FACT_STORE_SCHEMA = {
"trust_delta": {"type": "number", "description": "Trust adjustment for 'update'."},
"min_trust": {"type": "number", "description": "Minimum trust filter (default: 0.3)."},
"limit": {"type": "integer", "description": "Max results (default: 10)."},
"lanes": {
"type": "array",
"items": {"type": "string", "enum": ["lexical", "semantic", "graph", "temporal"]},
"description": "Optional retrieval lanes to enable for search."
},
"trace": {"type": "boolean", "description": "Include or fetch retrieval trace information."},
"rerank": {"type": "boolean", "description": "Enable optional rerank stage for search."},
},
"required": ["action"],
},
@@ -126,9 +119,6 @@ class HolographicMemoryProvider(MemoryProvider):
self._store = None
self._retriever = None
self._min_trust = float(self._config.get("min_trust_threshold", 0.3))
self._retrieval_lanes = self._parse_retrieval_lanes(self._config.get("retrieval_lanes"))
self._enable_rerank = str(self._config.get("enable_rerank", "true")).lower() != "false"
self._last_retrieval_trace: dict | None = None
@property
def name(self) -> str:
@@ -154,14 +144,6 @@ class HolographicMemoryProvider(MemoryProvider):
except Exception:
pass
def _parse_retrieval_lanes(self, value) -> list[str]:
    """Normalise a retrieval-lane setting into a list of recognised lanes.

    A string is split on commas, with surrounding whitespace stripped and
    empty pieces dropped. Any other truthy value is iterated as-is. Names
    outside ``lexical``/``semantic``/``graph``/``temporal`` are discarded.
    If nothing usable remains, or *value* is falsy, all four lanes are
    returned.
    """
    default_lanes = ["lexical", "semantic", "graph", "temporal"]
    known = {"lexical", "semantic", "graph", "temporal"}

    if isinstance(value, str):
        value = [chunk.strip() for chunk in value.split(",") if chunk.strip()]

    candidates = list(value) if value else list(default_lanes)
    selected = [name for name in candidates if name in known]
    if selected:
        return selected
    return list(default_lanes)
def get_config_schema(self):
from hermes_constants import display_hermes_home
_default_db = f"{display_hermes_home()}/memory_store.db"
@@ -170,10 +152,6 @@ class HolographicMemoryProvider(MemoryProvider):
{"key": "auto_extract", "description": "Auto-extract facts at session end", "default": "false", "choices": ["true", "false"]},
{"key": "default_trust", "description": "Default trust score for new facts", "default": "0.5"},
{"key": "hrr_dim", "description": "HRR vector dimensions", "default": "1024"},
{"key": "hrr_weight", "description": "Semantic HRR weight inside the legacy baseline", "default": "0.3"},
{"key": "temporal_decay_half_life", "description": "Temporal decay half-life in days (0 disables baseline decay)", "default": "0"},
{"key": "retrieval_lanes", "description": "Comma-separated retrieval lanes (lexical,semantic,graph,temporal)", "default": "lexical,semantic,graph,temporal"},
{"key": "enable_rerank", "description": "Enable optional local rerank stage", "default": "true", "choices": ["true", "false"]},
]
def initialize(self, session_id: str, **kwargs) -> None:
@@ -191,8 +169,6 @@ class HolographicMemoryProvider(MemoryProvider):
hrr_dim = int(self._config.get("hrr_dim", 1024))
hrr_weight = float(self._config.get("hrr_weight", 0.3))
temporal_decay = int(self._config.get("temporal_decay_half_life", 0))
self._retrieval_lanes = self._parse_retrieval_lanes(self._config.get("retrieval_lanes", self._retrieval_lanes))
self._enable_rerank = str(self._config.get("enable_rerank", self._enable_rerank)).lower() != "false"
self._store = MemoryStore(db_path=db_path, default_trust=default_trust, hrr_dim=hrr_dim)
self._retriever = FactRetriever(
@@ -200,8 +176,6 @@ class HolographicMemoryProvider(MemoryProvider):
temporal_decay_half_life=temporal_decay,
hrr_weight=hrr_weight,
hrr_dim=hrr_dim,
retrieval_lanes=self._retrieval_lanes,
enable_rerank=self._enable_rerank,
)
self._session_id = session_id
@@ -232,23 +206,13 @@ class HolographicMemoryProvider(MemoryProvider):
if not self._retriever or not query:
return ""
try:
payload = self._retriever.search_with_trace(
query,
min_trust=self._min_trust,
limit=5,
lanes=self._retrieval_lanes,
rerank=self._enable_rerank,
)
self._last_retrieval_trace = payload["trace"]
results = payload["results"]
results = self._retriever.search(query, min_trust=self._min_trust, limit=5)
if not results:
return ""
lines = []
for r in results:
trust = r.get("trust_score", r.get("trust", 0))
lanes = ",".join(r.get("matched_lanes", []))
lane_suffix = f" [{lanes}]" if lanes else ""
lines.append(f"- [{trust:.1f}] {r.get('content', '')}{lane_suffix}")
lines.append(f"- [{trust:.1f}] {r.get('content', '')}")
return "## Holographic Memory\n" + "\n".join(lines)
except Exception as e:
logger.debug("Holographic prefetch failed: %s", e)
@@ -306,39 +270,14 @@ class HolographicMemoryProvider(MemoryProvider):
return json.dumps({"fact_id": fact_id, "status": "added"})
elif action == "search":
lanes = args.get("lanes")
rerank = args.get("rerank")
with_trace = bool(args.get("trace", False))
if with_trace:
payload = retriever.search_with_trace(
args["query"],
category=args.get("category"),
min_trust=float(args.get("min_trust", self._min_trust)),
limit=int(args.get("limit", 10)),
lanes=lanes,
rerank=rerank,
)
self._last_retrieval_trace = payload["trace"]
return json.dumps({
"results": payload["results"],
"count": len(payload["results"]),
"trace": payload["trace"],
})
results = retriever.search(
args["query"],
category=args.get("category"),
min_trust=float(args.get("min_trust", self._min_trust)),
limit=int(args.get("limit", 10)),
lanes=lanes,
rerank=rerank,
)
self._last_retrieval_trace = retriever.last_trace
return json.dumps({"results": results, "count": len(results)})
elif action == "trace":
return json.dumps({"trace": self._last_retrieval_trace or retriever.last_trace or {}})
elif action == "probe":
results = retriever.probe(
args["entity"],
@@ -384,8 +323,7 @@ class HolographicMemoryProvider(MemoryProvider):
return json.dumps({"updated": updated})
elif action == "remove":
removed = store.remove_fact(int(args["fact_id"])
)
removed = store.remove_fact(int(args["fact_id"]))
return json.dumps({"removed": removed})
elif action == "list":

File diff suppressed because it is too large. (Load Diff)

View File

@@ -83,7 +83,6 @@ _TRUST_MAX = 1.0
# Entity extraction patterns
_RE_CAPITALIZED = re.compile(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b')
_RE_SINGLE_PROPER = re.compile(r'\b([A-Z][A-Za-z0-9_-]{2,})\b')
_RE_DOUBLE_QUOTE = re.compile(r'"([^"]+)"')
_RE_SINGLE_QUOTE = re.compile(r"'([^']+)'")
_RE_AKA = re.compile(
@@ -415,13 +414,6 @@ class MemoryStore:
for m in _RE_CAPITALIZED.finditer(text):
_add(m.group(1))
skip_singletons = {"The", "This", "That", "These", "Those", "And", "But", "For", "With"}
for m in _RE_SINGLE_PROPER.finditer(text):
candidate = m.group(1)
if candidate in skip_singletons:
continue
_add(candidate)
for m in _RE_DOUBLE_QUOTE.finditer(text):
_add(m.group(1))

View File

@@ -1,56 +0,0 @@
{
"facts": [
{
"content": "Alexander Whitestone aka Rockachopa.",
"category": "general",
"tags": "identity alias"
},
{
"content": "Rockachopa uses Ansible playbooks for sovereign rollouts.",
"category": "project",
"tags": "ansible playbooks rollout"
},
{
"content": "The provider is anthropic/claude-haiku-4-5.",
"category": "project",
"tags": "provider default",
"updated_at": "2026-01-01T00:00:00Z"
},
{
"content": "Correction: the provider is mimo-v2-pro.",
"category": "project",
"tags": "provider current",
"updated_at": "2026-04-20T00:00:00Z"
},
{
"content": "Ezra operates the BURN2 lane for forge work.",
"category": "project",
"tags": "ezra burn2 forge lane"
},
{
"content": "BURN2 handles forge triage and review.",
"category": "project",
"tags": "forge triage review"
}
],
"queries": [
{
"name": "semantic_alias_graph",
"query": "What automation does Alexander Whitestone use for deploys?",
"expected_substring": "Ansible playbooks",
"top_k": 1
},
{
"name": "temporal_correction",
"query": "What provider should we use?",
"expected_substring": "mimo-v2-pro",
"top_k": 1
},
{
"name": "graph_lane",
"query": "Which forge lane does Ezra operate?",
"expected_substring": "BURN2 lane",
"top_k": 1
}
]
}

View File

@@ -1,116 +0,0 @@
"""Tests for multi-path holographic retrieval fusion and traceability."""
from __future__ import annotations
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
from plugins.memory.holographic import HolographicMemoryProvider
from plugins.memory.holographic.retrieval import FactRetriever, format_benchmark_report
from plugins.memory.holographic.store import MemoryStore
# Location of the recall-matrix fixture: the "fixtures" directory found three
# levels above this test file (parents[2] of the resolved file path).
_FIXTURE_PATH = Path(__file__).resolve().parents[2] / "fixtures" / "holographic_recall_matrix.json"
def _fixture() -> dict:
    """Read the file at ``_FIXTURE_PATH`` and return its decoded JSON content."""
    raw_text = _FIXTURE_PATH.read_text()
    return json.loads(raw_text)
def _seed_store(tmp_path) -> MemoryStore:
    """Create a MemoryStore under *tmp_path* and load every fixture fact into it.

    Facts that carry an ``updated_at`` value get both of their timestamp
    columns overwritten with it through a direct SQL update on the store's
    connection.
    """
    store = MemoryStore(db_path=tmp_path / "memory_store.db")
    for entry in _fixture()["facts"]:
        fact_id = store.add_fact(
            entry["content"],
            category=entry["category"],
            tags=entry.get("tags", ""),
        )
        stamp = entry.get("updated_at")
        if not stamp:
            continue
        store._conn.execute(
            "UPDATE facts SET created_at = ?, updated_at = ? WHERE fact_id = ?",
            (stamp, stamp, fact_id),
        )
    # NOTE(review): the original nesting of this commit was lost with the
    # stripped indentation; committing once after the loop yields the same
    # final database state. Confirm against the upstream file.
    store._conn.commit()
    return store
class TestMultiPathRetrieval:
    """Lane selection, trace output and benchmark reporting of FactRetriever."""

    def test_lane_toggle_and_trace_contributions(self, tmp_path):
        """Searching with two lanes reports exactly those lanes in the trace."""
        retriever = FactRetriever(store=_seed_store(tmp_path))

        outcome = retriever.search_with_trace(
            "Which forge lane does Ezra operate?",
            limit=3,
            lanes=["lexical", "graph"],
        )
        trace, hits = outcome["trace"], outcome["results"]

        assert trace["lanes_run"] == ["lexical", "graph"]
        assert hits
        best = hits[0]
        assert "BURN2 lane" in best["content"]
        assert "graph" in best["lane_contributions"]
        # No contribution may come from a lane that was not requested.
        assert set(best["lane_contributions"]) <= {"lexical", "graph"}

    def test_trace_available_for_failed_recall(self, tmp_path):
        """A query that matches nothing still produces a populated trace."""
        retriever = FactRetriever(store=_seed_store(tmp_path))

        outcome = retriever.search_with_trace(
            "nonexistent memory topic xyz123",
            limit=3,
            lanes=["lexical", "semantic", "graph", "temporal"],
        )
        trace = outcome["trace"]

        assert outcome["results"] == []
        assert trace["fused_count"] == 0
        assert trace["lane_hits"]["lexical"] == 0
        assert trace["lane_hits"]["semantic"] == 0

    def test_benchmark_prompt_matrix_shows_gain_over_baseline(self, tmp_path):
        """The fused pipeline beats the baseline on the fixture query matrix."""
        retriever = FactRetriever(store=_seed_store(tmp_path))
        queries = _fixture()["queries"]

        report = retriever.benchmark_prompt_matrix(queries, limit=3)

        assert report["fused_top1_hits"] > report["baseline_top1_hits"]
        assert report["improvement"] > 0

        text = format_benchmark_report(report)
        assert "Prompt matrix benchmark" in text
        assert "semantic_alias_graph" in text
        assert "improvement" in text.lower()
class TestHolographicProviderTrace:
    """Checks that the provider keeps the last retrieval trace and exposes it."""

    def test_prefetch_records_trace_and_trace_action_returns_it(self, tmp_path):
        """prefetch() stores a trace that the 'trace' tool action hands back."""
        provider = HolographicMemoryProvider(
            config={
                "db_path": str(tmp_path / "provider.db"),
                "retrieval_lanes": ["lexical", "semantic", "graph", "temporal"],
                "enable_rerank": True,
            }
        )
        provider.initialize("test-session")

        # Copy the fixture facts from a separately seeded store into the
        # provider's own store.
        source_store = _seed_store(tmp_path / "seed")
        # Content prefix -> timestamp written to both timestamp columns.
        backdated = (
            ("The provider is anthropic", "2026-01-01T00:00:00Z"),
            ("Correction: the provider is mimo", "2026-04-20T00:00:00Z"),
        )
        for row in source_store.list_facts(min_trust=0.0, limit=20):
            provider._store.add_fact(
                row["content"],
                category=row["category"],
                tags=row.get("tags", ""),
            )
            for prefix, stamp in backdated:
                if row["content"].startswith(prefix):
                    provider._store._conn.execute(
                        "UPDATE facts SET created_at = ?, updated_at = ? WHERE content = ?",
                        (stamp, stamp, row["content"]),
                    )
                    break
        # NOTE(review): original nesting of this commit is not recoverable
        # from the de-indented source; a single commit after the loop leaves
        # the same final state. Confirm against the upstream file.
        provider._store._conn.commit()

        block = provider.prefetch("What provider should we use?")
        assert "Holographic Memory" in block
        assert "mimo-v2-pro" in block

        reply = provider.handle_tool_call("fact_store", {"action": "trace"})
        trace = json.loads(reply)["trace"]
        assert trace["query"] == "What provider should we use?"
        assert trace["rerank_applied"] in {True, False}
        assert trace["lane_hits"]["temporal"] >= 1

View File

@@ -11,12 +11,14 @@ import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from vision_benchmark import (
analyze_with_model,
compute_ocr_accuracy,
compute_description_completeness,
compute_structural_accuracy,
aggregate_results,
to_markdown,
generate_sample_dataset,
load_dataset,
MODELS,
EVAL_PROMPTS,
)
@@ -197,6 +199,71 @@ class TestMarkdown:
class TestDataset:
def test_repo_dataset_uses_local_image_paths(self):
    """The committed dataset must be non-empty and free of http(s) image URLs."""
    benchmarks_dir = Path(__file__).parent.parent / "benchmarks"
    entries = json.loads((benchmarks_dir / "test_images.json").read_text())

    assert entries, "benchmark dataset should not be empty"
    remote_prefixes = ("http://", "https://")
    assert not any(entry["url"].startswith(remote_prefixes) for entry in entries)
def test_load_dataset_resolves_relative_local_paths(self, tmp_path):
    """load_dataset rewrites a dataset-relative image path to an absolute one."""
    image_path = tmp_path / "images" / "sample.png"
    image_path.parent.mkdir()
    image_path.write_bytes(b"png-bytes")

    entry = {
        "id": "sample",
        "url": "images/sample.png",
        "category": "photo",
        "expected_keywords": [],
        "expected_structure": {"min_length": 30, "min_sentences": 1},
    }
    dataset_path = tmp_path / "dataset.json"
    dataset_path.write_text(json.dumps([entry]))

    loaded = load_dataset(str(dataset_path))

    first_entry = loaded[0]
    assert first_entry["url"] == str(image_path.resolve())
@pytest.mark.asyncio
async def test_analyze_with_model_encodes_local_file_as_data_url(self, tmp_path, monkeypatch):
    """A local image path reaches the provider as a base64 ``data:`` URL."""
    # PNG signature followed by an IHDR chunk declaring a 1x1 image.
    png_bytes = bytes.fromhex(
        "89504E470D0A1A0A"
        "0000000D49484452000000010000000108060000001F15C489"
        "0000000D49444154789C6360000002000154A24F5D00000000"
        "49454E44AE426082"
    )
    image_path = tmp_path / "tiny.png"
    image_path.write_bytes(png_bytes)

    # Canned provider reply returned by the stubbed HTTP client.
    api_reply = {
        "choices": [{"message": {"content": "Looks like a tiny image."}}],
        "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
    }
    fake_response = MagicMock()
    fake_response.raise_for_status.return_value = None
    fake_response.json.return_value = api_reply

    fake_client = MagicMock()
    fake_client.post = AsyncMock(return_value=fake_response)

    # Stand-in for the "async with httpx.AsyncClient(...)" context manager.
    fake_ctx = MagicMock()
    fake_ctx.__aenter__ = AsyncMock(return_value=fake_client)
    fake_ctx.__aexit__ = AsyncMock(return_value=None)

    monkeypatch.setenv("OPENROUTER_API_KEY", "test-key")
    model_config = {"provider": "openrouter", "model_id": "fake/model"}
    with patch("httpx.AsyncClient", return_value=fake_ctx):
        result = await analyze_with_model(
            str(image_path),
            "Describe this image",
            model_config,
        )

    assert result["success"] is True
    request_body = fake_client.post.await_args.kwargs["json"]
    image_part = request_body["messages"][0]["content"][1]
    sent_url = image_part["image_url"]["url"]
    assert sent_url.startswith("data:image/png;base64,")
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4