Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
0a814f5bef fix: vendor vision benchmark fixtures (#868)
All checks were successful
Lint / lint (pull_request) Successful in 11s
2026-04-22 11:37:04 -04:00
31 changed files with 332 additions and 294 deletions

View File

@@ -1,194 +1,354 @@
[
{
"id": "screenshot_github_home",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"url": "test_images/screenshot_github_home.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"url": "test_images/diagram_mermaid_flow.png",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"expected_keywords": [
"flow",
"diagram",
"process"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"url": "test_images/photo_random_1.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"url": "test_images/photo_random_2.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"url": "test_images/chart_simple_bar.png",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"url": "test_images/chart_pie.png",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_org_chart.png",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"expected_keywords": [
"organization",
"hierarchy",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"url": "test_images/screenshot_terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"url": "test_images/photo_random_3.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_line",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"url": "test_images/chart_line.png",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_sequence.png",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"expected_keywords": [
"sequence",
"interaction",
"message"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"url": "test_images/photo_random_4.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"url": "test_images/screenshot_webpage.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"url": "test_images/chart_radar.png",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"url": "test_images/photo_random_5.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_class.png",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"expected_keywords": [
"class",
"object",
"attribute"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"url": "test_images/chart_doughnut.png",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"url": "test_images/photo_random_6.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"url": "test_images/screenshot_error.png",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_network.png",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"expected_keywords": [
"network",
"node",
"connection"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"url": "test_images/photo_random_7.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"url": "test_images/chart_stacked_bar.png",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"url": "test_images/screenshot_dashboard.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"url": "test_images/photo_random_8.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
}
]

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

View File

@@ -11,17 +11,19 @@ Usage:
# Single image test
python benchmarks/vision_benchmark.py --url https://example.com/image.png
python benchmarks/vision_benchmark.py --url benchmarks/test_images/photo_random_1.png
# Generate test report
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
Test image dataset: benchmarks/test_images.json (committed local fixtures under benchmarks/test_images/)
"""
import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
@@ -67,6 +69,28 @@ EVAL_PROMPTS = {
# ---------------------------------------------------------------------------
def _is_remote_image_source(image_source: str) -> bool:
return image_source.startswith(("http://", "https://", "data:", "file://"))
def _image_source_to_payload_url(image_source: str) -> str:
"""Convert local image paths into data URLs; keep remote URLs unchanged."""
if image_source.startswith(("http://", "https://", "data:")):
return image_source
resolved = image_source[len("file://"):] if image_source.startswith("file://") else image_source
local_path = Path(os.path.expanduser(resolved)).resolve()
if not local_path.is_file():
return image_source
mime_type, _ = mimetypes.guess_type(str(local_path))
if not mime_type:
mime_type = "application/octet-stream"
encoded = base64.b64encode(local_path.read_bytes()).decode("ascii")
return f"data:{mime_type};base64,{encoded}"
async def analyze_with_model(
image_url: str,
prompt: str,
@@ -84,6 +108,8 @@ async def analyze_with_model(
"""
import httpx
image_payload_url = _image_source_to_payload_url(image_url)
provider = model_config["provider"]
model_id = model_config["model_id"]
@@ -93,7 +119,7 @@ async def analyze_with_model(
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "image_url", "image_url": {"url": image_payload_url}},
],
}
]
@@ -570,8 +596,18 @@ def generate_sample_dataset() -> List[dict]:
def load_dataset(path: str) -> List[dict]:
"""Load test dataset from JSON file."""
with open(path) as f:
return json.load(f)
dataset_path = Path(path).resolve()
with open(dataset_path) as f:
dataset = json.load(f)
base_dir = dataset_path.parent
for image in dataset:
image_url = image.get("url")
if not image_url or _is_remote_image_source(image_url):
continue
image["url"] = str((base_dir / image_url).resolve())
return dataset
# ---------------------------------------------------------------------------
@@ -582,7 +618,7 @@ def load_dataset(path: str) -> List[dict]:
async def main():
parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
parser.add_argument("--images", help="Path to test images JSON file")
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--url", help="Single image URL or local file path to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")

View File

@@ -1,190 +0,0 @@
---
name: adversarial-ux-test
description: Roleplay the most difficult, tech-resistant user for your product. Browse the app as that persona, find every UX pain point, then filter complaints through a pragmatism layer to separate real problems from noise. Creates actionable tickets from genuine issues only.
version: 1.0.0
author: Omni @ Comelse
license: MIT
metadata:
hermes:
tags: [qa, ux, testing, adversarial, dogfood, personas, user-testing]
related_skills: [dogfood]
---
# Adversarial UX Test
Roleplay the worst-case user for your product — the person who hates technology, doesn't want your software, and will find every reason to complain. Then filter their feedback through a pragmatism layer to separate real UX problems from "I hate computers" noise.
Think of it as an automated "mom test" — but angry.
## Why This Works
Most QA finds bugs. This finds **friction**. A technically correct app can still be unusable for real humans. The adversarial persona catches:
- Confusing terminology that makes sense to developers but not users
- Too many steps to accomplish basic tasks
- Missing onboarding or "aha moments"
- Accessibility issues (font size, contrast, click targets)
- Cold-start problems (empty states, no demo content)
- Paywall/signup friction that kills conversion
The **pragmatism filter** (Phase 3) is what makes this useful instead of just entertaining. Without it, you'd add a "print this page" button to every screen because Grandpa can't figure out PDFs.
## How to Use
Tell the agent:
```
"Run an adversarial UX test on [URL]"
"Be a grumpy [persona type] and test [app name]"
"Do an asshole user test on my staging site"
```
You can provide a persona or let the agent generate one based on your product's target audience.
## Step 1: Define the Persona
If no persona is provided, generate one by answering:
1. **Who is the HARDEST user for this product?** (age 50+, non-technical role, decades of experience doing it "the old way")
2. **What is their tech comfort level?** (the lower the better — WhatsApp-only, paper notebooks, wife set up their email)
3. **What is the ONE thing they need to accomplish?** (their core job, not your feature list)
4. **What would make them give up?** (too many clicks, jargon, slow, confusing)
5. **How do they talk when frustrated?** (blunt, sweary, dismissive, sighing)
### Good Persona Example
> **"Big Mick" McAllister** — 58-year-old S&C coach. Uses WhatsApp and that's it. His "spreadsheet" is a paper notebook. "If I can't figure it out in 10 seconds I'm going back to my notebook." Needs to log session results for 25 players. Hates small text, jargon, and passwords.
### Bad Persona Example
> "A user who doesn't like the app" — too vague, no constraints, no voice.
The persona must be **specific enough to stay in character** for 20 minutes of testing.
## Step 2: Become the Asshole (Browse as the Persona)
1. Read any available project docs for app context and URLs
2. **Fully inhabit the persona** — their frustrations, limitations, goals
3. Navigate to the app using browser tools
4. **Attempt the persona's ACTUAL TASKS** (not a feature tour):
- Can they do what they came to do?
- How many clicks/screens to accomplish it?
- What confuses them?
- What makes them angry?
- Where do they get lost?
- What would make them give up and go back to their old way?
5. Test these friction categories:
- **First impression** — would they even bother past the landing page?
- **Core workflow** — the ONE thing they need to do most often
- **Error recovery** — what happens when they do something wrong?
- **Readability** — text size, contrast, information density
- **Speed** — does it feel faster than their current method?
- **Terminology** — any jargon they wouldn't understand?
- **Navigation** — can they find their way back? do they know where they are?
6. Take screenshots of every pain point
7. Check browser console for JS errors on every page
## Step 3: The Rant (Write Feedback in Character)
Write the feedback AS THE PERSONA — in their voice, with their frustrations. This is not a bug report. This is a real human venting.
```
[PERSONA NAME]'s Review of [PRODUCT]
Overall: [Would they keep using it? Yes/No/Maybe with conditions]
THE GOOD (grudging admission):
- [things even they have to admit work]
THE BAD (legitimate UX issues):
- [real problems that would stop them from using the product]
THE UGLY (showstoppers):
- [things that would make them uninstall/cancel immediately]
SPECIFIC COMPLAINTS:
1. [Page/feature]: "[quote in persona voice]" — [what happened, expected]
2. ...
VERDICT: "[one-line persona quote summarizing their experience]"
```
## Step 4: The Pragmatism Filter (Critical — Do Not Skip)
Step OUT of the persona. Evaluate each complaint as a product person:
- **RED: REAL UX BUG** — Any user would have this problem, not just grumpy ones. Fix it.
- **YELLOW: VALID BUT LOW PRIORITY** — Real issue but only for extreme users. Note it.
- **WHITE: PERSONA NOISE** — "I hate computers" talking, not a product problem. Skip it.
- **GREEN: FEATURE REQUEST** — Good idea hidden in the complaint. Consider it.
### Filter Criteria
1. Would a 35-year-old competent-but-busy user have the same complaint? → RED
2. Is this a genuine accessibility issue (font size, contrast, click targets)? → RED
3. Is this "I want it to work like paper" resistance to digital? → WHITE
4. Is this a real workflow inefficiency the persona stumbled on? → YELLOW or RED
5. Would fixing this add complexity for the 80% who are fine? → WHITE
6. Does the complaint reveal a missing onboarding moment? → GREEN
**This filter is MANDATORY.** Never ship raw persona complaints as tickets.
## Step 5: Create Tickets
For **RED** and **GREEN** items only:
- Clear, actionable title
- Include the persona's verbatim quote (entertaining + memorable)
- The real UX issue underneath (objective)
- A suggested fix (actionable)
- Tag/label: "ux-review"
For **YELLOW** items: one catch-all ticket with all notes.
**WHITE** items appear in the report only. No tickets.
**Max 10 tickets per session** — focus on the worst issues.
## Step 6: Report
Deliver:
1. The persona rant (Step 3) — entertaining and visceral
2. The filtered assessment (Step 4) — pragmatic and actionable
3. Tickets created (Step 5) — with links
4. Screenshots of key issues
## Tips
- **One persona per session.** Don't mix perspectives.
- **Stay in character during Steps 2-3.** Break character only at Step 4.
- **Test the CORE WORKFLOW first.** Don't get distracted by settings pages.
- **Empty states are gold.** New user experience reveals the most friction.
- **The best findings are RED items the persona found accidentally** while trying to do something else.
- **If the persona has zero complaints, your persona is too tech-savvy.** Make them older, less patient, more set in their ways.
- **Run this before demos, launches, or after shipping a batch of features.**
- **Register as a NEW user when possible.** Don't use pre-seeded admin accounts — the cold start experience is where most friction lives.
- **Zero WHITE items is a signal, not a failure.** If the pragmatism filter finds no noise, your product has real UX problems, not just a grumpy persona.
- **Check known issues in project docs AFTER the test.** If the persona found a bug that's already in the known issues list, that's actually the most damning finding — it means the team knew about it but never felt the user's pain.
- **Subscription/paywall testing is critical.** Test with expired accounts, not just active ones. The "what happens when you can't pay" experience reveals whether the product respects users or holds their data hostage.
- **Count the clicks to accomplish the persona's ONE task.** If it's more than 5, that's almost always a RED finding regardless of persona tech level.
## Example Personas by Industry
These are starting points — customize for your specific product:
| Product Type | Persona | Age | Key Trait |
|-------------|---------|-----|-----------|
| CRM | Retirement home director | 68 | Filing cabinet is the current CRM |
| Photography SaaS | Rural wedding photographer | 62 | Books clients by phone, invoices on paper |
| AI/ML Tool | Department store buyer | 55 | Burned by 3 failed tech startups |
| Fitness App | Old-school gym coach | 58 | Paper notebook, thick fingers, bad eyes |
| Accounting | Family bakery owner | 64 | Shoebox of receipts, hates subscriptions |
| E-commerce | Market stall vendor | 60 | Cash only, smartphone is for calls |
| Healthcare | Senior GP | 63 | Dictates notes, nurse handles the computer |
| Education | Veteran teacher | 57 | Chalk and talk, worksheets in ring binders |
## Rules
- Stay in character during Steps 2-3
- Be genuinely mean but fair — find real problems, not manufactured ones
- The pragmatism filter (Step 4) is **MANDATORY**
- Screenshots required for every complaint
- Max 10 tickets per session
- Test on staging/deployed app, not local dev
- One persona, one session, one report

View File

@@ -1,25 +0,0 @@
from pathlib import Path
from tools.skills_hub import OptionalSkillSource
REPO_ROOT = Path(__file__).resolve().parents[1]
def test_optional_skill_source_scans_adversarial_ux_test():
source = OptionalSkillSource()
metas = {meta.identifier: meta for meta in source._scan_all()}
assert "official/dogfood/adversarial-ux-test" in metas
assert metas["official/dogfood/adversarial-ux-test"].name == "adversarial-ux-test"
assert "tech-resistant user" in metas["official/dogfood/adversarial-ux-test"].description
def test_optional_skill_catalog_docs_list_adversarial_ux_test():
optional_catalog = (REPO_ROOT / "website" / "docs" / "reference" / "optional-skills-catalog.md").read_text(encoding="utf-8")
bundled_catalog = (REPO_ROOT / "website" / "docs" / "reference" / "skills-catalog.md").read_text(encoding="utf-8")
assert "**adversarial-ux-test**" in optional_catalog
assert "official/dogfood/adversarial-ux-test" in optional_catalog
assert "`adversarial-ux-test`" in bundled_catalog
assert "dogfood/adversarial-ux-test" in bundled_catalog

View File

@@ -11,12 +11,14 @@ import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from vision_benchmark import (
analyze_with_model,
compute_ocr_accuracy,
compute_description_completeness,
compute_structural_accuracy,
aggregate_results,
to_markdown,
generate_sample_dataset,
load_dataset,
MODELS,
EVAL_PROMPTS,
)
@@ -197,6 +199,71 @@ class TestMarkdown:
class TestDataset:
def test_repo_dataset_uses_local_image_paths(self):
dataset_path = Path(__file__).parent.parent / "benchmarks" / "test_images.json"
dataset = json.loads(dataset_path.read_text())
assert dataset, "benchmark dataset should not be empty"
assert all(not entry["url"].startswith(("http://", "https://")) for entry in dataset)
def test_load_dataset_resolves_relative_local_paths(self, tmp_path):
images_dir = tmp_path / "images"
images_dir.mkdir()
image_path = images_dir / "sample.png"
image_path.write_bytes(b"png-bytes")
dataset_path = tmp_path / "dataset.json"
dataset_path.write_text(json.dumps([
{
"id": "sample",
"url": "images/sample.png",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
}
]))
loaded = load_dataset(str(dataset_path))
assert loaded[0]["url"] == str(image_path.resolve())
@pytest.mark.asyncio
async def test_analyze_with_model_encodes_local_file_as_data_url(self, tmp_path, monkeypatch):
image_path = tmp_path / "tiny.png"
image_path.write_bytes(
bytes.fromhex(
"89504E470D0A1A0A"
"0000000D49484452000000010000000108060000001F15C489"
"0000000D49444154789C6360000002000154A24F5D00000000"
"49454E44AE426082"
)
)
fake_response = MagicMock()
fake_response.raise_for_status.return_value = None
fake_response.json.return_value = {
"choices": [{"message": {"content": "Looks like a tiny image."}}],
"usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
}
fake_client = MagicMock()
fake_client.post = AsyncMock(return_value=fake_response)
fake_ctx = MagicMock()
fake_ctx.__aenter__ = AsyncMock(return_value=fake_client)
fake_ctx.__aexit__ = AsyncMock(return_value=None)
monkeypatch.setenv("OPENROUTER_API_KEY", "test-key")
with patch("httpx.AsyncClient", return_value=fake_ctx):
result = await analyze_with_model(
str(image_path),
"Describe this image",
{"provider": "openrouter", "model_id": "fake/model"},
)
assert result["success"] is True
sent_url = fake_client.post.await_args.kwargs["json"]["messages"][0]["content"][1]["image_url"]["url"]
assert sent_url.startswith("data:image/png;base64,")
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4

View File

@@ -16,7 +16,6 @@ For example:
```bash
hermes skills install official/blockchain/solana
hermes skills install official/dogfood/adversarial-ux-test
hermes skills install official/mlops/flash-attention
```
@@ -57,12 +56,6 @@ hermes skills uninstall <skill-name>
| **blender-mcp** | Control Blender directly from Hermes via socket connection to the blender-mcp addon. Create 3D objects, materials, animations, and run arbitrary Blender Python (bpy) code. |
| **meme-generation** | Generate real meme images by picking a template and overlaying text with Pillow. Produces actual `.png` meme files. |
## Dogfood
| Skill | Description |
|-------|-------------|
| **adversarial-ux-test** | Roleplay the most difficult, tech-resistant user for a product — browse in-persona, rant, then filter through a RED/YELLOW/WHITE/GREEN pragmatism layer so only real UX friction becomes tickets. |
## DevOps
| Skill | Description |

View File

@@ -59,12 +59,9 @@ DevOps and infrastructure automation skills.
## dogfood
Internal dogfooding and QA skills used to test Hermes Agent itself.
| Skill | Description | Path |
|-------|-------------|------|
| `dogfood` | Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports. | `dogfood/dogfood` |
| `adversarial-ux-test` | Roleplay the most difficult, tech-resistant user for a product — browse in-persona, rant, then filter through a RED/YELLOW/WHITE/GREEN pragmatism layer so only real UX friction becomes tickets. | `dogfood/adversarial-ux-test` |
| `hermes-agent-setup` | Help users configure Hermes Agent — CLI usage, setup wizard, model/provider selection, tools, skills, voice/STT/TTS, gateway, and troubleshooting. | `dogfood/hermes-agent-setup` |
## email