Compare commits
16 Commits
feature/sy
...
feat/kimic
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02f1a39e2a | ||
|
|
a79c0c9d5d | ||
|
|
a0ec802403 | ||
|
|
ee7f37c5c7 | ||
| 83e400d4aa | |||
|
|
24bab6f882 | ||
|
|
100e3fc416 | ||
|
|
8494ee344b | ||
|
|
9a100be8d1 | ||
| 276f2c32dd | |||
| 973f3bbe5a | |||
|
|
5f549bf1f6 | ||
|
|
6685388357 | ||
| a95da9e73d | |||
| 5e8380b858 | |||
|
|
266d6ec008 |
22
configs/llama-server.service
Normal file
22
configs/llama-server.service
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=llama.cpp inference server for Timmy
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=root
|
||||||
|
WorkingDirectory=/root/timmy
|
||||||
|
ExecStart=/root/timmy/llama-server \
|
||||||
|
-m /root/timmy/models/hermes-3-8b.Q4_K_M.gguf \
|
||||||
|
--host 127.0.0.1 \
|
||||||
|
--port 8081 \
|
||||||
|
-c 8192 \
|
||||||
|
-np 1 \
|
||||||
|
--jinja \
|
||||||
|
-ngl 0
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
Environment="HOME=/root"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
17
configs/timmy-agent.service
Normal file
17
configs/timmy-agent.service
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Timmy Agent Harness
|
||||||
|
After=llama-server.service
|
||||||
|
Requires=llama-server.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=root
|
||||||
|
WorkingDirectory=/root/timmy
|
||||||
|
ExecStart=/root/timmy/venv/bin/python /root/timmy/timmy-home/agent/agent_daemon.py
|
||||||
|
Restart=always
|
||||||
|
RestartSec=30
|
||||||
|
Environment="HOME=/root"
|
||||||
|
Environment="TIMMY_MODEL_URL=http://127.0.0.1:8081"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
16
configs/timmy-health.service
Normal file
16
configs/timmy-health.service
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Timmy Health Check Daemon
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=root
|
||||||
|
WorkingDirectory=/root/timmy
|
||||||
|
ExecStart=/root/timmy/venv/bin/python /root/timmy/uni-wizard/daemons/health_daemon.py
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
Environment="HOME=/root"
|
||||||
|
Environment="PYTHONPATH=/root/timmy/uni-wizard"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
16
configs/timmy-task-router.service
Normal file
16
configs/timmy-task-router.service
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Timmy Task Router Daemon
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=root
|
||||||
|
WorkingDirectory=/root/timmy
|
||||||
|
ExecStart=/root/timmy/venv/bin/python /root/timmy/uni-wizard/daemons/task_router.py
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
Environment="HOME=/root"
|
||||||
|
Environment="PYTHONPATH=/root/timmy/uni-wizard"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
125
docs/SCORECARD.md
Normal file
125
docs/SCORECARD.md
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
# Scorecard Generator Documentation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Scorecard Generator analyzes overnight loop JSONL data and produces comprehensive reports with statistics, trends, and recommendations.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Basic Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate scorecard from default input directory
|
||||||
|
python uni-wizard/scripts/generate_scorecard.py
|
||||||
|
|
||||||
|
# Specify custom input/output directories
|
||||||
|
python uni-wizard/scripts/generate_scorecard.py \
|
||||||
|
--input ~/shared/overnight-loop \
|
||||||
|
--output ~/timmy/reports
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cron Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate scorecard every morning at 6 AM
|
||||||
|
0 6 * * * /root/timmy/venv/bin/python /root/timmy/uni-wizard/scripts/generate_scorecard.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Input Format
|
||||||
|
|
||||||
|
JSONL files in `~/shared/overnight-loop/*.jsonl`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"task": "read-soul", "status": "pass", "duration_s": 19.7, "timestamp": "2026-03-29T21:54:12Z"}
|
||||||
|
{"task": "check-health", "status": "fail", "duration_s": 5.2, "error": "timeout", "timestamp": "2026-03-29T22:15:33Z"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Fields:
|
||||||
|
- `task`: Task identifier
|
||||||
|
- `status`: "pass" or "fail"
|
||||||
|
- `duration_s`: Execution time in seconds
|
||||||
|
- `timestamp`: ISO 8601 timestamp
|
||||||
|
- `error`: Error message (for failed tasks)
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
### JSON Report
|
||||||
|
|
||||||
|
`~/timmy/reports/scorecard_YYYYMMDD.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"generated_at": "2026-03-30T06:00:00Z",
|
||||||
|
"summary": {
|
||||||
|
"total_tasks": 100,
|
||||||
|
"passed": 95,
|
||||||
|
"failed": 5,
|
||||||
|
"pass_rate": 95.0,
|
||||||
|
"duration_stats": {
|
||||||
|
"avg": 12.5,
|
||||||
|
"median": 10.2,
|
||||||
|
"p95": 45.0,
|
||||||
|
"min": 1.2,
|
||||||
|
"max": 120.5
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"by_task": {...},
|
||||||
|
"by_hour": {...},
|
||||||
|
"errors": {...},
|
||||||
|
"recommendations": [...]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Markdown Report
|
||||||
|
|
||||||
|
`~/timmy/reports/scorecard_YYYYMMDD.md`:
|
||||||
|
|
||||||
|
- Executive summary with pass/fail counts
|
||||||
|
- Duration statistics (avg, median, p95)
|
||||||
|
- Per-task breakdown with pass rates
|
||||||
|
- Hourly timeline showing performance trends
|
||||||
|
- Error analysis with frequency counts
|
||||||
|
- Actionable recommendations
|
||||||
|
|
||||||
|
## Report Interpretation
|
||||||
|
|
||||||
|
### Pass Rate Thresholds
|
||||||
|
|
||||||
|
| Pass Rate | Status | Action |
|
||||||
|
|-----------|--------|--------|
|
||||||
|
| 95%+ | ✅ Excellent | Continue current operations |
|
||||||
|
| 85-94% | ⚠️ Good | Monitor for degradation |
|
||||||
|
| 70-84% | ⚠️ Fair | Review failing tasks |
|
||||||
|
| <70% | ❌ Poor | Immediate investigation required |
|
||||||
|
|
||||||
|
### Duration Guidelines
|
||||||
|
|
||||||
|
| Duration | Assessment |
|
||||||
|
|----------|------------|
|
||||||
|
| <5s | Fast |
|
||||||
|
| 5-15s | Normal |
|
||||||
|
| 15-30s | Slow |
|
||||||
|
| >30s | Very slow - consider optimization |
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### No JSONL files found
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check input directory
|
||||||
|
ls -la ~/shared/overnight-loop/
|
||||||
|
|
||||||
|
# Ensure Syncthing is syncing
|
||||||
|
systemctl status syncthing@root
|
||||||
|
```
|
||||||
|
|
||||||
|
### Malformed lines
|
||||||
|
|
||||||
|
The generator skips malformed lines with a warning. Check the JSONL files for syntax errors.
|
||||||
|
|
||||||
|
### Empty reports
|
||||||
|
|
||||||
|
If no data exists, verify:
|
||||||
|
1. Overnight loop is running and writing JSONL
|
||||||
|
2. File permissions allow reading
|
||||||
|
3. Input path is correct
|
||||||
@@ -1,15 +1,24 @@
|
|||||||
{
|
{
|
||||||
"tick_id": "20260328_015026",
|
"tick_id": "20260330_212052",
|
||||||
"timestamp": "2026-03-28T01:50:26.595915+00:00",
|
"timestamp": "2026-03-30T21:20:52.930215+00:00",
|
||||||
"perception": {
|
"perception": {
|
||||||
"gitea_alive": true,
|
"gitea_alive": true,
|
||||||
"model_health": {
|
"model_health": {
|
||||||
"ollama_running": true,
|
"provider": "local-llama.cpp",
|
||||||
"models_loaded": [],
|
"provider_base_url": "http://localhost:8081/v1",
|
||||||
|
"provider_model": "hermes4:14b",
|
||||||
|
"local_inference_running": true,
|
||||||
|
"models_loaded": [
|
||||||
|
"NousResearch_Hermes-4-14B-Q4_K_M.gguf"
|
||||||
|
],
|
||||||
"api_responding": true,
|
"api_responding": true,
|
||||||
"inference_ok": false,
|
"inference_ok": false,
|
||||||
"inference_error": "HTTP Error 404: Not Found",
|
"inference_error": "HTTP Error 500: Internal Server Error",
|
||||||
"timestamp": "2026-03-28T01:50:26.594893+00:00"
|
"latest_session": "session_d8c25163-9934-4ab2-9158-ff18a31e30f5.json",
|
||||||
|
"latest_export": "session_d8c25163-9934-4ab2-9158-ff18a31e30f5.json",
|
||||||
|
"export_lag_minutes": 0,
|
||||||
|
"export_fresh": true,
|
||||||
|
"timestamp": "2026-03-30T21:20:52.929294+00:00"
|
||||||
},
|
},
|
||||||
"Timmy_Foundation/the-nexus": {
|
"Timmy_Foundation/the-nexus": {
|
||||||
"open_issues": 1,
|
"open_issues": 1,
|
||||||
@@ -21,7 +30,7 @@
|
|||||||
},
|
},
|
||||||
"huey_alive": true
|
"huey_alive": true
|
||||||
},
|
},
|
||||||
"previous_tick": "20260328_014026",
|
"previous_tick": "20260328_015026",
|
||||||
"decision": {
|
"decision": {
|
||||||
"actions": [],
|
"actions": [],
|
||||||
"severity": "fallback",
|
"severity": "fallback",
|
||||||
|
|||||||
260
scripts/provision-timmy-vps.sh
Normal file
260
scripts/provision-timmy-vps.sh
Normal file
@@ -0,0 +1,260 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Timmy VPS Provisioning Script
|
||||||
|
# Transforms fresh Ubuntu 22.04+ VPS into sovereign local-first wizard
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
TIMMY_USER="${TIMMY_USER:-root}"
|
||||||
|
TIMMY_HOME="${TIMMY_HOME:-/root}"
|
||||||
|
TIMMY_DIR="$TIMMY_HOME/timmy"
|
||||||
|
REPO_URL="${REPO_URL:-http://143.198.27.163:3000/Timmy_Foundation/timmy-home.git}"
|
||||||
|
MODEL_URL="${MODEL_URL:-https://huggingface.co/TheBloke/Hermes-3-Llama-3.1-8B-GGUF/resolve/main/hermes-3-llama-3.1-8b.Q4_K_M.gguf}"
|
||||||
|
MODEL_NAME="${MODEL_NAME:-hermes-3-8b.Q4_K_M.gguf}"
|
||||||
|
|
||||||
|
echo "========================================"
|
||||||
|
echo " Timmy VPS Provisioning"
|
||||||
|
echo "========================================"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Colors
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# log MESSAGE -- print an informational progress message with a green [TIMMY] tag.
log() {
    echo -e "${GREEN}[TIMMY]${NC} $1"
}

# warn MESSAGE -- print a non-fatal warning with a yellow [WARN] tag; does not exit.
warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

# error MESSAGE -- print an error with a red [ERROR] tag; caller decides whether to exit.
error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
|
||||||
|
|
||||||
|
# Check if running as root
|
||||||
|
if [ "$EUID" -ne 0 ]; then
|
||||||
|
error "Please run as root"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check Ubuntu version
|
||||||
|
if ! grep -q "Ubuntu 22.04\|Ubuntu 24.04" /etc/os-release; then
|
||||||
|
warn "Not Ubuntu 22.04/24.04 - may not work correctly"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Step 1/8: Installing system dependencies..."
|
||||||
|
export DEBIAN_FRONTEND=noninteractive
|
||||||
|
apt-get update -qq
|
||||||
|
apt-get install -y -qq \
|
||||||
|
build-essential \
|
||||||
|
cmake \
|
||||||
|
git \
|
||||||
|
curl \
|
||||||
|
wget \
|
||||||
|
python3 \
|
||||||
|
python3-pip \
|
||||||
|
python3-venv \
|
||||||
|
libopenblas-dev \
|
||||||
|
pkg-config \
|
||||||
|
ufw \
|
||||||
|
jq \
|
||||||
|
sqlite3 \
|
||||||
|
libsqlite3-dev \
|
||||||
|
2>&1 | tail -5
|
||||||
|
|
||||||
|
log "Step 2/8: Setting up directory structure..."
|
||||||
|
mkdir -p "$TIMMY_DIR"/{soul,scripts,logs,shared,models,configs}
|
||||||
|
mkdir -p "$TIMMY_HOME/.config/systemd/user"
|
||||||
|
|
||||||
|
log "Step 3/8: Building llama.cpp from source..."
|
||||||
|
if [ ! -f "$TIMMY_DIR/llama-server" ]; then
|
||||||
|
cd /tmp
|
||||||
|
git clone --depth 1 https://github.com/ggerganov/llama.cpp.git 2>/dev/null || true
|
||||||
|
cd llama.cpp
|
||||||
|
|
||||||
|
# Build with OpenBLAS for CPU optimization
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_BLAS=ON \
|
||||||
|
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release
|
||||||
|
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
|
||||||
|
# Copy binaries
|
||||||
|
cp build/bin/llama-server "$TIMMY_DIR/"
|
||||||
|
cp build/bin/llama-cli "$TIMMY_DIR/"
|
||||||
|
|
||||||
|
log "llama.cpp built successfully"
|
||||||
|
else
|
||||||
|
log "llama.cpp already exists, skipping build"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Step 4/8: Downloading model weights..."
|
||||||
|
if [ ! -f "$TIMMY_DIR/models/$MODEL_NAME" ]; then
|
||||||
|
cd "$TIMMY_DIR/models"
|
||||||
|
wget -q --show-progress "$MODEL_URL" -O "$MODEL_NAME" || {
|
||||||
|
error "Failed to download model. Continuing anyway..."
|
||||||
|
}
|
||||||
|
log "Model downloaded"
|
||||||
|
else
|
||||||
|
log "Model already exists, skipping download"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Step 5/8: Setting up llama-server systemd service..."
|
||||||
|
cat > /etc/systemd/system/llama-server.service << EOF
|
||||||
|
[Unit]
|
||||||
|
Description=llama.cpp inference server for Timmy
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=$TIMMY_USER
|
||||||
|
WorkingDirectory=$TIMMY_DIR
|
||||||
|
ExecStart=$TIMMY_DIR/llama-server \\
|
||||||
|
-m $TIMMY_DIR/models/$MODEL_NAME \\
|
||||||
|
--host 127.0.0.1 \\
|
||||||
|
--port 8081 \\
|
||||||
|
-c 8192 \\
|
||||||
|
-np 1 \\
|
||||||
|
--jinja \\
|
||||||
|
-ngl 0
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
Environment="HOME=$TIMMY_HOME"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl enable llama-server.service
|
||||||
|
|
||||||
|
log "Step 6/8: Cloning timmy-home repo and setting up agent..."
|
||||||
|
if [ ! -d "$TIMMY_DIR/timmy-home" ]; then
|
||||||
|
cd "$TIMMY_DIR"
|
||||||
|
git clone "$REPO_URL" timmy-home 2>/dev/null || warn "Could not clone repo"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Create minimal Python environment for agent
|
||||||
|
if [ ! -d "$TIMMY_DIR/venv" ]; then
|
||||||
|
python3 -m venv "$TIMMY_DIR/venv"
|
||||||
|
"$TIMMY_DIR/venv/bin/pip" install -q requests pyyaml 2>&1 | tail -3
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Step 7/8: Setting up Timmy agent systemd service..."
|
||||||
|
cat > /etc/systemd/system/timmy-agent.service << EOF
|
||||||
|
[Unit]
|
||||||
|
Description=Timmy Agent Harness
|
||||||
|
After=llama-server.service
|
||||||
|
Requires=llama-server.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=$TIMMY_USER
|
||||||
|
WorkingDirectory=$TIMMY_DIR
|
||||||
|
ExecStart=$TIMMY_DIR/venv/bin/python $TIMMY_DIR/timmy-home/agent/agent_daemon.py
|
||||||
|
Restart=always
|
||||||
|
RestartSec=30
|
||||||
|
Environment="HOME=$TIMMY_HOME"
|
||||||
|
Environment="TIMMY_MODEL_URL=http://127.0.0.1:8081"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl enable timmy-agent.service
|
||||||
|
|
||||||
|
log "Step 8/8: Configuring firewall..."
|
||||||
|
# Reset UFW
|
||||||
|
ufw --force reset 2>/dev/null || true
|
||||||
|
ufw default deny incoming
|
||||||
|
ufw default allow outgoing
|
||||||
|
|
||||||
|
# Allow SSH
|
||||||
|
ufw allow 22/tcp
|
||||||
|
|
||||||
|
# Allow Syncthing (sync protocol)
|
||||||
|
ufw allow 22000/tcp
|
||||||
|
ufw allow 22000/udp
|
||||||
|
|
||||||
|
# Allow Syncthing (discovery)
|
||||||
|
ufw allow 21027/udp
|
||||||
|
|
||||||
|
# Note: llama-server on 8081 is NOT exposed (localhost only)
|
||||||
|
|
||||||
|
ufw --force enable
|
||||||
|
|
||||||
|
log "Starting services..."
|
||||||
|
systemctl start llama-server.service || warn "llama-server failed to start (may need model)"
|
||||||
|
|
||||||
|
# Wait for llama-server to be ready
|
||||||
|
log "Waiting for llama-server to be ready..."
|
||||||
|
for i in {1..30}; do
|
||||||
|
if curl -s http://127.0.0.1:8081/health >/dev/null 2>&1; then
|
||||||
|
log "llama-server is healthy!"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
|
||||||
|
# Create status script
|
||||||
|
cat > "$TIMMY_DIR/scripts/status.sh" << 'EOF'
|
||||||
|
#!/bin/bash
|
||||||
|
echo "=== Timmy VPS Status ==="
|
||||||
|
echo ""
|
||||||
|
echo "Services:"
|
||||||
|
systemctl is-active llama-server.service && echo " llama-server: RUNNING" || echo " llama-server: STOPPED"
|
||||||
|
systemctl is-active timmy-agent.service && echo " timmy-agent: RUNNING" || echo " timmy-agent: STOPPED"
|
||||||
|
echo ""
|
||||||
|
echo "Inference Health:"
|
||||||
|
curl -s http://127.0.0.1:8081/health | jq . 2>/dev/null || echo " Not responding"
|
||||||
|
echo ""
|
||||||
|
echo "Disk Usage:"
|
||||||
|
df -h $HOME | tail -1
|
||||||
|
echo ""
|
||||||
|
echo "Memory:"
|
||||||
|
free -h | grep Mem
|
||||||
|
EOF
|
||||||
|
chmod +x "$TIMMY_DIR/scripts/status.sh"
|
||||||
|
|
||||||
|
# Create README
|
||||||
|
cat > "$TIMMY_DIR/README.txt" << EOF
|
||||||
|
Timmy Sovereign Wizard VPS
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Quick Commands:
|
||||||
|
$TIMMY_DIR/scripts/status.sh - Check system status
|
||||||
|
systemctl status llama-server - Check inference service
|
||||||
|
systemctl status timmy-agent - Check agent service
|
||||||
|
|
||||||
|
Directories:
|
||||||
|
$TIMMY_DIR/models/ - AI model weights
|
||||||
|
$TIMMY_DIR/soul/ - SOUL.md and conscience files
|
||||||
|
$TIMMY_DIR/logs/ - Agent logs
|
||||||
|
$TIMMY_DIR/shared/ - Syncthing shared folder
|
||||||
|
|
||||||
|
Inference Endpoint:
|
||||||
|
http://127.0.0.1:8081 (localhost only)
|
||||||
|
|
||||||
|
Provisioning complete!
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "========================================"
|
||||||
|
log "Provisioning Complete!"
|
||||||
|
echo "========================================"
|
||||||
|
echo ""
|
||||||
|
echo "Status:"
|
||||||
|
"$TIMMY_DIR/scripts/status.sh"
|
||||||
|
echo ""
|
||||||
|
echo "Next steps:"
|
||||||
|
echo " 1. Run syncthing setup: curl -sL $REPO_URL/raw/branch/main/scripts/setup-syncthing.sh | bash"
|
||||||
|
echo " 2. Check inference: curl http://127.0.0.1:8081/health"
|
||||||
|
echo " 3. Review logs: journalctl -u llama-server -f"
|
||||||
|
echo ""
|
||||||
127
uni-wizard/README.md
Normal file
127
uni-wizard/README.md
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# Uni-Wizard Architecture
|
||||||
|
|
||||||
|
## Vision
|
||||||
|
|
||||||
|
A single wizard harness that elegantly routes all API interactions through one unified interface. No more fragmented wizards - one consciousness, infinite capabilities.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ UNI-WIZARD HARNESS │
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||||
|
│ │ System │ │ Git │ │ Network │ │
|
||||||
|
│ │ Tools │◄──►│ Tools │◄──►│ Tools │ │
|
||||||
|
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ └──────────────────┼──────────────────┘ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌───────────────┐ │
|
||||||
|
│ │ Tool Router │ │
|
||||||
|
│ │ (Registry) │ │
|
||||||
|
│ └───────┬───────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ┌──────────────────┼──────────────────┐ │
|
||||||
|
│ ▼ ▼ ▼ │
|
||||||
|
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||||
|
│ │ Local │ │ Gitea │ │ Relay │ │
|
||||||
|
│ │ llama.cpp │ │ API │ │ Nostr │ │
|
||||||
|
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌───────────────┐
|
||||||
|
│ LLM (local) │
|
||||||
|
│ Hermes-3 8B │
|
||||||
|
└───────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Design Principles
|
||||||
|
|
||||||
|
1. **Single Entry Point**: One harness, all capabilities
|
||||||
|
2. **Unified Registry**: All tools registered centrally
|
||||||
|
3. **Elegant Routing**: Tools discover and route automatically
|
||||||
|
4. **Local-First**: No cloud dependencies
|
||||||
|
5. **Self-Healing**: Tools can restart, reconnect, recover
|
||||||
|
|
||||||
|
## Tool Categories
|
||||||
|
|
||||||
|
### System Layer
|
||||||
|
- `system_info` — OS, CPU, RAM, disk, uptime
|
||||||
|
- `process_manager` — list, start, stop processes
|
||||||
|
- `service_controller` — systemd service management
|
||||||
|
- `health_monitor` — system health checks
|
||||||
|
|
||||||
|
### Git Layer
|
||||||
|
- `git_operations` — status, log, commit, push, pull
|
||||||
|
- `repo_manager` — clone, branch, merge
|
||||||
|
- `pr_handler` — create, review, merge PRs
|
||||||
|
|
||||||
|
### Network Layer
|
||||||
|
- `http_client` — GET, POST, PUT, DELETE
|
||||||
|
- `gitea_client` — full Gitea API wrapper
|
||||||
|
- `nostr_client` — relay communication
|
||||||
|
- `api_router` — generic API endpoint handler
|
||||||
|
|
||||||
|
### File Layer
|
||||||
|
- `file_operations` — read, write, append, search
|
||||||
|
- `directory_manager` — tree, list, navigate
|
||||||
|
- `archive_handler` — zip, tar, compress
|
||||||
|
|
||||||
|
## Registry System
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tools/registry.py
|
||||||
|
class ToolRegistry:
|
||||||
|
def __init__(self):
|
||||||
|
self.tools = {}
|
||||||
|
|
||||||
|
def register(self, name, handler, schema):
|
||||||
|
self.tools[name] = {
|
||||||
|
'handler': handler,
|
||||||
|
'schema': schema,
|
||||||
|
'description': handler.__doc__
|
||||||
|
}
|
||||||
|
|
||||||
|
def execute(self, name, params):
|
||||||
|
tool = self.tools.get(name)
|
||||||
|
if not tool:
|
||||||
|
return f"Error: Tool '{name}' not found"
|
||||||
|
try:
|
||||||
|
return tool['handler'](**params)
|
||||||
|
except Exception as e:
|
||||||
|
return f"Error executing {name}: {str(e)}"
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Flow
|
||||||
|
|
||||||
|
1. **User Request** → Natural language task
|
||||||
|
2. **LLM Planning** → Breaks into tool calls
|
||||||
|
3. **Registry Lookup** → Finds appropriate tools
|
||||||
|
4. **Execution** → Tools run in sequence/parallel
|
||||||
|
5. **Response** → Results synthesized and returned
|
||||||
|
|
||||||
|
## Example Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Single harness, multiple capabilities
|
||||||
|
result = harness.execute("""
|
||||||
|
Check system health, pull latest git changes,
|
||||||
|
and create a Gitea issue if tests fail
|
||||||
|
""")
|
||||||
|
```
|
||||||
|
|
||||||
|
This becomes:
|
||||||
|
1. `system_info` → check health
|
||||||
|
2. `git_pull` → update repo
|
||||||
|
3. `run_tests` → execute tests
|
||||||
|
4. `gitea_create_issue` → report failures
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
- **Simplicity**: One harness to maintain
|
||||||
|
- **Power**: All capabilities unified
|
||||||
|
- **Elegance**: Clean routing, no fragmentation
|
||||||
|
- **Resilience**: Self-contained, local-first
|
||||||
9
uni-wizard/daemons/__init__.py
Normal file
9
uni-wizard/daemons/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
"""
|
||||||
|
Uni-Wizard Daemons Package
|
||||||
|
Background services for the uni-wizard architecture
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .health_daemon import HealthDaemon
|
||||||
|
from .task_router import TaskRouter
|
||||||
|
|
||||||
|
__all__ = ['HealthDaemon', 'TaskRouter']
|
||||||
180
uni-wizard/daemons/health_daemon.py
Normal file
180
uni-wizard/daemons/health_daemon.py
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
"""
|
||||||
|
Health Check Daemon for Uni-Wizard
|
||||||
|
Monitors VPS status and exposes health endpoint
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add parent to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from harness import get_harness
|
||||||
|
|
||||||
|
|
||||||
|
class HealthCheckHandler(BaseHTTPRequestHandler):
    """HTTP handler for the uni-wizard health endpoints.

    Routes:
        GET /health -- quick probe; 200 when the harness reports "healthy",
                       503 otherwise.
        GET /status -- full harness/system/health snapshot (always 200).

    Any other path gets a 404.
    """

    def log_message(self, format, *args):
        # Suppress the default per-request stderr logging.
        pass

    def do_GET(self):
        """Dispatch GET requests to the matching endpoint handler."""
        if self.path == '/health':
            self.send_health_response()
        elif self.path == '/status':
            self.send_full_status()
        else:
            self.send_error(404)

    def send_health_response(self):
        """Send a simple health check.

        Asks the harness for a "health_check" result and maps it to an HTTP
        status: 200 only when the parsed payload reports overall == "healthy";
        503 for anything else, including unparseable harness output.
        """
        harness = get_harness()
        result = harness.execute("health_check")

        try:
            health_data = json.loads(result)
            status_code = 200 if health_data.get("overall") == "healthy" else 503
        except (ValueError, TypeError, AttributeError):
            # Fix: narrowed from a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt. These three cover what can
            # actually go wrong here: ValueError (json.JSONDecodeError) for
            # non-JSON harness output, TypeError when `result` is not
            # str/bytes, AttributeError when the JSON is a scalar/list
            # without `.get`. All are treated as "unhealthy".
            status_code = 503
            health_data = {"error": "Health check failed"}

        self.send_response(status_code)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        self.wfile.write(json.dumps(health_data).encode())

    def send_full_status(self):
        """Send full system status.

        Aggregates harness status plus "system_info" and "health_check" tool
        results into one JSON document. NOTE(review): unlike /health, the
        json.loads calls here are unguarded -- malformed harness output will
        surface as a 500 from the base handler; presumably acceptable for a
        localhost-only diagnostics endpoint.
        """
        harness = get_harness()

        status = {
            "timestamp": datetime.now().isoformat(),
            "harness": json.loads(harness.get_status()),
            "system": json.loads(harness.execute("system_info")),
            "health": json.loads(harness.execute("health_check"))
        }

        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        self.wfile.write(json.dumps(status, indent=2).encode())
|
||||||
|
|
||||||
|
|
||||||
|
class HealthDaemon:
    """
    Health monitoring daemon.

    Runs continuously, monitoring:
    - System resources
    - Service status
    - Inference endpoint

    Exposes:
    - HTTP endpoint on port 8082
    - JSON status file at ~/timmy/logs/health.json
    """

    def __init__(self, port: int = 8082, check_interval: int = 60):
        self.port = port
        self.check_interval = check_interval
        self.running = False
        self.server = None
        self.monitor_thread = None
        self.last_health = None

        # Make sure the log directory exists before the first write.
        self.log_path = Path.home() / "timmy" / "logs"
        self.log_path.mkdir(parents=True, exist_ok=True)
        self.health_file = self.log_path / "health.json"

    def start(self):
        """Bring up the HTTP endpoint and the background monitor thread."""
        self.running = True

        # HTTP server runs on its own daemon thread so it never blocks
        # shutdown of the process.
        self.server = HTTPServer(('127.0.0.1', self.port), HealthCheckHandler)
        http_thread = threading.Thread(target=self.server.serve_forever, daemon=True)
        http_thread.start()

        # Periodic health-file writer, also a daemon thread.
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()

        print(f"Health daemon started on http://127.0.0.1:{self.port}")
        print(f" - /health - Quick health check")
        print(f" - /status - Full system status")
        print(f"Health file: {self.health_file}")

    def stop(self):
        """Stop the monitor loop and shut down the HTTP server, if any."""
        self.running = False
        if self.server is not None:
            self.server.shutdown()
        print("Health daemon stopped")

    def _monitor_loop(self):
        """Refresh the health file every check_interval seconds while running."""
        while self.running:
            try:
                self._update_health_file()
            except Exception as e:
                # Brief back-off on unexpected failure, then keep monitoring.
                print(f"Monitor error: {e}")
                time.sleep(5)
            else:
                time.sleep(self.check_interval)

    def _update_health_file(self):
        """Snapshot harness health + system info and write it to disk."""
        harness = get_harness()

        try:
            snapshot = {
                "timestamp": datetime.now().isoformat(),
                "health": json.loads(harness.execute("health_check")),
                "system": json.loads(harness.execute("system_info")),
            }
            self.health_file.write_text(json.dumps(snapshot, indent=2))
            self.last_health = snapshot
        except Exception as e:
            # Best-effort: a failed snapshot must not kill the monitor loop.
            print(f"Failed to update health file: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the health daemon until interrupted.

    Installs SIGINT/SIGTERM handlers so a systemd stop (or Ctrl-C) shuts the
    daemon down cleanly, then parks the main thread in a sleep loop -- both
    the HTTP server and the monitor run on daemon threads, so the main
    thread must stay alive to keep the process running.
    """
    import signal

    daemon = HealthDaemon()

    def signal_handler(sig, frame):
        # Graceful shutdown path: stop threads, then exit with status 0.
        print("\nShutting down...")
        daemon.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    daemon.start()

    # Keep main thread alive
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Fallback for interrupts that bypass the signal handler.
        daemon.stop()


if __name__ == "__main__":
    main()
|
||||||
222
uni-wizard/daemons/task_router.py
Normal file
222
uni-wizard/daemons/task_router.py
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
"""
|
||||||
|
Task Router for Uni-Wizard
|
||||||
|
Polls Gitea for assigned issues and executes them
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Add parent to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from harness import get_harness
|
||||||
|
|
||||||
|
|
||||||
|
class TaskRouter:
    """
    Gitea Task Router.

    Polls Gitea for issues assigned to Timmy and routes them
    to appropriate tools for execution.

    Flow:
    1. Poll Gitea API for open issues assigned to Timmy
    2. Parse issue body for commands/tasks
    3. Route to appropriate tool via harness
    4. Post results back as comments
    5. Close issue if task complete
    """

    def __init__(
        self,
        gitea_url: str = "http://143.198.27.163:3000",
        repo: str = "Timmy_Foundation/timmy-home",
        assignee: str = "timmy",
        poll_interval: int = 60
    ):
        """
        Args:
            gitea_url: Base URL of the Gitea instance to poll.
            repo: "owner/name" of the repository whose issues are routed.
            assignee: Username whose assigned issues are processed.
            poll_interval: Seconds to sleep between successful polls.
        """
        self.gitea_url = gitea_url
        self.repo = repo
        self.assignee = assignee
        self.poll_interval = poll_interval
        self.running = False
        self.harness = get_harness()
        # Issue numbers already handled this process lifetime; prevents
        # re-processing the same open issue on every poll cycle.
        self.processed_issues = set()

        # JSONL event log under ~/timmy/logs/
        self.log_path = Path.home() / "timmy" / "logs"
        self.log_path.mkdir(parents=True, exist_ok=True)
        self.router_log = self.log_path / "task_router.jsonl"

    def start(self):
        """Start the polling loop; blocks until stop() flips self.running."""
        self.running = True
        # Fixed: was an f-string with no placeholders.
        print("Task router started")
        print(f"  Polling: {self.gitea_url}")
        print(f"  Assignee: {self.assignee}")
        print(f"  Interval: {self.poll_interval}s")

        while self.running:
            try:
                self._poll_and_route()
                time.sleep(self.poll_interval)
            except Exception as e:
                # Log and back off briefly so a transient failure
                # (network blip, Gitea restart) cannot spin the loop.
                self._log_event("error", {"message": str(e)})
                time.sleep(5)

    def stop(self):
        """Stop the task router (takes effect after the current cycle)."""
        self.running = False
        print("Task router stopped")

    def _poll_and_route(self):
        """Poll for assigned open issues and route any not yet processed."""
        result = self.harness.execute(
            "gitea_list_issues",
            repo=self.repo,
            state="open",
            assignee=self.assignee
        )

        try:
            issues = json.loads(result)
        except (json.JSONDecodeError, TypeError):
            # Fixed: was a bare `except:` that swallowed everything,
            # including KeyboardInterrupt/SystemExit. Non-JSON output
            # means the listing failed; skip this poll cycle.
            return

        for issue in issues.get("issues", []):
            issue_num = issue["number"]

            # Skip already processed
            if issue_num in self.processed_issues:
                continue

            # Process the issue
            self._process_issue(issue)
            self.processed_issues.add(issue_num)

    def _process_issue(self, issue: dict):
        """Route a single issue to a handler and post the result back."""
        issue_num = issue["number"]
        title = issue["title"]

        self._log_event("issue_received", {
            "number": issue_num,
            "title": title
        })

        # Parse title for command hints
        # Format: "[ACTION] Description" or just "Description"
        action = self._parse_action(title)

        # Route to appropriate handler
        if action == "system_check":
            result = self._handle_system_check(issue_num)
        elif action == "git_operation":
            result = self._handle_git_operation(issue_num, issue)
        elif action == "health_report":
            result = self._handle_health_report(issue_num)
        else:
            result = self._handle_generic(issue_num, issue)

        # Post result as comment
        self._post_comment(issue_num, result)

        self._log_event("issue_processed", {
            "number": issue_num,
            "action": action,
            "result": "success" if result else "failed"
        })

    def _parse_action(self, title: str) -> str:
        """Classify an issue title into an action via keyword matching.

        Health keywords win over system keywords, which win over git
        keywords; anything unmatched is "generic".
        """
        title_lower = title.lower()

        if any(kw in title_lower for kw in ["health", "status", "check"]):
            return "health_report"
        elif any(kw in title_lower for kw in ["system", "resource", "disk", "memory"]):
            return "system_check"
        elif any(kw in title_lower for kw in ["git", "commit", "push", "pull", "branch"]):
            return "git_operation"

        return "generic"

    def _handle_system_check(self, issue_num: int) -> str:
        """Run the system_info tool and format the output as markdown."""
        result = self.harness.execute("system_info")
        return f"## System Check Results\n\n```json\n{result}\n```"

    def _handle_health_report(self, issue_num: int) -> str:
        """Run the health_check tool and format the output as markdown."""
        result = self.harness.execute("health_check")
        return f"## Health Report\n\n```json\n{result}\n```"

    def _handle_git_operation(self, issue_num: int, issue: dict) -> str:
        """Run git operations mentioned in the issue body (status/pull)."""
        body = issue.get("body", "")

        # Parse body for git commands
        results = []

        # Check for status request
        if "status" in body.lower():
            result = self.harness.execute("git_status", repo_path="/root/timmy/timmy-home")
            results.append(f"**Git Status:**\n```json\n{result}\n```")

        # Check for pull request
        if "pull" in body.lower():
            result = self.harness.execute("git_pull", repo_path="/root/timmy/timmy-home")
            results.append(f"**Git Pull:**\n{result}")

        if not results:
            results.append("No specific git operation detected in issue body.")

        return "\n\n".join(results)

    def _handle_generic(self, issue_num: int, issue: dict) -> str:
        """Acknowledge an issue that matched no specific handler."""
        return f"Received issue #{issue_num}: {issue['title']}\n\nI'll process this and update shortly."

    def _post_comment(self, issue_num: int, body: str):
        """Post *body* as a comment on the given issue via the harness."""
        result = self.harness.execute(
            "gitea_comment",
            repo=self.repo,
            issue_number=issue_num,
            body=body
        )
        return result

    def _log_event(self, event_type: str, data: dict):
        """Append one event record (timestamp + payload) to the JSONL log."""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "event": event_type,
            **data
        }

        with open(self.router_log, "a") as f:
            f.write(json.dumps(log_entry) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the task router until SIGINT/SIGTERM arrives."""
    import signal

    router = TaskRouter()

    def _shutdown(signum, frame):
        # Graceful exit: stop the poll loop, then terminate the process.
        print("\nShutting down...")
        router.stop()
        sys.exit(0)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _shutdown)

    router.start()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
174
uni-wizard/harness.py
Normal file
174
uni-wizard/harness.py
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
"""
|
||||||
|
Uni-Wizard Harness
|
||||||
|
Single entry point for all capabilities
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add tools to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
|
from tools import registry, call_tool
|
||||||
|
|
||||||
|
|
||||||
|
class UniWizardHarness:
    """Single entry point that routes every capability call through one place.

    Wraps the shared tool registry: system monitoring and control, git
    operations, network requests, the Gitea API and local inference are
    all invoked via ``execute(tool_name, **params)``. Every call is
    recorded in ``self.history``.

    Example:
        harness = UniWizardHarness()
        harness.execute("system_info")
        harness.execute("git_status", repo_path="/path/to/repo")
    """

    def __init__(self):
        # The registry is the module-level singleton; history is per-instance.
        self.registry = registry
        self.history = []

    def list_capabilities(self) -> str:
        """Render every registered tool, grouped by category, as plain text."""
        lines = []
        for cat in self.registry.get_categories():
            lines.append(f"\n{cat.upper()}:")
            for entry in self.registry.get_tools_by_category(cat):
                lines.append(f"  - {entry['name']}: {entry['description']}")
        return "\n".join(lines)

    def execute(self, tool_name: str, **params) -> str:
        """Record the call in history, then dispatch it via the registry.

        Args:
            tool_name: Registered name of the tool to run.
            **params: Keyword arguments forwarded to the tool.

        Returns:
            The tool's string result.
        """
        self.history.append({"tool": tool_name, "params": params})
        return call_tool(tool_name, **params)

    def execute_plan(self, plan: list) -> Dict[str, str]:
        """Run a sequence of tool calls and collect their outputs.

        Args:
            plan: List of dicts with 'tool' and 'params',
                e.g. ``[{"tool": "system_info", "params": {}}]``.

        Returns:
            Mapping of tool name -> result string.
        """
        outcome = {}
        for step in plan:
            name = step.get("tool")
            outcome[name] = self.execute(name, **step.get("params", {}))
        return outcome

    def get_tool_definitions(self) -> str:
        """Tool definitions formatted for an LLM system prompt."""
        return self.registry.get_tool_definitions()

    def get_status(self) -> str:
        """JSON snapshot of registry contents and call-history size."""
        snapshot = {
            "total_tools": len(self.registry.list_tools()),
            "categories": self.registry.get_categories(),
            "tools_by_category": {
                cat: self.registry.list_tools(cat)
                for cat in self.registry.get_categories()
            },
            "execution_history_count": len(self.history)
        }
        return json.dumps(snapshot, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
# Lazily-created process-wide singleton.
_harness = None


def get_harness() -> UniWizardHarness:
    """Return the singleton harness, creating it on first use."""
    global _harness
    if _harness is not None:
        return _harness
    _harness = UniWizardHarness()
    return _harness
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI interface for the harness.

    Commands: list, status, tools, exec <tool> key=value...
    With no arguments, prints usage and exits.
    """
    harness = get_harness()

    # No arguments: print usage text and exit.
    if len(sys.argv) < 2:
        print("Uni-Wizard Harness")
        print("==================")
        print("\nUsage: python harness.py <command> [args]")
        print("\nCommands:")
        print("  list   - List all capabilities")
        print("  status - Show harness status")
        print("  tools  - Show tool definitions (for LLM)")
        print("  exec <tool> - Execute a tool")
        print("\nExamples:")
        print('  python harness.py exec system_info')
        print('  python harness.py exec git_status repo_path=/tmp/timmy-home')
        return

    command = sys.argv[1]

    if command == "list":
        print(harness.list_capabilities())

    elif command == "status":
        print(harness.get_status())

    elif command == "tools":
        print(harness.get_tool_definitions())

    elif command == "exec" and len(sys.argv) >= 3:
        tool_name = sys.argv[2]

        # Parse params from args (key=value format); args without '='
        # are silently ignored.
        params = {}
        for arg in sys.argv[3:]:
            if '=' in arg:
                key, value = arg.split('=', 1)
                # Try to parse as int/bool; everything else stays a string.
                # NOTE(review): isdigit() misses negative numbers — confirm
                # no tool needs a negative int parameter from the CLI.
                if value.isdigit():
                    value = int(value)
                elif value.lower() == 'true':
                    value = True
                elif value.lower() == 'false':
                    value = False
                params[key] = value

        result = harness.execute(tool_name, **params)
        print(result)

    else:
        # Unknown command, or "exec" without a tool name.
        print(f"Unknown command: {command}")
        print("Run without arguments for help")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
388
uni-wizard/scripts/generate_scorecard.py
Normal file
388
uni-wizard/scripts/generate_scorecard.py
Normal file
@@ -0,0 +1,388 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
JSONL Scorecard Generator for Uni-Wizard
|
||||||
|
Analyzes overnight loop results and produces comprehensive reports
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Dict, List, Any
|
||||||
|
import statistics
|
||||||
|
|
||||||
|
|
||||||
|
class ScorecardGenerator:
    """
    Generates scorecards from overnight loop JSONL data.

    Analyzes:
    - Pass/fail rates
    - Response times (avg, median, p95)
    - Per-task breakdowns
    - Error patterns
    - Timeline trends
    """

    def __init__(self, input_dir: str = "~/shared/overnight-loop"):
        # Directory holding *.jsonl task records (one JSON object per line).
        self.input_dir = Path(input_dir).expanduser()
        # Raw task dicts, populated by load_all().
        self.tasks = []
        # Aggregated counters, populated by analyze()/_process_task().
        self.stats = {
            "total": 0,
            "passed": 0,
            "failed": 0,
            "pass_rate": 0.0,
            "durations": [],
            "by_task": defaultdict(lambda: {"total": 0, "passed": 0, "failed": 0, "durations": []}),
            "by_hour": defaultdict(lambda: {"total": 0, "passed": 0, "durations": []}),
            "errors": defaultdict(int)
        }

    def load_jsonl(self, filepath: Path) -> List[Dict]:
        """Load and parse a JSONL file, handling errors gracefully"""
        tasks = []
        with open(filepath, 'r') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    task = json.loads(line)
                    tasks.append(task)
                except json.JSONDecodeError:
                    # Malformed lines are reported but never abort the load.
                    print(f"Warning: Skipping malformed line {line_num} in {filepath}")
                    continue
        return tasks

    def load_all(self):
        """Load all JSONL files from input directory"""
        if not self.input_dir.exists():
            print(f"Input directory not found: {self.input_dir}")
            return

        jsonl_files = list(self.input_dir.glob("*.jsonl"))
        if not jsonl_files:
            print(f"No .jsonl files found in {self.input_dir}")
            return

        # Sorted for deterministic ordering across runs.
        for filepath in sorted(jsonl_files):
            print(f"Loading: {filepath.name}")
            tasks = self.load_jsonl(filepath)
            self.tasks.extend(tasks)

        print(f"Loaded {len(self.tasks)} tasks from {len(jsonl_files)} files")

    def analyze(self):
        """Analyze all loaded tasks"""
        if not self.tasks:
            print("No tasks to analyze")
            return

        for task in self.tasks:
            self._process_task(task)

        # Calculate overall pass rate
        if self.stats["total"] > 0:
            self.stats["pass_rate"] = (self.stats["passed"] / self.stats["total"]) * 100

        print(f"Analysis complete: {self.stats['passed']}/{self.stats['total']} passed ({self.stats['pass_rate']:.1f}%)")

    def _process_task(self, task: Dict):
        """Process a single task record.

        Updates overall, per-task and per-hour counters. Anything whose
        status is not exactly "pass" counts as failed.
        """
        # Basic stats
        self.stats["total"] += 1

        status = task.get("status", "unknown")
        duration = task.get("duration_s", 0)
        task_type = task.get("task", "unknown")
        timestamp = task.get("timestamp", "")

        # Pass/fail
        if status == "pass":
            self.stats["passed"] += 1
            self.stats["by_task"][task_type]["passed"] += 1
        else:
            self.stats["failed"] += 1
            self.stats["by_task"][task_type]["failed"] += 1

            # Track error patterns
            error = task.get("error", "unknown_error")
            self.stats["errors"][error] += 1

        # Durations
        self.stats["durations"].append(duration)
        self.stats["by_task"][task_type]["durations"].append(duration)
        self.stats["by_task"][task_type]["total"] += 1

        # Hourly breakdown
        if timestamp:
            try:
                hour = timestamp[:13]  # YYYY-MM-DDTHH
                self.stats["by_hour"][hour]["total"] += 1
                if status == "pass":
                    self.stats["by_hour"][hour]["passed"] += 1
                self.stats["by_hour"][hour]["durations"].append(duration)
            except:
                # Best-effort: a malformed timestamp must not abort analysis.
                # NOTE(review): bare except also hides real bugs — consider
                # narrowing to (TypeError, KeyError).
                pass

    def calculate_duration_stats(self, durations: List[float]) -> Dict[str, float]:
        """Calculate duration statistics (avg/median/p95/min/max, rounded)."""
        if not durations:
            return {"avg": 0, "median": 0, "p95": 0, "min": 0, "max": 0}

        sorted_durations = sorted(durations)
        n = len(sorted_durations)

        return {
            "avg": round(statistics.mean(durations), 2),
            "median": round(statistics.median(durations), 2),
            # Nearest-rank p95; single-element lists fall back to that element.
            "p95": round(sorted_durations[int(n * 0.95)] if n > 1 else sorted_durations[0], 2),
            "min": round(min(durations), 2),
            "max": round(max(durations), 2)
        }

    def generate_json(self) -> Dict:
        """Generate structured JSON report"""
        duration_stats = self.calculate_duration_stats(self.stats["durations"])

        report = {
            "generated_at": datetime.now().isoformat(),
            "summary": {
                "total_tasks": self.stats["total"],
                "passed": self.stats["passed"],
                "failed": self.stats["failed"],
                "pass_rate": round(self.stats["pass_rate"], 2),
                "duration_stats": duration_stats
            },
            "by_task": {},
            "by_hour": {},
            "errors": dict(self.stats["errors"]),
            "recommendations": self._generate_recommendations()
        }

        # Per-task breakdown
        for task_type, data in self.stats["by_task"].items():
            if data["total"] > 0:
                pass_rate = (data["passed"] / data["total"]) * 100
                report["by_task"][task_type] = {
                    "total": data["total"],
                    "passed": data["passed"],
                    "failed": data["failed"],
                    "pass_rate": round(pass_rate, 2),
                    "duration_stats": self.calculate_duration_stats(data["durations"])
                }

        # Hourly breakdown (chronological order)
        for hour, data in sorted(self.stats["by_hour"].items()):
            if data["total"] > 0:
                pass_rate = (data["passed"] / data["total"]) * 100
                report["by_hour"][hour] = {
                    "total": data["total"],
                    "passed": data["passed"],
                    "pass_rate": round(pass_rate, 2),
                    "avg_duration": round(statistics.mean(data["durations"]), 2) if data["durations"] else 0
                }

        return report

    def generate_markdown(self) -> str:
        """Generate markdown report (built on top of generate_json())."""
        json_report = self.generate_json()

        md = f"""# Overnight Loop Scorecard

**Generated:** {json_report['generated_at']}

---

## Summary

| Metric | Value |
|--------|-------|
| Total Tasks | {json_report['summary']['total_tasks']} |
| Passed | {json_report['summary']['passed']} ✅ |
| Failed | {json_report['summary']['failed']} ❌ |
| **Pass Rate** | **{json_report['summary']['pass_rate']:.1f}%** |

### Duration Statistics

| Metric | Value (seconds) |
|--------|-----------------|
| Average | {json_report['summary']['duration_stats']['avg']} |
| Median | {json_report['summary']['duration_stats']['median']} |
| P95 | {json_report['summary']['duration_stats']['p95']} |
| Min | {json_report['summary']['duration_stats']['min']} |
| Max | {json_report['summary']['duration_stats']['max']} |

---

## Per-Task Breakdown

| Task | Total | Passed | Failed | Pass Rate | Avg Duration |
|------|-------|--------|--------|-----------|--------------|
"""

        # Sort by pass rate (ascending - worst first)
        sorted_tasks = sorted(
            json_report['by_task'].items(),
            key=lambda x: x[1]['pass_rate']
        )

        for task_type, data in sorted_tasks:
            status = "✅" if data['pass_rate'] >= 90 else "⚠️" if data['pass_rate'] >= 70 else "❌"
            md += f"| {task_type} | {data['total']} | {data['passed']} | {data['failed']} | {status} {data['pass_rate']:.1f}% | {data['duration_stats']['avg']}s |\n"

        md += """
---

## Timeline (Hourly)

| Hour | Tasks | Passed | Pass Rate | Avg Duration |
|------|-------|--------|-----------|--------------|
"""

        for hour, data in sorted(json_report['by_hour'].items()):
            trend = "📈" if data['pass_rate'] >= 90 else "📊" if data['pass_rate'] >= 70 else "📉"
            md += f"| {hour} | {data['total']} | {data['passed']} | {trend} {data['pass_rate']:.1f}% | {data['avg_duration']}s |\n"

        md += """
---

## Error Analysis

| Error Pattern | Count |
|---------------|-------|
"""

        # Most frequent errors first.
        for error, count in sorted(json_report['errors'].items(), key=lambda x: x[1], reverse=True):
            md += f"| {error} | {count} |\n"

        md += """
---

## Recommendations

"""

        for rec in json_report['recommendations']:
            md += f"- {rec}\n"

        md += """
---

*Generated by Uni-Wizard Scorecard Generator*
"""

        return md

    def _generate_recommendations(self) -> List[str]:
        """Generate human-readable recommendations based on the stats."""
        recommendations = []

        # Check overall pass rate
        if self.stats["pass_rate"] < 70:
            recommendations.append(f"⚠️ Overall pass rate ({self.stats['pass_rate']:.1f}%) is concerning. Review infrastructure health.")
        elif self.stats["pass_rate"] >= 95:
            recommendations.append(f"✅ Excellent pass rate ({self.stats['pass_rate']:.1f}%). System is performing well.")

        # Check for failing tasks (<50% pass rate)
        failing_tasks = []
        for task_type, data in self.stats["by_task"].items():
            if data["total"] > 0:
                pass_rate = (data["passed"] / data["total"]) * 100
                if pass_rate < 50:
                    failing_tasks.append(task_type)

        if failing_tasks:
            recommendations.append(f"❌ Tasks with <50% pass rate: {', '.join(failing_tasks)}. Consider debugging or removing.")

        # Check for slow tasks
        slow_tasks = []
        for task_type, data in self.stats["by_task"].items():
            if data["durations"]:
                avg = statistics.mean(data["durations"])
                if avg > 30:  # Tasks taking >30s on average
                    slow_tasks.append(f"{task_type} ({avg:.1f}s)")

        if slow_tasks:
            recommendations.append(f"⏱️ Slow tasks detected: {', '.join(slow_tasks)}. Consider optimization.")

        # Check error patterns: surface the single most frequent error.
        if self.stats["errors"]:
            top_error = max(self.stats["errors"].items(), key=lambda x: x[1])
            recommendations.append(f"🔍 Most common error: '{top_error[0]}' ({top_error[1]} occurrences). Investigate root cause.")

        # Timeline trend: compare first and last hour's pass rates.
        if len(self.stats["by_hour"]) >= 2:
            hours = sorted(self.stats["by_hour"].keys())
            first_hour = hours[0]
            last_hour = hours[-1]

            first_rate = (self.stats["by_hour"][first_hour]["passed"] / self.stats["by_hour"][first_hour]["total"]) * 100
            last_rate = (self.stats["by_hour"][last_hour]["passed"] / self.stats["by_hour"][last_hour]["total"]) * 100

            if last_rate > first_rate + 10:
                recommendations.append(f"📈 Performance improving over time (+{last_rate - first_rate:.1f}% pass rate).")
            elif last_rate < first_rate - 10:
                recommendations.append(f"📉 Performance degrading over time (-{first_rate - last_rate:.1f}% pass rate). Check for resource exhaustion.")

        return recommendations

    def save_reports(self, output_dir: str = "~/timmy/reports"):
        """Save JSON and markdown reports; returns (json_path, md_path)."""
        output_path = Path(output_dir).expanduser()
        output_path.mkdir(parents=True, exist_ok=True)

        # One report pair per calendar day; a rerun overwrites.
        date_str = datetime.now().strftime("%Y%m%d")

        # Save JSON
        json_file = output_path / f"scorecard_{date_str}.json"
        json_report = self.generate_json()
        with open(json_file, 'w') as f:
            json.dump(json_report, f, indent=2)
        print(f"JSON report saved: {json_file}")

        # Save Markdown
        md_file = output_path / f"scorecard_{date_str}.md"
        md_report = self.generate_markdown()
        with open(md_file, 'w') as f:
            f.write(md_report)
        print(f"Markdown report saved: {md_file}")

        return json_file, md_file
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: parse arguments, crunch the JSONL, write reports."""
    import argparse

    def banner(text):
        # Boxed section header for console output.
        print("=" * 60)
        print(text)
        print("=" * 60)

    cli = argparse.ArgumentParser(description="Generate scorecard from overnight loop JSONL")
    cli.add_argument("--input", "-i", default="~/shared/overnight-loop", help="Input directory with JSONL files")
    cli.add_argument("--output", "-o", default="~/timmy/reports", help="Output directory for reports")
    opts = cli.parse_args()

    banner("UNI-WIZARD SCORECARD GENERATOR")
    print()

    gen = ScorecardGenerator(input_dir=opts.input)
    gen.load_all()
    gen.analyze()

    if gen.stats["total"] == 0:
        print("No data to report")
        return

    json_file, md_file = gen.save_reports(output_dir=opts.output)
    print()
    banner("REPORTS GENERATED")
    print(f"JSON: {json_file}")
    print(f"Markdown: {md_file}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
114
uni-wizard/test_harness.py
Normal file
114
uni-wizard/test_harness.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for Uni-Wizard Harness
|
||||||
|
Exercises all tool categories
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
|
from harness import get_harness
|
||||||
|
|
||||||
|
|
||||||
|
def test_system_tools():
    """Exercise the system-monitoring tool set."""
    print("\n" + "=" * 60)
    print("TESTING SYSTEM TOOLS")
    print("=" * 60)

    hz = get_harness()

    cases = [
        ("system_info", {}),
        ("health_check", {}),
        ("process_list", {"filter_name": "python"}),
        ("disk_usage", {}),
    ]

    for name, kwargs in cases:
        print(f"\n>>> {name}()")
        out = hz.execute(name, **kwargs)
        # Truncate long results so console output stays readable.
        print(out if len(out) <= 500 else out[:500] + "...")
|
||||||
|
|
||||||
|
|
||||||
|
def test_git_tools():
    """Exercise the git-operation tool set."""
    print("\n" + "=" * 60)
    print("TESTING GIT TOOLS")
    print("=" * 60)

    hz = get_harness()

    # Test with timmy-home repo if it exists
    repo_path = "/tmp/timmy-home"

    cases = [
        ("git_status", {"repo_path": repo_path}),
        ("git_log", {"repo_path": repo_path, "count": 5}),
        ("git_branch_list", {"repo_path": repo_path}),
    ]

    for name, kwargs in cases:
        print(f"\n>>> {name}()")
        out = hz.execute(name, **kwargs)
        # Truncate long results so console output stays readable.
        print(out if len(out) <= 500 else out[:500] + "...")
|
||||||
|
|
||||||
|
|
||||||
|
def test_network_tools():
    """Exercise the network/Gitea tool set."""
    print("\n" + "=" * 60)
    print("TESTING NETWORK TOOLS")
    print("=" * 60)

    hz = get_harness()

    cases = [
        ("http_get", {"url": "http://143.198.27.163:3000/api/v1/repos/Timmy_Foundation/timmy-home"}),
        ("gitea_list_issues", {"state": "open"}),
    ]

    for name, kwargs in cases:
        print(f"\n>>> {name}()")
        out = hz.execute(name, **kwargs)
        # Truncate long results so console output stays readable.
        print(out if len(out) <= 500 else out[:500] + "...")
|
||||||
|
|
||||||
|
|
||||||
|
def test_harness_features():
    """Exercise the harness's own management surface (no tool calls)."""
    print("\n" + "=" * 60)
    print("TESTING HARNESS FEATURES")
    print("=" * 60)

    hz = get_harness()

    print("\n>>> list_capabilities()")
    print(hz.list_capabilities())

    print("\n>>> get_status()")
    print(hz.get_status())
|
||||||
|
|
||||||
|
|
||||||
|
def run_all_tests():
    """Run every test group; report the first failure with a traceback."""
    print("UNI-WIZARD HARNESS TEST SUITE")
    print("=============================")

    try:
        for suite in (test_system_tools, test_git_tools,
                      test_network_tools, test_harness_features):
            suite()

        print("\n" + "=" * 60)
        print("✓ ALL TESTS COMPLETED")
        print("=" * 60)

    except Exception as e:
        print(f"\n✗ TEST FAILED: {e}")
        import traceback
        traceback.print_exc()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_all_tests()
|
||||||
24
uni-wizard/tools/__init__.py
Normal file
24
uni-wizard/tools/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
"""
|
||||||
|
Uni-Wizard Tools Package
|
||||||
|
All tools for self-sufficient operation
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .registry import registry, ToolRegistry, ToolResult, tool, call_tool

# Import all tool modules to register them
from . import system_tools
from . import git_tools
from . import network_tools

# Public API of the tools package.
__all__ = [
    'registry',
    'ToolRegistry',
    'ToolResult',
    'tool',
    'call_tool'
]

# Ensure all tools are registered:
# each module's register_all() adds its tools to the shared registry,
# so importing this package is enough to make every tool callable.
system_tools.register_all()
git_tools.register_all()
network_tools.register_all()
|
||||||
448
uni-wizard/tools/git_tools.py
Normal file
448
uni-wizard/tools/git_tools.py
Normal file
@@ -0,0 +1,448 @@
|
|||||||
|
"""
|
||||||
|
Git Tools for Uni-Wizard
|
||||||
|
Repository operations and version control
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .registry import registry
|
||||||
|
|
||||||
|
|
||||||
|
def run_git_command(args: List[str], cwd: str = None) -> tuple:
    """Run ``git`` with the given arguments and capture its output.

    Args:
        args: Arguments passed to git (e.g. ``['status', '--porcelain']``).
        cwd: Working directory for the command, or None for the current one.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple. Never raises: any
        failure to launch the process is reported as returncode 1 with
        the exception text in the stderr slot.
    """
    cmd = ['git', *args]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)
    except Exception as exc:
        return "", str(exc), 1
    return proc.stdout, proc.stderr, proc.returncode
|
||||||
|
|
||||||
|
|
||||||
|
def git_status(repo_path: str = ".") -> str:
    """
    Get git repository status.

    Args:
        repo_path: Path to git repository (default: current directory)

    Returns:
        JSON string with branch, last commit, changed files and remotes,
        or an error string if the path is not a git repository.
    """
    try:
        status = {"repo_path": os.path.abspath(repo_path)}

        # The branch query doubles as the "is this a repo?" probe.
        out, _, code = run_git_command(['branch', '--show-current'], cwd=repo_path)
        if code != 0:
            return f"Error: Not a git repository at {repo_path}"
        status["branch"] = out.strip()

        # Most recent commit, pipe-delimited for trivial parsing.
        out, _, code = run_git_command(
            ['log', '-1', '--format=%H|%s|%an|%ad', '--date=short'], cwd=repo_path
        )
        if code == 0:
            fields = out.strip().split('|')
            if len(fields) >= 4:
                status["last_commit"] = {
                    "hash": fields[0][:8],
                    "message": fields[1],
                    "author": fields[2],
                    "date": fields[3]
                }

        # Working-tree changes via the stable porcelain format
        # (two status chars, a space, then the path).
        out, _, code = run_git_command(['status', '--porcelain'], cwd=repo_path)
        if code == 0:
            changes = [
                {"file": entry[3:], "status": entry[:2].strip()}
                for entry in out.strip().split('\n')
                if entry
            ]
            status["changes"] = changes
            status["has_changes"] = len(changes) > 0

        # Configured remotes (fetch/push lines collapse to name + url).
        out, _, code = run_git_command(['remote', '-v'], cwd=repo_path)
        if code == 0:
            remotes = []
            for entry in out.strip().split('\n'):
                fields = entry.split()
                if len(fields) >= 2:
                    remotes.append({"name": fields[0], "url": fields[1]})
            status["remotes"] = remotes

        return json.dumps(status, indent=2)

    except Exception as e:
        return f"Error getting git status: {str(e)}"
def git_log(repo_path: str = ".", count: int = 10) -> str:
    """
    Get recent commit history.

    Args:
        repo_path: Path to git repository
        count: Number of commits to show (default: 10)

    Returns:
        JSON string with the commit count and a list of commits.
    """
    try:
        out, err, code = run_git_command(
            ['log', f'-{count}', '--format=%H|%s|%an|%ad', '--date=short'],
            cwd=repo_path
        )
        if code != 0:
            return f"Error: {err}"

        commits = []
        for entry in out.strip().split('\n'):
            if not entry:
                continue
            fields = entry.split('|')
            if len(fields) >= 4:
                commits.append({
                    "hash": fields[0][:8],
                    "message": fields[1],
                    "author": fields[2],
                    "date": fields[3]
                })

        return json.dumps({"count": len(commits), "commits": commits}, indent=2)

    except Exception as e:
        return f"Error getting git log: {str(e)}"
def git_pull(repo_path: str = ".") -> str:
    """
    Pull latest changes from remote.

    Args:
        repo_path: Path to git repository

    Returns:
        Human-readable pull result.
    """
    try:
        out, err, code = run_git_command(['pull'], cwd=repo_path)
        if code != 0:
            return f"✗ Pull failed:\n{err}"
        if 'Already up to date' in out:
            return "✓ Already up to date"
        return f"✓ Pull successful:\n{out}"
    except Exception as e:
        return f"Error pulling: {str(e)}"
def git_commit(repo_path: str = ".", message: str = None, files: List[str] = None) -> str:
    """
    Stage and commit changes.

    Args:
        repo_path: Path to git repository
        message: Commit message (required)
        files: Specific files to commit (default: all changes)

    Returns:
        Commit result
    """
    if not message:
        return "Error: commit message is required"

    try:
        # Stage files
        if files:
            for f in files:
                _, stderr, rc = run_git_command(['add', f], cwd=repo_path)
                if rc != 0:
                    return f"✗ Failed to stage {f}: {stderr}"
        else:
            _, stderr, rc = run_git_command(['add', '.'], cwd=repo_path)
            if rc != 0:
                return f"✗ Failed to stage changes: {stderr}"

        # Commit
        stdout, stderr, rc = run_git_command(['commit', '-m', message], cwd=repo_path)

        if rc == 0:
            return f"✓ Commit successful:\n{stdout}"

        # BUG FIX: git reports "nothing to commit" on *stdout*, not stderr,
        # so inspect both streams before declaring the commit a failure.
        if 'nothing to commit' in (stdout + stderr).lower():
            return "✓ Nothing to commit (working tree clean)"
        return f"✗ Commit failed:\n{stderr}"

    except Exception as e:
        return f"Error committing: {str(e)}"
def git_push(repo_path: str = ".", remote: str = "origin", branch: str = None) -> str:
    """
    Push to remote repository.

    Args:
        repo_path: Path to git repository
        remote: Remote name (default: origin)
        branch: Branch to push (default: current branch)

    Returns:
        Push result.
    """
    try:
        target = branch
        if not target:
            # Resolve the currently checked-out branch.
            out, _, code = run_git_command(['branch', '--show-current'], cwd=repo_path)
            if code != 0:
                return "Error: Could not determine current branch"
            target = out.strip()

        _, err, code = run_git_command(['push', remote, target], cwd=repo_path)
        if code == 0:
            return f"✓ Push successful to {remote}/{target}"
        return f"✗ Push failed:\n{err}"

    except Exception as e:
        return f"Error pushing: {str(e)}"
def git_checkout(repo_path: str = ".", branch: str = None, create: bool = False) -> str:
    """
    Checkout a branch.

    Args:
        repo_path: Path to git repository
        branch: Branch name to checkout
        create: Create the branch if it doesn't exist

    Returns:
        Checkout result.
    """
    if not branch:
        return "Error: branch name is required"

    try:
        args = ['checkout', '-b', branch] if create else ['checkout', branch]
        _, err, code = run_git_command(args, cwd=repo_path)
        if code == 0:
            return f"✓ Checked out branch: {branch}"
        return f"✗ Checkout failed:\n{err}"
    except Exception as e:
        return f"Error checking out: {str(e)}"
def git_branch_list(repo_path: str = ".") -> str:
    """
    List all branches.

    Args:
        repo_path: Path to git repository

    Returns:
        JSON string of branches, the current one flagged.
    """
    try:
        out, err, code = run_git_command(['branch', '-a'], cwd=repo_path)
        if code != 0:
            return f"Error: {err}"

        branches = []
        for raw in out.strip().split('\n'):
            if not raw:
                continue
            entry = raw.strip()
            # The checked-out branch is prefixed with '*'.
            is_current = entry.startswith('*')
            if is_current:
                entry = entry[1:].strip()
            branches.append({"name": entry, "current": is_current})

        return json.dumps({"branches": branches}, indent=2)

    except Exception as e:
        return f"Error listing branches: {str(e)}"
# Register all git tools
def register_all():
    """Register every git tool with the shared registry."""
    # Every git tool takes the same optional repo_path parameter.
    repo_param = {
        "type": "string",
        "description": "Path to git repository",
        "default": "."
    }

    # (name, handler, description, extra properties, required names)
    specs = [
        ("git_status", git_status,
         "Get git repository status (branch, changes, last commit)",
         {}, []),
        ("git_log", git_log,
         "Get recent commit history",
         {"count": {
             "type": "integer",
             "description": "Number of commits to show",
             "default": 10
         }}, []),
        ("git_pull", git_pull,
         "Pull latest changes from remote",
         {}, []),
        ("git_commit", git_commit,
         "Stage and commit changes",
         {"message": {
             "type": "string",
             "description": "Commit message (required)"
         },
          "files": {
             "type": "array",
             "description": "Specific files to commit (default: all changes)",
             "items": {"type": "string"}
         }}, ["message"]),
        ("git_push", git_push,
         "Push to remote repository",
         {"remote": {
             "type": "string",
             "description": "Remote name",
             "default": "origin"
         },
          "branch": {
             "type": "string",
             "description": "Branch to push (default: current)"
         }}, []),
        ("git_checkout", git_checkout,
         "Checkout a branch",
         {"branch": {
             "type": "string",
             "description": "Branch name to checkout"
         },
          "create": {
             "type": "boolean",
             "description": "Create branch if it doesn't exist",
             "default": False
         }}, ["branch"]),
        ("git_branch_list", git_branch_list,
         "List all branches",
         {}, []),
    ]

    for name, handler, description, extra_props, required in specs:
        properties = {"repo_path": dict(repo_param)}
        properties.update(extra_props)
        parameters = {"type": "object", "properties": properties}
        if required:
            parameters["required"] = required
        registry.register(
            name=name,
            handler=handler,
            description=description,
            parameters=parameters,
            category="git"
        )


register_all()
459
uni-wizard/tools/network_tools.py
Normal file
459
uni-wizard/tools/network_tools.py
Normal file
@@ -0,0 +1,459 @@
|
|||||||
|
"""
|
||||||
|
Network Tools for Uni-Wizard
|
||||||
|
HTTP client and Gitea API integration
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
from typing import Dict, Optional, Any
|
||||||
|
from base64 import b64encode
|
||||||
|
|
||||||
|
from .registry import registry
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPClient:
    """Simple HTTP client for API calls, built on urllib."""

    def __init__(self, base_url: str = None, auth: tuple = None):
        self.base_url = base_url  # optional prefix for relative URLs
        self.auth = auth          # optional (username, password) pair

    def _make_request(
        self,
        method: str,
        url: str,
        data: Dict = None,
        headers: Dict = None
    ) -> tuple:
        """Make HTTP request and return (body, status_code, error)."""
        try:
            # Resolve relative paths against the configured base URL.
            if self.base_url and not url.startswith('http'):
                target = f"{self.base_url.rstrip('/')}/{url.lstrip('/')}"
            else:
                target = url

            payload = json.dumps(data).encode('utf-8') if data else None

            request = urllib.request.Request(target, data=payload, method=method)
            request.add_header('Content-Type', 'application/json')
            for key, value in (headers or {}).items():
                request.add_header(key, value)

            # HTTP Basic auth when credentials were supplied.
            if self.auth:
                user, password = self.auth
                token = b64encode(f"{user}:{password}".encode()).decode()
                request.add_header('Authorization', f'Basic {token}')

            with urllib.request.urlopen(request, timeout=30) as response:
                return response.read().decode('utf-8'), response.status, None

        except urllib.error.HTTPError as e:
            # HTTP-level errors still carry a useful response body.
            return e.read().decode('utf-8'), e.code, str(e)
        except Exception as e:
            return None, 0, str(e)

    def get(self, url: str) -> tuple:
        return self._make_request('GET', url)

    def post(self, url: str, data: Dict) -> tuple:
        return self._make_request('POST', url, data)

    def put(self, url: str, data: Dict) -> tuple:
        return self._make_request('PUT', url, data)

    def delete(self, url: str) -> tuple:
        return self._make_request('DELETE', url)
def http_get(url: str) -> str:
    """
    Perform HTTP GET request.

    Args:
        url: URL to fetch

    Returns:
        Response body or error message
    """
    body, status, error = HTTPClient().get(url)
    if error:
        return f"Error (HTTP {status}): {error}"
    return body
def http_post(url: str, body: Dict) -> str:
    """
    Perform HTTP POST request with JSON body.

    Args:
        url: URL to post to
        body: JSON body as dictionary

    Returns:
        Response body or error message
    """
    response_body, status, error = HTTPClient().post(url, body)
    if error:
        return f"Error (HTTP {status}): {error}"
    return response_body
# Gitea API Tools
#
# Connection settings for the self-hosted Gitea instance. Defaults match
# the previous hardcoded values; each can be overridden via environment
# variables so credentials never need to live in source control.
import os

GITEA_URL = os.environ.get("GITEA_URL", "http://143.198.27.163:3000")
GITEA_USER = os.environ.get("GITEA_USER", "timmy")
# Empty password disables authentication (callers pass auth=None then).
GITEA_PASS = os.environ.get("GITEA_PASS", "")
def gitea_create_issue(
    repo: str = "Timmy_Foundation/timmy-home",
    title: str = None,
    body: str = None,
    labels: list = None
) -> str:
    """
    Create a Gitea issue.

    Args:
        repo: Repository path (owner/repo)
        title: Issue title (required)
        body: Issue body
        labels: List of label names

    Returns:
        Created issue URL or error
    """
    if not title:
        return "Error: title is required"

    try:
        client = HTTPClient(
            base_url=GITEA_URL,
            auth=(GITEA_USER, GITEA_PASS) if GITEA_PASS else None
        )

        payload = {"title": title, "body": body or ""}
        if labels:
            payload["labels"] = labels

        response, status, error = client.post(f"/api/v1/repos/{repo}/issues", payload)
        if error:
            return f"Error creating issue: {error}"

        created = json.loads(response)
        return f"✓ Issue created: #{created['number']} - {created['html_url']}"

    except Exception as e:
        return f"Error: {str(e)}"
def gitea_comment(
    repo: str = "Timmy_Foundation/timmy-home",
    issue_number: int = None,
    body: str = None
) -> str:
    """
    Comment on a Gitea issue.

    Args:
        repo: Repository path
        issue_number: Issue number (required)
        body: Comment body (required)

    Returns:
        Comment result
    """
    if not issue_number or not body:
        return "Error: issue_number and body are required"

    try:
        client = HTTPClient(
            base_url=GITEA_URL,
            auth=(GITEA_USER, GITEA_PASS) if GITEA_PASS else None
        )

        response, status, error = client.post(
            f"/api/v1/repos/{repo}/issues/{issue_number}/comments",
            {"body": body}
        )
        if error:
            return f"Error posting comment: {error}"

        posted = json.loads(response)
        return f"✓ Comment posted: {posted['html_url']}"

    except Exception as e:
        return f"Error: {str(e)}"
def gitea_list_issues(
    repo: str = "Timmy_Foundation/timmy-home",
    state: str = "open",
    assignee: str = None
) -> str:
    """
    List Gitea issues.

    Args:
        repo: Repository path
        state: open, closed, or all
        assignee: Filter by assignee username

    Returns:
        JSON list of issues
    """
    try:
        client = HTTPClient(
            base_url=GITEA_URL,
            auth=(GITEA_USER, GITEA_PASS) if GITEA_PASS else None
        )

        query = f"/api/v1/repos/{repo}/issues?state={state}"
        if assignee:
            query += f"&assignee={assignee}"

        response, status, error = client.get(query)
        if error:
            return f"Error fetching issues: {error}"

        # Reduce each issue to the fields the agent actually needs.
        simplified = [
            {
                "number": issue["number"],
                "title": issue["title"],
                "state": issue["state"],
                "assignee": issue.get("assignee", {}).get("login") if issue.get("assignee") else None,
                "url": issue["html_url"]
            }
            for issue in json.loads(response)
        ]

        return json.dumps({
            "count": len(simplified),
            "issues": simplified
        }, indent=2)

    except Exception as e:
        return f"Error: {str(e)}"
def gitea_get_issue(repo: str = "Timmy_Foundation/timmy-home", issue_number: int = None) -> str:
    """
    Get details of a specific Gitea issue.

    Args:
        repo: Repository path
        issue_number: Issue number (required)

    Returns:
        Issue details
    """
    if not issue_number:
        return "Error: issue_number is required"

    try:
        client = HTTPClient(
            base_url=GITEA_URL,
            auth=(GITEA_USER, GITEA_PASS) if GITEA_PASS else None
        )

        response, status, error = client.get(
            f"/api/v1/repos/{repo}/issues/{issue_number}"
        )

        if error:
            return f"Error fetching issue: {error}"

        issue = json.loads(response)

        # BUG FIX: Gitea returns null for an empty issue body, which made
        # len(issue["body"]) raise TypeError. Normalize to "" first.
        raw_body = issue.get("body") or ""

        return json.dumps({
            "number": issue["number"],
            "title": issue["title"],
            # Truncate long bodies so the output stays prompt-friendly.
            "body": raw_body[:500] + "..." if len(raw_body) > 500 else raw_body,
            "state": issue["state"],
            "assignee": issue.get("assignee", {}).get("login") if issue.get("assignee") else None,
            "created_at": issue["created_at"],
            "url": issue["html_url"]
        }, indent=2)

    except Exception as e:
        return f"Error: {str(e)}"
# Register all network tools
def register_all():
    """Register HTTP and Gitea tools with the shared registry."""

    def repo_prop(description: str) -> dict:
        """Common 'repo' parameter, varying only in its description."""
        return {
            "type": "string",
            "description": description,
            "default": "Timmy_Foundation/timmy-home"
        }

    # (name, handler, description, properties, required names)
    specs = [
        ("http_get", http_get, "Perform HTTP GET request",
         {"url": {"type": "string", "description": "URL to fetch"}},
         ["url"]),
        ("http_post", http_post, "Perform HTTP POST request with JSON body",
         {"url": {"type": "string", "description": "URL to post to"},
          "body": {"type": "object", "description": "JSON body as dictionary"}},
         ["url", "body"]),
        ("gitea_create_issue", gitea_create_issue, "Create a Gitea issue",
         {"repo": repo_prop("Repository path (owner/repo)"),
          "title": {"type": "string", "description": "Issue title"},
          "body": {"type": "string", "description": "Issue body"},
          "labels": {"type": "array",
                     "description": "List of label names",
                     "items": {"type": "string"}}},
         ["title"]),
        ("gitea_comment", gitea_comment, "Comment on a Gitea issue",
         {"repo": repo_prop("Repository path"),
          "issue_number": {"type": "integer", "description": "Issue number"},
          "body": {"type": "string", "description": "Comment body"}},
         ["issue_number", "body"]),
        ("gitea_list_issues", gitea_list_issues, "List Gitea issues",
         {"repo": repo_prop("Repository path"),
          "state": {"type": "string",
                    "enum": ["open", "closed", "all"],
                    "description": "Issue state",
                    "default": "open"},
          "assignee": {"type": "string",
                       "description": "Filter by assignee username"}},
         []),
        ("gitea_get_issue", gitea_get_issue,
         "Get details of a specific Gitea issue",
         {"repo": repo_prop("Repository path"),
          "issue_number": {"type": "integer", "description": "Issue number"}},
         ["issue_number"]),
    ]

    for name, handler, description, properties, required in specs:
        parameters = {"type": "object", "properties": properties}
        if required:
            parameters["required"] = required
        registry.register(
            name=name,
            handler=handler,
            description=description,
            parameters=parameters,
            category="network"
        )


register_all()
265
uni-wizard/tools/registry.py
Normal file
265
uni-wizard/tools/registry.py
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
"""
|
||||||
|
Uni-Wizard Tool Registry
|
||||||
|
Central registry for all tool capabilities
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import inspect
|
||||||
|
from typing import Dict, Callable, Any, Optional
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from functools import wraps
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ToolSchema:
    """Schema definition for a tool."""
    name: str                        # tool identifier used in tool calls
    description: str                 # human/LLM-readable summary
    parameters: Dict[str, Any]       # JSON Schema describing the parameters
    returns: str                     # description of the return value
    # FIX: annotated Optional[list] — the default is None, not a list.
    examples: Optional[list] = None  # example invocations, if any

    def to_dict(self) -> Dict[str, Any]:
        """Return the schema as a plain dictionary of its fields."""
        return asdict(self)
@dataclass
class ToolResult:
    """Standardized tool execution result."""
    success: bool                              # did the tool run without error?
    data: Any                                  # tool output (any JSON-able value)
    error: Optional[str] = None                # error text when success is False
    execution_time_ms: Optional[float] = None  # wall-clock duration

    def to_json(self) -> str:
        """Serialize all four fields as pretty-printed JSON."""
        payload = {
            'success': self.success,
            'data': self.data,
            'error': self.error,
            'execution_time_ms': self.execution_time_ms
        }
        return json.dumps(payload, indent=2)

    def __str__(self) -> str:
        """Render the data on success, or the error otherwise."""
        return str(self.data) if self.success else f"Error: {self.error}"
class ToolRegistry:
|
||||||
|
"""
|
||||||
|
Central registry for all uni-wizard tools.
|
||||||
|
|
||||||
|
All tools register here with their schemas.
|
||||||
|
The LLM queries available tools via get_tool_definitions().
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._tools: Dict[str, Dict] = {}
|
||||||
|
self._categories: Dict[str, list] = {}
|
||||||
|
|
||||||
|
def register(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
handler: Callable,
|
||||||
|
description: str = None,
|
||||||
|
parameters: Dict = None,
|
||||||
|
category: str = "general",
|
||||||
|
examples: list = None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Register a tool in the registry.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Tool name (used in tool calls)
|
||||||
|
handler: Function to execute
|
||||||
|
description: What the tool does
|
||||||
|
parameters: JSON Schema for parameters
|
||||||
|
category: Tool category (system, git, network, file)
|
||||||
|
examples: Example usages
|
||||||
|
"""
|
||||||
|
# Auto-extract description from docstring if not provided
|
||||||
|
if description is None and handler.__doc__:
|
||||||
|
description = handler.__doc__.strip().split('\n')[0]
|
||||||
|
|
||||||
|
# Auto-extract parameters from function signature
|
||||||
|
if parameters is None:
|
||||||
|
parameters = self._extract_params(handler)
|
||||||
|
|
||||||
|
self._tools[name] = {
|
||||||
|
'name': name,
|
||||||
|
'handler': handler,
|
||||||
|
'description': description or f"Execute {name}",
|
||||||
|
'parameters': parameters,
|
||||||
|
'category': category,
|
||||||
|
'examples': examples or []
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add to category
|
||||||
|
if category not in self._categories:
|
||||||
|
self._categories[category] = []
|
||||||
|
self._categories[category].append(name)
|
||||||
|
|
||||||
|
return self # For chaining
|
||||||
|
|
||||||
|
def _extract_params(self, handler: Callable) -> Dict:
|
||||||
|
"""Extract parameter schema from function signature"""
|
||||||
|
sig = inspect.signature(handler)
|
||||||
|
params = {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {},
|
||||||
|
"required": []
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, param in sig.parameters.items():
|
||||||
|
# Skip 'self', 'cls', and params with defaults
|
||||||
|
if name in ('self', 'cls'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
param_info = {"type": "string"} # Default
|
||||||
|
|
||||||
|
# Try to infer type from annotation
|
||||||
|
if param.annotation != inspect.Parameter.empty:
|
||||||
|
if param.annotation == int:
|
||||||
|
param_info["type"] = "integer"
|
||||||
|
elif param.annotation == float:
|
||||||
|
param_info["type"] = "number"
|
||||||
|
elif param.annotation == bool:
|
||||||
|
param_info["type"] = "boolean"
|
||||||
|
elif param.annotation == list:
|
||||||
|
param_info["type"] = "array"
|
||||||
|
elif param.annotation == dict:
|
||||||
|
param_info["type"] = "object"
|
||||||
|
|
||||||
|
# Add description if in docstring
|
||||||
|
if handler.__doc__:
|
||||||
|
# Simple param extraction from docstring
|
||||||
|
for line in handler.__doc__.split('\n'):
|
||||||
|
if f'{name}:' in line or f'{name} (' in line:
|
||||||
|
desc = line.split(':', 1)[-1].strip()
|
||||||
|
param_info["description"] = desc
|
||||||
|
break
|
||||||
|
|
||||||
|
params["properties"][name] = param_info
|
||||||
|
|
||||||
|
# Required if no default
|
||||||
|
if param.default == inspect.Parameter.empty:
|
||||||
|
params["required"].append(name)
|
||||||
|
|
||||||
|
return params
|
||||||
|
|
||||||
|
def execute(self, name: str, **params) -> ToolResult:
    """
    Execute a registered tool by name.

    Args:
        name: Tool name
        **params: Keyword arguments forwarded to the tool handler

    Returns:
        ToolResult carrying success/failure, data/error, and elapsed time
    """
    import time
    started = time.time()

    def _elapsed_ms() -> float:
        # Wall-clock time since entry, in milliseconds.
        return (time.time() - started) * 1000

    entry = self._tools.get(name)
    if entry is None:
        return ToolResult(
            success=False,
            data=None,
            error=f"Tool '{name}' not found in registry",
            execution_time_ms=_elapsed_ms()
        )

    try:
        data = entry['handler'](**params)
    except Exception as exc:
        # Surface the exception type and message instead of raising.
        return ToolResult(
            success=False,
            data=None,
            error=f"{type(exc).__name__}: {str(exc)}",
            execution_time_ms=_elapsed_ms()
        )

    return ToolResult(
        success=True,
        data=data,
        execution_time_ms=_elapsed_ms()
    )
|
||||||
|
|
||||||
|
def get_tool(self, name: str) -> Optional[Dict]:
    """Return a serializable copy of a tool definition, or None if unknown."""
    entry = self._tools.get(name)
    if entry is None:
        return None
    # Strip the handler: callables are not serializable.
    public_keys = ('name', 'description', 'parameters', 'category', 'examples')
    return {key: entry[key] for key in public_keys}
|
||||||
|
|
||||||
|
def get_tools_by_category(self, category: str) -> list:
    """Return serializable definitions for every tool in *category*.

    Unknown categories yield an empty list; tools whose definition cannot
    be resolved are dropped. Each tool is looked up exactly once (the
    previous version called get_tool twice per name).
    """
    definitions = (self.get_tool(name) for name in self._categories.get(category, []))
    return [tool_def for tool_def in definitions if tool_def]
|
||||||
|
|
||||||
|
def list_tools(self, category: str = None) -> list:
    """List registered tool names; restrict to *category* when given."""
    if not category:
        return list(self._tools.keys())
    return self._categories.get(category, [])
|
||||||
|
|
||||||
|
def get_tool_definitions(self) -> str:
    """
    Render every registered tool (name, description, parameter schema)
    as an indented JSON string for the LLM system prompt.
    """
    payload = [
        {
            "name": name,
            "description": entry['description'],
            "parameters": entry['parameters']
        }
        for name, entry in self._tools.items()
    ]
    return json.dumps(payload, indent=2)
|
||||||
|
|
||||||
|
def get_categories(self) -> list:
    """Return the names of every tool category."""
    return [category for category in self._categories]
|
||||||
|
|
||||||
|
|
||||||
|
# Global registry instance, shared by the @tool decorator and call_tool() below.
registry = ToolRegistry()
|
||||||
|
|
||||||
|
|
||||||
|
def tool(name: str = None, category: str = "general", examples: list = None):
    """
    Decorator that registers a function as a tool in the global registry.

    The registered name defaults to the function's own name.

    Usage:
        @tool(category="system")
        def system_info():
            return {...}
    """
    def decorator(func: Callable):
        registry.register(
            name=name or func.__name__,
            handler=func,
            category=category,
            examples=examples
        )
        # Return the function unchanged so it stays directly callable.
        return func
    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
# Convenience function for quick tool execution
|
||||||
|
def call_tool(name: str, **params) -> str:
    """Execute *name* through the global registry and return str(result)."""
    outcome = registry.execute(name, **params)
    return str(outcome)
|
||||||
377
uni-wizard/tools/system_tools.py
Normal file
377
uni-wizard/tools/system_tools.py
Normal file
@@ -0,0 +1,377 @@
|
|||||||
|
"""
|
||||||
|
System Tools for Uni-Wizard
|
||||||
|
Monitor and control the VPS environment
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import platform
|
||||||
|
import psutil
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from .registry import tool, registry
|
||||||
|
|
||||||
|
|
||||||
|
@tool(category="system")
def system_info() -> str:
    """
    Get comprehensive system information.

    Returns:
        JSON string with OS, CPU, memory, disk, uptime, and load-average
        info, or an "Error ..." string if collection fails.
    """
    gib = 1024 ** 3  # bytes per GiB, used for all size conversions
    try:
        cpu_freq = psutil.cpu_freq()
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')
        boot_time = datetime.fromtimestamp(psutil.boot_time())
        uptime_secs = int((datetime.now() - boot_time).total_seconds())
        # os.getloadavg is POSIX-only; fall back to zeros elsewhere.
        load_avg = os.getloadavg() if hasattr(os, 'getloadavg') else [0, 0, 0]

        info = {
            "hostname": platform.node(),
            "os": {
                "system": platform.system(),
                "release": platform.release(),
                "version": platform.version(),
                "machine": platform.machine()
            },
            "cpu": {
                "count": psutil.cpu_count(),
                # interval=1 blocks for one second while sampling CPU load.
                "percent": psutil.cpu_percent(interval=1),
                "frequency_mhz": cpu_freq.current if cpu_freq else None
            },
            "memory": {
                "total_gb": round(memory.total / gib, 2),
                "available_gb": round(memory.available / gib, 2),
                "percent_used": memory.percent
            },
            "disk": {
                "total_gb": round(disk.total / gib, 2),
                "free_gb": round(disk.free / gib, 2),
                "percent_used": round((disk.used / disk.total) * 100, 1)
            },
            "uptime": {
                "boot_time": boot_time.isoformat(),
                "uptime_seconds": uptime_secs,
                "uptime_human": str(timedelta(seconds=uptime_secs))
            },
            "load_average": {
                "1min": round(load_avg[0], 2),
                "5min": round(load_avg[1], 2),
                "15min": round(load_avg[2], 2)
            }
        }
        return json.dumps(info, indent=2)

    except Exception as e:
        return f"Error getting system info: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
@tool(category="system")
def process_list(filter_name: str = None) -> str:
    """
    List running processes with optional name filter.

    Args:
        filter_name: Optional process name to filter by (case-insensitive
            substring match)

    Returns:
        JSON with process count, the filter used, and the top-50 processes
        by CPU usage (PID, name, CPU%, memory%, status), or an "Error ..."
        string on failure.
    """
    try:
        processes = []
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent', 'status']):
            try:
                info = proc.info
                # psutil may report None for name/cpu_percent (e.g. zombie
                # or just-spawned processes); normalize so the filter and
                # the sort below cannot raise.
                proc_name = info['name'] or ''
                if filter_name and filter_name.lower() not in proc_name.lower():
                    continue
                processes.append({
                    "pid": info['pid'],
                    "name": info['name'],
                    "cpu_percent": info['cpu_percent'] if info['cpu_percent'] is not None else 0,
                    "memory_percent": round(info['memory_percent'], 2) if info['memory_percent'] else 0,
                    "status": info['status']
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        # Sort by CPU usage, busiest first
        processes.sort(key=lambda x: x['cpu_percent'], reverse=True)

        return json.dumps({
            "count": len(processes),
            "filter": filter_name,
            "processes": processes[:50]  # Limit to top 50
        }, indent=2)

    except Exception as e:
        return f"Error listing processes: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
@tool(category="system")
def service_status(service_name: str) -> str:
    """
    Check systemd service status.

    Args:
        service_name: Name of the service (e.g., 'llama-server', 'syncthing@root')

    Returns:
        JSON with fields parsed from `systemctl status` output plus the
        systemctl exit code, or an "Error ..." string on failure.
    """
    # Output markers pulled from `systemctl status`, in match-priority order.
    fields = (
        ('Active:', 'active'),
        ('Loaded:', 'loaded'),
        ('Main PID:', 'pid'),
        ('Memory:', 'memory'),
        ('CPU:', 'cpu'),
    )
    try:
        result = subprocess.run(
            ['systemctl', 'status', service_name, '--no-pager'],
            capture_output=True,
            text=True
        )

        status_info = {"service": service_name}
        for line in result.stdout.split('\n'):
            for marker, key in fields:
                if marker in line:
                    # Keep everything after the first colon on the line.
                    status_info[key] = line.split(':', 1)[1].strip()
                    break

        status_info['exit_code'] = result.returncode
        return json.dumps(status_info, indent=2)

    except Exception as e:
        return f"Error checking service status: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
@tool(category="system")
def service_control(service_name: str, action: str) -> str:
    """
    Control a systemd service (start, stop, restart, enable, disable).

    Args:
        service_name: Name of the service
        action: start, stop, restart, enable, disable, status

    Returns:
        Human-readable success/failure message for the action.
    """
    valid_actions = ['start', 'stop', 'restart', 'enable', 'disable', 'status']
    if action not in valid_actions:
        return f"Invalid action. Use: {', '.join(valid_actions)}"

    try:
        outcome = subprocess.run(
            ['systemctl', action, service_name],
            capture_output=True,
            text=True
        )
    except Exception as e:
        return f"Error controlling service: {str(e)}"

    if outcome.returncode == 0:
        return f"✓ Service '{service_name}' {action} successful"
    return f"✗ Service '{service_name}' {action} failed: {outcome.stderr}"
|
||||||
|
|
||||||
|
|
||||||
|
@tool(category="system")
def health_check() -> str:
    """
    Comprehensive health check of the VPS.

    Checks:
    - System resources (memory, disk)
    - Critical services (llama-server, syncthing@root)
    - Local inference endpoint on port 8081

    Returns:
        JSON health report; "overall" is "healthy" unless the inference
        endpoint or any checked service is down, in which case "degraded".
    """
    try:
        health = {
            "timestamp": datetime.now().isoformat(),
            "overall": "healthy",
            "checks": {}
        }

        # System resources
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        health["checks"]["memory"] = {
            "status": "healthy" if memory.percent < 90 else "warning",
            "percent_used": memory.percent,
            "available_gb": round(memory.available / (1024**3), 2)
        }

        health["checks"]["disk"] = {
            "status": "healthy" if disk.percent < 90 else "warning",
            "percent_used": disk.percent,
            "free_gb": round(disk.free / (1024**3), 2)
        }

        # Check inference endpoint. Was a bare `except:` (which also caught
        # KeyboardInterrupt/SystemExit) with a leaked response object; now
        # catches Exception only and closes the connection deterministically.
        try:
            import urllib.request
            with urllib.request.urlopen('http://127.0.0.1:8081/health', timeout=5):
                pass
            health["checks"]["inference"] = {"status": "healthy", "port": 8081}
        except Exception:
            health["checks"]["inference"] = {"status": "down", "port": 8081}
            health["overall"] = "degraded"

        # Check services
        services = ['llama-server', 'syncthing@root']
        for svc in services:
            result = subprocess.run(['systemctl', 'is-active', svc], capture_output=True, text=True)
            health["checks"][svc] = {
                "status": "healthy" if result.returncode == 0 else "down"
            }
            if result.returncode != 0:
                health["overall"] = "degraded"

        return json.dumps(health, indent=2)

    except Exception as e:
        return f"Error running health check: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
@tool(category="system")
def disk_usage(path: str = "/") -> str:
    """
    Get disk usage for a path.

    Args:
        path: Path to check (default: /)

    Returns:
        JSON disk usage statistics, or an "Error ..." string on failure.
    """
    try:
        usage = psutil.disk_usage(path)
        gib = 1024 ** 3
        stats = {
            "path": path,
            "total_gb": round(usage.total / gib, 2),
            "used_gb": round(usage.used / gib, 2),
            "free_gb": round(usage.free / gib, 2),
            "percent_used": round((usage.used / usage.total) * 100, 1)
        }
        return json.dumps(stats, indent=2)
    except Exception as e:
        return f"Error checking disk usage: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
# Auto-register all tools in this module
|
||||||
|
def register_all():
    """Explicitly register all system tools with curated descriptions.

    NOTE(review): the @tool decorators above have already auto-registered
    these handlers; this pass re-registers them with hand-written
    descriptions and JSON-Schema parameter blocks — assumes
    registry.register overwrites existing entries (confirm in registry.py).
    """
    # (name, handler, description, explicit parameter schema or None)
    specs = [
        (
            "system_info",
            system_info,
            "Get comprehensive system information (OS, CPU, memory, disk, uptime)",
            None,
        ),
        (
            "process_list",
            process_list,
            "List running processes with optional name filter",
            {
                "type": "object",
                "properties": {
                    "filter_name": {
                        "type": "string",
                        "description": "Optional process name to filter by"
                    }
                }
            },
        ),
        (
            "service_status",
            service_status,
            "Check systemd service status",
            {
                "type": "object",
                "properties": {
                    "service_name": {
                        "type": "string",
                        "description": "Name of the systemd service"
                    }
                },
                "required": ["service_name"]
            },
        ),
        (
            "service_control",
            service_control,
            "Control a systemd service (start, stop, restart, enable, disable)",
            {
                "type": "object",
                "properties": {
                    "service_name": {
                        "type": "string",
                        "description": "Name of the service"
                    },
                    "action": {
                        "type": "string",
                        "enum": ["start", "stop", "restart", "enable", "disable", "status"],
                        "description": "Action to perform"
                    }
                },
                "required": ["service_name", "action"]
            },
        ),
        (
            "health_check",
            health_check,
            "Comprehensive health check of VPS (resources, services, inference)",
            None,
        ),
        (
            "disk_usage",
            disk_usage,
            "Get disk usage for a path",
            {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Path to check",
                        "default": "/"
                    }
                }
            },
        ),
    ]

    for name, handler, description, parameters in specs:
        kwargs = {
            "name": name,
            "handler": handler,
            "description": description,
            "category": "system",
        }
        # Only pass `parameters` when an explicit schema exists, mirroring
        # the original per-call signatures (some calls omitted it).
        if parameters is not None:
            kwargs["parameters"] = parameters
        registry.register(**kwargs)


register_all()
|
||||||
401
uniwizard/context_compression_review.md
Normal file
401
uniwizard/context_compression_review.md
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
# Context Compression Review
|
||||||
|
## Gitea Issue: timmy-home #92
|
||||||
|
|
||||||
|
**Date:** 2026-03-30
|
||||||
|
**Reviewer:** Timmy (Agent)
|
||||||
|
**Scope:** `~/.hermes/hermes-agent/agent/context_compressor.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
The Hermes context compressor is a **mature, well-architected implementation** with sophisticated handling of tool call pairs, iterative summary updates, and token-aware tail protection. However, there are several **high-impact gaps** related to fallback chain awareness, early warning systems, and checkpoint integration that should be addressed for production reliability.
|
||||||
|
|
||||||
|
**Overall Grade:** B+ (Solid foundation, needs edge-case hardening)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What the Current Implementation Does Well
|
||||||
|
|
||||||
|
### 1. Structured Summary Template (Lines 276-303)
|
||||||
|
The compressor uses a Pi-mono/OpenCode-inspired structured format:
|
||||||
|
- **Goal**: What the user is trying to accomplish
|
||||||
|
- **Constraints & Preferences**: User preferences, coding style
|
||||||
|
- **Progress**: Done / In Progress / Blocked sections
|
||||||
|
- **Key Decisions**: Important technical decisions with rationale
|
||||||
|
- **Relevant Files**: Files read/modified/created with notes
|
||||||
|
- **Next Steps**: What needs to happen next
|
||||||
|
- **Critical Context**: Values, error messages, config details that would be lost
|
||||||
|
|
||||||
|
This is **best-in-class** compared to most context compression implementations.
|
||||||
|
|
||||||
|
### 2. Iterative Summary Updates (Lines 264-304)
|
||||||
|
The `_previous_summary` mechanism preserves information across multiple compactions:
|
||||||
|
- On first compaction: Summarizes from scratch
|
||||||
|
- On subsequent compactions: Updates previous summary with new progress
|
||||||
|
- Moves items from "In Progress" to "Done" when completed
|
||||||
|
- Accumulates constraints and file references across compactions
|
||||||
|
|
||||||
|
### 3. Token-Budget Tail Protection (Lines 490-539)
|
||||||
|
Instead of fixed message counts, protects the most recent N tokens:
|
||||||
|
```python
|
||||||
|
tail_token_budget = threshold_tokens * summary_target_ratio
|
||||||
|
# Default: 50% of 128K context = 64K threshold → ~13K token tail
|
||||||
|
```
|
||||||
|
This scales automatically with model context window.
|
||||||
|
|
||||||
|
### 4. Tool Call/Result Pair Integrity (Lines 392-450)
|
||||||
|
Sophisticated handling of orphaned tool pairs:
|
||||||
|
- `_sanitize_tool_pairs()`: Removes orphaned results, adds stubs for missing results
|
||||||
|
- `_align_boundary_forward/backward()`: Prevents splitting tool groups
|
||||||
|
- Protects the integrity of the message sequence for API compliance
|
||||||
|
|
||||||
|
### 5. Tool Output Pruning Pre-Pass (Lines 152-182)
|
||||||
|
Cheap first pass that replaces old tool results with placeholders:
|
||||||
|
```python
|
||||||
|
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||||
|
```
|
||||||
|
Only prunes content >200 chars, preserving smaller results.
|
||||||
|
|
||||||
|
### 6. Rich Serialization for Summary Input (Lines 199-248)
|
||||||
|
Includes tool call arguments and truncates intelligently:
|
||||||
|
- Tool results: Up to 3000 chars (with smart truncation keeping head/tail)
|
||||||
|
- Tool calls: Function name AND arguments (truncated to 400 chars if needed)
|
||||||
|
- All roles: 3000 char limit with ellipses
|
||||||
|
|
||||||
|
### 7. Proper Integration with Agent Loop
|
||||||
|
- Initialized in `AIAgent.__init__()` (lines 1191-1203)
|
||||||
|
- Triggered in `_compress_context()` (line 5259)
|
||||||
|
- Resets state in `reset_session_state()` (lines 1263-1271)
|
||||||
|
- Updates token counts via `update_from_response()` (lines 122-126)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What's Missing or Broken
|
||||||
|
|
||||||
|
### 🔴 CRITICAL: No Fallback Chain Context Window Awareness
|
||||||
|
|
||||||
|
**Issue:** When the agent falls back to a model with a smaller context window (e.g., primary Claude 1M tokens → fallback GPT-4 128K tokens), the compressor's threshold is based on the **original model**, not the fallback model.
|
||||||
|
|
||||||
|
**Location:** `run_agent.py` compression initialization (lines 1191-1203)
|
||||||
|
|
||||||
|
**Impact:**
|
||||||
|
- Fallback model may hit context limits before compression triggers
|
||||||
|
- Or compression may trigger too aggressively for smaller models
|
||||||
|
|
||||||
|
**Evidence:**
|
||||||
|
```python
|
||||||
|
# In AIAgent.__init__():
|
||||||
|
self.context_compressor = ContextCompressor(
|
||||||
|
model=self.model, # Original model only
|
||||||
|
# ... no fallback context lengths passed
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix Needed:** Pass fallback chain context lengths and use minimum:
|
||||||
|
```python
|
||||||
|
# Suggested approach:
|
||||||
|
context_lengths = [get_model_context_length(m) for m in [primary] + fallbacks]
|
||||||
|
effective_context = min(context_lengths) # Conservative
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🔴 HIGH: No Pre-Compression Checkpoint
|
||||||
|
|
||||||
|
**Issue:** When compression occurs, the pre-compression state is lost. Users cannot "rewind" to before compression if the summary loses critical information.
|
||||||
|
|
||||||
|
**Location:** `run_agent.py` `_compress_context()` (line 5259)
|
||||||
|
|
||||||
|
**Impact:**
|
||||||
|
- Information loss is irreversible
|
||||||
|
- If summary misses critical context, conversation is corrupted
|
||||||
|
- No audit trail of what was removed
|
||||||
|
|
||||||
|
**Fix Needed:** Create checkpoint before compression:
|
||||||
|
```python
|
||||||
|
def _compress_context(self, messages, system_message, ...):
|
||||||
|
# Create checkpoint BEFORE compression
|
||||||
|
if self._checkpoint_mgr:
|
||||||
|
self._checkpoint_mgr.create_checkpoint(
|
||||||
|
name=f"pre-compression-{self.context_compressor.compression_count}",
|
||||||
|
messages=messages, # Full pre-compression state
|
||||||
|
)
|
||||||
|
compressed = self.context_compressor.compress(messages, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟡 MEDIUM: No Progressive Context Pressure Warnings
|
||||||
|
|
||||||
|
**Issue:** Only a single warning fires, at 85% of the compression threshold (line 7871); compression then triggers abruptly once the threshold itself is crossed. There is no graduated alert system.
|
||||||
|
|
||||||
|
**Location:** `run_agent.py` context pressure check (lines 7865-7872)
|
||||||
|
|
||||||
|
**Current:**
|
||||||
|
```python
|
||||||
|
if _compaction_progress >= 0.85 and not self._context_pressure_warned:
|
||||||
|
self._context_pressure_warned = True
|
||||||
|
```
|
||||||
|
|
||||||
|
**Better:**
|
||||||
|
```python
|
||||||
|
# Progressive warnings at 60%, 75%, 85%, 95%
|
||||||
|
warning_levels = [(0.60, "info"), (0.75, "notice"),
|
||||||
|
(0.85, "warning"), (0.95, "critical")]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟡 MEDIUM: Summary Validation Missing
|
||||||
|
|
||||||
|
**Issue:** No verification that the generated summary actually contains the critical information from the compressed turns.
|
||||||
|
|
||||||
|
**Location:** `context_compressor.py` `_generate_summary()` (lines 250-369)
|
||||||
|
|
||||||
|
**Risk:** If the summarization model fails or produces low-quality output, critical information is silently lost.
|
||||||
|
|
||||||
|
**Fix Needed:** Add summary quality checks:
|
||||||
|
```python
|
||||||
|
def _validate_summary(self, summary: str, turns: list) -> bool:
|
||||||
|
"""Verify summary captures critical information."""
|
||||||
|
# Check for key file paths mentioned in turns
|
||||||
|
# Check for error messages that were present
|
||||||
|
# Check for specific values/IDs
|
||||||
|
# Return False if validation fails, trigger fallback
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟡 MEDIUM: No Semantic Deduplication
|
||||||
|
|
||||||
|
**Issue:** Same information may be repeated across the original turns and the previous summary, leading to bloated input to the summarizer.
|
||||||
|
|
||||||
|
**Location:** `_generate_summary()` iterative update path (lines 264-304)
|
||||||
|
|
||||||
|
**Example:** If the previous summary already mentions "file X was modified", and new turns also mention it, the information appears twice in the summarizer input.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟢 LOW: Tool Result Placeholder Not Actionable
|
||||||
|
|
||||||
|
**Issue:** The placeholder `[Old tool output cleared to save context space]` tells neither the model nor a human reviewing the transcript anything about what was removed.
|
||||||
|
|
||||||
|
**Location:** Line 45
|
||||||
|
|
||||||
|
**Better:**
|
||||||
|
```python
|
||||||
|
# Include tool name and truncated preview
|
||||||
|
_PRUNED_TOOL_PLACEHOLDER_TEMPLATE = (
|
||||||
|
"[Tool output for {tool_name} cleared. "
|
||||||
|
"Preview: {preview}... ({original_chars} chars removed)]"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 🟢 LOW: Compression Metrics Not Tracked
|
||||||
|
|
||||||
|
**Issue:** No tracking of compression ratio, frequency, or information density over time.
|
||||||
|
|
||||||
|
**Useful metrics to track:**
|
||||||
|
- Tokens saved per compression
|
||||||
|
- Compression ratio (input tokens / output tokens)
|
||||||
|
- Frequency of compression (compressions per 100 turns)
|
||||||
|
- Average summary length
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Specific Code Improvements
|
||||||
|
|
||||||
|
### 1. Add Fallback Context Length Detection
|
||||||
|
|
||||||
|
**File:** `run_agent.py` (~line 1191)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Before initializing compressor, collect all context lengths
|
||||||
|
def _get_fallback_context_lengths(self, _agent_cfg: dict) -> list:
|
||||||
|
"""Get context lengths for all models in fallback chain."""
|
||||||
|
lengths = []
|
||||||
|
|
||||||
|
# Primary model
|
||||||
|
lengths.append(get_model_context_length(
|
||||||
|
self.model, base_url=self.base_url,
|
||||||
|
api_key=self.api_key, provider=self.provider
|
||||||
|
))
|
||||||
|
|
||||||
|
# Fallback models from config
|
||||||
|
fallback_providers = _agent_cfg.get("fallback_providers", [])
|
||||||
|
for fb in fallback_providers:
|
||||||
|
if isinstance(fb, dict):
|
||||||
|
fb_model = fb.get("model", "")
|
||||||
|
fb_base = fb.get("base_url", "")
|
||||||
|
fb_provider = fb.get("provider", "")
|
||||||
|
fb_key_env = fb.get("api_key_env", "")
|
||||||
|
fb_key = os.getenv(fb_key_env, "")
|
||||||
|
if fb_model:
|
||||||
|
lengths.append(get_model_context_length(
|
||||||
|
fb_model, base_url=fb_base,
|
||||||
|
api_key=fb_key, provider=fb_provider
|
||||||
|
))
|
||||||
|
|
||||||
|
return [l for l in lengths if l and l > 0]
|
||||||
|
|
||||||
|
# Use minimum context length for conservative compression
|
||||||
|
_fallback_contexts = self._get_fallback_context_lengths(_agent_cfg)
|
||||||
|
_effective_context = min(_fallback_contexts) if _fallback_contexts else None
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Add Pre-Compression Checkpoint
|
||||||
|
|
||||||
|
**File:** `run_agent.py` `_compress_context()` method
|
||||||
|
|
||||||
|
See patch file for implementation.
|
||||||
|
|
||||||
|
### 3. Add Summary Validation
|
||||||
|
|
||||||
|
**File:** `context_compressor.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _extract_critical_refs(self, turns: List[Dict]) -> Set[str]:
|
||||||
|
"""Extract critical references that must appear in summary."""
|
||||||
|
critical = set()
|
||||||
|
for msg in turns:
|
||||||
|
content = msg.get("content", "") or ""
|
||||||
|
# File paths
|
||||||
|
for match in re.finditer(r'[\w\-./]+\.(py|js|ts|json|yaml|md)\b', content):
|
||||||
|
critical.add(match.group(0))
|
||||||
|
# Error messages
|
||||||
|
if "error" in content.lower() or "exception" in content.lower():
|
||||||
|
lines = content.split('\n')
|
||||||
|
for line in lines:
|
||||||
|
if any(k in line.lower() for k in ["error", "exception", "traceback"]):
|
||||||
|
critical.add(line[:100]) # First 100 chars of error line
|
||||||
|
return critical
|
||||||
|
|
||||||
|
def _validate_summary(self, summary: str, turns: List[Dict]) -> Tuple[bool, List[str]]:
|
||||||
|
"""Validate that summary captures critical information.
|
||||||
|
|
||||||
|
Returns (is_valid, missing_items).
|
||||||
|
"""
|
||||||
|
if not summary or len(summary) < 100:
|
||||||
|
return False, ["summary too short"]
|
||||||
|
|
||||||
|
critical = self._extract_critical_refs(turns)
|
||||||
|
missing = [ref for ref in critical if ref not in summary]
|
||||||
|
|
||||||
|
# Allow some loss but not too much
|
||||||
|
if len(missing) > len(critical) * 0.5:
|
||||||
|
return False, missing[:5] # Return first 5 missing
|
||||||
|
|
||||||
|
return True, []
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Progressive Context Pressure Warnings
|
||||||
|
|
||||||
|
**File:** `run_agent.py` context pressure section (~line 7865)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Replace single warning with progressive system
|
||||||
|
_CONTEXT_PRESSURE_LEVELS = [
|
||||||
|
(0.60, "ℹ️ Context usage at 60% — monitoring"),
|
||||||
|
(0.75, "📊 Context usage at 75% — consider wrapping up soon"),
|
||||||
|
(0.85, "⚠️ Context usage at 85% — compression imminent"),
|
||||||
|
(0.95, "🔴 Context usage at 95% — compression will trigger soon"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Track which levels have been reported
|
||||||
|
if not hasattr(self, '_context_pressure_reported'):
|
||||||
|
self._context_pressure_reported = set()
|
||||||
|
|
||||||
|
for threshold, message in _CONTEXT_PRESSURE_LEVELS:
|
||||||
|
if _compaction_progress >= threshold and threshold not in self._context_pressure_reported:
|
||||||
|
self._context_pressure_reported.add(threshold)
|
||||||
|
if self.status_callback:
|
||||||
|
self.status_callback("warning", message)
|
||||||
|
if not self.quiet_mode:
|
||||||
|
print(f"\n{message}\n")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Interaction with Fallback Chain
|
||||||
|
|
||||||
|
### Current Behavior
|
||||||
|
|
||||||
|
The compressor is initialized once at agent startup with the primary model's context length:
|
||||||
|
|
||||||
|
```python
|
||||||
|
self.context_compressor = ContextCompressor(
|
||||||
|
model=self.model, # Primary model only
|
||||||
|
threshold_percent=compression_threshold, # Default 50%
|
||||||
|
# ...
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Problems
|
||||||
|
|
||||||
|
1. **No dynamic adjustment:** If fallback occurs to a smaller model, compression threshold is wrong
|
||||||
|
2. **No re-initialization on model switch:** `/model` command doesn't update compressor
|
||||||
|
3. **Context probe affects wrong model:** If primary probe fails, fallback models may have already been used
|
||||||
|
|
||||||
|
### Recommended Architecture
|
||||||
|
|
||||||
|
```python
|
||||||
|
class AIAgent:
|
||||||
|
def _update_compressor_for_model(self, model: str, base_url: str, provider: str):
|
||||||
|
"""Reconfigure compressor when model changes (fallback or /model command)."""
|
||||||
|
new_context = get_model_context_length(model, base_url=base_url, provider=provider)
|
||||||
|
if new_context != self.context_compressor.context_length:
|
||||||
|
self.context_compressor.context_length = new_context
|
||||||
|
self.context_compressor.threshold_tokens = int(
|
||||||
|
new_context * self.context_compressor.threshold_percent
|
||||||
|
)
|
||||||
|
logger.info(f"Compressor adjusted for {model}: {new_context:,} tokens")
|
||||||
|
|
||||||
|
def _handle_fallback(self, fallback_model: str, ...):
|
||||||
|
"""Update compressor when falling back to different model."""
|
||||||
|
self._update_compressor_for_model(fallback_model, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing Gaps
|
||||||
|
|
||||||
|
1. **No fallback chain test:** Tests don't verify behavior when context limits differ
|
||||||
|
2. **No checkpoint integration test:** Pre-compression checkpoint not tested
|
||||||
|
3. **No summary validation test:** No test for detecting poor-quality summaries
|
||||||
|
4. **No progressive warning test:** Only tests the 85% threshold
|
||||||
|
5. **No tool result deduplication test:** Tests verify pairs are preserved but not deduplicated
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommendations Priority
|
||||||
|
|
||||||
|
| Priority | Item | Effort | Impact |
|
||||||
|
|----------|------|--------|--------|
|
||||||
|
| P0 | Pre-compression checkpoint | Medium | Critical |
|
||||||
|
| P0 | Fallback context awareness | Medium | High |
|
||||||
|
| P1 | Progressive warnings | Low | Medium |
|
||||||
|
| P1 | Summary validation | Medium | High |
|
||||||
|
| P2 | Semantic deduplication | High | Medium |
|
||||||
|
| P2 | Better pruning placeholders | Low | Low |
|
||||||
|
| P3 | Compression metrics | Low | Low |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
The context compressor is a **solid, production-ready implementation** with sophisticated handling of the core compression problem. The structured summary format and iterative update mechanism are particularly well-designed.
|
||||||
|
|
||||||
|
The main gaps are in **edge-case hardening**:
|
||||||
|
1. Fallback chain awareness needs to be addressed for multi-model reliability
|
||||||
|
2. Pre-compression checkpoint is essential for information recovery
|
||||||
|
3. Summary validation would prevent silent information loss
|
||||||
|
|
||||||
|
These are incremental improvements to an already strong foundation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Review conducted by Timmy Agent*
|
||||||
|
*For Gitea issue timmy-home #92*
|
||||||
332
uniwizard/context_compressor.patch
Normal file
332
uniwizard/context_compressor.patch
Normal file
@@ -0,0 +1,332 @@
|
|||||||
|
From: Timmy Agent <timmy@uniwizard.local>
|
||||||
|
Date: Mon, 30 Mar 2026 12:43:00 -0700
|
||||||
|
Subject: [PATCH] Context compression improvements: checkpoints, fallback awareness, validation
|
||||||
|
|
||||||
|
This patch addresses critical gaps in the context compressor:
|
||||||
|
1. Pre-compression checkpoints for recovery
|
||||||
|
2. Progressive context pressure warnings
|
||||||
|
3. Summary validation to detect information loss
|
||||||
|
4. Better tool pruning placeholders
|
||||||
|
|
||||||
|
---
|
||||||
|
agent/context_compressor.py | 102 +++++++++++++++++++++++++++++++++++-
|
||||||
|
run_agent.py | 71 +++++++++++++++++++++++---
|
||||||
|
2 files changed, 165 insertions(+), 8 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/agent/context_compressor.py b/agent/context_compressor.py
|
||||||
|
index abc123..def456 100644
|
||||||
|
--- a/agent/context_compressor.py
|
||||||
|
+++ b/agent/context_compressor.py
|
||||||
|
@@ -15,6 +15,7 @@ Improvements over v1:
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
+import re
|
||||||
|
|
||||||
|
from agent.auxiliary_client import call_llm
|
||||||
|
from agent.model_metadata import (
|
||||||
|
@@ -44,6 +45,12 @@ _SUMMARY_TOKENS_CEILING = 8000
|
||||||
|
# Placeholder used when pruning old tool results
|
||||||
|
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||||
|
|
||||||
|
+# Enhanced placeholder with context (used when we know the tool name)
|
||||||
|
+_PRUNED_TOOL_PLACEHOLDER_TEMPLATE = (
|
||||||
|
+ "[Tool output for '{tool_name}' cleared to save context space. "
|
||||||
|
+ "Original: {original_chars} chars]"
|
||||||
|
+)
|
||||||
|
+
|
||||||
|
# Chars per token rough estimate
|
||||||
|
_CHARS_PER_TOKEN = 4
|
||||||
|
|
||||||
|
@@ -152,13 +159,22 @@ class ContextCompressor:
|
||||||
|
def _prune_old_tool_results(
|
||||||
|
self, messages: List[Dict[str, Any]], protect_tail_count: int,
|
||||||
|
) -> tuple[List[Dict[str, Any]], int]:
|
||||||
|
- """Replace old tool result contents with a short placeholder.
|
||||||
|
+ """Replace old tool result contents with an informative placeholder.
|
||||||
|
|
||||||
|
Walks backward from the end, protecting the most recent
|
||||||
|
- ``protect_tail_count`` messages. Older tool results get their
|
||||||
|
- content replaced with a placeholder string.
|
||||||
|
+ ``protect_tail_count`` messages. Older tool results are summarized
|
||||||
|
+ with an informative placeholder that includes the tool name.
|
||||||
|
|
||||||
|
Returns (pruned_messages, pruned_count).
|
||||||
|
+
|
||||||
|
+ Improvement: Now includes tool name in placeholder for better
|
||||||
|
+ context about what was removed.
|
||||||
|
"""
|
||||||
|
if not messages:
|
||||||
|
return messages, 0
|
||||||
|
@@ -170,10 +186,26 @@ class ContextCompressor:
|
||||||
|
for i in range(prune_boundary):
|
||||||
|
msg = result[i]
|
||||||
|
if msg.get("role") != "tool":
|
||||||
|
continue
|
||||||
|
content = msg.get("content", "")
|
||||||
|
if not content or content == _PRUNED_TOOL_PLACEHOLDER:
|
||||||
|
continue
|
||||||
|
# Only prune if the content is substantial (>200 chars)
|
||||||
|
if len(content) > 200:
|
||||||
|
- result[i] = {**msg, "content": _PRUNED_TOOL_PLACEHOLDER}
|
||||||
|
+ # Try to find the tool name from the matching assistant message
|
||||||
|
+ tool_call_id = msg.get("tool_call_id", "")
|
||||||
|
+ tool_name = "unknown"
|
||||||
|
+ for m in messages:
|
||||||
|
+ if m.get("role") == "assistant" and m.get("tool_calls"):
|
||||||
|
+ for tc in m.get("tool_calls", []):
|
||||||
|
+ tc_id = tc.get("id", "") if isinstance(tc, dict) else getattr(tc, "id", "")
|
||||||
|
+ if tc_id == tool_call_id:
|
||||||
|
+ fn = tc.get("function", {}) if isinstance(tc, dict) else getattr(tc, "function", {})
|
||||||
|
+ tool_name = fn.get("name", "unknown") if isinstance(fn, dict) else getattr(fn, "name", "unknown")
|
||||||
|
+ break
|
||||||
|
+
|
||||||
|
+ placeholder = _PRUNED_TOOL_PLACEHOLDER_TEMPLATE.format(
|
||||||
|
+ tool_name=tool_name,
|
||||||
|
+ original_chars=len(content)
|
||||||
|
+ )
|
||||||
|
+ result[i] = {**msg, "content": placeholder}
|
||||||
|
pruned += 1
|
||||||
|
|
||||||
|
return result, pruned
|
||||||
|
@@ -250,6 +282,52 @@ class ContextCompressor:
|
||||||
|
## Critical Context
|
||||||
|
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
|
||||||
|
|
||||||
|
+ Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions.
|
||||||
|
+
|
||||||
|
+ Write only the summary body. Do not include any preamble or prefix."""
|
||||||
|
+
|
||||||
|
+ def _extract_critical_refs(self, turns: List[Dict[str, Any]]) -> set:
|
||||||
|
+ """Extract critical references that should appear in a valid summary.
|
||||||
|
+
|
||||||
|
+ Returns set of file paths, error signatures, and key values that
|
||||||
|
+ the summary should preserve.
|
||||||
|
+ """
|
||||||
|
+ critical = set()
|
||||||
|
+ for msg in turns:
|
||||||
|
+ content = msg.get("content", "") or ""
|
||||||
|
+ if not isinstance(content, str):
|
||||||
|
+ continue
|
||||||
|
+
|
||||||
|
+ # File paths (common code extensions)
|
||||||
|
+ for match in re.finditer(r'[\w\-./]+\.(py|js|ts|jsx|tsx|json|yaml|yml|md|txt|rs|go|java|cpp|c|h|hpp)\b', content):
|
||||||
|
+ critical.add(match.group(0))
|
||||||
|
+
|
||||||
|
+ # Error patterns
|
||||||
|
+ lines = content.split('\n')
|
||||||
|
+ for line in lines:
|
||||||
|
+ line_lower = line.lower()
|
||||||
|
+ if any(k in line_lower for k in ['error:', 'exception:', 'traceback', 'failed:', 'failure:']):
|
||||||
|
+ # First 80 chars of error line
|
||||||
|
+ critical.add(line[:80].strip())
|
||||||
|
+
|
||||||
|
+ # URLs
|
||||||
|
+ for match in re.finditer(r'https?://[^\s<>"\']+', content):
|
||||||
|
+ critical.add(match.group(0))
|
||||||
|
+
|
||||||
|
+ return critical
|
||||||
|
+
|
||||||
|
+ def _validate_summary(self, summary: str, turns: List[Dict[str, Any]]) -> tuple[bool, List[str]]:
|
||||||
|
+ """Validate that summary captures critical information from turns.
|
||||||
|
+
|
||||||
|
+ Returns (is_valid, missing_critical_items).
|
||||||
|
+ """
|
||||||
|
+ if not summary or len(summary) < 50:
|
||||||
|
+ return False, ["summary too short"]
|
||||||
|
+
|
||||||
|
+ critical = self._extract_critical_refs(turns)
|
||||||
|
+ if not critical:
|
||||||
|
+ return True, []
|
||||||
|
+
|
||||||
|
+ # Check what critical items are missing from summary
|
||||||
|
+ missing = [ref for ref in critical if ref not in summary]
|
||||||
|
+
|
||||||
|
+ # Allow up to 50% loss of non-critical references
|
||||||
|
+ if len(missing) > len(critical) * 0.5 and len(critical) > 3:
|
||||||
|
+ return False, missing[:5] # Return first 5 missing items
|
||||||
|
+
|
||||||
|
+ return True, []
|
||||||
|
+
|
||||||
|
+ def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
|
||||||
|
+ """Generate a structured summary of conversation turns.
|
||||||
|
+
|
||||||
|
+ NEW: Added validation step to detect low-quality summaries.
|
||||||
|
+ Falls back to extended summarization if validation fails.
|
||||||
|
+ """
|
||||||
|
+ summary_budget = self._compute_summary_budget(turns_to_summarize)
|
||||||
|
+ content_to_summarize = self._serialize_for_summary(turns_to_summarize)
|
||||||
|
+
|
||||||
|
if self._previous_summary:
|
||||||
|
# Iterative update: preserve existing info, add new progress
|
||||||
|
prompt = f"""You are updating a context compaction summary...
|
||||||
|
@@ -341,9 +419,27 @@ class ContextCompressor:
|
||||||
|
try:
|
||||||
|
call_kwargs = {
|
||||||
|
"task": "compression",
|
||||||
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
|
"temperature": 0.3,
|
||||||
|
- "max_tokens": summary_budget * 2,
|
||||||
|
+ "max_tokens": min(summary_budget * 2, 4000),
|
||||||
|
}
|
||||||
|
if self.summary_model:
|
||||||
|
call_kwargs["model"] = self.summary_model
|
||||||
|
response = call_llm(**call_kwargs)
|
||||||
|
content = response.choices[0].message.content
|
||||||
|
# Handle cases where content is not a string (e.g., dict from llama.cpp)
|
||||||
|
if not isinstance(content, str):
|
||||||
|
content = str(content) if content else ""
|
||||||
|
summary = content.strip()
|
||||||
|
+
|
||||||
|
+ # NEW: Validate the generated summary
|
||||||
|
+ is_valid, missing = self._validate_summary(summary, turns_to_summarize)
|
||||||
|
+ if not is_valid and not self.quiet_mode:
|
||||||
|
+ logger.warning(
|
||||||
|
+ "Summary validation detected potential information loss. "
|
||||||
|
+ "Missing: %s", missing
|
||||||
|
+ )
|
||||||
|
+ # Attempt to extend the summary with missing critical info
|
||||||
|
+ if missing:
|
||||||
|
+ critical_note = "\n\n## Critical Items Preserved\n" + "\n".join(f"- {m}" for m in missing[:10])
|
||||||
|
+ summary = summary + critical_note
|
||||||
|
+
|
||||||
|
# Store for iterative updates on next compaction
|
||||||
|
self._previous_summary = summary
|
||||||
|
return self._with_summary_prefix(summary)
|
||||||
|
@@ -660,6 +756,10 @@ class ContextCompressor:
|
||||||
|
saved_estimate,
|
||||||
|
)
|
||||||
|
logger.info("Compression #%d complete", self.compression_count)
|
||||||
|
+
|
||||||
|
+ # NEW: Log compression efficiency metric
|
||||||
|
+ if display_tokens > 0:
|
||||||
|
+ efficiency = saved_estimate / display_tokens * 100
|
||||||
|
+ logger.info("Compression efficiency: %.1f%% tokens saved", efficiency)
|
||||||
|
|
||||||
|
return compressed
|
||||||
|
|
||||||
|
diff --git a/run_agent.py b/run_agent.py
|
||||||
|
index abc123..def456 100644
|
||||||
|
--- a/run_agent.py
|
||||||
|
+++ b/run_agent.py
|
||||||
|
@@ -1186,7 +1186,35 @@ class AIAgent:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
|
||||||
|
+ # NEW: Collect context lengths for all models in fallback chain
|
||||||
|
+ # This ensures compression threshold is appropriate for ANY model that might be used
|
||||||
|
+ _fallback_contexts = []
|
||||||
|
+ _fallback_providers = _agent_cfg.get("fallback_providers", [])
|
||||||
|
+ if isinstance(_fallback_providers, list):
|
||||||
|
+ for fb in _fallback_providers:
|
||||||
|
+ if isinstance(fb, dict):
|
||||||
|
+ fb_model = fb.get("model", "")
|
||||||
|
+ fb_base = fb.get("base_url", "")
|
||||||
|
+ fb_provider = fb.get("provider", "")
|
||||||
|
+ fb_key_env = fb.get("api_key_env", "")
|
||||||
|
+ fb_key = os.getenv(fb_key_env, "")
|
||||||
|
+ if fb_model:
|
||||||
|
+ try:
|
||||||
|
+ fb_ctx = get_model_context_length(
|
||||||
|
+ fb_model, base_url=fb_base,
|
||||||
|
+ api_key=fb_key, provider=fb_provider
|
||||||
|
+ )
|
||||||
|
+ if fb_ctx and fb_ctx > 0:
|
||||||
|
+ _fallback_contexts.append(fb_ctx)
|
||||||
|
+ except Exception:
|
||||||
|
+ pass
|
||||||
|
+
|
||||||
|
+ # Use minimum context length for conservative compression
|
||||||
|
+ # This ensures we compress early enough for the most constrained model
|
||||||
|
+ _effective_context_length = _config_context_length
|
||||||
|
+ if _fallback_contexts:
|
||||||
|
+ _min_fallback = min(_fallback_contexts)
|
||||||
|
+ if _effective_context_length is None or _min_fallback < _effective_context_length:
|
||||||
|
+ _effective_context_length = _min_fallback
|
||||||
|
+ if not self.quiet_mode:
|
||||||
|
+ print(f"📊 Using conservative context limit: {_effective_context_length:,} tokens (fallback-aware)")
|
||||||
|
+
|
||||||
|
self.context_compressor = ContextCompressor(
|
||||||
|
model=self.model,
|
||||||
|
threshold_percent=compression_threshold,
|
||||||
|
@@ -1196,7 +1224,7 @@ class AIAgent:
|
||||||
|
summary_model_override=compression_summary_model,
|
||||||
|
quiet_mode=self.quiet_mode,
|
||||||
|
base_url=self.base_url,
|
||||||
|
api_key=self.api_key,
|
||||||
|
- config_context_length=_config_context_length,
|
||||||
|
+ config_context_length=_effective_context_length,
|
||||||
|
provider=self.provider,
|
||||||
|
)
|
||||||
|
self.compression_enabled = compression_enabled
|
||||||
|
@@ -5248,6 +5276,22 @@ class AIAgent:
|
||||||
|
|
||||||
|
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default") -> tuple:
|
||||||
|
"""Compress conversation context and split the session in SQLite.
|
||||||
|
+
|
||||||
|
+ NEW: Creates a checkpoint before compression for recovery.
|
||||||
|
+ This allows rewinding if the summary loses critical information.
|
||||||
|
+
|
||||||
|
+ Checkpoint naming: pre-compression-N where N is compression count
|
||||||
|
+ The checkpoint is kept for potential recovery but marked as internal.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(compressed_messages, new_system_prompt) tuple
|
||||||
|
"""
|
||||||
|
+ # NEW: Create checkpoint BEFORE compression
|
||||||
|
+ if self._checkpoint_mgr and hasattr(self._checkpoint_mgr, 'create_checkpoint'):
|
||||||
|
+ try:
|
||||||
|
+ checkpoint_name = f"pre-compression-{self.context_compressor.compression_count}"
|
||||||
|
+ self._checkpoint_mgr.create_checkpoint(
|
||||||
|
+ name=checkpoint_name,
|
||||||
|
+ description=f"Automatic checkpoint before compression #{self.context_compressor.compression_count}"
|
||||||
|
+ )
|
||||||
|
+ if not self.quiet_mode:
|
||||||
|
+ logger.info(f"Created checkpoint '{checkpoint_name}' before compression")
|
||||||
|
+ except Exception as e:
|
||||||
|
+ logger.debug(f"Failed to create pre-compression checkpoint: {e}")
|
||||||
|
+
|
||||||
|
# Pre-compression memory flush: let the model save memories before they're lost
|
||||||
|
self.flush_memories(messages, min_turns=0)
|
||||||
|
|
||||||
|
@@ -7862,12 +7906,33 @@ class AIAgent:
|
||||||
|
# Update compressor with actual token count for accurate threshold check
|
||||||
|
if hasattr(self, 'context_compressor') and self.context_compressor:
|
||||||
|
self.context_compressor.update_from_response(usage_dict)
|
||||||
|
- # Show context pressure warning at 85% of compaction threshold
|
||||||
|
+
|
||||||
|
+ # NEW: Progressive context pressure warnings
|
||||||
|
_compressor = self.context_compressor
|
||||||
|
if _compressor.threshold_tokens > 0:
|
||||||
|
_compaction_progress = _real_tokens / _compressor.threshold_tokens
|
||||||
|
- if _compaction_progress >= 0.85 and not self._context_pressure_warned:
|
||||||
|
- self._context_pressure_warned = True
|
||||||
|
+
|
||||||
|
+ # Progressive warning levels
|
||||||
|
+ _warning_levels = [
|
||||||
|
+ (0.60, "info", "ℹ️ Context usage at 60%"),
|
||||||
|
+ (0.75, "notice", "📊 Context usage at 75% — consider wrapping up"),
|
||||||
|
+ (0.85, "warning", "⚠️ Context usage at 85% — compression imminent"),
|
||||||
|
+ (0.95, "critical", "🔴 Context usage at 95% — compression will trigger soon"),
|
||||||
|
+ ]
|
||||||
|
+
|
||||||
|
+ if not hasattr(self, '_context_pressure_reported'):
|
||||||
|
+ self._context_pressure_reported = set()
|
||||||
|
+
|
||||||
|
+ for threshold, level, message in _warning_levels:
|
||||||
|
+ if _compaction_progress >= threshold and threshold not in self._context_pressure_reported:
|
||||||
|
+ self._context_pressure_reported.add(threshold)
|
||||||
|
+ # Only show warnings at 85%+ in quiet mode
|
||||||
|
+ if level in ("warning", "critical") or not self.quiet_mode:
|
||||||
|
+ if self.status_callback:
|
||||||
|
+ self.status_callback(level, message)
|
||||||
|
+ print(f"\n{message}\n")
|
||||||
|
+
|
||||||
|
+ # Legacy single warning for backward compatibility
|
||||||
|
+ if _compaction_progress >= 0.85 and not getattr(self, '_context_pressure_warned', False):
|
||||||
|
+ self._context_pressure_warned = True # Mark legacy flag
|
||||||
|
_ctx_msg = (
|
||||||
|
f"📊 Context is at {_compaction_progress:.0%} of compression threshold "
|
||||||
|
f"({_real_tokens:,} / {_compressor.threshold_tokens:,} tokens). "
|
||||||
|
--
|
||||||
|
2.40.0
|
||||||
174
uniwizard/job_profiles.yaml
Normal file
174
uniwizard/job_profiles.yaml
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
# Job-Specific Toolset Profiles for Local Timmy Automation
|
||||||
|
# Location: ~/.timmy/uniwizard/job_profiles.yaml
|
||||||
|
#
|
||||||
|
# Purpose: Narrow the tool surface per job type to prevent context thrashing
|
||||||
|
# and reduce token usage in local Hermes sessions.
|
||||||
|
#
|
||||||
|
# Usage in cron jobs:
|
||||||
|
# agent = AIAgent(
|
||||||
|
# enabled_toolsets=JOB_PROFILES["code-work"]["toolsets"],
|
||||||
|
# disabled_toolsets=JOB_PROFILES["code-work"].get("disabled_toolsets", []),
|
||||||
|
# ...
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# Token savings are calculated against full toolset (~9,261 tokens, 40 tools)
|
||||||
|
|
||||||
|
profiles:
|
||||||
|
# ============================================================================
|
||||||
|
# CODE-WORK: Software development tasks
|
||||||
|
# ============================================================================
|
||||||
|
code-work:
|
||||||
|
description: "Terminal-based coding with file operations and git"
|
||||||
|
use_case: "Code reviews, refactoring, debugging, git operations, builds"
|
||||||
|
toolsets:
|
||||||
|
- terminal # shell commands, git, builds
|
||||||
|
- file # read, write, patch, search
|
||||||
|
tools_enabled: 6
|
||||||
|
token_estimate: "~2,194 tokens"
|
||||||
|
token_savings: "~76% reduction vs full toolset"
|
||||||
|
notes: |
|
||||||
|
Git operations run via terminal. Includes patch for targeted edits.
|
||||||
|
No web access - assumes code and docs are local or git-managed.
|
||||||
|
Process management included for background builds/tasks.
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# RESEARCH: Information gathering and analysis
|
||||||
|
# ============================================================================
|
||||||
|
research:
|
||||||
|
description: "Web research with browser automation and file persistence"
|
||||||
|
use_case: "Documentation lookup, API research, competitive analysis, fact-checking"
|
||||||
|
toolsets:
|
||||||
|
- web # web_search, web_extract
|
||||||
|
- browser # full browser automation
|
||||||
|
- file # save findings, read local files
|
||||||
|
tools_enabled: 15
|
||||||
|
token_estimate: "~2,518 tokens"
|
||||||
|
token_savings: "~73% reduction vs full toolset"
|
||||||
|
notes: |
|
||||||
|
Browser + web search combination allows deep research workflows.
|
||||||
|
File tools for saving research artifacts and reading local sources.
|
||||||
|
No terminal to prevent accidental local changes during research.
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# TRIAGE: Read-only issue and status checking
|
||||||
|
# ============================================================================
|
||||||
|
triage:
|
||||||
|
description: "Read-only operations for status checks and issue triage"
|
||||||
|
use_case: "Gitea issue monitoring, CI status checks, log analysis, health checks"
|
||||||
|
toolsets:
|
||||||
|
- terminal # curl for API calls, status checks
|
||||||
|
- file # read local files, logs
|
||||||
|
disabled_toolsets:
|
||||||
|
# Note: file toolset includes write/patch - triage jobs should
|
||||||
|
# be instructed via prompt to only use read_file and search_files
|
||||||
|
# For truly read-only, use disabled_tools=['write_file', 'patch']
|
||||||
|
# (requires AIAgent support or custom toolset)
|
||||||
|
tools_enabled: 6
|
||||||
|
token_estimate: "~2,194 tokens"
|
||||||
|
token_savings: "~76% reduction vs full toolset"
|
||||||
|
read_only_hint: |
|
||||||
|
IMPORTANT: Triage jobs should only READ. Do not modify files.
|
||||||
|
Use read_file and search_files only. Do NOT use write_file or patch.
|
||||||
|
notes: |
|
||||||
|
Gitea API accessed via curl in terminal (token in env).
|
||||||
|
File reading for log analysis, config inspection.
|
||||||
|
Prompt must include read_only_hint to prevent modifications.
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CREATIVE: Content creation and editing
|
||||||
|
# ============================================================================
|
||||||
|
creative:
|
||||||
|
description: "Content creation with web lookup for reference"
|
||||||
|
use_case: "Writing, editing, content generation, documentation"
|
||||||
|
toolsets:
|
||||||
|
- file # read/write content
|
||||||
|
- web # research, fact-checking, references
|
||||||
|
tools_enabled: 4
|
||||||
|
token_estimate: "~1,185 tokens"
|
||||||
|
token_savings: "~87% reduction vs full toolset"
|
||||||
|
notes: |
|
||||||
|
Minimal toolset for focused writing tasks.
|
||||||
|
Web for looking up references, quotes, facts.
|
||||||
|
No terminal to prevent accidental system changes.
|
||||||
|
No browser to keep context minimal.
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# OPS: System operations and maintenance
|
||||||
|
# ============================================================================
|
||||||
|
ops:
|
||||||
|
description: "System operations, process management, deployment"
|
||||||
|
use_case: "Server maintenance, process monitoring, log rotation, backups"
|
||||||
|
toolsets:
|
||||||
|
- terminal # shell commands, ssh, docker
|
||||||
|
- process # background process management
|
||||||
|
- file # config files, log files
|
||||||
|
tools_enabled: 6
|
||||||
|
token_estimate: "~2,194 tokens"
|
||||||
|
token_savings: "~76% reduction vs full toolset"
|
||||||
|
notes: |
|
||||||
|
Process management explicitly included for service control.
|
||||||
|
Terminal for docker, systemctl, deployment commands.
|
||||||
|
File tools for config editing and log inspection.
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MINIMAL: Absolute minimum for simple tasks
|
||||||
|
# ============================================================================
|
||||||
|
minimal:
|
||||||
|
description: "Absolute minimum toolset for single-purpose tasks"
|
||||||
|
use_case: "Simple file reading, status reports with no external access"
|
||||||
|
toolsets:
|
||||||
|
- file
|
||||||
|
tools_enabled: 4
|
||||||
|
token_estimate: "~800 tokens"
|
||||||
|
token_savings: "~91% reduction vs full toolset"
|
||||||
|
notes: |
|
||||||
|
For tasks that only need to read/write local files.
|
||||||
|
No network access, no terminal, no browser.
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# WIRING INSTRUCTIONS FOR CRON DISPATCH
|
||||||
|
# ============================================================================
|
||||||
|
#
|
||||||
|
# In your cron job runner or dispatcher, load the profile and pass to AIAgent:
|
||||||
|
#
|
||||||
|
# import yaml
|
||||||
|
#
|
||||||
|
# def load_job_profile(profile_name: str) -> dict:
|
||||||
|
# with open("~/.timmy/uniwizard/job_profiles.yaml") as f:
|
||||||
|
# profiles = yaml.safe_load(f)["profiles"]
|
||||||
|
# return profiles.get(profile_name, profiles["minimal"])
|
||||||
|
#
|
||||||
|
# # In job execution:
|
||||||
|
# profile = load_job_profile(job.get("tool_profile", "minimal"))
|
||||||
|
#
|
||||||
|
# agent = AIAgent(
|
||||||
|
# model=model,
|
||||||
|
# enabled_toolsets=profile["toolsets"],
|
||||||
|
# disabled_toolsets=profile.get("disabled_toolsets", []),
|
||||||
|
# quiet_mode=True,
|
||||||
|
# platform="cron",
|
||||||
|
# ...
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# Add to job definition in ~/.hermes/cron/jobs.yaml:
|
||||||
|
#
|
||||||
|
# - id: daily-backup-check
|
||||||
|
# name: "Check backup status"
|
||||||
|
# schedule: "0 9 * * *"
|
||||||
|
# tool_profile: ops # <-- NEW FIELD
|
||||||
|
# prompt: "Check backup logs and report status..."
|
||||||
|
#
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# COMPARISON: Toolset Size Reference
|
||||||
|
# ============================================================================
|
||||||
|
#
|
||||||
|
# Full toolset (all): 40 tools ~9,261 tokens
|
||||||
|
# code-work, ops, triage: 6 tools ~2,194 tokens (-76%)
|
||||||
|
# research: 15 tools ~2,518 tokens (-73%)
|
||||||
|
# creative: 4 tools ~1,185 tokens (-87%)
|
||||||
|
# minimal: 4 tools ~800 tokens (-91%)
|
||||||
|
#
|
||||||
|
# Token estimates based on JSON schema serialization (chars/4 approximation).
|
||||||
|
# Actual token counts vary by model tokenizer.
|
||||||
363
uniwizard/job_profiles_design.md
Normal file
363
uniwizard/job_profiles_design.md
Normal file
@@ -0,0 +1,363 @@
|
|||||||
|
# Job Profiles Design Document
|
||||||
|
## [ROUTING] Streamline local Timmy automation context per job
|
||||||
|
|
||||||
|
**Issue:** timmy-config #90
|
||||||
|
**Author:** Timmy (AI Agent)
|
||||||
|
**Date:** 2026-03-30
|
||||||
|
**Status:** Design Complete - Ready for Implementation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
Local Hermes sessions experience context thrashing when all 40 tools (~9,261 tokens of schema) are loaded for every job. This design introduces **job-specific toolset profiles** that narrow the tool surface based on task type, achieving **73-91% token reduction** and preventing the "loop or thrash" behavior observed in long-running automation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Problem Statement
|
||||||
|
|
||||||
|
When `toolsets: [all]` is enabled (current default in `~/.hermes/config.yaml`), every AIAgent instantiation loads:
|
||||||
|
|
||||||
|
- **40 tools** across 12+ toolsets
|
||||||
|
- **~9,261 tokens** of JSON schema
|
||||||
|
- Full browser automation (12 tools)
|
||||||
|
- Vision, image generation, TTS, MoA reasoning
|
||||||
|
- All MCP servers (Morrowind, etc.)
|
||||||
|
|
||||||
|
For a simple cron job checking Gitea issues, this is massive overkill. The LLM:
|
||||||
|
1. Sees too many options
|
||||||
|
2. Hallucinates tool calls that aren't needed
|
||||||
|
3. Gets confused about which tool to use
|
||||||
|
4. Loops trying different approaches
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Solution Overview
|
||||||
|
|
||||||
|
Leverage the existing `enabled_toolsets` parameter in `AIAgent.__init__()` to create **job profiles**—pre-defined toolset combinations optimized for specific automation types.
|
||||||
|
|
||||||
|
### Key Design Decisions
|
||||||
|
|
||||||
|
| Decision | Rationale |
|
||||||
|
|----------|-----------|
|
||||||
|
| Use YAML profiles, not code | Easy to extend without deployment |
|
||||||
|
| Map to existing toolsets | No changes needed to Hermes core |
|
||||||
|
| 5 base profiles | Covers 95% of automation needs |
|
||||||
|
| Token estimates in comments | Helps users understand trade-offs |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Profile Specifications
|
||||||
|
|
||||||
|
### 1. CODE-WORK Profile
|
||||||
|
**Purpose:** Software development, git operations, code review
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
toolsets: [terminal, file]
|
||||||
|
tools_enabled: 6
|
||||||
|
token_estimate: "~2,194 tokens"
|
||||||
|
token_savings: "~76%"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Included Tools:**
|
||||||
|
- `terminal`, `process` - git, builds, shell commands
|
||||||
|
- `read_file`, `search_files`, `write_file`, `patch`
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Automated code review
|
||||||
|
- Refactoring tasks
|
||||||
|
- Build and test automation
|
||||||
|
- Git branch management
|
||||||
|
|
||||||
|
**Not Included:**
|
||||||
|
- Web search (assumes local docs/code)
|
||||||
|
- Browser automation
|
||||||
|
- Vision/image generation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. RESEARCH Profile
|
||||||
|
**Purpose:** Information gathering, documentation lookup, analysis
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
toolsets: [web, browser, file]
|
||||||
|
tools_enabled: 15
|
||||||
|
token_estimate: "~2,518 tokens"
|
||||||
|
token_savings: "~73%"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Included Tools:**
|
||||||
|
- `web_search`, `web_extract` - quick lookups
|
||||||
|
- Full browser suite (12 tools) - deep research
|
||||||
|
- File tools - save findings, read local docs
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- API documentation research
|
||||||
|
- Competitive analysis
|
||||||
|
- Fact-checking reports
|
||||||
|
- Technical due diligence
|
||||||
|
|
||||||
|
**Not Included:**
|
||||||
|
- Terminal (prevents accidental local changes)
|
||||||
|
- Vision/image generation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. TRIAGE Profile
|
||||||
|
**Purpose:** Read-only status checking, issue monitoring, health checks
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
toolsets: [terminal, file]
|
||||||
|
tools_enabled: 6
|
||||||
|
token_estimate: "~2,194 tokens"
|
||||||
|
token_savings: "~76%"
|
||||||
|
read_only: true # enforced via prompt
|
||||||
|
```
|
||||||
|
|
||||||
|
**Included Tools:**
|
||||||
|
- `terminal` - curl for Gitea API, status commands
|
||||||
|
- `read_file`, `search_files` - log analysis, config inspection
|
||||||
|
|
||||||
|
**Critical Note on Write Safety:**
|
||||||
|
The `file` toolset includes `write_file` and `patch`. For truly read-only triage, the job prompt **MUST** include:
|
||||||
|
|
||||||
|
```
|
||||||
|
[SYSTEM: This is a READ-ONLY triage job. Only use read_file and search_files.
|
||||||
|
Do NOT use write_file, patch, or terminal commands that modify state.]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Future Enhancement:**
|
||||||
|
Consider adding a `disabled_tools` parameter to AIAgent for granular control without creating new toolsets.
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Gitea issue triage
|
||||||
|
- CI/CD status monitoring
|
||||||
|
- Log file analysis
|
||||||
|
- System health checks
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. CREATIVE Profile
|
||||||
|
**Purpose:** Content creation, writing, editing
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
toolsets: [file, web]
|
||||||
|
tools_enabled: 4
|
||||||
|
token_estimate: "~1,185 tokens"
|
||||||
|
token_savings: "~87%"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Included Tools:**
|
||||||
|
- `read_file`, `search_files`, `write_file`, `patch`
|
||||||
|
- `web_search`, `web_extract` - references, fact-checking
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Documentation writing
|
||||||
|
- Content generation
|
||||||
|
- Editing and proofreading
|
||||||
|
- Newsletter/article composition
|
||||||
|
|
||||||
|
**Not Included:**
|
||||||
|
- Terminal (no system access needed)
|
||||||
|
- Browser (web_extract sufficient for text)
|
||||||
|
- Vision/image generation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. OPS Profile
|
||||||
|
**Purpose:** System operations, maintenance, deployment
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
toolsets: [terminal, process, file]
|
||||||
|
tools_enabled: 6
|
||||||
|
token_estimate: "~2,194 tokens"
|
||||||
|
token_savings: "~76%"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Included Tools:**
|
||||||
|
- `terminal`, `process` - service management, background tasks
|
||||||
|
- File tools - config editing, log inspection
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Server maintenance
|
||||||
|
- Log rotation
|
||||||
|
- Service restart
|
||||||
|
- Deployment automation
|
||||||
|
- Docker container management
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How Toolset Filtering Works
|
||||||
|
|
||||||
|
The Hermes harness already supports this via `AIAgent.__init__`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
...
|
||||||
|
enabled_toolsets: List[str] = None, # Only these toolsets
|
||||||
|
disabled_toolsets: List[str] = None, # Exclude these toolsets
|
||||||
|
...
|
||||||
|
):
|
||||||
|
```
|
||||||
|
|
||||||
|
The filtering happens in `model_tools.get_tool_definitions()`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_tool_definitions(
|
||||||
|
enabled_toolsets: List[str] = None,
|
||||||
|
disabled_toolsets: List[str] = None,
|
||||||
|
...
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
# 1. Resolve toolsets to tool names via toolsets.resolve_toolset()
|
||||||
|
# 2. Filter by availability (check_fn for each tool)
|
||||||
|
# 3. Return OpenAI-format tool definitions
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Cron Usage (Line 423-443 in `cron/scheduler.py`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
agent = AIAgent(
|
||||||
|
model=turn_route["model"],
|
||||||
|
...
|
||||||
|
disabled_toolsets=["cronjob", "messaging", "clarify"], # Hardcoded
|
||||||
|
quiet_mode=True,
|
||||||
|
platform="cron",
|
||||||
|
...
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Wiring into Cron Dispatch
|
||||||
|
|
||||||
|
### Step 1: Load Profile
|
||||||
|
|
||||||
|
```python
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def load_job_profile(profile_name: str) -> dict:
|
||||||
|
"""Load a job profile from ~/.timmy/uniwizard/job_profiles.yaml"""
|
||||||
|
profile_path = Path.home() / ".timmy/uniwizard/job_profiles.yaml"
|
||||||
|
with open(profile_path) as f:
|
||||||
|
config = yaml.safe_load(f)
|
||||||
|
profiles = config.get("profiles", {})
|
||||||
|
return profiles.get(profile_name, profiles.get("minimal", {"toolsets": ["file"]}))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Modify `run_job()` in `cron/scheduler.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||||
|
...
|
||||||
|
# Load job profile (default to minimal if not specified)
|
||||||
|
profile_name = job.get("tool_profile", "minimal")
|
||||||
|
profile = load_job_profile(profile_name)
|
||||||
|
|
||||||
|
# Build toolset filter
|
||||||
|
enabled_toolsets = profile.get("toolsets", ["file"])
|
||||||
|
disabled_toolsets = profile.get("disabled_toolsets", ["cronjob", "messaging", "clarify"])
|
||||||
|
|
||||||
|
agent = AIAgent(
|
||||||
|
model=turn_route["model"],
|
||||||
|
...
|
||||||
|
enabled_toolsets=enabled_toolsets, # NEW
|
||||||
|
disabled_toolsets=disabled_toolsets, # MODIFIED
|
||||||
|
quiet_mode=True,
|
||||||
|
platform="cron",
|
||||||
|
...
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Update Job Definition Format
|
||||||
|
|
||||||
|
Add `tool_profile` field to `~/.hermes/cron/jobs.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
jobs:
|
||||||
|
- id: daily-issue-triage
|
||||||
|
name: "Triage Gitea Issues"
|
||||||
|
schedule: "0 9 * * *"
|
||||||
|
tool_profile: triage # <-- NEW
|
||||||
|
prompt: "Check timmy-config repo for new issues..."
|
||||||
|
deliver: telegram
|
||||||
|
|
||||||
|
- id: weekly-docs-review
|
||||||
|
name: "Review Documentation"
|
||||||
|
schedule: "0 10 * * 1"
|
||||||
|
tool_profile: creative # <-- NEW
|
||||||
|
prompt: "Review and improve README files..."
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Token Savings Summary
|
||||||
|
|
||||||
|
| Profile | Tools | Tokens | Savings |
|---------|-------|--------|---------|
| Full (`all`) | 40 | ~9,261 | 0% |
| code-work | 6 | ~2,194 | ~76% |
| research | 15 | ~2,518 | ~73% |
| triage | 6 | ~2,194 | ~76% |
| creative | 4 | ~1,185 | ~87% |
| ops | 6 | ~2,194 | ~76% |
| minimal | 4 | ~800 | ~91% |
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
1. Faster prompt processing (less context to scan)
|
||||||
|
2. Reduced API costs (fewer input tokens)
|
||||||
|
3. More focused tool selection (less confusion)
|
||||||
|
4. Faster tool calls (smaller schema to parse)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Migration Path
|
||||||
|
|
||||||
|
### Phase 1: Deploy Profiles (This PR)
|
||||||
|
- [x] Create `~/.timmy/uniwizard/job_profiles.yaml`
|
||||||
|
- [x] Create design document
|
||||||
|
- [ ] Post Gitea issue comment
|
||||||
|
|
||||||
|
### Phase 2: Cron Integration (Next PR)
|
||||||
|
- [ ] Modify `cron/scheduler.py` to load profiles
|
||||||
|
- [ ] Add `tool_profile` field to job schema
|
||||||
|
- [ ] Update existing jobs to use appropriate profiles
|
||||||
|
|
||||||
|
### Phase 3: CLI Integration (Future)
|
||||||
|
- [ ] Add `/profile` slash command to switch profiles
|
||||||
|
- [ ] Show active profile in CLI banner
|
||||||
|
- [ ] Profile-specific skills loading
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files Changed
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `~/.timmy/uniwizard/job_profiles.yaml` | Profile definitions |
|
||||||
|
| `~/.timmy/uniwizard/job_profiles_design.md` | This design document |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
1. **Should we add `disabled_tools` parameter to AIAgent?**
|
||||||
|
- Would enable true read-only triage without prompt hacks
|
||||||
|
- Requires changes to `model_tools.py` and `run_agent.py`
|
||||||
|
|
||||||
|
2. **Should profiles include model recommendations?**
|
||||||
|
- e.g., `recommended_model: claude-opus-4` for code-work
|
||||||
|
- Could help route simple jobs to cheaper models
|
||||||
|
|
||||||
|
3. **Should we support profile composition?**
|
||||||
|
- e.g., `profiles: [ops, web]` for ops jobs that need web lookup
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- Hermes toolset system: `~/.hermes/hermes-agent/toolsets.py`
|
||||||
|
- Tool filtering logic: `~/.hermes/hermes-agent/model_tools.py:get_tool_definitions()`
|
||||||
|
- Cron scheduler: `~/.hermes/hermes-agent/cron/scheduler.py:run_job()`
|
||||||
|
- AIAgent initialization: `~/.hermes/hermes-agent/run_agent.py:AIAgent.__init__()`
|
||||||
345
uniwizard/kimi-heartbeat.sh
Executable file
345
uniwizard/kimi-heartbeat.sh
Executable file
@@ -0,0 +1,345 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# kimi-heartbeat.sh — Polls Gitea for assigned-kimi issues, dispatches to KimiClaw via OpenClaw
|
||||||
|
# Zero LLM cost for polling — only calls kimi/kimi-code for actual work.
|
||||||
|
#
|
||||||
|
# Run manually: bash ~/.timmy/uniwizard/kimi-heartbeat.sh
|
||||||
|
# Runs via launchd every 2 minutes: ai.timmy.kimi-heartbeat.plist
|
||||||
|
#
|
||||||
|
# Workflow for humans:
|
||||||
|
# 1. Create or open a Gitea issue in any tracked repo
|
||||||
|
# 2. Add the "assigned-kimi" label
|
||||||
|
# 3. This script picks it up, dispatches to KimiClaw, posts results back
|
||||||
|
# 4. Label transitions: assigned-kimi → kimi-in-progress → kimi-done
|
||||||
|
#
|
||||||
|
# PLANNING: If the issue body is >500 chars or contains "##" headers,
|
||||||
|
# KimiClaw first runs a 2-minute planning pass to decompose the task.
|
||||||
|
# If it needs subtasks, it creates child issues and labels them assigned-kimi
|
||||||
|
# for the next heartbeat cycle. This prevents 10-minute timeouts on complex work.
|
||||||
|
|
||||||
|
set -euo pipefail

# --- Config ---
# API tokens, stripped of stray whitespace/newlines.
# (Redirection instead of `cat file | tr` — same result, one fewer process.)
TOKEN=$(tr -d '[:space:]' < "$HOME/.timmy/kimi_gitea_token")
TIMMY_TOKEN=$(tr -d '[:space:]' < "$HOME/.config/gitea/timmy-token")
# Prefer Tailscale (private network) over public IP
if curl -sf --connect-timeout 2 "http://100.126.61.75:3000/api/v1/version" > /dev/null 2>&1; then
    BASE="http://100.126.61.75:3000/api/v1"
else
    BASE="http://143.198.27.163:3000/api/v1"
fi
LOG="/tmp/kimi-heartbeat.log"
LOCKFILE="/tmp/kimi-heartbeat.lock"
MAX_DISPATCH=10               # max issues dispatched per heartbeat run
PLAN_TIMEOUT=120              # 2 minutes for planning pass
EXEC_TIMEOUT=480              # 8 minutes for execution pass
BODY_COMPLEXITY_THRESHOLD=500 # chars — above this triggers planning
STALE_PROGRESS_SECONDS=3600   # reclaim kimi-in-progress after 1 hour of silence

# Repos polled for assigned-kimi issues.
REPOS=(
    "Timmy_Foundation/timmy-home"
    "Timmy_Foundation/timmy-config"
    "Timmy_Foundation/the-nexus"
    "Timmy_Foundation/hermes-agent"
)
|
||||||
|
|
||||||
|
# --- Helpers ---
|
||||||
|
# Append a timestamped line to the run log, echoing it to stdout as well.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}
|
||||||
|
|
||||||
|
# True when the issue text looks like implementation work that should
# produce a PR/commit as proof rather than prose-only output.
# FIX: lowercase via tr instead of ${1,,} — the ,, expansion requires
# bash >= 4, and this script also runs under macOS's stock /bin/bash 3.2
# (it is scheduled by launchd), where ${1,,} is a fatal bad substitution.
needs_pr_proof() {
    local haystack
    haystack=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')
    [[ "$haystack" =~ implement|fix|refactor|feature|perf|performance|rebase|deploy|integration|module|script|pipeline|benchmark|cache|test|bug|build|port ]]
}
|
||||||
|
|
||||||
|
# True when the response references a PR, commit, or explicit proof marker.
# FIX: same bash-3.2 portability issue as needs_pr_proof — ${1,,} needs
# bash >= 4; use tr for the lowercase conversion instead.
has_pr_proof() {
    local haystack
    haystack=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')
    [[ "$haystack" == *"proof:"* || "$haystack" == *"pr:"* || "$haystack" == *"/pulls/"* || "$haystack" == *"commit:"* ]]
}
|
||||||
|
|
||||||
|
# Post a comment on a Gitea issue, JSON-encoding the body safely.
# Arguments: repo ("owner/name"), issue number, auth token, markdown body.
# Encoding is delegated to python3 so quotes/newlines in the body can never
# break the request payload. Failures are swallowed (best-effort comment).
post_issue_comment_json() {
    local repo="$1" issue_num="$2" token="$3" body="$4"
    local payload
    payload=$(python3 - "$body" <<'PY'
import json, sys
print(json.dumps({"body": sys.argv[1]}))
PY
)
    curl -sf -X POST \
        -H "Authorization: token $token" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$BASE/repos/$repo/issues/$issue_num/comments" > /dev/null 2>&1 || true
}
|
||||||
|
|
||||||
|
# Prevent overlapping runs
# NOTE(review): the check-then-touch sequence below is not atomic — two
# heartbeats starting in the same instant could both pass the check. With
# the 2-minute scheduling interval this is unlikely; switch to a
# mkdir-based lock if it ever matters.
if [ -f "$LOCKFILE" ]; then
    # stat -f %m is the BSD/macOS form; the `|| echo 0` fallback makes an
    # unreadable lockfile look maximally stale so it gets cleaned up.
    lock_age=$(( $(date +%s) - $(stat -f %m "$LOCKFILE" 2>/dev/null || echo 0) ))
    if [ "$lock_age" -lt 600 ]; then
        # A run younger than 10 minutes is assumed to still be working.
        log "SKIP: previous run still active (lock age: ${lock_age}s)"
        exit 0
    else
        log "WARN: stale lock (${lock_age}s), removing"
        rm -f "$LOCKFILE"
    fi
fi
# Install the cleanup trap before creating the lock so it can't be leaked.
trap 'rm -f "$LOCKFILE"' EXIT
touch "$LOCKFILE"
|
||||||
|
|
||||||
|
# --- Main dispatch loop: poll each repo, filter issues, plan/execute ---
dispatched=0

for repo in "${REPOS[@]}"; do
    # Fetch open issues with assigned-kimi label
    response=$(curl -sf -H "Authorization: token $TIMMY_TOKEN" \
        "$BASE/repos/$repo/issues?state=open&labels=assigned-kimi&limit=20" 2>/dev/null || echo "[]")

    # Filter: skip done tasks, but reclaim stale kimi-in-progress work automatically.
    # Emits one pipe-delimited line per actionable issue:
    #   number|title|body_len|reclaim_flag|updated_at|body  (pipes stripped from fields)
    issues=$(echo "$response" | python3 -c "
import json, sys, datetime
STALE = int(${STALE_PROGRESS_SECONDS})

def parse_ts(value):
    if not value:
        return None
    try:
        return datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
    except Exception:
        return None

try:
    data = json.loads(sys.stdin.buffer.read())
except:
    sys.exit(0)

now = datetime.datetime.now(datetime.timezone.utc)
for i in data:
    labels = [l['name'] for l in i.get('labels', [])]
    if 'kimi-done' in labels:
        continue

    reclaim = False
    updated_at = i.get('updated_at', '') or ''
    if 'kimi-in-progress' in labels:
        ts = parse_ts(updated_at)
        age = (now - ts).total_seconds() if ts else (STALE + 1)
        if age < STALE:
            continue
        reclaim = True

    body = (i.get('body', '') or '')
    body_len = len(body)
    body_clean = body[:1500].replace('\n', ' ').replace('|', ' ')
    title = i['title'].replace('|', ' ')
    updated_clean = updated_at.replace('|', ' ')
    reclaim_flag = 'reclaim' if reclaim else 'fresh'
    print(f\"{i['number']}|{title}|{body_len}|{reclaim_flag}|{updated_clean}|{body_clean}\")
" 2>/dev/null)

    [ -z "$issues" ] && continue

    while IFS='|' read -r issue_num title body_len reclaim_flag updated_at body; do
        [ -z "$issue_num" ] && continue
        log "FOUND: $repo #$issue_num — $title (body: ${body_len} chars, mode: ${reclaim_flag}, updated: ${updated_at})"

        # --- Get label IDs for this repo ---
        label_json=$(curl -sf -H "Authorization: token $TIMMY_TOKEN" \
            "$BASE/repos/$repo/labels" 2>/dev/null || echo "[]")

        progress_id=$(echo "$label_json" | python3 -c "import json,sys; [print(l['id']) for l in json.load(sys.stdin) if l['name']=='kimi-in-progress']" 2>/dev/null)
        done_id=$(echo "$label_json" | python3 -c "import json,sys; [print(l['id']) for l in json.load(sys.stdin) if l['name']=='kimi-done']" 2>/dev/null)
        kimi_id=$(echo "$label_json" | python3 -c "import json,sys; [print(l['id']) for l in json.load(sys.stdin) if l['name']=='assigned-kimi']" 2>/dev/null)

        if [ "$reclaim_flag" = "reclaim" ]; then
            log "RECLAIM: $repo #$issue_num — stale kimi-in-progress since $updated_at"
            [ -n "$progress_id" ] && curl -sf -X DELETE -H "Authorization: token $TIMMY_TOKEN" \
                "$BASE/repos/$repo/issues/$issue_num/labels/$progress_id" > /dev/null 2>&1 || true
            curl -sf -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
                -d "{\"body\":\"🟡 **KimiClaw reclaiming stale task.**\\nPrevious kimi-in-progress state exceeded ${STALE_PROGRESS_SECONDS}s without resolution.\\nLast update: $updated_at\\nTimestamp: $(date -u '+%Y-%m-%dT%H:%M:%SZ')\"}" \
                "$BASE/repos/$repo/issues/$issue_num/comments" > /dev/null 2>&1 || true
        fi

        # --- Add kimi-in-progress label ---
        if [ -n "$progress_id" ]; then
            curl -sf -X POST -H "Authorization: token $TIMMY_TOKEN" -H "Content-Type: application/json" \
                -d "{\"labels\":[$progress_id]}" \
                "$BASE/repos/$repo/issues/$issue_num/labels" > /dev/null 2>&1 || true
        fi

        # --- Decide: plan first or execute directly ---
        needs_planning=false
        if [ "$body_len" -gt "$BODY_COMPLEXITY_THRESHOLD" ]; then
            needs_planning=true
        fi

        if [ "$needs_planning" = true ]; then
            # =============================================
            # PHASE 1: PLANNING PASS (2 min timeout)
            # =============================================
            log "PLAN: $repo #$issue_num — complex task, running planning pass"

            curl -sf -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
                -d "{\"body\":\"🟠 **KimiClaw picking up this task** via heartbeat.\\nBackend: kimi/kimi-code (Moonshot AI)\\nMode: **Planning first** (task is complex)\\nTimestamp: $(date -u '+%Y-%m-%dT%H:%M:%SZ')\"}" \
                "$BASE/repos/$repo/issues/$issue_num/comments" > /dev/null 2>&1 || true

            plan_prompt="You are KimiClaw, a planning agent. You have 2 MINUTES.\n\nTASK: Analyze this Gitea issue and decide if you can complete it in under 8 minutes, or if it needs to be broken into subtasks.\n\nISSUE #$issue_num in $repo: $title\n\nBODY:\n$body\n\nRULES:\n- If you CAN complete this in one pass (research, write analysis, answer a question): respond with EXECUTE followed by a one-line plan.\n- If the task is TOO BIG (needs git operations, multiple repos, >2000 words of output, or multi-step implementation): respond with DECOMPOSE followed by a numbered list of 2-5 smaller subtasks. Each subtask must be completable in under 8 minutes by itself.\n- Each subtask line format: SUBTASK: <title> | <one-line description>\n- Be realistic about what fits in 8 minutes with no terminal access.\n- You CANNOT clone repos, run git, or execute code. You CAN research, analyze, write specs, review code via API, and produce documents.\n\nRespond with ONLY your decision. No preamble."

            # FIX: the fallback must be valid JSON — the old '{\"status\":\"error\"}'
            # inside single quotes kept literal backslashes and could never parse.
            plan_result=$(openclaw agent --agent main --message "$plan_prompt" --timeout $PLAN_TIMEOUT --json 2>/dev/null || echo '{"status":"error"}')
            # FIX: extract the plan with a real multi-line program. The old
            # one-liner embedded literal '\n' characters into the python source
            # (bash does not translate \n inside double quotes), so it always
            # raised SyntaxError, plan_text came back empty, and every complex
            # task silently fell through to single-pass EXECUTE.
            plan_text=$(echo "$plan_result" | python3 -c "
import json, sys
d = json.load(sys.stdin)
payloads = d.get('result', {}).get('payloads', [])
print(payloads[0]['text'] if payloads else '')
" 2>/dev/null || echo "")

            if echo "$plan_text" | grep -qi "^DECOMPOSE"; then
                # --- Create subtask issues ---
                log "DECOMPOSE: $repo #$issue_num — creating subtasks"

                # Post the plan as a comment. FIX: use the JSON-encoding helper —
                # raw interpolation of plan_text into a JSON string broke on
                # quotes/newlines (the old escaped_plan variable was never used).
                post_issue_comment_json "$repo" "$issue_num" "$TOKEN" "📝 **Planning complete — decomposing into subtasks:**

$plan_text"

                # Extract SUBTASK lines and create child issues
                echo "$plan_text" | grep -i "^SUBTASK:" | head -5 | while IFS='|' read -r sub_title sub_desc; do
                    sub_title=$(echo "$sub_title" | sed 's/^SUBTASK: *//')
                    sub_desc=$(echo "${sub_desc:-$sub_title}" | sed 's/^ *//')

                    if [ -n "$sub_title" ]; then
                        sub_body="## Parent Issue\\nChild of #$issue_num: $title\\n\\n## Task\\n$sub_desc\\n\\n## Constraints\\n- Must complete in under 8 minutes\\n- No git/terminal operations\\n- Post results as analysis/documentation\\n\\n## Assignee\\n@KimiClaw"

                        # NOTE(review): sub_title/sub_body are interpolated raw into
                        # JSON — a double quote in a subtask title would break this
                        # request. Acceptable for model-generated titles; consider
                        # a python json.dumps payload if it ever fails.
                        curl -sf -X POST -H "Authorization: token $TIMMY_TOKEN" -H "Content-Type: application/json" \
                            -d "{\"title\":\"[SUB] $sub_title\",\"body\":\"$sub_body\"}" \
                            "$BASE/repos/$repo/issues" > /dev/null 2>&1

                        # Get the issue number of what we just created and label it
                        new_num=$(curl -sf -H "Authorization: token $TIMMY_TOKEN" \
                            "$BASE/repos/$repo/issues?state=open&limit=1&type=issues" | \
                            python3 -c "import json,sys; d=json.load(sys.stdin); print(d[0]['number'] if d else '')" 2>/dev/null)

                        if [ -n "$new_num" ] && [ -n "$kimi_id" ]; then
                            curl -sf -X POST -H "Authorization: token $TIMMY_TOKEN" -H "Content-Type: application/json" \
                                -d "{\"labels\":[$kimi_id]}" \
                                "$BASE/repos/$repo/issues/$new_num/labels" > /dev/null 2>&1 || true
                            log "SUBTASK: $repo #$new_num — $sub_title"
                        fi
                    fi
                done

                # Mark parent as kimi-done (subtasks will be picked up next cycle)
                [ -n "$progress_id" ] && curl -sf -X DELETE -H "Authorization: token $TIMMY_TOKEN" \
                    "$BASE/repos/$repo/issues/$issue_num/labels/$progress_id" > /dev/null 2>&1 || true
                [ -n "$done_id" ] && curl -sf -X POST -H "Authorization: token $TIMMY_TOKEN" -H "Content-Type: application/json" \
                    -d "{\"labels\":[$done_id]}" \
                    "$BASE/repos/$repo/issues/$issue_num/labels" > /dev/null 2>&1 || true

                dispatched=$((dispatched + 1))
                log "PLANNED: $repo #$issue_num — subtasks created, parent marked done"

            else
                # --- Plan says EXECUTE — proceed to execution ---
                log "EXECUTE: $repo #$issue_num — planning pass says single-pass OK"
                # Fall through to execution below
                needs_planning=false
            fi
        fi

        if [ "$needs_planning" = false ]; then
            # =============================================
            # PHASE 2: EXECUTION PASS (8 min timeout)
            # =============================================

            # Post pickup comment if we didn't already (simple tasks skip planning)
            if [ "$body_len" -le "$BODY_COMPLEXITY_THRESHOLD" ]; then
                curl -sf -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
                    -d "{\"body\":\"🟠 **KimiClaw picking up this task** via heartbeat.\\nBackend: kimi/kimi-code (Moonshot AI)\\nMode: **Direct execution** (task fits in one pass)\\nTimestamp: $(date -u '+%Y-%m-%dT%H:%M:%SZ')\"}" \
                    "$BASE/repos/$repo/issues/$issue_num/comments" > /dev/null 2>&1 || true
            fi

            log "DISPATCH: $repo #$issue_num to openclaw (timeout: ${EXEC_TIMEOUT}s)"

            exec_prompt="You are KimiClaw, an AI agent powered by Kimi K2.5 (Moonshot AI).
You are working on Gitea issue #$issue_num in repo $repo.
You have 8 MINUTES maximum. Be concise and focused.

ISSUE TITLE: $title

ISSUE BODY:
$body

YOUR TASK:
1. Read the issue carefully and do the work described
2. Stay focused — deliver the core ask, skip nice-to-haves
3. Provide your COMPLETE results as your response (use markdown)
4. If you realize mid-task this will take longer than 8 minutes, STOP and summarize what you've done so far plus what remains"

            # --- Dispatch to OpenClaw (background) ---
            (
                result=$(openclaw agent --agent main --message "$exec_prompt" --timeout $EXEC_TIMEOUT --json 2>/dev/null || echo '{"status":"error"}')
                status=$(echo "$result" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")

                # Extract response text
                response_text=$(echo "$result" | python3 -c "
import json,sys
d = json.load(sys.stdin)
payloads = d.get('result',{}).get('payloads',[])
print(payloads[0]['text'][:3000] if payloads else 'No response')
" 2>/dev/null || echo "No response")

                if [ "$status" = "ok" ] && [ "$response_text" != "No response" ]; then
                    # FIX: pass raw text to post_issue_comment_json, which
                    # JSON-encodes internally. The old pre-escaping pass got
                    # encoded twice and rendered literal \n in the comments.
                    if needs_pr_proof "$title $body" && ! has_pr_proof "$response_text"; then
                        log "BLOCKED: $repo #$issue_num — response lacked PR/proof for code task"
                        post_issue_comment_json "$repo" "$issue_num" "$TOKEN" "🟡 **KimiClaw produced analysis only — no PR/proof detected.**

This issue looks like implementation work, so it is NOT being marked kimi-done.
Kimi response excerpt:

$response_text

Action: removing Kimi queue labels so a code-capable agent can pick it up."
                        [ -n "$progress_id" ] && curl -sf -X DELETE -H "Authorization: token $TIMMY_TOKEN" \
                            "$BASE/repos/$repo/issues/$issue_num/labels/$progress_id" > /dev/null 2>&1 || true
                        [ -n "$kimi_id" ] && curl -sf -X DELETE -H "Authorization: token $TIMMY_TOKEN" \
                            "$BASE/repos/$repo/issues/$issue_num/labels/$kimi_id" > /dev/null 2>&1 || true
                    else
                        log "COMPLETED: $repo #$issue_num"
                        post_issue_comment_json "$repo" "$issue_num" "$TOKEN" "🟢 **KimiClaw result:**

$response_text"

                        [ -n "$progress_id" ] && curl -sf -X DELETE -H "Authorization: token $TIMMY_TOKEN" \
                            "$BASE/repos/$repo/issues/$issue_num/labels/$progress_id" > /dev/null 2>&1 || true
                        [ -n "$done_id" ] && curl -sf -X POST -H "Authorization: token $TIMMY_TOKEN" -H "Content-Type: application/json" \
                            -d "{\"labels\":[$done_id]}" \
                            "$BASE/repos/$repo/issues/$issue_num/labels" > /dev/null 2>&1 || true
                    fi
                else
                    log "FAILED: $repo #$issue_num — status=$status"

                    curl -sf -X POST -H "Authorization: token $TOKEN" -H "Content-Type: application/json" \
                        -d "{\"body\":\"\ud83d\udd34 **KimiClaw failed/timed out.**\\nStatus: $status\\nTimestamp: $(date -u '+%Y-%m-%dT%H:%M:%SZ')\\n\\nTask may be too complex for single-pass execution. Consider breaking into smaller subtasks.\"}" \
                        "$BASE/repos/$repo/issues/$issue_num/comments" > /dev/null 2>&1 || true

                    # Remove kimi-in-progress on failure
                    [ -n "$progress_id" ] && curl -sf -X DELETE -H "Authorization: token $TIMMY_TOKEN" \
                        "$BASE/repos/$repo/issues/$issue_num/labels/$progress_id" > /dev/null 2>&1 || true
                fi
            ) &

            dispatched=$((dispatched + 1))
            log "DISPATCHED: $repo #$issue_num (background PID $!)"
        fi

        # Enforce dispatch cap
        if [ "$dispatched" -ge "$MAX_DISPATCH" ]; then
            log "CAPPED: reached $MAX_DISPATCH dispatches, remaining issues deferred to next heartbeat"
            break 2 # Break out of both loops
        fi

        # Stagger dispatches to avoid overwhelming kimi
        sleep 3

    done <<< "$issues"
done
|
||||||
|
|
||||||
|
# Final heartbeat summary line.
if [ "$dispatched" -gt 0 ]; then
    log "Heartbeat: dispatched $dispatched task(s)"
else
    log "Heartbeat: no pending tasks"
fi
|
||||||
642
uniwizard/quality_scorer.py
Normal file
642
uniwizard/quality_scorer.py
Normal file
@@ -0,0 +1,642 @@
|
|||||||
|
"""
|
||||||
|
Uniwizard Backend Quality Scorer
|
||||||
|
|
||||||
|
Tracks per-backend performance metrics and provides intelligent routing recommendations.
|
||||||
|
Uses a rolling window of last 100 responses per backend across 5 task types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, List, Dict, Tuple
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
|
||||||
|
class TaskType(Enum):
    """Task types for backend specialization tracking.

    The string values are what record_response() stores in the
    responses.task_type column, so they must stay stable.
    """
    CODE = "code"
    REASONING = "reasoning"
    RESEARCH = "research"
    CREATIVE = "creative"
    FAST_OPS = "fast_ops"
|
||||||
|
|
||||||
|
|
||||||
|
class ResponseStatus(Enum):
    """Status of a backend response.

    The string values are persisted in the responses.status column and
    matched literally by the score-aggregation SQL, so they must stay stable.
    """
    SUCCESS = "success"
    ERROR = "error"
    REFUSAL = "refusal"
    TIMEOUT = "timeout"
|
||||||
|
|
||||||
|
|
||||||
|
# The 7 Uniwizard backends.
# QualityScorer rejects (ValueError) any backend name not in this list.
BACKENDS = [
    "anthropic",
    "openai-codex",
    "gemini",
    "groq",
    "grok",
    "kimi-coding",
    "openrouter",
]

# Default DB path — per-response metrics live in a SQLite file under ~/.timmy.
DEFAULT_DB_PATH = Path.home() / ".timmy" / "uniwizard" / "quality_scores.db"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BackendScore:
    """Aggregated score card for a backend on a specific task type."""
    backend: str            # backend name (one of BACKENDS)
    task_type: str          # task type this aggregate covers
    total_requests: int     # responses counted in the rolling window
    success_count: int      # responses with status == "success"
    error_count: int        # responses with status == "error"
    refusal_count: int      # responses with status == "refusal"
    timeout_count: int      # responses with status == "timeout"
    avg_latency_ms: float   # mean total latency over the window
    avg_ttft_ms: float      # mean time-to-first-token over the window
    p95_latency_ms: float   # 95th-percentile latency over the window
    score: float  # Composite quality score (0-100)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ResponseRecord:
    """Single response record for storage (mirrors the responses table)."""
    id: Optional[int]        # SQLite rowid; None before the row is inserted
    backend: str             # backend name (one of BACKENDS)
    task_type: str           # task type string (TaskType value)
    status: str              # response status string (ResponseStatus value)
    latency_ms: float        # total response latency
    ttft_ms: float  # Time to first token
    timestamp: float         # Unix epoch seconds (time.time())
    metadata: Optional[str]  # JSON string for extensibility
|
||||||
|
|
||||||
|
|
||||||
|
class QualityScorer:
    """
    Tracks backend quality metrics with rolling windows.

    Stores per-response data in SQLite, computes aggregated scores
    on-demand for routing decisions.
    """

    # Only the most recent N responses per (backend, task_type) pair are
    # retained; older rows are pruned on each record_response() call.
    ROLLING_WINDOW_SIZE = 100

    # Score weights for composite calculation (they sum to 1.0).
    WEIGHTS = {
        "success_rate": 0.35,
        "low_error_rate": 0.20,
        "low_refusal_rate": 0.15,
        "low_timeout_rate": 0.10,
        "low_latency": 0.20,
    }
|
||||||
|
|
||||||
|
def __init__(self, db_path: Optional[Path] = None):
|
||||||
|
self.db_path = Path(db_path) if db_path else DEFAULT_DB_PATH
|
||||||
|
self._init_db()
|
||||||
|
|
||||||
|
    @contextmanager
    def _get_conn(self):
        """Get a database connection with proper cleanup.

        Yields a sqlite3.Connection whose rows support access by column
        name. Commits only when the managed block completes without
        raising; the connection is closed in all cases, so changes made
        before an exception are not persisted.
        """
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row  # rows usable as row['column']
        try:
            yield conn
            conn.commit()  # reached only if the with-body did not raise
        finally:
            conn.close()
|
||||||
|
|
||||||
|
    def _init_db(self):
        """Initialize the SQLite database schema.

        Idempotent: uses IF NOT EXISTS for the table and both indexes,
        and creates the DB file's parent directory when missing.
        """
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        with self._get_conn() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS responses (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    backend TEXT NOT NULL,
                    task_type TEXT NOT NULL,
                    status TEXT NOT NULL,
                    latency_ms REAL NOT NULL,
                    ttft_ms REAL NOT NULL,
                    timestamp REAL NOT NULL,
                    metadata TEXT
                )
            """)

            # Index for fast rolling window queries
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_backend_task_time
                ON responses(backend, task_type, timestamp DESC)
            """)

            # Covers backend-wide queries that do not filter on task_type.
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_backend_time
                ON responses(backend, timestamp DESC)
            """)
|
||||||
|
|
||||||
|
def record_response(
|
||||||
|
self,
|
||||||
|
backend: str,
|
||||||
|
task_type: str,
|
||||||
|
status: ResponseStatus,
|
||||||
|
latency_ms: float,
|
||||||
|
ttft_ms: float,
|
||||||
|
metadata: Optional[Dict] = None
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Record a response from a backend.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
backend: Backend name (must be in BACKENDS)
|
||||||
|
task_type: Task type string or TaskType enum
|
||||||
|
status: ResponseStatus (success/error/refusal/timeout)
|
||||||
|
latency_ms: Total response latency in milliseconds
|
||||||
|
ttft_ms: Time to first token in milliseconds
|
||||||
|
metadata: Optional dict with additional context
|
||||||
|
"""
|
||||||
|
if backend not in BACKENDS:
|
||||||
|
raise ValueError(f"Unknown backend: {backend}. Must be one of: {BACKENDS}")
|
||||||
|
|
||||||
|
task_str = task_type.value if isinstance(task_type, TaskType) else task_type
|
||||||
|
|
||||||
|
with self._get_conn() as conn:
|
||||||
|
conn.execute("""
|
||||||
|
INSERT INTO responses (backend, task_type, status, latency_ms, ttft_ms, timestamp, metadata)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""", (
|
||||||
|
backend,
|
||||||
|
task_str,
|
||||||
|
status.value,
|
||||||
|
latency_ms,
|
||||||
|
ttft_ms,
|
||||||
|
time.time(),
|
||||||
|
json.dumps(metadata) if metadata else None
|
||||||
|
))
|
||||||
|
|
||||||
|
# Prune old records to maintain rolling window
|
||||||
|
self._prune_rolling_window(conn, backend, task_str)
|
||||||
|
|
||||||
|
def _prune_rolling_window(self, conn: sqlite3.Connection, backend: str, task_type: str) -> None:
|
||||||
|
"""Remove records beyond the rolling window size for this backend/task combo."""
|
||||||
|
# Get IDs to keep (most recent ROLLING_WINDOW_SIZE)
|
||||||
|
cursor = conn.execute("""
|
||||||
|
SELECT id FROM responses
|
||||||
|
WHERE backend = ? AND task_type = ?
|
||||||
|
ORDER BY timestamp DESC
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
""", (backend, task_type, self.ROLLING_WINDOW_SIZE, self.ROLLING_WINDOW_SIZE))
|
||||||
|
|
||||||
|
ids_to_delete = [row[0] for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
if ids_to_delete:
|
||||||
|
placeholders = ','.join('?' * len(ids_to_delete))
|
||||||
|
conn.execute(f"""
|
||||||
|
DELETE FROM responses
|
||||||
|
WHERE id IN ({placeholders})
|
||||||
|
""", ids_to_delete)
|
||||||
|
|
||||||
|
def get_backend_score(
    self,
    backend: str,
    task_type: Optional[str] = None
) -> BackendScore:
    """
    Get aggregated score for a backend, optionally filtered by task type.

    Args:
        backend: Backend name
        task_type: Optional task type filter

    Returns:
        BackendScore with aggregated metrics over the rolling window.

    Raises:
        ValueError: If ``backend`` is not a known backend.
    """
    if backend not in BACKENDS:
        raise ValueError(f"Unknown backend: {backend}")

    # Build the filter once instead of duplicating the whole aggregate
    # query for the with/without-task_type cases (the two copies in the
    # original had already started to invite divergence).
    where = "WHERE backend = ?"
    params = [backend]
    if task_type:
        where += " AND task_type = ?"
        params.append(task_type)
    params.append(self.ROLLING_WINDOW_SIZE)

    with self._get_conn() as conn:
        row = conn.execute(f"""
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successes,
                SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors,
                SUM(CASE WHEN status = 'refusal' THEN 1 ELSE 0 END) as refusals,
                SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeouts,
                AVG(latency_ms) as avg_latency,
                AVG(ttft_ms) as avg_ttft,
                MAX(latency_ms) as max_latency
            FROM (
                SELECT * FROM responses
                {where}
                ORDER BY timestamp DESC
                LIMIT ?
            )
        """, params).fetchone()

        total = row[0] or 0

        if total == 0:
            # No observations yet: return a zeroed score card.
            return BackendScore(
                backend=backend,
                task_type=task_type or "all",
                total_requests=0,
                success_count=0,
                error_count=0,
                refusal_count=0,
                timeout_count=0,
                avg_latency_ms=0.0,
                avg_ttft_ms=0.0,
                p95_latency_ms=0.0,
                score=0.0
            )

        successes = row[1] or 0
        errors = row[2] or 0
        refusals = row[3] or 0
        timeouts = row[4] or 0
        avg_latency = row[5] or 0.0
        avg_ttft = row[6] or 0.0

        # Calculate P95 latency over the same rolling window.
        p95 = self._get_p95_latency(conn, backend, task_type)

        # Composite 0-100 quality score.
        score = self._calculate_score(
            total, successes, errors, refusals, timeouts, avg_latency
        )

        return BackendScore(
            backend=backend,
            task_type=task_type or "all",
            total_requests=total,
            success_count=successes,
            error_count=errors,
            refusal_count=refusals,
            timeout_count=timeouts,
            avg_latency_ms=round(avg_latency, 2),
            avg_ttft_ms=round(avg_ttft, 2),
            p95_latency_ms=round(p95, 2),
            score=round(score, 2)
        )
def _get_p95_latency(
|
||||||
|
self,
|
||||||
|
conn: sqlite3.Connection,
|
||||||
|
backend: str,
|
||||||
|
task_type: Optional[str]
|
||||||
|
) -> float:
|
||||||
|
"""Calculate P95 latency from rolling window."""
|
||||||
|
if task_type:
|
||||||
|
row = conn.execute("""
|
||||||
|
SELECT latency_ms FROM responses
|
||||||
|
WHERE backend = ? AND task_type = ?
|
||||||
|
ORDER BY timestamp DESC
|
||||||
|
LIMIT ?
|
||||||
|
""", (backend, task_type, self.ROLLING_WINDOW_SIZE)).fetchall()
|
||||||
|
else:
|
||||||
|
row = conn.execute("""
|
||||||
|
SELECT latency_ms FROM responses
|
||||||
|
WHERE backend = ?
|
||||||
|
ORDER BY timestamp DESC
|
||||||
|
LIMIT ?
|
||||||
|
""", (backend, self.ROLLING_WINDOW_SIZE)).fetchall()
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
latencies = sorted([r[0] for r in row])
|
||||||
|
idx = int(len(latencies) * 0.95)
|
||||||
|
return latencies[min(idx, len(latencies) - 1)]
|
||||||
|
|
||||||
|
def _calculate_score(
|
||||||
|
self,
|
||||||
|
total: int,
|
||||||
|
successes: int,
|
||||||
|
errors: int,
|
||||||
|
refusals: int,
|
||||||
|
timeouts: int,
|
||||||
|
avg_latency: float
|
||||||
|
) -> float:
|
||||||
|
"""
|
||||||
|
Calculate composite quality score (0-100).
|
||||||
|
|
||||||
|
Higher is better. Considers success rate, error/refusal/timeout rates,
|
||||||
|
and normalized latency.
|
||||||
|
"""
|
||||||
|
if total == 0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
success_rate = successes / total
|
||||||
|
error_rate = errors / total
|
||||||
|
refusal_rate = refusals / total
|
||||||
|
timeout_rate = timeouts / total
|
||||||
|
|
||||||
|
# Normalize latency: assume 5000ms is "bad" (score 0), 100ms is "good" (score 1)
|
||||||
|
# Using exponential decay for latency scoring
|
||||||
|
latency_score = max(0, min(1, 1 - (avg_latency / 10000)))
|
||||||
|
|
||||||
|
score = (
|
||||||
|
self.WEIGHTS["success_rate"] * success_rate * 100 +
|
||||||
|
self.WEIGHTS["low_error_rate"] * (1 - error_rate) * 100 +
|
||||||
|
self.WEIGHTS["low_refusal_rate"] * (1 - refusal_rate) * 100 +
|
||||||
|
self.WEIGHTS["low_timeout_rate"] * (1 - timeout_rate) * 100 +
|
||||||
|
self.WEIGHTS["low_latency"] * latency_score * 100
|
||||||
|
)
|
||||||
|
|
||||||
|
return max(0, min(100, score))
|
||||||
|
|
||||||
|
def recommend_backend(
    self,
    task_type: Optional[str] = None,
    min_samples: int = 5
) -> List[Tuple[str, float]]:
    """
    Get a ranked list of backends for a task type.

    Args:
        task_type: Optional task type to specialize for
        min_samples: Minimum samples before considering a backend

    Returns:
        List of (backend_name, score) tuples, sorted by score descending.
        Backends with fewer than ``min_samples`` observations are
        down-weighted proportionally rather than excluded.
    """
    ranked: List[Tuple[str, float]] = []
    for name in BACKENDS:
        card = self.get_backend_score(name, task_type)
        # confidence == 1.0 once the backend has enough samples.
        confidence = min(1.0, card.total_requests / min_samples)
        ranked.append((name, round(card.score * confidence, 2)))

    return sorted(ranked, key=lambda item: item[1], reverse=True)
def get_all_scores(
    self,
    task_type: Optional[str] = None
) -> Dict[str, BackendScore]:
    """Get a score card for every known backend, keyed by backend name."""
    cards: Dict[str, BackendScore] = {}
    for name in BACKENDS:
        cards[name] = self.get_backend_score(name, task_type)
    return cards
def get_task_breakdown(self, backend: str) -> Dict[str, BackendScore]:
    """Get per-task-type score cards for a single backend.

    Raises:
        ValueError: If ``backend`` is not a known backend.
    """
    if backend not in BACKENDS:
        raise ValueError(f"Unknown backend: {backend}")

    breakdown: Dict[str, BackendScore] = {}
    for task in TaskType:
        breakdown[task.value] = self.get_backend_score(backend, task.value)
    return breakdown
def get_stats(self) -> Dict:
    """Get overall database statistics.

    Returns:
        Dict with total record count, per-backend and per-task-type
        counts (zero-filled for combinations with no data), and
        ISO-formatted oldest/newest record timestamps (None when the
        table is empty).
    """
    with self._get_conn() as conn:
        # One GROUP BY per dimension instead of a separate COUNT query
        # per backend and per task type (the original issued N+M queries).
        backend_counts = dict(conn.execute(
            "SELECT backend, COUNT(*) FROM responses GROUP BY backend"
        ).fetchall())
        by_backend = {backend: backend_counts.get(backend, 0) for backend in BACKENDS}

        task_counts = dict(conn.execute(
            "SELECT task_type, COUNT(*) FROM responses GROUP BY task_type"
        ).fetchall())
        by_task = {task.value: task_counts.get(task.value, 0) for task in TaskType}

        # COUNT/MIN/MAX in a single scan.
        total, oldest, newest = conn.execute(
            "SELECT COUNT(*), MIN(timestamp), MAX(timestamp) FROM responses"
        ).fetchone()

        return {
            "total_records": total,
            "by_backend": by_backend,
            "by_task_type": by_task,
            "oldest_record": datetime.fromtimestamp(oldest).isoformat() if oldest else None,
            "newest_record": datetime.fromtimestamp(newest).isoformat() if newest else None,
        }
def clear_data(self) -> None:
    """Delete every recorded response (primarily for tests)."""
    with self._get_conn() as conn:
        conn.execute("DELETE FROM responses")
def print_score_report(scorer: QualityScorer, task_type: Optional[str] = None) -> None:
    """
    Print a formatted score report to console.

    Renders a ranked table of backends (score, success rate, average
    latency, sample count) with a score bar per row, then a detailed
    per-backend breakdown, and finally a top-3 recommendation list.

    Args:
        scorer: QualityScorer instance
        task_type: Optional task type filter
    """
    print("\n" + "=" * 80)
    print(" UNIWIZARD BACKEND QUALITY SCORES")
    print("=" * 80)

    if task_type:
        print(f"\n Task Type: {task_type.upper()}")
    else:
        print("\n Overall Performance (all task types)")

    print("-" * 80)

    # Ranked (backend, adjusted score) pairs plus full cards for detail rows.
    scores = scorer.recommend_backend(task_type)
    all_scores = scorer.get_all_scores(task_type)

    # Header
    print(f"\n {'Rank':<6} {'Backend':<16} {'Score':<8} {'Success':<10} {'Latency':<12} {'Samples':<8}")
    print(" " + "-" * 72)

    # Rankings
    for rank, (backend, score) in enumerate(scores, 1):
        card = all_scores[backend]
        # Guard against division by zero for backends with no data yet.
        success_pct = (card.success_count / card.total_requests * 100) if card.total_requests > 0 else 0

        bar_len = int(score / 5)  # 20 chars = 100
        bar = "█" * bar_len + "░" * (20 - bar_len)

        print(f" {rank:<6} {backend:<16} {score:>6.1f} {success_pct:>6.1f}% {card.avg_latency_ms:>7.1f}ms {card.total_requests:>6}")
        print(f" [{bar}]")

    # Per-backend breakdown
    print("\n" + "-" * 80)
    print(" DETAILED BREAKDOWN")
    print("-" * 80)

    for backend in BACKENDS:
        card = all_scores[backend]
        if card.total_requests == 0:
            print(f"\n {backend}: No data yet")
            continue

        print(f"\n {backend.upper()}:")
        print(f" Requests: {card.total_requests} | "
              f"Success: {card.success_count} | "
              f"Errors: {card.error_count} | "
              f"Refusals: {card.refusal_count} | "
              f"Timeouts: {card.timeout_count}")
        print(f" Avg Latency: {card.avg_latency_ms}ms | "
              f"TTFT: {card.avg_ttft_ms}ms | "
              f"P95: {card.p95_latency_ms}ms")
        print(f" Quality Score: {card.score}/100")

    # Recommendations
    print("\n" + "=" * 80)
    print(" RECOMMENDATIONS")
    print("=" * 80)

    recommendations = scorer.recommend_backend(task_type)
    # Only surface backends that actually accumulated a positive score.
    top_3 = [b for b, s in recommendations[:3] if s > 0]

    if top_3:
        print(f"\n Best backends{f' for {task_type}' if task_type else ''}:")
        for i, backend in enumerate(top_3, 1):
            score = next(s for b, s in recommendations if b == backend)
            print(f" {i}. {backend} (score: {score})")
    else:
        print("\n Not enough data for recommendations yet.")

    print("\n" + "=" * 80)
def print_full_report(scorer: QualityScorer) -> None:
    """Print a complete report including per-task-type breakdowns.

    Combines database statistics, the overall score report, and the
    top-3 backends for each TaskType.
    """
    print("\n" + "=" * 80)
    print(" UNIWIZARD BACKEND QUALITY SCORECARD")
    print("=" * 80)

    stats = scorer.get_stats()
    print(f"\n Database: {scorer.db_path}")
    print(f" Total Records: {stats['total_records']}")
    print(f" Date Range: {stats['oldest_record'] or 'N/A'} to {stats['newest_record'] or 'N/A'}")

    # Overall scores
    print_score_report(scorer)

    # Per-task breakdown
    print("\n" + "=" * 80)
    print(" PER-TASK SPECIALIZATION")
    print("=" * 80)

    for task in TaskType:
        print(f"\n{'─' * 80}")
        scores = scorer.recommend_backend(task.value)
        print(f"\n {task.value.upper()}:")

        # Show only the top three backends with a non-zero score.
        for rank, (backend, score) in enumerate(scores[:3], 1):
            if score > 0:
                print(f" {rank}. {backend}: {score}")

    print("\n" + "=" * 80)
# Convenience functions for CLI usage
|
||||||
|
def get_scorer(db_path: Optional[Path] = None) -> QualityScorer:
    """Construct a QualityScorer for *db_path* (default location when None)."""
    return QualityScorer(db_path)
def record(
    backend: str,
    task_type: str,
    status: str,
    latency_ms: float,
    ttft_ms: float = 0.0,
    metadata: Optional[Dict] = None
) -> None:
    """Convenience wrapper: record one response via a default scorer."""
    get_scorer().record_response(
        backend=backend,
        task_type=task_type,
        status=ResponseStatus(status),
        latency_ms=latency_ms,
        ttft_ms=ttft_ms,
        metadata=metadata
    )
def recommend(task_type: Optional[str] = None) -> List[Tuple[str, float]]:
    """Convenience wrapper: ranked (backend, score) list from a default scorer."""
    return get_scorer().recommend_backend(task_type)
def report(task_type: Optional[str] = None) -> None:
    """Convenience wrapper: print the score report using a default scorer."""
    print_score_report(get_scorer(), task_type)
def full_report() -> None:
    """Convenience wrapper: print the full report using a default scorer."""
    print_full_report(get_scorer())
if __name__ == "__main__":
    # Demo mode: seed the database with synthetic traffic when it is
    # empty, then print the full scorecard.
    scorer = QualityScorer()

    stats = scorer.get_stats()
    if stats["total_records"] == 0:
        print("Generating demo data...")
        import random

        for _ in range(50):
            scorer.record_response(
                backend=random.choice(BACKENDS),
                task_type=random.choice([t.value for t in TaskType]),
                status=random.choices(
                    [ResponseStatus.SUCCESS, ResponseStatus.ERROR, ResponseStatus.REFUSAL, ResponseStatus.TIMEOUT],
                    weights=[0.85, 0.08, 0.05, 0.02]
                )[0],
                # BUG FIX: random.gauss can return negative samples, which
                # would produce impossible negative latencies and skew the
                # demo averages — clamp both timings at zero.
                latency_ms=max(0.0, random.gauss(1500, 500)),
                ttft_ms=max(0.0, random.gauss(200, 100))
            )

    full_report()
769
uniwizard/self_grader.py
Normal file
769
uniwizard/self_grader.py
Normal file
@@ -0,0 +1,769 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Self-Grader Module for Timmy/UniWizard
|
||||||
|
|
||||||
|
Grades Hermes session logs to identify patterns in failures and track improvement.
|
||||||
|
Connects to quality scoring (#98) and adaptive routing (#88).
|
||||||
|
|
||||||
|
Author: Timmy (UniWizard)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import List, Dict, Optional, Any, Tuple
|
||||||
|
from collections import defaultdict
|
||||||
|
import statistics
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SessionGrade:
    """Grade for a single session.

    One record per graded Hermes session log; field layout mirrors a row
    of the ``session_grades`` SQLite table.
    """
    session_id: str  # session identifier from the log (UNIQUE in the DB)
    session_file: str  # basename of the graded session file
    graded_at: str  # ISO timestamp of when grading ran

    # Core metrics
    task_completed: bool
    tool_calls_efficient: int  # 1-5 scale
    response_quality: int  # 1-5 scale
    errors_recovered: bool
    total_api_calls: int  # number of assistant turns in the session

    # Additional metadata
    model: str
    platform: Optional[str]
    session_start: str
    duration_seconds: Optional[float]  # None when message timestamps are missing/unparsable
    task_summary: str  # first 200 chars of the first user message

    # Error analysis
    total_errors: int
    error_types: str  # JSON list of error categories
    tools_with_errors: str  # JSON list of tool names

    # Pattern flags
    had_repeated_errors: bool  # 3+ consecutive tool errors
    had_infinite_loop_risk: bool  # 5+ consecutive errors or >50 assistant turns
    had_user_clarification: bool
@dataclass
class WeeklyReport:
    """Weekly improvement report.

    Aggregated view over one week of session grades; presumably
    serialized into ``weekly_reports.report_json`` — confirm against the
    writer, which is outside this chunk.
    """
    week_start: str
    week_end: str
    total_sessions: int
    avg_tool_efficiency: float  # mean of the 1-5 efficiency grades
    avg_response_quality: float  # mean of the 1-5 quality grades
    completion_rate: float
    error_recovery_rate: float

    # Patterns
    worst_task_types: List[Tuple[str, float]]
    most_error_prone_tools: List[Tuple[str, int]]
    common_error_patterns: List[Tuple[str, int]]

    # Trends
    improvement_suggestions: List[str]
class SelfGrader:
|
||||||
|
"""Grades Hermes sessions and tracks improvement patterns."""
|
||||||
|
|
||||||
|
# Error pattern regexes
|
||||||
|
# Category -> compiled pattern. _classify_error returns the first key
# whose pattern matches a tool result, checked in insertion order, so
# earlier entries take precedence.
ERROR_PATTERNS = {
    'file_not_found': re.compile(r'file.*not found|no such file|does not exist', re.I),
    'permission_denied': re.compile(r'permission denied|access denied|unauthorized', re.I),
    'timeout': re.compile(r'time(d)?\s*out|deadline exceeded', re.I),
    'api_error': re.compile(r'api.*error|rate limit|too many requests', re.I),
    'syntax_error': re.compile(r'syntax error|invalid syntax|parse error', re.I),
    'command_failed': re.compile(r'exit_code.*[1-9]|command.*failed|failed to', re.I),
    'network_error': re.compile(r'network|connection|unreachable|refused', re.I),
    'tool_not_found': re.compile(r'tool.*not found|unknown tool|no tool named', re.I),
}
|
||||||
|
# Task type patterns
|
||||||
|
# Task category -> compiled pattern, matched case-insensitively.
# NOTE(review): presumably matched against the session's task text to
# bucket sessions by kind of work — usage is outside this chunk; verify.
TASK_PATTERNS = {
    'code_review': re.compile(r'code review|review.*code|review.*pr|pull request', re.I),
    'debugging': re.compile(r'debug|fix.*bug|troubleshoot|error.*fix', re.I),
    'feature_impl': re.compile(r'implement|add.*feature|build.*function', re.I),
    'refactoring': re.compile(r'refactor|clean.*up|reorganize|restructure', re.I),
    'documentation': re.compile(r'document|readme|docstring|comment', re.I),
    'testing': re.compile(r'test|pytest|unit test|integration test', re.I),
    'research': re.compile(r'research|investigate|look up|find.*about', re.I),
    'deployment': re.compile(r'deploy|release|publish|push.*prod', re.I),
    'data_analysis': re.compile(r'analyze.*data|process.*file|parse.*json|csv', re.I),
    'infrastructure': re.compile(r'server|docker|kubernetes|terraform|ansible', re.I),
}
def __init__(self, grades_db_path: Optional[Path] = None,
|
||||||
|
sessions_dir: Optional[Path] = None):
|
||||||
|
"""Initialize the grader with database and sessions directory."""
|
||||||
|
self.grades_db_path = Path(grades_db_path) if grades_db_path else Path.home() / ".timmy" / "uniwizard" / "session_grades.db"
|
||||||
|
self.sessions_dir = Path(sessions_dir) if sessions_dir else Path.home() / ".hermes" / "sessions"
|
||||||
|
self._init_database()
|
||||||
|
|
||||||
|
def _init_database(self):
    """Initialize the SQLite database with schema.

    Creates the parent directory, the ``session_grades`` table (one row
    per graded session), supporting indexes, and the ``weekly_reports``
    table. All statements are idempotent (IF NOT EXISTS), so this is
    safe to call on every construction.
    """
    self.grades_db_path.parent.mkdir(parents=True, exist_ok=True)

    with sqlite3.connect(self.grades_db_path) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS session_grades (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT UNIQUE NOT NULL,
                session_file TEXT NOT NULL,
                graded_at TEXT NOT NULL,

                -- Core metrics
                task_completed INTEGER NOT NULL,
                tool_calls_efficient INTEGER NOT NULL,
                response_quality INTEGER NOT NULL,
                errors_recovered INTEGER NOT NULL,
                total_api_calls INTEGER NOT NULL,

                -- Metadata
                model TEXT,
                platform TEXT,
                session_start TEXT,
                duration_seconds REAL,
                task_summary TEXT,

                -- Error analysis
                total_errors INTEGER NOT NULL,
                error_types TEXT,
                tools_with_errors TEXT,

                -- Pattern flags
                had_repeated_errors INTEGER NOT NULL,
                had_infinite_loop_risk INTEGER NOT NULL,
                had_user_clarification INTEGER NOT NULL
            )
        """)

        # Index for efficient queries
        conn.execute("CREATE INDEX IF NOT EXISTS idx_graded_at ON session_grades(graded_at)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_session_start ON session_grades(session_start)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_model ON session_grades(model)")

        # Weekly reports table
        conn.execute("""
            CREATE TABLE IF NOT EXISTS weekly_reports (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                week_start TEXT UNIQUE NOT NULL,
                week_end TEXT NOT NULL,
                generated_at TEXT NOT NULL,
                report_json TEXT NOT NULL
            )
        """)

        conn.commit()
def grade_session_file(self, session_path: Path) -> Optional[SessionGrade]:
    """Grade a single session file.

    Loads the JSON session log, runs the message-flow analysis, and
    derives the four core grades plus metadata.

    Args:
        session_path: Path to a Hermes session JSON file.

    Returns:
        A populated SessionGrade, or None if the file is unreadable,
        not valid JSON, or contains no messages.
    """
    try:
        with open(session_path) as f:
            data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        # Best-effort batch grading: report and skip unreadable files.
        print(f"Error reading {session_path}: {e}")
        return None

    session_id = data.get('session_id', '')
    messages = data.get('messages', [])

    if not messages:
        return None

    # Analyze message flow (errors, tool calls, pattern flags).
    analysis = self._analyze_messages(messages)

    # Calculate grades
    task_completed = self._assess_task_completion(messages, analysis)
    tool_efficiency = self._assess_tool_efficiency(analysis)
    response_quality = self._assess_response_quality(messages, analysis)
    errors_recovered = self._assess_error_recovery(messages, analysis)

    # Extract task summary from first user message
    task_summary = ""
    for msg in messages:
        if msg.get('role') == 'user':
            task_summary = msg.get('content', '')[:200]
            break

    # Calculate duration if timestamps available
    duration = None
    if messages and 'timestamp' in messages[0] and 'timestamp' in messages[-1]:
        try:
            # 'Z' suffix is not accepted by fromisoformat before 3.11;
            # normalize it to an explicit UTC offset.
            start = datetime.fromisoformat(messages[0]['timestamp'].replace('Z', '+00:00'))
            end = datetime.fromisoformat(messages[-1]['timestamp'].replace('Z', '+00:00'))
            duration = (end - start).total_seconds()
        except (ValueError, KeyError):
            pass

    return SessionGrade(
        session_id=session_id,
        session_file=str(session_path.name),
        graded_at=datetime.now().isoformat(),
        task_completed=task_completed,
        tool_calls_efficient=tool_efficiency,
        response_quality=response_quality,
        errors_recovered=errors_recovered,
        total_api_calls=analysis['total_api_calls'],
        model=data.get('model', 'unknown'),
        platform=data.get('platform'),
        session_start=data.get('session_start', ''),
        duration_seconds=duration,
        task_summary=task_summary,
        total_errors=analysis['total_errors'],
        error_types=json.dumps(list(analysis['error_types'])),
        tools_with_errors=json.dumps(list(analysis['tools_with_errors'])),
        had_repeated_errors=analysis['had_repeated_errors'],
        had_infinite_loop_risk=analysis['had_infinite_loop_risk'],
        had_user_clarification=analysis['had_user_clarification']
    )
def _analyze_messages(self, messages: List[Dict]) -> Dict[str, Any]:
|
||||||
|
"""Analyze message flow to extract metrics."""
|
||||||
|
analysis = {
|
||||||
|
'total_api_calls': 0,
|
||||||
|
'total_errors': 0,
|
||||||
|
'error_types': set(),
|
||||||
|
'tools_with_errors': set(),
|
||||||
|
'tool_call_counts': defaultdict(int),
|
||||||
|
'error_sequences': [],
|
||||||
|
'had_repeated_errors': False,
|
||||||
|
'had_infinite_loop_risk': False,
|
||||||
|
'had_user_clarification': False,
|
||||||
|
'final_assistant_msg': None,
|
||||||
|
'consecutive_errors': 0,
|
||||||
|
'max_consecutive_errors': 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
last_tool_was_error = False
|
||||||
|
|
||||||
|
for i, msg in enumerate(messages):
|
||||||
|
role = msg.get('role')
|
||||||
|
|
||||||
|
if role == 'assistant':
|
||||||
|
analysis['total_api_calls'] += 1
|
||||||
|
|
||||||
|
# Check for clarification requests
|
||||||
|
content = msg.get('content', '')
|
||||||
|
tool_calls = msg.get('tool_calls', [])
|
||||||
|
if tool_calls and tool_calls[0].get('function', {}).get('name') == 'clarify':
|
||||||
|
analysis['had_user_clarification'] = True
|
||||||
|
if 'clarify' in content.lower() and 'need clarification' in content.lower():
|
||||||
|
analysis['had_user_clarification'] = True
|
||||||
|
|
||||||
|
# Track tool calls
|
||||||
|
for tc in tool_calls:
|
||||||
|
tool_name = tc.get('function', {}).get('name', 'unknown')
|
||||||
|
analysis['tool_call_counts'][tool_name] += 1
|
||||||
|
|
||||||
|
# Track final assistant message
|
||||||
|
analysis['final_assistant_msg'] = msg
|
||||||
|
|
||||||
|
# Don't reset consecutive errors here - they continue until a tool succeeds
|
||||||
|
|
||||||
|
elif role == 'tool':
|
||||||
|
content = msg.get('content', '')
|
||||||
|
tool_name = msg.get('name', 'unknown')
|
||||||
|
|
||||||
|
# Check for errors
|
||||||
|
is_error = self._detect_error(content)
|
||||||
|
if is_error:
|
||||||
|
analysis['total_errors'] += 1
|
||||||
|
analysis['tools_with_errors'].add(tool_name)
|
||||||
|
|
||||||
|
# Classify error
|
||||||
|
error_type = self._classify_error(content)
|
||||||
|
analysis['error_types'].add(error_type)
|
||||||
|
|
||||||
|
# Track consecutive errors (consecutive tool messages with errors)
|
||||||
|
analysis['consecutive_errors'] += 1
|
||||||
|
analysis['max_consecutive_errors'] = max(
|
||||||
|
analysis['max_consecutive_errors'],
|
||||||
|
analysis['consecutive_errors']
|
||||||
|
)
|
||||||
|
|
||||||
|
last_tool_was_error = True
|
||||||
|
else:
|
||||||
|
# Reset consecutive errors on success
|
||||||
|
analysis['consecutive_errors'] = 0
|
||||||
|
last_tool_was_error = False
|
||||||
|
|
||||||
|
# Detect patterns
|
||||||
|
analysis['had_repeated_errors'] = analysis['max_consecutive_errors'] >= 3
|
||||||
|
analysis['had_infinite_loop_risk'] = (
|
||||||
|
analysis['max_consecutive_errors'] >= 5 or
|
||||||
|
analysis['total_api_calls'] > 50
|
||||||
|
)
|
||||||
|
|
||||||
|
return analysis
|
||||||
|
|
||||||
|
def _detect_error(self, content: str) -> bool:
|
||||||
|
"""Detect if tool result contains an error."""
|
||||||
|
if not content:
|
||||||
|
return False
|
||||||
|
|
||||||
|
content_lower = content.lower()
|
||||||
|
|
||||||
|
# Check for explicit error indicators
|
||||||
|
error_indicators = [
|
||||||
|
'"error":', '"error" :', 'error:', 'exception:',
|
||||||
|
'"exit_code": 1', '"exit_code": 2', '"exit_code": -1',
|
||||||
|
'traceback', 'failed', 'failure',
|
||||||
|
]
|
||||||
|
|
||||||
|
for indicator in error_indicators:
|
||||||
|
if indicator in content_lower:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _classify_error(self, content: str) -> str:
|
||||||
|
"""Classify the type of error."""
|
||||||
|
content_lower = content.lower()
|
||||||
|
|
||||||
|
for error_type, pattern in self.ERROR_PATTERNS.items():
|
||||||
|
if pattern.search(content_lower):
|
||||||
|
return error_type
|
||||||
|
|
||||||
|
return 'unknown'
|
||||||
|
|
||||||
|
def _assess_task_completion(self, messages: List[Dict], analysis: Dict) -> bool:
|
||||||
|
"""Assess whether the task was likely completed."""
|
||||||
|
if not messages:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check final assistant message
|
||||||
|
final_msg = analysis.get('final_assistant_msg')
|
||||||
|
if not final_msg:
|
||||||
|
return False
|
||||||
|
|
||||||
|
content = final_msg.get('content', '')
|
||||||
|
|
||||||
|
# Positive completion indicators
|
||||||
|
completion_phrases = [
|
||||||
|
'done', 'completed', 'success', 'finished', 'created',
|
||||||
|
'implemented', 'fixed', 'resolved', 'saved to', 'here is',
|
||||||
|
'here are', 'the result', 'output:', 'file:', 'pr:', 'pull request'
|
||||||
|
]
|
||||||
|
|
||||||
|
for phrase in completion_phrases:
|
||||||
|
if phrase in content.lower():
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check if there were many errors
|
||||||
|
if analysis['total_errors'] > analysis['total_api_calls'] * 0.3:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check for explicit failure
|
||||||
|
failure_phrases = ['failed', 'unable to', 'could not', 'error:', 'sorry, i cannot']
|
||||||
|
for phrase in failure_phrases:
|
||||||
|
if phrase in content.lower()[:200]:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _assess_tool_efficiency(self, analysis: Dict) -> int:
|
||||||
|
"""Rate tool call efficiency on 1-5 scale."""
|
||||||
|
tool_calls = analysis['total_api_calls']
|
||||||
|
errors = analysis['total_errors']
|
||||||
|
|
||||||
|
if tool_calls == 0:
|
||||||
|
return 3 # Neutral if no tool calls
|
||||||
|
|
||||||
|
error_rate = errors / tool_calls
|
||||||
|
|
||||||
|
# Score based on error rate and total calls
|
||||||
|
if error_rate == 0 and tool_calls <= 10:
|
||||||
|
return 5 # Perfect efficiency
|
||||||
|
elif error_rate <= 0.1 and tool_calls <= 15:
|
||||||
|
return 4 # Good efficiency
|
||||||
|
elif error_rate <= 0.25 and tool_calls <= 25:
|
||||||
|
return 3 # Average
|
||||||
|
elif error_rate <= 0.4:
|
||||||
|
return 2 # Poor
|
||||||
|
else:
|
||||||
|
return 1 # Very poor
|
||||||
|
|
||||||
|
def _assess_response_quality(self, messages: List[Dict], analysis: Dict) -> int:
|
||||||
|
"""Rate response quality on 1-5 scale."""
|
||||||
|
final_msg = analysis.get('final_assistant_msg')
|
||||||
|
if not final_msg:
|
||||||
|
return 1
|
||||||
|
|
||||||
|
content = final_msg.get('content', '')
|
||||||
|
content_len = len(content)
|
||||||
|
|
||||||
|
# Quality indicators
|
||||||
|
score = 3 # Start at average
|
||||||
|
|
||||||
|
# Length heuristics
|
||||||
|
if content_len > 500:
|
||||||
|
score += 1
|
||||||
|
if content_len > 1000:
|
||||||
|
score += 1
|
||||||
|
|
||||||
|
# Code blocks indicate substantive response
|
||||||
|
if '```' in content:
|
||||||
|
score += 1
|
||||||
|
|
||||||
|
# Links/references indicate thoroughness
|
||||||
|
if 'http' in content or 'see ' in content.lower():
|
||||||
|
score += 0.5
|
||||||
|
|
||||||
|
# Error penalties
|
||||||
|
if analysis['had_repeated_errors']:
|
||||||
|
score -= 1
|
||||||
|
if analysis['total_errors'] > 5:
|
||||||
|
score -= 1
|
||||||
|
|
||||||
|
# Loop risk is severe
|
||||||
|
if analysis['had_infinite_loop_risk']:
|
||||||
|
score -= 2
|
||||||
|
|
||||||
|
return max(1, min(5, int(score)))
|
||||||
|
|
||||||
|
def _assess_error_recovery(self, messages: List[Dict], analysis: Dict) -> bool:
    """Return True when the session's errors did not derail the task.

    Trivially True when no errors occurred. Otherwise, recovery is
    inferred either from apparent task completion or from the absence
    of repeated errors.
    """
    if analysis['total_errors'] == 0:
        # No errors means nothing needed recovering from.
        return True

    # Completing the task despite errors counts as recovery.
    if self._assess_task_completion(messages, analysis):
        return True

    # Isolated (non-repeated) errors are treated as recovered.
    return not analysis['had_repeated_errors']
|
||||||
|
|
||||||
|
def save_grade(self, grade: SessionGrade) -> bool:
    """Persist a SessionGrade to the grades database.

    Uses INSERT OR REPLACE, keyed on the session_grades table's unique
    session_id, so regrading a session overwrites its previous row.

    Args:
        grade: Fully-populated SessionGrade to store. List-valued
            fields (error_types, tools_with_errors) are expected to be
            pre-serialized strings -- TODO confirm against the grader.

    Returns:
        True on success; False on a sqlite3.Error (printed, not raised).
    """
    try:
        with sqlite3.connect(self.grades_db_path) as conn:
            # Booleans are stored as 0/1 integers for SQLite.
            conn.execute("""
                INSERT OR REPLACE INTO session_grades (
                    session_id, session_file, graded_at,
                    task_completed, tool_calls_efficient, response_quality,
                    errors_recovered, total_api_calls, model, platform,
                    session_start, duration_seconds, task_summary,
                    total_errors, error_types, tools_with_errors,
                    had_repeated_errors, had_infinite_loop_risk, had_user_clarification
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                grade.session_id, grade.session_file, grade.graded_at,
                int(grade.task_completed), grade.tool_calls_efficient,
                grade.response_quality, int(grade.errors_recovered),
                grade.total_api_calls, grade.model, grade.platform,
                grade.session_start, grade.duration_seconds, grade.task_summary,
                grade.total_errors, grade.error_types, grade.tools_with_errors,
                int(grade.had_repeated_errors), int(grade.had_infinite_loop_risk),
                int(grade.had_user_clarification)
            ))
            # Explicit commit; the connection context manager also
            # commits on clean exit, so this is belt-and-braces.
            conn.commit()
            return True
    except sqlite3.Error as e:
        print(f"Database error saving grade: {e}")
        return False
|
||||||
|
|
||||||
|
def grade_latest_sessions(self, n: int = 10) -> List[SessionGrade]:
    """Grade the most recent sessions that have no grade yet.

    Scans the sessions directory for session_*.json files (newest
    first by mtime), skips any session already present in the grades
    database, grades the remainder, and persists each grade.

    Args:
        n: Maximum number of recent session files to consider.

    Returns:
        The grades that were both computed AND successfully saved.
    """
    # Newest session files first; exclude the aggregate sessions.json.
    session_files = sorted(
        [f for f in self.sessions_dir.glob("session_*.json")
         if not f.name.endswith("sessions.json")],
        key=lambda x: x.stat().st_mtime,
        reverse=True
    )

    # Collect IDs that already have a grade; tolerate a missing/empty DB.
    graded_ids = set()
    try:
        with sqlite3.connect(self.grades_db_path) as conn:
            cursor = conn.execute("SELECT session_id FROM session_grades")
            graded_ids = {row[0] for row in cursor.fetchall()}
    except sqlite3.Error:
        pass

    # Grade only the ungraded files among the n most recent.
    grades = []
    for sf in session_files[:n]:
        # Session ID is the filename stem minus the 'session_' prefix.
        session_id = sf.stem.replace('session_', '')
        if session_id in graded_ids:
            continue

        grade = self.grade_session_file(sf)
        if grade:
            # Only report grades that actually made it into the DB.
            if self.save_grade(grade):
                grades.append(grade)

    return grades
|
||||||
|
|
||||||
|
def identify_patterns(self, days: int = 7) -> Dict[str, Any]:
    """Aggregate graded sessions from the last `days` days into stats.

    Returns a dict with overall averages/rates plus per-tool error
    counts, per-error-type counts, and average response quality per
    inferred task type.
    """
    since = (datetime.now() - timedelta(days=days)).isoformat()

    with sqlite3.connect(self.grades_db_path) as conn:
        # Overall stats
        cursor = conn.execute("""
            SELECT
                COUNT(*),
                AVG(tool_calls_efficient),
                AVG(response_quality),
                AVG(CASE WHEN task_completed THEN 1.0 ELSE 0.0 END),
                AVG(CASE WHEN errors_recovered THEN 1.0 ELSE 0.0 END)
            FROM session_grades
            WHERE graded_at > ?
        """, (since,))

        row = cursor.fetchone()
        # AVG() yields NULL on an empty window, hence the `or 0` guards.
        stats = {
            'total_sessions': row[0] or 0,
            'avg_tool_efficiency': round(row[1] or 0, 2),
            'avg_response_quality': round(row[2] or 0, 2),
            'completion_rate': round((row[3] or 0) * 100, 1),
            'error_recovery_rate': round((row[4] or 0) * 100, 1),
        }

        # Tool error analysis: tools_with_errors holds a JSON array, so
        # rows are grouped per distinct array string and fanned out here.
        cursor = conn.execute("""
            SELECT tools_with_errors, COUNT(*)
            FROM session_grades
            WHERE graded_at > ? AND tools_with_errors != '[]'
            GROUP BY tools_with_errors
        """, (since,))

        tool_errors = defaultdict(int)
        for row in cursor.fetchall():
            tools = json.loads(row[0])
            for tool in tools:
                tool_errors[tool] += row[1]

        # Error type analysis (same JSON-array fan-out as above).
        cursor = conn.execute("""
            SELECT error_types, COUNT(*)
            FROM session_grades
            WHERE graded_at > ? AND error_types != '[]'
            GROUP BY error_types
        """, (since,))

        error_types = defaultdict(int)
        for row in cursor.fetchall():
            types = json.loads(row[0])
            for et in types:
                error_types[et] += row[1]

        # Task type performance (infer from task_summary)
        cursor = conn.execute("""
            SELECT task_summary, response_quality
            FROM session_grades
            WHERE graded_at > ?
        """, (since,))

        task_scores = defaultdict(list)
        for row in cursor.fetchall():
            summary = row[0] or ''
            score = row[1]
            task_type = self._infer_task_type(summary)
            task_scores[task_type].append(score)

        # Collapse per-type score lists into rounded averages.
        avg_task_scores = {
            tt: round(sum(scores) / len(scores), 2)
            for tt, scores in task_scores.items()
        }

        return {
            **stats,
            'tool_error_counts': dict(tool_errors),
            'error_type_counts': dict(error_types),
            'task_type_scores': avg_task_scores,
        }
|
||||||
|
|
||||||
|
def _infer_task_type(self, summary: str) -> str:
    """Classify a task summary into a known task type.

    Returns the first TASK_PATTERNS category whose regex matches the
    summary, falling back to 'general'.
    """
    return next(
        (kind for kind, rx in self.TASK_PATTERNS.items() if rx.search(summary)),
        'general',
    )
|
||||||
|
|
||||||
|
def generate_weekly_report(self) -> WeeklyReport:
    """Build, persist, and return this week's improvement report.

    Aggregates the last 7 days of grades, picks the worst task types,
    most error-prone tools, and most common error patterns, derives
    suggestions, and upserts the serialized report into the
    weekly_reports table (keyed on week_start).
    """
    # Calculate week boundaries (Monday to Sunday)
    today = datetime.now()
    monday = today - timedelta(days=today.weekday())
    sunday = monday + timedelta(days=6)

    patterns = self.identify_patterns(days=7)

    # Three lowest-scoring task types (ascending by score).
    task_scores = patterns.get('task_type_scores', {})
    worst_tasks = sorted(task_scores.items(), key=lambda x: x[1])[:3]

    # Three tools with the most recorded errors.
    tool_errors = patterns.get('tool_error_counts', {})
    worst_tools = sorted(tool_errors.items(), key=lambda x: x[1], reverse=True)[:3]

    # Three most frequent error categories.
    error_types = patterns.get('error_type_counts', {})
    common_errors = sorted(error_types.items(), key=lambda x: x[1], reverse=True)[:3]

    # Turn the aggregates into actionable suggestion strings.
    suggestions = self._generate_suggestions(patterns, worst_tasks, worst_tools, common_errors)

    report = WeeklyReport(
        week_start=monday.strftime('%Y-%m-%d'),
        week_end=sunday.strftime('%Y-%m-%d'),
        total_sessions=patterns['total_sessions'],
        avg_tool_efficiency=patterns['avg_tool_efficiency'],
        avg_response_quality=patterns['avg_response_quality'],
        completion_rate=patterns['completion_rate'],
        error_recovery_rate=patterns['error_recovery_rate'],
        worst_task_types=worst_tasks,
        most_error_prone_tools=worst_tools,
        common_error_patterns=common_errors,
        improvement_suggestions=suggestions
    )

    # Save report: one row per week thanks to INSERT OR REPLACE on
    # the unique week_start column.
    with sqlite3.connect(self.grades_db_path) as conn:
        conn.execute("""
            INSERT OR REPLACE INTO weekly_reports
            (week_start, week_end, generated_at, report_json)
            VALUES (?, ?, ?, ?)
        """, (
            report.week_start,
            report.week_end,
            datetime.now().isoformat(),
            json.dumps(asdict(report))
        ))
        conn.commit()

    return report
|
||||||
|
|
||||||
|
def _generate_suggestions(self, patterns: Dict, worst_tasks: List,
|
||||||
|
worst_tools: List, common_errors: List) -> List[str]:
|
||||||
|
"""Generate improvement suggestions based on patterns."""
|
||||||
|
suggestions = []
|
||||||
|
|
||||||
|
if patterns['completion_rate'] < 70:
|
||||||
|
suggestions.append("Task completion rate is below 70%. Consider adding pre-task planning steps.")
|
||||||
|
|
||||||
|
if patterns['avg_tool_efficiency'] < 3:
|
||||||
|
suggestions.append("Tool efficiency is low. Review error recovery patterns and add retry logic.")
|
||||||
|
|
||||||
|
if worst_tasks:
|
||||||
|
task_names = ', '.join([t[0] for t in worst_tasks])
|
||||||
|
suggestions.append(f"Lowest scoring task types: {task_names}. Consider skill enhancement.")
|
||||||
|
|
||||||
|
if worst_tools:
|
||||||
|
tool_names = ', '.join([t[0] for t in worst_tools])
|
||||||
|
suggestions.append(f"Most error-prone tools: {tool_names}. Review usage patterns.")
|
||||||
|
|
||||||
|
if common_errors:
|
||||||
|
error_names = ', '.join([e[0] for e in common_errors])
|
||||||
|
suggestions.append(f"Common error types: {error_names}. Add targeted error handling.")
|
||||||
|
|
||||||
|
if patterns['error_recovery_rate'] < 80:
|
||||||
|
suggestions.append("Error recovery rate needs improvement. Implement better fallback strategies.")
|
||||||
|
|
||||||
|
if not suggestions:
|
||||||
|
suggestions.append("Performance is stable. Focus on expanding task coverage.")
|
||||||
|
|
||||||
|
return suggestions
|
||||||
|
|
||||||
|
def get_grades_summary(self, days: int = 30) -> str:
    """Render recent grade statistics as a multi-line text summary."""
    patterns = self.identify_patterns(days=days)

    lines = [
        f"=== Session Grades Summary (Last {days} days) ===",
        "",
        f"Total Sessions Graded: {patterns['total_sessions']}",
        f"Average Tool Efficiency: {patterns['avg_tool_efficiency']}/5",
        f"Average Response Quality: {patterns['avg_response_quality']}/5",
        f"Task Completion Rate: {patterns['completion_rate']}%",
        f"Error Recovery Rate: {patterns['error_recovery_rate']}%",
        "",
    ]

    # Per-task-type scores, best first.
    task_scores = patterns.get('task_type_scores')
    if task_scores:
        lines.append("Task Type Performance:")
        lines.extend(
            f"  - {task}: {score}/5"
            for task, score in sorted(task_scores.items(), key=lambda kv: -kv[1])
        )
        lines.append("")

    # Per-tool error counts, most errors first.
    tool_counts = patterns.get('tool_error_counts')
    if tool_counts:
        lines.append("Tool Error Counts:")
        lines.extend(
            f"  - {tool}: {count}"
            for tool, count in sorted(tool_counts.items(), key=lambda kv: -kv[1])
        )
        lines.append("")

    return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point for self-grading.

    Modes: grade one file (-f), grade the latest N ungraded sessions
    (-g), generate the weekly report (-r), or -- by default -- print a
    summary for the last --days days. -f takes precedence over -g,
    which takes precedence over -r.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Grade Hermes sessions')
    parser.add_argument('--grade-latest', '-g', type=int, metavar='N',
                        help='Grade the last N ungraded sessions')
    parser.add_argument('--summary', '-s', action='store_true',
                        help='Show summary of recent grades')
    parser.add_argument('--days', '-d', type=int, default=7,
                        help='Number of days for summary (default: 7)')
    parser.add_argument('--report', '-r', action='store_true',
                        help='Generate weekly report')
    parser.add_argument('--file', '-f', type=Path,
                        help='Grade a specific session file')

    args = parser.parse_args()

    grader = SelfGrader()

    if args.file:
        # Single-file mode: grade, persist, and print the key scores.
        grade = grader.grade_session_file(args.file)
        if grade:
            grader.save_grade(grade)
            print(f"Graded session: {grade.session_id}")
            print(f"  Task completed: {grade.task_completed}")
            print(f"  Tool efficiency: {grade.tool_calls_efficient}/5")
            print(f"  Response quality: {grade.response_quality}/5")
            print(f"  Errors recovered: {grade.errors_recovered}")
        else:
            print("Failed to grade session")

    elif args.grade_latest:
        # Batch mode: grade recent ungraded sessions and list results.
        grades = grader.grade_latest_sessions(args.grade_latest)
        print(f"Graded {len(grades)} sessions")
        for g in grades:
            print(f"  - {g.session_id}: quality={g.response_quality}/5, "
                  f"completed={g.task_completed}")

    elif args.report:
        # Weekly report mode: aggregate and print headline metrics.
        report = grader.generate_weekly_report()
        print(f"\n=== Weekly Report ({report.week_start} to {report.week_end}) ===")
        print(f"Total Sessions: {report.total_sessions}")
        print(f"Avg Tool Efficiency: {report.avg_tool_efficiency}/5")
        print(f"Avg Response Quality: {report.avg_response_quality}/5")
        print(f"Completion Rate: {report.completion_rate}%")
        print(f"Error Recovery Rate: {report.error_recovery_rate}%")
        print("\nSuggestions:")
        for s in report.improvement_suggestions:
            print(f"  - {s}")

    else:
        # Default: human-readable summary for the requested window.
        print(grader.get_grades_summary(days=args.days))
|
||||||
|
|
||||||
|
|
||||||
|
# Allow the module to be run directly as a CLI tool.
if __name__ == '__main__':
    main()
|
||||||
453
uniwizard/self_grader_design.md
Normal file
453
uniwizard/self_grader_design.md
Normal file
@@ -0,0 +1,453 @@
|
|||||||
|
# Self-Grader Design Document
|
||||||
|
|
||||||
|
**Issue:** timmy-home #89 - "Build self-improvement loop: Timmy grades and learns from his own outputs"
|
||||||
|
|
||||||
|
**Related Issues:** #88 (Adaptive Routing), #98 (Quality Scoring)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Overview
|
||||||
|
|
||||||
|
The Self-Grader module enables Timmy to automatically evaluate his own task outputs, identify patterns in failures, and generate actionable improvement insights. This creates a closed feedback loop for continuous self-improvement.
|
||||||
|
|
||||||
|
### Goals
|
||||||
|
- Automatically grade completed sessions on multiple quality dimensions
|
||||||
|
- Identify recurring error patterns and their root causes
|
||||||
|
- Track performance trends over time
|
||||||
|
- Generate actionable weekly improvement reports
|
||||||
|
- Feed insights into adaptive routing decisions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Self-Grader Module │
|
||||||
|
├─────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||||
|
│ │ Parser │───▶│ Analyzer │───▶│ Grader │ │
|
||||||
|
│ │ │ │ │ │ │ │
|
||||||
|
│ │ Reads session│ │ Extracts │ │ Scores on 5 │ │
|
||||||
|
│ │ JSON files │ │ metrics │ │ dimensions │ │
|
||||||
|
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||||
|
│ │ │ │ │
|
||||||
|
│ ▼ ▼ ▼ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ SQLite Database Layer │ │
|
||||||
|
│ │ • session_grades table (individual scores) │ │
|
||||||
|
│ │ • weekly_reports table (aggregated insights) │ │
|
||||||
|
│ └──────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Pattern Identification │ │
|
||||||
|
│ │ • Task type performance analysis │ │
|
||||||
|
│ │ • Tool error frequency tracking │ │
|
||||||
|
│ │ • Error classification and clustering │ │
|
||||||
|
│ └──────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Report Generator │ │
|
||||||
|
│ │ • Weekly summary with trends │ │
|
||||||
|
│ │ • Improvement suggestions │ │
|
||||||
|
│ │ • Performance alerts │ │
|
||||||
|
│ └──────────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Downstream Consumers │
|
||||||
|
│ • Adaptive Routing (#88) - route based on task type │
|
||||||
|
│ • Quality Scoring (#98) - external quality validation │
|
||||||
|
│ • Skill Recommendations - identify skill gaps │
|
||||||
|
│ • Alert System - notify on quality degradation │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Grading Dimensions
|
||||||
|
|
||||||
|
### 3.1 Core Metrics (1-5 scale where applicable)
|
||||||
|
|
||||||
|
| Metric | Type | Description |
|
||||||
|
|--------|------|-------------|
|
||||||
|
| `task_completed` | boolean | Whether the task appears to have been completed successfully |
|
||||||
|
| `tool_calls_efficient` | int (1-5) | Efficiency of tool usage (error rate, call count) |
|
||||||
|
| `response_quality` | int (1-5) | Overall quality of final response |
|
||||||
|
| `errors_recovered` | boolean | Whether errors were successfully recovered from |
|
||||||
|
| `total_api_calls` | int | Total number of API/assistant calls made |
|
||||||
|
|
||||||
|
### 3.2 Derived Metrics
|
||||||
|
|
||||||
|
| Metric | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| `total_errors` | Count of tool errors detected |
|
||||||
|
| `error_types` | Categorized error types (JSON list) |
|
||||||
|
| `tools_with_errors` | Tools that generated errors |
|
||||||
|
| `had_repeated_errors` | Flag for 3+ consecutive errors |
|
||||||
|
| `had_infinite_loop_risk` | Flag for 5+ consecutive errors or >50 calls |
|
||||||
|
| `had_user_clarification` | Whether clarification was requested |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Error Classification
|
||||||
|
|
||||||
|
The system classifies errors into categories for pattern analysis:
|
||||||
|
|
||||||
|
| Category | Pattern | Example |
|
||||||
|
|----------|---------|---------|
|
||||||
|
| `file_not_found` | File/path errors | "No such file or directory" |
|
||||||
|
| `permission_denied` | Access errors | "Permission denied" |
|
||||||
|
| `timeout` | Time limit exceeded | "Request timed out" |
|
||||||
|
| `api_error` | External API failures | "Rate limit exceeded" |
|
||||||
|
| `syntax_error` | Code/parsing errors | "Invalid syntax" |
|
||||||
|
| `command_failed` | Command execution | "exit_code": 1 |
|
||||||
|
| `network_error` | Connectivity issues | "Connection refused" |
|
||||||
|
| `tool_not_found` | Tool resolution | "Unknown tool" |
|
||||||
|
| `unknown` | Unclassified | Any other error |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Task Type Inference
|
||||||
|
|
||||||
|
Sessions are categorized by task type for comparative analysis:
|
||||||
|
|
||||||
|
| Task Type | Pattern |
|
||||||
|
|-----------|---------|
|
||||||
|
| `code_review` | "review", "code review", "PR" |
|
||||||
|
| `debugging` | "debug", "fix", "troubleshoot" |
|
||||||
|
| `feature_impl` | "implement", "add feature", "build" |
|
||||||
|
| `refactoring` | "refactor", "clean up", "reorganize" |
|
||||||
|
| `documentation` | "document", "readme", "docstring" |
|
||||||
|
| `testing` | "test", "pytest", "unit test" |
|
||||||
|
| `research` | "research", "investigate", "look up" |
|
||||||
|
| `deployment` | "deploy", "release", "publish" |
|
||||||
|
| `data_analysis` | "analyze data", "process file", "parse" |
|
||||||
|
| `infrastructure` | "server", "docker", "kubernetes" |
|
||||||
|
| `general` | Default catch-all |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Database Schema
|
||||||
|
|
||||||
|
### 6.1 session_grades Table
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE session_grades (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
session_id TEXT UNIQUE NOT NULL,
|
||||||
|
session_file TEXT NOT NULL,
|
||||||
|
graded_at TEXT NOT NULL,
|
||||||
|
|
||||||
|
-- Core metrics
|
||||||
|
task_completed INTEGER NOT NULL,
|
||||||
|
tool_calls_efficient INTEGER NOT NULL,
|
||||||
|
response_quality INTEGER NOT NULL,
|
||||||
|
errors_recovered INTEGER NOT NULL,
|
||||||
|
total_api_calls INTEGER NOT NULL,
|
||||||
|
|
||||||
|
-- Metadata
|
||||||
|
model TEXT,
|
||||||
|
platform TEXT,
|
||||||
|
session_start TEXT,
|
||||||
|
duration_seconds REAL,
|
||||||
|
task_summary TEXT,
|
||||||
|
|
||||||
|
-- Error analysis
|
||||||
|
total_errors INTEGER NOT NULL,
|
||||||
|
error_types TEXT, -- JSON array
|
||||||
|
tools_with_errors TEXT, -- JSON array
|
||||||
|
|
||||||
|
-- Pattern flags
|
||||||
|
had_repeated_errors INTEGER NOT NULL,
|
||||||
|
had_infinite_loop_risk INTEGER NOT NULL,
|
||||||
|
had_user_clarification INTEGER NOT NULL
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 weekly_reports Table
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE weekly_reports (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
week_start TEXT UNIQUE NOT NULL,
|
||||||
|
week_end TEXT NOT NULL,
|
||||||
|
generated_at TEXT NOT NULL,
|
||||||
|
report_json TEXT NOT NULL -- Serialized WeeklyReport
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Scoring Algorithms
|
||||||
|
|
||||||
|
### 7.1 Task Completion Detection
|
||||||
|
|
||||||
|
Positive indicators:
|
||||||
|
- Final message contains completion phrases: "done", "completed", "success", "finished"
|
||||||
|
- References to created outputs: "saved to", "here is", "output:"
|
||||||
|
- Low error rate relative to total calls
|
||||||
|
|
||||||
|
Negative indicators:
|
||||||
|
- Explicit failure phrases: "failed", "unable to", "could not"
|
||||||
|
- Error rate > 30% of total calls
|
||||||
|
- Empty or very short final response
|
||||||
|
|
||||||
|
### 7.2 Tool Efficiency Scoring
|
||||||
|
|
||||||
|
```python
|
||||||
|
tool_calls = total_api_calls
error_rate = total_errors / tool_calls
|
||||||
|
|
||||||
|
if error_rate == 0 and tool_calls <= 10:
|
||||||
|
score = 5 # Perfect
|
||||||
|
elif error_rate <= 0.1 and tool_calls <= 15:
|
||||||
|
score = 4 # Good
|
||||||
|
elif error_rate <= 0.25 and tool_calls <= 25:
|
||||||
|
score = 3 # Average
|
||||||
|
elif error_rate <= 0.4:
|
||||||
|
score = 2 # Poor
|
||||||
|
else:
|
||||||
|
score = 1 # Very poor
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.3 Response Quality Scoring
|
||||||
|
|
||||||
|
Base score: 3 (average)
|
||||||
|
|
||||||
|
Additions:
|
||||||
|
- Content length > 500 chars: +1
|
||||||
|
- Content length > 1000 chars: +1
|
||||||
|
- Contains code blocks: +1
|
||||||
|
- Contains links/references: +0.5
|
||||||
|
|
||||||
|
Penalties:
|
||||||
|
- Repeated errors: -1
|
||||||
|
- Total errors > 5: -1
|
||||||
|
- Infinite loop risk: -2
|
||||||
|
|
||||||
|
Range clamped to 1-5.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Pattern Identification
|
||||||
|
|
||||||
|
### 8.1 Per-Task-Type Analysis
|
||||||
|
|
||||||
|
Tracks average scores per task type to identify weak areas:
|
||||||
|
|
||||||
|
```python
|
||||||
|
task_scores = {
|
||||||
|
'code_review': 4.2,
|
||||||
|
'debugging': 2.8, # <-- Needs attention
|
||||||
|
'feature_impl': 3.5,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.2 Tool Error Frequency
|
||||||
|
|
||||||
|
Identifies which tools are most error-prone:
|
||||||
|
|
||||||
|
```python
|
||||||
|
tool_errors = {
|
||||||
|
'browser_navigate': 15, # <-- High error rate
|
||||||
|
'terminal': 5,
|
||||||
|
'file_read': 2,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.3 Error Pattern Clustering
|
||||||
|
|
||||||
|
Groups errors by type to identify systemic issues:
|
||||||
|
|
||||||
|
```python
|
||||||
|
error_types = {
|
||||||
|
'file_not_found': 12, # <-- Need better path handling
|
||||||
|
'timeout': 8,
|
||||||
|
'api_error': 3,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Weekly Report Generation
|
||||||
|
|
||||||
|
### 9.1 Report Contents
|
||||||
|
|
||||||
|
1. **Summary Statistics**
|
||||||
|
- Total sessions graded
|
||||||
|
- Average tool efficiency
|
||||||
|
- Average response quality
|
||||||
|
- Task completion rate
|
||||||
|
- Error recovery rate
|
||||||
|
|
||||||
|
2. **Problem Areas**
|
||||||
|
- Lowest scoring task types
|
||||||
|
- Most error-prone tools
|
||||||
|
- Common error patterns
|
||||||
|
|
||||||
|
3. **Improvement Suggestions**
|
||||||
|
- Actionable recommendations based on patterns
|
||||||
|
- Skill gap identification
|
||||||
|
- Process improvement tips
|
||||||
|
|
||||||
|
### 9.2 Suggestion Generation Rules
|
||||||
|
|
||||||
|
| Condition | Suggestion |
|
||||||
|
|-----------|------------|
|
||||||
|
| completion_rate < 70% | "Add pre-task planning steps" |
|
||||||
|
| avg_tool_efficiency < 3 | "Review error recovery patterns" |
|
||||||
|
| error_recovery_rate < 80% | "Implement better fallback strategies" |
|
||||||
|
| Specific task type low | "Consider skill enhancement for {task}" |
|
||||||
|
| Specific tool high errors | "Review usage patterns for {tool}" |
|
||||||
|
| Specific error common | "Add targeted error handling for {error}" |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Integration Points
|
||||||
|
|
||||||
|
### 10.1 With Adaptive Routing (#88)
|
||||||
|
|
||||||
|
The grader feeds task-type performance data to the router:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Router uses grader insights
|
||||||
|
if task_type == 'debugging' and grader.get_task_score('debugging') < 3:
|
||||||
|
# Route to more capable model for debugging tasks
|
||||||
|
model = 'claude-opus-4'
|
||||||
|
```
|
||||||
|
|
||||||
|
### 10.2 With Quality Scoring (#98)
|
||||||
|
|
||||||
|
Grader scores feed into external quality validation:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Quality scorer validates grader accuracy
|
||||||
|
external_score = quality_scorer.validate(session, grader_score)
|
||||||
|
discrepancy = abs(external_score - grader_score)
|
||||||
|
if discrepancy > threshold:
|
||||||
|
grader.calibrate() # Adjust scoring algorithms
|
||||||
|
```
|
||||||
|
|
||||||
|
### 10.3 With Skill System
|
||||||
|
|
||||||
|
Identifies skills that could improve low-scoring areas:
|
||||||
|
|
||||||
|
```python
|
||||||
|
if grader.get_task_score('debugging') < 3:
|
||||||
|
recommend_skill('systematic-debugging')
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Usage
|
||||||
|
|
||||||
|
### 11.1 Command Line
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Grade latest 10 ungraded sessions
|
||||||
|
python self_grader.py -g 10
|
||||||
|
|
||||||
|
# Show summary of last 7 days
|
||||||
|
python self_grader.py -s
|
||||||
|
|
||||||
|
# Show summary of last 30 days
|
||||||
|
python self_grader.py -s -d 30
|
||||||
|
|
||||||
|
# Generate weekly report
|
||||||
|
python self_grader.py -r
|
||||||
|
|
||||||
|
# Grade specific session file
|
||||||
|
python self_grader.py -f /path/to/session.json
|
||||||
|
```
|
||||||
|
|
||||||
|
### 11.2 Python API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from self_grader import SelfGrader
|
||||||
|
|
||||||
|
grader = SelfGrader()
|
||||||
|
|
||||||
|
# Grade latest sessions
|
||||||
|
grades = grader.grade_latest_sessions(n=10)
|
||||||
|
|
||||||
|
# Get pattern insights
|
||||||
|
patterns = grader.identify_patterns(days=7)
|
||||||
|
|
||||||
|
# Generate report
|
||||||
|
report = grader.generate_weekly_report()
|
||||||
|
|
||||||
|
# Get human-readable summary
|
||||||
|
print(grader.get_grades_summary(days=7))
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Testing
|
||||||
|
|
||||||
|
Comprehensive test suite covers:
|
||||||
|
|
||||||
|
1. **Unit Tests**
|
||||||
|
- Error detection and classification
|
||||||
|
- Scoring algorithms
|
||||||
|
- Task type inference
|
||||||
|
|
||||||
|
2. **Integration Tests**
|
||||||
|
- Full session grading pipeline
|
||||||
|
- Database operations
|
||||||
|
- Report generation
|
||||||
|
|
||||||
|
3. **Edge Cases**
|
||||||
|
- Empty sessions
|
||||||
|
- Sessions with infinite loops
|
||||||
|
- Malformed session files
|
||||||
|
|
||||||
|
Run tests:
|
||||||
|
```bash
|
||||||
|
python -m pytest test_self_grader.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 13. Future Enhancements
|
||||||
|
|
||||||
|
1. **Machine Learning Integration**
|
||||||
|
- Train models to predict session success
|
||||||
|
- Learn optimal tool sequences
|
||||||
|
- Predict error likelihood
|
||||||
|
|
||||||
|
2. **Human-in-the-Loop Validation**
|
||||||
|
- Allow user override of grades
|
||||||
|
- Collect explicit feedback
|
||||||
|
- Calibrate scoring with human judgments
|
||||||
|
|
||||||
|
3. **Real-time Monitoring**
|
||||||
|
- Grade sessions as they complete
|
||||||
|
- Alert on quality degradation
|
||||||
|
- Live dashboard of metrics
|
||||||
|
|
||||||
|
4. **Cross-Session Learning**
|
||||||
|
- Identify recurring issues across similar tasks
|
||||||
|
- Suggest skill improvements
|
||||||
|
- Recommend tool alternatives
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 14. Files
|
||||||
|
|
||||||
|
| File | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `self_grader.py` | Main module with SelfGrader class |
|
||||||
|
| `test_self_grader.py` | Comprehensive test suite |
|
||||||
|
| `self_grader_design.md` | This design document |
|
||||||
|
| `~/.timmy/uniwizard/session_grades.db` | SQLite database (created at runtime) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Document Version: 1.0*
|
||||||
|
*Created: 2026-03-30*
|
||||||
|
*Author: Timmy (UniWizard)*
|
||||||
655
uniwizard/task_classifier.py
Normal file
655
uniwizard/task_classifier.py
Normal file
@@ -0,0 +1,655 @@
|
|||||||
|
"""
|
||||||
|
Enhanced Task Classifier for Uniwizard
|
||||||
|
|
||||||
|
Classifies incoming prompts into task types and maps them to ranked backend preferences.
|
||||||
|
Integrates with the 7-backend fallback chain defined in config.yaml.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
class TaskType(Enum):
    """Classification categories for incoming prompts.

    NOTE: declaration order matters downstream — ``_classify_task_type``
    builds its score dict by iterating this enum and a tie on the highest
    score goes to the earlier member.
    """
    CODE = "code"            # programming, debugging, infra work
    REASONING = "reasoning"  # analysis, evaluation, multi-step thinking
    RESEARCH = "research"    # literature / web / data investigation
    CREATIVE = "creative"    # writing, art, brainstorming
    FAST_OPS = "fast_ops"    # short, simple queries and quick commands
    TOOL_USE = "tool_use"    # explicit tool / shell / agent invocation
    UNKNOWN = "unknown"      # fallback when no signal scores above zero
|
||||||
|
|
||||||
|
|
||||||
|
class ComplexityLevel(Enum):
    """Complexity tiers for prompt analysis.

    Assigned by ``TaskClassifier._assess_complexity`` from length, URL and
    code-block metrics compared against COMPLEXITY_THRESHOLDS.
    """
    LOW = "low"        # short, simple prompt
    MEDIUM = "medium"  # two metrics above "low", or one above "medium"
    HIGH = "high"      # two metrics above "medium", or >2 code blocks
|
||||||
|
|
||||||
|
|
||||||
|
# Backend identifiers (match fallback_providers chain order)
BACKEND_ANTHROPIC = "anthropic"
BACKEND_OPENAI_CODEX = "openai-codex"
BACKEND_GEMINI = "gemini"
BACKEND_GROQ = "groq"
BACKEND_GROK = "grok"
BACKEND_KIMI = "kimi-coding"
BACKEND_OPENROUTER = "openrouter"

# Full 7-backend roster in fallback-chain order. Every TASK_BACKEND_MAP entry
# re-ranks (never filters) this same set.
# NOTE(review): ALL_BACKENDS itself is not referenced elsewhere in this
# module's visible code — presumably consumed by the router; confirm.
ALL_BACKENDS = [
    BACKEND_ANTHROPIC,
    BACKEND_OPENAI_CODEX,
    BACKEND_GEMINI,
    BACKEND_GROQ,
    BACKEND_GROK,
    BACKEND_KIMI,
    BACKEND_OPENROUTER,
]
|
||||||
|
|
||||||
|
# Task-specific keyword mappings
#
# Matching semantics (see TaskClassifier._classify_task_type): the prompt is
# lowercased and whitespace-tokenized, then each set is intersected with the
# resulting token set.
# NOTE(review): multi-word phrases (e.g. "pull request", "state of the art",
# "how do i") and prefix stems ("librar", "histor") can never match a
# single-token set intersection — confirm whether phrase/substring matching
# was intended.

# Signals a programming / debugging / infrastructure task.
CODE_KEYWORDS: Set[str] = {
    "code", "coding", "program", "programming", "function", "class",
    "implement", "implementation", "refactor", "debug", "debugging",
    "error", "exception", "traceback", "stacktrace", "test", "tests",
    "pytest", "unittest", "import", "module", "package", "library",
    "api", "endpoint", "route", "middleware", "database", "query",
    "sql", "orm", "migration", "deploy", "docker", "kubernetes",
    "k8s", "ci/cd", "pipeline", "build", "compile", "syntax",
    "lint", "format", "black", "flake8", "mypy", "type", "typing",
    "async", "await", "callback", "promise", "thread", "process",
    "concurrency", "parallel", "optimization", "optimize", "performance",
    "memory", "leak", "bug", "fix", "patch", "commit", "git",
    "repository", "repo", "clone", "fork", "merge", "conflict",
    "branch", "pull request", "pr", "review", "crud", "rest",
    "graphql", "json", "xml", "yaml", "toml", "csv", "parse",
    "regex", "regular expression", "string", "bytes", "encoding",
    "decoding", "serialize", "deserialize", "marshal", "unmarshal",
    "encrypt", "decrypt", "hash", "checksum", "signature", "jwt",
    "oauth", "authentication", "authorization", "auth", "login",
    "logout", "session", "cookie", "token", "permission", "role",
    "rbac", "acl", "security", "vulnerability", "cve", "exploit",
    "sandbox", "isolate", "container", "vm", "virtual machine",
}

# Signals analysis / evaluation / multi-step reasoning.
REASONING_KEYWORDS: Set[str] = {
    "analyze", "analysis", "investigate", "investigation",
    "compare", "comparison", "contrast", "evaluate", "evaluation",
    "assess", "assessment", "reason", "reasoning", "logic",
    "logical", "deduce", "deduction", "infer", "inference",
    "synthesize", "synthesis", "critique", "criticism", "review",
    "argument", "premise", "conclusion", "evidence", "proof",
    "theorem", "axiom", "corollary", "lemma", "proposition",
    "hypothesis", "theory", "model", "framework", "paradigm",
    "philosophy", "ethical", "ethics", "moral", "morality",
    "implication", "consequence", "trade-off", "tradeoff",
    "pros and cons", "advantage", "disadvantage", "benefit",
    "drawback", "risk", "mitigation", "strategy", "strategic",
    "plan", "planning", "design", "architecture", "system",
    "complex", "complicated", "nuanced", "subtle", "sophisticated",
    "rigorous", "thorough", "comprehensive", "exhaustive",
    "step by step", "chain of thought", "think through",
    "work through", "figure out", "understand", "comprehend",
}

# Signals literature / web / data research.
RESEARCH_KEYWORDS: Set[str] = {
    "research", "find", "search", "look up", "lookup",
    "investigate", "study", "explore", "discover",
    "paper", "publication", "journal", "article", "study",
    "arxiv", "scholar", "academic", "scientific", "literature",
    "review", "survey", "meta-analysis", "bibliography",
    "citation", "reference", "source", "primary source",
    "secondary source", "peer review", "empirical", "experiment",
    "experimental", "observational", "longitudinal", "cross-sectional",
    "qualitative", "quantitative", "mixed methods", "case study",
    "dataset", "data", "statistics", "statistical", "correlation",
    "causation", "regression", "machine learning", "ml", "ai",
    "neural network", "deep learning", "transformer", "llm",
    "benchmark", "evaluation", "metric", "sota", "state of the art",
    "survey", "poll", "interview", "focus group", "ethnography",
    "field work", "archive", "archival", "repository", "collection",
    "index", "catalog", "database", "librar", "museum", "histor",
    "genealogy", "ancestry", "patent", "trademark", "copyright",
    "legislation", "regulation", "policy", "compliance",
}

# Signals creative writing / art / brainstorming.
CREATIVE_KEYWORDS: Set[str] = {
    "create", "creative", "creativity", "design", "designer",
    "art", "artistic", "artist", "paint", "painting", "draw",
    "drawing", "sketch", "illustration", "illustrator", "graphic",
    "visual", "image", "photo", "photography", "photographer",
    "video", "film", "movie", "animation", "animate", "motion",
    "music", "musical", "song", "lyric", "compose", "composition",
    "melody", "harmony", "rhythm", "beat", "sound", "audio",
    "write", "writing", "writer", "author", "story", "storytelling",
    "narrative", "plot", "character", "dialogue", "scene",
    "novel", "fiction", "short story", "poem", "poetry", "poet",
    "verse", "prose", "essay", "blog", "article", "content",
    "copy", "copywriting", "marketing", "brand", "branding",
    "slogan", "tagline", "headline", "title", "name", "naming",
    "brainstorm", "ideate", "concept", "conceptualize", "imagine",
    "imagination", "inspire", "inspiration", "muse", "vision",
    "aesthetic", "style", "theme", "mood", "tone", "voice",
    "unique", "original", "fresh", "novel", "innovative",
    "unconventional", "experimental", "avant-garde", "edgy",
    "humor", "funny", "comedy", "satire", "parody", "wit",
    "romance", "romantic", "drama", "dramatic", "thriller",
    "mystery", "horror", "sci-fi", "science fiction", "fantasy",
    "adventure", "action", "documentary", "biopic", "memoir",
}

# Signals short, simple queries and quick commands. Only scored when no
# other keyword set matched (see _classify_task_type gating).
FAST_OPS_KEYWORDS: Set[str] = {
    "quick", "fast", "brief", "short", "simple", "easy",
    "status", "check", "list", "ls", "show", "display",
    "get", "fetch", "retrieve", "read", "cat", "view",
    "summary", "summarize", "tl;dr", "tldr", "overview",
    "count", "number", "how many", "total", "sum", "average",
    "min", "max", "sort", "filter", "grep", "search",
    "find", "locate", "which", "where", "what is", "what's",
    "who", "when", "yes/no", "confirm", "verify", "validate",
    "ping", "health", "alive", "up", "running", "online",
    "date", "time", "timezone", "clock", "timer", "alarm",
    "remind", "reminder", "note", "jot", "save", "store",
    "delete", "remove", "rm", "clean", "clear", "purge",
    "start", "stop", "restart", "enable", "disable", "toggle",
    "on", "off", "open", "close", "switch", "change", "set",
    "update", "upgrade", "install", "uninstall", "download",
    "upload", "sync", "backup", "restore", "export", "import",
    "convert", "transform", "format", "parse", "extract",
    "compress", "decompress", "zip", "unzip", "tar", "archive",
    "copy", "cp", "move", "mv", "rename", "link", "symlink",
    "permission", "chmod", "chown", "access", "ownership",
    "hello", "hi", "hey", "greeting", "thanks", "thank you",
    "bye", "goodbye", "help", "?", "how to", "how do i",
}

# Signals explicit tool / shell / agent / infrastructure invocation.
TOOL_USE_KEYWORDS: Set[str] = {
    "tool", "tools", "use tool", "call tool", "invoke",
    "run command", "execute", "terminal", "shell", "bash",
    "zsh", "powershell", "cmd", "command line", "cli",
    "file", "files", "directory", "folder", "path", "fs",
    "read file", "write file", "edit file", "patch file",
    "search files", "find files", "grep", "rg", "ack",
    "browser", "web", "navigate", "click", "scroll",
    "screenshot", "vision", "image", "analyze image",
    "delegate", "subagent", "agent", "spawn", "task",
    "mcp", "server", "mcporter", "protocol",
    "process", "background", "kill", "signal", "pid",
    "git", "commit", "push", "pull", "clone", "branch",
    "docker", "container", "compose", "dockerfile",
    "kubernetes", "kubectl", "k8s", "pod", "deployment",
    "aws", "gcp", "azure", "cloud", "s3", "bucket",
    "database", "db", "sql", "query", "migrate", "seed",
    "api", "endpoint", "request", "response", "curl",
    "http", "https", "rest", "graphql", "websocket",
    "json", "xml", "yaml", "csv", "parse", "serialize",
    "scrap", "crawl", "extract", "parse html", "xpath",
    "schedule", "cron", "job", "task queue", "worker",
    "notification", "alert", "webhook", "event", "trigger",
}
|
||||||
|
|
||||||
|
# URL pattern for detecting web/research tasks.
# Matches http/https URLs with optional port, path, query and fragment.
_URL_PATTERN = re.compile(
    r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?',
    re.IGNORECASE
)

# Code block detection (count ``` blocks, not individual lines).
# NOTE(review): this pattern is not used by _count_code_blocks (which counts
# raw ``` fences) and is only stored on TaskClassifier instances without a
# visible call site — confirm before removing.
_CODE_BLOCK_PATTERN = re.compile(r'```[\w]*\n', re.MULTILINE)
|
||||||
|
|
||||||
|
|
||||||
|
def _count_code_blocks(text: str) -> int:
|
||||||
|
"""Count complete code blocks (opening ``` to closing ```)."""
|
||||||
|
# Count pairs of ``` - each pair is one code block
|
||||||
|
fence_count = text.count('```')
|
||||||
|
return fence_count // 2
|
||||||
|
# Matches `inline code` spans delimited by single backticks.
_INLINE_CODE_PATTERN = re.compile(r'`[^`]+`')

# Complexity thresholds.
# Per-metric cutoffs used by TaskClassifier._assess_complexity: a value at or
# below "low" counts as low, above "low" up to "medium" counts as medium,
# and above "medium" counts toward a high-complexity vote.
COMPLEXITY_THRESHOLDS = {
    "chars": {"low": 200, "medium": 800},
    "words": {"low": 35, "medium": 150},
    "lines": {"low": 3, "medium": 15},
    "urls": {"low": 0, "medium": 2},
    "code_blocks": {"low": 0, "medium": 1},
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ClassificationResult:
    """Result of task classification, as produced by TaskClassifier.classify."""
    task_type: TaskType              # winning TaskType category
    preferred_backends: List[str]    # backend identifiers, most preferred first
    complexity: ComplexityLevel      # assessed LOW / MEDIUM / HIGH tier
    reason: str                      # human-readable routing explanation
    confidence: float                # 0.0-1.0 (raw score capped at 4.0)
    features: Dict[str, Any]         # raw features from _extract_features
|
||||||
|
|
||||||
|
|
||||||
|
class TaskClassifier:
    """
    Enhanced task classifier for routing prompts to appropriate backends.

    Maps task types to ranked backend preferences based on:
    - Backend strengths (coding, reasoning, speed, context length, etc.)
    - Message complexity (length, structure, keywords)
    - Detected features (URLs, code blocks, specific terminology)

    Fix over the previous version: multi-word keyword phrases in the keyword
    sets (e.g. "pull request", "state of the art") are now matched via
    substring search; a plain token-set intersection could never match them.
    Reason strings now list matched keywords in sorted (deterministic) order.
    """

    # Backend preference rankings by task type.
    # Order matters: first is most preferred. Every entry contains all seven
    # backends so the fallback chain is never truncated, only re-ranked.
    TASK_BACKEND_MAP: Dict[TaskType, List[str]] = {
        TaskType.CODE: [
            BACKEND_OPENAI_CODEX,   # Best for code generation
            BACKEND_ANTHROPIC,      # Excellent for code review, complex analysis
            BACKEND_KIMI,           # Long context for large codebases
            BACKEND_GEMINI,         # Good multimodal code understanding
            BACKEND_GROQ,           # Fast for simple code tasks
            BACKEND_OPENROUTER,     # Overflow option
            BACKEND_GROK,           # General knowledge backup
        ],
        TaskType.REASONING: [
            BACKEND_ANTHROPIC,      # Deep reasoning champion
            BACKEND_GEMINI,         # Strong analytical capabilities
            BACKEND_KIMI,           # Long context for complex reasoning chains
            BACKEND_GROK,           # Broad knowledge for reasoning
            BACKEND_OPENAI_CODEX,   # Structured reasoning
            BACKEND_OPENROUTER,     # Overflow
            BACKEND_GROQ,           # Fast fallback
        ],
        TaskType.RESEARCH: [
            BACKEND_GEMINI,         # Research and multimodal leader
            BACKEND_KIMI,           # 262K context for long documents
            BACKEND_ANTHROPIC,      # Deep analysis
            BACKEND_GROK,           # Broad knowledge
            BACKEND_OPENROUTER,     # Broadest model access
            BACKEND_OPENAI_CODEX,   # Structured research
            BACKEND_GROQ,           # Fast triage
        ],
        TaskType.CREATIVE: [
            BACKEND_GROK,           # Creative writing and drafting
            BACKEND_ANTHROPIC,      # Nuanced creative work
            BACKEND_GEMINI,         # Multimodal creativity
            BACKEND_OPENAI_CODEX,   # Creative coding
            BACKEND_KIMI,           # Long-form creative
            BACKEND_OPENROUTER,     # Variety of creative models
            BACKEND_GROQ,           # Fast creative ops
        ],
        TaskType.FAST_OPS: [
            BACKEND_GROQ,           # 284ms response time champion
            BACKEND_OPENROUTER,     # Fast mini models
            BACKEND_GEMINI,         # Flash models
            BACKEND_GROK,           # Fast for simple queries
            BACKEND_ANTHROPIC,      # If precision needed
            BACKEND_OPENAI_CODEX,   # Structured ops
            BACKEND_KIMI,           # Overflow
        ],
        TaskType.TOOL_USE: [
            BACKEND_ANTHROPIC,      # Excellent tool use capabilities
            BACKEND_OPENAI_CODEX,   # Good tool integration
            BACKEND_GEMINI,         # Multimodal tool use
            BACKEND_GROQ,           # Fast tool chaining
            BACKEND_KIMI,           # Long context tool sessions
            BACKEND_OPENROUTER,     # Overflow
            BACKEND_GROK,           # General tool use
        ],
        TaskType.UNKNOWN: [
            BACKEND_ANTHROPIC,      # Default to strongest general model
            BACKEND_GEMINI,         # Good all-rounder
            BACKEND_OPENAI_CODEX,   # Structured approach
            BACKEND_KIMI,           # Long context safety
            BACKEND_GROK,           # Broad knowledge
            BACKEND_GROQ,           # Fast fallback
            BACKEND_OPENROUTER,     # Ultimate overflow
        ],
    }

    def __init__(self):
        """Initialize the classifier with the module-level compiled patterns."""
        self.url_pattern = _URL_PATTERN
        self.code_block_pattern = _CODE_BLOCK_PATTERN
        self.inline_code_pattern = _INLINE_CODE_PATTERN

    @staticmethod
    def _match_keywords(words: Set[str], lowered: str, keywords: Set[str]) -> Set[str]:
        """Return the entries of *keywords* that appear in the prompt.

        Single-token keywords are matched by set intersection against the
        punctuation-stripped token set. Multi-word phrases — which a token
        intersection can never match — are found by substring search in the
        lowercased text.

        Args:
            words: Lowercased, punctuation-stripped tokens of the prompt.
            lowered: The full lowercased prompt text.
            keywords: Keyword/phrase set to test against.

        Returns:
            The subset of *keywords* present in the prompt.
        """
        matched = words & keywords
        matched.update(kw for kw in keywords if " " in kw and kw in lowered)
        return matched

    def classify(
        self,
        prompt: str,
        context: Optional[Dict[str, Any]] = None
    ) -> ClassificationResult:
        """
        Classify a prompt and return routing recommendation.

        Args:
            prompt: The user message to classify
            context: Optional context (previous messages, session state, etc.).
                Currently unused; accepted for forward compatibility.

        Returns:
            ClassificationResult with task type, preferred backends, complexity, and reasoning
        """
        text = (prompt or "").strip()
        if not text:
            return self._default_result("Empty prompt")

        # Pipeline: features -> complexity -> task type -> backend ranking.
        features = self._extract_features(text)
        complexity = self._assess_complexity(features)
        task_type, task_confidence, task_reason = self._classify_task_type(text, features)
        preferred_backends = self._get_backends_for_task(task_type, complexity, features)
        reason = self._build_reason(task_type, complexity, task_reason, features)

        return ClassificationResult(
            task_type=task_type,
            preferred_backends=preferred_backends,
            complexity=complexity,
            reason=reason,
            confidence=task_confidence,
            features=features,
        )

    def _extract_features(self, text: str) -> Dict[str, Any]:
        """Extract length, code, URL and token features from the prompt text."""
        lowered = text.lower()
        # Strip common surrounding punctuation so "debug," matches "debug".
        words = set(token.strip(".,:;!?()[]{}\"'`") for token in lowered.split())

        # Count complete ``` fence pairs and `inline code` spans.
        code_blocks = _count_code_blocks(text)
        inline_code = len(self.inline_code_pattern.findall(text))

        urls = self.url_pattern.findall(text)

        lines = text.count('\n') + 1

        return {
            "char_count": len(text),
            "word_count": len(text.split()),
            "line_count": lines,
            "url_count": len(urls),
            "urls": urls,
            "code_block_count": code_blocks,
            "inline_code_count": inline_code,
            "has_code": code_blocks > 0 or inline_code > 0,
            "unique_words": words,
            "lowercased_text": lowered,
        }

    def _assess_complexity(self, features: Dict[str, Any]) -> ComplexityLevel:
        """Assess the complexity level of the prompt.

        Each metric casts a "medium" vote when it exceeds its low threshold
        and a "high" vote when it exceeds its medium threshold; the votes are
        then aggregated (see COMPLEXITY_THRESHOLDS).
        """
        scores = {
            "chars": features["char_count"],
            "words": features["word_count"],
            "lines": features["line_count"],
            "urls": features["url_count"],
            "code_blocks": features["code_block_count"],
        }

        medium_count = 0
        high_count = 0

        for metric, value in scores.items():
            thresholds = COMPLEXITY_THRESHOLDS.get(metric, {"low": 0, "medium": 0})
            if value > thresholds["medium"]:
                high_count += 1
            elif value > thresholds["low"]:
                medium_count += 1

        # More than two code blocks is treated as high complexity outright.
        if high_count >= 2 or scores["code_blocks"] > 2:
            return ComplexityLevel.HIGH
        elif medium_count >= 2 or high_count >= 1:
            return ComplexityLevel.MEDIUM
        else:
            return ComplexityLevel.LOW

    def _classify_task_type(
        self,
        text: str,
        features: Dict[str, Any]
    ) -> Tuple[TaskType, float, str]:
        """
        Classify the task type based on keywords and features.

        Returns:
            Tuple of (task_type, confidence, reason)
        """
        words = features["unique_words"]
        lowered = features["lowercased_text"]

        # Match every keyword set up front (single tokens + multi-word
        # phrases); the results are reused for scoring and for gating the
        # FAST_OPS fallback below.
        code_matches = self._match_keywords(words, lowered, CODE_KEYWORDS)
        reasoning_matches = self._match_keywords(words, lowered, REASONING_KEYWORDS)
        research_matches = self._match_keywords(words, lowered, RESEARCH_KEYWORDS)
        creative_matches = self._match_keywords(words, lowered, CREATIVE_KEYWORDS)
        fast_ops_matches = self._match_keywords(words, lowered, FAST_OPS_KEYWORDS)
        tool_matches = self._match_keywords(words, lowered, TOOL_USE_KEYWORDS)

        # Score each task type; ties go to the earlier TaskType member.
        scores: Dict[TaskType, float] = {task: 0.0 for task in TaskType}
        reasons: Dict[TaskType, str] = {}

        # CODE scoring: code blocks are the strongest signal (+2.0), keywords
        # add up to 3.0 more.
        if features["has_code"]:
            scores[TaskType.CODE] += 2.0
            reasons[TaskType.CODE] = "Contains code blocks"
        if code_matches:
            scores[TaskType.CODE] += min(len(code_matches) * 0.5, 3.0)
            if TaskType.CODE not in reasons:
                reasons[TaskType.CODE] = f"Code keywords: {', '.join(sorted(code_matches)[:3])}"

        # REASONING scoring: keyword hits plus a bonus for an explicit
        # reasoning request.
        if reasoning_matches:
            scores[TaskType.REASONING] += min(len(reasoning_matches) * 0.4, 2.5)
            reasons[TaskType.REASONING] = f"Reasoning keywords: {', '.join(sorted(reasoning_matches)[:3])}"
        if any(phrase in lowered for phrase in ["step by step", "chain of thought", "think through"]):
            scores[TaskType.REASONING] += 1.5
            reasons[TaskType.REASONING] = "Explicit reasoning request"

        # RESEARCH scoring: URLs are a strong research signal.
        if features["url_count"] > 0:
            scores[TaskType.RESEARCH] += 1.5
            reasons[TaskType.RESEARCH] = f"Contains {features['url_count']} URL(s)"
        if research_matches:
            scores[TaskType.RESEARCH] += min(len(research_matches) * 0.4, 2.0)
            if TaskType.RESEARCH not in reasons:
                reasons[TaskType.RESEARCH] = f"Research keywords: {', '.join(sorted(research_matches)[:3])}"

        # CREATIVE scoring
        if creative_matches:
            scores[TaskType.CREATIVE] += min(len(creative_matches) * 0.4, 2.5)
            reasons[TaskType.CREATIVE] = f"Creative keywords: {', '.join(sorted(creative_matches)[:3])}"

        # FAST_OPS scoring (simple queries) — ONLY when no other task type has
        # any signal, so a short code/tool prompt is not misrouted.
        is_very_short = features["word_count"] <= 5 and features["char_count"] < 50
        other_scores_possible = bool(
            code_matches or reasoning_matches or research_matches
            or creative_matches or tool_matches or features["has_code"]
        )

        if is_very_short and not other_scores_possible:
            scores[TaskType.FAST_OPS] += 1.5
            reasons[TaskType.FAST_OPS] = "Very short, simple query"
        elif not other_scores_possible and fast_ops_matches and features["word_count"] < 30:
            scores[TaskType.FAST_OPS] += min(len(fast_ops_matches) * 0.3, 1.0)
            reasons[TaskType.FAST_OPS] = f"Simple query keywords: {', '.join(sorted(fast_ops_matches)[:3])}"

        # TOOL_USE scoring: keyword hits plus a small bonus for imperative
        # command verbs.
        if tool_matches:
            scores[TaskType.TOOL_USE] += min(len(tool_matches) * 0.4, 2.0)
            reasons[TaskType.TOOL_USE] = f"Tool keywords: {', '.join(sorted(tool_matches)[:3])}"
        if any(cmd in lowered for cmd in ["run ", "execute ", "call ", "use "]):
            scores[TaskType.TOOL_USE] += 0.5

        # Pick the highest-scoring task type (first one wins ties).
        best_task = TaskType.UNKNOWN
        best_score = 0.0
        for task, score in scores.items():
            if score > best_score:
                best_score = score
                best_task = task

        # Confidence saturates at a raw score of 4.0.
        confidence = min(best_score / 4.0, 1.0) if best_score > 0 else 0.0
        reason = reasons.get(best_task, "No strong indicators")

        return best_task, confidence, reason

    def _get_backends_for_task(
        self,
        task_type: TaskType,
        complexity: ComplexityLevel,
        features: Dict[str, Any]
    ) -> List[str]:
        """Get ranked list of preferred backends for the task.

        Starts from TASK_BACKEND_MAP and applies complexity- and
        feature-based re-ranking (long-context models for heavy research/code,
        Groq first for trivial ops, Codex boost when code is present).
        """
        base_backends = self.TASK_BACKEND_MAP.get(task_type, self.TASK_BACKEND_MAP[TaskType.UNKNOWN])

        if complexity == ComplexityLevel.HIGH and task_type in (TaskType.RESEARCH, TaskType.CODE):
            # For high complexity, prioritize long-context models.
            if BACKEND_KIMI in base_backends:
                base_backends = self._prioritize_backend(base_backends, BACKEND_KIMI, 2)
            if BACKEND_GEMINI in base_backends:
                base_backends = self._prioritize_backend(base_backends, BACKEND_GEMINI, 3)

        elif complexity == ComplexityLevel.LOW and task_type == TaskType.FAST_OPS:
            # For simple ops, ensure the fastest backend is first.
            base_backends = self._prioritize_backend(base_backends, BACKEND_GROQ, 0)

        if features["has_code"] and task_type != TaskType.CODE:
            # Boost OpenAI Codex if there's code but not explicitly a code task.
            base_backends = self._prioritize_backend(base_backends, BACKEND_OPENAI_CODEX, 2)

        return list(base_backends)

    def _prioritize_backend(
        self,
        backends: List[str],
        target: str,
        target_index: int
    ) -> List[str]:
        """Return a copy of *backends* with *target* moved to *target_index*.

        No-op (returns the input list unchanged) when *target* is absent;
        the index is clamped to the end of the list.
        """
        if target not in backends:
            return backends

        new_backends = list(backends)
        new_backends.remove(target)
        new_backends.insert(min(target_index, len(new_backends)), target)
        return new_backends

    def _build_reason(
        self,
        task_type: TaskType,
        complexity: ComplexityLevel,
        task_reason: str,
        features: Dict[str, Any]
    ) -> str:
        """Build a human-readable, semicolon-joined reason string."""
        parts = [
            f"Task: {task_type.value}",
            f"Complexity: {complexity.value}",
        ]

        if task_reason:
            parts.append(f"Indicators: {task_reason}")

        # Append a compact feature summary (code blocks, URLs, long prompts).
        feature_parts = []
        if features["has_code"]:
            feature_parts.append(f"{features['code_block_count']} code block(s)")
        if features["url_count"] > 0:
            feature_parts.append(f"{features['url_count']} URL(s)")
        if features["word_count"] > 100:
            feature_parts.append(f"{features['word_count']} words")

        if feature_parts:
            parts.append(f"Features: {', '.join(feature_parts)}")

        return "; ".join(parts)

    def _default_result(self, reason: str) -> ClassificationResult:
        """Return a low-confidence UNKNOWN result for edge cases (empty prompt)."""
        return ClassificationResult(
            task_type=TaskType.UNKNOWN,
            preferred_backends=list(self.TASK_BACKEND_MAP[TaskType.UNKNOWN]),
            complexity=ComplexityLevel.LOW,
            reason=reason,
            confidence=0.0,
            features={},
        )

    def to_dict(self, result: ClassificationResult) -> Dict[str, Any]:
        """Convert a ClassificationResult to a JSON-friendly dictionary.

        Bulky/internal feature entries (token set, lowercased text, raw URL
        list) are excluded from the output.
        """
        return {
            "task_type": result.task_type.value,
            "preferred_backends": result.preferred_backends,
            "complexity": result.complexity.value,
            "reason": result.reason,
            "confidence": round(result.confidence, 2),
            "features": {
                k: v for k, v in result.features.items()
                if k not in ("unique_words", "lowercased_text", "urls")
            },
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Convenience function for direct usage
|
||||||
|
def classify_prompt(
    prompt: str,
    context: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Classify a prompt and return the routing recommendation as a dictionary.

    Convenience wrapper: builds a throwaway TaskClassifier, runs the
    classification, and serializes the result via ``to_dict``.

    Args:
        prompt: The user message to classify
        context: Optional context (previous messages, session state, etc.)

    Returns:
        Dictionary with task_type, preferred_backends, complexity, reason, confidence
    """
    clf = TaskClassifier()
    return clf.to_dict(clf.classify(prompt, context))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage and quick test: one representative prompt per task type,
    # printing the routing decision for each.
    test_prompts = [
        "Hello, how are you?",
        "Implement a Python function to calculate fibonacci numbers",
        "Analyze the architectural trade-offs between microservices and monoliths",
        "Research the latest papers on transformer architectures",
        "Write a creative story about AI",
        "Check the status of the server and list running processes",
        "Use the browser to navigate to https://example.com and take a screenshot",
        "Refactor this large codebase: [2000 lines of code]",
    ]

    classifier = TaskClassifier()

    # Print type, complexity, confidence, top-3 backends and reason per prompt.
    for prompt in test_prompts:
        result = classifier.classify(prompt)
        print(f"\nPrompt: {prompt[:60]}...")
        print(f" Type: {result.task_type.value}")
        print(f" Complexity: {result.complexity.value}")
        print(f" Confidence: {result.confidence:.2f}")
        print(f" Backends: {', '.join(result.preferred_backends[:3])}")
        print(f" Reason: {result.reason}")
||||||
379
uniwizard/task_classifier_design.md
Normal file
379
uniwizard/task_classifier_design.md
Normal file
@@ -0,0 +1,379 @@
|
|||||||
|
# Task Classifier Design Document
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The **Task Classifier** is an enhanced prompt routing system for the Uniwizard agent harness. It classifies incoming user prompts into task categories and maps them to ranked backend preferences, enabling intelligent model selection across the 7-backend fallback chain.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **Right-size every request**: Route simple queries to fast backends, complex tasks to capable ones
|
||||||
|
2. **Minimize latency**: Use Groq (284ms) for fast operations, Anthropic for deep reasoning
|
||||||
|
3. **Maximize quality**: Match task type to backend strengths
|
||||||
|
4. **Provide transparency**: Return clear reasoning for routing decisions
|
||||||
|
5. **Enable fallback**: Support the full 7-backend chain with intelligent ordering
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐
|
||||||
|
│ User Prompt │
|
||||||
|
└────────┬────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ Feature Extraction │
|
||||||
|
│ - Length metrics │
|
||||||
|
│ - Code detection │
|
||||||
|
│ - URL extraction │
|
||||||
|
│ - Keyword tokenize │
|
||||||
|
└────────┬────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ Complexity Assess │
|
||||||
|
│ - Low/Medium/High │
|
||||||
|
└────────┬────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ Task Classification│
|
||||||
|
│ - Code │
|
||||||
|
│ - Reasoning │
|
||||||
|
│ - Research │
|
||||||
|
│ - Creative │
|
||||||
|
│ - Fast Ops │
|
||||||
|
│ - Tool Use │
|
||||||
|
└────────┬────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ Backend Selection │
|
||||||
|
│ - Ranked by task │
|
||||||
|
│ - Complexity adj. │
|
||||||
|
│ - Feature boosts │
|
||||||
|
└────────┬────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ ClassificationResult
|
||||||
|
│ - task_type │
|
||||||
|
│ - preferred_backends
|
||||||
|
│ - complexity │
|
||||||
|
│ - reason │
|
||||||
|
│ - confidence │
|
||||||
|
└─────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Task Types
|
||||||
|
|
||||||
|
| Task Type | Description | Primary Indicators |
|
||||||
|
|-----------|-------------|-------------------|
|
||||||
|
| `code` | Programming tasks, debugging, refactoring | Keywords: implement, debug, refactor, test, function, class, API |
|
||||||
|
| `reasoning` | Analysis, comparison, evaluation | Keywords: analyze, compare, evaluate, step by step, trade-offs |
|
||||||
|
| `research` | Information gathering, literature review | Keywords: research, find, paper, study, URLs present |
|
||||||
|
| `creative` | Writing, design, content creation | Keywords: write, create, design, story, poem, brainstorm |
|
||||||
|
| `fast_ops` | Quick status checks, simple queries | Short length (<20 words), simple keywords |
|
||||||
|
| `tool_use` | Tool invocations, commands, API calls | Keywords: run, execute, use tool, browser, delegate |
|
||||||
|
| `unknown` | No clear indicators | Fallback classification |
|
||||||
|
|
||||||
|
## Backend Strengths Mapping
|
||||||
|
|
||||||
|
### 1. Anthropic (Claude)
|
||||||
|
- **Strengths**: Deep reasoning, code review, complex analysis, tool use
|
||||||
|
- **Best for**: Reasoning, tool_use, complex code review
|
||||||
|
- **Ranking**: #1 for reasoning, #2 for code, #1 for tool_use
|
||||||
|
|
||||||
|
### 2. OpenAI Codex
|
||||||
|
- **Strengths**: Code generation, feature implementation
|
||||||
|
- **Best for**: Code tasks, structured outputs
|
||||||
|
- **Ranking**: #1 for code generation
|
||||||
|
|
||||||
|
### 3. Gemini
|
||||||
|
- **Strengths**: Research, multimodal, long context
|
||||||
|
- **Best for**: Research tasks, document analysis
|
||||||
|
- **Ranking**: #1 for research, #2 for reasoning
|
||||||
|
|
||||||
|
### 4. Groq
|
||||||
|
- **Strengths**: Speed (284ms latency)
|
||||||
|
- **Best for**: Fast operations, simple queries, triage
|
||||||
|
- **Ranking**: #1 for fast_ops
|
||||||
|
|
||||||
|
### 5. Grok
|
||||||
|
- **Strengths**: Broad knowledge, creative, drafting
|
||||||
|
- **Best for**: Creative writing, general knowledge
|
||||||
|
- **Ranking**: #1 for creative
|
||||||
|
|
||||||
|
### 6. Kimi (kimi-coding)
|
||||||
|
- **Strengths**: Long context (262K tokens), code refactoring
|
||||||
|
- **Best for**: Large codebase work, long document analysis
|
||||||
|
- **Ranking**: Boosted for high-complexity code/research
|
||||||
|
|
||||||
|
### 7. OpenRouter
|
||||||
|
- **Strengths**: Broadest model access, overflow handling
|
||||||
|
- **Best for**: Fallback, variety of model choices
|
||||||
|
- **Ranking**: #6 or #7 across all task types
|
||||||
|
|
||||||
|
## Backend Rankings by Task Type
|
||||||
|
|
||||||
|
```python
|
||||||
|
CODE = [
|
||||||
|
openai-codex, # Best generation
|
||||||
|
anthropic, # Review & analysis
|
||||||
|
kimi, # Large codebases
|
||||||
|
gemini, # Multimodal
|
||||||
|
groq, # Fast simple tasks
|
||||||
|
openrouter, # Overflow
|
||||||
|
grok, # General backup
|
||||||
|
]
|
||||||
|
|
||||||
|
REASONING = [
|
||||||
|
anthropic, # Deep reasoning
|
||||||
|
gemini, # Analysis
|
||||||
|
kimi, # Long chains
|
||||||
|
grok, # Broad knowledge
|
||||||
|
openai-codex, # Structured
|
||||||
|
openrouter,
|
||||||
|
groq,
|
||||||
|
]
|
||||||
|
|
||||||
|
RESEARCH = [
|
||||||
|
gemini, # Research leader
|
||||||
|
kimi, # 262K context
|
||||||
|
anthropic, # Deep analysis
|
||||||
|
grok, # Knowledge
|
||||||
|
openrouter, # Broad access
|
||||||
|
openai-codex,
|
||||||
|
groq, # Triage
|
||||||
|
]
|
||||||
|
|
||||||
|
CREATIVE = [
|
||||||
|
grok, # Creative writing
|
||||||
|
anthropic, # Nuanced
|
||||||
|
gemini, # Multimodal
|
||||||
|
openai-codex, # Creative coding
|
||||||
|
kimi, # Long-form
|
||||||
|
openrouter,
|
||||||
|
groq,
|
||||||
|
]
|
||||||
|
|
||||||
|
FAST_OPS = [
|
||||||
|
groq, # 284ms champion
|
||||||
|
openrouter, # Fast mini models
|
||||||
|
gemini, # Flash
|
||||||
|
grok, # Simple queries
|
||||||
|
anthropic,
|
||||||
|
openai-codex,
|
||||||
|
kimi,
|
||||||
|
]
|
||||||
|
|
||||||
|
TOOL_USE = [
|
||||||
|
anthropic, # Tool use leader
|
||||||
|
openai-codex, # Good integration
|
||||||
|
gemini, # Multimodal
|
||||||
|
groq, # Fast chaining
|
||||||
|
kimi, # Long sessions
|
||||||
|
openrouter,
|
||||||
|
grok,
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Complexity Assessment
|
||||||
|
|
||||||
|
Complexity is determined by:
|
||||||
|
|
||||||
|
| Metric | Low | Medium | High |
|
||||||
|
|--------|-----|--------|------|
|
||||||
|
| Characters | ≤200 | 201-800 | >800 |
|
||||||
|
| Words | ≤35 | 36-150 | >150 |
|
||||||
|
| Lines | ≤3 | 4-15 | >15 |
|
||||||
|
| URLs | 0 | 1 | ≥2 |
|
||||||
|
| Code Blocks | 0 | 1 | ≥2 |
|
||||||
|
|
||||||
|
**Rules:**
|
||||||
|
- 2+ high metrics → **HIGH** complexity
|
||||||
|
- 2+ medium metrics or 1 high → **MEDIUM** complexity
|
||||||
|
- Otherwise → **LOW** complexity
|
||||||
|
|
||||||
|
### Complexity Adjustments
|
||||||
|
|
||||||
|
- **HIGH complexity + RESEARCH/CODE**: Boost Kimi and Gemini in rankings
|
||||||
|
- **LOW complexity + FAST_OPS**: Ensure Groq is first
|
||||||
|
- **Code blocks present**: Boost OpenAI Codex in any task type
|
||||||
|
|
||||||
|
## Keyword Dictionaries
|
||||||
|
|
||||||
|
The classifier uses curated keyword sets for each task type:
|
||||||
|
|
||||||
|
### Code Keywords (100+)
|
||||||
|
- Implementation: implement, code, function, class, module
|
||||||
|
- Debugging: debug, error, exception, traceback, bug, fix
|
||||||
|
- Testing: test, pytest, unittest, coverage
|
||||||
|
- Operations: deploy, docker, kubernetes, ci/cd, pipeline
|
||||||
|
- Concepts: api, endpoint, database, query, authentication
|
||||||
|
|
||||||
|
### Reasoning Keywords (50+)
|
||||||
|
- Analysis: analyze, evaluate, assess, critique, review
|
||||||
|
- Logic: reason, deduce, infer, logic, argument, evidence
|
||||||
|
- Process: compare, contrast, trade-off, strategy, plan
|
||||||
|
- Modifiers: step by step, chain of thought, think through
|
||||||
|
|
||||||
|
### Research Keywords (80+)
|
||||||
|
- Actions: research, find, search, explore, discover
|
||||||
|
- Sources: paper, publication, journal, arxiv, dataset
|
||||||
|
- Methods: study, survey, experiment, benchmark, evaluation
|
||||||
|
- Domains: machine learning, neural network, sota, literature
|
||||||
|
|
||||||
|
### Creative Keywords (100+)
|
||||||
|
- Visual: art, paint, draw, design, graphic, image
|
||||||
|
- Writing: write, story, novel, poem, essay, content
|
||||||
|
- Audio: music, song, compose, melody, sound
|
||||||
|
- Process: brainstorm, ideate, concept, imagine, inspire
|
||||||
|
|
||||||
|
### Fast Ops Keywords (60+)
|
||||||
|
- Simple: quick, fast, brief, simple, easy, status
|
||||||
|
- Actions: list, show, get, check, count, find
|
||||||
|
- Short queries: hi, hello, thanks, yes/no, what is
|
||||||
|
|
||||||
|
### Tool Use Keywords (70+)
|
||||||
|
- Actions: run, execute, call, use tool, invoke
|
||||||
|
- Systems: terminal, shell, docker, kubernetes, git
|
||||||
|
- Protocols: api, http, request, response, webhook
|
||||||
|
- Agents: delegate, subagent, spawn, mcp
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
### Classify a Prompt
|
||||||
|
|
||||||
|
```python
|
||||||
|
from task_classifier import TaskClassifier, classify_prompt
|
||||||
|
|
||||||
|
# Method 1: Using the class
|
||||||
|
classifier = TaskClassifier()
|
||||||
|
result = classifier.classify("Implement a Python function")
|
||||||
|
|
||||||
|
print(result.task_type) # TaskType.CODE
|
||||||
|
print(result.preferred_backends) # ["openai-codex", "anthropic", ...]
|
||||||
|
print(result.complexity) # ComplexityLevel.LOW
|
||||||
|
print(result.reason) # "Task: code; Complexity: low; ..."
|
||||||
|
print(result.confidence) # 0.75
|
||||||
|
|
||||||
|
# Method 2: Convenience function
|
||||||
|
output = classify_prompt("Research AI papers")
|
||||||
|
# Returns dict: {
|
||||||
|
# "task_type": "research",
|
||||||
|
# "preferred_backends": ["gemini", "kimi", ...],
|
||||||
|
# "complexity": "low",
|
||||||
|
# "reason": "...",
|
||||||
|
# "confidence": 0.65,
|
||||||
|
# "features": {...}
|
||||||
|
# }
|
||||||
|
```
|
||||||
|
|
||||||
|
### ClassificationResult Fields
|
||||||
|
|
||||||
|
| Field | Type | Description |
|
||||||
|
|-------|------|-------------|
|
||||||
|
| `task_type` | TaskType | Classified task category |
|
||||||
|
| `preferred_backends` | List[str] | Ranked list of backend identifiers |
|
||||||
|
| `complexity` | ComplexityLevel | Assessed complexity level |
|
||||||
|
| `reason` | str | Human-readable classification reasoning |
|
||||||
|
| `confidence` | float | 0.0-1.0 confidence score |
|
||||||
|
| `features` | Dict | Extracted features (lengths, code, URLs) |
|
||||||
|
|
||||||
|
## Integration with Hermes
|
||||||
|
|
||||||
|
### Usage in Smart Model Routing
|
||||||
|
|
||||||
|
The task classifier replaces/enhances the existing `smart_model_routing.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# In hermes-agent/agent/smart_model_routing.py
|
||||||
|
from uniwizard.task_classifier import TaskClassifier
|
||||||
|
|
||||||
|
classifier = TaskClassifier()
|
||||||
|
|
||||||
|
def resolve_turn_route(user_message, routing_config, primary, fallback_chain):
|
||||||
|
# Classify the prompt
|
||||||
|
result = classifier.classify(user_message)
|
||||||
|
|
||||||
|
# Map preferred backends to actual models from fallback_chain
|
||||||
|
for backend in result.preferred_backends:
|
||||||
|
model_config = fallback_chain.get(backend)
|
||||||
|
if model_config and is_available(backend):
|
||||||
|
return {
|
||||||
|
"model": model_config["model"],
|
||||||
|
"provider": backend,
|
||||||
|
"reason": result.reason,
|
||||||
|
"complexity": result.complexity.value,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Fallback to primary
|
||||||
|
return primary
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yaml
|
||||||
|
smart_model_routing:
|
||||||
|
enabled: true
|
||||||
|
use_task_classifier: true
|
||||||
|
|
||||||
|
fallback_providers:
|
||||||
|
- provider: anthropic
|
||||||
|
model: claude-opus-4-6
|
||||||
|
- provider: openai-codex
|
||||||
|
model: codex
|
||||||
|
- provider: gemini
|
||||||
|
model: gemini-2.5-flash
|
||||||
|
- provider: groq
|
||||||
|
model: llama-3.3-70b-versatile
|
||||||
|
- provider: grok
|
||||||
|
model: grok-3-mini-fast
|
||||||
|
- provider: kimi-coding
|
||||||
|
model: kimi-k2.5
|
||||||
|
- provider: openrouter
|
||||||
|
model: openai/gpt-4.1-mini
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
Run the test suite:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ~/.timmy/uniwizard
|
||||||
|
python -m pytest test_task_classifier.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Coverage includes:
|
||||||
|
- Feature extraction (URLs, code blocks, length metrics)
|
||||||
|
- Complexity assessment (low/medium/high)
|
||||||
|
- Task type classification (all 6 primary types; `unknown` is the fallback)
|
||||||
|
- Backend selection (rankings by task type)
|
||||||
|
- Complexity adjustments (boosts for Kimi/Gemini)
|
||||||
|
- Edge cases (empty, whitespace, very long prompts)
|
||||||
|
- Integration scenarios (realistic use cases)
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
1. **Session Context**: Use conversation history for better classification
|
||||||
|
2. **Performance Feedback**: Learn from actual backend performance
|
||||||
|
3. **User Preferences**: Allow user-defined backend preferences
|
||||||
|
4. **Cost Optimization**: Factor in backend costs for routing
|
||||||
|
5. **Streaming Detection**: Identify streaming-suitable tasks
|
||||||
|
6. **Multi-Modal**: Better handling of image/audio inputs
|
||||||
|
7. **Confidence Thresholds**: Configurable confidence cutoffs
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `task_classifier.py` | Main implementation (600+ lines) |
|
||||||
|
| `test_task_classifier.py` | Unit tests (400+ lines) |
|
||||||
|
| `task_classifier_design.md` | This design document |
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- Gitea Issue: timmy-home #88
|
||||||
|
- Existing: `~/.hermes/hermes-agent/agent/smart_model_routing.py`
|
||||||
|
- Config: `~/.hermes/config.yaml` (fallback_providers chain)
|
||||||
534
uniwizard/test_quality_scorer.py
Normal file
534
uniwizard/test_quality_scorer.py
Normal file
@@ -0,0 +1,534 @@
|
|||||||
|
"""
|
||||||
|
Tests for the Uniwizard Quality Scorer module.
|
||||||
|
|
||||||
|
Run with: python -m pytest ~/.timmy/uniwizard/test_quality_scorer.py -v
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from quality_scorer import (
|
||||||
|
QualityScorer,
|
||||||
|
ResponseStatus,
|
||||||
|
TaskType,
|
||||||
|
BACKENDS,
|
||||||
|
BackendScore,
|
||||||
|
print_score_report,
|
||||||
|
print_full_report,
|
||||||
|
get_scorer,
|
||||||
|
record,
|
||||||
|
recommend,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestQualityScorer:
    """Tests for the QualityScorer class.

    Every test receives an isolated on-disk SQLite database through the
    ``temp_db`` fixture, so recorded responses never leak between tests.
    """

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database for testing."""
        # delete=False keeps the file on disk after the context manager
        # closes the handle; the fixture removes it itself after the test.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        yield db_path
        db_path.unlink(missing_ok=True)

    @pytest.fixture
    def scorer(self, temp_db):
        """Create a fresh QualityScorer with temp database."""
        return QualityScorer(db_path=temp_db)

    def test_init_creates_database(self, temp_db):
        """Test that initialization creates the database and tables."""
        scorer = QualityScorer(db_path=temp_db)
        assert temp_db.exists()

        # Verify schema
        conn = sqlite3.connect(str(temp_db))
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        )
        tables = {row[0] for row in cursor.fetchall()}
        assert "responses" in tables
        conn.close()

    def test_record_response_success(self, scorer):
        """Test recording a successful response."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=150.0,
            metadata={"model": "claude-3-opus"}
        )

        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)
        assert score.total_requests == 1
        assert score.success_count == 1
        assert score.error_count == 0

    def test_record_response_error(self, scorer):
        """Test recording an error response."""
        scorer.record_response(
            backend="groq",
            task_type=TaskType.FAST_OPS,
            status=ResponseStatus.ERROR,
            latency_ms=500.0,
            ttft_ms=50.0
        )

        score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value)
        assert score.total_requests == 1
        assert score.success_count == 0
        assert score.error_count == 1

    def test_record_response_refusal(self, scorer):
        """Test recording a refusal response."""
        scorer.record_response(
            backend="gemini",
            task_type=TaskType.CREATIVE,
            status=ResponseStatus.REFUSAL,
            latency_ms=300.0,
            ttft_ms=100.0
        )

        score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value)
        assert score.refusal_count == 1

    def test_record_response_timeout(self, scorer):
        """Test recording a timeout response."""
        # ttft_ms=0.0: a timed-out request never produced a first token.
        scorer.record_response(
            backend="openrouter",
            task_type=TaskType.RESEARCH,
            status=ResponseStatus.TIMEOUT,
            latency_ms=30000.0,
            ttft_ms=0.0
        )

        score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value)
        assert score.timeout_count == 1

    def test_record_invalid_backend(self, scorer):
        """Test that invalid backend raises ValueError."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.record_response(
                backend="invalid-backend",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=100.0
            )

    def test_rolling_window_pruning(self, scorer):
        """Test that old records are pruned beyond window size."""
        # Add more than ROLLING_WINDOW_SIZE records
        # (assumes ROLLING_WINDOW_SIZE is 100 — see assertion below).
        for i in range(110):
            scorer.record_response(
                backend="kimi-coding",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=float(i),
                ttft_ms=50.0
            )

        # Should only have 100 records
        stats = scorer.get_stats()
        assert stats["by_backend"]["kimi-coding"] == 100

    def test_recommend_backend_basic(self, scorer):
        """Test backend recommendation with sample data."""
        # Add some data for multiple backends: 8 successes + 2 errors each,
        # with anthropic twice as slow as the others.
        for backend in ["anthropic", "groq", "gemini"]:
            for i in range(10):
                scorer.record_response(
                    backend=backend,
                    task_type=TaskType.CODE,
                    status=ResponseStatus.SUCCESS if i < 8 else ResponseStatus.ERROR,
                    latency_ms=1000.0 if backend == "anthropic" else 500.0,
                    ttft_ms=200.0
                )

        recommendations = scorer.recommend_backend(TaskType.CODE.value)

        # Should return all 7 backends
        assert len(recommendations) == 7

        # Top 3 should have scores
        top_3 = [b for b, s in recommendations[:3]]
        assert "groq" in top_3  # Fastest latency should win

    def test_recommend_backend_insufficient_data(self, scorer):
        """Test recommendation with insufficient samples."""
        # Add only 2 samples for one backend
        for i in range(2):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )

        recommendations = scorer.recommend_backend(TaskType.CODE.value, min_samples=5)

        # Should penalize low-sample backend
        anthropic_score = next(s for b, s in recommendations if b == "anthropic")
        assert anthropic_score < 50  # Penalized for low samples

    def test_get_all_scores(self, scorer):
        """Test getting scores for all backends."""
        # Add data for some backends
        for backend in ["anthropic", "groq"]:
            scorer.record_response(
                backend=backend,
                task_type=TaskType.REASONING,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )

        all_scores = scorer.get_all_scores(TaskType.REASONING.value)

        # Backends with no traffic still appear, with zeroed counters.
        assert len(all_scores) == 7
        assert all_scores["anthropic"].total_requests == 1
        assert all_scores["groq"].total_requests == 1
        assert all_scores["gemini"].total_requests == 0

    def test_get_task_breakdown(self, scorer):
        """Test getting per-task breakdown for a backend."""
        # Add data for different task types
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.REASONING,
            status=ResponseStatus.SUCCESS,
            latency_ms=2000.0,
            ttft_ms=300.0
        )

        breakdown = scorer.get_task_breakdown("anthropic")

        # NOTE(review): asserts quality_scorer's TaskType has exactly 5
        # members — confirm if task types are ever added.
        assert len(breakdown) == 5  # 5 task types
        assert breakdown["code"].total_requests == 1
        assert breakdown["reasoning"].total_requests == 1

    def test_score_calculation(self, scorer):
        """Test the composite score calculation."""
        # Add perfect responses
        for i in range(10):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=100.0,  # Very fast
                ttft_ms=50.0
            )

        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)

        # Should have high score for perfect performance
        assert score.score > 90
        assert score.success_count == 10
        assert score.avg_latency_ms == 100.0

    def test_score_with_errors(self, scorer):
        """Test scoring with mixed success/error."""
        for i in range(5):
            scorer.record_response(
                backend="grok",
                task_type=TaskType.RESEARCH,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )
        for i in range(5):
            scorer.record_response(
                backend="grok",
                task_type=TaskType.RESEARCH,
                status=ResponseStatus.ERROR,
                latency_ms=500.0,
                ttft_ms=100.0
            )

        score = scorer.get_backend_score("grok", TaskType.RESEARCH.value)

        assert score.total_requests == 10
        assert score.success_count == 5
        assert score.error_count == 5
        # Score: 50% success + low error penalty = ~71 with good latency
        assert 60 < score.score < 80

    def test_p95_calculation(self, scorer):
        """Test P95 latency calculation."""
        # Add latencies from 1ms to 100ms
        for i in range(1, 101):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=float(i),
                ttft_ms=50.0
            )

        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)

        # P95 should be around 95
        assert 90 <= score.p95_latency_ms <= 100

    def test_clear_data(self, scorer):
        """Test clearing all data."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        scorer.clear_data()

        stats = scorer.get_stats()
        assert stats["total_records"] == 0

    def test_string_task_type(self, scorer):
        """Test that string task types work alongside TaskType enum."""
        scorer.record_response(
            backend="openai-codex",
            task_type="code",  # String instead of enum
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        score = scorer.get_backend_score("openai-codex", "code")
        assert score.total_requests == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestConvenienceFunctions:
    """Tests for module-level convenience functions.

    These functions resolve their database via the module-level
    ``DEFAULT_DB_PATH``, so the fixture redirects that global to a
    temporary file for the duration of each test.
    """

    @pytest.fixture
    def temp_db(self, monkeypatch):
        """Point quality_scorer.DEFAULT_DB_PATH at a temporary database."""
        # delete=False keeps the file after the handle closes; the fixture
        # removes it itself in teardown.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)

        # monkeypatch restores the original DEFAULT_DB_PATH automatically,
        # even if the test raises — the previous manual save/restore could
        # leak the patched global if an error occurred before yield.
        import quality_scorer
        monkeypatch.setattr(quality_scorer, "DEFAULT_DB_PATH", db_path)

        yield db_path

        db_path.unlink(missing_ok=True)

    def test_get_scorer(self, temp_db):
        """Test get_scorer convenience function."""
        scorer = get_scorer()
        assert isinstance(scorer, QualityScorer)

    def test_record_convenience(self, temp_db):
        """Test record convenience function."""
        record(
            backend="anthropic",
            task_type="code",
            status="success",
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        # NOTE(review): assumes get_scorer() reads DEFAULT_DB_PATH at call
        # time (no cached singleton) — confirm in quality_scorer.
        scorer = get_scorer()
        score = scorer.get_backend_score("anthropic", "code")
        assert score.total_requests == 1

    def test_recommend_convenience(self, temp_db):
        """Test recommend convenience function."""
        record(
            backend="anthropic",
            task_type="code",
            status="success",
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        recs = recommend("code")
        assert len(recs) == 7
        assert recs[0][0] == "anthropic"  # Should rank first since it has data
|
||||||
|
|
||||||
|
|
||||||
|
class TestPrintFunctions:
    """Smoke tests for the report-printing helpers."""

    @pytest.fixture
    def populated_scorer(self):
        """Yield a QualityScorer pre-filled with seeded demo traffic."""
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)

        scorer = QualityScorer(db_path=db_path)

        # Seeding the RNG keeps the demo data deterministic across runs.
        import random
        random.seed(42)

        # Loop-invariant outcome distribution, hoisted out of the loops.
        outcomes = [ResponseStatus.SUCCESS, ResponseStatus.ERROR,
                    ResponseStatus.REFUSAL, ResponseStatus.TIMEOUT]
        outcome_weights = [0.85, 0.08, 0.05, 0.02]

        for backend in BACKENDS:
            # The heavyweight backends get a higher mean latency.
            base_latency = 1000 if backend in ["anthropic", "openai-codex"] else 500
            for task in TaskType:
                for _ in range(20):
                    scorer.record_response(
                        backend=backend,
                        task_type=task.value,
                        status=random.choices(outcomes, weights=outcome_weights)[0],
                        latency_ms=random.gauss(base_latency, 200),
                        ttft_ms=random.gauss(150, 50),
                    )

        yield scorer
        db_path.unlink(missing_ok=True)

    def test_print_score_report(self, populated_scorer, capsys):
        """print_score_report runs cleanly and emits the headline."""
        print_score_report(populated_scorer)
        out = capsys.readouterr().out
        assert "UNIWIZARD BACKEND QUALITY SCORES" in out
        assert "anthropic" in out

    def test_print_full_report(self, populated_scorer, capsys):
        """print_full_report runs cleanly and emits both sections."""
        print_full_report(populated_scorer)
        out = capsys.readouterr().out
        assert "PER-TASK SPECIALIZATION" in out
        assert "RECOMMENDATIONS" in out
|
||||||
|
|
||||||
|
|
||||||
|
class TestEdgeCases:
    """Edge-case and error-handling coverage for the quality scorer."""

    @pytest.fixture
    def temp_db(self):
        """Yield a throwaway SQLite database path, removed on teardown."""
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as handle:
            path = Path(handle.name)
        yield path
        path.unlink(missing_ok=True)

    @pytest.fixture
    def scorer(self, temp_db):
        """A fresh QualityScorer backed by the temporary database."""
        return QualityScorer(db_path=temp_db)

    def test_empty_database(self, scorer):
        """With no recorded traffic every statistic reads as zero."""
        result = scorer.get_backend_score("anthropic", TaskType.CODE.value)

        assert result.total_requests == 0
        assert result.score == 0.0
        assert result.avg_latency_ms == 0.0

    def test_invalid_backend_in_get_score(self, scorer):
        """An unrecognised backend name is rejected by get_backend_score."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.get_backend_score("invalid")

    def test_invalid_backend_in_breakdown(self, scorer):
        """An unrecognised backend name is rejected by get_task_breakdown."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.get_task_breakdown("invalid")

    def test_zero_latency(self, scorer):
        """A zero-millisecond response is accepted and still scores well."""
        scorer.record_response(
            backend="groq",
            task_type=TaskType.FAST_OPS,
            status=ResponseStatus.SUCCESS,
            latency_ms=0.0,
            ttft_ms=0.0,
        )

        result = scorer.get_backend_score("groq", TaskType.FAST_OPS.value)
        assert result.avg_latency_ms == 0.0
        assert result.score > 50  # zero latency should not hurt the score

    def test_very_high_latency(self, scorer):
        """Extreme latency drags the score down despite full success."""
        scorer.record_response(
            backend="openrouter",
            task_type=TaskType.RESEARCH,
            status=ResponseStatus.SUCCESS,
            latency_ms=50000.0,  # 50 seconds
            ttft_ms=5000.0,
        )

        result = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value)
        # Success rate is 100%, but the latency penalty brings it down.
        assert result.score < 85

    def test_all_error_responses(self, scorer):
        """Ten straight errors leave the backend with a low score."""
        for _ in range(10):
            scorer.record_response(
                backend="gemini",
                task_type=TaskType.CODE,
                status=ResponseStatus.ERROR,
                latency_ms=1000.0,
                ttft_ms=200.0,
            )

        result = scorer.get_backend_score("gemini", TaskType.CODE.value)
        # 0% success but perfect refusal/timeout rates land around ~35.
        assert result.score < 45

    def test_all_refusal_responses(self, scorer):
        """Ten straight refusals are counted and keep the score low."""
        for _ in range(10):
            scorer.record_response(
                backend="gemini",
                task_type=TaskType.CREATIVE,
                status=ResponseStatus.REFUSAL,
                latency_ms=500.0,
                ttft_ms=100.0,
            )

        result = scorer.get_backend_score("gemini", TaskType.CREATIVE.value)
        assert result.refusal_count == 10
        # 0% success, 0% error, 100% refusal with good latency is ~49.
        assert result.score < 55

    def test_metadata_storage(self, scorer):
        """Metadata passed with a response round-trips through SQLite."""
        import json

        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0,
            metadata={"model": "claude-3-opus", "region": "us-east-1"},
        )

        # Read the raw row straight from the backing database.
        conn = sqlite3.connect(str(scorer.db_path))
        row = conn.execute("SELECT metadata FROM responses LIMIT 1").fetchone()
        conn.close()

        stored = json.loads(row[0])
        assert stored["model"] == "claude-3-opus"
||||||
|
# Allow running this test module directly: delegates to pytest's CLI runner.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
||||||
498
uniwizard/test_self_grader.py
Normal file
498
uniwizard/test_self_grader.py
Normal file
@@ -0,0 +1,498 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests for the Self-Grader Module
|
||||||
|
|
||||||
|
Run with: python -m pytest test_self_grader.py -v
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from self_grader import SelfGrader, SessionGrade, WeeklyReport
|
||||||
|
|
||||||
|
|
||||||
|
class TestSessionGrade:
    """Behaviour of the SessionGrade dataclass."""

    def test_session_grade_creation(self):
        """Every field handed to the constructor is readable back."""
        fields = dict(
            session_id="test-123",
            session_file="session_test.json",
            graded_at=datetime.now().isoformat(),
            task_completed=True,
            tool_calls_efficient=4,
            response_quality=5,
            errors_recovered=True,
            total_api_calls=10,
            model="claude-opus",
            platform="cli",
            session_start=datetime.now().isoformat(),
            duration_seconds=120.0,
            task_summary="Test task",
            total_errors=0,
            error_types="[]",
            tools_with_errors="[]",
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
            had_user_clarification=False,
        )
        grade = SessionGrade(**fields)

        assert grade.session_id == "test-123"
        assert grade.task_completed is True
        assert grade.tool_calls_efficient == 4
        assert grade.response_quality == 5
||||||
|
class TestSelfGraderInit:
    """Construction-time behaviour of SelfGrader."""

    def test_init_creates_database(self, tmp_path):
        """Instantiating SelfGrader materialises the grades DB and tables."""
        db_path = tmp_path / "grades.db"
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        SelfGrader(grades_db_path=db_path, sessions_dir=sessions_dir)

        assert db_path.exists()

        # The schema must contain both grading tables.
        with sqlite3.connect(db_path) as conn:
            rows = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
        table_names = {name for (name,) in rows}

        assert "session_grades" in table_names
        assert "weekly_reports" in table_names
||||||
|
class TestErrorDetection:
    """Error detection and classification heuristics."""

    @staticmethod
    def _grader(tmp_path):
        """Construct a SelfGrader rooted in the given temp directory."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

    def test_detect_exit_code_error(self, tmp_path):
        """Non-zero exit codes in tool output count as errors."""
        grader = self._grader(tmp_path)

        assert grader._detect_error('{"exit_code": 1, "output": ""}') is True
        assert grader._detect_error('{"exit_code": 0, "output": "success"}') is False
        assert grader._detect_error('') is False

    def test_detect_explicit_error(self, tmp_path):
        """Explicit error markers and tracebacks are recognised."""
        grader = self._grader(tmp_path)

        assert grader._detect_error('{"error": "file not found"}') is True
        assert grader._detect_error('Traceback (most recent call last):') is True
        assert grader._detect_error('Command failed with exception') is True

    def test_classify_file_not_found(self, tmp_path):
        """Missing-file messages classify as file_not_found."""
        grader = self._grader(tmp_path)

        message = "Error: file '/path/to/file' not found"
        assert grader._classify_error(message) == "file_not_found"

    def test_classify_timeout(self, tmp_path):
        """Timeout messages classify as timeout."""
        grader = self._grader(tmp_path)

        message = "Request timed out after 30 seconds"
        assert grader._classify_error(message) == "timeout"

    def test_classify_unknown(self, tmp_path):
        """Anything unrecognised falls through to the unknown bucket."""
        grader = self._grader(tmp_path)

        message = "Something weird happened"
        assert grader._classify_error(message) == "unknown"
||||||
|
class TestSessionAnalysis:
    """Message-transcript analysis."""

    @staticmethod
    def _grader(tmp_path):
        """Construct a SelfGrader rooted in the given temp directory."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

    def test_analyze_empty_messages(self, tmp_path):
        """An empty transcript produces all-zero analysis counters."""
        analysis = self._grader(tmp_path)._analyze_messages([])

        assert analysis['total_api_calls'] == 0
        assert analysis['total_errors'] == 0
        assert analysis['had_repeated_errors'] is False

    def test_analyze_simple_session(self, tmp_path):
        """A single exchange counts as one API call with no errors."""
        transcript = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]

        analysis = self._grader(tmp_path)._analyze_messages(transcript)

        assert analysis['total_api_calls'] == 1
        assert analysis['total_errors'] == 0

    def test_analyze_session_with_errors(self, tmp_path):
        """A failed-then-retried tool call is tallied as one error."""
        transcript = [
            {"role": "user", "content": "Run command"},
            {"role": "assistant", "content": "", "tool_calls": [
                {"function": {"name": "terminal"}}
            ]},
            {"role": "tool", "name": "terminal", "content": '{"exit_code": 1, "error": "failed"}'},
            {"role": "assistant", "content": "Let me try again", "tool_calls": [
                {"function": {"name": "terminal"}}
            ]},
            {"role": "tool", "name": "terminal", "content": '{"exit_code": 0, "output": "success"}'},
        ]

        analysis = self._grader(tmp_path)._analyze_messages(transcript)

        assert analysis['total_api_calls'] == 2
        assert analysis['total_errors'] == 1
        assert analysis['tools_with_errors'] == {"terminal"}

    def test_detect_repeated_errors(self, tmp_path):
        """Five identical failures trip the repeat/loop-risk flags."""
        transcript = []
        for _ in range(5):
            transcript.append({"role": "assistant", "content": "", "tool_calls": [
                {"function": {"name": "terminal"}}
            ]})
            transcript.append({"role": "tool", "name": "terminal",
                               "content": '{"exit_code": 1, "error": "failed"}'})

        analysis = self._grader(tmp_path)._analyze_messages(transcript)

        assert analysis['had_repeated_errors'] is True
        assert analysis['had_infinite_loop_risk'] is True
||||||
|
class TestGradingLogic:
    """Individual grading heuristics."""

    @staticmethod
    def _grader(tmp_path):
        """Construct a SelfGrader rooted in the given temp directory."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

    def test_assess_task_completion_success(self, tmp_path):
        """A confirmed completion message is judged as task done."""
        grader = self._grader(tmp_path)
        transcript = [
            {"role": "user", "content": "Create a file"},
            {"role": "assistant", "content": "Done! Created the file successfully."},
        ]

        analysis = grader._analyze_messages(transcript)
        assert grader._assess_task_completion(transcript, analysis) is True

    def test_assess_tool_efficiency_perfect(self, tmp_path):
        """Zero errors over a handful of calls scores the maximum 5."""
        grader = self._grader(tmp_path)

        score = grader._assess_tool_efficiency({
            'total_api_calls': 5,
            'total_errors': 0,
        })
        assert score == 5

    def test_assess_tool_efficiency_poor(self, tmp_path):
        """A 50% error rate scores 2 or worse."""
        grader = self._grader(tmp_path)

        score = grader._assess_tool_efficiency({
            'total_api_calls': 10,
            'total_errors': 5,
        })
        assert score <= 2

    def test_assess_response_quality_high(self, tmp_path):
        """Substantial content with a code block rates at least 4."""
        grader = self._grader(tmp_path)

        transcript = [
            {"role": "assistant", "content": "Here's the solution:\n```python\nprint('hello')\n```\n" + "x" * 1000}
        ]
        analysis = {
            'final_assistant_msg': transcript[0],
            'total_errors': 0,
            'had_repeated_errors': False,
            'had_infinite_loop_risk': False,
        }

        assert grader._assess_response_quality(transcript, analysis) >= 4

    def test_error_recovery_success(self, tmp_path):
        """A single error followed by a success message counts as recovered."""
        grader = self._grader(tmp_path)

        analysis = {
            'total_errors': 1,
            'had_repeated_errors': False,
        }
        transcript = [
            {"role": "assistant", "content": "Success after retry!"}
        ]

        assert grader._assess_error_recovery(transcript, analysis) is True
||||||
|
class TestSessionGrading:
    """End-to-end grading of session files."""

    def test_grade_simple_session(self, tmp_path):
        """A minimal successful session file grades as completed."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        # Write a two-message session transcript to disk.
        payload = {
            "session_id": "test-session-1",
            "model": "test-model",
            "platform": "cli",
            "session_start": datetime.now().isoformat(),
            "message_count": 2,
            "messages": [
                {"role": "user", "content": "Hello, create a test file"},
                {"role": "assistant", "content": "Done! Created test.txt successfully."}
            ],
        }
        session_file = sessions_dir / "session_test-session-1.json"
        session_file.write_text(json.dumps(payload))

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir,
        )

        grade = grader.grade_session_file(session_file)

        assert grade is not None
        assert grade.session_id == "test-session-1"
        assert grade.task_completed is True
        assert grade.total_api_calls == 1

    def test_save_and_retrieve_grade(self, tmp_path):
        """A saved grade is persisted into the session_grades table."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir,
        )

        grade = SessionGrade(**dict(
            session_id="test-save",
            session_file="test.json",
            graded_at=datetime.now().isoformat(),
            task_completed=True,
            tool_calls_efficient=4,
            response_quality=5,
            errors_recovered=True,
            total_api_calls=10,
            model="test-model",
            platform="cli",
            session_start=datetime.now().isoformat(),
            duration_seconds=60.0,
            task_summary="Test",
            total_errors=0,
            error_types="[]",
            tools_with_errors="[]",
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
            had_user_clarification=False,
        ))

        assert grader.save_grade(grade) is True

        # Confirm the row landed in the backing database.
        with sqlite3.connect(tmp_path / "grades.db") as conn:
            rows = conn.execute(
                "SELECT session_id, task_completed FROM session_grades"
            ).fetchall()

        assert len(rows) == 1
        assert rows[0][0] == "test-save"
        assert rows[0][1] == 1
||||||
|
class TestPatternIdentification:
    """Pattern mining over stored grades."""

    def test_identify_patterns_empty(self, tmp_path):
        """With no stored grades the pattern summary is all zeros."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir,
        )

        patterns = grader.identify_patterns(days=7)

        assert patterns['total_sessions'] == 0
        assert patterns['avg_tool_efficiency'] == 0

    def test_infer_task_type(self, tmp_path):
        """Prompts map onto the expected coarse task categories."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

        assert grader._infer_task_type("Please review this code") == "code_review"
        assert grader._infer_task_type("Fix the bug in login") == "debugging"
        assert grader._infer_task_type("Add a new feature") == "feature_impl"
        assert grader._infer_task_type("Do something random") == "general"
||||||
|
class TestWeeklyReport:
    """Weekly report generation."""

    def test_generate_weekly_report_empty(self, tmp_path):
        """An empty database still yields a report with suggestions."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir,
        )

        report = grader.generate_weekly_report()

        assert report.total_sessions == 0
        assert report.avg_tool_efficiency == 0
        assert len(report.improvement_suggestions) > 0

    def test_generate_suggestions(self, tmp_path):
        """Weak metrics trigger at least one completion-rate suggestion."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

        weak_metrics = {
            'completion_rate': 50,
            'avg_tool_efficiency': 2,
            'error_recovery_rate': 70,
        }

        suggestions = grader._generate_suggestions(
            weak_metrics,
            [('code_review', 2.0)],
            [('terminal', 5)],
            [('file_not_found', 3)],
        )

        assert len(suggestions) > 0
        assert any('completion rate' in s.lower() for s in suggestions)
||||||
|
class TestGradeLatestSessions:
    """Batch grading of the most recent sessions."""

    def test_grade_latest_skips_graded(self, tmp_path):
        """A session already graded once is not graded again."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        payload = {
            "session_id": "already-graded",
            "model": "test",
            "messages": [
                {"role": "user", "content": "Test"},
                {"role": "assistant", "content": "Done"}
            ],
        }
        session_file = sessions_dir / "session_already-graded.json"
        session_file.write_text(json.dumps(payload))

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir,
        )

        # First pass grades the file; second pass finds nothing new.
        assert len(grader.grade_latest_sessions(n=10)) == 1
        assert len(grader.grade_latest_sessions(n=10)) == 0
||||||
|
def test_main_cli():
    """The self_grader module exposes a callable CLI entry point."""
    import self_grader
    assert callable(self_grader.main)
||||||
|
# Allow running this test module directly: delegates to pytest's CLI runner.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||||
501
uniwizard/test_task_classifier.py
Normal file
501
uniwizard/test_task_classifier.py
Normal file
@@ -0,0 +1,501 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for the TaskClassifier module.
|
||||||
|
|
||||||
|
Run with: python -m pytest test_task_classifier.py -v
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
from task_classifier import (
|
||||||
|
TaskClassifier,
|
||||||
|
TaskType,
|
||||||
|
ComplexityLevel,
|
||||||
|
ClassificationResult,
|
||||||
|
classify_prompt,
|
||||||
|
BACKEND_ANTHROPIC,
|
||||||
|
BACKEND_OPENAI_CODEX,
|
||||||
|
BACKEND_GEMINI,
|
||||||
|
BACKEND_GROQ,
|
||||||
|
BACKEND_GROK,
|
||||||
|
BACKEND_KIMI,
|
||||||
|
BACKEND_OPENROUTER,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFeatureExtraction:
    """Feature extraction from raw prompt text."""

    def test_extract_basic_features(self):
        """A plain two-word prompt yields minimal counts and no code flags."""
        feats = TaskClassifier()._extract_features("Hello world")

        assert feats["char_count"] == 11
        assert feats["word_count"] == 2
        assert feats["line_count"] == 1
        assert feats["url_count"] == 0
        assert feats["code_block_count"] == 0
        assert feats["has_code"] is False

    def test_extract_url_features(self):
        """Both http and https links are detected and collected."""
        feats = TaskClassifier()._extract_features(
            "Check out https://example.com and http://test.org/path"
        )

        assert feats["url_count"] == 2
        assert len(feats["urls"]) == 2
        assert "https://example.com" in feats["urls"]

    def test_extract_code_block_features(self):
        """Fenced code blocks are counted and flip the has_code flag."""
        sample = """Here is some code:
```python
def hello():
    return "world"
```
And more:
```javascript
console.log("hi");
```
"""
        feats = TaskClassifier()._extract_features(sample)

        assert feats["code_block_count"] == 2  # two complete ``` pairs
        assert feats["has_code"] is True
        # Inline-code detection may fire on surrounding text, so only
        # require a non-negative count here.
        assert feats["inline_code_count"] >= 0

    def test_extract_inline_code_features(self):
        """Backtick-quoted spans count as inline code."""
        feats = TaskClassifier()._extract_features(
            "Use the `print()` function and `len()` method"
        )

        assert feats["inline_code_count"] == 2
        assert feats["has_code"] is True

    def test_extract_multiline_features(self):
        """Newlines split the prompt into the expected line count."""
        feats = TaskClassifier()._extract_features("Line 1\nLine 2\nLine 3")

        assert feats["line_count"] == 3
||||||
|
class TestComplexityAssessment:
    """Mapping of extracted features onto complexity levels."""

    @staticmethod
    def _features(chars, words, lines, urls, blocks):
        """Assemble the minimal feature dict _assess_complexity reads."""
        return {
            "char_count": chars,
            "word_count": words,
            "line_count": lines,
            "url_count": urls,
            "code_block_count": blocks,
        }

    def test_low_complexity_short_text(self):
        """Short, link-free prose is rated LOW."""
        level = TaskClassifier()._assess_complexity(
            self._features(100, 15, 2, 0, 0)
        )
        assert level == ComplexityLevel.LOW

    def test_medium_complexity_moderate_text(self):
        """Moderate length with one link is rated MEDIUM."""
        level = TaskClassifier()._assess_complexity(
            self._features(500, 80, 10, 1, 0)
        )
        assert level == ComplexityLevel.MEDIUM

    def test_high_complexity_long_text(self):
        """Long, link-heavy text is rated HIGH."""
        level = TaskClassifier()._assess_complexity(
            self._features(2000, 300, 50, 3, 0)
        )
        assert level == ComplexityLevel.HIGH

    def test_high_complexity_multiple_code_blocks(self):
        """Several fenced code blocks push the rating to HIGH."""
        level = TaskClassifier()._assess_complexity(
            self._features(500, 50, 20, 0, 4)
        )
        assert level == ComplexityLevel.HIGH
||||||
|
class TestTaskTypeClassification:
    """Routing of prompts onto task types."""

    @staticmethod
    def _classify(classifier, prompt):
        """Run the private type classifier over freshly extracted features."""
        return classifier._classify_task_type(
            prompt, classifier._extract_features(prompt)
        )

    def test_classify_code_task(self):
        """Implementation/debugging prompts map to CODE with confidence."""
        classifier = TaskClassifier()
        prompts = (
            "Implement a function to sort a list",
            "Debug this Python error",
            "Refactor the database query",
            "Write a test for the API endpoint",
            "Fix the bug in the authentication middleware",
        )

        for prompt in prompts:
            task_type, confidence, _reason = self._classify(classifier, prompt)
            assert task_type == TaskType.CODE, f"Failed for: {prompt}"
            assert confidence > 0, f"Zero confidence for: {prompt}"

    def test_classify_reasoning_task(self):
        """Analysis prompts land in REASONING or a closely related type."""
        classifier = TaskClassifier()
        prompts = (
            "Compare and evaluate different approaches",
            "Evaluate the security implications",
            "Think through the logical steps",
            "Step by step, deduce the cause",
            "Analyze the pros and cons",
        )

        for prompt in prompts:
            task_type, _confidence, _reason = self._classify(classifier, prompt)
            # REASONING is expected, but adjacent classifications are tolerated.
            assert task_type in (TaskType.REASONING, TaskType.CODE, TaskType.UNKNOWN), f"Failed for: {prompt}"

    def test_classify_research_task(self):
        """Research prompts land in RESEARCH or a tolerated neighbour."""
        classifier = TaskClassifier()
        prompts = (
            "Research the latest AI papers on arxiv",
            "Find studies about neural networks",
            "Search for benchmarks on https://example.com/benchmarks",
            "Survey existing literature on distributed systems",
            "Study the published papers on machine learning",
        )

        for prompt in prompts:
            task_type, _confidence, _reason = self._classify(classifier, prompt)
            assert task_type in (TaskType.RESEARCH, TaskType.FAST_OPS, TaskType.CODE), f"Got {task_type} for: {prompt}"

    def test_classify_creative_task(self):
        """Writing/design prompts map to CREATIVE."""
        classifier = TaskClassifier()
        prompts = (
            "Write a creative story about AI",
            "Design a logo concept",
            "Compose a poem about programming",
            "Brainstorm marketing slogans",
            "Create a character for a novel",
        )

        for prompt in prompts:
            task_type, _confidence, _reason = self._classify(classifier, prompt)
            assert task_type == TaskType.CREATIVE, f"Failed for: {prompt}"

    def test_classify_fast_ops_task(self):
        """Trivial greetings with no other indicators map to FAST_OPS."""
        classifier = TaskClassifier()
        prompts = (
            "Hi",
            "Hello",
            "Thanks",
            "Bye",
            "Yes",
            "No",
        )

        for prompt in prompts:
            task_type, _confidence, _reason = self._classify(classifier, prompt)
            assert task_type == TaskType.FAST_OPS, f"Failed for: {prompt}"

    def test_classify_tool_use_task(self):
        """Tool-invocation prompts land in TOOL_USE or an overlapping type."""
        classifier = TaskClassifier()
        prompts = (
            "Execute the shell command",
            "Use the browser to navigate to google.com",
            "Call the API endpoint",
            "Invoke the deployment tool",
            "Run this terminal command",
        )

        for prompt in prompts:
            task_type, _confidence, _reason = self._classify(classifier, prompt)
            # Tool use frequently overlaps with code or research (search).
            assert task_type in (TaskType.TOOL_USE, TaskType.CODE, TaskType.RESEARCH), f"Got {task_type} for: {prompt}"
||||||
|
class TestBackendSelection:
    """Preferred-backend ordering per task type."""

    def test_code_task_prefers_codex(self):
        """Code prompts rank OpenAI Codex first."""
        result = TaskClassifier().classify("Implement a Python class")

        assert result.task_type == TaskType.CODE
        assert result.preferred_backends[0] == BACKEND_OPENAI_CODEX

    def test_reasoning_task_prefers_anthropic(self):
        """Reasoning prompts rank Anthropic first."""
        result = TaskClassifier().classify("Analyze the architectural trade-offs")

        assert result.task_type == TaskType.REASONING
        assert result.preferred_backends[0] == BACKEND_ANTHROPIC

    def test_research_task_prefers_gemini(self):
        """Research prompts rank Gemini first."""
        result = TaskClassifier().classify("Research the latest papers on transformers")

        assert result.task_type == TaskType.RESEARCH
        assert result.preferred_backends[0] == BACKEND_GEMINI

    def test_creative_task_prefers_grok(self):
        """Creative prompts rank Grok first."""
        result = TaskClassifier().classify("Write a creative story")

        assert result.task_type == TaskType.CREATIVE
        assert result.preferred_backends[0] == BACKEND_GROK

    def test_fast_ops_task_prefers_groq(self):
        """Quick operational prompts rank Groq first."""
        result = TaskClassifier().classify("Quick status check")

        assert result.task_type == TaskType.FAST_OPS
        assert result.preferred_backends[0] == BACKEND_GROQ

    def test_tool_use_task_prefers_anthropic(self):
        """Tool-use prompts keep Anthropic near the top of the ranking."""
        result = TaskClassifier().classify("Execute the shell command and use tools")

        # Tool use may overlap with code, but Anthropic should rank top-two.
        assert result.task_type in (TaskType.TOOL_USE, TaskType.CODE)
        assert BACKEND_ANTHROPIC in result.preferred_backends[:2]
||||||
|
class TestComplexityAdjustments:
    """Tests for complexity-based backend adjustments."""

    def test_high_complexity_boosts_kimi_for_research(self):
        """High-complexity research prompts should promote Kimi into the top three."""
        # Long research prompt designed to trip the high-complexity heuristic.
        verbose_prompt = "Research " + "machine learning " * 200

        outcome = TaskClassifier().classify(verbose_prompt)

        # The boost rule only applies when the classifier actually landed on
        # high-complexity research; otherwise there is nothing to assert.
        if outcome.task_type != TaskType.RESEARCH or outcome.complexity != ComplexityLevel.HIGH:
            return
        # Kimi should be in top 3 for high complexity research
        assert BACKEND_KIMI in outcome.preferred_backends[:3]

    def test_code_blocks_boost_codex(self):
        """A fenced code block should lift Codex even in a non-code prompt."""
        prompt = """Tell me a story about:
```python
def hello():
    pass
```
"""
        outcome = TaskClassifier().classify(prompt)

        # Codex should be in top 3 due to code presence
        assert BACKEND_OPENAI_CODEX in outcome.preferred_backends[:3]
||||||
|
|
||||||
|
class TestEdgeCases:
    """Tests for edge cases."""

    def test_empty_prompt(self):
        """An empty prompt yields UNKNOWN / LOW with zero confidence."""
        outcome = TaskClassifier().classify("")

        assert outcome.task_type == TaskType.UNKNOWN
        assert outcome.complexity == ComplexityLevel.LOW
        assert outcome.confidence == 0.0

    def test_whitespace_only_prompt(self):
        """A whitespace-only prompt is treated like an empty one."""
        outcome = TaskClassifier().classify("   \n\t  ")

        assert outcome.task_type == TaskType.UNKNOWN

    def test_very_long_prompt(self):
        """A very long prompt registers as HIGH complexity with the full backend list."""
        outcome = TaskClassifier().classify("word " * 10000)

        assert outcome.complexity == ComplexityLevel.HIGH
        assert len(outcome.preferred_backends) == 7

    def test_mixed_task_indicators(self):
        """Prompts mixing several task signals still resolve to one type."""
        # This has both code and creative indicators
        outcome = TaskClassifier().classify(
            "Write a creative Python script that generates poetry"
        )

        # Should pick one task type with reasonable confidence
        assert outcome.confidence > 0
        assert outcome.task_type in (TaskType.CODE, TaskType.CREATIVE)
|
|
||||||
|
|
||||||
|
class TestDictionaryOutput:
    """Tests for dictionary output format."""

    def test_to_dict_output(self):
        """to_dict exposes every expected key with the right value types."""
        clf = TaskClassifier()
        payload = clf.to_dict(clf.classify("Implement a function"))

        expected_keys = (
            "task_type",
            "preferred_backends",
            "complexity",
            "reason",
            "confidence",
            "features",
        )
        for key in expected_keys:
            assert key in payload

        assert isinstance(payload["task_type"], str)
        assert isinstance(payload["preferred_backends"], list)
        assert isinstance(payload["complexity"], str)
        assert isinstance(payload["confidence"], float)

    def test_classify_prompt_convenience_function(self):
        """The module-level convenience wrapper returns a populated dict."""
        payload = classify_prompt("Debug this error")

        assert payload["task_type"] == "code"
        assert len(payload["preferred_backends"]) > 0
        assert payload["complexity"] in ("low", "medium", "high")
        assert "reason" in payload
|
|
||||||
|
|
||||||
|
class TestClassificationResult:
    """Tests for the ClassificationResult dataclass."""

    def test_result_creation(self):
        """A ClassificationResult retains the fields it was constructed with."""
        res = ClassificationResult(
            task_type=TaskType.CODE,
            preferred_backends=[BACKEND_OPENAI_CODEX, BACKEND_ANTHROPIC],
            complexity=ComplexityLevel.MEDIUM,
            reason="Contains code keywords",
            confidence=0.85,
            features={"word_count": 50},
        )

        assert res.task_type == TaskType.CODE
        assert res.preferred_backends[0] == BACKEND_OPENAI_CODEX
        assert res.complexity == ComplexityLevel.MEDIUM
        assert res.confidence == 0.85
||||||
|
|
||||||
|
# Integration tests
|
||||||
|
class TestIntegration:
    """Integration tests with realistic prompts."""

    def test_code_review_scenario(self):
        """A code-review request classifies as code/reasoning with non-trivial complexity."""
        prompt = """Please review this code for potential issues:

```python
def process_data(data):
    result = []
    for item in data:
        result.append(item * 2)
    return result
```

I'm concerned about memory usage with large datasets."""

        payload = classify_prompt(prompt)

        assert payload["task_type"] in ("code", "reasoning")
        assert payload["complexity"] in ("medium", "high")
        assert len(payload["preferred_backends"]) == 7
        assert payload["confidence"] > 0

    def test_research_with_urls_scenario(self):
        """A paper-survey request classifies as research and counts both URLs."""
        prompt = """Research the findings from these papers:
- https://arxiv.org/abs/2301.00001
- https://papers.nips.cc/paper/2022/hash/xxx

Summarize the key contributions and compare methodologies."""

        payload = classify_prompt(prompt)

        assert payload["task_type"] == "research"
        assert payload["features"]["url_count"] == 2
        assert payload["complexity"] in ("medium", "high")

    def test_simple_greeting_scenario(self):
        """A simple greeting routes to the fast/cheap backend."""
        payload = classify_prompt("Hello! How are you doing today?")

        assert payload["task_type"] == "fast_ops"
        assert payload["complexity"] == "low"
        assert payload["preferred_backends"][0] == BACKEND_GROQ
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # pytest.main() returns the exit status; raise it through SystemExit so
    # that running this file directly exits non-zero on test failures
    # (previously the return value was discarded and the script always
    # exited 0, masking failures from CI and shell callers).
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||||
Reference in New Issue
Block a user