Files
timmy-home/uniwizard/self-grader.js
Alexander Whitestone 9c1dd7fff7 chore: check in all local work — uniwizard, briefings, reports, evennia, morrowind, scripts, specs, training data, angband MCP, diagrams, twitter archive, wizards
- Resolve decisions.md merge conflict (keep both Codex boundary + Ezra/Bezalel entries)
- Update .gitignore: protect bare secret files, exclude venvs and nexus-localhost
- Add uniwizard tools (mention watcher, adaptive prompt router, self-grader, classifiers)
- Add briefings, good-morning reports, production reports
- Add evennia world scaffold and training data
- Add angband and morrowind MCP servers
- Add diagrams, specs, test results, overnight loop scripts
- Add twitter archive insights and media metadata
- Add wizard workspaces (allegro, nahshon)
2026-03-30 17:18:09 -04:00

117 lines
4.3 KiB
JavaScript

/**
 * LLM-based Self-Grader
 *
 * Uses an LLM to evaluate outputs against task-specific criteria.
 *
 * This class only BUILDS grading prompts and PARSES grading responses; the
 * actual LLM call is made by the caller, who feeds the raw response string
 * back into `parseGradingResponse`.
 */
class LLMSelfGrader {
  /**
   * @param {object} [options]
   * @param {string} [options.model='default'] - Identifier of the grading model.
   * @param {number} [options.temperature=0.1] - Sampling temperature for the
   *   grading call. 0 is a meaningful value, so the default must only apply
   *   when the option is null/undefined.
   */
  constructor(options = {}) {
    this.model = options.model ?? 'default';
    // BUG FIX: `options.temperature || 0.1` silently replaced an explicit
    // temperature of 0 with 0.1. Nullish coalescing preserves 0.
    this.temperature = options.temperature ?? 0.1;
  }

  /**
   * Create grading criteria for common task types.
   *
   * @param {string} taskType - One of 'code_generation', 'file_operations',
   *   'research', 'analysis', 'communication'. Unknown types fall back to the
   *   'communication' criteria.
   * @returns {Object<string, {weight: number, description: string}>} Map of
   *   dimension name -> { weight, description }. Weights in each set sum to 1.0.
   */
  static getCriteriaForTaskType(taskType) {
    const criteriaMap = {
      'code_generation': {
        correctness: { weight: 0.4, description: 'Code runs without errors and produces correct output' },
        readability: { weight: 0.2, description: 'Code is well-formatted and easy to understand' },
        efficiency: { weight: 0.2, description: 'Code uses appropriate algorithms and data structures' },
        completeness: { weight: 0.2, description: 'All requirements from the prompt are addressed' }
      },
      'file_operations': {
        correctness: { weight: 0.4, description: 'Files are read/written correctly with proper paths' },
        safety: { weight: 0.3, description: 'No destructive operations without validation' },
        efficiency: { weight: 0.2, description: 'Uses appropriate batching and avoids unnecessary IO' },
        completeness: { weight: 0.1, description: 'All requested files are handled' }
      },
      'research': {
        accuracy: { weight: 0.4, description: 'Information is factually correct and well-sourced' },
        completeness: { weight: 0.3, description: 'Covers all aspects of the question' },
        relevance: { weight: 0.2, description: 'Information directly addresses the query' },
        conciseness: { weight: 0.1, description: 'No unnecessary verbosity' }
      },
      'analysis': {
        accuracy: { weight: 0.3, description: 'Analysis is based on correct interpretation of data' },
        depth: { weight: 0.3, description: 'Goes beyond surface-level observations' },
        clarity: { weight: 0.2, description: 'Findings are clearly presented' },
        usefulness: { weight: 0.2, description: 'Insights are actionable or informative' }
      },
      'communication': {
        clarity: { weight: 0.3, description: 'Message is easy to understand' },
        tone: { weight: 0.3, description: 'Appropriate tone for context and audience' },
        completeness: { weight: 0.2, description: 'All necessary information is included' },
        conciseness: { weight: 0.2, description: 'No unnecessary words or filler' }
      }
    };
    return criteriaMap[taskType] ?? criteriaMap['communication'];
  }

  /**
   * Build a prompt for the LLM to grade an output.
   *
   * @param {string} taskType - Task type; selects the grading criteria.
   * @param {string} originalPrompt - The request the graded output answered.
   * @param {string} output - The output to be graded.
   * @param {string} [expectedFormat] - Optional format spec; included in the
   *   prompt only when truthy.
   * @returns {string} Prompt instructing the grader LLM to emit JSON with
   *   per-dimension scores, an overall summary, and improvement suggestions.
   */
  buildGradingPrompt(taskType, originalPrompt, output, expectedFormat) {
    const criteria = LLMSelfGrader.getCriteriaForTaskType(taskType);
    return `You are evaluating an AI assistant's response. Grade the following output on these dimensions:
${Object.entries(criteria).map(([k, v]) => `- ${k}: ${v.description} (weight: ${v.weight})`).join('\n')}
ORIGINAL REQUEST:
${originalPrompt}
ACTUAL OUTPUT:
${output}
${expectedFormat ? `EXPECTED FORMAT:\n${expectedFormat}\n\n` : ''}
Provide your evaluation as JSON:
{
  "dimensions": {
${Object.keys(criteria).map(k => `    "${k}": { "score": 0-100, "reasoning": "brief explanation" }`).join(',\n')}
  },
  "overall": "brief summary of quality",
  "improvement_suggestions": ["suggestion 1", "suggestion 2"]
}`;
  }

  /**
   * Parse an LLM grading response into structured criteria.
   *
   * Only dimensions present in BOTH the criteria for `taskType` and the
   * response are kept; weights come from the criteria, not the response.
   *
   * @param {string} response - Raw LLM response (plain JSON, or JSON wrapped
   *   in a markdown code fence).
   * @param {string} taskType - Task type used to look up dimension weights.
   * @returns {{dimensions: object, overall: string, suggestions: string[], parseError?: string}}
   *   On unparseable input, returns empty dimensions plus `parseError` so the
   *   caller can route the output to manual review instead of crashing.
   */
  parseGradingResponse(response, taskType) {
    try {
      // ROBUSTNESS: LLMs frequently wrap JSON in ```json ... ``` fences,
      // which previously sent valid gradings to the error path. Strip the
      // fence before parsing; bare JSON passes through unchanged.
      const fenced = response.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
      const parsed = JSON.parse(fenced ? fenced[1] : response);
      const criteria = LLMSelfGrader.getCriteriaForTaskType(taskType);
      const result = {};
      for (const [dim, config] of Object.entries(criteria)) {
        if (parsed.dimensions?.[dim]) {
          const { score, reasoning } = parsed.dimensions[dim];
          result[dim] = {
            // Clamp numeric scores into the requested 0-100 range; pass
            // non-numeric values through for the caller to inspect.
            score: typeof score === 'number' ? Math.min(100, Math.max(0, score)) : score,
            reasoning,
            weight: config.weight
          };
        }
      }
      return {
        dimensions: result,
        overall: parsed.overall,
        suggestions: parsed.improvement_suggestions || []
      };
    } catch (e) {
      // Fallback: return empty criteria for manual grading
      return {
        dimensions: {},
        overall: 'Failed to parse grading',
        suggestions: ['Manual review required'],
        parseError: e.message
      };
    }
  }
}
module.exports = { LLMSelfGrader };