- Resolve decisions.md merge conflict (keep both Codex boundary and Ezra/Bezalel entries)
- Update .gitignore: protect bare secret files; exclude venvs and nexus-localhost
- Add uniwizard tools (mention watcher, adaptive prompt router, self-grader, classifiers)
- Add briefings, good-morning reports, and production reports
- Add evennia world scaffold and training data
- Add angband and morrowind MCP servers
- Add diagrams, specs, test results, and overnight loop scripts
- Add twitter archive insights and media metadata
- Add wizard workspaces (allegro, nahshon)
117 lines
4.3 KiB
JavaScript
/**
 * LLM-based Self-Grader
 *
 * Uses an LLM to evaluate outputs against task-specific criteria.
 */
/**
 * Grading rubrics keyed by task type. Each dimension carries a weight
 * (weights within a task type sum to 1.0) and a description shown to the
 * grading LLM. Built once at module load and frozen (shallowly, per task
 * type) so the shared rubric cannot be mutated by callers.
 */
const CRITERIA_BY_TASK = Object.freeze({
  code_generation: Object.freeze({
    correctness: { weight: 0.4, description: 'Code runs without errors and produces correct output' },
    readability: { weight: 0.2, description: 'Code is well-formatted and easy to understand' },
    efficiency: { weight: 0.2, description: 'Code uses appropriate algorithms and data structures' },
    completeness: { weight: 0.2, description: 'All requirements from the prompt are addressed' },
  }),
  file_operations: Object.freeze({
    correctness: { weight: 0.4, description: 'Files are read/written correctly with proper paths' },
    safety: { weight: 0.3, description: 'No destructive operations without validation' },
    efficiency: { weight: 0.2, description: 'Uses appropriate batching and avoids unnecessary IO' },
    completeness: { weight: 0.1, description: 'All requested files are handled' },
  }),
  research: Object.freeze({
    accuracy: { weight: 0.4, description: 'Information is factually correct and well-sourced' },
    completeness: { weight: 0.3, description: 'Covers all aspects of the question' },
    relevance: { weight: 0.2, description: 'Information directly addresses the query' },
    conciseness: { weight: 0.1, description: 'No unnecessary verbosity' },
  }),
  analysis: Object.freeze({
    accuracy: { weight: 0.3, description: 'Analysis is based on correct interpretation of data' },
    depth: { weight: 0.3, description: 'Goes beyond surface-level observations' },
    clarity: { weight: 0.2, description: 'Findings are clearly presented' },
    usefulness: { weight: 0.2, description: 'Insights are actionable or informative' },
  }),
  communication: Object.freeze({
    clarity: { weight: 0.3, description: 'Message is easy to understand' },
    tone: { weight: 0.3, description: 'Appropriate tone for context and audience' },
    completeness: { weight: 0.2, description: 'All necessary information is included' },
    conciseness: { weight: 0.2, description: 'No unnecessary words or filler' },
  }),
});

/**
 * Best-effort extraction of a JSON object from an LLM response. Graders
 * frequently wrap their JSON in markdown fences or surrounding prose, so
 * take the span from the first '{' to the last '}' before parsing.
 *
 * @param {string} response - Raw grader output text.
 * @returns {string} The candidate JSON substring.
 * @throws {Error} When no brace-delimited span is present.
 */
function extractJsonPayload(response) {
  const start = response.indexOf('{');
  const end = response.lastIndexOf('}');
  if (start === -1 || end <= start) {
    throw new Error('No JSON object found in grading response');
  }
  return response.slice(start, end + 1);
}

/**
 * LLM-based self-grader: builds grading prompts for an evaluator LLM and
 * parses its JSON verdicts into weighted per-dimension scores.
 */
class LLMSelfGrader {
  /**
   * @param {Object} [options]
   * @param {string} [options.model='default'] - Identifier of the grading model.
   * @param {number} [options.temperature=0.1] - Sampling temperature; 0 is a valid value.
   */
  constructor(options = {}) {
    // '' is not a usable model name, so || is the intended default operator here.
    this.model = options.model || 'default';
    // BUG FIX: was `options.temperature || 0.1`, which silently replaced an
    // explicit temperature of 0 with 0.1. ?? only falls back on null/undefined.
    this.temperature = options.temperature ?? 0.1;
  }

  /**
   * Look up the grading rubric for a task type.
   *
   * @param {string} taskType - e.g. 'code_generation', 'research'.
   * @returns {Object} Dimension map {name: {weight, description}}; unknown
   *   task types fall back to the generic 'communication' rubric.
   */
  static getCriteriaForTaskType(taskType) {
    return CRITERIA_BY_TASK[taskType] ?? CRITERIA_BY_TASK.communication;
  }

  /**
   * Build the prompt sent to the grading LLM.
   *
   * @param {string} taskType - Selects the rubric via getCriteriaForTaskType.
   * @param {string} originalPrompt - The request the assistant was answering.
   * @param {string} output - The assistant output being graded.
   * @param {string} [expectedFormat] - Optional format spec; omitted from the
   *   prompt when falsy.
   * @returns {string} A prompt instructing the grader to reply with JSON.
   */
  buildGradingPrompt(taskType, originalPrompt, output, expectedFormat) {
    const criteria = LLMSelfGrader.getCriteriaForTaskType(taskType);

    // One "- name: description (weight: w)" line per rubric dimension.
    const dimensionLines = Object.entries(criteria)
      .map(([name, cfg]) => `- ${name}: ${cfg.description} (weight: ${cfg.weight})`)
      .join('\n');

    // Skeleton of the JSON object the grader must fill in.
    const dimensionSchema = Object.keys(criteria)
      .map((name) => `    "${name}": { "score": 0-100, "reasoning": "brief explanation" }`)
      .join(',\n');

    return `You are evaluating an AI assistant's response. Grade the following output on these dimensions:

${dimensionLines}

ORIGINAL REQUEST:
${originalPrompt}

ACTUAL OUTPUT:
${output}

${expectedFormat ? `EXPECTED FORMAT:\n${expectedFormat}\n\n` : ''}
Provide your evaluation as JSON:
{
  "dimensions": {
${dimensionSchema}
  },
  "overall": "brief summary of quality",
  "improvement_suggestions": ["suggestion 1", "suggestion 2"]
}`;
  }

  /**
   * Parse the grading LLM's response into structured per-dimension results.
   *
   * Tolerates markdown-fenced or prose-wrapped JSON (see extractJsonPayload)
   * and string-valued scores; scores are clamped to [0, 100].
   *
   * @param {string} response - Raw grader output.
   * @param {string} taskType - Task type used to attach rubric weights.
   * @returns {{dimensions: Object, overall: string, suggestions: string[], parseError?: string}}
   *   On parse failure, an empty grade flagged with `parseError` so the item
   *   can be routed to manual review instead of crashing.
   */
  parseGradingResponse(response, taskType) {
    try {
      const parsed = JSON.parse(extractJsonPayload(response));
      const criteria = LLMSelfGrader.getCriteriaForTaskType(taskType);

      const dimensions = {};
      for (const [dim, config] of Object.entries(criteria)) {
        const graded = parsed.dimensions?.[dim];
        if (!graded) continue; // grader omitted this dimension
        // Graders sometimes return scores as strings; coerce and clamp to 0-100.
        const score = Number(graded.score);
        dimensions[dim] = {
          score: Number.isFinite(score) ? Math.min(100, Math.max(0, score)) : 0,
          reasoning: graded.reasoning,
          weight: config.weight,
        };
      }

      return {
        dimensions,
        overall: parsed.overall,
        suggestions: parsed.improvement_suggestions ?? [],
      };
    } catch (e) {
      // Fallback: return an empty grade (deliberate best-effort, not a crash)
      // with parseError set so the output can be flagged for manual grading.
      return {
        dimensions: {},
        overall: 'Failed to parse grading',
        suggestions: ['Manual review required'],
        parseError: e.message,
      };
    }
  }
}
|
|
|
|
// CommonJS export (this file uses require-style modules).
module.exports = { LLMSelfGrader };
|