- Resolve decisions.md merge conflict (keep both Codex boundary and Ezra/Bezalel entries)
- Update .gitignore: protect bare secret files; exclude venvs and nexus-localhost
- Add uniwizard tools (mention watcher, adaptive prompt router, self-grader, classifiers)
- Add briefings, good-morning reports, and production reports
- Add evennia world scaffold and training data
- Add angband and morrowind MCP servers
- Add diagrams, specs, test results, and overnight loop scripts
- Add twitter archive insights and media metadata
- Add wizard workspaces (allegro, nahshon)
117 lines
4.3 KiB
JavaScript
/**
 * LLM-based Self-Grader
 *
 * Uses an LLM to evaluate outputs against task-specific criteria.
 */
/**
 * Grading rubrics keyed by task type. Each dimension carries a weight
 * (weights within a task type sum to 1.0) and a description shown to the
 * grading LLM. Built once at module load and frozen (shallowly, per task
 * type) so the shared rubric cannot be mutated by callers.
 */
const CRITERIA_BY_TASK = Object.freeze({
  code_generation: Object.freeze({
    correctness: { weight: 0.4, description: 'Code runs without errors and produces correct output' },
    readability: { weight: 0.2, description: 'Code is well-formatted and easy to understand' },
    efficiency: { weight: 0.2, description: 'Code uses appropriate algorithms and data structures' },
    completeness: { weight: 0.2, description: 'All requirements from the prompt are addressed' },
  }),
  file_operations: Object.freeze({
    correctness: { weight: 0.4, description: 'Files are read/written correctly with proper paths' },
    safety: { weight: 0.3, description: 'No destructive operations without validation' },
    efficiency: { weight: 0.2, description: 'Uses appropriate batching and avoids unnecessary IO' },
    completeness: { weight: 0.1, description: 'All requested files are handled' },
  }),
  research: Object.freeze({
    accuracy: { weight: 0.4, description: 'Information is factually correct and well-sourced' },
    completeness: { weight: 0.3, description: 'Covers all aspects of the question' },
    relevance: { weight: 0.2, description: 'Information directly addresses the query' },
    conciseness: { weight: 0.1, description: 'No unnecessary verbosity' },
  }),
  analysis: Object.freeze({
    accuracy: { weight: 0.3, description: 'Analysis is based on correct interpretation of data' },
    depth: { weight: 0.3, description: 'Goes beyond surface-level observations' },
    clarity: { weight: 0.2, description: 'Findings are clearly presented' },
    usefulness: { weight: 0.2, description: 'Insights are actionable or informative' },
  }),
  communication: Object.freeze({
    clarity: { weight: 0.3, description: 'Message is easy to understand' },
    tone: { weight: 0.3, description: 'Appropriate tone for context and audience' },
    completeness: { weight: 0.2, description: 'All necessary information is included' },
    conciseness: { weight: 0.2, description: 'No unnecessary words or filler' },
  }),
});

/**
 * Best-effort extraction of a JSON object from an LLM response. Graders
 * frequently wrap their JSON in markdown fences or surrounding prose, so
 * take the span from the first '{' to the last '}' before parsing.
 *
 * @param {string} response - Raw grader output text.
 * @returns {string} The candidate JSON substring.
 * @throws {Error} When no brace-delimited span is present.
 */
function extractJsonPayload(response) {
  const start = response.indexOf('{');
  const end = response.lastIndexOf('}');
  if (start === -1 || end <= start) {
    throw new Error('No JSON object found in grading response');
  }
  return response.slice(start, end + 1);
}

/**
 * LLM-based self-grader: builds grading prompts for an evaluator LLM and
 * parses its JSON verdicts into weighted per-dimension scores.
 */
class LLMSelfGrader {
  /**
   * @param {Object} [options]
   * @param {string} [options.model='default'] - Identifier of the grading model.
   * @param {number} [options.temperature=0.1] - Sampling temperature; 0 is a valid value.
   */
  constructor(options = {}) {
    // '' is not a usable model name, so || is the intended default operator here.
    this.model = options.model || 'default';
    // BUG FIX: was `options.temperature || 0.1`, which silently replaced an
    // explicit temperature of 0 with 0.1. ?? only falls back on null/undefined.
    this.temperature = options.temperature ?? 0.1;
  }

  /**
   * Look up the grading rubric for a task type.
   *
   * @param {string} taskType - e.g. 'code_generation', 'research'.
   * @returns {Object} Dimension map {name: {weight, description}}; unknown
   *   task types fall back to the generic 'communication' rubric.
   */
  static getCriteriaForTaskType(taskType) {
    return CRITERIA_BY_TASK[taskType] ?? CRITERIA_BY_TASK.communication;
  }

  /**
   * Build the prompt sent to the grading LLM.
   *
   * @param {string} taskType - Selects the rubric via getCriteriaForTaskType.
   * @param {string} originalPrompt - The request the assistant was answering.
   * @param {string} output - The assistant output being graded.
   * @param {string} [expectedFormat] - Optional format spec; omitted from the
   *   prompt when falsy.
   * @returns {string} A prompt instructing the grader to reply with JSON.
   */
  buildGradingPrompt(taskType, originalPrompt, output, expectedFormat) {
    const criteria = LLMSelfGrader.getCriteriaForTaskType(taskType);

    // One "- name: description (weight: w)" line per rubric dimension.
    const dimensionLines = Object.entries(criteria)
      .map(([name, cfg]) => `- ${name}: ${cfg.description} (weight: ${cfg.weight})`)
      .join('\n');

    // Skeleton of the JSON object the grader must fill in.
    const dimensionSchema = Object.keys(criteria)
      .map((name) => `    "${name}": { "score": 0-100, "reasoning": "brief explanation" }`)
      .join(',\n');

    return `You are evaluating an AI assistant's response. Grade the following output on these dimensions:

${dimensionLines}

ORIGINAL REQUEST:
${originalPrompt}

ACTUAL OUTPUT:
${output}

${expectedFormat ? `EXPECTED FORMAT:\n${expectedFormat}\n\n` : ''}
Provide your evaluation as JSON:
{
  "dimensions": {
${dimensionSchema}
  },
  "overall": "brief summary of quality",
  "improvement_suggestions": ["suggestion 1", "suggestion 2"]
}`;
  }

  /**
   * Parse the grading LLM's response into structured per-dimension results.
   *
   * Tolerates markdown-fenced or prose-wrapped JSON (see extractJsonPayload)
   * and string-valued scores; scores are clamped to [0, 100].
   *
   * @param {string} response - Raw grader output.
   * @param {string} taskType - Task type used to attach rubric weights.
   * @returns {{dimensions: Object, overall: string, suggestions: string[], parseError?: string}}
   *   On parse failure, an empty grade flagged with `parseError` so the item
   *   can be routed to manual review instead of crashing.
   */
  parseGradingResponse(response, taskType) {
    try {
      const parsed = JSON.parse(extractJsonPayload(response));
      const criteria = LLMSelfGrader.getCriteriaForTaskType(taskType);

      const dimensions = {};
      for (const [dim, config] of Object.entries(criteria)) {
        const graded = parsed.dimensions?.[dim];
        if (!graded) continue; // grader omitted this dimension
        // Graders sometimes return scores as strings; coerce and clamp to 0-100.
        const score = Number(graded.score);
        dimensions[dim] = {
          score: Number.isFinite(score) ? Math.min(100, Math.max(0, score)) : 0,
          reasoning: graded.reasoning,
          weight: config.weight,
        };
      }

      return {
        dimensions,
        overall: parsed.overall,
        suggestions: parsed.improvement_suggestions ?? [],
      };
    } catch (e) {
      // Fallback: return an empty grade (deliberate best-effort, not a crash)
      // with parseError set so the output can be flagged for manual grading.
      return {
        dimensions: {},
        overall: 'Failed to parse grading',
        suggestions: ['Manual review required'],
        parseError: e.message,
      };
    }
  }
}
|
|
|
|
// CommonJS export (this file uses require-style modules).
module.exports = { LLMSelfGrader };
|