Files
hermes-agent/skills/mlops/instructor/references/validation.md
teknium f172f7d4aa Add skills tools and enhance model integration
- Introduced new skills tools: `skills_categories`, `skills_list`, and `skill_view` in `model_tools.py`, allowing for better organization and access to skill-related functionalities.
- Updated `toolsets.py` to include a new `skills` toolset, providing a dedicated space for skill tools.
- Enhanced `batch_runner.py` to recognize and validate skills tools during batch processing.
- Added comprehensive tool definitions for skills tools, ensuring compatibility with OpenAI's expected format.
- Created new shell script `test_skills_kimi.sh` for testing skills tool functionality with Kimi K2.5.
- Added example skill files demonstrating the structure and usage of skills within the Hermes-Agent framework, including `SKILL.md` for example and audiocraft skills.
- Improved documentation for skills tools and their integration into the existing tool framework, ensuring clarity for future development and usage.
2026-01-30 07:39:55 +00:00

15 KiB

Advanced Validation Patterns

Complete guide to validation in Instructor using Pydantic.

Table of Contents

  • Built-in Validators
  • Custom Field Validators
  • Model-Level Validation
  • Complex Validation Patterns
  • Error Handling

Built-in Validators

Numeric Constraints

from pydantic import BaseModel, Field

class Product(BaseModel):
    # Strictly greater than zero (gt), so 0 itself is rejected.
    price: float = Field(
        gt=0,
        description="Price must be positive",
    )
    # Closed range: both 0 and 100 are accepted (ge / le).
    discount: float = Field(
        ge=0,
        le=100,
        description="Discount 0-100%",
    )
    # Integer count with a lower bound of 1 and no upper bound.
    quantity: int = Field(
        ge=1,
        description="At least 1 item",
    )
    # Closed range 0.0 through 5.0.
    rating: float = Field(
        ge=0.0,
        le=5.0,
        description="Rating 0-5 stars",
    )

# If the LLM returns values that violate these constraints, Instructor retries automatically and feeds the validation error back to the model

Available constraints:

  • gt: Greater than
  • ge: Greater than or equal
  • lt: Less than
  • le: Less than or equal
  • multiple_of: Must be multiple of this number

String Constraints

class User(BaseModel):
    # Length bounds and the character whitelist are all enforced on the same value.
    username: str = Field(
        min_length=3, max_length=20,
        pattern=r'^[a-zA-Z0-9_]+$',
        description="3-20 alphanumeric characters",
    )
    # Only an upper length bound; an empty bio is allowed.
    bio: str = Field(
        max_length=500,
        description="Bio up to 500 chars",
    )
    # The anchored alternation restricts status to exactly three literal values.
    status: str = Field(
        pattern=r'^(active|inactive|pending)$',
    )

# `pattern` validates the string against a regular expression

Email and URL Validation

from pydantic import EmailStr, HttpUrl, AnyUrl

# Contact details whose formats are checked by Pydantic's built-in network types.
# NOTE(review): EmailStr presumably needs the optional email-validator
# dependency to be installed — confirm for the target environment.
class Contact(BaseModel):
    email: EmailStr  # Validates email format
    website: HttpUrl  # Validates HTTP/HTTPS URLs
    portfolio: AnyUrl  # Any valid URL scheme

# Request a structured extraction; response_model=Contact asks for the result
# as a Contact instance.
# NOTE(review): `client` is not defined in this snippet — presumably an
# Instructor-wrapped Anthropic client created elsewhere; confirm.
contact = client.messages.create(
    model="claude-sonnet-4-5-20250929",
    max_tokens=1024,
    messages=[{
        "role": "user",
        "content": "Extract: john@example.com, https://example.com"
    }],
    response_model=Contact
)

Date and DateTime Validation

from datetime import date, datetime
from pydantic import Field, field_validator

class Event(BaseModel):
    event_date: date  # Validates date format
    created_at: datetime  # Validates datetime format
    # Closed range 1900 through 2100.
    year: int = Field(ge=1900, le=2100)

    @field_validator('event_date')
    @classmethod
    def future_date(cls, v):
        """Reject event dates that lie before today.

        Today's date itself is accepted; only strictly earlier dates raise.

        Raises:
            ValueError: if ``v`` is earlier than ``date.today()``.
        """
        if v < date.today():
            raise ValueError('Event must be in the future')
        return v

List and Dict Validation

class Document(BaseModel):
    # On list fields, min_length/max_length bound the number of items.
    tags: list[str] = Field(min_length=1, max_length=10)
    keywords: list[str] = Field(min_length=3, description="At least 3 keywords")
    metadata: dict[str, str] = Field(description="String key-value pairs")

    @field_validator('tags')
    @classmethod
    def unique_tags(cls, v):
        """Reject tag lists that contain duplicates.

        Raises:
            ValueError: if any tag appears more than once.
        """
        # A set drops duplicates, so a shorter set means something repeated.
        if len(set(v)) != len(v):
            raise ValueError('Tags must be unique')
        return v

Custom Field Validators

Basic Field Validator

from pydantic import field_validator

class Person(BaseModel):
    name: str
    age: int

    @field_validator('name')
    @classmethod
    def name_must_not_be_empty(cls, v):
        """Strip surrounding whitespace and reject names that end up empty.

        Returns:
            The name with leading and trailing whitespace removed.

        Raises:
            ValueError: if the name is empty or whitespace-only.
        """
        stripped = v.strip()
        if not stripped:
            raise ValueError('Name cannot be empty')
        return stripped

    @field_validator('age')
    @classmethod
    def age_must_be_reasonable(cls, v):
        """Accept ages from 0 through 120 inclusive.

        Raises:
            ValueError: if the age falls outside that range.
        """
        if not 0 <= v <= 120:
            raise ValueError('Age must be between 0 and 120')
        return v

Validator with Field Info

from pydantic import ValidationInfo

class Article(BaseModel):
    # title is declared before content so it can appear in info.data below.
    title: str
    content: str

    @field_validator('content')
    @classmethod
    def content_length(cls, v, info: ValidationInfo):
        """Require content to be at least twice as long as the title.

        The check is skipped when 'title' is absent from ``info.data``.

        Raises:
            ValueError: if content is shorter than 2x the title length.
        """
        title = info.data.get('title')
        if title is not None and len(v) < len(title) * 2:
            raise ValueError('Content should be at least 2x title length')
        return v

Multiple Fields Validation

class TimeRange(BaseModel):
    start_time: str
    end_time: str

    @field_validator('start_time', 'end_time')
    @classmethod
    def valid_time_format(cls, v):
        """Validate that a time is a real 24-hour clock value in HH:MM form.

        Runs once for each of the two listed fields. Hours must be 00-23 and
        minutes 00-59, so values such as "99:99" are rejected.

        Raises:
            ValueError: if the value is not a valid HH:MM time.
        """
        import re
        # fullmatch avoids `$` silently accepting a trailing newline.
        if not re.fullmatch(r'([01]\d|2[0-3]):[0-5]\d', v):
            raise ValueError('Time must be in HH:MM format')
        return v

Transform and Validate

class URL(BaseModel):
    url: str

    @field_validator('url')
    @classmethod
    def normalize_url(cls, v):
        """Prefix the value with https:// when it has no http(s) scheme.

        The scheme check is case-insensitive, so "HTTP://example.com" is left
        untouched instead of getting a second scheme prepended.

        Returns:
            The original value, or the value prefixed with ``https://``.
        """
        if v.lower().startswith(('http://', 'https://')):
            return v
        return f'https://{v}'

Model-Level Validation

Cross-Field Validation

from pydantic import model_validator

class DateRange(BaseModel):
    start_date: str
    end_date: str

    @model_validator(mode='after')
    def check_dates(self):
        """Reject ranges whose end_date is earlier than start_date.

        Both fields are parsed as YYYY-MM-DD. Equal dates are accepted.
        """
        from datetime import datetime

        day_format = '%Y-%m-%d'
        range_begin = datetime.strptime(self.start_date, day_format)
        range_finish = datetime.strptime(self.end_date, day_format)
        if range_finish < range_begin:
            raise ValueError('end_date must be after start_date')
        return self

class PriceRange(BaseModel):
    min_price: float
    max_price: float

    @model_validator(mode='after')
    def check_price_range(self):
        """Require max_price to be strictly greater than min_price."""
        upper_not_above_lower = self.max_price <= self.min_price
        if upper_not_above_lower:
            raise ValueError('max_price must be greater than min_price')
        return self

Conditional Validation

from typing import Optional

class Order(BaseModel):
    order_type: str  # "standard" or "express"
    delivery_date: str
    # Only mandatory for express orders; enforced by the validator below.
    delivery_time: Optional[str] = None

    @model_validator(mode='after')
    def check_delivery_time(self):
        """Require delivery_time whenever order_type is "express".

        Raises:
            ValueError: if an express order has no (or an empty) delivery_time.
        """
        if self.order_type == "express" and not self.delivery_time:
            raise ValueError('Express orders require delivery_time')
        return self

Complex Business Logic

class Discount(BaseModel):
    code: str
    percentage: float = Field(ge=0, le=100)
    min_purchase: float = Field(ge=0)
    max_discount: float = Field(ge=0)

    @model_validator(mode='after')
    def validate_discount(self):
        """Clamp max_discount to the discount earned at min_purchase.

        This validator never raises: an oversized max_discount is lowered in
        place to percentage% of min_purchase.
        """
        rate = self.percentage / 100
        ceiling = rate * self.min_purchase
        if ceiling < self.max_discount:
            self.max_discount = ceiling
        return self

Complex Validation Patterns

Nested Model Validation

class Address(BaseModel):
    street: str
    city: str
    # country is declared before postal_code so the validator can read it
    # from info.data.
    country: str
    postal_code: str

    @field_validator('postal_code')
    @classmethod
    def validate_postal_code(cls, v, info: ValidationInfo):
        """Check the postal code against the format for the given country.

        Only "USA" and "Canada" are checked; any other country, or a missing
        'country' entry in ``info.data``, leaves the value unvalidated.

        Raises:
            ValueError: if the code does not match the country's pattern.
        """
        # Imported at function scope so both branches can use it; importing
        # inside one branch leaves `re` unbound in the other.
        import re

        country = info.data.get('country')
        if country == "USA":
            if not re.match(r'^\d{5}(-\d{4})?$', v):
                raise ValueError('Invalid US postal code')
        elif country == "Canada":
            if not re.match(r'^[A-Z]\d[A-Z] \d[A-Z]\d$', v):
                raise ValueError('Invalid Canadian postal code')
        return v

# A person whose address field is itself a model (Address).
class Person(BaseModel):
    name: str
    address: Address  # nested model

# Nested validation runs automatically

List of Models

class Task(BaseModel):
    # Title must contain at least one character.
    title: str = Field(
        min_length=1,
    )
    # Priority is an integer from 1 through 5 inclusive.
    priority: int = Field(
        ge=1,
        le=5,
    )

class Project(BaseModel):
    name: str
    tasks: list[Task] = Field(min_length=1, description="At least 1 task")

    @field_validator('tasks')
    @classmethod
    def at_least_one_high_priority(cls, v):
        """Require at least one task with priority 4 or higher.

        Raises:
            ValueError: if every task has priority below 4.
        """
        if all(task.priority < 4 for task in v):
            raise ValueError('Project needs at least one high-priority task')
        return v

Union Type Validation

from typing import Union

# A text block within a page.
class TextBlock(BaseModel):
    type: str = "text"  # defaults to "text"; declared as a plain str
    content: str = Field(min_length=1)  # must be non-empty

# An image block within a page.
class ImageBlock(BaseModel):
    type: str = "image"  # defaults to "image"; declared as a plain str
    url: HttpUrl  # HttpUrl is imported from pydantic in an earlier snippet
    alt_text: str

class Page(BaseModel):
    title: str
    # Each item is validated as either a TextBlock or an ImageBlock.
    blocks: list[Union[TextBlock, ImageBlock]]

    @field_validator('blocks')
    @classmethod
    def validate_block_types(cls, v):
        """Require the first block, when present, to be a TextBlock.

        An empty block list passes unchanged.

        Raises:
            ValueError: if the list is non-empty and starts with another type.
        """
        if v and not isinstance(v[0], TextBlock):
            raise ValueError('First block must be text')
        return v

Dependent Fields

class Subscription(BaseModel):
    plan: str  # "free", "pro", "enterprise"
    max_users: int
    features: list[str]

    @model_validator(mode='after')
    def validate_plan_limits(self):
        """Check the user cap and required features for known plans.

        Plans missing from the table below are accepted without any checks.
        """
        plan_rules = {
            "free": {"max_users": 1, "required_features": ["basic"]},
            "pro": {"max_users": 10, "required_features": ["basic", "advanced"]},
            "enterprise": {"max_users": 999, "required_features": ["basic", "advanced", "premium"]},
        }

        rules = plan_rules.get(self.plan)
        if rules is None:
            return self

        user_cap = rules["max_users"]
        if self.max_users > user_cap:
            raise ValueError(f'{self.plan} plan limited to {user_cap} users')

        # Report the first missing feature, in the table's declared order.
        missing = [name for name in rules["required_features"] if name not in self.features]
        if missing:
            raise ValueError(f'{self.plan} plan requires {missing[0]} feature')

        return self

Error Handling

Graceful Degradation

from typing import Optional

class OptionalExtraction(BaseModel):
    # Required fields
    title: str

    # Optional fields with defaults
    author: Optional[str] = None
    date: Optional[str] = None
    # default_factory gives each instance its own fresh list.
    tags: list[str] = Field(default_factory=list)

# LLM can succeed even if it can't extract everything

Partial Validation

from pydantic import ValidationError

def extract_with_fallback(text: str):
    """Extract with FullModel first; retry with PartialModel on ValidationError.

    Both attempts send the same single user message containing ``text``; only
    the response model differs.
    """
    def _attempt(schema):
        # One request, identical except for the response model.
        return client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=1024,
            messages=[{"role": "user", "content": text}],
            response_model=schema,
        )

    try:
        return _attempt(FullModel)
    except ValidationError:
        # Full extraction failed validation; fall back to the partial model.
        return _attempt(PartialModel)

Validation Error Inspection

from pydantic import ValidationError

# Attempt the extraction, then report each validation failure individually.
try:
    result = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=1024,
        messages=[...],
        response_model=MyModel,
        max_retries=3
    )
except ValidationError as e:
    # Inspect specific errors
    for error in e.errors():
        # 'loc' is empty for model-level errors, so guard the index instead
        # of assuming there is always a field name.
        loc = error['loc']
        field = loc[0] if loc else '<model>'
        message = error['msg']
        print(f"Field '{field}' failed: {message}")

        # Custom handling per field
        if field == 'email':
            # Handle email validation failure
            pass

Custom Error Messages

class DetailedModel(BaseModel):
    name: str = Field(
        min_length=2,
        max_length=100,
        description="Name between 2-100 characters"
    )
    age: int = Field(
        ge=0,
        le=120,
        description="Age between 0 and 120 years"
    )

    @field_validator('name')
    @classmethod
    def validate_name(cls, v):
        """Reject whitespace-only names with an actionable message.

        The value is returned unchanged (not stripped) when it passes.

        Raises:
            ValueError: if the name contains only whitespace.
        """
        if not v.strip():
            raise ValueError(
                'Name cannot be empty. '
                'Please provide a valid name from the text.'
            )
        return v

# When validation fails, LLM sees these helpful messages

Validation Best Practices

1. Be Specific

# ❌ Bad: Vague validation
# Counter-example: the field carries no constraints and no description.
class Item(BaseModel):
    name: str

# ✅ Good: Specific constraints
class Item(BaseModel):
    # Length bounds plus a description that spells out the expectation.
    name: str = Field(min_length=1, max_length=200, description="Item name, 1-200 characters")

2. Provide Context

# ✅ Good: Explain why validation failed
@field_validator('price')
@classmethod
def validate_price(cls, v):
    """Require a strictly positive price and say how to obtain one.

    Raises:
        ValueError: if the price is zero or negative.
    """
    if v <= 0:
        raise ValueError(
            'Price must be positive. '
            'Extract numeric price from text without currency symbols.'
        )
    return v

3. Use Enums for Fixed Sets

# ❌ Bad: String validation
status: str

@field_validator('status')
@classmethod
def validate_status(cls, v):
    """Hand-rolled membership check over the three allowed status strings.

    Raises:
        ValueError: if the value is not one of the allowed statuses.
    """
    if v not in ('active', 'inactive', 'pending'):
        raise ValueError('Invalid status')
    return v

# ✅ Good: Enum
from enum import Enum

# Mixing in str makes every member compare equal to its plain string value.
class Status(str, Enum):
    ACTIVE = "active"
    INACTIVE = "inactive"
    PENDING = "pending"

status: Status  # Validation automatic

4. Balance Strictness

# Too strict: May fail unnecessarily
class StrictModel(BaseModel):
    # Demands exactly four, two and two digits separated by hyphens.
    date: str = Field(
        pattern=r'^\d{4}-\d{2}-\d{2}$',
    )
    # Fails if LLM uses "2024-1-5" instead of "2024-01-05"

# Better: Normalize in validator
class FlexibleModel(BaseModel):
    date: str

    @field_validator('date')
    @classmethod
    def normalize_date(cls, v):
        """Parse several date layouts and normalize to YYYY-MM-DD.

        Formats are tried in order; the first that parses wins.

        Returns:
            The date re-rendered as ``%Y-%m-%d``.

        Raises:
            ValueError: if no supported format matches.
        """
        from datetime import datetime

        for fmt in ('%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y'):
            # Keep the try body to the single call that can fail.
            try:
                parsed = datetime.strptime(v, fmt)
            except ValueError:
                continue
            return parsed.strftime('%Y-%m-%d')  # Normalize
        raise ValueError('Invalid date format')

5. Test Validation

# Test your validators with edge cases
def test_validation():
    """Check that MyModel accepts a valid value and rejects an invalid one.

    Raises:
        AssertionError: if the invalid value is accepted.
    """
    # Should succeed
    MyModel(field="valid_value")

    # Should fail
    try:
        MyModel(field="invalid")
    except ValidationError:
        pass  # Expected
    else:
        # Raised explicitly: a bare `assert False` is stripped under
        # `python -O`, which would let this test pass silently.
        raise AssertionError("Should have raised ValidationError")

# Run tests before using in production

Advanced Techniques

Conditional Required Fields

from typing import Optional

class ConditionalModel(BaseModel):
    type: str
    detail_a: Optional[str] = None
    detail_b: Optional[str] = None

    @model_validator(mode='after')
    def check_required_details(self):
        """Require the detail field that matches the declared type.

        "type_a" needs detail_a and "type_b" needs detail_b; any other type
        passes without further checks.
        """
        requirements = (("type_a", "detail_a"), ("type_b", "detail_b"))
        for kind, needed in requirements:
            if self.type == kind and not getattr(self, needed):
                raise ValueError(f'{kind} requires {needed}')
        return self

Validation with External Data

class Product(BaseModel):
    sku: str
    name: str

    @field_validator('sku')
    @classmethod
    def validate_sku(cls, v):
        """Reject SKUs that the external catalog does not know.

        NOTE(review): ``database`` is not defined in this snippet — presumably
        an application-level lookup object; confirm before reuse.

        Raises:
            ValueError: if ``database.sku_exists(v)`` is falsy.
        """
        # Query database or API
        if not database.sku_exists(v):
            raise ValueError(f'SKU {v} not found in catalog')
        return v

Progressive Validation

# Start with loose validation
# First-pass model: no constraint beyond the field being a string.
class Stage1(BaseModel):
    data: str  # Any string

# Then strict validation
class Stage2(BaseModel):
    # Three uppercase letters, a hyphen, then exactly six digits.
    data: str = Field(
        pattern=r'^[A-Z]{3}-\d{6}$',
    )

# Use Stage1 for initial extraction
# Use Stage2 for final validation

Resources