The skills directory was getting disorganized — mlops alone had 40 skills in a flat list, and 12 categories were singletons with just one skill each.

Code change:
- prompt_builder.py: Support sub-categories in skill scanner. skills/mlops/training/axolotl/SKILL.md now shows as category 'mlops/training' instead of just 'mlops'. Backwards-compatible with existing flat structure.

Split mlops (40 skills) into 7 sub-categories:
- mlops/training (12): accelerate, axolotl, flash-attention, grpo-rl-training, peft, pytorch-fsdp, pytorch-lightning, simpo, slime, torchtitan, trl-fine-tuning, unsloth
- mlops/inference (8): gguf, guidance, instructor, llama-cpp, obliteratus, outlines, tensorrt-llm, vllm
- mlops/models (6): audiocraft, clip, llava, segment-anything, stable-diffusion, whisper
- mlops/vector-databases (4): chroma, faiss, pinecone, qdrant
- mlops/evaluation (5): huggingface-tokenizers, lm-evaluation-harness, nemo-curator, saelens, weights-and-biases
- mlops/cloud (2): lambda-labs, modal
- mlops/research (1): dspy

Merged singleton categories:
- gifs → media (gif-search joins youtube-content)
- music-creation → media (heartmula, songsee)
- diagramming → creative (excalidraw joins ascii-art)
- ocr-and-documents → productivity
- domain → research (domain-intel)
- feeds → research (blogwatcher)
- market-data → research (polymarket)

Fixed misplaced skills:
- mlops/code-review → software-development (not ML-specific)
- mlops/ml-paper-writing → research (academic writing)

Added DESCRIPTION.md files for all new/updated categories.
15 KiB
15 KiB
Advanced Validation Patterns
Complete guide to validation in Instructor using Pydantic.
Table of Contents
- Built-in Validators
- Custom Field Validators
- Model-Level Validation
- Complex Validation Patterns
- Error Handling
Built-in Validators
Numeric Constraints
from pydantic import BaseModel, Field
# Numeric limits are declared on the fields themselves, so this model needs
# no validator methods.
class Product(BaseModel):
    # gt=0: strictly greater than zero.
    price: float = Field(
        gt=0,
        description="Price must be positive",
    )
    # ge/le: inclusive lower and upper bounds.
    discount: float = Field(
        ge=0,
        le=100,
        description="Discount 0-100%",
    )
    quantity: int = Field(
        ge=1,
        description="At least 1 item",
    )
    rating: float = Field(
        ge=0.0,
        le=5.0,
        description="Rating 0-5 stars",
    )

# If LLM provides invalid values, automatic retry with error feedback
Available constraints:
- gt: Greater than
- ge: Greater than or equal
- lt: Less than
- le: Less than or equal
- multiple_of: Must be a multiple of this number
String Constraints
class User(BaseModel):
    # Length limits and the allowed character set are all enforced by Field.
    username: str = Field(
        min_length=3,
        max_length=20,
        pattern=r'^[a-zA-Z0-9_]+$',
        description="3-20 alphanumeric characters",
    )
    bio: str = Field(
        max_length=500,
        description="Bio up to 500 chars",
    )
    # Only the three words listed in the pattern are accepted.
    status: str = Field(
        pattern=r'^(active|inactive|pending)$',
    )

# pattern validates against regex
Email and URL Validation
from pydantic import EmailStr, HttpUrl, AnyUrl
# The field types carry the validation; no Field() constraints are needed.
class Contact(BaseModel):
    # Validates email format
    email: EmailStr
    # Validates HTTP/HTTPS URLs
    website: HttpUrl
    # Any valid URL scheme
    portfolio: AnyUrl
# Request a Contact extracted from the user message below.
# NOTE(review): `client` is not defined in this snippet — presumably an
# Instructor-patched Anthropic client created elsewhere; confirm.
contact = client.messages.create(
    model="claude-sonnet-4-5-20250929",
    max_tokens=1024,
    messages=[{
        "role": "user",
        "content": "Extract: john@example.com, https://example.com"
    }],
    # The model class the response should be parsed and validated into.
    response_model=Contact
)
Date and DateTime Validation
from datetime import date, datetime
from pydantic import Field, field_validator
class Event(BaseModel):
    # Validates date format
    event_date: date
    # Validates datetime format
    created_at: datetime
    year: int = Field(
        ge=1900,
        le=2100,
    )

    @field_validator('event_date')
    @classmethod
    def future_date(cls, value):
        """Reject event dates that fall before today."""
        if value >= date.today():
            return value
        raise ValueError('Event must be in the future')
List and Dict Validation
class Document(BaseModel):
    # On list fields, min_length/max_length bound the number of items.
    tags: list[str] = Field(
        min_length=1,
        max_length=10,
    )
    keywords: list[str] = Field(
        min_length=3,
        description="At least 3 keywords",
    )
    metadata: dict[str, str] = Field(
        description="String key-value pairs",
    )

    @field_validator('tags')
    @classmethod
    def unique_tags(cls, value):
        """Reject tag lists that contain duplicates."""
        # A set drops repeats, so a shorter set means at least one duplicate.
        distinct = set(value)
        if len(distinct) != len(value):
            raise ValueError('Tags must be unique')
        return value
Custom Field Validators
Basic Field Validator
from pydantic import field_validator
class Person(BaseModel):
    name: str
    age: int

    @field_validator('name')
    @classmethod
    def name_must_not_be_empty(cls, value):
        """Reject empty or whitespace-only names; return the name stripped."""
        if not (value and value.strip()):
            raise ValueError('Name cannot be empty')
        return value.strip()

    @field_validator('age')
    @classmethod
    def age_must_be_reasonable(cls, value):
        """Accept ages from 0 through 120 inclusive."""
        if not 0 <= value <= 120:
            raise ValueError('Age must be between 0 and 120')
        return value
Validator with Field Info
from pydantic import ValidationInfo
class Article(BaseModel):
    title: str
    content: str

    @field_validator('content')
    @classmethod
    def content_length(cls, value, info: ValidationInfo):
        """Require content to be at least twice as long as the title."""
        # Without a 'title' entry in info.data there is nothing to compare to.
        if 'title' not in info.data:
            return value
        minimum = len(info.data['title']) * 2
        if len(value) < minimum:
            raise ValueError('Content should be at least 2x title length')
        return value
Multiple Fields Validation
class TimeRange(BaseModel):
    start_time: str
    end_time: str

    # One validator is registered for both fields and runs once per field.
    @field_validator('start_time', 'end_time')
    @classmethod
    def valid_time_format(cls, value):
        """Require two digits, a colon, then two digits."""
        import re

        matched = re.match(r'^\d{2}:\d{2}$', value)
        if matched is None:
            raise ValueError('Time must be in HH:MM format')
        return value
Transform and Validate
class URL(BaseModel):
    url: str

    @field_validator('url')
    @classmethod
    def normalize_url(cls, value):
        """Return the URL unchanged if it has a scheme, else prefix https://."""
        has_scheme = value.startswith(('http://', 'https://'))
        if has_scheme:
            return value
        # The validator's return value replaces the field value.
        return f'https://{value}'
Model-Level Validation
Cross-Field Validation
from pydantic import model_validator
class DateRange(BaseModel):
    start_date: str
    end_date: str

    @model_validator(mode='after')
    def check_dates(self):
        """Reject ranges whose end_date is earlier than start_date."""
        from datetime import datetime

        day_format = '%Y-%m-%d'
        begins = datetime.strptime(self.start_date, day_format)
        finishes = datetime.strptime(self.end_date, day_format)
        if finishes >= begins:
            return self
        raise ValueError('end_date must be after start_date')
class PriceRange(BaseModel):
    min_price: float
    max_price: float

    @model_validator(mode='after')
    def check_price_range(self):
        """Require max_price to be strictly greater than min_price."""
        lower, upper = self.min_price, self.max_price
        if upper <= lower:
            raise ValueError('max_price must be greater than min_price')
        return self
Conditional Validation
# Optional is needed for delivery_time below; import it with the snippet so
# the example runs on its own.
from typing import Optional

class Order(BaseModel):
    order_type: str  # "standard" or "express"
    delivery_date: str
    # Only required for express orders; enforced by the validator below.
    delivery_time: Optional[str] = None

    @model_validator(mode='after')
    def check_delivery_time(self):
        """Require delivery_time when order_type is "express".

        Returns:
            The validated model instance.

        Raises:
            ValueError: If the order is express and delivery_time is missing
                or empty.
        """
        if self.order_type == "express" and not self.delivery_time:
            raise ValueError('Express orders require delivery_time')
        return self
Complex Business Logic
class Discount(BaseModel):
    code: str
    percentage: float = Field(
        ge=0,
        le=100,
    )
    min_purchase: float = Field(
        ge=0,
    )
    max_discount: float = Field(
        ge=0,
    )

    @model_validator(mode='after')
    def validate_discount(self):
        """Cap max_discount at percentage% of min_purchase.

        An over-large max_discount is lowered to the cap rather than rejected.
        """
        # Max discount can't exceed percentage of min_purchase
        ceiling = (self.percentage / 100) * self.min_purchase
        if self.max_discount > ceiling:
            self.max_discount = ceiling
        return self
Complex Validation Patterns
Nested Model Validation
class Address(BaseModel):
    street: str
    city: str
    # Declared before postal_code: the validator below reads it from
    # info.data, which only holds fields validated earlier.
    country: str
    postal_code: str

    @field_validator('postal_code')
    @classmethod
    def validate_postal_code(cls, v, info: ValidationInfo):
        """Validate postal code format based on country.

        Only "USA" and "Canada" are checked; any other country, or a missing
        'country' entry in info.data, leaves the value unchecked.

        Returns:
            The postal code, unchanged.

        Raises:
            ValueError: If the code does not match the country's pattern.
        """
        # Import once up front: both country branches below need it. Importing
        # inside a single branch leaves `re` unbound in the other one.
        import re

        if 'country' in info.data:
            country = info.data['country']
            if country == "USA":
                # Five digits, optionally followed by -NNNN.
                if not re.match(r'^\d{5}(-\d{4})?$', v):
                    raise ValueError('Invalid US postal code')
            elif country == "Canada":
                # Letter-digit-letter, a space, then digit-letter-digit.
                if not re.match(r'^[A-Z]\d[A-Z] \d[A-Z]\d$', v):
                    raise ValueError('Invalid Canadian postal code')
        return v
class Person(BaseModel):
    name: str
    # A nested model used as a field type.
    address: Address

# Nested validation runs automatically
List of Models
class Task(BaseModel):
    # Must be a non-empty string.
    title: str = Field(
        min_length=1,
    )
    # Inclusive 1-5 scale.
    priority: int = Field(
        ge=1,
        le=5,
    )
class Project(BaseModel):
    name: str
    tasks: list[Task] = Field(
        min_length=1,
        description="At least 1 task",
    )

    @field_validator('tasks')
    @classmethod
    def at_least_one_high_priority(cls, value):
        """Require at least one task with priority 4 or higher."""
        for task in value:
            if task.priority >= 4:
                return value
        raise ValueError('Project needs at least one high-priority task')
Union Type Validation
from typing import Union
class TextBlock(BaseModel):
    # Defaults to "text".
    type: str = "text"
    # Must be a non-empty string.
    content: str = Field(
        min_length=1,
    )
class ImageBlock(BaseModel):
    # Defaults to "image".
    type: str = "image"
    # Validated as an HTTP/HTTPS URL by the field type.
    url: HttpUrl
    alt_text: str
class Page(BaseModel):
    title: str
    # Each item is validated as one of the two block models.
    blocks: list[Union[TextBlock, ImageBlock]]

    @field_validator('blocks')
    @classmethod
    def validate_block_types(cls, value):
        """Require a non-empty block list to start with a TextBlock."""
        # An empty list passes: there is no first block to check.
        if not value:
            return value
        if isinstance(value[0], TextBlock):
            return value
        raise ValueError('First block must be text')
Dependent Fields
class Subscription(BaseModel):
    plan: str  # "free", "pro", "enterprise"
    max_users: int
    features: list[str]

    @model_validator(mode='after')
    def validate_plan_limits(self):
        """Check the user cap and required features for known plans.

        Plans not listed in the table are accepted without checks.
        """
        plan_table = {
            "free": {"max_users": 1, "required_features": ["basic"]},
            "pro": {"max_users": 10, "required_features": ["basic", "advanced"]},
            "enterprise": {"max_users": 999, "required_features": ["basic", "advanced", "premium"]},
        }

        limit = plan_table.get(self.plan)
        if limit is None:
            return self

        if self.max_users > limit["max_users"]:
            raise ValueError(f'{self.plan} plan limited to {limit["max_users"]} users')

        # The first missing required feature, in table order, is reported.
        for feature in limit["required_features"]:
            if feature not in self.features:
                raise ValueError(f'{self.plan} plan requires {feature} feature')
        return self
Error Handling
Graceful Degradation
class OptionalExtraction(BaseModel):
    # Required fields
    title: str

    # Optional fields with defaults
    author: Optional[str] = None
    date: Optional[str] = None
    # default_factory gives each instance its own new list.
    tags: list[str] = Field(
        default_factory=list,
    )

# LLM can succeed even if it can't extract everything
Partial Validation
from pydantic import ValidationError
def extract_with_fallback(text: str):
    """Extract FullModel from text, retrying with PartialModel on ValidationError."""

    def _extract(response_model):
        # Both attempts send the same request; only the target model differs.
        return client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=1024,
            messages=[{"role": "user", "content": text}],
            response_model=response_model,
        )

    try:
        # Try full extraction
        return _extract(FullModel)
    except ValidationError:
        # Fall back to partial model
        return _extract(PartialModel)
Validation Error Inspection
from pydantic import ValidationError
# Run an extraction and, if validation fails, report each error by field.
try:
    result = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=1024,
        messages=[...],
        response_model=MyModel,
        max_retries=3
    )
except ValidationError as e:
    # Inspect specific errors
    for error in e.errors():
        # 'loc' is the path to the failing value. It is empty for errors
        # raised by a model-level validator, so it cannot be indexed
        # unconditionally; those errors are reported under '__root__'.
        location = error['loc']
        field = location[0] if location else '__root__'
        message = error['msg']
        print(f"Field '{field}' failed: {message}")

        # Custom handling per field
        if field == 'email':
            # Handle email validation failure
            pass
Custom Error Messages
class DetailedModel(BaseModel):
    name: str = Field(
        min_length=2,
        max_length=100,
        description="Name between 2-100 characters",
    )
    age: int = Field(
        ge=0,
        le=120,
        description="Age between 0 and 120 years",
    )

    @field_validator('name')
    @classmethod
    def validate_name(cls, value):
        """Reject whitespace-only names with an instructive message."""
        if value.strip():
            return value
        raise ValueError(
            'Name cannot be empty. '
            'Please provide a valid name from the text.'
        )

# When validation fails, LLM sees these helpful messages
Validation Best Practices
1. Be Specific
# ❌ Bad: Vague validation
class Item(BaseModel):
    # No constraints: any string is accepted.
    name: str
# ✅ Good: Specific constraints
class Item(BaseModel):
    # Length limits plus a description stating the same rule.
    name: str = Field(
        min_length=1,
        max_length=200,
        description="Item name, 1-200 characters",
    )
2. Provide Context
# ✅ Good: Explain why validation failed
@field_validator('price')
@classmethod
def validate_price(cls, value):
    """Reject zero or negative prices with a message that says how to fix it."""
    if value > 0:
        return value
    raise ValueError(
        'Price must be positive. '
        'Extract numeric price from text without currency symbols.'
    )
3. Use Enums for Fixed Sets
# ❌ Bad: String validation
status: str

@field_validator('status')
@classmethod
def validate_status(cls, value):
    """Check the status against a hand-written list of allowed values."""
    if value in ['active', 'inactive', 'pending']:
        return value
    raise ValueError('Invalid status')
# ✅ Good: Enum
# Enum must be imported for the class below to be defined.
from enum import Enum

class Status(str, Enum):
    # Subclassing str makes each member compare equal to its string value.
    ACTIVE = "active"
    INACTIVE = "inactive"
    PENDING = "pending"

status: Status  # Validation automatic
4. Balance Strictness
# Too strict: May fail unnecessarily
class StrictModel(BaseModel):
    # Exactly four digits, two digits, two digits, separated by hyphens.
    date: str = Field(
        pattern=r'^\d{4}-\d{2}-\d{2}$',
    )
# Fails if LLM uses "2024-1-5" instead of "2024-01-05"
# Better: Normalize in validator
class FlexibleModel(BaseModel):
    date: str

    @field_validator('date')
    @classmethod
    def normalize_date(cls, value):
        """Parse the date in any accepted layout and return it as YYYY-MM-DD."""
        from datetime import datetime

        # Parse flexible formats; the first layout that parses wins.
        accepted_layouts = ('%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y')
        for layout in accepted_layouts:
            try:
                parsed = datetime.strptime(value, layout)
                return parsed.strftime('%Y-%m-%d')  # Normalize
            except ValueError:
                continue
        raise ValueError('Invalid date format')
5. Test Validation
# Test your validators with edge cases
def test_validation():
    """Check that MyModel accepts a valid value and rejects an invalid one."""
    # Should succeed
    valid = MyModel(field="valid_value")

    # Should fail
    try:
        invalid = MyModel(field="invalid")
    except ValidationError:
        pass  # Expected
    else:
        # Reached only when construction did not raise.
        assert False, "Should have raised ValidationError"

# Run tests before using in production
Advanced Techniques
Conditional Required Fields
from typing import Optional
class ConditionalModel(BaseModel):
    type: str
    detail_a: Optional[str] = None
    detail_b: Optional[str] = None

    @model_validator(mode='after')
    def check_required_details(self):
        """Require the detail field that matches the model's type."""
        # (type value, the detail it needs, error raised when it is missing)
        requirements = (
            ("type_a", self.detail_a, 'type_a requires detail_a'),
            ("type_b", self.detail_b, 'type_b requires detail_b'),
        )
        for kind, detail, complaint in requirements:
            if self.type == kind and not detail:
                raise ValueError(complaint)
        return self
Validation with External Data
class Product(BaseModel):
    sku: str
    name: str

    @field_validator('sku')
    @classmethod
    def validate_sku(cls, v):
        """Accept the SKU only if database.sku_exists() reports it."""
        # Query database or API
        # NOTE(review): `database` is not defined in this snippet — presumably
        # an application-provided lookup object; confirm.
        if database.sku_exists(v):
            return v
        raise ValueError(f'SKU {v} not found in catalog')
Progressive Validation
# Start with loose validation
class Stage1(BaseModel):
    # Any string
    data: str
# Then strict validation
class Stage2(BaseModel):
    # Three uppercase letters, a hyphen, then six digits.
    data: str = Field(
        pattern=r'^[A-Z]{3}-\d{6}$',
    )

# Use Stage1 for initial extraction
# Use Stage2 for final validation
Resources
- Pydantic Docs: https://docs.pydantic.dev/latest/concepts/validators/
- Instructor Examples: https://python.useinstructor.com/examples