Sync all skills and memories 2026-04-14 07:27
This commit is contained in:
652
skills/mlops/inference/outlines/references/json_generation.md
Normal file
652
skills/mlops/inference/outlines/references/json_generation.md
Normal file
@@ -0,0 +1,652 @@
|
||||
# Comprehensive JSON Generation Guide
|
||||
|
||||
Complete guide to JSON generation with Outlines using Pydantic models and JSON schemas.
|
||||
|
||||
## Table of Contents
|
||||
- Pydantic Models
|
||||
- JSON Schema Support
|
||||
- Advanced Patterns
|
||||
- Nested Structures
|
||||
- Complex Types
|
||||
- Error Handling
|
||||
- Performance Optimization
|
||||
|
||||
## Pydantic Models
|
||||
|
||||
### Basic Models
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
import outlines
|
||||
|
||||
class User(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
email: str
|
||||
|
||||
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
||||
generator = outlines.generate.json(model, User)
|
||||
|
||||
user = generator("Generate user: Alice, 25, alice@example.com")
|
||||
print(user.name) # "Alice"
|
||||
print(user.age) # 25
|
||||
print(user.email) # "alice@example.com"
|
||||
```
|
||||
|
||||
### Field Constraints
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class Product(BaseModel):
|
||||
name: str = Field(min_length=1, max_length=100)
|
||||
price: float = Field(gt=0, description="Price in USD")
|
||||
discount: float = Field(ge=0, le=100, description="Discount percentage")
|
||||
quantity: int = Field(ge=0, description="Available quantity")
|
||||
sku: str = Field(pattern=r"^[A-Z]{3}-\d{6}$")
|
||||
|
||||
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
||||
generator = outlines.generate.json(model, Product)
|
||||
|
||||
product = generator("Generate product: iPhone 15, $999")
|
||||
# All fields guaranteed to meet constraints
|
||||
```
|
||||
|
||||
**Available Constraints:**
|
||||
- `min_length`, `max_length`: String length
|
||||
- `gt`, `ge`, `lt`, `le`: Numeric comparisons
|
||||
- `multiple_of`: Number must be multiple of value
|
||||
- `pattern`: Regex pattern for strings
|
||||
- `min_length`, `max_length` on list fields: List length (Pydantic v2; `min_items`/`max_items` are the deprecated v1 names)
|
||||
|
||||
### Optional Fields
|
||||
|
||||
```python
|
||||
from typing import Optional
|
||||
|
||||
class Article(BaseModel):
|
||||
title: str # Required
|
||||
author: Optional[str] = None # Optional
|
||||
published_date: Optional[str] = None # Optional
|
||||
tags: list[str] = [] # Default empty list
|
||||
view_count: int = 0 # Default value
|
||||
|
||||
generator = outlines.generate.json(model, Article)
|
||||
|
||||
# Can generate even if optional fields missing
|
||||
article = generator("Title: Introduction to AI")
|
||||
print(article.author) # None (not provided)
|
||||
print(article.tags) # [] (default)
|
||||
```
|
||||
|
||||
### Default Values
|
||||
|
||||
```python
|
||||
class Config(BaseModel):
|
||||
debug: bool = False
|
||||
max_retries: int = 3
|
||||
timeout: float = 30.0
|
||||
log_level: str = "INFO"
|
||||
|
||||
# Generator uses defaults when not specified
|
||||
generator = outlines.generate.json(model, Config)
|
||||
config = generator("Generate config with debug enabled")
|
||||
print(config.debug) # True (from prompt)
|
||||
print(config.timeout) # 30.0 (default)
|
||||
```
|
||||
|
||||
## Enums and Literals
|
||||
|
||||
### Enum Fields
|
||||
|
||||
```python
|
||||
from enum import Enum
|
||||
|
||||
class Status(str, Enum):
|
||||
PENDING = "pending"
|
||||
APPROVED = "approved"
|
||||
REJECTED = "rejected"
|
||||
CANCELLED = "cancelled"
|
||||
|
||||
class Application(BaseModel):
|
||||
applicant_name: str
|
||||
status: Status # Must be one of enum values
|
||||
submitted_date: str
|
||||
|
||||
generator = outlines.generate.json(model, Application)
|
||||
app = generator("Generate application for John Doe")
|
||||
|
||||
print(app.status) # Status.PENDING (or one of the enum values)
|
||||
print(type(app.status)) # <enum 'Status'>
|
||||
```
|
||||
|
||||
### Literal Types
|
||||
|
||||
```python
|
||||
from typing import Literal
|
||||
|
||||
class Task(BaseModel):
|
||||
title: str
|
||||
priority: Literal["low", "medium", "high", "critical"]
|
||||
status: Literal["todo", "in_progress", "done"]
|
||||
assigned_to: str
|
||||
|
||||
generator = outlines.generate.json(model, Task)
|
||||
task = generator("Create high priority task: Fix bug")
|
||||
|
||||
print(task.priority) # One of: "low", "medium", "high", "critical"
|
||||
```
|
||||
|
||||
### Multiple Choice Fields
|
||||
|
||||
```python
|
||||
class Survey(BaseModel):
|
||||
question: str
|
||||
answer: Literal["strongly_disagree", "disagree", "neutral", "agree", "strongly_agree"]
|
||||
confidence: Literal["low", "medium", "high"]
|
||||
|
||||
generator = outlines.generate.json(model, Survey)
|
||||
survey = generator("Rate: 'I enjoy using this product'")
|
||||
```
|
||||
|
||||
## Nested Structures
|
||||
|
||||
### Nested Models
|
||||
|
||||
```python
|
||||
class Address(BaseModel):
|
||||
street: str
|
||||
city: str
|
||||
state: str
|
||||
zip_code: str
|
||||
country: str = "USA"
|
||||
|
||||
class Person(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
email: str
|
||||
address: Address # Nested model
|
||||
|
||||
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
||||
generator = outlines.generate.json(model, Person)
|
||||
|
||||
prompt = """
|
||||
Extract person:
|
||||
Name: Alice Johnson
|
||||
Age: 28
|
||||
Email: alice@example.com
|
||||
Address: 123 Main St, Boston, MA, 02101
|
||||
"""
|
||||
|
||||
person = generator(prompt)
|
||||
print(person.name) # "Alice Johnson"
|
||||
print(person.address.city) # "Boston"
|
||||
print(person.address.state) # "MA"
|
||||
```
|
||||
|
||||
### Deep Nesting
|
||||
|
||||
```python
|
||||
class Coordinates(BaseModel):
|
||||
latitude: float
|
||||
longitude: float
|
||||
|
||||
class Location(BaseModel):
|
||||
name: str
|
||||
coordinates: Coordinates
|
||||
|
||||
class Event(BaseModel):
|
||||
title: str
|
||||
date: str
|
||||
location: Location
|
||||
|
||||
generator = outlines.generate.json(model, Event)
|
||||
event = generator("Generate event: Tech Conference in San Francisco")
|
||||
|
||||
print(event.title) # "Tech Conference"
|
||||
print(event.location.name) # "San Francisco"
|
||||
print(event.location.coordinates.latitude) # 37.7749
|
||||
```
|
||||
|
||||
### Lists of Nested Models
|
||||
|
||||
```python
|
||||
class Item(BaseModel):
|
||||
name: str
|
||||
quantity: int
|
||||
price: float
|
||||
|
||||
class Order(BaseModel):
|
||||
order_id: str
|
||||
customer: str
|
||||
items: list[Item] # List of nested models
|
||||
total: float
|
||||
|
||||
generator = outlines.generate.json(model, Order)
|
||||
|
||||
prompt = """
|
||||
Generate order for John:
|
||||
- 2x Widget ($10 each)
|
||||
- 3x Gadget ($15 each)
|
||||
Order ID: ORD-001
|
||||
"""
|
||||
|
||||
order = generator(prompt)
|
||||
print(f"Order ID: {order.order_id}")
|
||||
for item in order.items:
|
||||
print(f"- {item.quantity}x {item.name} @ ${item.price}")
|
||||
print(f"Total: ${order.total}")
|
||||
```
|
||||
|
||||
## Complex Types
|
||||
|
||||
### Union Types
|
||||
|
||||
```python
|
||||
from typing import Union
|
||||
|
||||
class TextContent(BaseModel):
|
||||
type: Literal["text"]
|
||||
content: str
|
||||
|
||||
class ImageContent(BaseModel):
|
||||
type: Literal["image"]
|
||||
url: str
|
||||
caption: str
|
||||
|
||||
class Post(BaseModel):
|
||||
title: str
|
||||
content: Union[TextContent, ImageContent] # Either type
|
||||
|
||||
generator = outlines.generate.json(model, Post)
|
||||
|
||||
# Can generate either text or image content
|
||||
post = generator("Generate blog post with image")
|
||||
if post.content.type == "text":
|
||||
print(post.content.content)
|
||||
elif post.content.type == "image":
|
||||
print(post.content.url)
|
||||
```
|
||||
|
||||
### Lists and Arrays
|
||||
|
||||
```python
|
||||
class Article(BaseModel):
|
||||
title: str
|
||||
authors: list[str] # List of strings
|
||||
tags: list[str]
|
||||
sections: list[dict[str, str]] # List of dicts
|
||||
related_ids: list[int]
|
||||
|
||||
generator = outlines.generate.json(model, Article)
|
||||
article = generator("Generate article about AI")
|
||||
|
||||
print(article.authors) # ["Alice", "Bob"]
|
||||
print(article.tags) # ["AI", "Machine Learning", "Technology"]
|
||||
```
|
||||
|
||||
### Dictionaries
|
||||
|
||||
```python
|
||||
class Metadata(BaseModel):
|
||||
title: str
|
||||
properties: dict[str, str] # String keys and values
|
||||
counts: dict[str, int] # String keys, int values
|
||||
settings: dict[str, Union[str, int, bool]] # Mixed value types
|
||||
|
||||
generator = outlines.generate.json(model, Metadata)
|
||||
meta = generator("Generate metadata")
|
||||
|
||||
print(meta.properties) # {"author": "Alice", "version": "1.0"}
|
||||
print(meta.counts) # {"views": 1000, "likes": 50}
|
||||
```
|
||||
|
||||
### Any Type (Use Sparingly)
|
||||
|
||||
```python
|
||||
from typing import Any
|
||||
|
||||
class FlexibleData(BaseModel):
|
||||
name: str
|
||||
structured_field: str
|
||||
flexible_field: Any # Can be anything
|
||||
|
||||
# Note: Any reduces type safety, use only when necessary
|
||||
generator = outlines.generate.json(model, FlexibleData)
|
||||
```
|
||||
|
||||
## JSON Schema Support
|
||||
|
||||
### Direct Schema Usage
|
||||
|
||||
```python
|
||||
import outlines
|
||||
|
||||
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
||||
|
||||
# Define JSON schema
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"age": {"type": "integer", "minimum": 0, "maximum": 120},
|
||||
"email": {"type": "string", "format": "email"}
|
||||
},
|
||||
"required": ["name", "age", "email"]
|
||||
}
|
||||
|
||||
# Generate from schema
|
||||
generator = outlines.generate.json(model, schema)
|
||||
result = generator("Generate person: Alice, 25, alice@example.com")
|
||||
|
||||
print(result) # Valid JSON matching schema
|
||||
```
|
||||
|
||||
### Schema from Pydantic
|
||||
|
||||
```python
|
||||
class User(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
email: str
|
||||
|
||||
# Get JSON schema from Pydantic model
|
||||
schema = User.model_json_schema()
|
||||
print(schema)
|
||||
# {
|
||||
# "type": "object",
|
||||
# "properties": {
|
||||
# "name": {"type": "string"},
|
||||
# "age": {"type": "integer"},
|
||||
# "email": {"type": "string"}
|
||||
# },
|
||||
# "required": ["name", "age", "email"]
|
||||
# }
|
||||
|
||||
# Both approaches equivalent:
|
||||
generator1 = outlines.generate.json(model, User)
|
||||
generator2 = outlines.generate.json(model, schema)
|
||||
```
|
||||
|
||||
## Advanced Patterns
|
||||
|
||||
### Conditional Fields
|
||||
|
||||
```python
|
||||
class Order(BaseModel):
|
||||
order_type: Literal["standard", "express"]
|
||||
delivery_date: str
|
||||
express_fee: Optional[float] = None # Only for express orders
|
||||
|
||||
generator = outlines.generate.json(model, Order)
|
||||
|
||||
# Express order
|
||||
order1 = generator("Create express order for tomorrow")
|
||||
print(order1.express_fee) # 25.0
|
||||
|
||||
# Standard order
|
||||
order2 = generator("Create standard order")
|
||||
print(order2.express_fee) # None
|
||||
```
|
||||
|
||||
### Recursive Models
|
||||
|
||||
```python
|
||||
from typing import Optional, List
|
||||
|
||||
class TreeNode(BaseModel):
|
||||
value: str
|
||||
children: Optional[List['TreeNode']] = None
|
||||
|
||||
# Enable forward references
|
||||
TreeNode.model_rebuild()
|
||||
|
||||
generator = outlines.generate.json(model, TreeNode)
|
||||
tree = generator("Generate file tree with subdirectories")
|
||||
|
||||
print(tree.value) # "root"
|
||||
print(tree.children[0].value) # "subdir1"
|
||||
```
|
||||
|
||||
### Model with Validation
|
||||
|
||||
```python
|
||||
from pydantic import field_validator
|
||||
|
||||
class DateRange(BaseModel):
|
||||
start_date: str
|
||||
end_date: str
|
||||
|
||||
@field_validator('end_date')
|
||||
def end_after_start(cls, v, info):
|
||||
"""Ensure end_date is after start_date."""
|
||||
if 'start_date' in info.data:
|
||||
from datetime import datetime
|
||||
start = datetime.strptime(info.data['start_date'], '%Y-%m-%d')
|
||||
end = datetime.strptime(v, '%Y-%m-%d')
|
||||
if end < start:
|
||||
raise ValueError('end_date must be after start_date')
|
||||
return v
|
||||
|
||||
generator = outlines.generate.json(model, DateRange)
|
||||
# Validation happens after generation
|
||||
```
|
||||
|
||||
## Multiple Objects
|
||||
|
||||
### Generate List of Objects
|
||||
|
||||
```python
|
||||
class Person(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
|
||||
class Team(BaseModel):
|
||||
team_name: str
|
||||
members: list[Person]
|
||||
|
||||
generator = outlines.generate.json(model, Team)
|
||||
|
||||
team = generator("Generate engineering team with 5 members")
|
||||
print(f"Team: {team.team_name}")
|
||||
for member in team.members:
|
||||
print(f"- {member.name}, {member.age}")
|
||||
```
|
||||
|
||||
### Batch Generation
|
||||
|
||||
```python
|
||||
def generate_batch(prompts: list[str], schema: type[BaseModel]):
|
||||
"""Generate structured outputs for multiple prompts."""
|
||||
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
||||
generator = outlines.generate.json(model, schema)
|
||||
|
||||
results = []
|
||||
for prompt in prompts:
|
||||
result = generator(prompt)
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
class Product(BaseModel):
|
||||
name: str
|
||||
price: float
|
||||
|
||||
prompts = [
|
||||
"Product: iPhone 15, $999",
|
||||
"Product: MacBook Pro, $2499",
|
||||
"Product: AirPods, $179"
|
||||
]
|
||||
|
||||
products = generate_batch(prompts, Product)
|
||||
for product in products:
|
||||
print(f"{product.name}: ${product.price}")
|
||||
```
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Caching Generators
|
||||
|
||||
```python
|
||||
from functools import lru_cache
|
||||
|
||||
@lru_cache(maxsize=10)
|
||||
def get_generator(model_name: str, schema_hash: int):
|
||||
"""Cache generators for reuse."""
|
||||
model = outlines.models.transformers(model_name)
|
||||
return outlines.generate.json(model, schema)
|
||||
|
||||
# First call: creates generator
|
||||
gen1 = get_generator("microsoft/Phi-3-mini-4k-instruct", hash(User))
|
||||
|
||||
# Second call: returns cached generator (fast!)
|
||||
gen2 = get_generator("microsoft/Phi-3-mini-4k-instruct", hash(User))
|
||||
```
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```python
|
||||
# Process multiple items efficiently
|
||||
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
||||
generator = outlines.generate.json(model, User)
|
||||
|
||||
texts = ["User: Alice, 25", "User: Bob, 30", "User: Carol, 35"]
|
||||
|
||||
# Reuse generator (model stays loaded)
|
||||
users = [generator(text) for text in texts]
|
||||
```
|
||||
|
||||
### Minimize Schema Complexity
|
||||
|
||||
```python
|
||||
# ✅ Good: Simple, flat structure (faster)
|
||||
class SimplePerson(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
city: str
|
||||
|
||||
# ⚠️ Slower: Deep nesting
|
||||
class ComplexPerson(BaseModel):
|
||||
personal_info: PersonalInfo
|
||||
address: Address
|
||||
employment: Employment
|
||||
# ... many nested levels
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Handle Missing Fields
|
||||
|
||||
```python
|
||||
from pydantic import ValidationError
|
||||
|
||||
class User(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
email: str
|
||||
|
||||
try:
|
||||
user = generator("Generate user") # May not include all fields
|
||||
except ValidationError as e:
|
||||
print(f"Validation error: {e}")
|
||||
# Handle gracefully
|
||||
```
|
||||
|
||||
### Fallback with Optional Fields
|
||||
|
||||
```python
|
||||
class RobustUser(BaseModel):
|
||||
name: str # Required
|
||||
age: Optional[int] = None # Optional
|
||||
email: Optional[str] = None # Optional
|
||||
|
||||
# More likely to succeed even with incomplete data
|
||||
user = generator("Generate user: Alice")
|
||||
print(user.name) # "Alice"
|
||||
print(user.age) # None (not provided)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Use Specific Types
|
||||
|
||||
```python
|
||||
# ✅ Good: Specific types
|
||||
class Product(BaseModel):
|
||||
name: str
|
||||
price: float # Not Any or str
|
||||
quantity: int # Not str
|
||||
in_stock: bool # Not int
|
||||
|
||||
# ❌ Bad: Generic types
|
||||
class Product(BaseModel):
|
||||
name: Any
|
||||
price: str # Should be float
|
||||
quantity: str # Should be int
|
||||
```
|
||||
|
||||
### 2. Add Descriptions
|
||||
|
||||
```python
|
||||
# ✅ Good: Clear descriptions
|
||||
class Article(BaseModel):
|
||||
title: str = Field(description="Article title, 10-100 characters")
|
||||
content: str = Field(description="Main article content in paragraphs")
|
||||
tags: list[str] = Field(description="List of relevant topic tags")
|
||||
|
||||
# Descriptions help the model understand expected output
|
||||
```
|
||||
|
||||
### 3. Use Constraints
|
||||
|
||||
```python
|
||||
# ✅ Good: With constraints
|
||||
class Age(BaseModel):
|
||||
value: int = Field(ge=0, le=120, description="Age in years")
|
||||
|
||||
# ❌ Bad: No constraints
|
||||
class Age(BaseModel):
|
||||
value: int # Could be negative or > 120
|
||||
```
|
||||
|
||||
### 4. Prefer Enums Over Strings
|
||||
|
||||
```python
|
||||
# ✅ Good: Enum for fixed set
|
||||
class Priority(str, Enum):
|
||||
LOW = "low"
|
||||
MEDIUM = "medium"
|
||||
HIGH = "high"
|
||||
|
||||
class Task(BaseModel):
|
||||
priority: Priority # Guaranteed valid
|
||||
|
||||
# ❌ Bad: Free-form string
|
||||
class Task(BaseModel):
|
||||
priority: str # Could be "urgent", "ASAP", "!!", etc.
|
||||
```
|
||||
|
||||
### 5. Test Your Models
|
||||
|
||||
```python
|
||||
# Test models work as expected
|
||||
def test_product_model():
|
||||
product = Product(
|
||||
name="Test Product",
|
||||
price=19.99,
|
||||
quantity=10,
|
||||
in_stock=True
|
||||
)
|
||||
assert product.price == 19.99
|
||||
assert isinstance(product, Product)
|
||||
|
||||
# Run tests before using in production
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
- **Pydantic Docs**: https://docs.pydantic.dev
|
||||
- **JSON Schema**: https://json-schema.org
|
||||
- **Outlines GitHub**: https://github.com/outlines-dev/outlines
|
||||
Reference in New Issue
Block a user