Python Data Validation Patterns for Production Systems
Data validation is a critical component of any production system. Invalid data can cause silent failures, corrupt databases, create security vulnerabilities, and lead to poor user experiences. In this post, we'll explore battle-tested patterns for validating data in Python production systems, focusing on practical implementations that scale.
Why Data Validation Matters
Consider a simple e-commerce checkout system. A user submits an order with a negative quantity, or a product ID that doesn't exist. Without proper validation, these inputs could:
- Crash your application
- Create negative inventory records
- Allow price manipulation attacks
- Cause downstream data processing failures
Let's build a robust validation framework that prevents these issues.
Pattern 1: Functional Validators
The simplest pattern uses pure functions for validation. Each validator takes a value and either returns the validated value or raises an exception.
from typing import Any, Callable, TypeVar
T = TypeVar('T')
def validate_email(email: str) -> str:
    """Validate email format.

    Args:
        email: Candidate address to check.

    Returns:
        The same string, unchanged, when it has the shape
        ``local@domain.tld``.

    Raises:
        ValueError: If ``email`` does not match the expected shape.
    """
    import re

    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch instead of match: with match, the trailing '$' also succeeds
    # just before a final newline, so "user@example.com\n" would be accepted.
    if not re.fullmatch(pattern, email):
        raise ValueError(f"Invalid email format: {email}")
    return email
def validate_positive_int(value: Any) -> int:
    """Validate positive integer.

    Args:
        value: Object to check.

    Returns:
        ``value`` unchanged when it is an ``int`` greater than zero.

    Raises:
        TypeError: If ``value`` is not an ``int`` (booleans are rejected too).
        ValueError: If ``value`` is zero or negative.
    """
    # bool is a subclass of int, so a plain isinstance check would let
    # True through as the positive integer 1.
    if isinstance(value, bool) or not isinstance(value, int):
        raise TypeError(f"Expected int, got {type(value).__name__}")
    if value <= 0:
        raise ValueError(f"Value must be positive, got {value}")
    return value
def validate_non_empty_string(value: Any) -> str:
    """Check that ``value`` is a string containing more than whitespace.

    Args:
        value: Object to check.

    Returns:
        ``value`` exactly as passed in (it is not stripped).

    Raises:
        TypeError: If ``value`` is not a ``str``.
        ValueError: If ``value`` is empty or whitespace-only.
    """
    if isinstance(value, str):
        if value.strip():
            return value
        raise ValueError("String cannot be empty")
    raise TypeError(f"Expected string, got {type(value).__name__}")
Pattern 2: Schema-Based Validation
For complex data structures, schema-based validation is more maintainable. Here's a lightweight schema system:
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
@dataclass
class Field:
    """Declarative description of a single entry in a ``Schema``."""

    # Expected Python type; Schema.validate checks values with isinstance().
    type: type
    # Whether Schema.validate treats a missing (None) value as an error.
    required: bool = True
    # Callables run in order by Schema.validate. Each receives the current
    # value and its return value replaces it; ValueError/TypeError raised
    # by a validator is recorded as that field's error.
    validators: List[Callable] = field(default_factory=list)
    # Fallback value Schema.validate may substitute when the input has no
    # value; a default of None is treated as "no default".
    default: Any = None
class Schema:
    """Validate dictionaries against a mapping of field names to ``Field`` specs."""

    def __init__(self, fields: Dict[str, Field]):
        """Store the field specifications, keyed by field name."""
        self.fields = fields

    def validate(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Validate ``data`` and return the cleaned values.

        For each declared field the value is looked up in ``data``, the
        field's default is substituted when nothing was supplied, the type
        is checked, and the field's validators run in order (each may
        transform the value). Keys in ``data`` that are not declared in the
        schema are ignored.

        Args:
            data: Raw input mapping.

        Returns:
            A dict with one entry per declared field that passed validation.
            Optional fields that are absent and have no default map to
            ``None``.

        Raises:
            ValidationError: If any field fails; ``errors`` maps each failing
                field name to the first problem found for it.
        """
        validated: Dict[str, Any] = {}
        errors: Dict[str, str] = {}

        for field_name, field_spec in self.fields.items():
            value = data.get(field_name)

            # Apply the default before the required check; otherwise a field
            # declared with a default is still reported as missing.
            if value is None and field_spec.default is not None:
                value = field_spec.default

            if value is None:
                if field_spec.required:
                    errors[field_name] = "Field is required"
                else:
                    # Absent optional field: there is nothing to type-check,
                    # and validators written for real values (e.g. calling
                    # str methods) must not be handed None.
                    validated[field_name] = None
                continue

            # Type validation
            if not isinstance(value, field_spec.type):
                errors[field_name] = (
                    f"Expected {field_spec.type.__name__}, got {type(value).__name__}"
                )
                continue

            # Custom validators: stop at the first failure for this field.
            for validator in field_spec.validators:
                try:
                    value = validator(value)
                except (ValueError, TypeError) as e:
                    errors[field_name] = str(e)
                    break
            else:
                # Only reached when no validator failed.
                validated[field_name] = value

        if errors:
            raise ValidationError(errors)
        return validated
class ValidationError(Exception):
    """Raised when validation fails.

    Attributes:
        errors: The error mapping passed to the constructor. Its ``str()``
            form is used as the exception message.
    """

    def __init__(self, errors: Dict[str, str]):
        message = str(errors)
        super().__init__(message)
        self.errors = errors
Usage Example
# Define validators
def validate_phone(phone: str) -> str:
    """Accept a phone number that starts with '+' and is at least 10 characters long.

    Args:
        phone: Candidate phone number.

    Returns:
        ``phone`` unchanged.

    Raises:
        ValueError: If the '+' prefix is missing or the string is shorter
            than 10 characters (the '+' counts towards the length).
    """
    has_plus_prefix = phone.startswith('+')
    if has_plus_prefix and len(phone) >= 10:
        return phone
    raise ValueError("Invalid phone number format")
# Create schema
user_schema = Schema({
    'name': Field(str, validators=[validate_non_empty_string]),
    'email': Field(str, validators=[validate_email]),
    'age': Field(int, validators=[validate_positive_int]),
    'phone': Field(str, required=False, validators=[validate_phone]),
    # Optional with a fallback: `required` defaults to True, so state
    # explicitly that 'role' may be omitted from the input.
    'role': Field(str, required=False, default='user'),
})

# Validate data ('role' is intentionally left out to show the default)
user_data = {
    'name': 'Alice',
    'email': 'alice@example.com',
    'age': 30,
    'phone': '+1234567890',
}

try:
    validated_user = user_schema.validate(user_data)
    print(f"Validated user: {validated_user}")
except ValidationError as e:
    print(f"Validation failed: {e.errors}")
Pattern 3: Pydantic Integration
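For production systems, I recommend using Pydantic for complex validation. It provides powerful built-in validators and serialization. The example below uses the Pydantic v1 API (`@validator`, `Field(regex=...)`, `.dict()`); in Pydantic v2 the equivalents are `@field_validator`, `Field(pattern=...)`, and `.model_dump()`: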
from pydantic import BaseModel, Field, validator
from typing import Optional
from datetime import datetime
import re
class ProductCreate(BaseModel):
    # Input model for creating a product. Constraints are declared per field
    # through Field(...); the two @validator methods add checks that the
    # declarative constraints do not express.

    # Required (no default); length must be between 1 and 100 characters.
    name: str = Field(..., min_length=1, max_length=100)
    # Required; must be strictly greater than zero.
    price: float = Field(..., gt=0)
    # Required; two uppercase letters A-Z, a hyphen, then exactly four
    # digits, e.g. "AB-1234".
    sku: str = Field(..., regex=r'^[A-Z]{2}-\d{4}$')
    # Optional; default_factory gives each instance its own empty list.
    tags: list[str] = Field(default_factory=list)
    # Optional; stays None unless the caller supplies a value.
    created_at: Optional[datetime] = None

    @validator('name')
    def name_must_be_meaningful(cls, v):
        """Reject names with no letter a-z/A-Z; return the name with surrounding whitespace removed."""
        if not re.search(r'[a-zA-Z]', v):
            raise ValueError('Name must contain at least one letter')
        return v.strip()

    @validator('price')
    def price_must_have_two_decimals(cls, v):
        """Reject prices whose value changes when rounded to two decimal places."""
        # NOTE(review): this compares floats for exact equality; confirm
        # that is acceptable for the price values expected here.
        if round(v, 2) != v:
            raise ValueError('Price must have at most 2 decimal places')
        return v
# Usage
try:
    product = ProductCreate(
        name=' Wireless Mouse ',
        price=29.99,
        # Must satisfy the model's sku pattern: two uppercase letters, a
        # hyphen and exactly four digits.
        sku='WM-0001',
        tags=['electronics', 'accessories'],
    )
    print(f"Validated product: {product.dict()}")
# pydantic's ValidationError is a ValueError subclass, so this reports
# validation failures without also hiding unrelated bugs.
except ValueError as e:
    print(f"Validation error: {e}")
Pattern 4: Validation Pipeline
For complex workflows, chain validators together in a pipeline:
from typing import List, Tuple, Any
class ValidationPipeline:
    """Run a value through an ordered list of named validation steps."""

    def __init__(self):
        # (step name, callable) pairs, executed in insertion order.
        self.steps: List[Tuple[str, Callable]] = []

    def add_step(self, name: str, validator: Callable):
        """Append a step.

        Args:
            name: Label used in progress and error messages.
            validator: Callable taking the current value and returning the
                (possibly transformed) value, or raising ``ValueError`` /
                ``TypeError`` to reject it.
        """
        self.steps.append((name, validator))

    def process(self, data: Any) -> Any:
        """Feed ``data`` through every step, passing each result to the next.

        Prints a line for every step that passes.

        Args:
            data: Initial value.

        Returns:
            The value returned by the last step (``data`` itself when there
            are no steps).

        Raises:
            RuntimeError: If a step raises ``ValueError`` or ``TypeError``.
                The original exception is attached as ``__cause__``. Other
                exception types propagate unchanged.
        """
        for step_name, validator in self.steps:
            try:
                data = validator(data)
            except (ValueError, TypeError) as e:
                # "from e" records the validator's exception as the explicit
                # cause, so tracebacks show where the rejection came from.
                raise RuntimeError(
                    f"Validation failed at step '{step_name}': {e}"
                ) from e
            print(f"Step '{step_name}' passed")
        return data
# Example pipeline
def require_company_domain(email: str) -> str:
    """Return ``email`` if its domain is company.com or a subdomain of it.

    Raises:
        ValueError: If the part after the last '@' is any other domain.
    """
    # Compare the actual domain rather than searching the whole address:
    # a substring test would accept "company.com@evil.org" or
    # "user@company.com.evil.org".
    domain = email.rpartition('@')[2].lower()
    if domain != 'company.com' and not domain.endswith('.company.com'):
        raise ValueError("Not company email")
    return email


pipeline = ValidationPipeline()
pipeline.add_step("sanitize", lambda x: x.strip())
pipeline.add_step("email_format", validate_email)
pipeline.add_step("check_domain", require_company_domain)

# Usage
try:
    result = pipeline.process(" user@company.com ")
    print(f"Pipeline result: {result}")
except RuntimeError as e:
    print(f"Pipeline failed: {e}")
Pattern 5: Async Validation
For I/O-bound validation (database lookups, API calls), use async validators:
import asyncio
from typing import Any, Awaitable
class AsyncValidator:
    """Run several async validators concurrently against the same value."""

    def __init__(self):
        # Each entry is a coroutine function that is *called* with the data,
        # not an already-created awaitable.
        self.validators: list[Callable[[Any], Awaitable[Any]]] = []

    async def validate(self, data: Any) -> Any:
        """Await every validator with ``data`` and collect all failures.

        All validators are started together and every one runs to
        completion, so the raised error lists every failure rather than
        only the first.

        Args:
            data: Value handed to each validator.

        Returns:
            ``data`` unchanged when no validator raised.

        Raises:
            ValidationError: If at least one validator raised. ``errors`` is
                ``{"async": [message, ...]}`` with one message per failing
                validator.
        """
        results = await asyncio.gather(
            *(check(data) for check in self.validators),
            return_exceptions=True,
        )
        # BaseException, not Exception: with return_exceptions=True gather
        # also returns asyncio.CancelledError (a BaseException since Python
        # 3.8) for a cancelled validator, which must not count as a pass.
        # Fall back to the class name when the exception has no message.
        errors = [
            str(result) or type(result).__name__
            for result in results
            if isinstance(result, BaseException)
        ]
        if errors:
            raise ValidationError({"async": errors})
        return data
# Example async validators
async def check_user_exists(user_id: int) -> int:
    """Simulated existence check: every id except 999 is treated as present.

    Args:
        user_id: Identifier to look up.

    Returns:
        ``user_id`` unchanged when the user is considered to exist.

    Raises:
        ValueError: If ``user_id`` is 999.
    """
    await asyncio.sleep(0.1)  # stands in for the database round trip
    if user_id != 999:
        return user_id
    raise ValueError(f"User {user_id} not found")
async def check_rate_limit(user_id: int) -> int:
    """Simulated rate-limit check: only id 500 is treated as over the limit.

    Args:
        user_id: Identifier to check.

    Returns:
        ``user_id`` unchanged when the user is within the limit.

    Raises:
        ValueError: If ``user_id`` is 500.
    """
    await asyncio.sleep(0.05)  # stands in for the rate-limit lookup
    if user_id != 500:
        return user_id
    raise ValueError("Rate limit exceeded")
# Usage
async def main():
    """Demo: run both example async validators against user id 123 and print the outcome."""
    validator = AsyncValidator()
    validator.validators = [check_user_exists, check_rate_limit]
    try:
        await validator.validate(123)
    except ValidationError as e:
        print(f"Async validation failed: {e}")
    else:
        print("Async validation passed")
# asyncio.run(main())
Error Handling Best Practices
- **Be explicit about errors**
For further actions, you may consider blocking this person and/or reporting abuse
