VOOZH about

URL: https://dev.to/biao_lin_14b493a4944b1361/python-data-validation-patterns-for-production-systems-151p

⇱ Python Data Validation Patterns for Production Systems - DEV Community


Python Data Validation Patterns for Production Systems

Data validation is a critical component of any production system. Invalid data can cause silent failures, corrupt databases, create security vulnerabilities, and lead to poor user experiences. In this post, we'll explore battle-tested patterns for validating data in Python production systems, focusing on practical implementations that scale.

Why Data Validation Matters

Consider a simple e-commerce checkout system. A user submits an order with a negative quantity, or a product ID that doesn't exist. Without proper validation, these inputs could:

  • Crash your application
  • Create negative inventory records
  • Allow price manipulation attacks
  • Cause downstream data processing failures

Let's build a robust validation framework that prevents these issues.

Pattern 1: Functional Validators

The simplest pattern uses pure functions for validation. Each validator takes a value and either returns the validated value or raises an exception.

from typing import Any, Callable, TypeVar

T = TypeVar('T')

def validate_email(email: str) -> str:
    """Validate that *email* has the shape ``local@domain.tld``.

    Args:
        email: Candidate e-mail address.

    Returns:
        The address, unchanged, when it matches the pattern.

    Raises:
        ValueError: If the address does not match the pattern.
    """
    import re

    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch() must consume the whole string; match() with a trailing "$"
    # would also accept an address followed by a newline.
    if not re.fullmatch(pattern, email):
        raise ValueError(f"Invalid email format: {email}")
    return email

def validate_positive_int(value: Any) -> int:
    """Validate that *value* is an integer strictly greater than zero.

    Args:
        value: Candidate value of any type.

    Returns:
        The value, unchanged, when it is a positive ``int``.

    Raises:
        TypeError: If the value is not an ``int`` (``bool`` is rejected too).
        ValueError: If the value is zero or negative.
    """
    # bool is a subclass of int, so True would otherwise pass as the integer 1.
    if isinstance(value, bool) or not isinstance(value, int):
        raise TypeError(f"Expected int, got {type(value).__name__}")
    if value <= 0:
        raise ValueError(f"Value must be positive, got {value}")
    return value

def validate_non_empty_string(value: Any) -> str:
    """Return *value* if it is a string with at least one non-blank character.

    The string is returned exactly as given; surrounding whitespace is kept.

    Raises:
        TypeError: If the value is not a ``str``.
        ValueError: If the string is empty or whitespace only.
    """
    if isinstance(value, str):
        if value.strip():
            return value
        raise ValueError("String cannot be empty")
    raise TypeError(f"Expected string, got {type(value).__name__}")

Pattern 2: Schema-Based Validation

For complex data structures, schema-based validation is more maintainable. Here's a lightweight schema system:

from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field

@dataclass
class Field:
    """Declarative description of one entry in a :class:`Schema`."""

    # Python type the value must be an instance of.
    type: type
    # Whether a missing value (with no usable default) is an error.
    required: bool = True
    # Callables applied in order; each receives the current value and returns
    # the (possibly transformed) value or raises ValueError/TypeError.
    validators: List[Callable] = field(default_factory=list)
    # Fallback used when the value is absent; None means "no default".
    default: Any = None


class Schema:
    """Validate dictionaries against a mapping of field names to ``Field``."""

    def __init__(self, fields: Dict[str, Field]):
        self.fields = fields

    def validate(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Validate *data* and return a dict containing the accepted values.

        For every declared field the value is looked up in *data*, replaced by
        the field default when absent, type-checked, and then passed through
        the field's validators in order. All field errors are collected before
        raising, so the caller sees every problem at once.

        Args:
            data: Raw input mapping. Keys not declared in the schema are
                ignored.

        Returns:
            A new dict with one entry per declared field. Optional fields that
            are absent and have no default are present with value ``None``.

        Raises:
            ValidationError: If one or more fields are invalid; ``errors``
                maps each failing field name to its message.
        """
        validated = {}
        errors = {}

        for field_name, field_spec in self.fields.items():
            value = data.get(field_name)

            # Apply the default *before* the required check, otherwise a
            # required field with a default could never be omitted.
            if value is None and field_spec.default is not None:
                value = field_spec.default

            if value is None:
                if field_spec.required:
                    errors[field_name] = "Field is required"
                else:
                    # Absent optional field: nothing to type-check or validate.
                    validated[field_name] = None
                continue

            # Type validation
            if not isinstance(value, field_spec.type):
                errors[field_name] = (
                    f"Expected {field_spec.type.__name__}, "
                    f"got {type(value).__name__}"
                )
                continue

            # Custom validators; the first failure wins for this field.
            for validator in field_spec.validators:
                try:
                    value = validator(value)
                except (ValueError, TypeError) as e:
                    errors[field_name] = str(e)
                    break
            else:
                validated[field_name] = value

        if errors:
            raise ValidationError(errors)

        return validated


class ValidationError(Exception):
    """Raised when validation fails; ``errors`` maps field name to message."""

    def __init__(self, errors: Dict[str, str]):
        self.errors = errors
        super().__init__(str(errors))

Usage Example

# Define validators
def validate_phone(phone: str) -> str:
    """Return *phone* if it starts with '+' and is at least 10 characters long.

    Raises:
        ValueError: If either condition is not met.
    """
    well_formed = phone.startswith('+') and len(phone) >= 10
    if well_formed:
        return phone
    raise ValueError("Invalid phone number format")

# Create schema
user_schema = Schema({
    'name': Field(str, validators=[validate_non_empty_string]),
    'email': Field(str, validators=[validate_email]),
    'age': Field(int, validators=[validate_positive_int]),
    'phone': Field(str, required=False, validators=[validate_phone]),
    # Optional with a fallback: when 'role' is omitted, 'user' is used.
    'role': Field(str, required=False, default='user'),
})

# Validate data ('role' is deliberately omitted to exercise the default)
user_data = {
    'name': 'Alice',
    'email': 'alice@example.com',
    'age': 30,
    'phone': '+1234567890',
}

try:
    validated_user = user_schema.validate(user_data)
except ValidationError as e:
    print(f"Validation failed: {e.errors}")
else:
    print(f"Validated user: {validated_user}")

Pattern 3: Pydantic Integration

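For production systems, I recommend using Pydantic for complex validation. It provides powerful built-in validators and serialization. Note that the example below uses the Pydantic v1 API (`validator`, `Field(regex=...)`, `.dict()`); in Pydantic v2 the equivalents are `field_validator`, `Field(pattern=...)`, and `.model_dump()`: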

from pydantic import BaseModel, Field, validator
from typing import Optional
from datetime import datetime
import re

class ProductCreate(BaseModel):
    # Payload for creating a product. Basic constraints are declared on the
    # fields; the validators below add rules the field options cannot express.
    # (Documented with comments rather than a class docstring so the model's
    # generated schema description is left untouched.)
    name: str = Field(..., min_length=1, max_length=100)
    price: float = Field(..., gt=0)
    sku: str = Field(..., regex=r'^[A-Z]{2}-\d{4}$')
    tags: list[str] = Field(default_factory=list)
    created_at: Optional[datetime] = None

    @validator('name')
    def name_must_be_meaningful(cls, v):
        """Require at least one ASCII letter, then trim surrounding whitespace."""
        has_letter = re.search(r'[a-zA-Z]', v) is not None
        if has_letter:
            return v.strip()
        raise ValueError('Name must contain at least one letter')

    @validator('price')
    def price_must_have_two_decimals(cls, v):
        """Reject a price that changes when rounded to two decimal places."""
        if v != round(v, 2):
            raise ValueError('Price must have at most 2 decimal places')
        return v

# Usage
try:
    product = ProductCreate(
        name=' Wireless Mouse ',
        price=29.99,
        # Must match ^[A-Z]{2}-\d{4}$: two capitals, a dash, four digits.
        sku='WM-0001',
        tags=['electronics', 'accessories'],
    )
    print(f"Validated product: {product.dict()}")
# Pydantic's ValidationError subclasses ValueError, so this catches validation
# failures without hiding unrelated bugs the way a blanket Exception would.
except ValueError as e:
    print(f"Validation error: {e}")

Pattern 4: Validation Pipeline

For complex workflows, chain validators together in a pipeline:

from typing import List, Tuple, Any

class ValidationPipeline:
    """Run a value through an ordered list of named validation steps."""

    def __init__(self):
        # (step name, validator) pairs in execution order.
        self.steps: List[Tuple[str, Callable]] = []

    def add_step(self, name: str, validator: Callable):
        """Append a step; *validator* takes the current value and returns the next."""
        self.steps.append((name, validator))

    def process(self, data: Any) -> Any:
        """Feed *data* through every step and return the final value.

        Each step receives the previous step's return value. A line is printed
        for every step that passes.

        Raises:
            RuntimeError: If a step raises ``ValueError`` or ``TypeError``. The
                message names the failing step and the original exception is
                attached as ``__cause__``.
        """
        for step_name, validator in self.steps:
            try:
                data = validator(data)
            except (ValueError, TypeError) as e:
                # Chain explicitly so the traceback shows the root cause as the
                # direct cause rather than "during handling of ...".
                raise RuntimeError(
                    f"Validation failed at step '{step_name}': {e}"
                ) from e
            print(f"Step '{step_name}' passed")
        return data

# Example pipeline
def require_company_domain(email: str) -> str:
    """Accept only addresses whose domain is company.com or a subdomain of it.

    Comparing the domain part (rather than searching for the substring
    'company.com' anywhere in the address) rejects look-alikes such as
    'user@evilcompany.com' or 'company.com@evil.org'.

    Raises:
        ValueError: If the address belongs to any other domain.
    """
    domain = email.rpartition('@')[2].lower()
    if domain != 'company.com' and not domain.endswith('.company.com'):
        raise ValueError("Not company email")
    return email


pipeline = ValidationPipeline()
# str.strip raises TypeError on non-string input, which the pipeline reports
# as a failed step instead of crashing with AttributeError.
pipeline.add_step("sanitize", str.strip)
pipeline.add_step("email_format", validate_email)
pipeline.add_step("check_domain", require_company_domain)

# Usage
try:
    result = pipeline.process(" user@company.com ")
    print(f"Pipeline result: {result}")
except RuntimeError as e:
    print(f"Pipeline failed: {e}")

Pattern 5: Async Validation

For I/O-bound validation (database lookups, API calls), use async validators:

import asyncio
from typing import Any, Awaitable

class AsyncValidator:
    """Run several async validators concurrently against the same value."""

    def __init__(self):
        # Async callables; each is called with the value under validation.
        self.validators: list[Callable[[Any], Awaitable[Any]]] = []

    async def validate(self, data: Any) -> Any:
        """Run every validator on *data* concurrently and return *data*.

        All validators are awaited even if some fail, so every failure message
        is reported together. Validator return values are ignored.

        Raises:
            ValidationError: If any validator raised an ``Exception``; the
                messages are listed under the ``"async"`` key.
            BaseException: Non-``Exception`` outcomes such as
                ``asyncio.CancelledError`` are re-raised unchanged.
        """
        results = await asyncio.gather(
            *(check(data) for check in self.validators),
            return_exceptions=True,
        )

        errors = []
        for result in results:
            if isinstance(result, Exception):
                errors.append(str(result))
            elif isinstance(result, BaseException):
                # With return_exceptions=True a cancelled validator comes back
                # as a CancelledError *value*. It is not an Exception subclass,
                # so without this branch it would be counted as a pass.
                raise result

        if errors:
            raise ValidationError({"async": errors})
        return data

# Example async validators
async def check_user_exists(user_id: int) -> int:
    """Pretend to look the user up; id 999 stands in for a missing user.

    Returns:
        The same ``user_id`` when the lookup succeeds.

    Raises:
        ValueError: If ``user_id`` is 999.
    """
    await asyncio.sleep(0.1)  # placeholder for the database round trip
    user_missing = user_id == 999
    if user_missing:
        raise ValueError(f"User {user_id} not found")
    return user_id

async def check_rate_limit(user_id: int) -> int:
    """Pretend to check the rate limit; id 500 stands in for a throttled user.

    Returns:
        The same ``user_id`` when the user is within the limit.

    Raises:
        ValueError: If ``user_id`` is 500.
    """
    await asyncio.sleep(0.05)  # placeholder for the rate-limit lookup
    throttled = user_id == 500
    if throttled:
        raise ValueError("Rate limit exceeded")
    return user_id

# Usage
async def main():
    """Demo: run both example validators against user id 123 and report."""
    validator = AsyncValidator()
    validator.validators = [check_user_exists, check_rate_limit]

    try:
        await validator.validate(123)
    except ValidationError as e:
        print(f"Async validation failed: {e}")
    else:
        print("Async validation passed")

# asyncio.run(main())

Error Handling Best Practices

  1. **Be explicit about errors