Testing with AI
Learn how to use SteadyText to build reliable AI tests that never flake.
The Problem with AI Testing
Traditional AI testing is challenging because:
- Non-deterministic outputs: Same input produces different results
- Flaky tests: Tests pass on some runs and fail on others
- Hard to mock: AI services are complex to replicate
- Unpredictable behavior: Edge cases are difficult to reproduce
SteadyText solves these problems by providing deterministic AI outputs: the same input always produces the same result.
Basic Test Patterns
Deterministic Assertions
import steadytext
def test_ai_code_generation():
    """Compare the AI function's answer with SteadyText's for one prompt.

    ``steadytext.generate`` supplies the reference text. Surrounding
    whitespace is stripped from both sides before they are compared.
    """
    request_text = "write a function to reverse a string"

    def my_ai_function(prompt):
        # Stand-in for the production AI call (GPT-4, Claude, etc.).
        # ``call_real_ai_service`` has to be supplied by the application.
        answer = call_real_ai_service(prompt)
        return answer

    actual = my_ai_function(request_text)
    reference = steadytext.generate(request_text)

    # Ignore leading/trailing whitespace differences between the two texts.
    assert actual.strip() == reference.strip()
Embedding Similarity Tests
import numpy as np
def test_document_similarity():
    """Check that two related phrases score strictly between 0.7 and 1.0.

    The score is the dot product of the two SteadyText embeddings.
    """
    def calculate_similarity(doc1, doc2):
        # The embeddings are treated as already normalized, so the dot
        # product is used directly as the similarity score.
        first_vec, second_vec = steadytext.embed(doc1), steadytext.embed(doc2)
        return np.dot(first_vec, second_vec)

    score = calculate_similarity(
        "machine learning algorithms",
        "artificial intelligence methods",
    )

    assert score > 0.7  # lower bound: the phrases are related
    assert score < 1.0  # upper bound: the phrases are not identical
Mock AI Services
Simple Mock
class MockAI:
    """Stand-in AI client whose methods delegate to SteadyText."""

    def complete(self, prompt: str) -> str:
        """Return the SteadyText generation for ``prompt``."""
        completion = steadytext.generate(prompt)
        return completion

    def embed(self, text: str) -> np.ndarray:
        """Return the SteadyText embedding for ``text``."""
        vector = steadytext.embed(text)
        return vector

    def chat(self, messages: list) -> str:
        """Generate a reply to a chat transcript.

        Each message is indexed by ``'role'`` and ``'content'``. The
        transcript is flattened to one ``role: content`` line per message
        and wrapped in a single generation prompt.
        """
        lines = []
        for msg in messages:
            lines.append(f"{msg['role']}: {msg['content']}")
        transcript = "\n".join(lines)
        return steadytext.generate(f"Chat response to: {transcript}")
# Usage in tests
def test_chat_functionality():
    """Send a single user greeting through MockAI and inspect the reply."""
    client = MockAI()
    greeting = [{"role": "user", "content": "Hello"}]
    reply = client.chat(greeting)

    # The reply must be non-empty and mention the greeting (case-insensitive).
    assert len(reply) > 0
    assert "hello" in reply.lower()
Advanced Mock with State
class StatefulMockAI:
    """Mock AI client that remembers the conversation between calls."""

    def __init__(self):
        # Alternating "User: ..." / "AI: ..." lines, oldest first.
        self.conversation_history = []

    def chat(self, message: str) -> str:
        """Generate a reply to ``message`` and record the exchange.

        Only the five most recent history lines are placed in the prompt.
        """
        recent_lines = self.conversation_history[-5:]
        context = "\n".join(recent_lines)
        reply = steadytext.generate(f"History: {context}\nNew message: {message}")

        # Record the user's turn first, then the generated answer.
        self.conversation_history.extend((f"User: {message}", f"AI: {reply}"))
        return reply
def test_conversation_flow():
    """Run a two-turn conversation and compare the replies."""
    bot = StatefulMockAI()
    first_reply = bot.chat("What's the weather like?")
    second_reply = bot.chat("What about tomorrow?")

    # Neither turn may come back empty.
    for reply in (first_reply, second_reply):
        assert len(reply) > 0

    # The follow-up prompt includes the recorded history, so its reply is
    # expected to differ from the first one.
    assert second_reply != first_reply
Test Data Generation
Reproducible Fixtures
def generate_test_user(user_id: int) -> dict:
    """Build a test-user record from SteadyText output.

    Args:
        user_id: Identifier embedded in every prompt.

    Returns:
        A dict with the keys ``id``, ``name``, ``bio``, ``interests`` and
        ``embedding``.
    """
    profile = {"id": user_id}
    profile["name"] = steadytext.generate(f"Generate name for user {user_id}")
    profile["bio"] = steadytext.generate(f"Write bio for user {user_id}")
    profile["interests"] = steadytext.generate(f"List interests for user {user_id}")
    profile["embedding"] = steadytext.embed(f"user {user_id} profile")
    return profile
def test_user_recommendation():
    """Test user recommendation system.

    Builds ten fixture users and checks that the similarity of the first two
    is a plain Python float inside the cosine-similarity range.
    """
    # Generate consistent test users
    users = [generate_test_user(i) for i in range(10)]
    user1, user2 = users[0], users[1]

    # np.dot on two 1-D arrays returns a NumPy scalar. For float32 inputs
    # that scalar is np.float32, which is not a subclass of Python's float,
    # so convert explicitly before the isinstance check below.
    similarity = float(np.dot(user1["embedding"], user2["embedding"]))

    # Similarity is always the same for these users
    assert isinstance(similarity, float)
    # Range check assumes the embeddings are unit-normalized - TODO confirm.
    assert -1.0 <= similarity <= 1.0
Fuzz Testing
def generate_fuzz_input(test_name: str, iteration: int) -> str:
    """Return SteadyText-generated fuzz text for one test iteration.

    Args:
        test_name: Name of the test the input is generated for.
        iteration: Iteration number. It is part of the prompt, so each
            iteration uses a distinct prompt.
    """
    return steadytext.generate(
        f"Generate test input for {test_name} iteration {iteration}"
    )
def test_parser_robustness():
    """Fuzz test with reproducible inputs.

    Runs the parser over 100 generated inputs. Any input that makes the
    parser raise is reported and recorded, and the test fails once at the
    end so a single run lists every offending iteration. A malformed parse
    result fails immediately.
    """
    def parse_user_input(text):
        # Your parsing function
        return {"words": text.split(), "length": len(text)}

    failed_iterations = []

    # Generate 100 consistent fuzz inputs
    for i in range(100):
        fuzz_input = generate_fuzz_input("parser_test", i)
        try:
            result = parse_user_input(fuzz_input)
        except Exception as e:
            # Reproducible error case: report it and keep going so every
            # failing input is listed in one run.
            print(f"Fuzz input {i} caused error: {e}")
            print(f"Input was: {fuzz_input[:100]}...")
            failed_iterations.append(i)
            continue

        # These checks sit outside the try block on purpose: AssertionError
        # is an Exception subclass and would otherwise be swallowed by the
        # handler above.
        assert isinstance(result, dict)
        assert "words" in result
        assert "length" in result

    assert not failed_iterations, (
        f"{len(failed_iterations)} fuzz input(s) raised: {failed_iterations}"
    )
Integration Testing
API Testing
import requests_mock
def test_ai_api_integration():
    """Test integration with AI API using deterministic responses.

    The HTTP endpoint is replaced by ``requests_mock``. The mocked handler
    answers each POST with the SteadyText generation for the posted prompt.
    """
    # The client call below uses ``requests`` itself. Importing only
    # ``requests_mock`` does not bind that name, so import it here.
    import requests

    url = "https://api.ai-service.com/generate"

    def generate_response(request, context):
        # Called by requests_mock for each matched request; the returned
        # dict is serialized as the JSON response body.
        prompt = request.json().get("prompt", "")
        return {"response": steadytext.generate(prompt)}

    with requests_mock.Mocker() as m:
        # Mock the AI API with deterministic responses
        m.post(url, json=generate_response)

        # Your actual API client code
        response = requests.post(url, json={"prompt": "Hello world"})

        # Response is always the same
        expected_text = steadytext.generate("Hello world")
        assert response.json()["response"] == expected_text
Database Testing
import sqlite3
def test_ai_content_storage():
    """Test storing AI-generated content in database.

    Writes one generated text and its embedding (as raw bytes) to an
    in-memory SQLite table, then reads the row back and checks each column.
    """
    # Create in-memory database
    conn = sqlite3.connect(":memory:")
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE content (
                id INTEGER PRIMARY KEY,
                prompt TEXT,
                generated_text TEXT,
                embedding BLOB
            )
        """)

        # Generate deterministic content
        prompt = "Write a short story about AI"
        text = steadytext.generate(prompt)
        embedding = steadytext.embed(text)

        # Store in database
        cursor.execute("""
            INSERT INTO content (prompt, generated_text, embedding)
            VALUES (?, ?, ?)
        """, (prompt, text, embedding.tobytes()))

        # Verify storage (same connection, so no commit is needed to read
        # the row back)
        cursor.execute("SELECT * FROM content WHERE id = 1")
        row = cursor.fetchone()
        assert row[1] == prompt
        assert row[2] == text
        assert len(row[3]) == 1024 * 4  # 1024 float32 values
    finally:
        # Close the connection even when a check above fails, so a failing
        # test does not leak it.
        conn.close()
Performance Testing
Consistency Benchmarks
import time
def test_generation_performance():
    """Test that generation performance is consistent.

    After one warm-up call, times ten repeated generations of the same
    prompt and requires the average to stay under 100 ms. It then checks
    that five further generations return identical text.
    """
    prompt = "Explain machine learning in one paragraph"

    # Warm up cache
    steadytext.generate(prompt)

    # Measure cached performance. perf_counter is a monotonic,
    # high-resolution clock intended for measuring short durations;
    # time.time() can jump when the system clock is adjusted.
    times = []
    for _ in range(10):
        start = time.perf_counter()
        steadytext.generate(prompt)
        times.append(time.perf_counter() - start)

    avg_time = sum(times) / len(times)

    # Cached calls should be very fast
    assert avg_time < 0.1  # Less than 100ms

    # All results should be identical
    results = [steadytext.generate(prompt) for _ in range(5)]
    assert all(r == results[0] for r in results)
Best Practices
Testing Guidelines
- Use deterministic prompts: Keep test prompts simple and specific
- Cache warmup: Call functions once before timing tests
- Mock external services: Use SteadyText to replace real AI APIs
- Test edge cases: Generate consistent edge case inputs
- Version pin: Keep SteadyText version fixed for test stability
Limitations
- Model changes: Updates to SteadyText models will change outputs
- Creative tasks: SteadyText is optimized for consistency, not creativity
- Context length: Limited to model's context window
Complete Example
import unittest
import numpy as np
import steadytext
class TestAIFeatures(unittest.TestCase):
    """unittest suite covering SteadyText generation, embeddings and MockAI."""

    def setUp(self):
        """Create the mock client and the prompts shared by the tests."""
        self.mock_ai = MockAI()
        self.test_prompts = [
            "Write a function to sort a list",
            "Explain what is machine learning",
            "Generate a product description",
        ]

    def test_deterministic_generation(self):
        """Generating twice from the same prompt yields equal text."""
        for prompt in self.test_prompts:
            first_run = steadytext.generate(prompt)
            second_run = steadytext.generate(prompt)
            self.assertEqual(first_run, second_run)

    def test_embedding_consistency(self):
        """Embedding the same text twice yields element-wise equal arrays."""
        sample = "test embedding consistency"
        first_vec, second_vec = steadytext.embed(sample), steadytext.embed(sample)
        np.testing.assert_array_equal(first_vec, second_vec)

    def test_mock_ai_service(self):
        """MockAI.complete returns a non-empty string, equal across calls."""
        first_reply = self.mock_ai.complete("Hello")
        self.assertIsInstance(first_reply, str)
        self.assertGreater(len(first_reply), 0)

        # A second call with the same prompt must give the same text.
        second_reply = self.mock_ai.complete("Hello")
        self.assertEqual(first_reply, second_reply)


# Run the suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
This comprehensive testing approach ensures your AI features are reliable, reproducible, and maintainable.