Testing Guide¶
This guide explains the testing framework and practices used in DeepCritical, including unit tests, integration tests, and testing best practices.
Testing Framework¶
DeepCritical uses a comprehensive testing framework with multiple test categories:
Test Categories¶
# Run all tests
make test
# Run specific test categories
make test unit_tests # Unit tests only
make test integration_tests # Integration tests only
make test performance_tests # Performance tests only
make test vllm_tests # VLLM-specific tests only
# Run tests with coverage
make test-cov
# Run tests excluding slow tests
make test-fast
Test Organization¶
Directory Structure¶
tests/
├── __init__.py
├── test_agents.py # Agent system tests
├── test_tools.py # Tool framework tests
├── test_workflows.py # Workflow execution tests
├── test_datatypes.py # Data type validation tests
├── test_configuration.py # Configuration tests
├── test_integration.py # End-to-end integration tests
└── test_performance.py # Performance and load tests
Test Naming Conventions¶
# Unit tests
def test_function_name():
    """Test specific function behavior."""

def test_function_name_edge_cases():
    """Test edge cases and error conditions."""

# Integration tests
def test_workflow_integration():
    """Test complete workflow execution."""

def test_cross_component_interaction():
    """Test interaction between components."""

# Performance tests
def test_performance_under_load():
    """Test performance with high load."""

def test_memory_usage():
    """Test memory usage patterns."""
Writing Tests¶
Unit Tests¶
import pytest
from unittest.mock import patch

from deepresearch.agents import SearchAgent
from deepresearch.datatypes import AgentDependencies, AgentStatus, AgentType

def test_search_agent_initialization():
    """Test SearchAgent initialization."""
    agent = SearchAgent()
    assert agent.agent_type == AgentType.SEARCH
    assert agent.status == AgentStatus.IDLE

@pytest.mark.asyncio
async def test_search_agent_execution():
    """Test SearchAgent execution."""
    agent = SearchAgent()
    deps = AgentDependencies()

    # Mock the external search dependency
    with patch('deepresearch.tools.web_search') as mock_search:
        mock_search.return_value = "mock results"
        result = await agent.execute("test query", deps)

        assert result.success
        assert result.data == "mock results"
        mock_search.assert_called_once()

@pytest.mark.asyncio
async def test_search_agent_error_handling():
    """Test SearchAgent error handling."""
    agent = SearchAgent()
    deps = AgentDependencies()

    # Test with invalid input
    result = await agent.execute(None, deps)
    assert not result.success
    assert result.error is not None
Integration Tests¶
import pytest
from deepresearch.app import main

@pytest.mark.integration
async def test_full_workflow_execution():
    """Test complete workflow execution."""
    result = await main(
        question="What is machine learning?",
        flows={"prime": {"enabled": False}}
    )

    assert result.success
    assert result.data is not None
    assert len(result.execution_history.entries) > 0

@pytest.mark.integration
async def test_multi_flow_integration():
    """Test integration between multiple flows."""
    result = await main(
        question="Analyze protein function",
        flows={
            "prime": {"enabled": True},
            "bioinformatics": {"enabled": True}
        }
    )

    assert result.success

    # Verify results from both flows
    assert "prime_results" in result.data
    assert "bioinformatics_results" in result.data
Performance Tests¶
import os
import time

import psutil
import pytest
from deepresearch.app import main

@pytest.mark.performance
async def test_execution_time():
    """Test execution time requirements."""
    start_time = time.time()
    result = await main(question="Performance test query")
    execution_time = time.time() - start_time

    # Should complete within reasonable time
    assert execution_time < 300  # 5 minutes
    assert result.success

@pytest.mark.performance
async def test_memory_usage():
    """Test memory usage during execution."""
    process = psutil.Process(os.getpid())
    initial_memory = process.memory_info().rss / 1024 / 1024  # MB

    result = await main(question="Memory usage test")

    final_memory = process.memory_info().rss / 1024 / 1024  # MB
    memory_increase = final_memory - initial_memory

    # Memory increase should be reasonable
    assert memory_increase < 500  # Less than 500MB increase
    assert result.success
Test Configuration¶
Test Configuration Files¶
# tests/test_config.yaml
test_settings:
  mock_external_apis: true
  use_test_databases: true
  enable_performance_monitoring: true

timeouts:
  unit_test: 30
  integration_test: 300
  performance_test: 600

resources:
  max_memory_mb: 1000
  max_execution_time: 300
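A session-scoped fixture in tests/conftest.py can expose these settings to individual tests. The following is a minimal sketch, not part of DeepCritical's API: it assumes PyYAML is installed and that the file lives at tests/test_config.yaml with the layout shown above; the fixture name shared_test_config is illustrative.

# tests/conftest.py (sketch; assumes PyYAML and the layout shown above)
from pathlib import Path

import pytest
import yaml

@pytest.fixture(scope="session")
def shared_test_config():
    """Parse tests/test_config.yaml once per test session."""
    config_path = Path(__file__).parent / "test_config.yaml"
    with config_path.open() as fh:
        return yaml.safe_load(fh)

# In a test module, the fixture is consumed as a normal argument:
def test_unit_timeout_is_configured(shared_test_config):
    """Example consumer: check the configured unit-test timeout."""
    assert shared_test_config["timeouts"]["unit_test"] == 30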
Test Fixtures¶
# tests/conftest.py
import pytest
from unittest.mock import patch

from deepresearch.datatypes import AgentDependencies, ResearchState

@pytest.fixture
def sample_dependencies():
    """Provide sample agent dependencies for tests."""
    return AgentDependencies(
        model_name="anthropic:claude-sonnet-4-0",
        api_keys={"anthropic": "test-key"},
        config={"temperature": 0.7}
    )

@pytest.fixture
def sample_research_state():
    """Provide sample research state for tests."""
    return ResearchState(
        question="Test question",
        plan=["step1", "step2"],
        agent_results={},
        tool_outputs={}
    )

@pytest.fixture
def mock_tool_registry():
    """Mock tool registry for isolated testing."""
    with patch('deepresearch.tools.base.registry') as mock_registry:
        yield mock_registry
Testing Best Practices¶
1. Test Isolation¶
# Use fixtures for test isolation
def test_isolated_functionality(sample_dependencies):
    """Test with isolated dependencies."""
    # Test implementation using fixture
    pass

# Avoid global state in tests
def test_without_global_state():
    """Test without relying on global state."""
    # Create fresh instances for each test
    pass
2. Mocking External Dependencies¶
from unittest.mock import patch, MagicMock

def test_with_mocked_external_api():
    """Test with mocked external API calls."""
    with patch('requests.get') as mock_get:
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"data": "test"}
        mock_get.return_value = mock_response

        # Test implementation
        result = call_external_api()
        assert result == {"data": "test"}
3. Async Testing¶
import pytest

@pytest.mark.asyncio
async def test_async_functionality():
    """Test async functions properly."""
    result = await async_function()
    assert result.success

# For testing async context managers
@pytest.mark.asyncio
async def test_async_context_manager():
    """Test async context managers."""
    async with async_context_manager() as manager:
        result = await manager.do_something()
        assert result is not None
4. Parameterized Tests¶
import pytest
from deepresearch.app import main

@pytest.mark.parametrize("input_data,expected", [
    ("test1", "result1"),
    ("test2", "result2"),
    ("test3", "result3"),
])
def test_parameterized_functionality(input_data, expected):
    """Test function with multiple parameter sets."""
    result = process_data(input_data)
    assert result == expected

@pytest.mark.asyncio
@pytest.mark.parametrize("flow_enabled", [True, False])
@pytest.mark.parametrize("config_override", ["config1", "config2"])
async def test_flow_combinations(flow_enabled, config_override):
    """Test different flow and configuration combinations."""
    result = await main(
        question="Test query",
        flows={"test_flow": {"enabled": flow_enabled}},
        config_name=config_override
    )
    assert result.success
Specialized Testing¶
Tool Testing¶
from deepresearch.tools import ToolCategory, ToolRunner, ToolSpec

def test_custom_tool():
    """Test custom tool implementation."""
    tool = CustomTool()  # example custom tool under test

    # Test tool specification
    spec = tool.get_spec()
    assert spec.name == "custom_tool"
    assert spec.category == ToolCategory.ANALYTICS

    # Test tool execution
    result = tool.run({"input": "test_data"})
    assert result.success
    assert "output" in result.data

def test_tool_error_handling():
    """Test tool error conditions."""
    tool = CustomTool()

    # Test with invalid input
    result = tool.run({"invalid": "input"})
    assert not result.success
    assert result.error is not None
Agent Testing¶
import pytest
from deepresearch.agents import SearchAgent
from deepresearch.datatypes import AgentDependencies, AgentStatus

@pytest.mark.asyncio
async def test_agent_lifecycle():
    """Test complete agent lifecycle."""
    agent = SearchAgent()

    # Test initialization
    assert agent.status == AgentStatus.IDLE

    # Test execution
    result = await agent.execute("test query", AgentDependencies())
    assert result.success

    # Test cleanup
    agent.cleanup()
    assert agent.status == AgentStatus.IDLE
Workflow Testing¶
import pytest
from deepresearch.app import main

@pytest.mark.integration
async def test_workflow_error_recovery():
    """Test workflow error recovery mechanisms."""
    # Test with failing components
    result = await main(
        question="Test error recovery",
        enable_error_recovery=True,
        max_retries=3
    )

    # Should either succeed or provide meaningful error information
    assert result is not None
    if not result.success:
        assert result.error is not None
        assert len(result.error_history) > 0
Tool Testing¶
Testing Custom Tools¶
DeepCritical provides comprehensive testing support for custom tools:
Tool Unit Testing¶
import pytest
from deepresearch.src.tools.base import ToolRunner, ExecutionResult

class TestCustomTool:
    """Test cases for custom tool implementation."""

    @pytest.fixture
    def tool(self):
        """Create tool instance for testing."""
        return CustomTool()

    def test_tool_specification(self, tool):
        """Test tool specification is correctly defined."""
        spec = tool.get_spec()

        assert spec.name == "custom_tool"
        assert spec.category.value == "custom"
        assert "input_param" in spec.inputs
        assert "output_result" in spec.outputs

    def test_tool_execution_success(self, tool):
        """Test successful tool execution."""
        result = tool.run({
            "input_param": "test_value",
            "options": {"verbose": True}
        })

        assert isinstance(result, ExecutionResult)
        assert result.success
        assert "output_result" in result.data
        assert result.execution_time > 0
Tool Integration Testing¶
import pytest
from deepresearch.src.utils.tool_registry import ToolRegistry

class TestToolIntegration:
    """Integration tests for tool registry and execution."""

    @pytest.fixture
    def registry(self):
        """Get tool registry instance."""
        return ToolRegistry.get_instance()

    def test_tool_registration(self, registry):
        """Test tool registration in registry."""
        tool = CustomTool()
        registry.register_tool(tool.get_spec(), tool)

        # Verify tool is registered
        assert "custom_tool" in registry.list_tools()
        spec = registry.get_tool_spec("custom_tool")
        assert spec.name == "custom_tool"

    def test_tool_execution_through_registry(self, registry):
        """Test tool execution through registry."""
        tool = CustomTool()
        registry.register_tool(tool.get_spec(), tool)

        result = registry.execute_tool("custom_tool", {
            "input_param": "registry_test"
        })

        assert result.success
        assert result.data["output_result"] == "processed: registry_test"
Testing Best Practices for Tools¶
Tool Test Organization¶
# tests/tools/test_custom_tool.py
import pytest
from deepresearch.src.tools.custom_tool import CustomTool

class TestCustomTool:
    """Comprehensive test suite for CustomTool."""

    # Unit tests
    def test_initialization(self): ...
    def test_input_validation(self): ...
    def test_output_formatting(self): ...

    # Integration tests
    def test_registry_integration(self): ...
    def test_workflow_integration(self): ...

    # Performance tests
    def test_execution_performance(self): ...
    def test_memory_usage(self): ...
Continuous Integration Testing¶
CI Test Configuration¶
# .github/workflows/test.yml
test:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: ['3.10', '3.11']

  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e .
        pip install -e ".[dev]"

    - name: Run tests
      run: make test

    - name: Run tests with coverage
      run: make test-cov

    - name: Upload coverage
      uses: codecov/codecov-action@v3
      with:
        file: ./coverage.xml
Test Markers¶
# Use pytest markers for test categorization
@pytest.mark.unit
def test_unit_functionality():
    """Unit test marker."""
    pass

@pytest.mark.integration
@pytest.mark.slow
async def test_integration_functionality():
    """Integration test that may be slow."""
    pass

@pytest.mark.performance
@pytest.mark.skip(reason="Requires significant resources")
async def test_performance_benchmark():
    """Performance test that may be skipped in CI."""
    pass

# Run specific marker categories
# pytest -m "unit"                      # Unit tests only
# pytest -m "integration and not slow"  # Fast integration tests
# pytest -m "not performance"           # Exclude performance tests
Test Data Management¶
Test Data Fixtures¶
# tests/fixtures/test_data.py
import pytest

@pytest.fixture
def sample_protein_data():
    """Sample protein data for testing."""
    return {
        "accession": "P04637",
        "name": "Cellular tumor antigen p53",
        "sequence": "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP",
        "organism": "Homo sapiens"
    }

@pytest.fixture
def sample_go_annotations():
    """Sample GO annotations for testing."""
    return [
        {
            "gene_id": "TP53",
            "go_id": "GO:0003677",
            "go_term": "DNA binding",
            "evidence_code": "IDA"
        }
    ]
Test Database Setup¶
# tests/conftest.py
import pytest
from sqlalchemy import create_engine

@pytest.fixture(scope="session")
def test_database():
    """Set up test database."""
    # Create an in-memory SQLite engine for tests
    engine = create_engine("sqlite:///:memory:", echo=False)

    # Initialize schema; `Base` is the project's SQLAlchemy declarative base
    Base.metadata.create_all(engine)

    yield engine

    # Cleanup
    engine.dispose()
Performance Testing¶
Benchmark Tests¶
import pytest

def test_function_performance(benchmark):
    """Benchmark function performance (the benchmark fixture comes from pytest-benchmark)."""
    result = benchmark(process_large_dataset, large_dataset)
    assert result is not None

def test_memory_usage():
    """Test memory usage patterns."""
    import tracemalloc

    tracemalloc.start()

    # Execute function
    result = process_data(large_input)

    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # Check memory usage
    assert current < 100 * 1024 * 1024  # Less than 100MB
    assert peak < 200 * 1024 * 1024     # Peak less than 200MB
Load Testing¶
import asyncio
import time

import pytest
from deepresearch.app import main

@pytest.mark.load
async def test_concurrent_execution():
    """Test concurrent execution performance."""
    # Test with multiple concurrent requests
    tasks = [
        main(question=f"Query {i}") for i in range(10)
    ]

    start_time = time.time()
    results = await asyncio.gather(*tasks)
    execution_time = time.time() - start_time

    # Check performance requirements
    assert execution_time < 60  # Complete within 60 seconds
    assert all(result.success for result in results)
Debugging Tests¶
Test Debugging Techniques¶
import logging

logger = logging.getLogger(__name__)

def test_with_debugging():
    """Test with detailed debugging information."""
    # Enable debug logging
    logging.basicConfig(level=logging.DEBUG)

    # Execute with debug information
    result = function_under_test()

    # Log intermediate results
    logger.debug(f"Intermediate result: {result}")

    assert result.success
Test Failure Analysis¶
import logging

import pytest
from deepresearch.app import main

logger = logging.getLogger(__name__)

@pytest.mark.asyncio
async def test_failure_analysis():
    """Analyze test failures systematically."""
    try:
        result = await main(question="Test query")
        assert result.success
    except AssertionError as e:
        # Log failure details for debugging
        logger.error(f"Test failed: {e}")
        logger.error(f"Result data: {result.data if 'result' in locals() else 'N/A'}")
        logger.error(f"Error details: {result.error if 'result' in locals() else 'N/A'}")

        # Re-raise for test framework
        raise
Test Coverage¶
Coverage Requirements¶
# Run tests with coverage
import pytest

def test_coverage_requirements():
    """Ensure adequate test coverage."""
    # Aim for >80% overall coverage
    # >90% coverage for critical paths
    # 100% coverage for error conditions

    coverage = pytest.main([
        "--cov=deepresearch",
        "--cov-report=html",
        "--cov-report=term-missing",
        "--cov-fail-under=80"
    ])

    assert coverage == 0  # No test failures
Coverage Exclusions¶
# setup.cfg
[tool:pytest]
addopts = --cov=deepresearch --cov-report=html --cov-report=term-missing
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Exclude certain files from coverage
[coverage:run]
omit =
    */tests/*
    */test_*.py
    */conftest.py
    deepresearch/__init__.py
    deepresearch/scripts/*
Best Practices¶
- Test Early and Often: Write tests as you develop features
- Keep Tests Fast: Unit tests should run quickly (<1 second each)
- Test in Isolation: Each test should be independent
- Use Descriptive Names: Test names should explain what they test
- Test Error Conditions: Include tests for failure cases
- Mock External Dependencies: Avoid relying on external services in tests (see the combined sketch after this list)
- Use Fixtures: Create reusable test data and setup
- Document Test Intent: Explain why each test exists
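The sketch below pulls several of these practices together: a fixture instead of global state, a mocked external call, descriptive test names, and an explicit error-condition test. fetch_summary and its endpoint are hypothetical placeholders defined only for illustration; they are not part of DeepCritical.

# Minimal sketch combining fixtures, mocking, descriptive names, and error cases.
# fetch_summary is a hypothetical helper, defined here only to keep the example runnable.
from unittest.mock import patch

import pytest
import requests

def fetch_summary(query: str) -> dict:
    """Hypothetical function under test: calls an external API."""
    if not query:
        raise ValueError("query must not be empty")
    response = requests.get("https://example.org/summarize", params={"q": query})
    return response.json()

@pytest.fixture
def query_text():
    """Reusable test input instead of module-level global state."""
    return "What is machine learning?"

def test_fetch_summary_returns_parsed_payload(query_text):
    """Happy path: the HTTP call is mocked, so the test is isolated and fast."""
    with patch("requests.get") as mock_get:
        mock_get.return_value.json.return_value = {"summary": "stubbed"}
        result = fetch_summary(query_text)
    assert result == {"summary": "stubbed"}

def test_fetch_summary_rejects_empty_query():
    """Error condition: invalid input raises before any network call is made."""
    with pytest.raises(ValueError):
        fetch_summary("")

Because requests.get is patched, the happy-path test never touches the network, which keeps it deterministic and fast.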
Troubleshooting¶
Common Test Issues¶
Flaky Tests:
# Use retry for flaky tests (requires the pytest-rerunfailures plugin)
@pytest.mark.flaky(reruns=3)
async def test_flaky_functionality():
    """Test that may occasionally fail."""
    pass
Slow Tests:
# Mark slow tests to skip in fast mode
@pytest.mark.slow
async def test_slow_operation():
    """Test that takes significant time."""
    pass

# Run fast tests only
# pytest -m "not slow"
Resource-Intensive Tests:
# Mark tests that require significant resources
@pytest.mark.resource_intensive
async def test_large_dataset_processing():
    """Test with large datasets."""
    pass

# Run on CI with resource allocation
# pytest -m "resource_intensive" --maxfail=1
For more information about testing patterns and examples, see the Contributing Guide and CI/CD Guide.