Bioinformatics Tools¶

DeepCritical provides bioinformatics tools for integrative biological data analysis, including gene ontology analysis, literature mining, protein structure prediction, and multi-source data fusion.

High-Level Analysis Tools¶

The bioinformatics tools integrate multiple biological databases and provide sophisticated analysis capabilities for gene function prediction, protein analysis, and biological data integration.

Data Sources¶

Gene Ontology (GO)¶

from deepresearch.tools.bioinformatics import GOAnnotationTool

# Initialize GO annotation tool
go_tool = GOAnnotationTool()

# Query GO annotations
annotations = await go_tool.query_annotations(
    gene_id="TP53",
    evidence_codes=["IDA", "EXP", "TAS"],
    organism="human",
    max_results=100
)

# Process annotations
for annotation in annotations:
    print(f"GO Term: {annotation.go_id}")
    print(f"Term Name: {annotation.term_name}")
    print(f"Evidence: {annotation.evidence_code}")
    print(f"Reference: {annotation.reference}")

PubMed Integration¶

from deepresearch.tools.bioinformatics import PubMedTool

# Initialize PubMed tool
pubmed_tool = PubMedTool()

# Search literature
papers = await pubmed_tool.search_and_fetch(
    query="TP53 AND cancer AND apoptosis",
    max_results=50,
    include_abstracts=True,
    year_min=2020
)

# Analyze papers
for paper in papers:
    print(f"PMID: {paper.pmid}")
    print(f"Title: {paper.title}")
    print(f"Abstract: {paper.abstract[:200]}...")

UniProt Integration¶

from deepresearch.tools.bioinformatics import UniProtTool

# Initialize UniProt tool
uniprot_tool = UniProtTool()

# Get protein information
protein_info = await uniprot_tool.get_protein_info(
    accession="P04637",
    include_sequences=True,
    include_features=True
)

print(f"Protein Name: {protein_info.name}")
print(f"Function: {protein_info.function}")
print(f"Sequence Length: {len(protein_info.sequence)}")

Analysis Tools¶

GO Enrichment Analysis¶

from deepresearch.tools.bioinformatics import GOEnrichmentTool

# Initialize enrichment tool
enrichment_tool = GOEnrichmentTool()

# Perform enrichment analysis
enrichment_results = await enrichment_tool.analyze_enrichment(
    gene_list=["TP53", "BRCA1", "EGFR", "MYC"],
    background_genes=["TP53", "BRCA1", "EGFR", "MYC", "RB1", "APC"],
    organism="human",
    p_value_threshold=0.05
)

# Display results
for result in enrichment_results:
    print(f"GO Term: {result.go_id}")
    print(f"P-value: {result.p_value}")
    print(f"Enrichment Ratio: {result.enrichment_ratio}")

Protein-Protein Interaction Analysis¶

from deepresearch.tools.bioinformatics import InteractionTool

# Initialize interaction tool
interaction_tool = InteractionTool()

# Get protein interactions
interactions = await interaction_tool.get_interactions(
    protein_id="P04637",
    interaction_types=["physical", "genetic"],
    confidence_threshold=0.7,
    max_interactions=50
)

# Analyze interaction network
for interaction in interactions:
    print(f"Interactor: {interaction.interactor}")
    print(f"Interaction Type: {interaction.interaction_type}")
    print(f"Confidence: {interaction.confidence}")

Pathway Analysis¶

from deepresearch.tools.bioinformatics import PathwayTool

# Initialize pathway tool
pathway_tool = PathwayTool()

# Analyze pathways
pathway_results = await pathway_tool.analyze_pathways(
    gene_list=["TP53", "BRCA1", "EGFR"],
    pathway_databases=["KEGG", "Reactome", "WikiPathways"],
    organism="human"
)

# Display pathway information
for pathway in pathway_results:
    print(f"Pathway: {pathway.name}")
    print(f"Database: {pathway.database}")
    print(f"Genes in pathway: {len(pathway.genes)}")

Structure Analysis Tools¶

Structure Prediction¶

from deepresearch.tools.bioinformatics import StructurePredictionTool

# Initialize structure prediction tool
structure_tool = StructurePredictionTool()

# Predict protein structure
structure_result = await structure_tool.predict_structure(
    sequence="MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
    method="alphafold2",
    include_confidence=True,
    use_templates=True
)

print(f"pLDDT Score: {structure_result.plddt_score}")
print(f"Structure Quality: {structure_result.quality}")

Structure Comparison¶

from deepresearch.tools.bioinformatics import StructureComparisonTool

# Initialize comparison tool
comparison_tool = StructureComparisonTool()

# Compare structures
comparison_result = await comparison_tool.compare_structures(
    structure1_pdb="1tup.pdb",
    structure2_pdb="predicted_structure.pdb",
    comparison_method="tm_align",
    include_visualization=True
)

print(f"RMSD: {comparison_result.rmsd}")
print(f"TM Score: {comparison_result.tm_score}")
print(f"Alignment Length: {comparison_result.alignment_length}")

Integration Tools¶

Multi-Source Data Fusion¶

from deepresearch.tools.bioinformatics import DataFusionTool

# Initialize fusion tool
fusion_tool = DataFusionTool()

# Fuse multiple data sources
fused_data = await fusion_tool.fuse_data_sources(
    go_annotations=go_annotations,
    literature=papers,
    interactions=interactions,
    expression_data=expression_data,
    quality_threshold=0.8,
    max_entities=1000
)

print(f"Fused entities: {len(fused_data.entities)}")
print(f"Confidence scores: {fused_data.confidence_scores}")

Evidence Integration¶

from deepresearch.tools.bioinformatics import EvidenceIntegrationTool

# Initialize evidence integration tool
evidence_tool = EvidenceIntegrationTool()

# Integrate evidence from multiple sources
integrated_evidence = await evidence_tool.integrate_evidence(
    go_evidence=go_evidence,
    literature_evidence=lit_evidence,
    experimental_evidence=exp_evidence,
    computational_evidence=comp_evidence,
    evidence_weights={
        "IDA": 1.0,
        "EXP": 0.9,
        "TAS": 0.8,
        "IMP": 0.7
    }
)

print(f"Integrated confidence: {integrated_evidence.confidence}")
print(f"Evidence summary: {integrated_evidence.evidence_summary}")

Advanced Analysis¶

Gene Set Enrichment Analysis (GSEA)¶

from deepresearch.tools.bioinformatics import GSEATool

# Initialize GSEA tool
gsea_tool = GSEATool()

# Perform GSEA
gsea_results = await gsea_tool.perform_gsea(
    gene_expression_data=expression_matrix,
    gene_sets=["hallmark_pathways", "go_biological_process"],
    permutations=1000,
    p_value_threshold=0.05
)

# Analyze results
for result in gsea_results:
    print(f"Gene Set: {result.gene_set_name}")
    print(f"ES Score: {result.enrichment_score}")
    print(f"P-value: {result.p_value}")
    print(f"FDR: {result.fdr}")

Network Analysis¶

from deepresearch.tools.bioinformatics import NetworkAnalysisTool

# Initialize network tool
network_tool = NetworkAnalysisTool()

# Analyze interaction network
network_analysis = await network_tool.analyze_network(
    interactions=interaction_data,
    analysis_types=["centrality", "clustering", "community_detection"],
    include_visualization=True
)

print(f"Network nodes: {network_analysis.node_count}")
print(f"Network edges: {network_analysis.edge_count}")
print(f"Clustering coefficient: {network_analysis.clustering_coefficient}")

Configuration¶

Tool Configuration¶

# configs/bioinformatics/tools.yaml
bioinformatics_tools:
  go_annotation:
    api_base_url: "https://api.geneontology.org"
    cache_enabled: true
    cache_ttl: 3600
    max_requests_per_minute: 60

  pubmed:
    api_base_url: "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    max_results: 100
    include_abstracts: true
    request_delay: 0.5

  uniprot:
    api_base_url: "https://rest.uniprot.org"
    include_sequences: true
    include_features: true

  structure_prediction:
    alphafold:
      max_model_len: 2000
      use_gpu: true
      recycle_iterations: 3

    esmfold:
      model_size: "650M"
      use_templates: true

Database Configuration¶

# configs/bioinformatics/data_sources.yaml
data_sources:
  go:
    enabled: true
    evidence_codes: ["IDA", "EXP", "TAS", "IMP"]
    year_min: 2020
    quality_threshold: 0.85

  pubmed:
    enabled: true
    max_results: 100
    include_full_text: false
    year_min: 2020

  string_db:
    enabled: true
    confidence_threshold: 0.7
    max_interactions: 1000

  kegg:
    enabled: true
    organism_codes: ["hsa", "mmu", "sce"]

Usage Examples¶

Gene Function Analysis¶

# Comprehensive gene function analysis
async def analyze_gene_function(gene_id: str):
    # Get GO annotations
    go_annotations = await go_tool.query_annotations(gene_id)

    # Get literature
    literature = await pubmed_tool.search_and_fetch(f"{gene_id} function")

    # Get interactions
    interactions = await interaction_tool.get_interactions(gene_id)

    # Fuse and analyze
    fused_result = await fusion_tool.fuse_data_sources(
        go_annotations=go_annotations,
        literature=literature,
        interactions=interactions
    )

    return fused_result

Protein Structure-Function Analysis¶

# Analyze protein structure and function
async def analyze_protein_structure_function(protein_id: str):
    # Get protein information
    protein_info = await uniprot_tool.get_protein_info(protein_id)

    # Predict structure if not available
    if not protein_info.pdb_id:
        structure = await structure_tool.predict_structure(protein_info.sequence)
    else:
        structure = await pdb_tool.get_structure(protein_info.pdb_id)

    # Analyze functional sites
    functional_sites = await function_tool.predict_functional_sites(structure)

    # Integrate findings
    integrated_analysis = await evidence_tool.integrate_evidence(
        sequence_evidence=protein_info,
        structure_evidence=structure,
        functional_evidence=functional_sites
    )

    return integrated_analysis

Best Practices¶

Data Quality: Always validate data quality from external sources
Evidence Integration: Use multiple evidence types for robust conclusions
Cross-Validation: Validate findings across different data sources
Performance Optimization: Use caching and batch processing for large datasets
Error Handling: Implement robust error handling for API failures

Troubleshooting¶

Common Issues¶

API Rate Limits:

# Configure request delays
go_tool.configure_request_delay(1.0)  # 1 second between requests
pubmed_tool.configure_request_delay(0.5)  # 0.5 seconds between requests

Data Quality Issues:

# Enable quality filtering
fusion_tool.enable_quality_filtering(
    min_confidence=0.8,
    require_multiple_sources=True,
    validate_temporal_consistency=True
)

Large Dataset Handling:

# Use batch processing
results = await batch_tool.process_batch(
    data_list=large_dataset,
    batch_size=100,
    max_workers=4
)

For more detailed information, see the Tool Development Guide and Data Types API Reference.