scripts/query_clinpgx.py

#!/usr/bin/env python3
"""
ClinPGx API Query Helper Script

Provides ready-to-use functions for querying the ClinPGx database API.
Includes rate limiting, error handling, and caching functionality.

ClinPGx API: https://api.clinpgx.org/
Rate limit: 2 requests per second
License: Creative Commons Attribution-ShareAlike 4.0 International
"""

import requests
import time
import json
from pathlib import Path
from typing import Dict, List, Optional, Any

# API Configuration
BASE_URL = "https://api.clinpgx.org/v1/"
RATE_LIMIT_DELAY = 0.5  # 500ms delay = 2 requests/second


def rate_limited_request(url: str, params: Optional[Dict] = None, delay: float = RATE_LIMIT_DELAY) -> requests.Response:
    """
    Make API request with rate limiting compliance.

    Args:
        url: API endpoint URL
        params: Query parameters
        delay: Delay in seconds between requests (default 0.5s for 2 req/sec)

    Returns:
        Response object
    """
    response = requests.get(url, params=params)
    time.sleep(delay)
    return response


def safe_api_call(url: str, params: Optional[Dict] = None, max_retries: int = 3) -> Optional[Dict]:
    """
    Make API call with error handling and exponential backoff retry.

    Args:
        url: API endpoint URL
        params: Query parameters
        max_retries: Maximum number of retry attempts

    Returns:
        JSON response data or None on failure
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=10)

            if response.status_code == 200:
                time.sleep(RATE_LIMIT_DELAY)
                return response.json()
            elif response.status_code == 429:
                # Rate limit exceeded
                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
                print(f"Rate limit exceeded. Waiting {wait_time}s before retry...")
                time.sleep(wait_time)
            elif response.status_code == 404:
                print(f"Resource not found: {url}")
                return None
            else:
                response.raise_for_status()

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts")
                return None
            time.sleep(1)

    return None


def cached_query(cache_file: str, query_func, *args, **kwargs) -> Any:
    """
    Cache API results to avoid repeated queries.

    Args:
        cache_file: Path to cache file
        query_func: Function to call if cache miss
        *args, **kwargs: Arguments to pass to query_func

    Returns:
        Cached or freshly queried data
    """
    cache_path = Path(cache_file)

    if cache_path.exists():
        print(f"Loading from cache: {cache_file}")
        with open(cache_path) as f:
            return json.load(f)

    print(f"Cache miss. Querying API...")
    result = query_func(*args, **kwargs)

    if result is not None:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'w') as f:
            json.dump(result, f, indent=2)
        print(f"Cached to: {cache_file}")

    return result


# Core Query Functions

def get_gene_info(gene_symbol: str) -> Optional[Dict]:
    """
    Retrieve detailed information about a pharmacogene.

    Args:
        gene_symbol: Gene symbol (e.g., "CYP2D6", "TPMT")

    Returns:
        Gene information dictionary

    Example:
        >>> gene_data = get_gene_info("CYP2D6")
        >>> print(gene_data['symbol'], gene_data['name'])
    """
    url = f"{BASE_URL}gene/{gene_symbol}"
    return safe_api_call(url)


def get_drug_info(drug_name: str) -> Optional[List[Dict]]:
    """
    Search for drug/chemical information by name.

    Args:
        drug_name: Drug name (e.g., "warfarin", "codeine")

    Returns:
        List of matching drugs

    Example:
        >>> drugs = get_drug_info("warfarin")
        >>> for drug in drugs:
        >>>     print(drug['name'], drug['id'])
    """
    url = f"{BASE_URL}chemical"
    params = {"name": drug_name}
    return safe_api_call(url, params)


def get_gene_drug_pairs(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query gene-drug interaction pairs.

    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)

    Returns:
        List of gene-drug pairs with clinical annotations

    Example:
        >>> # Get all pairs for CYP2D6
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>>
        >>> # Get specific gene-drug pair
        >>> pair = get_gene_drug_pairs(gene="CYP2D6", drug="codeine")
    """
    url = f"{BASE_URL}geneDrugPair"
    params = {}
    if gene:
        params["gene"] = gene
    if drug:
        params["drug"] = drug

    return safe_api_call(url, params)


def get_cpic_guidelines(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Retrieve CPIC clinical practice guidelines.

    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)

    Returns:
        List of CPIC guidelines

    Example:
        >>> # Get all CPIC guidelines
        >>> guidelines = get_cpic_guidelines()
        >>>
        >>> # Get guideline for specific gene-drug
        >>> guideline = get_cpic_guidelines(gene="CYP2C19", drug="clopidogrel")
    """
    url = f"{BASE_URL}guideline"
    params = {"source": "CPIC"}
    if gene:
        params["gene"] = gene
    if drug:
        params["drug"] = drug

    return safe_api_call(url, params)


def get_alleles(gene: str) -> Optional[List[Dict]]:
    """
    Get all alleles for a pharmacogene including function and frequency.

    Args:
        gene: Gene symbol (e.g., "CYP2D6")

    Returns:
        List of alleles with functional annotations and population frequencies

    Example:
        >>> alleles = get_alleles("CYP2D6")
        >>> for allele in alleles:
        >>>     print(f"{allele['name']}: {allele['function']}")
    """
    url = f"{BASE_URL}allele"
    params = {"gene": gene}
    return safe_api_call(url, params)


def get_allele_info(allele_name: str) -> Optional[Dict]:
    """
    Get detailed information about a specific allele.

    Args:
        allele_name: Allele name (e.g., "CYP2D6*4")

    Returns:
        Allele information dictionary

    Example:
        >>> allele = get_allele_info("CYP2D6*4")
        >>> print(allele['function'], allele['frequencies'])
    """
    url = f"{BASE_URL}allele/{allele_name}"
    return safe_api_call(url)


def get_clinical_annotations(
    gene: Optional[str] = None,
    drug: Optional[str] = None,
    evidence_level: Optional[str] = None
) -> Optional[List[Dict]]:
    """
    Retrieve curated literature annotations for gene-drug interactions.

    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)
        evidence_level: Filter by evidence level (1A, 1B, 2A, 2B, 3, 4)

    Returns:
        List of clinical annotations

    Example:
        >>> # Get all annotations for CYP2D6
        >>> annotations = get_clinical_annotations(gene="CYP2D6")
        >>>
        >>> # Get high-quality evidence only
        >>> high_quality = get_clinical_annotations(evidence_level="1A")
    """
    url = f"{BASE_URL}clinicalAnnotation"
    params = {}
    if gene:
        params["gene"] = gene
    if drug:
        params["drug"] = drug
    if evidence_level:
        params["evidenceLevel"] = evidence_level

    return safe_api_call(url, params)


def get_drug_labels(drug: str, source: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Retrieve pharmacogenomic drug label information.

    Args:
        drug: Drug name
        source: Regulatory source (e.g., "FDA", "EMA")

    Returns:
        List of drug labels with PGx information

    Example:
        >>> # Get all labels for warfarin
        >>> labels = get_drug_labels("warfarin")
        >>>
        >>> # Get only FDA labels
        >>> fda_labels = get_drug_labels("warfarin", source="FDA")
    """
    url = f"{BASE_URL}drugLabel"
    params = {"drug": drug}
    if source:
        params["source"] = source

    return safe_api_call(url, params)


def search_variants(rsid: Optional[str] = None, chromosome: Optional[str] = None,
                   position: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Search for genetic variants by rsID or genomic position.

    Args:
        rsid: dbSNP rsID (e.g., "rs4244285")
        chromosome: Chromosome number
        position: Genomic position

    Returns:
        List of matching variants

    Example:
        >>> # Search by rsID
        >>> variant = search_variants(rsid="rs4244285")
        >>>
        >>> # Search by position
        >>> variants = search_variants(chromosome="10", position="94781859")
    """
    url = f"{BASE_URL}variant"

    if rsid:
        url = f"{BASE_URL}variant/{rsid}"
        return safe_api_call(url)

    params = {}
    if chromosome:
        params["chromosome"] = chromosome
    if position:
        params["position"] = position

    return safe_api_call(url, params)


def get_pathway_info(pathway_id: Optional[str] = None, drug: Optional[str] = None) -> Optional[Any]:
    """
    Retrieve pharmacokinetic/pharmacodynamic pathway information.

    Args:
        pathway_id: ClinPGx pathway ID (optional)
        drug: Drug name (optional)

    Returns:
        Pathway information or list of pathways

    Example:
        >>> # Get specific pathway
        >>> pathway = get_pathway_info(pathway_id="PA146123006")
        >>>
        >>> # Get all pathways for a drug
        >>> pathways = get_pathway_info(drug="warfarin")
    """
    if pathway_id:
        url = f"{BASE_URL}pathway/{pathway_id}"
        return safe_api_call(url)

    url = f"{BASE_URL}pathway"
    params = {}
    if drug:
        params["drug"] = drug

    return safe_api_call(url, params)


# Utility Functions

def export_to_dataframe(data: List[Dict], output_file: Optional[str] = None):
    """
    Convert API results to pandas DataFrame for analysis.

    Args:
        data: List of dictionaries from API
        output_file: Optional CSV output file path

    Returns:
        pandas DataFrame

    Example:
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>> df = export_to_dataframe(pairs, "cyp2d6_pairs.csv")
        >>> print(df.head())
    """
    try:
        import pandas as pd
    except ImportError:
        print("pandas not installed. Install with: pip install pandas")
        return None

    df = pd.DataFrame(data)

    if output_file:
        df.to_csv(output_file, index=False)
        print(f"Data exported to: {output_file}")

    return df


def batch_gene_query(gene_list: List[str], delay: float = 0.5) -> Dict[str, Dict]:
    """
    Query multiple genes in batch with rate limiting.

    Args:
        gene_list: List of gene symbols
        delay: Delay between requests (default 0.5s)

    Returns:
        Dictionary mapping gene symbols to gene data

    Example:
        >>> genes = ["CYP2D6", "CYP2C19", "CYP2C9", "TPMT"]
        >>> results = batch_gene_query(genes)
        >>> for gene, data in results.items():
        >>>     print(f"{gene}: {data['name']}")
    """
    results = {}

    print(f"Querying {len(gene_list)} genes with {delay}s delay between requests...")

    for gene in gene_list:
        print(f"Fetching: {gene}")
        data = get_gene_info(gene)
        if data:
            results[gene] = data
        time.sleep(delay)

    print(f"Completed: {len(results)}/{len(gene_list)} successful")
    return results


def find_actionable_gene_drug_pairs(cpic_level: str = "A") -> Optional[List[Dict]]:
    """
    Find all clinically actionable gene-drug pairs with CPIC guidelines.

    Args:
        cpic_level: CPIC recommendation level (A, B, C, D)

    Returns:
        List of actionable gene-drug pairs

    Example:
        >>> # Get all Level A recommendations
        >>> actionable = find_actionable_gene_drug_pairs(cpic_level="A")
        >>> for pair in actionable:
        >>>     print(f"{pair['gene']} - {pair['drug']}")
    """
    url = f"{BASE_URL}geneDrugPair"
    params = {"cpicLevel": cpic_level}
    return safe_api_call(url, params)


# Example Usage
if __name__ == "__main__":
    print("ClinPGx API Query Examples\n")

    # Example 1: Get gene information
    print("=" * 60)
    print("Example 1: Get CYP2D6 gene information")
    print("=" * 60)
    cyp2d6 = get_gene_info("CYP2D6")
    if cyp2d6:
        print(f"Gene: {cyp2d6.get('symbol')}")
        print(f"Name: {cyp2d6.get('name')}")
        print()

    # Example 2: Search for a drug
    print("=" * 60)
    print("Example 2: Search for warfarin")
    print("=" * 60)
    warfarin = get_drug_info("warfarin")
    if warfarin:
        for drug in warfarin[:1]:  # Show first result
            print(f"Drug: {drug.get('name')}")
            print(f"ID: {drug.get('id')}")
        print()

    # Example 3: Get gene-drug pairs
    print("=" * 60)
    print("Example 3: Get CYP2C19-clopidogrel pair")
    print("=" * 60)
    pair = get_gene_drug_pairs(gene="CYP2C19", drug="clopidogrel")
    if pair:
        print(f"Found {len(pair)} gene-drug pair(s)")
        if len(pair) > 0:
            print(f"Annotations: {pair[0].get('sources', [])}")
        print()

    # Example 4: Get CPIC guidelines
    print("=" * 60)
    print("Example 4: Get CPIC guidelines for CYP2C19")
    print("=" * 60)
    guidelines = get_cpic_guidelines(gene="CYP2C19")
    if guidelines:
        print(f"Found {len(guidelines)} guideline(s)")
        for g in guidelines[:2]:  # Show first 2
            print(f"  - {g.get('name')}")
        print()

    # Example 5: Get alleles for a gene
    print("=" * 60)
    print("Example 5: Get CYP2D6 alleles")
    print("=" * 60)
    alleles = get_alleles("CYP2D6")
    if alleles:
        print(f"Found {len(alleles)} allele(s)")
        for allele in alleles[:3]:  # Show first 3
            print(f"  - {allele.get('name')}: {allele.get('function')}")
        print()

    print("=" * 60)
    print("Examples completed!")
    print("=" * 60)
← Back to clinpgx-database