scripts/bioactivity_query.py

#!/usr/bin/env python3
"""
PubChem Bioactivity Data Retrieval

This script provides functions for retrieving biological activity data
from PubChem for compounds and assays.
"""

import sys
import json
import time
from typing import Dict, List, Optional

try:
    import requests
except ImportError:
    print("Error: requests is not installed. Install it with: pip install requests")
    sys.exit(1)


BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
PUG_VIEW_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"

# Rate limiting: 5 requests per second maximum
REQUEST_DELAY = 0.21  # seconds between requests


def rate_limited_request(url: str, method: str = 'GET', **kwargs) -> Optional[requests.Response]:
    """
    Make a rate-limited request to PubChem API.

    Args:
        url: Request URL
        method: HTTP method ('GET' or 'POST')
        **kwargs: Additional arguments for requests

    Returns:
        Response object or None on error
    """
    time.sleep(REQUEST_DELAY)

    try:
        if method.upper() == 'GET':
            response = requests.get(url, **kwargs)
        else:
            response = requests.post(url, **kwargs)

        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None


def get_bioassay_summary(cid: int) -> Optional[Dict]:
    """
    Get bioassay summary for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary containing bioassay summary data
    """
    url = f"{BASE_URL}/compound/cid/{cid}/assaysummary/JSON"
    response = rate_limited_request(url)

    if response and response.status_code == 200:
        return response.json()
    return None


def get_compound_bioactivities(
    cid: int,
    activity_outcome: Optional[str] = None
) -> List[Dict]:
    """
    Get bioactivity data for a compound.

    Args:
        cid: PubChem Compound ID
        activity_outcome: Filter by activity ('active', 'inactive', 'inconclusive')

    Returns:
        List of bioactivity records
    """
    data = get_bioassay_summary(cid)

    if not data:
        return []

    activities = []
    table = data.get('Table', {})

    for row in table.get('Row', []):
        activity = {}
        for i, cell in enumerate(row.get('Cell', [])):
            column_name = table['Columns']['Column'][i]
            activity[column_name] = cell

        if activity_outcome:
            if activity.get('Activity Outcome', '').lower() == activity_outcome.lower():
                activities.append(activity)
        else:
            activities.append(activity)

    return activities


def get_assay_description(aid: int) -> Optional[Dict]:
    """
    Get detailed description for a specific assay.

    Args:
        aid: PubChem Assay ID (AID)

    Returns:
        Dictionary containing assay description
    """
    url = f"{BASE_URL}/assay/aid/{aid}/description/JSON"
    response = rate_limited_request(url)

    if response and response.status_code == 200:
        return response.json()
    return None


def get_assay_targets(aid: int) -> List[str]:
    """
    Get biological targets for an assay.

    Args:
        aid: PubChem Assay ID

    Returns:
        List of target names
    """
    description = get_assay_description(aid)

    if not description:
        return []

    targets = []
    assay_data = description.get('PC_AssayContainer', [{}])[0]
    assay = assay_data.get('assay', {})

    # Extract target information
    descr = assay.get('descr', {})
    for target in descr.get('target', []):
        mol_id = target.get('mol_id', '')
        name = target.get('name', '')
        if name:
            targets.append(name)
        elif mol_id:
            targets.append(f"GI:{mol_id}")

    return targets


def search_assays_by_target(
    target_name: str,
    max_results: int = 100
) -> List[int]:
    """
    Search for assays targeting a specific protein or gene.

    Args:
        target_name: Name of the target (e.g., 'EGFR', 'p53')
        max_results: Maximum number of results

    Returns:
        List of Assay IDs (AIDs)
    """
    # Use PubChem's text search for assays
    url = f"{BASE_URL}/assay/target/{target_name}/aids/JSON"
    response = rate_limited_request(url)

    if response and response.status_code == 200:
        data = response.json()
        aids = data.get('IdentifierList', {}).get('AID', [])
        return aids[:max_results]
    return []


def get_active_compounds_in_assay(aid: int, max_results: int = 1000) -> List[int]:
    """
    Get list of active compounds in an assay.

    Args:
        aid: PubChem Assay ID
        max_results: Maximum number of results

    Returns:
        List of Compound IDs (CIDs) that showed activity
    """
    url = f"{BASE_URL}/assay/aid/{aid}/cids/JSON?cids_type=active"
    response = rate_limited_request(url)

    if response and response.status_code == 200:
        data = response.json()
        cids = data.get('IdentifierList', {}).get('CID', [])
        return cids[:max_results]
    return []


def get_compound_annotations(cid: int, section: Optional[str] = None) -> Optional[Dict]:
    """
    Get comprehensive compound annotations from PUG-View.

    Args:
        cid: PubChem Compound ID
        section: Specific section to retrieve (e.g., 'Pharmacology and Biochemistry')

    Returns:
        Dictionary containing annotation data
    """
    url = f"{PUG_VIEW_URL}/data/compound/{cid}/JSON"

    if section:
        url += f"?heading={section}"

    response = rate_limited_request(url)

    if response and response.status_code == 200:
        return response.json()
    return None


def get_drug_information(cid: int) -> Optional[Dict]:
    """
    Get drug and medication information for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary containing drug information
    """
    return get_compound_annotations(cid, section="Drug and Medication Information")


def get_safety_hazards(cid: int) -> Optional[Dict]:
    """
    Get safety and hazard information for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary containing safety information
    """
    return get_compound_annotations(cid, section="Safety and Hazards")


def summarize_bioactivities(cid: int) -> Dict:
    """
    Generate a summary of bioactivity data for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary with bioactivity summary statistics
    """
    activities = get_compound_bioactivities(cid)

    summary = {
        'total_assays': len(activities),
        'active': 0,
        'inactive': 0,
        'inconclusive': 0,
        'unspecified': 0,
        'assay_types': {}
    }

    for activity in activities:
        outcome = activity.get('Activity Outcome', '').lower()

        if 'active' in outcome:
            summary['active'] += 1
        elif 'inactive' in outcome:
            summary['inactive'] += 1
        elif 'inconclusive' in outcome:
            summary['inconclusive'] += 1
        else:
            summary['unspecified'] += 1

    return summary


def find_compounds_by_bioactivity(
    target: str,
    threshold: Optional[float] = None,
    max_compounds: int = 100
) -> List[Dict]:
    """
    Find compounds with bioactivity against a specific target.

    Args:
        target: Target name (e.g., 'EGFR')
        threshold: Activity threshold (if applicable)
        max_compounds: Maximum number of compounds to return

    Returns:
        List of dictionaries with compound information and activity data
    """
    # Step 1: Find assays for the target
    assay_ids = search_assays_by_target(target, max_results=10)

    if not assay_ids:
        print(f"No assays found for target: {target}")
        return []

    # Step 2: Get active compounds from these assays
    compound_set = set()
    compound_data = []

    for aid in assay_ids[:5]:  # Limit to first 5 assays
        active_cids = get_active_compounds_in_assay(aid, max_results=max_compounds)

        for cid in active_cids:
            if cid not in compound_set and len(compound_data) < max_compounds:
                compound_set.add(cid)
                compound_data.append({
                    'cid': cid,
                    'aid': aid,
                    'target': target
                })

        if len(compound_data) >= max_compounds:
            break

    return compound_data


def main():
    """Example usage of bioactivity query functions."""

    # Example 1: Get bioassay summary for aspirin (CID 2244)
    print("Example 1: Getting bioassay summary for aspirin (CID 2244)...")
    summary = summarize_bioactivities(2244)
    print(json.dumps(summary, indent=2))

    # Example 2: Get active bioactivities for a compound
    print("\nExample 2: Getting active bioactivities for aspirin...")
    activities = get_compound_bioactivities(2244, activity_outcome='active')
    print(f"Found {len(activities)} active bioactivities")
    if activities:
        print(f"First activity: {activities[0].get('Assay Name', 'N/A')}")

    # Example 3: Get assay information
    print("\nExample 3: Getting assay description...")
    if activities:
        aid = activities[0].get('AID', 0)
        targets = get_assay_targets(aid)
        print(f"Assay {aid} targets: {', '.join(targets) if targets else 'N/A'}")

    # Example 4: Search for compounds targeting EGFR
    print("\nExample 4: Searching for EGFR inhibitors...")
    egfr_compounds = find_compounds_by_bioactivity('EGFR', max_compounds=5)
    print(f"Found {len(egfr_compounds)} compounds with EGFR activity")
    for comp in egfr_compounds[:5]:
        print(f"  CID {comp['cid']} (from AID {comp['aid']})")


if __name__ == '__main__':
    main()
← Back to pubchem-database