# scripts/query_clinicaltrials.py

#!/usr/bin/env python3
"""
ClinicalTrials.gov API Query Helper

A comprehensive Python script for querying the ClinicalTrials.gov API v2.
Provides convenient functions for common query patterns including searching
by condition, intervention, location, sponsor, and retrieving specific trials.

API Documentation: https://clinicaltrials.gov/data-api/api
Rate Limit: ~50 requests per minute per IP address
"""

import requests
import json
from typing import Dict, List, Optional, Union
from urllib.parse import urlencode


BASE_URL = "https://clinicaltrials.gov/api/v2"


def search_studies(
    condition: Optional[str] = None,
    intervention: Optional[str] = None,
    location: Optional[str] = None,
    sponsor: Optional[str] = None,
    status: Optional[Union[str, List[str]]] = None,
    nct_ids: Optional[Union[str, List[str]]] = None,
    sort: str = "LastUpdatePostDate:desc",
    page_size: int = 10,
    page_token: Optional[str] = None,
    format: str = "json",
    timeout: float = 30.0,
) -> Union[Dict, str]:
    """
    Search for clinical trials using various filters.

    Args:
        condition: Disease or condition (e.g., "lung cancer", "diabetes")
        intervention: Treatment or intervention (e.g., "Pembrolizumab", "exercise")
        location: Geographic location (e.g., "New York", "California")
        sponsor: Sponsor or collaborator name (e.g., "National Cancer Institute")
        status: Study status(es). Can be string or list. Valid values:
                RECRUITING, NOT_YET_RECRUITING, ENROLLING_BY_INVITATION,
                ACTIVE_NOT_RECRUITING, SUSPENDED, TERMINATED, COMPLETED, WITHDRAWN
        nct_ids: NCT ID(s) to filter by. Can be a single string or a list.
        sort: Sort order (e.g., "LastUpdatePostDate:desc", "EnrollmentCount:desc")
        page_size: Number of results per page (default: 10, max: 1000)
        page_token: Token for pagination (returned from previous query)
        format: Response format ("json" or "csv")
        timeout: Seconds to wait for the server before giving up (default: 30)

    Returns:
        The parsed JSON body (a dictionary) when format is "json";
        otherwise the raw response text.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    params: Dict[str, Union[str, int]] = {}

    # Free-text query fields: only send the ones the caller actually supplied.
    if condition:
        params['query.cond'] = condition
    if intervention:
        params['query.intr'] = intervention
    if location:
        params['query.locn'] = location
    if sponsor:
        params['query.spons'] = sponsor

    # Status filter: a list is sent as one comma-separated value.
    if status:
        if isinstance(status, list):
            params['filter.overallStatus'] = ','.join(status)
        else:
            params['filter.overallStatus'] = status

    # NCT ID filter. A bare string is wrapped first; joining it directly
    # would insert a comma between every character of the ID.
    if nct_ids:
        if isinstance(nct_ids, str):
            nct_ids = [nct_ids]
        params['filter.ids'] = ','.join(nct_ids)

    # Pagination and sorting
    params['sort'] = sort
    params['pageSize'] = page_size
    if page_token:
        params['pageToken'] = page_token

    # Response format
    params['format'] = format

    url = f"{BASE_URL}/studies"
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    if format == "json":
        return response.json()
    return response.text


def get_study_details(
    nct_id: str,
    format: str = "json",
    timeout: float = 30.0,
) -> Union[Dict, str]:
    """
    Retrieve detailed information about a specific clinical trial.

    Args:
        nct_id: The NCT ID of the trial (e.g., "NCT04852770")
        format: Response format ("json" or "csv")
        timeout: Seconds to wait for the server before giving up (default: 30)

    Returns:
        The parsed JSON body (a dictionary) when format is "json";
        otherwise the raw response text.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    params = {'format': format}
    url = f"{BASE_URL}/studies/{nct_id}"

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    if format == "json":
        return response.json()
    return response.text


def search_with_all_results(
    condition: Optional[str] = None,
    intervention: Optional[str] = None,
    location: Optional[str] = None,
    sponsor: Optional[str] = None,
    status: Optional[Union[str, List[str]]] = None,
    max_results: Optional[int] = None
) -> List[Dict]:
    """
    Search for clinical trials and automatically paginate through all results.

    Args:
        condition: Disease or condition to search for
        intervention: Treatment or intervention to search for
        location: Geographic location to search in
        sponsor: Sponsor or collaborator name
        status: Study status(es) to filter by
        max_results: Maximum number of results to retrieve (None for all).
            A value of zero or less yields an empty list without
            contacting the API.

    Returns:
        List of matching studies, truncated to ``max_results`` when given
    """
    limited = max_results is not None

    # An explicit non-positive cap means "nothing wanted". Test against None
    # rather than truthiness so that 0 is not mistaken for "no limit".
    if limited and max_results <= 0:
        return []

    # Use the largest page the API allows, but never ask for more rows than
    # the caller wants. The size is fixed up front so every page of the same
    # paginated query is requested with identical parameters.
    page_size = min(1000, max_results) if limited else 1000

    all_studies: List[Dict] = []
    page_token = None

    while True:
        result = search_studies(
            condition=condition,
            intervention=intervention,
            location=location,
            sponsor=sponsor,
            status=status,
            page_size=page_size,
            page_token=page_token
        )

        all_studies.extend(result.get('studies', []))

        # Stop as soon as the cap is reached; trim any overshoot from the
        # final page.
        if limited and len(all_studies) >= max_results:
            return all_studies[:max_results]

        # A missing or empty token means the last page has been read.
        page_token = result.get('nextPageToken')
        if not page_token:
            break

    return all_studies


def extract_study_summary(study: Dict) -> Dict:
    """
    Condense a full study record into a flat overview dictionary.

    Every lookup falls back to an empty mapping when a section is absent,
    so missing sections produce None values (or an empty list for the
    phases) instead of raising.

    Args:
        study: A study dictionary from the API response

    Returns:
        Dictionary with the keys nct_id, title, status, phase, enrollment,
        brief_summary and last_update
    """
    sections = study.get('protocolSection', {})
    ident = sections.get('identificationModule', {})
    status_info = sections.get('statusModule', {})
    design = sections.get('designModule', {})
    narrative = sections.get('descriptionModule', {})

    # Prefer the official title; use the brief one when it is missing or empty.
    title = ident.get('officialTitle')
    if not title:
        title = ident.get('briefTitle')

    enrollment_info = design.get('enrollmentInfo', {})
    last_update_struct = status_info.get('lastUpdatePostDateStruct', {})

    return dict(
        nct_id=ident.get('nctId'),
        title=title,
        status=status_info.get('overallStatus'),
        phase=design.get('phases', []),
        enrollment=enrollment_info.get('count'),
        brief_summary=narrative.get('briefSummary'),
        last_update=last_update_struct.get('date'),
    )


# Example usage
if __name__ == "__main__":
    # Example 1: query a small page of lung cancer trials that are recruiting
    print("Example 1: Searching for recruiting lung cancer trials...")
    search_response = search_studies(condition="lung cancer", status="RECRUITING", page_size=5)
    total_trials = search_response.get('totalCount', 0)
    page_of_studies = search_response.get('studies', [])
    print(f"Found {total_trials} total trials")
    print(f"Showing first {len(page_of_studies)} trials\n")

    # Example 2: fetch the full record for the first hit and print a summary
    if page_of_studies:
        top_hit = page_of_studies[0]
        identification = top_hit['protocolSection']['identificationModule']
        nct_id = identification['nctId']
        print(f"Example 2: Getting details for {nct_id}...")
        overview = extract_study_summary(get_study_details(nct_id))
        print(json.dumps(overview, indent=2))
# ← Back to clinicaltrials-database