scripts/download_cosmic.py

#!/usr/bin/env python3
"""
COSMIC Data Download Utility

This script provides functions to download data from the COSMIC database
(Catalogue of Somatic Mutations in Cancer).

Usage:
    from download_cosmic import download_cosmic_file, get_common_file_path

    # Download a specific file
    download_cosmic_file(
        email="user@example.com",
        password="password",
        filepath="GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz",
        output_filename="mutations.tsv.gz"
    )

Requirements:
    - requests library: pip install requests
    - Valid COSMIC account credentials (register at cancer.sanger.ac.uk/cosmic)
"""

import requests
import sys
import os
from typing import Optional


def download_cosmic_file(
    email: str,
    password: str,
    filepath: str,
    output_filename: Optional[str] = None,
    genome_assembly: str = "GRCh38"
) -> bool:
    """
    Download a file from COSMIC database.

    The download is a two-step exchange: an authenticated GET to the
    file_download endpoint returns JSON with a "url" field, and the file is
    then fetched from that URL. The body is streamed to
    "<output_filename>.part" and only renamed to output_filename once the
    transfer has completed, so a failed download never leaves a truncated
    file under the final name.

    Args:
        email: COSMIC account email
        password: COSMIC account password
        filepath: Relative path to file (e.g., "GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz")
        output_filename: Optional custom output filename (default: last part of filepath)
        genome_assembly: Genome assembly version (GRCh37 or GRCh38, default: GRCh38).
            Kept for backward compatibility; this function does not use it,
            the request path is taken from filepath exactly as given.

    Returns:
        True if download successful, False otherwise. Failures are reported
        with print() and are not raised to the caller.

    Example:
        download_cosmic_file(
            "user@email.com",
            "pass123",
            "GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz"
        )
    """
    base_url = "https://cancer.sanger.ac.uk/cosmic/file_download/"

    # Reject unusable input before any network traffic happens.
    if not filepath:
        print("ERROR: No filepath provided")
        return False

    # Determine output filename
    if output_filename is None:
        output_filename = os.path.basename(filepath)
    if not output_filename:
        # e.g. filepath ends with "/" and no explicit output name was given
        print(f"ERROR: Could not determine an output filename for: {filepath}")
        return False

    partial_path = f"{output_filename}.part"
    partial_created = False  # True while a .part file written by us is on disk

    try:
        # Step 1: Get the download URL
        print(f"Requesting download URL for: {filepath}")
        r = requests.get(
            base_url + filepath,
            auth=(email, password),
            timeout=30
        )

        if r.status_code == 401:
            print("ERROR: Authentication failed. Check email and password.")
            return False
        elif r.status_code == 404:
            print(f"ERROR: File not found: {filepath}")
            return False
        elif r.status_code != 200:
            print(f"ERROR: Request failed with status code {r.status_code}")
            print(f"Response: {r.text}")
            return False

        # Parse response to get download URL; anything that is not a JSON
        # object is treated the same as a missing "url" field.
        response_data = r.json()
        download_url = (
            response_data.get("url") if isinstance(response_data, dict) else None
        )

        if not download_url:
            print("ERROR: No download URL in response")
            return False

        # Step 2: Download the file
        print(f"Downloading file from: {download_url}")
        # The context manager closes the streamed response on every exit
        # path, releasing the underlying connection.
        with requests.get(download_url, stream=True, timeout=300) as file_response:
            if file_response.status_code != 200:
                print(f"ERROR: Download failed with status code {file_response.status_code}")
                return False

            # Step 3: Write to disk
            print(f"Saving to: {output_filename}")
            total_size = int(file_response.headers.get('content-length', 0))

            with open(partial_path, 'wb') as f:
                partial_created = True
                downloaded = 0
                last_shown = None
                # Always stream in chunks, even when the size is unknown,
                # so large files are never held in memory as a whole.
                for chunk in file_response.iter_content(chunk_size=65536):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        # Clamp at 100%: the header value and the number of
                        # bytes yielded can differ if the body is decoded.
                        fraction = min(downloaded / total_size, 1.0)
                        shown = f"{fraction * 100:.1f}"
                        # Redraw only when the visible value changes.
                        if shown != last_shown:
                            print(f"\rProgress: {shown}%", end='', flush=True)
                            last_shown = shown
                if last_shown is not None:
                    print()  # New line after progress

        # Transfer finished: move the complete file into place.
        os.replace(partial_path, output_filename)
        partial_created = False

        print(f"✓ Successfully downloaded: {output_filename}")
        return True

    except requests.exceptions.Timeout:
        print("ERROR: Request timed out")
        return False
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Request failed: {e}")
        return False
    except Exception as e:
        print(f"ERROR: Unexpected error: {e}")
        return False
    finally:
        if partial_created:
            # Best-effort cleanup of the incomplete file; a failure to
            # delete it must not hide the original error.
            try:
                os.remove(partial_path)
            except OSError:
                pass


def get_common_file_path(
    data_type: str,
    genome_assembly: str = "GRCh38",
    version: str = "latest"
) -> Optional[str]:
    """
    Map a shorthand data type name to its COSMIC download filepath.

    Args:
        data_type: Shorthand name (e.g., 'mutations', 'gene_census', 'signatures')
        genome_assembly: GRCh37 or GRCh38
        version: COSMIC version (use 'latest' for most recent)

    Returns:
        The filepath string, or None when data_type is not a known shorthand
    """
    # Every per-release file lives under "<assembly>/cosmic/<version>/".
    release_dir = f'{genome_assembly}/cosmic/{version}'

    # File names relative to the release directory.
    release_files = {
        'mutations': 'CosmicMutantExport.tsv.gz',
        'mutations_vcf': 'VCF/CosmicCodingMuts.vcf.gz',
        'gene_census': 'cancer_gene_census.csv',
        'resistance_mutations': 'CosmicResistanceMutations.tsv.gz',
        'structural_variants': 'CosmicStructExport.tsv.gz',
        'gene_expression': 'CosmicCompleteGeneExpression.tsv.gz',
        'copy_number': 'CosmicCompleteCNA.tsv.gz',
        'fusion_genes': 'CosmicFusionExport.tsv.gz',
        'sample_info': 'CosmicSample.tsv.gz',
    }

    # Paths that do not depend on assembly or version.
    fixed_files = {
        'signatures': 'signatures/signatures.tsv',
    }

    fixed = fixed_files.get(data_type)
    if fixed is not None:
        return fixed

    relative_name = release_files.get(data_type)
    if relative_name is None:
        return None
    return f'{release_dir}/{relative_name}'


def main(argv: Optional[list] = None) -> int:
    """Command-line interface for downloading COSMIC files.

    Args:
        argv: Argument strings to parse, without the program name. When None
            (the default) argparse reads sys.argv[1:], so calling main()
            with no arguments behaves as before.

    Returns:
        0 if the download succeeded; 1 if it failed or if neither
        --filepath nor --data-type was supplied. Invalid command-line
        arguments are handled by argparse itself, which exits the process.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Download files from COSMIC database',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download mutations file
  %(prog)s user@email.com --filepath GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz

  # Download using shorthand
  %(prog)s user@email.com --data-type mutations

  # Download for GRCh37
  %(prog)s user@email.com --data-type gene_census --assembly GRCh37
        """
    )

    parser.add_argument('email', help='COSMIC account email')
    parser.add_argument('--password', help='COSMIC account password (will prompt if not provided)')
    parser.add_argument('--filepath', help='Full filepath to download')
    parser.add_argument('--data-type',
                       choices=['mutations', 'mutations_vcf', 'gene_census', 'resistance_mutations',
                               'structural_variants', 'gene_expression', 'copy_number',
                               'fusion_genes', 'signatures', 'sample_info'],
                       help='Common data type shorthand')
    parser.add_argument('--assembly', default='GRCh38',
                       choices=['GRCh37', 'GRCh38'],
                       help='Genome assembly (default: GRCh38)')
    parser.add_argument('--version', default='latest',
                       help='COSMIC version (default: latest)')
    parser.add_argument('-o', '--output', help='Output filename')

    # argv=None makes argparse fall back to sys.argv[1:].
    args = parser.parse_args(argv)

    # Get password if not provided
    if not args.password:
        import getpass
        args.password = getpass.getpass('COSMIC password: ')

    # Determine filepath; an explicit --filepath takes precedence over
    # --data-type when both are given.
    if args.filepath:
        filepath = args.filepath
    elif args.data_type:
        filepath = get_common_file_path(args.data_type, args.assembly, args.version)
        if not filepath:
            # Defensive: argparse `choices` should already prevent this.
            print(f"ERROR: Unknown data type: {args.data_type}")
            return 1
    else:
        print("ERROR: Must provide either --filepath or --data-type")
        parser.print_help()
        return 1

    # Download the file
    success = download_cosmic_file(
        email=args.email,
        password=args.password,
        filepath=filepath,
        output_filename=args.output,
        genome_assembly=args.assembly
    )

    return 0 if success else 1


# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
← Back to cosmic-database