# scripts/batch_gene_lookup.py

#!/usr/bin/env python3
"""
Batch gene lookup using NCBI APIs.

This script efficiently processes multiple gene queries with proper
rate limiting and error handling.
"""

import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
from typing import Optional, List, Dict, Any


def read_gene_list(filepath: str) -> List[str]:
    """
    Read gene identifiers from a file (one per line).

    Surrounding whitespace is stripped from every line and lines that are
    empty after stripping are skipped. The file is decoded as UTF-8; a
    leading byte-order mark is discarded so that it cannot end up attached
    to the first identifier.

    Args:
        filepath: Path to file containing gene symbols or IDs

    Returns:
        List of gene identifiers, in file order

    Note:
        On failure this function does not raise: it prints a message to
        stderr and terminates the process with exit status 1.
    """
    try:
        # 'utf-8-sig' reads plain ASCII/UTF-8 unchanged and drops a BOM
        # when one is present (str.strip() does not remove U+FEFF).
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            return [gene for gene in stripped_lines if gene]
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Any other read/decode problem is reported the same way.
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)


def batch_esearch(queries: List[str], organism: Optional[str] = None,
                  api_key: Optional[str] = None,
                  timeout: float = 30.0) -> Dict[str, str]:
    """
    Search for multiple gene symbols and return their IDs.

    One ESearch request is issued per distinct symbol against the 'gene'
    database, asking for at most one hit. A symbol that appears more than
    once in ``queries`` is only requested again if its earlier attempt
    ended in 'ERROR'.

    Args:
        queries: List of gene symbols
        organism: Optional organism filter
        api_key: Optional NCBI API key
        timeout: Seconds to wait on each HTTP request before giving up,
            so a stalled connection cannot hang the whole batch

    Returns:
        Dictionary mapping gene symbol to Gene ID, 'NOT_FOUND' when the
        search returned no IDs, or 'ERROR' when the request failed or the
        response did not have the expected structure
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    results: Dict[str, str] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34  # 10 req/sec with key, 3 req/sec without

    for query in queries:
        # Already resolved (ID or NOT_FOUND): no need to spend another
        # request. A previous 'ERROR' is retried.
        if results.get(query, 'ERROR') != 'ERROR':
            continue

        # Build search term
        search_term = f"{query}[gene]"
        if organism:
            search_term += f" AND {organism}[organism]"

        params = {
            'db': 'gene',
            'term': search_term,
            'retmax': 1,
            'retmode': 'json'
        }

        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"

        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            if 'esearchresult' in data and 'idlist' in data['esearchresult']:
                id_list = data['esearchresult']['idlist']
                results[query] = id_list[0] if id_list else 'NOT_FOUND'
            else:
                results[query] = 'ERROR'

        except Exception as e:
            # Deliberate best-effort: one failing symbol (network error,
            # timeout, malformed JSON) must not abort the rest of the batch.
            print(f"Error searching for {query}: {e}", file=sys.stderr)
            results[query] = 'ERROR'

        # Pause after every request, including the last one, so that a
        # request made right after this function returns is still spaced
        # from the final search.
        time.sleep(delay)

    return results


def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
                   chunk_size: int = 200,
                   timeout: float = 30.0) -> Dict[str, Dict[str, Any]]:
    """
    Get summaries for multiple genes in batches.

    The IDs are split into consecutive chunks and one ESummary request is
    issued per chunk. A chunk whose request fails is reported on stderr
    and skipped, so its IDs are simply absent from the returned mapping.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key
        chunk_size: Number of IDs per request (max 500)
        timeout: Seconds to wait on each HTTP request before giving up

    Returns:
        Dictionary mapping Gene ID to summary data, containing only the
        IDs that were present in a successful response

    Raises:
        ValueError: If chunk_size is not a positive integer. (A negative
            value would otherwise produce no chunks at all and silently
            return an empty result.)
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    all_results: Dict[str, Dict[str, Any]] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34

    # Process in chunks
    for start in range(0, len(gene_ids), chunk_size):
        chunk = gene_ids[start:start + chunk_size]

        params = {
            'db': 'gene',
            'id': ','.join(chunk),
            'retmode': 'json'
        }

        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"

        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            if 'result' in data:
                records = data['result']
                # Keep only the entries for IDs we actually asked for.
                for gene_id in chunk:
                    if gene_id in records:
                        all_results[gene_id] = records[gene_id]

        except Exception as e:
            # Deliberate best-effort: a failed chunk must not discard the
            # chunks that were (or will be) fetched successfully.
            print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)

        # Pause after every request to stay under the request-rate limit.
        time.sleep(delay)

    return all_results


def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Fetch summaries for the given Gene IDs and flatten them into records.

    Every input ID yields exactly one record, in input order. IDs for
    which no summary came back get a record carrying an 'error' message
    instead of the gene fields.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        List of gene information dictionaries
    """
    fetched = batch_esummary(gene_ids, api_key=api_key)

    records = []
    for gid in gene_ids:
        # Guard clause: nothing was returned for this ID.
        if gid not in fetched:
            records.append({
                'gene_id': gid,
                'error': 'Not found or error fetching'
            })
            continue

        summary = fetched[gid]
        entry = {'gene_id': gid}
        entry['symbol'] = summary.get('name', 'N/A')
        entry['description'] = summary.get('description', 'N/A')
        # The organism is a nested mapping in the summary; fall back to
        # 'N/A' when either level is absent.
        entry['organism'] = summary.get('organism', {}).get('scientificname', 'N/A')
        entry['chromosome'] = summary.get('chromosome', 'N/A')
        entry['map_location'] = summary.get('maplocation', 'N/A')
        entry['type'] = summary.get('geneticsource', 'N/A')
        records.append(entry)

    return records


def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
                            api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Lookup genes by symbols and return structured data.

    Each symbol is first resolved to a Gene ID, then the summaries of all
    resolved IDs are fetched in bulk. Every distinct queried symbol yields
    exactly one record:

    * resolved and summarised: the gene fields plus 'query_symbol';
    * search returned nothing: ``{'query_symbol', 'status': 'not_found'}``;
    * search failed: ``{'query_symbol', 'status': 'error'}``;
    * resolved but no summary came back (for example the summary request
      failed): ``{'query_symbol', 'gene_id', 'status': 'error'}``.

    Progress messages are written to stderr.

    Args:
        gene_symbols: List of gene symbols
        organism: Organism name
        api_key: Optional NCBI API key

    Returns:
        List of gene information dictionaries
    """
    # First, search for IDs
    print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
    symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)

    # Keep real IDs only; dict.fromkeys drops duplicates (two symbols that
    # resolve to the same gene) while preserving order.
    valid_ids = list(dict.fromkeys(
        gene_id for gene_id in symbol_to_id.values()
        if gene_id not in ('NOT_FOUND', 'ERROR')
    ))

    if valid_ids:
        print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)
        summaries = batch_esummary(valid_ids, api_key=api_key)
    else:
        # Nothing to fetch, but still report the per-symbol status below
        # so callers can tell 'not found' apart from a failed search.
        print("No genes found", file=sys.stderr)
        summaries = {}

    # Build results
    results = []
    for symbol, gene_id in symbol_to_id.items():
        if gene_id == 'NOT_FOUND':
            results.append({
                'query_symbol': symbol,
                'status': 'not_found'
            })
        elif gene_id == 'ERROR':
            results.append({
                'query_symbol': symbol,
                'status': 'error'
            })
        elif gene_id in summaries:
            gene = summaries[gene_id]
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'symbol': gene.get('name', 'N/A'),
                'description': gene.get('description', 'N/A'),
                'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
                'chromosome': gene.get('chromosome', 'N/A'),
                'map_location': gene.get('maplocation', 'N/A'),
                'type': gene.get('geneticsource', 'N/A')
            })
        else:
            # The ID was resolved but its summary is missing; report it
            # rather than silently dropping the symbol from the output.
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'status': 'error'
            })

    return results


def main():
    """
    Command-line entry point.

    Parses the arguments, runs a lookup either by Gene IDs (--ids) or by
    gene symbols read from a file (--file, which requires --organism), and
    emits the results as JSON to stdout or to the --output file. When both
    --ids and --file are given, --ids takes precedence.

    Invalid argument combinations, including an --ids value that contains
    no actual IDs, are rejected through the parser's error handling.
    Failure to write the output file terminates with exit status 1.
    """
    parser = argparse.ArgumentParser(
        description='Batch gene lookup using NCBI APIs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Lookup by gene IDs
  %(prog)s --ids 672,7157,5594

  # Lookup by symbols from a file
  %(prog)s --file genes.txt --organism human

  # Lookup with API key and save to file
  %(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
        """
    )

    parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
    parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
    parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
    parser.add_argument('--output', '-O', help='Output file path (JSON format)')
    parser.add_argument('--api-key', '-k', help='NCBI API key')
    parser.add_argument('--pretty', '-p', action='store_true',
                       help='Pretty-print JSON output')

    args = parser.parse_args()

    if not args.ids and not args.file:
        parser.error("Either --ids or --file must be provided")

    if args.file and not args.organism:
        parser.error("--organism is required when using --file")

    # Process genes
    if args.ids:
        # Drop empty entries produced by stray or trailing commas
        # ("672,,7157," or " , ") so they are not sent as Gene IDs.
        stripped_ids = (part.strip() for part in args.ids.split(','))
        gene_ids = [gene_id for gene_id in stripped_ids if gene_id]
        if not gene_ids:
            parser.error("--ids did not contain any Gene IDs")
        results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
    else:
        gene_symbols = read_gene_list(args.file)
        results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)

    # Output results
    indent = 2 if args.pretty else None
    json_output = json.dumps(results, indent=indent)

    if args.output:
        try:
            with open(args.output, 'w') as f:
                f.write(json_output)
            print(f"Results written to {args.output}", file=sys.stderr)
        except Exception as e:
            print(f"Error writing output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(json_output)


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()