scripts/query_gene.py - gene-database

#!/usr/bin/env python3
"""
Query NCBI Gene database using E-utilities.

This script provides access to ESearch, ESummary, and EFetch functions
for searching and retrieving gene information.
"""

import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
from typing import Optional, Dict, List, Any
from xml.etree import ElementTree as ET


BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
DB = "gene"


def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None) -> List[str]:
    """
    Search NCBI Gene database and return list of Gene IDs.

    Args:
        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
        retmax: Maximum number of results to return
        api_key: Optional NCBI API key for higher rate limits

    Returns:
        List of Gene IDs as strings
    """
    params = {
        'db': DB,
        'term': query,
        'retmax': retmax,
        'retmode': 'json'
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}esearch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        with urllib.request.urlopen(url) as response:
            data = json.loads(response.read().decode())

        if 'esearchresult' in data and 'idlist' in data['esearchresult']:
            return data['esearchresult']['idlist']
        else:
            print(f"Error: Unexpected response format", file=sys.stderr)
            return []

    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return []
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return []


def esummary(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Get document summaries for Gene IDs.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        Dictionary of gene summaries
    """
    params = {
        'db': DB,
        'id': ','.join(gene_ids),
        'retmode': 'json'
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}esummary.fcgi?{urllib.parse.urlencode(params)}"

    try:
        with urllib.request.urlopen(url) as response:
            data = json.loads(response.read().decode())
        return data
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}


def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None) -> str:
    """
    Fetch full gene records.

    Args:
        gene_ids: List of Gene IDs
        retmode: Return format ('xml', 'text', 'asn.1')
        api_key: Optional NCBI API key

    Returns:
        Gene records as string in requested format
    """
    params = {
        'db': DB,
        'id': ','.join(gene_ids),
        'retmode': retmode
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}efetch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode()
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return ""
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return ""


def search_and_summarize(query: str, organism: Optional[str] = None,
                        max_results: int = 20, api_key: Optional[str] = None) -> None:
    """
    Search for genes and display summaries.

    Args:
        query: Gene search query
        organism: Optional organism filter
        max_results: Maximum number of results
        api_key: Optional NCBI API key
    """
    # Add organism filter if provided
    if organism:
        if '[organism]' not in query.lower():
            query = f"{query} AND {organism}[organism]"

    print(f"Searching for: {query}")
    print("-" * 80)

    # Search for gene IDs
    gene_ids = esearch(query, retmax=max_results, api_key=api_key)

    if not gene_ids:
        print("No results found.")
        return

    print(f"Found {len(gene_ids)} gene(s)")
    print()

    # Get summaries
    summaries = esummary(gene_ids, api_key=api_key)

    if 'result' in summaries:
        for gene_id in gene_ids:
            if gene_id in summaries['result']:
                gene = summaries['result'][gene_id]
                print(f"Gene ID: {gene_id}")
                print(f"  Symbol: {gene.get('name', 'N/A')}")
                print(f"  Description: {gene.get('description', 'N/A')}")
                print(f"  Organism: {gene.get('organism', {}).get('scientificname', 'N/A')}")
                print(f"  Chromosome: {gene.get('chromosome', 'N/A')}")
                print(f"  Map Location: {gene.get('maplocation', 'N/A')}")
                print(f"  Type: {gene.get('geneticsource', 'N/A')}")
                print()

    # Respect rate limits
    time.sleep(0.34)  # ~3 requests per second


def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
                api_key: Optional[str] = None) -> None:
    """
    Fetch and display gene information by ID.

    Args:
        gene_ids: List of Gene IDs
        output_format: Output format ('json', 'xml', 'text')
        api_key: Optional NCBI API key
    """
    if output_format == 'json':
        # Get summaries in JSON format
        summaries = esummary(gene_ids, api_key=api_key)
        print(json.dumps(summaries, indent=2))
    else:
        # Fetch full records
        data = efetch(gene_ids, retmode=output_format, api_key=api_key)
        print(data)

    # Respect rate limits
    time.sleep(0.34)


def main():
    parser = argparse.ArgumentParser(
        description='Query NCBI Gene database using E-utilities',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Search for gene by symbol
  %(prog)s --search "BRCA1" --organism "human"

  # Fetch gene by ID
  %(prog)s --id 672 --format json

  # Complex search query
  %(prog)s --search "insulin[gene] AND diabetes[disease]"

  # Multiple gene IDs
  %(prog)s --id 672,7157,5594
        """
    )

    parser.add_argument('--search', '-s', help='Search query')
    parser.add_argument('--organism', '-o', help='Organism filter')
    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
    parser.add_argument('--format', '-f', default='json',
                       choices=['json', 'xml', 'text'],
                       help='Output format (default: json)')
    parser.add_argument('--max-results', '-m', type=int, default=20,
                       help='Maximum number of search results (default: 20)')
    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')

    args = parser.parse_args()

    if not args.search and not args.id:
        parser.error("Either --search or --id must be provided")

    if args.id:
        # Fetch by ID
        gene_ids = [id.strip() for id in args.id.split(',')]
        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
    else:
        # Search and summarize
        search_and_summarize(args.search, organism=args.organism,
                           max_results=args.max_results, api_key=args.api_key)


if __name__ == '__main__':
    main()