scripts/kegg_api.py - kegg-database

"""
KEGG REST API Helper Functions

This module provides Python functions for interacting with the KEGG REST API.
All functions return raw response text which can be parsed as needed.

API Base URL: https://rest.kegg.jp
Documentation: https://www.kegg.jp/kegg/rest/keggapi.html

IMPORTANT: KEGG API is made available only for academic use by academic users.
"""

import urllib.request
import urllib.parse
import urllib.error
from typing import Optional, List, Union


KEGG_BASE_URL = "https://rest.kegg.jp"


def kegg_info(database: str) -> str:
    """
    Get database metadata and statistics.

    Args:
        database: KEGG database name (e.g., 'kegg', 'pathway', 'enzyme', 'genes')

    Returns:
        str: Database information and statistics

    Example:
        info = kegg_info('pathway')
    """
    url = f"{KEGG_BASE_URL}/info/{database}"
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


def kegg_list(database: str, org: Optional[str] = None) -> str:
    """
    List entry identifiers and associated names.

    Args:
        database: KEGG database name or specific entry (e.g., 'pathway', 'enzyme', 'hsa:10458')
        org: Optional organism code for pathway/module listings (e.g., 'hsa' for human)

    Returns:
        str: Tab-delimited list of entries

    Examples:
        pathways = kegg_list('pathway')  # List all reference pathways
        hsa_pathways = kegg_list('pathway', 'hsa')  # List human pathways
        genes = kegg_list('hsa:10458+ece:Z5100')  # List specific genes
    """
    if org:
        url = f"{KEGG_BASE_URL}/list/{database}/{org}"
    else:
        url = f"{KEGG_BASE_URL}/list/{database}"

    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


def kegg_find(database: str, query: str, option: Optional[str] = None) -> str:
    """
    Search for entries by keywords or molecular properties.

    Args:
        database: Database to search ('genes', 'compound', 'drug', etc.)
        query: Search term or molecular property
        option: Optional parameter for molecular searches:
                'formula' - exact match to chemical formula
                'exact_mass' - range search by exact mass (e.g., '174.05-174.15')
                'mol_weight' - range search by molecular weight

    Returns:
        str: Tab-delimited search results

    Examples:
        # Keyword search
        results = kegg_find('genes', 'shiga toxin')

        # Formula search
        compounds = kegg_find('compound', 'C7H10N4O2', 'formula')

        # Mass range search
        drugs = kegg_find('drug', '300-310', 'exact_mass')
    """
    query_encoded = urllib.parse.quote(query)

    if option:
        url = f"{KEGG_BASE_URL}/find/{database}/{query_encoded}/{option}"
    else:
        url = f"{KEGG_BASE_URL}/find/{database}/{query_encoded}"

    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


def kegg_get(entries: Union[str, List[str]], option: Optional[str] = None) -> str:
    """
    Retrieve full database entries or specific data formats.

    Args:
        entries: Single entry ID or list of entry IDs (max 10)
        option: Optional output format:
                'aaseq' or 'ntseq' - FASTA sequence
                'mol' - MOL format (for compounds)
                'kcf' - KCF format (for compounds)
                'image' - PNG image (pathway maps, single entry only)
                'kgml' - KGML format (pathway XML, single entry only)
                'json' - JSON format (pathway only, single entry only)

    Returns:
        str: Entry data in requested format

    Examples:
        # Get pathway entry
        pathway = kegg_get('hsa00010')

        # Get multiple entries
        genes = kegg_get(['hsa:10458', 'ece:Z5100'])

        # Get sequence
        sequence = kegg_get('hsa:10458', 'aaseq')

        # Get pathway as JSON
        pathway_json = kegg_get('hsa05130', 'json')
    """
    if isinstance(entries, list):
        entries_str = '+'.join(entries[:10])  # Max 10 entries
    else:
        entries_str = entries

    if option:
        url = f"{KEGG_BASE_URL}/get/{entries_str}/{option}"
    else:
        url = f"{KEGG_BASE_URL}/get/{entries_str}"

    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


def kegg_conv(target_db: str, source_db: str) -> str:
    """
    Convert identifiers between KEGG and external databases.

    Args:
        target_db: Target database (e.g., 'ncbi-geneid', 'uniprot', 'pubchem')
        source_db: Source database or entry (e.g., 'hsa', 'compound', 'hsa:10458')

    Returns:
        str: Tab-delimited conversion table

    Examples:
        # Convert all human genes to NCBI Gene IDs
        conversions = kegg_conv('ncbi-geneid', 'hsa')

        # Convert specific gene
        gene_id = kegg_conv('ncbi-geneid', 'hsa:10458')

        # Convert compounds to PubChem IDs
        pubchem = kegg_conv('pubchem', 'compound')
    """
    url = f"{KEGG_BASE_URL}/conv/{target_db}/{source_db}"
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


def kegg_link(target_db: str, source_db: str) -> str:
    """
    Find related entries across KEGG databases.

    Args:
        target_db: Target database (e.g., 'pathway', 'enzyme', 'genes')
        source_db: Source database or entry (e.g., 'hsa', 'pathway', 'hsa:10458')

    Returns:
        str: Tab-delimited list of linked entries

    Examples:
        # Find pathways linked to human genes
        links = kegg_link('pathway', 'hsa')

        # Find genes in a specific pathway
        genes = kegg_link('genes', 'hsa00010')

        # Find pathways for a specific gene
        pathways = kegg_link('pathway', 'hsa:10458')
    """
    url = f"{KEGG_BASE_URL}/link/{target_db}/{source_db}"
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


def kegg_ddi(drug_entries: Union[str, List[str]]) -> str:
    """
    Check drug-drug interactions.

    Args:
        drug_entries: Single drug entry or list of drug entries (max 10)

    Returns:
        str: Drug interaction information

    Example:
        interactions = kegg_ddi(['D00001', 'D00002'])
    """
    if isinstance(drug_entries, list):
        entries_str = '+'.join(drug_entries[:10])  # Max 10 entries
    else:
        entries_str = drug_entries

    url = f"{KEGG_BASE_URL}/ddi/{entries_str}"
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        return f"Error: {e.code} - {e.reason}"


if __name__ == "__main__":
    # Example usage
    print("KEGG Info Example:")
    print(kegg_info('pathway')[:200] + "...\n")

    print("KEGG List Example (first 3 pathways):")
    pathways = kegg_list('pathway')
    print('\n'.join(pathways.split('\n')[:3]) + "\n")

    print("KEGG Find Example:")
    print(kegg_find('genes', 'p53')[:200] + "...")