scripts/protein_analysis_workflow.py

#!/usr/bin/env python3
"""
Complete Protein Analysis Workflow

This script performs a comprehensive protein analysis pipeline:
1. UniProt search and identifier retrieval
2. FASTA sequence retrieval
3. BLAST similarity search
4. KEGG pathway discovery
5. PSICQUIC interaction mapping
6. GO annotation retrieval

Usage:
    python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]

Examples:
    python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
    python protein_analysis_workflow.py P43403 user@example.com --skip-blast

Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
"""

import sys
import time
import argparse
from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO


def search_protein(query):
    """Search UniProt for protein and retrieve basic information."""
    print(f"\n{'='*70}")
    print("STEP 1: UniProt Search")
    print(f"{'='*70}")

    u = UniProt(verbose=False)

    print(f"Searching for: {query}")

    # Try direct retrieval first (if query looks like accession)
    if len(query) == 6 and query[0] in "OPQ":
        try:
            entry = u.retrieve(query, frmt="tab")
            if entry:
                uniprot_id = query
                print(f"✓ Found UniProt entry: {uniprot_id}")
                return u, uniprot_id
        except:
            pass

    # Otherwise search
    results = u.search(query, frmt="tab", columns="id,genes,organism,length,protein names", limit=5)

    if not results:
        print("✗ No results found")
        return u, None

    lines = results.strip().split("\n")
    if len(lines) < 2:
        print("✗ No entries found")
        return u, None

    # Display results
    print(f"\n✓ Found {len(lines)-1} result(s):")
    for i, line in enumerate(lines[1:], 1):
        fields = line.split("\t")
        print(f"  {i}. {fields[0]} - {fields[1]} ({fields[2]})")

    # Use first result
    first_entry = lines[1].split("\t")
    uniprot_id = first_entry[0]
    gene_names = first_entry[1] if len(first_entry) > 1 else "N/A"
    organism = first_entry[2] if len(first_entry) > 2 else "N/A"
    length = first_entry[3] if len(first_entry) > 3 else "N/A"
    protein_name = first_entry[4] if len(first_entry) > 4 else "N/A"

    print(f"\nUsing first result:")
    print(f"  UniProt ID: {uniprot_id}")
    print(f"  Gene names: {gene_names}")
    print(f"  Organism: {organism}")
    print(f"  Length: {length} aa")
    print(f"  Protein: {protein_name}")

    return u, uniprot_id


def retrieve_sequence(uniprot, uniprot_id):
    """Retrieve FASTA sequence for protein."""
    print(f"\n{'='*70}")
    print("STEP 2: FASTA Sequence Retrieval")
    print(f"{'='*70}")

    try:
        sequence = uniprot.retrieve(uniprot_id, frmt="fasta")

        if sequence:
            # Extract sequence only (remove header)
            lines = sequence.strip().split("\n")
            header = lines[0]
            seq_only = "".join(lines[1:])

            print(f"✓ Retrieved sequence:")
            print(f"  Header: {header}")
            print(f"  Length: {len(seq_only)} residues")
            print(f"  First 60 residues: {seq_only[:60]}...")

            return seq_only
        else:
            print("✗ Failed to retrieve sequence")
            return None

    except Exception as e:
        print(f"✗ Error: {e}")
        return None


def run_blast(sequence, email, skip=False):
    """Run BLAST similarity search."""
    print(f"\n{'='*70}")
    print("STEP 3: BLAST Similarity Search")
    print(f"{'='*70}")

    if skip:
        print("⊘ Skipped (--skip-blast flag)")
        return None

    if not email or "@" not in email:
        print("⊘ Skipped (valid email required for BLAST)")
        return None

    try:
        print(f"Submitting BLASTP job...")
        print(f"  Database: uniprotkb")
        print(f"  Sequence length: {len(sequence)} aa")

        s = NCBIblast(verbose=False)

        jobid = s.run(
            program="blastp",
            sequence=sequence,
            stype="protein",
            database="uniprotkb",
            email=email
        )

        print(f"✓ Job submitted: {jobid}")
        print(f"  Waiting for completion...")

        # Poll for completion
        max_wait = 300  # 5 minutes
        start_time = time.time()

        while time.time() - start_time < max_wait:
            status = s.getStatus(jobid)
            elapsed = int(time.time() - start_time)
            print(f"  Status: {status} (elapsed: {elapsed}s)", end="\r")

            if status == "FINISHED":
                print(f"\n✓ BLAST completed in {elapsed}s")

                # Retrieve results
                results = s.getResult(jobid, "out")

                # Parse and display summary
                lines = results.split("\n")
                print(f"\n  Results preview:")
                for line in lines[:20]:
                    if line.strip():
                        print(f"    {line}")

                return results

            elif status == "ERROR":
                print(f"\n✗ BLAST job failed")
                return None

            time.sleep(5)

        print(f"\n✗ Timeout after {max_wait}s")
        return None

    except Exception as e:
        print(f"✗ Error: {e}")
        return None


def discover_pathways(uniprot, kegg, uniprot_id):
    """Discover KEGG pathways for protein."""
    print(f"\n{'='*70}")
    print("STEP 4: KEGG Pathway Discovery")
    print(f"{'='*70}")

    try:
        # Map UniProt → KEGG
        print(f"Mapping {uniprot_id} to KEGG...")
        kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)

        if not kegg_mapping or uniprot_id not in kegg_mapping:
            print("✗ No KEGG mapping found")
            return []

        kegg_ids = kegg_mapping[uniprot_id]
        print(f"✓ KEGG ID(s): {kegg_ids}")

        # Get pathways for first KEGG ID
        kegg_id = kegg_ids[0]
        organism, gene_id = kegg_id.split(":")

        print(f"\nSearching pathways for {kegg_id}...")
        pathways = kegg.get_pathway_by_gene(gene_id, organism)

        if not pathways:
            print("✗ No pathways found")
            return []

        print(f"✓ Found {len(pathways)} pathway(s):\n")

        # Get pathway names
        pathway_info = []
        for pathway_id in pathways:
            try:
                entry = kegg.get(pathway_id)

                # Extract pathway name
                pathway_name = "Unknown"
                for line in entry.split("\n"):
                    if line.startswith("NAME"):
                        pathway_name = line.replace("NAME", "").strip()
                        break

                pathway_info.append((pathway_id, pathway_name))
                print(f"  • {pathway_id}: {pathway_name}")

            except Exception as e:
                print(f"  • {pathway_id}: [Error retrieving name]")

        return pathway_info

    except Exception as e:
        print(f"✗ Error: {e}")
        return []


def find_interactions(protein_query):
    """Find protein-protein interactions via PSICQUIC."""
    print(f"\n{'='*70}")
    print("STEP 5: Protein-Protein Interactions")
    print(f"{'='*70}")

    try:
        p = PSICQUIC()

        # Try querying MINT database
        query = f"{protein_query} AND species:9606"
        print(f"Querying MINT database...")
        print(f"  Query: {query}")

        results = p.query("mint", query)

        if not results:
            print("✗ No interactions found in MINT")
            return []

        # Parse PSI-MI TAB format
        lines = results.strip().split("\n")
        print(f"✓ Found {len(lines)} interaction(s):\n")

        # Display first 10 interactions
        interactions = []
        for i, line in enumerate(lines[:10], 1):
            fields = line.split("\t")
            if len(fields) >= 12:
                protein_a = fields[4].split(":")[1] if ":" in fields[4] else fields[4]
                protein_b = fields[5].split(":")[1] if ":" in fields[5] else fields[5]
                interaction_type = fields[11]

                interactions.append((protein_a, protein_b, interaction_type))
                print(f"  {i}. {protein_a} ↔ {protein_b}")

        if len(lines) > 10:
            print(f"  ... and {len(lines)-10} more")

        return interactions

    except Exception as e:
        print(f"✗ Error: {e}")
        return []


def get_go_annotations(uniprot_id):
    """Retrieve GO annotations."""
    print(f"\n{'='*70}")
    print("STEP 6: Gene Ontology Annotations")
    print(f"{'='*70}")

    try:
        g = QuickGO()

        print(f"Retrieving GO annotations for {uniprot_id}...")
        annotations = g.Annotation(protein=uniprot_id, format="tsv")

        if not annotations:
            print("✗ No GO annotations found")
            return []

        lines = annotations.strip().split("\n")
        print(f"✓ Found {len(lines)-1} annotation(s)\n")

        # Group by aspect
        aspects = {"P": [], "F": [], "C": []}
        for line in lines[1:]:
            fields = line.split("\t")
            if len(fields) >= 9:
                go_id = fields[6]
                go_term = fields[7]
                go_aspect = fields[8]

                if go_aspect in aspects:
                    aspects[go_aspect].append((go_id, go_term))

        # Display summary
        print(f"  Biological Process (P): {len(aspects['P'])} terms")
        for go_id, go_term in aspects['P'][:5]:
            print(f"    • {go_id}: {go_term}")
        if len(aspects['P']) > 5:
            print(f"    ... and {len(aspects['P'])-5} more")

        print(f"\n  Molecular Function (F): {len(aspects['F'])} terms")
        for go_id, go_term in aspects['F'][:5]:
            print(f"    • {go_id}: {go_term}")
        if len(aspects['F']) > 5:
            print(f"    ... and {len(aspects['F'])-5} more")

        print(f"\n  Cellular Component (C): {len(aspects['C'])} terms")
        for go_id, go_term in aspects['C'][:5]:
            print(f"    • {go_id}: {go_term}")
        if len(aspects['C']) > 5:
            print(f"    ... and {len(aspects['C'])-5} more")

        return aspects

    except Exception as e:
        print(f"✗ Error: {e}")
        return {}


def main():
    """Main workflow."""
    parser = argparse.ArgumentParser(
        description="Complete protein analysis workflow using BioServices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
  python protein_analysis_workflow.py P43403 user@example.com --skip-blast
        """
    )
    parser.add_argument("protein", help="Protein name or UniProt ID")
    parser.add_argument("email", help="Email address (required for BLAST)")
    parser.add_argument("--skip-blast", action="store_true",
                       help="Skip BLAST search (faster)")

    args = parser.parse_args()

    print("=" * 70)
    print("BIOSERVICES: Complete Protein Analysis Workflow")
    print("=" * 70)

    # Step 1: Search protein
    uniprot, uniprot_id = search_protein(args.protein)
    if not uniprot_id:
        print("\n✗ Failed to find protein. Exiting.")
        sys.exit(1)

    # Step 2: Retrieve sequence
    sequence = retrieve_sequence(uniprot, uniprot_id)
    if not sequence:
        print("\n⚠ Warning: Could not retrieve sequence")

    # Step 3: BLAST search
    if sequence:
        blast_results = run_blast(sequence, args.email, args.skip_blast)

    # Step 4: Pathway discovery
    kegg = KEGG()
    pathways = discover_pathways(uniprot, kegg, uniprot_id)

    # Step 5: Interaction mapping
    interactions = find_interactions(args.protein)

    # Step 6: GO annotations
    go_terms = get_go_annotations(uniprot_id)

    # Summary
    print(f"\n{'='*70}")
    print("WORKFLOW SUMMARY")
    print(f"{'='*70}")
    print(f"  Protein: {args.protein}")
    print(f"  UniProt ID: {uniprot_id}")
    print(f"  Sequence: {'✓' if sequence else '✗'}")
    print(f"  BLAST: {'✓' if not args.skip_blast and sequence else '⊘'}")
    print(f"  Pathways: {len(pathways)} found")
    print(f"  Interactions: {len(interactions)} found")
    print(f"  GO annotations: {sum(len(v) for v in go_terms.values())} found")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()