#!/usr/bin/env python3
"""
bioRxiv Search Tool
A comprehensive Python tool for searching and retrieving preprints from bioRxiv.
Supports keyword search, author search, date filtering, category filtering, and more.

Note: This tool is focused exclusively on bioRxiv (life sciences preprints).
"""

import argparse
import json
import sys
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import requests


class BioRxivSearcher:
    """Efficient search interface for bioRxiv preprints."""

    BASE_URL = "https://api.biorxiv.org"

    # Valid bioRxiv categories
    CATEGORIES = [
        "animal-behavior-and-cognition", "biochemistry", "bioengineering",
        "bioinformatics", "biophysics", "cancer-biology", "cell-biology",
        "clinical-trials", "developmental-biology", "ecology", "epidemiology",
        "evolutionary-biology", "genetics", "genomics", "immunology",
        "microbiology", "molecular-biology", "neuroscience", "paleontology",
        "pathology", "pharmacology-and-toxicology", "physiology",
        "plant-biology", "scientific-communication-and-education",
        "synthetic-biology", "systems-biology", "zoology"
    ]
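
    # These are the hyphenated slugs accepted on the command line; the API's
    # category filter expects underscores, which search_by_date_range converts.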

    def __init__(self, verbose: bool = False):
        """Initialize the searcher."""
        self.verbose = verbose
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'BioRxiv-Search-Tool/1.0'
        })

    def _log(self, message: str):
        """Print verbose logging messages."""
        if self.verbose:
            print(f"[INFO] {message}", file=sys.stderr)

    def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make an API request with error handling and rate limiting."""
        url = f"{self.BASE_URL}/{endpoint}"
        self._log(f"Requesting: {url}")

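        # Every endpoint wraps its response in the same JSON envelope:
        #   {"messages": [{"status": ...}], "collection": [<records>]}
        # The error fallback below mirrors that shape so callers can always
        # read "collection" without special-casing failures.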
        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()

            # Crude rate limiting - be respectful to the API
            time.sleep(0.5)

            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers malformed JSON bodies from the API
            self._log(f"Error making request: {e}")
            return {"messages": [{"status": "error", "message": str(e)}], "collection": []}

    def search_by_date_range(
        self,
        start_date: str,
        end_date: str,
        category: Optional[str] = None,
        max_pages: int = 10
    ) -> List[Dict]:
        """
        Search for preprints within a date range.

        The details endpoint returns at most 100 records per request, so this
        method follows the pagination cursor until the range is exhausted or
        max_pages is reached.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            category: Optional category filter (e.g., 'neuroscience')
            max_pages: Maximum number of 100-record pages to fetch

        Returns:
            List of preprint dictionaries
        """
        self._log(f"Searching bioRxiv from {start_date} to {end_date}")

        # The API takes the category as a query parameter and expects
        # underscores rather than the hyphenated slugs used on the CLI
        params = {"category": category.replace("-", "_")} if category else None

        papers: List[Dict] = []
        cursor = 0
        for _ in range(max_pages):
            endpoint = f"details/biorxiv/{start_date}/{end_date}/{cursor}"
            data = self._make_request(endpoint, params=params)
            page = data.get("collection", [])
            papers.extend(page)

            # A short page means the date range is exhausted
            if len(page) < 100:
                break
            cursor += len(page)

        self._log(f"Found {len(papers)} preprints")
        return papers
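
    # Example (illustrative):
    #   searcher = BioRxivSearcher(verbose=True)
    #   papers = searcher.search_by_date_range("2024-01-01", "2024-01-07",
    #                                          category="neuroscience")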

    def search_by_interval(
        self,
        interval: str = "1",
        cursor: int = 0,
        format: str = "json"
    ) -> Dict:
        """
        Retrieve preprints from a specific time interval.

        Args:
            interval: Number of days back to search
            cursor: Pagination cursor (0 for first page, then use returned cursor)
            format: Response format ('json' or 'xml')

        Returns:
            Dictionary with collection and pagination info
        """
        endpoint = f"pubs/biorxiv/{interval}/{cursor}/{format}"
        return self._make_request(endpoint)
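
    # Pagination sketch (illustrative): walk the feed until a short page
    # signals the end of the results.
    #   cursor = 0
    #   while True:
    #       page = searcher.search_by_interval("30", cursor)
    #       records = page.get("collection", [])
    #       if len(records) < 100:
    #           break
    #       cursor += len(records)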

    def get_paper_details(self, doi: str) -> Dict:
        """
        Get detailed information about a specific paper by DOI.

        Args:
            doi: The DOI of the paper (e.g., '10.1101/2021.01.01.123456')

        Returns:
            Dictionary with paper details
        """
        # Clean DOI if full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        self._log(f"Fetching details for DOI: {doi}")
        endpoint = f"details/biorxiv/{doi}"

        data = self._make_request(endpoint)

        if "collection" in data and len(data["collection"]) > 0:
            return data["collection"][0]

        return {}
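
    # Records from the details endpoint carry (among others) the fields read
    # in format_result below: doi, title, authors, author_corresponding,
    # author_corresponding_institution, date, version, type, license,
    # category, jatsxml, abstract, published.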

    def search_by_author(
        self,
        author_name: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for papers by author name.

        Args:
            author_name: Author name to search for
            start_date: Optional start date (YYYY-MM-DD)
            end_date: Optional end date (YYYY-MM-DD)

        Returns:
            List of matching preprints
        """
        # If no date range specified, search last 3 years
        if not start_date:
            end_date = datetime.now().strftime("%Y-%m-%d")
            start_date = (datetime.now() - timedelta(days=1095)).strftime("%Y-%m-%d")

        self._log(f"Searching for author: {author_name}")

        # Get all papers in date range
        papers = self.search_by_date_range(start_date, end_date)

        # Filter by author name (case-insensitive)
        author_lower = author_name.lower()
        matching_papers = []

        for paper in papers:
            authors = paper.get("authors", "")
            if author_lower in authors.lower():
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers by {author_name}")
        return matching_papers

    def search_by_keywords(
        self,
        keywords: List[str],
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        category: Optional[str] = None,
        search_fields: Optional[List[str]] = None
    ) -> List[Dict]:
        """
        Search for papers containing specific keywords.

        Like search_by_author, this filters client-side: a paper matches if
        any keyword appears in any of the chosen fields.

        Args:
            keywords: List of keywords to search for
            start_date: Optional start date (YYYY-MM-DD)
            end_date: Optional end date (YYYY-MM-DD)
            category: Optional category filter
            search_fields: Fields to search in (default: title and abstract)

        Returns:
            List of matching preprints
        """
        # Avoid a mutable default argument
        if search_fields is None:
            search_fields = ["title", "abstract"]

        # Default to the last year without clobbering a caller-supplied end date
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")
        if not start_date:
            start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")

        self._log(f"Searching for keywords: {keywords}")

        # Get all papers in date range
        papers = self.search_by_date_range(start_date, end_date, category)

        # Filter by keywords
        matching_papers = []
        keywords_lower = [k.lower() for k in keywords]

        for paper in papers:
            # Build search text from specified fields
            search_text = ""
            for field in search_fields:
                if field in paper:
                    search_text += " " + str(paper[field]).lower()

            # Check if any keyword matches
            if any(keyword in search_text for keyword in keywords_lower):
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers matching keywords")
        return matching_papers
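
    # Example (illustrative):
    #   hits = searcher.search_by_keywords(["organoid", "single-cell"],
    #                                      start_date="2024-01-01",
    #                                      category="neuroscience")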

    def download_pdf(self, doi: str, output_path: str) -> bool:
        """
        Download the PDF of a paper.

        Args:
            doi: The DOI of the paper
            output_path: Path where PDF should be saved

        Returns:
            True if download successful, False otherwise
        """
        # Clean DOI if full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        # Look up the latest version rather than assuming v1
        details = self.get_paper_details(doi)
        version = details.get("version", "1")
        pdf_url = f"https://www.biorxiv.org/content/{doi}v{version}.full.pdf"

        self._log(f"Downloading PDF from: {pdf_url}")

        try:
            response = self.session.get(pdf_url, timeout=60)
            response.raise_for_status()

            with open(output_path, 'wb') as f:
                f.write(response.content)

            self._log(f"PDF saved to: {output_path}")
            return True
        except Exception as e:
            self._log(f"Error downloading PDF: {e}")
            return False
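
    # Example (illustrative; the DOI is a placeholder):
    #   searcher.download_pdf("10.1101/2021.01.01.123456", "paper.pdf")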

    def format_result(self, paper: Dict, include_abstract: bool = True) -> Dict:
        """
        Format a paper result with standardized fields.

        Args:
            paper: Raw paper dictionary from API
            include_abstract: Whether to include the abstract

        Returns:
            Formatted paper dictionary
        """
        result = {
            "doi": paper.get("doi", ""),
            "title": paper.get("title", ""),
            "authors": paper.get("authors", ""),
            "author_corresponding": paper.get("author_corresponding", ""),
            "author_corresponding_institution": paper.get("author_corresponding_institution", ""),
            "date": paper.get("date", ""),
            "version": paper.get("version", ""),
            "type": paper.get("type", ""),
            "license": paper.get("license", ""),
            "category": paper.get("category", ""),
            "jatsxml": paper.get("jatsxml", ""),
            "published": paper.get("published", "")
        }

        if include_abstract:
            result["abstract"] = paper.get("abstract", "")

        # Add PDF and HTML URLs (fall back to v1 if the version is missing)
        if result["doi"]:
            version = result["version"] or "1"
            result["pdf_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{version}.full.pdf"
            result["html_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{version}"

        return result


def main():
    """Command-line interface for bioRxiv search."""
    parser = argparse.ArgumentParser(
        description="Search bioRxiv preprints efficiently",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--verbose", "-v", action="store_true",
                       help="Enable verbose logging")

    # Search type arguments
    search_group = parser.add_argument_group("Search options")
    search_group.add_argument("--keywords", "-k", nargs="+",
                            help="Keywords to search for")
    search_group.add_argument("--author", "-a",
                            help="Author name to search for")
    search_group.add_argument("--doi",
                            help="Get details for specific DOI")

    # Date range arguments
    date_group = parser.add_argument_group("Date range options")
    date_group.add_argument("--start-date",
                          help="Start date (YYYY-MM-DD)")
    date_group.add_argument("--end-date",
                          help="End date (YYYY-MM-DD)")
    date_group.add_argument("--days-back", type=int,
                          help="Search N days back from today")

    # Filter arguments
    filter_group = parser.add_argument_group("Filter options")
    filter_group.add_argument("--category", "-c",
                            choices=BioRxivSearcher.CATEGORIES,
                            help="Filter by category")
    filter_group.add_argument("--search-fields", nargs="+",
                            default=["title", "abstract"],
                            choices=["title", "abstract", "authors"],
                            help="Fields to search in for keywords")

    # Output arguments
    output_group = parser.add_argument_group("Output options")
    output_group.add_argument("--output", "-o",
                            help="Output file (default: stdout)")
    output_group.add_argument("--include-abstract", action="store_true",
                            default=True, help="Include abstracts in output")
    output_group.add_argument("--download-pdf",
                            help="Download PDF to specified path (requires --doi)")
    output_group.add_argument("--limit", type=int,
                            help="Limit number of results")

    args = parser.parse_args()

    # Initialize searcher
    searcher = BioRxivSearcher(verbose=args.verbose)

    # Handle date range
    end_date = args.end_date or datetime.now().strftime("%Y-%m-%d")
    if args.days_back:
        start_date = (datetime.now() - timedelta(days=args.days_back)).strftime("%Y-%m-%d")
    else:
        start_date = args.start_date

    # Execute search based on arguments
    results = []

    if args.download_pdf:
        if not args.doi:
            print("Error: --doi required with --download-pdf", file=sys.stderr)
            return 1

        success = searcher.download_pdf(args.doi, args.download_pdf)
        return 0 if success else 1

    elif args.doi:
        # Get specific paper by DOI
        paper = searcher.get_paper_details(args.doi)
        if paper:
            results = [paper]

    elif args.author:
        # Search by author
        results = searcher.search_by_author(
            args.author, start_date, end_date
        )

    elif args.keywords:
        # Search by keywords
        if not start_date:
            print("Error: --start-date or --days-back required for keyword search",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_keywords(
            args.keywords, start_date, end_date,
            args.category, args.search_fields
        )

    else:
        # Date range search
        if not start_date:
            print("Error: must specify search criteria (--keywords, --author, --doi) "
                  "or a date range (--start-date / --days-back)",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_date_range(
            start_date, end_date, args.category
        )

    # Apply limit
    if args.limit:
        results = results[:args.limit]

    # Format results
    formatted_results = [
        searcher.format_result(paper, args.include_abstract)
        for paper in results
    ]

    # Output results
    output_data = {
        "query": {
            "keywords": args.keywords,
            "author": args.author,
            "doi": args.doi,
            "start_date": start_date,
            "end_date": end_date,
            "category": args.category
        },
        "result_count": len(formatted_results),
        "results": formatted_results
    }

    output_json = json.dumps(output_data, indent=2)

    if args.output:
        with open(args.output, 'w') as f:
            f.write(output_json)
        print(f"Results written to {args.output}", file=sys.stderr)
    else:
        print(output_json)

    return 0


if __name__ == "__main__":
    sys.exit(main())