scripts/workflow_generator.py

#!/usr/bin/env python3
"""
deepTools Workflow Generator

Generates bash script templates for common deepTools workflows.
"""

import argparse
import sys


WORKFLOWS = {
    'chipseq_qc': {
        'name': 'ChIP-seq Quality Control',
        'description': 'Complete QC workflow for ChIP-seq experiments',
    },
    'chipseq_analysis': {
        'name': 'ChIP-seq Complete Analysis',
        'description': 'Full ChIP-seq analysis from BAM to heatmaps',
    },
    'rnaseq_coverage': {
        'name': 'RNA-seq Coverage Tracks',
        'description': 'Generate strand-specific RNA-seq coverage',
    },
    'atacseq': {
        'name': 'ATAC-seq Analysis',
        'description': 'ATAC-seq workflow with Tn5 correction',
    },
}


def generate_chipseq_qc_workflow(output_file, params):
    """Generate ChIP-seq QC workflow script."""

    script = f"""#!/bin/bash
# deepTools ChIP-seq Quality Control Workflow
# Generated by deepTools workflow generator

# Configuration
INPUT_BAM="{params.get('input_bam', 'Input.bam')}"
CHIP_BAM=("{params.get('chip_bams', 'ChIP1.bam ChIP2.bam')}")
GENOME_SIZE={params.get('genome_size', '2913022398')}
THREADS={params.get('threads', '8')}
OUTPUT_DIR="{params.get('output_dir', 'deeptools_qc')}"

# Create output directory
mkdir -p $OUTPUT_DIR

echo "=== Starting ChIP-seq QC workflow ==="

# Step 1: Correlation analysis
echo "Step 1: Computing correlation matrix..."
multiBamSummary bins \\
    --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\
    -o $OUTPUT_DIR/readCounts.npz \\
    --numberOfProcessors $THREADS

echo "Step 2: Generating correlation heatmap..."
plotCorrelation \\
    -in $OUTPUT_DIR/readCounts.npz \\
    --corMethod pearson \\
    --whatToShow heatmap \\
    --plotFile $OUTPUT_DIR/correlation_heatmap.png \\
    --plotNumbers

echo "Step 3: Generating PCA plot..."
plotPCA \\
    -in $OUTPUT_DIR/readCounts.npz \\
    -o $OUTPUT_DIR/PCA_plot.png \\
    -T "PCA of ChIP-seq samples"

# Step 2: Coverage assessment
echo "Step 4: Assessing coverage..."
plotCoverage \\
    --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\
    --plotFile $OUTPUT_DIR/coverage.png \\
    --ignoreDuplicates \\
    --numberOfProcessors $THREADS

# Step 3: Fragment size (for paired-end data)
echo "Step 5: Analyzing fragment sizes..."
bamPEFragmentSize \\
    --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\
    --histogram $OUTPUT_DIR/fragmentSizes.png \\
    --plotTitle "Fragment Size Distribution"

# Step 4: ChIP signal strength
echo "Step 6: Evaluating ChIP enrichment..."
plotFingerprint \\
    --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\
    --plotFile $OUTPUT_DIR/fingerprint.png \\
    --extendReads 200 \\
    --ignoreDuplicates \\
    --numberOfProcessors $THREADS \\
    --outQualityMetrics $OUTPUT_DIR/fingerprint_metrics.txt

echo "=== ChIP-seq QC workflow complete ==="
echo "Results are in: $OUTPUT_DIR"
"""

    with open(output_file, 'w') as f:
        f.write(script)

    return f"✓ Generated ChIP-seq QC workflow: {output_file}"


def generate_chipseq_analysis_workflow(output_file, params):
    """Generate complete ChIP-seq analysis workflow script."""

    script = f"""#!/bin/bash
# deepTools ChIP-seq Complete Analysis Workflow
# Generated by deepTools workflow generator

# Configuration
INPUT_BAM="{params.get('input_bam', 'Input.bam')}"
CHIP_BAM="{params.get('chip_bam', 'ChIP.bam')}"
GENES_BED="{params.get('genes_bed', 'genes.bed')}"
PEAKS_BED="{params.get('peaks_bed', 'peaks.bed')}"
GENOME_SIZE={params.get('genome_size', '2913022398')}
THREADS={params.get('threads', '8')}
OUTPUT_DIR="{params.get('output_dir', 'chipseq_analysis')}"

# Create output directory
mkdir -p $OUTPUT_DIR

echo "=== Starting ChIP-seq analysis workflow ==="

# Step 1: Generate normalized coverage tracks
echo "Step 1: Generating coverage tracks..."

bamCoverage \\
    --bam $INPUT_BAM \\
    --outFileName $OUTPUT_DIR/Input_coverage.bw \\
    --normalizeUsing RPGC \\
    --effectiveGenomeSize $GENOME_SIZE \\
    --binSize 10 \\
    --extendReads 200 \\
    --ignoreDuplicates \\
    --numberOfProcessors $THREADS

bamCoverage \\
    --bam $CHIP_BAM \\
    --outFileName $OUTPUT_DIR/ChIP_coverage.bw \\
    --normalizeUsing RPGC \\
    --effectiveGenomeSize $GENOME_SIZE \\
    --binSize 10 \\
    --extendReads 200 \\
    --ignoreDuplicates \\
    --numberOfProcessors $THREADS

# Step 2: Create log2 ratio track
echo "Step 2: Creating log2 ratio track..."
bamCompare \\
    --bamfile1 $CHIP_BAM \\
    --bamfile2 $INPUT_BAM \\
    --outFileName $OUTPUT_DIR/ChIP_vs_Input_log2ratio.bw \\
    --operation log2 \\
    --scaleFactorsMethod readCount \\
    --binSize 10 \\
    --extendReads 200 \\
    --ignoreDuplicates \\
    --numberOfProcessors $THREADS

# Step 3: Compute matrix around TSS
echo "Step 3: Computing matrix around TSS..."
computeMatrix reference-point \\
    --referencePoint TSS \\
    --scoreFileName $OUTPUT_DIR/ChIP_coverage.bw \\
    --regionsFileName $GENES_BED \\
    --beforeRegionStartLength 3000 \\
    --afterRegionStartLength 3000 \\
    --binSize 10 \\
    --sortRegions descend \\
    --sortUsing mean \\
    --outFileName $OUTPUT_DIR/matrix_TSS.gz \\
    --numberOfProcessors $THREADS

# Step 4: Generate heatmap
echo "Step 4: Generating heatmap..."
plotHeatmap \\
    --matrixFile $OUTPUT_DIR/matrix_TSS.gz \\
    --outFileName $OUTPUT_DIR/heatmap_TSS.png \\
    --colorMap RdBu \\
    --whatToShow 'plot, heatmap and colorbar' \\
    --yAxisLabel "Genes" \\
    --xAxisLabel "Distance from TSS (bp)" \\
    --refPointLabel "TSS" \\
    --heatmapHeight 15 \\
    --kmeans 3

# Step 5: Generate profile plot
echo "Step 5: Generating profile plot..."
plotProfile \\
    --matrixFile $OUTPUT_DIR/matrix_TSS.gz \\
    --outFileName $OUTPUT_DIR/profile_TSS.png \\
    --plotType lines \\
    --perGroup \\
    --colors blue \\
    --plotTitle "ChIP-seq signal around TSS" \\
    --yAxisLabel "Average signal" \\
    --refPointLabel "TSS"

# Step 6: Enrichment at peaks (if peaks provided)
if [ -f "$PEAKS_BED" ]; then
    echo "Step 6: Calculating enrichment at peaks..."
    plotEnrichment \\
        --bamfiles $INPUT_BAM $CHIP_BAM \\
        --BED $PEAKS_BED \\
        --labels Input ChIP \\
        --plotFile $OUTPUT_DIR/enrichment.png \\
        --outRawCounts $OUTPUT_DIR/enrichment_counts.tab \\
        --extendReads 200 \\
        --ignoreDuplicates
fi

echo "=== ChIP-seq analysis complete ==="
echo "Results are in: $OUTPUT_DIR"
"""

    with open(output_file, 'w') as f:
        f.write(script)

    return f"✓ Generated ChIP-seq analysis workflow: {output_file}"


def generate_rnaseq_coverage_workflow(output_file, params):
    """Generate RNA-seq coverage workflow script."""

    script = f"""#!/bin/bash
# deepTools RNA-seq Coverage Workflow
# Generated by deepTools workflow generator

# Configuration
RNASEQ_BAM="{params.get('rnaseq_bam', 'rnaseq.bam')}"
THREADS={params.get('threads', '8')}
OUTPUT_DIR="{params.get('output_dir', 'rnaseq_coverage')}"

# Create output directory
mkdir -p $OUTPUT_DIR

echo "=== Starting RNA-seq coverage workflow ==="

# Generate strand-specific coverage tracks
echo "Step 1: Generating forward strand coverage..."
bamCoverage \\
    --bam $RNASEQ_BAM \\
    --outFileName $OUTPUT_DIR/forward_coverage.bw \\
    --filterRNAstrand forward \\
    --normalizeUsing CPM \\
    --binSize 1 \\
    --numberOfProcessors $THREADS

echo "Step 2: Generating reverse strand coverage..."
bamCoverage \\
    --bam $RNASEQ_BAM \\
    --outFileName $OUTPUT_DIR/reverse_coverage.bw \\
    --filterRNAstrand reverse \\
    --normalizeUsing CPM \\
    --binSize 1 \\
    --numberOfProcessors $THREADS

echo "=== RNA-seq coverage workflow complete ==="
echo "Results are in: $OUTPUT_DIR"
echo ""
echo "Note: These bigWig files can be loaded into genome browsers"
echo "for strand-specific visualization of RNA-seq data."
"""

    with open(output_file, 'w') as f:
        f.write(script)

    return f"✓ Generated RNA-seq coverage workflow: {output_file}"


def generate_atacseq_workflow(output_file, params):
    """Generate ATAC-seq workflow script."""

    script = f"""#!/bin/bash
# deepTools ATAC-seq Analysis Workflow
# Generated by deepTools workflow generator

# Configuration
ATAC_BAM="{params.get('atac_bam', 'atacseq.bam')}"
PEAKS_BED="{params.get('peaks_bed', 'peaks.bed')}"
GENOME_SIZE={params.get('genome_size', '2913022398')}
THREADS={params.get('threads', '8')}
OUTPUT_DIR="{params.get('output_dir', 'atacseq_analysis')}"

# Create output directory
mkdir -p $OUTPUT_DIR

echo "=== Starting ATAC-seq analysis workflow ==="

# Step 1: Shift reads for Tn5 correction
echo "Step 1: Applying Tn5 offset correction..."
alignmentSieve \\
    --bam $ATAC_BAM \\
    --outFile $OUTPUT_DIR/atacseq_shifted.bam \\
    --ATACshift \\
    --minFragmentLength 38 \\
    --maxFragmentLength 2000 \\
    --ignoreDuplicates

# Index the shifted BAM
samtools index $OUTPUT_DIR/atacseq_shifted.bam

# Step 2: Generate coverage track
echo "Step 2: Generating coverage track..."
bamCoverage \\
    --bam $OUTPUT_DIR/atacseq_shifted.bam \\
    --outFileName $OUTPUT_DIR/atacseq_coverage.bw \\
    --normalizeUsing RPGC \\
    --effectiveGenomeSize $GENOME_SIZE \\
    --binSize 1 \\
    --numberOfProcessors $THREADS

# Step 3: Fragment size analysis
echo "Step 3: Analyzing fragment sizes..."
bamPEFragmentSize \\
    --bamfiles $ATAC_BAM \\
    --histogram $OUTPUT_DIR/fragmentSizes.png \\
    --maxFragmentLength 1000

# Step 4: Compute matrix at peaks (if peaks provided)
if [ -f "$PEAKS_BED" ]; then
    echo "Step 4: Computing matrix at peaks..."
    computeMatrix reference-point \\
        --referencePoint center \\
        --scoreFileName $OUTPUT_DIR/atacseq_coverage.bw \\
        --regionsFileName $PEAKS_BED \\
        --beforeRegionStartLength 2000 \\
        --afterRegionStartLength 2000 \\
        --binSize 10 \\
        --outFileName $OUTPUT_DIR/matrix_peaks.gz \\
        --numberOfProcessors $THREADS

    echo "Step 5: Generating heatmap..."
    plotHeatmap \\
        --matrixFile $OUTPUT_DIR/matrix_peaks.gz \\
        --outFileName $OUTPUT_DIR/heatmap_peaks.png \\
        --colorMap YlOrRd \\
        --refPointLabel "Peak Center" \\
        --heatmapHeight 15
fi

echo "=== ATAC-seq analysis complete ==="
echo "Results are in: $OUTPUT_DIR"
echo ""
echo "Expected fragment size pattern:"
echo "  ~50bp: nucleosome-free regions"
echo "  ~200bp: mono-nucleosome"
echo "  ~400bp: di-nucleosome"
"""

    with open(output_file, 'w') as f:
        f.write(script)

    return f"✓ Generated ATAC-seq workflow: {output_file}"


def main():
    parser = argparse.ArgumentParser(
        description="Generate deepTools workflow scripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=f"""
Available workflows:
{chr(10).join(f"  {key}: {value['name']}" for key, value in WORKFLOWS.items())}

Examples:
  # Generate ChIP-seq QC workflow
  python workflow_generator.py chipseq_qc -o chipseq_qc.sh

  # Generate ChIP-seq analysis with custom parameters
  python workflow_generator.py chipseq_analysis -o analysis.sh \\
      --chip-bam H3K4me3.bam --input-bam Input.bam

  # List all available workflows
  python workflow_generator.py --list
        """
    )

    parser.add_argument('workflow', nargs='?', choices=list(WORKFLOWS.keys()),
                        help='Workflow type to generate')
    parser.add_argument('-o', '--output', default='deeptools_workflow.sh',
                        help='Output script filename (default: deeptools_workflow.sh)')
    parser.add_argument('--list', action='store_true',
                        help='List all available workflows')

    # Common parameters
    parser.add_argument('--threads', type=int, default=8,
                        help='Number of threads (default: 8)')
    parser.add_argument('--genome-size', type=int, default=2913022398,
                        help='Effective genome size (default: 2913022398 for hg38)')
    parser.add_argument('--output-dir', default=None,
                        help='Output directory for results')

    # Workflow-specific parameters
    parser.add_argument('--input-bam', help='Input/control BAM file')
    parser.add_argument('--chip-bam', help='ChIP BAM file')
    parser.add_argument('--chip-bams', help='Multiple ChIP BAM files (space-separated)')
    parser.add_argument('--rnaseq-bam', help='RNA-seq BAM file')
    parser.add_argument('--atac-bam', help='ATAC-seq BAM file')
    parser.add_argument('--genes-bed', help='Genes BED file')
    parser.add_argument('--peaks-bed', help='Peaks BED file')

    args = parser.parse_args()

    # List workflows
    if args.list:
        print("\nAvailable deepTools workflows:\n")
        for key, value in WORKFLOWS.items():
            print(f"  {key}")
            print(f"    {value['name']}")
            print(f"    {value['description']}\n")
        sys.exit(0)

    # Check if workflow was specified
    if not args.workflow:
        parser.print_help()
        sys.exit(1)

    # Prepare parameters
    params = {
        'threads': args.threads,
        'genome_size': args.genome_size,
        'output_dir': args.output_dir or f"{args.workflow}_output",
        'input_bam': args.input_bam,
        'chip_bam': args.chip_bam,
        'chip_bams': args.chip_bams,
        'rnaseq_bam': args.rnaseq_bam,
        'atac_bam': args.atac_bam,
        'genes_bed': args.genes_bed,
        'peaks_bed': args.peaks_bed,
    }

    # Generate workflow
    if args.workflow == 'chipseq_qc':
        message = generate_chipseq_qc_workflow(args.output, params)
    elif args.workflow == 'chipseq_analysis':
        message = generate_chipseq_analysis_workflow(args.output, params)
    elif args.workflow == 'rnaseq_coverage':
        message = generate_rnaseq_coverage_workflow(args.output, params)
    elif args.workflow == 'atacseq':
        message = generate_atacseq_workflow(args.output, params)

    print(message)
    print(f"\nTo run the workflow:")
    print(f"  chmod +x {args.output}")
    print(f"  ./{args.output}")
    print(f"\nNote: Edit the script to customize file paths and parameters.")


if __name__ == "__main__":
    main()
← Back to deeptools