Cromwell slow when running MarkDuplicates

Hi,

I'm using Cromwell and GATK to replicate the best practices preprocessing pipeline on a SLURM cluster. When I run MarkDuplicates on a BAM with 52 million records in a normal bash script, it completes in roughly 30 minutes. When I run the same code as a task in a Cromwell workflow, the task takes 6+ hours to complete. Both bash and Cromwell are given the same amount of memory (48GB). I'm using GATK v4.1.0.0. Are there any reasons why Cromwell would be taking so much longer?
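In case it helps with diagnosis, one way to compare the two runs is SLURM's accounting. The job IDs below are placeholders, and the disk I/O columns are only populated if the cluster's accounting plugin collects them:

```
# Compare the standalone job and the Cromwell-submitted job side by side
# (replace the IDs with the real SLURM job IDs)
sacct -j 1234567,1234568 \
    --format=JobID,JobName%30,Elapsed,MaxRSS,MaxDiskRead,MaxDiskWrite
```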

I've included the source code below.

Thanks,
Tomas

Here is the bash source code:

```
#!/bin/bash
#SBATCH --job-name=time_markduplicates
#SBATCH --output=/home/users/tbencomo/out/run_16GB.out
#SBATCH --error=/home/users/tbencomo/errout/run_16GB.err
#SBATCH --nodes=1
#SBATCH --mem=48000
#SBATCH --cpus-per-task=1
#SBATCH --time=1-00:00:00
#SBATCH --mail-type=END
#SBATCH --workdir=/scratch/groups/carilee/test/

input_bam=/scratch/groups/carilee/test/cromwell-executions/PreProcessingForVariantDiscovery/d32bbf61-9f22-4ecf-b056-ad38cb337e90/call-MarkDuplicates/inputs/560482291/CTR119.hg19.aligned.unsorted.bam
output_bam=time_markduplicatesoutput.bam
metrics_filename=CTR119.hg19.duplicate_metrics

module load biology gatk
gatk MarkDuplicates \
    --INPUT $input_bam \
    --OUTPUT $output_bam \
    --METRICS_FILE $metrics_filename \
    --VALIDATION_STRINGENCY SILENT \
    --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 \
    --ASSUME_SORT_ORDER "queryname" \
    --CREATE_MD5_FILE true
```

My configuration file:
```
include required(classpath("application"))

backend {
default = "SLURM"
providers {
SLURM {
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
config {
filesystems {
local {
localization: ["hard-link", "soft-link", "copy"]
}
}
concurrent-job-limit = 500
script-epilogue = ""
runtime-attributes = """
Int runtime_minutes = 180
Int memory = 32000
String queue = "normal"
"""

# -n swapped for -c. -c == --cpus-per-task which should all the cores to a single node
submit = """
sbatch -J ${job_name} -D ${cwd} -o ${out} -e ${err} -t ${runtime_minutes} \
-p ${queue} \
--mem=${memory} \
--wrap "/bin/bash ${script}"
"""
kill = "scancel ${job_id}"
check-alive = "squeue -j ${job_id}"
job-id-regex = "Submitted batch job (\\d+).*"
}
}
}
}
```
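For reference, given the runtime block of the MarkDuplicates task further down (660 minutes, 48000 MB, the default "normal" queue), I'd expect the submit template above to expand to roughly the following; the paths are illustrative placeholders, not the exact ones Cromwell generates:

```
sbatch -J cromwell_<workflow-id>_MarkDuplicates \
    -D .../call-MarkDuplicates/execution \
    -o .../execution/stdout \
    -e .../execution/stderr \
    -t 660 \
    -p normal \
    --mem=48000 \
    --wrap "/bin/bash .../execution/script"
```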

My WDL file (the relevant task is MarkDuplicates):
```
workflow PreProcessingForVariantDiscovery {
  String sample_name
  String ref_name

  File input_fastq_1
  File input_fastq_2

  # WARNING: Unzip all the reference files!!
  # Some of the command line tools implicitly guess the names of
  # supporting files (e.g. index files) from the main file name,
  # and they get this wrong when the file names include .gz
  File ref_fasta
  File ref_fasta_index
  File ref_dict

  String bwa_commandline
  Int compression_level

  File dbSNP_vcf
  File dbSNP_vcf_index
  Array[File] known_indels_sites_VCFs
  Array[File] known_indels_sites_indices

  String base_file_name = sample_name + "." + ref_name

  String read_group
  String platform_unit
  String platform

  call GetBwaVersion {}

  call FastqToBam {
    input:
      fastq_1 = input_fastq_1,
      fastq_2 = input_fastq_2,
      sample_name = sample_name,
      read_group = read_group,
      platform_unit = platform_unit,
      platform = platform,
      output_file_name = base_file_name + ".unaligned.bam"
  }

  call SamToFastqAndBwaMem {
    input:
      input_bam = FastqToBam.output_bam,
      bwa_commandline = bwa_commandline,
      output_bam_name = base_file_name + ".unmerged",
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      ref_dict = ref_dict
  }

  call MergeBamAlignment {
    input:
      unaligned_bam = FastqToBam.output_bam,
      bwa_commandline = bwa_commandline,
      bwa_version = GetBwaVersion.version,
      aligned_bam = SamToFastqAndBwaMem.output_bam,
      output_bam_name = base_file_name + ".aligned.unsorted",
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      ref_dict = ref_dict
  }

  call MarkDuplicates {
    input:
      input_bam = MergeBamAlignment.output_bam,
      output_bam_name = base_file_name + ".aligned.unsorted.duplicates_marked",
      metrics_filename = base_file_name + ".duplicate_metrics"
  }
}

task GetBwaVersion {
  command <<<
    module load biology bwa
    bwa 2>&1 | \
      grep -e '^Version' | \
      sed 's/Version: //'
  >>>
  runtime {
    runtime_minutes: "5"
    memory: 1000
  }
  output {
    String version = read_string(stdout())
  }
}

task FastqToBam {
  File fastq_1
  File fastq_2

  String output_file_name

  String sample_name
  String read_group
  String platform_unit
  String platform

  command <<<
    module load biology gatk
    gatk FastqToSam \
      --FASTQ=${fastq_1} \
      --FASTQ2=${fastq_2} \
      --OUTPUT=${output_file_name} \
      --PLATFORM_UNIT="${platform_unit}" \
      --PLATFORM="${platform}" \
      --SAMPLE_NAME=${sample_name} \
      -RG="${read_group}"
  >>>
  runtime {
    runtime_minutes: "180"
    memory: 8000
  }
  output {
    File output_bam = output_file_name
  }
}

task SamToFastqAndBwaMem {
  # Not sure what the bare '-' hyphens do in the bash script; assuming they
  # are used to mark certain command line parameters as null
  File input_bam
  String bwa_commandline
  String output_bam_name
  File ref_fasta
  File ref_fasta_index
  File ref_dict

  File ref_amb
  File ref_ann
  File ref_bwt
  File ref_pac
  File ref_sa

  command <<<
    set -o pipefail
    set -e

    module load biology bwa
    module load biology samtools
    module load biology gatk
    gatk SamToFastq \
      --INPUT=${input_bam} \
      --FASTQ=/dev/stdout \
      --INTERLEAVE=true \
      --INCLUDE_NON_PF_READS=true \
    | \
    bwa ${bwa_commandline} ${ref_fasta} /dev/stdin - 2> >(tee ${output_bam_name}.bwa.stderr.log >&2) \
    | \
    samtools view -1 - > ${output_bam_name}.bam
  >>>
  runtime {
    runtime_minutes: "180"
    memory: 32000
  }
  output {
    File output_bam = "${output_bam_name}.bam"
    File bwa_stderr_log = "${output_bam_name}.bwa.stderr.log"
  }
}

task MergeBamAlignment {
  File unaligned_bam
  String bwa_commandline
  String bwa_version
  File aligned_bam
  String output_bam_name
  File ref_fasta
  File ref_fasta_index
  File ref_dict

  command <<<
    module load biology gatk
    gatk MergeBamAlignment \
      --VALIDATION_STRINGENCY SILENT \
      --EXPECTED_ORIENTATIONS FR \
      --ATTRIBUTES_TO_RETAIN X0 \
      --ALIGNED_BAM ${aligned_bam} \
      --UNMAPPED_BAM ${unaligned_bam} \
      --OUTPUT ${output_bam_name}.bam \
      --REFERENCE_SEQUENCE ${ref_fasta} \
      --PAIRED_RUN true \
      --SORT_ORDER "unsorted" \
      --IS_BISULFITE_SEQUENCE false \
      --ALIGNED_READS_ONLY false \
      --CLIP_ADAPTERS false \
      --MAX_RECORDS_IN_RAM 2000000 \
      --ADD_MATE_CIGAR true \
      --MAX_INSERTIONS_OR_DELETIONS -1 \
      --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
      --PROGRAM_RECORD_ID "bwamem" \
      --PROGRAM_GROUP_VERSION "${bwa_version}" \
      --PROGRAM_GROUP_COMMAND_LINE "${bwa_commandline}" \
      --PROGRAM_GROUP_NAME "bwamem" \
      --UNMAPPED_READ_STRATEGY COPY_TO_TAG \
      --ALIGNER_PROPER_PAIR_FLAGS true \
      --UNMAP_CONTAMINANT_READS true
  >>>
  runtime {
    runtime_minutes: "180"
    memory: 24000
  }
  output {
    File output_bam = "${output_bam_name}.bam"
  }
}

task MarkDuplicates {
  File input_bam
  String output_bam_name
  String metrics_filename

  command <<<
    module load biology gatk
    gatk MarkDuplicates \
      --INPUT ${input_bam} \
      --OUTPUT ${output_bam_name}.bam \
      --METRICS_FILE ${metrics_filename} \
      --VALIDATION_STRINGENCY SILENT \
      --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 \
      --ASSUME_SORT_ORDER "queryname" \
      --CREATE_MD5_FILE true
  >>>
  runtime {
    runtime_minutes: "660"
    memory: 48000
  }
  output {
    File output_bam = "${output_bam_name}.bam"
    File duplicate_metrics = "${metrics_filename}"
  }
}
```
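For completeness, I launch the workflow along these lines (the config path, jar name, and input file names here are illustrative):

```
java -Dconfig.file=slurm.conf -jar cromwell.jar \
    run preprocessing.wdl --inputs preprocessing_inputs.json
```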

Answers

  • YannLG Member
    I have just posted a similar issue: gatk/discussion/24125/mark-duplicates-extremely-slow-in-particular-settings
    If you have found an answer, please do not hesitate to share.
  • tbencomo Member
    edited June 16
    Hi,

    In my case, I found that the location of the temp directory used by MarkDuplicates significantly affects performance. Cromwell was setting the temp directory used by Java programs (including MarkDuplicates) to a directory inside my home directory, which is much slower than the scratch space on the cluster I use. Pointing the temp directory at a faster filesystem remedied the speed issue. The config change below worked for me; see the temporary-directory setting.

    ```
    include required(classpath("application"))

    backend {
      default = "SLURM"
      providers {
        SLURM {
          actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
          config {
            filesystems {
              local {
                localization: ["hard-link", "soft-link", "copy"]
              }
            }
            temporary-directory = "$(mkdir -p /tmp/cromwell && echo /tmp/cromwell)"
            concurrent-job-limit = 500
            script-epilogue = ""
            runtime-attributes = """
              Int runtime_minutes = 180
              Int memory = 32000
              String queue = "normal"
            """

            # -n swapped for -c; -c == --cpus-per-task, which should allocate all the cores to a single node
            submit = """
              sbatch -J ${job_name} -D ${cwd} -o ${out} -e ${err} -t ${runtime_minutes} \
                -p ${queue} \
                --mem=${memory} \
                --wrap "/bin/bash ${script}"
            """
            kill = "scancel ${job_id}"
            check-alive = "squeue -j ${job_id}"
            job-id-regex = "Submitted batch job (\\d+).*"
          }
        }
      }
    }
    ```
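
    If you want to confirm which temp directory Java picks up on a compute node, or override it for a single run without touching the Cromwell config, something like this should work (the scratch path is just an example):

    ```
    # Print the JVM's default temp directory on the current node
    java -XshowSettings:properties -version 2>&1 | grep tmpdir

    # Or force the temp directory for a single GATK invocation
    gatk --java-options "-Djava.io.tmpdir=/scratch/groups/carilee/tmp" \
        MarkDuplicates ...
    ```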

    Hope this helps!