
Question about setting intervals for variant calling

Hello GATK team!
I am processing non-human WGS data (50 samples) with GATK4; the assembly has more than 11,000 scaffolds.
I have a question about the HaplotypeCaller and JointDiscovery steps.
In my understanding, parallelizing the haplotype calling step with the -nct option is not recommended,
so I want to parallelize these steps over intervals instead.
The longest scaffold is around 160 Mb, and I chopped all the scaffolds into pieces of at most 30 Mb (roughly as sketched below).
I then made 50 interval files (xxx.intervals); some contain only a single 30 Mb interval, and some contain many short scaffolds.
I ran 50 independent jobs for both the HaplotypeCaller and JointDiscovery steps, one job per interval file.
My understanding is that, since I only need SNPs and do not call indels, chopping the sequence at arbitrary points does no harm... Is this correct?
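For reference, this is roughly how such 30 Mb chunks can be generated from the reference's samtools .fai index (a sketch; ref.fasta.fai and all_30mb.intervals are placeholder names):

# Emit one contig:start-end line per chunk of at most 30 Mb,
# using the name ($1) and length ($2) columns of the .fai index.
awk -v chunk=30000000 '{
  name = $1; len = $2;
  for (start = 1; start <= len; start += chunk) {
    end = start + chunk - 1;
    if (end > len) end = len;
    print name ":" start "-" end
  }
}' ref.fasta.fai > all_30mb.intervals
# The combined list is then split into 50 roughly equal xxx.intervals files.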
Below I paste my modified pipelines and examples of my interval files:

#### Script for Haplotype Calling ####

# WORKFLOW DEFINITION
workflow HaplotypeCallerGvcf_GATK4 {
  String sample_name
  File input_bam
  File input_bam_index
  #String input_dir
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  File interval_file
  String gatk_path

  String sample_basename = basename(input_bam, ".bam")
  String sample_num = basename(interval_file, ".intervals")
  String vcf_basename = sample_basename
  #String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
  String output_suffix = ".g.vcf.gz"
  String output_filename = vcf_basename + "_" + sample_num + output_suffix

  # The scatter over calling intervals is done outside this workflow:
  # each of the 50 jobs receives a single interval file.
  #scatter (interval_file in scattered_calling_intervals) {

  # Generate GVCF by interval
  call HaplotypeCaller {
    input:
      #input_bam = select_first([CramToBamTask.output_bam, input_bam]),
      #input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
      interval_list = interval_file,
      input_bam = input_bam,
      input_bam_index = input_bam_index,
      output_filename = output_filename,
      ref_dict = ref_dict,
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      gatk_path = gatk_path
      #make_gvcf = making_gvcf
      #docker = gatk_docker,
  }
  #}

  # Outputs that will be retained when execution is complete
  output {
    #File output_vcf = MergeGVCFs.output_vcf
    #File output_vcf_index = MergeGVCFs.output_vcf_index
    File output_vcf = HaplotypeCaller.output_vcf
    File output_vcf_index = HaplotypeCaller.output_vcf_index
  }
}

# TASK DEFINITIONS

task CramToBamTask {
  # Command parameters
  File ref_fasta
  File ref_fasta_index
  File ref_dict
  File input_cram
  String sample_name

  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20

  command {
    set -e
    set -o pipefail

    samtools view -h -T ${ref_fasta} ${input_cram} |
    samtools view -b -o ${sample_name}.bam -
    samtools index -b ${sample_name}.bam
    mv ${sample_name}.bam.bai ${sample_name}.bai
  }

  output {
    File output_bam = "${sample_name}.bam"
    File output_bai = "${sample_name}.bai"
  }
}

# HaplotypeCaller per-sample in GVCF mode
task HaplotypeCaller {
  File input_bam
  File input_bam_index
  File interval_list
  String output_filename
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  #Float? contamination
  #Boolean make_gvcf

  String gatk_path
  #String? java_options
  #String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
  String java_opt = "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"

  Int? mem_gb
  Int machine_mem_gb = select_first([mem_gb, 7])
  Int command_mem_gb = machine_mem_gb - 1

  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20

  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
      HaplotypeCaller \
      -R ${ref_fasta} \
      -I ${input_bam} \
      -L ${interval_list} \
      -O ${output_filename} \
      -ERC GVCF
  >>>

  #runtime {
  #  docker: docker
  #  memory: machine_mem_gb + " GB"
  #  disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
  #  preemptible: select_first([preemptible_attempts, 3])
  #}

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}
# Merge GVCFs generated per-interval for the same sample
# (defined but not called in this modified workflow)
task MergeGVCFs {
  Array[File] input_vcfs
  Array[File] input_vcfs_indexes
  String output_filename

  String gatk_path

  # Runtime parameters
  String docker
  Int? mem_gb
  #Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  Int machine_mem_gb = select_first([mem_gb, 3])
  Int command_mem_gb = machine_mem_gb - 1

  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
      MergeVcfs \
      --INPUT ${sep=' --INPUT ' input_vcfs} \
      --OUTPUT ${output_filename}
  >>>

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}
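Each of the 50 haplotype calling jobs is launched along these lines (a sketch; cromwell.jar and the per-interval inputs JSON are placeholder names, and each JSON binds one xxx.intervals file to interval_file):

# Hypothetical launch of one of the 50 jobs; the JointGenotyping
# script below is run the same way, once per interval file.
java -jar cromwell.jar run haplotypecaller_gvcf_gatk4.wdl --inputs inputs_A.json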


#### Script for Joint Discovery ####

workflow JointGenotyping {
  File intervals_file

  String callset_name
  #Boolean ready
  File ref_fasta
  File ref_fasta_index
  File ref_dict

  #File dbsnp_vcf
  #File dbsnp_vcf_index

  #Array[String] sample_names
  #Array[File] input_gvcfs
  #Array[File] input_gvcfs_indices
  File cohort_sample_map

  #File eval_interval_list
  #File dbsnp_resource_vcf = dbsnp_vcf
  #File dbsnp_resource_vcf_index = dbsnp_vcf_index

  #String sample_num = basename(intervals_file, ".intervals")
  String output_filename = callset_name

  # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme
  # than a z-score of -4.5, which is a p-value of 3.4e-06; phred-scaled, that is
  # -10 * log10(3.4e-06) = 54.69.
  Float excess_het_threshold = 54.69

  #Int num_of_original_intervals = length(read_lines(unpadded_intervals_file))
  #Array[String] unpadded_intervals = read_lines(unpadded_intervals_file)
  #scatter (idx in range(length(unpadded_intervals))) {

  # the batch_size value was carefully chosen here as it
  # is the optimal value for the amount of memory allocated
  # within the task; please do not change it without consulting
  # the Hellbender (GATK engine) team!
  call ImportGVCFs {
    input:
      #sample_names = sample_names,
      interval = intervals_file,
      workspace_dir_name = "genomicsdb",
      inputs_samples = cohort_sample_map,
      batch_size = 5
  }

  call GenotypeGVCFs {
    input:
      workspace_tar = ImportGVCFs.output_genomicsdb,
      interval = intervals_file,
      output_vcf_filename = "output.vcf.gz",
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      ref_dict = ref_dict
      #dbsnp_vcf = dbsnp_vcf,
      #dbsnp_vcf_index = dbsnp_vcf_index
  }

  call HardFilterAndMakeSitesOnlyVcf {
    input:
      vcf = GenotypeGVCFs.output_vcf,
      vcf_index = GenotypeGVCFs.output_vcf_index,
      excess_het_threshold = excess_het_threshold,
      variant_filtered_vcf_filename = output_filename + ".variant_filtered.vcf.gz"
      #sites_only_vcf_filename = output_filename + ".sites_only.variant_filtered.vcf.gz"
  }

  call GatherVcfs as SitesOnlyGatherVcf {
    input:
      input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf,
      input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index,
      output_vcf_name = output_filename + ".vcf.gz"
  }

  output {
    # outputs from the small callset path through the wdl
    File prepared_vcf = SitesOnlyGatherVcf.output_vcf
    File prepared_vcf_index = SitesOnlyGatherVcf.output_vcf_index

    Boolean done = true
  }
}



task GetNumberOfSamples {
  # (defined in the original pipeline; not called in this modified workflow)
  File sample_name_map
  #String docker
  command <<<
    wc -l ${sample_name_map} | awk '{print $1}'
  >>>
  output {
    Int sample_count = read_int(stdout())
  }
}

task ImportGVCFs {
  #Array[String] sample_names
  #Array[File] input_gvcfs
  #Array[File] input_gvcfs_indices
  File inputs_samples
  String interval

  String workspace_dir_name
  Int batch_size

  command <<<
    # The memory setting here is very important and must be several GB lower
    # than the total memory allocated to the VM because this tool uses
    # a significant amount of non-heap memory for native libraries.
    # Also, testing has shown that the multithreaded reader initialization
    # does not scale well beyond 5 threads, so don't increase beyond that.
    gatk --java-options "-Xmx7g -Xms7g -XX:ConcGCThreads=1" \
      GenomicsDBImport \
      --genomicsdb-workspace-path ${workspace_dir_name} \
      --batch-size ${batch_size} \
      -L ${interval} \
      --sample-name-map ${inputs_samples}

    # Package the workspace so it can be handed to GenotypeGVCFs
    tar -cf ${workspace_dir_name}.tar ${workspace_dir_name}
  >>>
  output {
    File output_genomicsdb = "${workspace_dir_name}.tar"
  }
}

task GenotypeGVCFs {
  File workspace_tar
  String interval

  String output_vcf_filename

  File ref_fasta
  File ref_fasta_index
  File ref_dict

  #File dbsnp_vcf
  #File dbsnp_vcf_index

  command <<<
    set -e

    tar -xf ${workspace_tar}
    WORKSPACE=$(basename ${workspace_tar} .tar)

    gatk --java-options "-Xmx7g -Xms7g -XX:ConcGCThreads=1" \
      GenotypeGVCFs \
      -R ${ref_fasta} \
      -O ${output_vcf_filename} \
      -G StandardAnnotation \
      --only-output-calls-starting-in-intervals \
      --use-new-qual-calculator \
      -V gendb://$WORKSPACE \
      -L ${interval}
  >>>
  output {
    File output_vcf = "${output_vcf_filename}"
    File output_vcf_index = "${output_vcf_filename}.tbi"
  }
}

task HardFilterAndMakeSitesOnlyVcf {
  File vcf
  File vcf_index
  Float excess_het_threshold

  String variant_filtered_vcf_filename
  #String sites_only_vcf_filename
  #String gatk_path

  #String docker
  #Int disk_size

  command {
    set -e

    gatk --java-options "-Xmx32g -Xms32g -XX:ConcGCThreads=1" \
      VariantFiltration \
      --filter-expression "ExcessHet > ${excess_het_threshold}" \
      --filter-name ExcessHet \
      -O ${variant_filtered_vcf_filename} \
      -V ${vcf}
  }
  output {
    File variant_filtered_vcf = "${variant_filtered_vcf_filename}"
    File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi"
  }
}

task GatherVcfs {
  # In this modified pipeline each job produces a single VCF, so this
  # task receives one file rather than a list of per-shard files.
  File input_vcfs_fofn
  File input_vcf_indexes_fofn
  String output_vcf_name

  command <<<
    set -e
    set -o pipefail

    # --ignore-safety-checks makes a big performance difference, so we include it in our invocation
    gatk --java-options "-Xmx60g -Xms60g -XX:ConcGCThreads=1" \
      GatherVcfsCloud \
      --ignore-safety-checks \
      --gather-type BLOCK \
      --input ${input_vcfs_fofn} \
      --output ${output_vcf_name}

    gatk --java-options "-Xmx60g -Xms60g -XX:ConcGCThreads=1" \
      IndexFeatureFile \
      --feature-file ${output_vcf_name}
  >>>
  output {
    File output_vcf = "${output_vcf_name}"
    File output_vcf_index = "${output_vcf_name}.tbi"
  }
}
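
For completeness, the cohort_sample_map that GenomicsDBImport consumes via --sample-name-map is a tab-delimited file with one sample per line: the sample name, then the path to that sample's GVCF for the interval (the names and paths below are placeholders):

#### cohort_sample_map (example) ####
sample01	/path/to/sample01_A.g.vcf.gz
sample02	/path/to/sample02_A.g.vcf.gz
sample03	/path/to/sample03_A.g.vcf.gz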


#### interval files ####
# A.intervals
HiC_scaffold_10:30000001-30097849
HiC_scaffold_11:1-24811943

# B.intervals
HiC_scaffold_10:1-30000000

# C.intervals
HiC_scaffold_4888:1-2392
HiC_scaffold_4889:1-2392
HiC_scaffold_4890:1-2389
HiC_scaffold_4891:1-2388
HiC_scaffold_4892:1-2387
HiC_scaffold_4893:1-2387
HiC_scaffold_4894:1-2386
HiC_scaffold_4895:1-2385
HiC_scaffold_4896:1-2385
HiC_scaffold_4897:1-2384
HiC_scaffold_4898:1-2384
HiC_scaffold_4899:1-2384
HiC_scaffold_4900:1-2384
HiC_scaffold_4901:1-2383
HiC_scaffold_4902:1-2382
HiC_scaffold_4903:1-2382
HiC_scaffold_4904:1-2381
HiC_scaffold_4905:1-2381
HiC_scaffold_4906:1-2381
HiC_scaffold_4907:1-2380
HiC_scaffold_4908:1-2380
HiC_scaffold_4909:1-2380
HiC_scaffold_4910:1-2380
HiC_scaffold_4911:1-2380
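
Because the chopping is arbitrary, I also sanity-check that the 50 files together cover every base exactly once (a sketch; it assumes the contig:start-end format shown above and the samtools .fai index, and the two printed totals should match):

# Total length covered by all interval files
cat *.intervals | tr ':-' '  ' | awk '{sum += $3 - $2 + 1} END {print sum}'
# Total genome length from the .fai index
awk '{sum += $2} END {print sum}' ref.fasta.fai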
