From one sample to multiple samples

I wrote a WDL script that runs four tasks to take a sample from FASTQ files to a raw VCF file.
The script can only handle one sample at a time, and I wonder if there is a way to make it read a folder containing multiple FASTQ files (two files per sample, R1 and R2) and run the same workflow for each sample.
Thanks for your help
This is the script:
# Single-sample pipeline that chains four tasks:
#   step1 (FastqToSam) -> step2 (MarkIlluminaAdapters)
#   -> step3 (SamToFastq | bwa mem | MergeBamAlignment) -> step4 (HaplotypeCaller).
# Every workflow input below is forwarded unchanged to the task(s) that use it.
workflow FromFastqToVCF {
  # Sample name and its two FASTQ files.
  String SAMPLENAME
  File FASTQ1
  File FASTQ2

  # Reference FASTA plus its index and dictionary (consumed by step3 and step4).
  File REFFASTA
  File REFINDEX
  File REFDICT

  # Extra reference companion files handed to step3 only
  # (presumably the BWA index of REFFASTA - confirm against the actual inputs JSON).
  File HG19AMB
  File HG19ANN
  File HG19BWT
  File HG19PAC
  File HG19SA

  # dbSNP file with its index, and the BED intervals file (consumed by step4).
  File DBSNP
  File DBSNPINDEX
  File BEDFILE

  # FASTQ pair -> unmapped BAM.
  call step1 {
    input:
      SampleName = SAMPLENAME,
      FastqR1 = FASTQ1,
      FastqR2 = FASTQ2
  }

  # Unmapped BAM -> adapter-marked BAM.
  call step2 {
    input:
      uBAM = step1.uBAM,
      SampleName = SAMPLENAME
  }

  # Adapter-marked BAM + unmapped BAM -> aligned, merged BAM and its index.
  call step3 {
    input:
      SampleName = SAMPLENAME,
      uBAM = step1.uBAM,
      mBAM = step2.mBAM,
      refFasta = REFFASTA,
      refIndex = REFINDEX,
      refDict = REFDICT,
      hg19amb = HG19AMB,
      hg19ann = HG19ANN,
      hg19bwt = HG19BWT,
      hg19pac = HG19PAC,
      hg19sa = HG19SA
  }

  # Aligned BAM -> raw VCF.
  call step4 {
    input:
      SampleName = SAMPLENAME,
      aBAM = step3.aBAM,
      aBAMIndex = step3.aBAMIndex,
      refFasta = REFFASTA,
      refIndex = REFINDEX,
      refDict = REFDICT,
      dbSNP = DBSNP,
      dbSNPIndex = DBSNPINDEX,
      bedFile = BEDFILE
  }
}

# Converts a pair of FASTQ files into a BAM with `gatk FastqToSam`.
#
# Inputs:
#   FastqR1, FastqR2 - FASTQ files passed as --FASTQ and --FASTQ2.
#   SampleName       - passed as --SAMPLE_NAME and used as the output file-name prefix.
#   outDir           - directory the BAM is written to. Optional; the default is the
#                      path that used to be hard-coded, so existing calls behave as before.
# Output:
#   uBAM - "<outDir>/<SampleName>_fastqtosam.bam".
task step1 {
  File FastqR1
  File FastqR2
  String SampleName
  # Previously repeated verbatim in both the command and the output section.
  String outDir = "/home/projects/cu_10111/data/Test"

  command {
    gatk FastqToSam \
      --FASTQ "${FastqR1}" \
      --FASTQ2 "${FastqR2}" \
      --OUTPUT "${outDir}/${SampleName}_fastqtosam.bam" \
      --SAMPLE_NAME "${SampleName}"
  }

  output {
    File uBAM = "${outDir}/${SampleName}_fastqtosam.bam"
  }
}

# Runs `gatk MarkIlluminaAdapters` on the BAM produced by the previous step.
#
# Inputs:
#   uBAM       - BAM passed as --INPUT.
#   SampleName - prefix for the metrics and output file names.
#   outDir     - directory for the metrics file and output BAM. Optional; the default
#                is the path that used to be hard-coded, so existing calls behave as before.
# Output:
#   mBAM - "<outDir>/<SampleName>_markilluminaadapters.bam".
#   The metrics text file is written next to it but is not declared as a task output.
task step2 {
  File uBAM
  String SampleName
  # Previously repeated verbatim three times across the command and output sections.
  String outDir = "/home/projects/cu_10111/data/Test"

  command {
    gatk MarkIlluminaAdapters \
      --INPUT "${uBAM}" \
      --METRICS "${outDir}/${SampleName}_markilluminaadapters_metrics.txt" \
      --OUTPUT "${outDir}/${SampleName}_markilluminaadapters.bam"
  }

  output {
    File mBAM = "${outDir}/${SampleName}_markilluminaadapters.bam"
  }
}

# Pipes `gatk SamToFastq` -> `bwa mem` -> `gatk MergeBamAlignment` to produce an
# aligned BAM merged with the original unmapped BAM.
#
# Inputs:
#   mBAM       - BAM fed to SamToFastq (--INPUT).
#   uBAM       - BAM passed to MergeBamAlignment as --UNMAPPED_BAM.
#   SampleName - prefix for the output file names.
#   refFasta   - reference passed to bwa mem and to --REFERENCE_SEQUENCE.
#   refIndex, refDict, hg19amb, hg19ann, hg19bwt, hg19pac, hg19sa
#              - not referenced in the command; declared as File inputs, presumably so
#                they are localized alongside refFasta for bwa/GATK to find - confirm.
#   bwaThreads - value for `bwa mem -t`. Optional; defaults to the previously hard-coded 31.
#   outDir     - directory for the output BAM/index and the temp dir. Optional; defaults
#                to the previously hard-coded path, so existing calls behave as before.
# Outputs:
#   aBAM      - "<outDir>/<SampleName>_MergeBamAlignment_piped_plumbed.bam".
#   aBAMIndex - the matching ".bai" (requested via --CREATE_INDEX true).
task step3 {
  File mBAM
  String SampleName
  File refFasta
  File refIndex
  File refDict
  File hg19amb
  File hg19ann
  File hg19bwt
  File hg19pac
  File hg19sa
  File uBAM
  Int bwaThreads = 31
  String outDir = "/home/projects/cu_10111/data/Test"

  command {
    # Without pipefail the exit status of the pipeline is that of MergeBamAlignment
    # alone, so a crash in SamToFastq or bwa mem could go unnoticed.
    set -eo pipefail

    gatk SamToFastq \
      --INPUT "${mBAM}" \
      --FASTQ "/dev/stdout" \
      --CLIPPING_ATTRIBUTE XT --CLIPPING_ACTION 2 --INTERLEAVE true --INCLUDE_NON_PF_READS true \
      --TMP_DIR "${outDir}/temp" \
    | \
    bwa mem -M -t ${bwaThreads} -p "${refFasta}" "/dev/stdin" \
    | \
    gatk MergeBamAlignment \
      --REFERENCE_SEQUENCE "${refFasta}" \
      --UNMAPPED_BAM "${uBAM}" \
      --ALIGNED_BAM "/dev/stdin" \
      --CREATE_INDEX true --ADD_MATE_CIGAR true --CLIP_ADAPTERS false --CLIP_OVERLAPPING_READS true \
      --INCLUDE_SECONDARY_ALIGNMENTS true --MAX_INSERTIONS_OR_DELETIONS -1 --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
      --ATTRIBUTES_TO_RETAIN XS \
      --OUTPUT "${outDir}/${SampleName}_MergeBamAlignment_piped_plumbed.bam" \
      --TMP_DIR "${outDir}/temp"
  }

  output {
    File aBAM = "${outDir}/${SampleName}_MergeBamAlignment_piped_plumbed.bam"
    File aBAMIndex = "${outDir}/${SampleName}_MergeBamAlignment_piped_plumbed.bai"
  }
}

# Calls variants with `gatk HaplotypeCaller` and emits a raw VCF.
#
# Inputs:
#   aBAM       - BAM passed as --input.
#   aBAMIndex  - not referenced in the command; declared as a File input, presumably so
#                the index is localized next to aBAM - confirm.
#   SampleName - prefix for the output VCF file name.
#   refFasta   - reference passed as --reference.
#   refIndex, refDict, dbSNPIndex
#              - not referenced in the command; presumably companions that must sit
#                beside refFasta / dbSNP - confirm.
#   dbSNP      - passed as --dbsnp.
#   bedFile    - passed as --intervals.
#   outDir     - directory the VCF is written to. Optional; the default is the path
#                that used to be hard-coded, so existing calls behave as before.
# Output:
#   rawVCF - "<outDir>/<SampleName>-raw.indels.snps.vcf".
task step4 {
  File aBAM
  File aBAMIndex
  String SampleName
  File refFasta
  File refIndex
  File refDict
  File dbSNP
  File dbSNPIndex
  File bedFile
  # Previously repeated verbatim in both the command and the output section.
  String outDir = "/home/projects/cu_10111/data/Test"

  command {
    gatk HaplotypeCaller \
      --reference "${refFasta}" \
      --input "${aBAM}" \
      --output "${outDir}/${SampleName}-raw.indels.snps.vcf" \
      --dbsnp "${dbSNP}" \
      --intervals "${bedFile}"
  }

  output {
    File rawVCF = "${outDir}/${SampleName}-raw.indels.snps.vcf"
  }
}

Answers

  • ChrisLChrisL Cambridge, MAMember, Broadie, Moderator, Dev

    One comment before I answer - you'll find it easier to format if you wrap the code sections in triple-backticks (i.e. just put ``` on the lines before and after your code sections).

    It sounds like what you want is the scatter construct? Something like:

    # Illustrative sketch, not a complete workflow: the `{ ... }` bodies must be
    # replaced with real `input:` blocks before this will parse.
    # Workflow input: one Pair of files per sample (left = R1, right = R2 as used below).
    Array[Pair[File, File]] inputs
    # The scatter body is evaluated once for each element `p` of `inputs`.
    scatter(p in inputs) {
      # Unpack the two members of the pair.
      File r1 = p.left
      File r2 = p.right
      # task1..task4 are placeholder names (presumably standing in for the
      # question's step1..step4); r1/r2 would be wired into the first call's inputs.
      call task1 { ... }
      call task2 { ... }
      call task3 { ... }
      call task4 { ... }
    }
    
Sign In or Register to comment.