To celebrate the release of GATK 4.0, we are giving away free credits for running the GATK4 Best Practices pipelines in FireCloud, our secure online analysis portal. It’s first come first serve, so sign up now to claim your free credits worth $250. Sponsored by Google Cloud. Learn more at https://software.broadinstitute.org/firecloud/documentation/freecredits

Code for hard-filtering and QC statistics

Will_GilksWill_Gilks University of Sussex, UKMember

I thought I'd post some code for hard-filtering a vcf based-on currrent GATK recommendations (as far as I know).

I'd be interested to hear any suggestions for improvement.

The recommended hard-filters are from http://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set

SNPs:
QualByDepth (QD) 2.0
FisherStrand (FS) 60.0
RMSMappingQuality (MQ) 40.0
MappingQualityRankSumTest (MQRankSum) 12.5
ReadPosRankSumTest (ReadPosRankSum) 8.0

Indels:
QualByDepth (QD) 2.0
FisherStrand (FS) 200.0
ReadPosRankSumTest (ReadPosRankSum) 20.0

#!/bin/sh
#$ -N z_vcf_filter1
#$ -pe openmp 1
#$ -S /bin/sh
#$ -cwd
#$ -j y
#$ -q bioinf.q

## load modules
. /etc/profile.d/modules.sh
module load jre/1.7.0_25
module load gatk/3.4-0

## set variables
vcf_dir=../vcf_prefilter/
in_vcf=unfilt.lhm_rg_HC_raw.vcf
touch filter1.lhm_rg_HC.vcf
out_vcf=filter1.lhm_rg_HC.vcf
refseq=local_reference/dm6.fa



## ____ make vcf for SNPs only ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V ${vcf_dir}${in_vcf} \
-selectType SNP \
    -o snp.${in_vcf}

## ____ Mark bad qual SNPs ____
GenomeAnalysisTK -R ${refseq} \
-T VariantFiltration \
-V snp.${in_vcf} \
--filterExpression "QD<2.0||FS>60.000||MQ<40.00||MQRankSum<-12.5||ReadPosRankSum<-8.0" \
--filterName "GATKsnp" \
   -o marked.snp.${in_vcf}

## ____ Remove bad quality SNPs ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V marked.snp.${in_vcf} \
--excludeFiltered \
    -o snp.${out_vcf}
rm snp.${in_vcf}*
rm marked.snp.${in_vcf}*



## ____ make vcf indels only ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V ${vcf_dir}${in_vcf} \
-selectType INDEL \
    -o indel.${in_vcf}

## ____ Mark bad qual indels ____
GenomeAnalysisTK -R ${refseq} \
-T VariantFiltration \
-V indel.${in_vcf} \
--filterExpression "QD<2.0||FS>200.000||MQ<40.00||ReadPosRankSum<-20.0" \
--filterName "GATKindel" \
    -o marked.indel.${in_vcf}

## ____ Remove bad quality indels ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V marked.indel.${in_vcf} \
--excludeFiltered \
   -o indel.${out_vcf}
rm indel.${in_vcf}
rm marked.indel.${in_vcf}



## ____ Combine filtered SNPs and indels ____
GenomeAnalysisTK -R ${refseq} \
-T CombineVariants \
-genotypeMergeOptions UNIQUIFY \
-V snp.${out_vcf} \
-V indel.${out_vcf} \
    -o ${out_vcf}
rm snp.${out_vcf}*
rm indel.${out_vcf}*





## ____ GENERATE SUMMARY TABLES ____
mkdir gatk_sums

##  ____ Evaluate variants by sample ____
GenomeAnalysisTK -R ${refseq} \
-T VariantEval \
-eval ${out_vcf} \
--evalModule CountVariants \
--stratificationModule Sample \
-noEV \
    -o gatk_sums/${out_vcf}.vareval.txt



##  ____ make locus metrics table ____
GenomeAnalysisTK -R ${refseq} \
-T VariantsToTable \
-V ${out_vcf} \
-F CHROM -F POS -F ID -F VRT -F QD -F MQRankSum -F MQRankSum -F ReadPosRankSum \
-F QUAL -F DP -F MQ -F FS -F AC -F AF -F AN \
-F HOM-REF -F HET -F HOM-VAR -F NO-CALL \
--allowMissingData \
    -o gatk_sums/${out_vcf}.qc.tbl
gzip gatk_sums/${out_vcf}.qc.tbl


## ____ make genotype DEPTH table ____
GenomeAnalysisTK -R ${refseq} \
-T VariantsToTable \
-V form.unfilt.${vcf} \
-F CHROM -F POS -F ID -GF DP \
--allowMissingData \
    -o gatk_sums/${out_vcf}.depth.tbl
gzip gatk_sums/${out_vcf}.depth.tbl`



##  ____ ANALYSIS WITH VCF-TOOLS ____
module load vcftools/0.1.12b
mkdir vcftools_sums

vcftools --vcf ${out_vcf} --SNPdensity 100000 --remove-indels --out              
vcftools_sums/${out_vcf}.snpDens_100kb.txt
vcftools --vcf ${out_vcf} --SNPdensity 100000 --keep-only-indels --out vcftools_sums/${out_vcf}.indelDens_100kb.txt
vcftools --vcf ${out_vcf} --window-pi 100000 --window-pi-step 100000 --out vcftools_sums/${out_vcf}.pi_100kb.txt
vcftools --vcf ${out_vcf} --singletons --out vcftools_sums/${out_vcf}.sing.txt
vcftools --vcf ${out_vcf} --indv-freq-burden --out vcftools_sums/${out_vcf}.ifb.txt
vcftools --vcf ${out_vcf} --het --out vcftools_sums/${out_vcf}.heterozygosity.txt
vcftools --vcf ${out_vcf} --relatedness --out vcftools_sums/${out_vcf}.relate.txt
module unload vcftools/0.1.12b
module unload jre/1.7.0_25
exit
Sign In or Register to comment.