Attention:
The frontline support team will be unavailable to answer questions on April 15th and 17th 2019. We will be back soon after. Thank you for your patience and we apologize for any inconvenience!

Code for hard-filtering and QC statistics

Will_GilksWill_Gilks University of Sussex, UKMember ✭✭

I thought I'd post some code for hard-filtering a vcf based-on currrent GATK recommendations (as far as I know).

I'd be interested to hear any suggestions for improvement.

The recommended hard-filters are from http://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set

SNPs:
QualByDepth (QD) 2.0
FisherStrand (FS) 60.0
RMSMappingQuality (MQ) 40.0
MappingQualityRankSumTest (MQRankSum) 12.5
ReadPosRankSumTest (ReadPosRankSum) 8.0

Indels:
QualByDepth (QD) 2.0
FisherStrand (FS) 200.0
ReadPosRankSumTest (ReadPosRankSum) 20.0

#!/bin/sh
#$ -N z_vcf_filter1
#$ -pe openmp 1
#$ -S /bin/sh
#$ -cwd
#$ -j y
#$ -q bioinf.q

## load modules
. /etc/profile.d/modules.sh
module load jre/1.7.0_25
module load gatk/3.4-0

## set variables
vcf_dir=../vcf_prefilter/
in_vcf=unfilt.lhm_rg_HC_raw.vcf
touch filter1.lhm_rg_HC.vcf
out_vcf=filter1.lhm_rg_HC.vcf
refseq=local_reference/dm6.fa



## ____ make vcf for SNPs only ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V ${vcf_dir}${in_vcf} \
-selectType SNP \
    -o snp.${in_vcf}

## ____ Mark bad qual SNPs ____
GenomeAnalysisTK -R ${refseq} \
-T VariantFiltration \
-V snp.${in_vcf} \
--filterExpression "QD<2.0||FS>60.000||MQ<40.00||MQRankSum<-12.5||ReadPosRankSum<-8.0" \
--filterName "GATKsnp" \
   -o marked.snp.${in_vcf}

## ____ Remove bad quality SNPs ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V marked.snp.${in_vcf} \
--excludeFiltered \
    -o snp.${out_vcf}
rm snp.${in_vcf}*
rm marked.snp.${in_vcf}*



## ____ make vcf indels only ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V ${vcf_dir}${in_vcf} \
-selectType INDEL \
    -o indel.${in_vcf}

## ____ Mark bad qual indels ____
GenomeAnalysisTK -R ${refseq} \
-T VariantFiltration \
-V indel.${in_vcf} \
--filterExpression "QD<2.0||FS>200.000||MQ<40.00||ReadPosRankSum<-20.0" \
--filterName "GATKindel" \
    -o marked.indel.${in_vcf}

## ____ Remove bad quality indels ____
GenomeAnalysisTK -R ${refseq} \
-T SelectVariants \
-V marked.indel.${in_vcf} \
--excludeFiltered \
   -o indel.${out_vcf}
rm indel.${in_vcf}
rm marked.indel.${in_vcf}



## ____ Combine filtered SNPs and indels ____
GenomeAnalysisTK -R ${refseq} \
-T CombineVariants \
-genotypeMergeOptions UNIQUIFY \
-V snp.${out_vcf} \
-V indel.${out_vcf} \
    -o ${out_vcf}
rm snp.${out_vcf}*
rm indel.${out_vcf}*





## ____ GENERATE SUMMARY TABLES ____
mkdir gatk_sums

##  ____ Evaluate variants by sample ____
GenomeAnalysisTK -R ${refseq} \
-T VariantEval \
-eval ${out_vcf} \
--evalModule CountVariants \
--stratificationModule Sample \
-noEV \
    -o gatk_sums/${out_vcf}.vareval.txt



##  ____ make locus metrics table ____
GenomeAnalysisTK -R ${refseq} \
-T VariantsToTable \
-V ${out_vcf} \
-F CHROM -F POS -F ID -F VRT -F QD -F MQRankSum -F MQRankSum -F ReadPosRankSum \
-F QUAL -F DP -F MQ -F FS -F AC -F AF -F AN \
-F HOM-REF -F HET -F HOM-VAR -F NO-CALL \
--allowMissingData \
    -o gatk_sums/${out_vcf}.qc.tbl
gzip gatk_sums/${out_vcf}.qc.tbl


## ____ make genotype DEPTH table ____
GenomeAnalysisTK -R ${refseq} \
-T VariantsToTable \
-V form.unfilt.${vcf} \
-F CHROM -F POS -F ID -GF DP \
--allowMissingData \
    -o gatk_sums/${out_vcf}.depth.tbl
gzip gatk_sums/${out_vcf}.depth.tbl`



##  ____ ANALYSIS WITH VCF-TOOLS ____
module load vcftools/0.1.12b
mkdir vcftools_sums

vcftools --vcf ${out_vcf} --SNPdensity 100000 --remove-indels --out              
vcftools_sums/${out_vcf}.snpDens_100kb.txt
vcftools --vcf ${out_vcf} --SNPdensity 100000 --keep-only-indels --out vcftools_sums/${out_vcf}.indelDens_100kb.txt
vcftools --vcf ${out_vcf} --window-pi 100000 --window-pi-step 100000 --out vcftools_sums/${out_vcf}.pi_100kb.txt
vcftools --vcf ${out_vcf} --singletons --out vcftools_sums/${out_vcf}.sing.txt
vcftools --vcf ${out_vcf} --indv-freq-burden --out vcftools_sums/${out_vcf}.ifb.txt
vcftools --vcf ${out_vcf} --het --out vcftools_sums/${out_vcf}.heterozygosity.txt
vcftools --vcf ${out_vcf} --relatedness --out vcftools_sums/${out_vcf}.relate.txt
module unload vcftools/0.1.12b
module unload jre/1.7.0_25
exit
Sign In or Register to comment.