Iteration in FireCloud

manidr — Cambridge, MA — Member

Other than scatter, are there alternate methods to iterate over an array in FireCloud?

I am creating several subsets of data (based on an input dataset and class vector). For each data subset, I want to run a copy number correlation analysis (which requires a scatter to parallelize the calculation of millions of correlation values). I implemented this as a two-level scatter with a sub-workflow. I tested it locally and it works fine in Cromwell v29. But when I try to upload the workflow to FireCloud, I get a "WDL imports not yet supported" error -- it looks like I can't import the sub-workflow.

Any ideas on implementing this workflow are greatly appreciated. I've attached the workflow (cna_analysis_bysubgroup.wdl) and sub-workflow (cna_analysis.wdl).

cna_analysis_bysubgroup.wdl

import "cna_analysis.wdl" as cna_analysis


# Task harmonize_data
# Runs the "harmonize" step of run-pipeline.sh on the RNA, CNA and proteome
# inputs and exposes the tarball it leaves behind.
#
# Inputs:
#   rna, cna, pome - input files, forwarded as -rna / -cna / -f
#   analysisDir    - forwarded as -r
#   codeDir        - forwarded as -c (defaults to the PGDAC src path)
#   dataDir        - forwarded as -d (defaults to the PGDAC data path)
# Output:
#   outputs - harmonize-output.tar, read from the task's working directory
#
# NOTE(review): the script path is written out literally instead of being
# built from codeDir, so overriding codeDir does not change which
# run-pipeline.sh is executed -- confirm that this is intended.
task harmonize_data {
  File rna
  File cna
  File pome
  String analysisDir
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  String dataDir = "/prot/proteomics/Projects/PGDAC/data"

  command {
    set -euo pipefail
    # create matrix files from input gct -- harmonize both rows and columns;
    # every interpolated value is quoted so that a path or directory name
    # containing whitespace reaches the script as a single argument
    /prot/proteomics/Projects/PGDAC/src/run-pipeline.sh harmonize \
      -c "${codeDir}" \
      -d "${dataDir}" \
      -r "${analysisDir}" \
      -rna "${rna}" \
      -cna "${cna}" \
      -f "${pome}"
  }

  output {
    File outputs = "harmonize-output.tar"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "[email protected]"
  }
}



# Task cna_analysis_setup
# Runs the "CNAsetup" step of run-pipeline.sh on the harmonization tarball and
# publishes the files that step writes for the downstream scatter.
#
# Inputs:
#   tarball - forwarded as -i
#   codeDir - forwarded as -c (defaults to the PGDAC src path)
#   groups  - optional expt-design-like file for subgroups; "-g <file>" is
#             only added to the command line when a value is supplied
#   jidMax  - optional; "-pe <n>" is only added when a value is supplied
# Outputs:
#   matrixFiles - rows of file_table.tsv (one row per group; columns in the
#                 order rna, cna, pome)
#   subgroups   - lines of subgroups.txt
#   jidsFile    - jids.txt
#   outputs     - CNAsetup-output.tar
#
# NOTE(review): as written, the script path is a literal and does not follow
# codeDir; the interpolated values are also unquoted -- confirm that none of
# them can contain whitespace.
task cna_analysis_setup {
  File tarball
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  File? groups    # expt-design-like file to use for subgroups
  Int? jidMax

  command {
    set -euo pipefail
    # What the CNAsetup step is expected to do:
    #  - set up directories and code (only needed for the final tar file output)
    #  - take the matrix files from harmonization and create any subsets
    #  - write a table of matrix files (tsv; one line per group; rna, cna, pome)
    #  - write the subgroup list separately (strings cannot go into the file table)
    #  - determine the actual jidMax and write a file listing the jid's
    /prot/proteomics/Projects/PGDAC/src/run-pipeline.sh CNAsetup \
      -i ${tarball} \
      -c ${codeDir} \
      ${"-g " + groups} \
      ${"-pe " + jidMax}
  }

  output {
    File outputs = "CNAsetup-output.tar"
    File jidsFile = "jids.txt"
    Array[String] subgroups = read_lines("subgroups.txt")
    Array[Array[File]] matrixFiles = read_tsv("file_table.tsv")
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "[email protected]"
  }
}



# Task assemble_results
# Unpacks the setup tarball, copies every result table and plot into the
# "cna" directory inside the extracted analysis directory, and re-packs the
# analysis directory as CNA-output.tar.
#
# Inputs:
#   table_files - per-subgroup arrays of result tables (2D array)
#   plot_files  - one plot file per subgroup
#   tarball     - tarball whose first entry names the analysis directory
# Output:
#   outputs - CNA-output.tar, created in the task's working directory
task assemble_results {
  Array[Array[File]] table_files
  Array[File] plot_files
  File tarball

  command {
    set -euo pipefail
    # extract tarball in current directory and set $analysis_dir.
    # sed reads the complete listing and prints only its first line (with
    # everything from the first "/" removed); because the pipe is never closed
    # early, tar does not get SIGPIPE (exit 141) and pipefail can stay enabled
    tar -x -f "${tarball}"
    analysis_dir=`tar -t -f "${tarball}" | sed -n -e '1s/\/.*//' -e '1p'`
    # an empty name would make the following cd misbehave -- fail loudly instead
    if [ -z "$analysis_dir" ]; then
      echo "assemble_results: cannot determine analysis directory from tarball" >&2
      exit 1
    fi
    cd "$analysis_dir"
    # copy result tables/plots to appropriate location
    # (first flatten the table_files 2D array:
    #  using sep=" " creates ["item1.1", ... "item1.n"] ["item2.1", ... "item2.n"] ...
    #  flatten by removing [ ] , " )
    file_list=`echo '${sep=" " table_files}' | tr -d '][,"'`
    # $file_list is left unquoted on purpose: it must split into one word per file
    cp $file_list cna
    cp ${sep=" " plot_files} cna
    # recreate new tarball for output
    cd ..
    tar -c -f CNA-output.tar "$analysis_dir"
  }

  output {
    File outputs = "CNA-output.tar"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "[email protected]"
  }
}




# Workflow run_cna_analysis_on_subgroups
# Harmonizes the inputs, lets cna_analysis_setup split them into subgroups,
# runs the imported run_cna_analysis sub-workflow once per subgroup, and
# collects all per-subgroup tables and plots into a single tarball.
#
# Inputs:
#   rna, cna, pome - input data files handed to harmonize_data
#   analysisDir    - handed to harmonize_data
# Output:
#   final_output - tarball produced by assemble_results
workflow run_cna_analysis_on_subgroups {
  String analysisDir
  File rna
  File cna
  File pome

  # step 1: harmonize the three input data sets
  call harmonize_data {
    input:
      analysisDir=analysisDir,
      rna=rna,
      cna=cna,
      pome=pome
  }

  # step 2: create subgroup matrix files, the subgroup list and the jid list
  call cna_analysis_setup {
    input:
      tarball=harmonize_data.outputs
  }

  # step 3: one sub-workflow run per subgroup; subgroups and matrixFiles are
  # addressed with the same index, and each matrixFiles row is read in the
  # order rna, cna, pome
  scatter (g in range(length(cna_analysis_setup.subgroups))) {
    call cna_analysis.run_cna_analysis as cna_s {
      input:
        jidsFile=cna_analysis_setup.jidsFile,
        prefix=cna_analysis_setup.subgroups[g],
        rna=cna_analysis_setup.matrixFiles[g][0],
        cna=cna_analysis_setup.matrixFiles[g][1],
        pome=cna_analysis_setup.matrixFiles[g][2]
    }
  }

  # step 4: gather the scattered results into the setup tarball's directory tree
  call assemble_results {
    input:
      tarball=cna_analysis_setup.outputs,
      table_files=cna_s.tables,
      plot_files=cna_s.plot
  }

  output {
    File final_output = assemble_results.outputs
  }
}

cna_analysis.wdl

# Task cna_analysis
# Runs cna-analysis.r for a single shard (jid) of one subgroup.
#
# Inputs:
#   rna, cna, pome - matrix files passed to the R script
#   prefix         - subgroup prefix; names the <prefix>-output directory
#   jid            - shard id handled by this call
#   jidMax         - total number of shards
#   codeDir        - directory the R scripts are copied from
# Outputs:
#   rna_cna_corr, rna_cna_pval, pome_cna_corr, pome_cna_pval -
#     per-shard csv files read from <prefix>-output/
task cna_analysis {
  File rna
  File cna
  File pome
  String prefix
  Int jidMax
  Int jid
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"

  command {
    set -euo pipefail
    # setup directories and code; values are quoted so that a prefix or path
    # containing whitespace is not split into several arguments
    cp "${codeDir}/cna-analysis.r" "${codeDir}/generate-cna-plots.r" .
    # -p: succeed silently when the directory is already there
    mkdir -p "${prefix}-output"
    # run cna analysis for this shard
    Rscript cna-analysis.r ${jid} ${jidMax} "${prefix}" "${rna}" "${cna}" "${pome}"
  }

  output {
    File rna_cna_corr = "${prefix}-output/mrna-vs-cna-corr${jid}.csv"
    File rna_cna_pval = "${prefix}-output/mrna-vs-cna-pval${jid}.csv"
    File pome_cna_corr = "${prefix}-output/pome-vs-cna-corr${jid}.csv"
    File pome_cna_pval = "${prefix}-output/pome-vs-cna-pval${jid}.csv"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "[email protected]"
  }
}



# Task gather_results_and_plot
# Moves the per-shard csv files of one subgroup into <prefix>-output and then
# calls cna-analysis.r once more to produce the combined tables and the plot.
#
# Inputs:
#   prefix   - subgroup prefix; names the <prefix>-output directory
#   jidMax   - total number of shards that were run
#   rna_vs_cna_corr, rna_vs_cna_pval, pome_vs_cna_corr, pome_vs_cna_pval -
#              per-shard csv files collected from the scatter
#   codeDir  - directory the R scripts are copied from
#   dataDir  - directory chr-length.csv and gene-location.csv are copied from
# Outputs:
#   tables - every file matching <prefix>-*-vs-*.csv in the working directory
#   plot   - <prefix>-cna-plot.png
task gather_results_and_plot {
  String prefix
  Int jidMax
  Array[File] rna_vs_cna_corr
  Array[File] rna_vs_cna_pval
  Array[File] pome_vs_cna_corr
  Array[File] pome_vs_cna_pval
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  String dataDir = "/prot/proteomics/Projects/PGDAC/data"


  command {
    set -euo pipefail
    # setup directories and code; values are quoted so that a prefix or path
    # containing whitespace is not split into several arguments
    cp "${codeDir}/cna-analysis.r" "${codeDir}/generate-cna-plots.r" .
    cp "${dataDir}/chr-length.csv" "${dataDir}/gene-location.csv" .
    # -p: succeed silently when the directory is already there
    mkdir -p "${prefix}-output"
    # copy results from scatter operation
    # (the array expansions stay unquoted: each file must be its own word)
    mv ${sep=" " rna_vs_cna_corr} "${prefix}-output"
    mv ${sep=" " rna_vs_cna_pval} "${prefix}-output"
    mv ${sep=" " pome_vs_cna_corr} "${prefix}-output"
    mv ${sep=" " pome_vs_cna_pval} "${prefix}-output"
    # NOTE(review): jid 0 with NULL data files presumably selects the
    # gather/plot mode of cna-analysis.r -- confirm against the script
    Rscript cna-analysis.r 0 ${jidMax} "${prefix}" NULL NULL NULL
  }

  output {
    Array[File] tables=glob ("${prefix}-*-vs-*.csv")
    File plot="${prefix}-cna-plot.png"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "[email protected]"
  }
}




# Workflow run_cna_analysis
# Scatters cna_analysis over every jid listed in jidsFile and then hands the
# collected per-shard files to gather_results_and_plot.
#
# Inputs:
#   rna, cna, pome - matrix files passed to every shard
#   prefix         - subgroup prefix passed to both tasks
#   jidsFile       - text file read line by line into the jid list
# Derived values:
#   jids   - contents of jidsFile as integers
#   jidMax - number of entries in jids
# Outputs:
#   tables - tables from gather_results_and_plot
#   plot   - plot from gather_results_and_plot
workflow run_cna_analysis {
  String prefix
  File rna
  File cna
  File pome
  File jidsFile
  Array[Int] jids = read_lines ("${jidsFile}")
  Int jidMax = length (jids)

  # one cna_analysis call per shard id
  scatter (shard in jids) {
    call cna_analysis {
      input:
        prefix=prefix,
        jid=shard,
        jidMax=jidMax,
        rna=rna,
        cna=cna,
        pome=pome
    }
  }

  # outputs of the scattered call are implicitly gathered into arrays
  call gather_results_and_plot {
    input:
      prefix=prefix,
      jidMax=jidMax,
      rna_vs_cna_corr=cna_analysis.rna_cna_corr,
      rna_vs_cna_pval=cna_analysis.rna_cna_pval,
      pome_vs_cna_corr=cna_analysis.pome_cna_corr,
      pome_vs_cna_pval=cna_analysis.pome_cna_pval
  }

  output {
    File plot = gather_results_and_plot.plot
    Array[File] tables = gather_results_and_plot.tables
  }
}

Answers

  • KateN — Cambridge, MA — Member, Broadie, Moderator, admin

    Unfortunately, no. The WDL language offers no other way to iterate over an array; scatter is your only option. I will ask one of our developers to take a look at your workflow and see if they have a solution for you while we wait for import statements to be implemented.

  • KateN — Cambridge, MA — Member, Broadie, Moderator, admin

    Unfortunately, our developer did not have a separate solution for you, and I apologize for neglecting to get back to you on that matter. However, import statements have been supported in FireCloud since November 15th. You can read more about it in the release notes here.

Sign In or Register to comment.