nf-core · ramprasadn · Apr 25, 2023 · Apr 2, 2023 · Apr 2, 2023 · Apr 3, 2023
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -22,6 +22,12 @@
 
   > Vasimuddin Md, Misra S, Li H, Aluru S. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. In: 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE; 2019:314-324. doi:10.1109/IPDPS.2019.00041
 
+- [CADD<sup>1</sup>](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-021-00835-9)<sup>,</sup> [<sup>2</sup>](https://academic.oup.com/nar/article/47/D1/D886/5146191)
+
+  > Rentzsch P, Schubach M, Shendure J, Kircher M. CADD-Splice—improving genome-wide variant effect prediction using deep learning-derived splice scores. Genome Med. 2021;13(1):31. doi:10.1186/s13073-021-00835-9
+
+  > Rentzsch P, Witten D, Cooper GM, Shendure J, Kircher M. CADD: predicting the deleteriousness of variants throughout the human genome. Nucleic Acids Research. 2019;47(D1):D886-D894. doi:10.1093/nar/gky1016
+
 - [DeepVariant](https://www.nature.com/articles/nbt.4235)
 
   > Poplin R, Chang PC, Alexander D, et al. A universal SNP and small-indel variant caller using deep neural networks. Nat Biotechnol. 2018;36(10):983-987. doi:10.1038/nbt.4235

diff --git a/README.md b/README.md
@@ -59,6 +59,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
 - [bcftools roh](https://samtools.github.io/bcftools/bcftools.html#roh)
 - [vcfanno](https://github.com/brentp/vcfanno)
+- [CADD](https://cadd.gs.washington.edu/)
 - [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html)
 
 **6. Annotation - SV:**
@@ -72,6 +73,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 - Annotation:
   - [HaploGrep2](https://github.com/seppinho/haplogrep-cmd)
   - [vcfanno](https://github.com/brentp/vcfanno)
+  - [CADD](https://cadd.gs.washington.edu/)
   - [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html)
 
 **8. Variant calling - repeat expansions:**

diff --git a/assets/cadd_to_vcf_header_-1.0-.txt b/assets/cadd_to_vcf_header_-1.0-.txt
@@ -0,0 +1 @@
+##INFO=<ID=CADD,Number=1,Type=Float,Description="PHRED-like scaled CADD score.">
diff --git a/conf/modules/annotate_cadd.config b/conf/modules/annotate_cadd.config
@@ -0,0 +1,41 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args            = Additional arguments appended to command in module.
+        ext.args2           = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3           = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix          = File name prefix for output files.
+        ext.when            = Conditional clause
+----------------------------------------------------------------------------------------
+*/
+
+//
+// CADD annotation
+//
+
+process {
+    withName: '.*:ANNOTATE_CADD.*' {
+        ext.when = { (params.cadd_resources != null) && ( !(workflow.profile.tokenize(',').intersect(['test', 'test_one_sample']).size() >= 1) || workflow.stubRun) }
+    }
+
+    withName: '.*:ANNOTATE_CADD:BCFTOOLS_VIEW' {
+        ext.args   = { "--output-type z --types indels" }
+        ext.prefix = { "${vcf.simpleName}_indels" }
+    }
+
+    withName: '.*:ANNOTATE_CADD:CADD' {
+        ext.args   = { "-g ${params.genome}" }
+        ext.prefix = { "${vcf.simpleName}_cadd" }
+    }
+
+    withName: '.*:ANNOTATE_CADD:TABIX_CADD' {
+        ext.args = { "--force --sequence 1 --begin 2 --end 2" }
+    }
+
+    withName: '.*:ANNOTATE_CADD:BCFTOOLS_ANNOTATE' {
+        ext.args   = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z" }
+        ext.prefix = { "${input.simpleName}_ann" }
+    }
+}
diff --git a/conf/modules/scatter_genome.config b/conf/modules/scatter_genome.config
@@ -17,12 +17,12 @@
 
 process {
     withName: '.*SCATTER_GENOME:BUILD_BED' {
-        ext.when = { !params.skip_snv_annotation && !(params.analysis_type == "wes")}
+        ext.when = { !params.skip_snv_annotation }
     }
 
     withName: '.*SCATTER_GENOME:GATK4_SPLITINTERVALS' {
         ext.args = { "--subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION --scatter-count 22" }
-        ext.when = { !params.skip_snv_annotation && !(params.analysis_type == "wes")}
+        ext.when = { !params.skip_snv_annotation }
         ext.prefix = { "${meta.id}_genome_intervals" }
         publishDir = [
             enabled: params.save_reference,

diff --git a/docs/output.md b/docs/output.md
@@ -40,6 +40,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Annotation - SNV](#annotation---snv)
   - [bcftools roh](#bcftools-roh)
   - [vcfanno](#vcfanno)
+  - [CADD](#cadd)
   - [VEP](#vep)
 - [Annotation - SV](#annotation---sv)
   - [SVDB query](#svdb-query)
@@ -296,7 +297,13 @@ The pipeline performs variant calling using [Sentieon DNAscope](https://support.
 
 #### vcfanno
 
-[vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple conf file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder by default, and is passed to vep for further annotation.
+[vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple conf file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder by default, and are passed to CADD and/or VEP for further annotation.
+
+We recommend using vcfanno to annotate SNVs with precomputed CADD scores (files can be downloaded from [here](https://cadd.gs.washington.edu/download)).
+
+#### CADD
+
+[CADD](https://cadd.gs.washington.edu/) is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletion variants in the human genome. In nf-core/raredisease, SNVs can be annotated with precomputed CADD scores using vcfanno. However, for small indels, the scores are calculated on the fly by CADD. The output files are not published in the output folder by default, and are passed to VEP for further annotation.
 
 #### VEP
 
@@ -371,6 +378,12 @@ The pipeline for mitochondrial variant discovery, using Mutect2, uses a high sen
 
 [vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple conf file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder by default, and is passed to vep for further annotation.
 
+We recommend using vcfanno to annotate SNVs with precomputed CADD scores (files can be downloaded from [here](https://cadd.gs.washington.edu/download)).
+
+#### CADD
+
+[CADD](https://cadd.gs.washington.edu/) is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletion variants in the human genome. In nf-core/raredisease, SNVs can be annotated with precomputed CADD scores using vcfanno. However, for small indels, the scores are calculated on the fly by CADD. The output files are not published in the output folder by default, and are passed to VEP for further annotation.
+
 ##### VEP
 
 [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html) determines the effect of your variants on genes, transcripts, and protein sequence, as well as regulatory regions.

diff --git a/docs/usage.md b/docs/usage.md
@@ -196,6 +196,7 @@ The mandatory and optional parameters for each category are tabulated below.
 | vcfanno_toml<sup>3</sup>      | vcfanno_lua                    |
 | vep_cache_version             | vep_filters<sup>6</sup>        |
 | vep_cache                     | score_config_snv<sup>7</sup>   |
+|                               | cadd_resources<sup>8</sup>     |
 
 <sup>1</sup>Genome version is used by VEP. You have the option to choose between GRCh37 and GRCh38.<br />
 <sup>2</sup>Path to VCF files and their indices used by vcfanno. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vcfanno_resources.txt).<br />
@@ -204,6 +205,9 @@ The mandatory and optional parameters for each category are tabulated below.
 <sup>5</sup>Used by GENMOD while modeling the variants. Contains a list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv).<br />
 <sup>6</sup> This file contains a list of candidate genes (with [HGNC](https://www.genenames.org/) IDs) that is used to split the variants into canditate variants and research variants. Research variants contain all the variants, while candidate variants are a subset of research variants and are associated with candidate genes. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/hgnc.txt).<br />
 <sup>7</sup>Used by GENMOD for ranking the variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/rank_model_snv.ini).<br />
+<sup>8</sup>Path to a folder containing CADD annotations. It is the equivalent of the data/annotations/ folder described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation), and is used to calculate CADD scores for small indels.<br />
+
+> NB: We use CADD only to annotate small indels. To annotate SNVs with precomputed CADD scores, pass the file containing CADD scores as a resource to vcfanno instead. Files containing the precomputed CADD scores for SNVs can be downloaded from [here](https://cadd.gs.washington.edu/download) (description: "All possible SNVs of GRCh37/hg19" or "All possible SNVs of GRCh38/hg38").
 
 ##### 7. SV annotation & Ranking
 

diff --git a/main.nf b/main.nf
@@ -23,6 +23,7 @@ params.fasta_fai                      = WorkflowMain.getGenomeAttribute(params,
 params.bwa                            = WorkflowMain.getGenomeAttribute(params, 'bwa')
 params.bwamem2                        = WorkflowMain.getGenomeAttribute(params, 'bwamem2')
 params.call_interval                  = WorkflowMain.getGenomeAttribute(params, 'call_interval')
+params.cadd_resources                 = WorkflowMain.getGenomeAttribute(params, 'cadd_resources')
 params.gnomad_af                      = WorkflowMain.getGenomeAttribute(params, 'gnomad_af')
 params.gnomad_af_idx                  = WorkflowMain.getGenomeAttribute(params, 'gnomad_af_idx')
 params.intervals_wgs                  = WorkflowMain.getGenomeAttribute(params, 'intervals_wgs')

diff --git a/modules.json b/modules.json
@@ -5,6 +5,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "bcftools/annotate": {
+                        "branch": "master",
+                        "git_sha": "00567d35852dfde7e30a707b8d2e415dfa9d5970",
+                        "installed_by": ["modules"]
+                    },
                     "bcftools/concat": {
                         "branch": "master",
                         "git_sha": "582ff1755bdd205c65e2ba4c31e0a008dae299ec",
@@ -55,6 +60,11 @@
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
                         "installed_by": ["modules"]
                     },
+                    "cadd": {
+                        "branch": "master",
+                        "git_sha": "2e2f8581f4d2ab4729c2b7bd5da8400b54fb8fdf",
+                        "installed_by": ["modules"]
+                    },
                     "cat/cat": {
                         "branch": "master",
                         "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",

diff --git a/modules/nf-core/bcftools/annotate/main.nf b/modules/nf-core/bcftools/annotate/main.nf
diff --git a/modules/nf-core/bcftools/annotate/meta.yml b/modules/nf-core/bcftools/annotate/meta.yml
diff --git a/modules/nf-core/cadd/main.nf b/modules/nf-core/cadd/main.nf
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		##INFO=<ID=CADD,Number=1,Type=Float,Description="PHRED-like scaled CADD score.">