diff --git a/CHANGELOG.md b/CHANGELOG.md index 81c47fdc7..b8b01d04e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,13 +19,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` -- [#428](https://github.com/nf-core/mag/pull/428) - Update to nf-core 2.8 `TEMPLATE` (by @jfy133) +- [#428](https://github.com/nf-core/mag/pull/428) [#467](https://github.com/nf-core/mag/pull/467) - Update to nf-core 2.8, 2.9 `TEMPLATE` (by @jfy133) - [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel, fix by @jfy133) - [#441](https://github.com/nf-core/mag/pull/441) - Deactivated CONCOCT in AWS 'full test' due to very long runtime (fix by @jfy133). - [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133). - [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133) -- [#428](https://github.com/nf-core/mag/pull/429) - Update to nf-core 2.9 `TEMPLATE` (by @jfy133) -- [#437](https://github.com/nf-core/mag/pull/429) - `--gtdb` parameter is split into `--skip_gtdbtk` and `--gtdb_db` to allow finer control over GTDB database retrieval (fix by @jfy133) +- [#477](https://github.com/nf-core/mag/pull/477) - `--gtdb` parameter is split into `--skip_gtdbtk` and `--gtdb_db` to allow finer control over GTDB database retrieval (fix by @jfy133) - [#500](https://github.com/nf-core/mag/pull/500) - Temporarily disabled downstream processing of both refined and raw bins due to bug (by @jfy133) ### `Fixed` diff --git a/README.md b/README.md index 8eaaa1798..3ed797e82 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,11 @@ The pipeline then: - assigns taxonomy to reads using [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) and/or [Kraken2](https://github.com/DerrickWood/kraken2/wiki) - performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast) - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) -- predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal) +- predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and for bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://github.com/soedinglab/metaeuk) - performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/).
+- performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) -- assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad) +- assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or eukaryotes with [Tiara](https://github.com/ibe-uw/tiara) Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions. diff --git a/conf/modules.config b/conf/modules.config index 0c1781a11..081670307 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -223,7 +223,6 @@ process { } withName: CENTRIFUGE { - ext.prefix = { ${meta.id} } publishDir = [ path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, mode: params.publish_dir_mode, @@ -232,7 +231,6 @@ process { } withName: KRAKEN2 { - ext.prefix = { ${meta.id} } ext.args = '--quiet' publishDir = [ path: { "${params.outdir}/Taxonomy/kraken2/${meta.id}" }, diff --git a/docs/images/mag_workflow.png b/docs/images/mag_workflow.png index 8cc89e2af..d4cda1a0e 100644 Binary files a/docs/images/mag_workflow.png and b/docs/images/mag_workflow.png differ diff --git a/docs/images/mag_workflow.svg b/docs/images/mag_workflow.svg index 1d4c99f14..cf9dfc1f8 100644 --- a/docs/images/mag_workflow.svg +++ b/docs/images/mag_workflow.svg @@ -5,7 +5,7 @@ viewBox="0 0 320.14583 160.81199" version="1.1" id="svg8" - inkscape:version="1.2.2 (b0a84865, 2022-12-01)" + inkscape:version="1.3 (1:1.3+202307231459+0e150ed6c4)" sodipodi:docname="mag_workflow.svg" inkscape:export-filename="mag_workflow.png" inkscape:export-xdpi="289.40701" @@ -377,10 +377,10 @@ inkscape:pageopacity="0.0" inkscape:pageshadow="2" inkscape:zoom="1.3311751" - inkscape:cx="787.64995" + inkscape:cx="788.02556" inkscape:cy="281.33038" inkscape:document-units="mm" - inkscape:current-layer="g4116" + inkscape:current-layer="g6248" showgrid="false" inkscape:window-width="1664" inkscape:window-height="1051" @@ -411,7 +411,11 @@ id="grid5642" originx="26.458333" originy="145.52081" - dotted="true" /> + dotted="true" + spacingy="1" + spacingx="1" + units="mm" + visible="false" /> v2.3.0 + y="65.938919">v2.4.0 diff --git a/docs/output.md b/docs/output.md index 902d0f128..31b868837 100644 --- a/docs/output.md +++ b/docs/output.md @@ -425,7 +425,7 @@ By default, only the raw bins (and unbinned contigs) from the actual binning met ⚠️ Due to the ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input`), bin names in DAS Tool's `*_allBins.eval` file will include `Refined`. However, for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DAS Tool to ensure they can be disambiguated from the original bin files in the downstream QC steps. -### Tiara +### Tiara Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly.
This is used in this pipeline to rapidly and with few resources identify the most likely domain classification of each bin or unbin based on its contig identities. diff --git a/docs/usage.md b/docs/usage.md index f61f56215..c991434c2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -154,7 +154,6 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' outdir: './results/' -input: 'data' <...> ``` @@ -191,7 +190,7 @@ To allow also reproducible bin QC with BUSCO, run BUSCO providing already downlo For the taxonomic bin classification with [CAT](https://github.com/dutilh/CAT), when running the pipeline with `--cat_db_generate` the parameter `--save_cat_db` can be used to also save the generated database to allow reproducibility in future runs. Note that when specifying a pre-built database with `--cat_db`, currently the database can not be saved. -When it comes to visualizing taxonomic data using [Krona](https://github.com/marbl/Krona), you have the option to provide a taxonomy file, such as `taxonomy.tab`, using the `--krona_db` parameter. If you don't supply a taxonomy file, Krona is designed to automatically download the required taxonomy data for visualization. If you choose to provide a pre-existing taxonomy file using the `--krona_db` parameter, Krona will use that file for visualization. On the other hand, if you omit the `--krona_db` parameter, Krona will download the necessary taxonomy information automatically to enable visualization. +When it comes to visualizing taxonomic data using [Krona](https://github.com/marbl/Krona), you have the option to provide a taxonomy file, such as `taxonomy.tab`, using the `--krona_db` parameter. If you don't supply a taxonomy file, Krona is designed to automatically download the required taxonomy data for visualization. The taxonomic classification of bins with GTDB-Tk is not guaranteed to be reproducible, since the placement of bins in the reference tree is non-deterministic. However, the authors of the GTDB-Tk article examined the reproducibility on a set of 100 genomes across 50 trials and did not observe any difference (see [https://doi.org/10.1093/bioinformatics/btz848](https://doi.org/10.1093/bioinformatics/btz848)). 
diff --git a/modules.json b/modules.json index 5567070fb..9c3aac661 100644 --- a/modules.json +++ b/modules.json @@ -12,7 +12,7 @@ }, "aria2": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"], "patch": "modules/nf-core/aria2/aria2.diff" }, diff --git a/modules/local/centrifuge.nf b/modules/local/centrifuge.nf index d06bdf37a..1e70896a9 100644 --- a/modules/local/centrifuge.nf +++ b/modules/local/centrifuge.nf @@ -13,7 +13,7 @@ process CENTRIFUGE { output: tuple val("centrifuge"), val(meta), path("results.krona"), emit: results_for_krona path "report.txt" , emit: report - tuple val(meta), path("*kreport.txt") , emit: kreport + tuple val(meta), path("*kreport.txt") , emit: kreport path "versions.yml" , emit: versions script: diff --git a/modules/nf-core/aria2/aria2.diff b/modules/nf-core/aria2/aria2.diff index 5d9b47f32..789fdb44c 100644 --- a/modules/nf-core/aria2/aria2.diff +++ b/modules/nf-core/aria2/aria2.diff @@ -1,15 +1,6 @@ Changes in module 'nf-core/aria2' --- modules/nf-core/aria2/main.nf +++ modules/nf-core/aria2/main.nf -@@ -3,7 +3,7 @@ - tag "$source_url" - label 'process_single' - -- conda "conda-forge::aria2=1.36.0" -+ conda "conda-forge::aria2=1.36.0 conda-forge::tar" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : - 'quay.io/biocontainers/aria2:1.36.0' }" @@ -12,7 +12,7 @@ val source_url diff --git a/modules/nf-core/aria2/main.nf b/modules/nf-core/aria2/main.nf index 0dcd7423a..b6091dad6 100644 --- a/modules/nf-core/aria2/main.nf +++ b/modules/nf-core/aria2/main.nf @@ -3,10 +3,10 @@ process ARIA2 { tag "$source_url" label 'process_single' - conda "conda-forge::aria2=1.36.0 conda-forge::tar" + conda "conda-forge::aria2=1.36.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : - 'quay.io/biocontainers/aria2:1.36.0' }" + 'biocontainers/aria2:1.36.0' }" input: val source_url diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf index aa9a4a9bf..a07ca4162 100644 --- a/subworkflows/local/binning.nf +++ b/subworkflows/local/binning.nf @@ -25,9 +25,7 @@ workflow BINNING { // generate coverage depths for each contig ch_summarizedepth_input = assemblies .map { meta, assembly, bams, bais -> - def meta_keys = meta.keySet() - def meta_new = meta + meta.subMap(meta_keys) - [ meta_new, bams, bais ] + [ meta, bams, bais ] } METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS ( ch_summarizedepth_input ) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 140a809c5..81c93c6f2 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -1,7 +1,3 @@ -params.mag_depths_options = [:] -params.mag_depths_plot_options = [:] -params.mag_depths_summary_options = [:] - include { MAG_DEPTHS } from '../../modules/local/mag_depths' include { MAG_DEPTHS_PLOT } from '../../modules/local/mag_depths_plot' include { MAG_DEPTHS_SUMMARY } from '../../modules/local/mag_depths_summary' diff --git a/workflows/mag.nf b/workflows/mag.nf index 11be85842..825549a0e 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -89,18 +89,18 @@ include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' -include { BINNING } from '../subworkflows/local/binning' -include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' -include { BUSCO_QC } from '../subworkflows/local/busco_qc' -include { VIRUS_IDENTIFICATION} from '../subworkflows/local/virus_identification' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' -include { GUNC_QC } from '../subworkflows/local/gunc_qc' -include { GTDBTK } from '../subworkflows/local/gtdbtk' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' +include { BINNING } from '../subworkflows/local/binning' +include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' +include { BUSCO_QC } from '../subworkflows/local/busco_qc' +include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' +include { CHECKM_QC } from '../subworkflows/local/checkm_qc' +include { GUNC_QC } from '../subworkflows/local/gunc_qc' +include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' -include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' -include { DEPTHS } from '../subworkflows/local/depths' +include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' +include { DEPTHS } from '../subworkflows/local/depths' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -414,7 +414,7 @@ workflow MAG { ch_versions = ch_versions.mix(NANOPLOT_RAW.out.versions.first()) ch_long_reads = ch_raw_long_reads - .map { + .map { meta, reads -> def meta_new = meta - meta.subMap('run') [ meta_new, reads ]