Added VDJ concatenation modules and added MuData object by saraterzo · Pull Request #435 · nf-core/scrnaseq

Open · wants to merge 30 commits into base: dev from MuData_implementation

Commits (30)
- `5080be6` Added MuData object (SaraTerzol, Feb 27, 2025)
- `7852c46` Added config parameter to allow module structure (SaraTerzol, Feb 28, 2025)
- `4741d10` Merge branch 'dev' into MuData_implementation (matbonfanti, Feb 28, 2025)
- `9020e9f` Change script's name (SaraTerzol, Mar 3, 2025)
- `2a0331b` Modify module to handle empty files (SaraTerzol, Mar 5, 2025)
- `a19b49e` Merge remote-tracking branch 'refs/remotes/origin/MuData_implementati… (SaraTerzol, Mar 5, 2025)
- `56cd8cc` Modify the channel to handle the absence of the VDJ file (SaraTerzol, Mar 11, 2025)
- `7e2e38a` Merge branch 'dev' into MuData_implementation (saraterzo, Mar 11, 2025)
- `8e903af` Added version to modules (SaraTerzol, Mar 11, 2025)
- `2ceb3e9` Added version to modules (SaraTerzol, Mar 11, 2025)
- `654e65a` Update output documentation (SaraTerzol, Mar 11, 2025)
- `6a03158` Added MuData implementation description to output documentation (SaraTerzol, Mar 11, 2025)
- `755a69d` Added option for handling VDJ missing (SaraTerzol, Mar 12, 2025)
- `25519c4` Modify test to handle changes in output data (SaraTerzol, Mar 13, 2025)
- `71695ac` Added the option to create a MuData object only when Cell Ranger Mult… (SaraTerzol, Mar 13, 2025)
- `096e080` Change permission (SaraTerzol, Mar 13, 2025)
- `5c2ea4b` Trailing whitespace (SaraTerzol, Mar 20, 2025)
- `f53418d` Merge branch 'dev' into MuData_implementation (saraterzo, Mar 26, 2025)
- `2302b58` Merge branch 'dev' into MuData_implementation (saraterzo, Mar 27, 2025)
- `c783c0f` Adjust indentation (SaraTerzol, Mar 27, 2025)
- `a303817` Adjust indentation (SaraTerzol, Mar 27, 2025)
- `3b7d9a8` Remove test (SaraTerzol, Mar 27, 2025)
- `e311262` Trailing whitespace (SaraTerzol, Mar 27, 2025)
- `c0ad49c` Changed output channel (SaraTerzol, Mar 28, 2025)
- `1d773cc` Merge branch 'dev' into MuData_implementation (saraterzo, Mar 31, 2025)
- `3ccfae8` Remove checks on number of results and tasks executed (SaraTerzol, Mar 31, 2025)
- `f334e76` Added check on results and task (SaraTerzol, Mar 31, 2025)
- `5902d3d` Added nf-test.config (SaraTerzol, Apr 1, 2025)
- `af4576b` Remove check on tasks and results (SaraTerzol, Apr 1, 2025)
- `cb74297` Merge branch 'dev' into MuData_implementation (grst, Apr 25, 2025)
12 changes: 12 additions & 0 deletions docs/output.md
@@ -20,6 +20,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Cellranger ARC](#cellranger-arc)
- [Cellranger multi](#cellranger-multi)
- [Cellbender remove background filter](#cellbender-remove-background-filter)
- [VDJ concatenation](#vdj-concatenation)
- [Multimodal Data implementation](#multimodal-data-implementation)
- [Other output data](#other-output-data)
- [MultiQC](#multiqc)

@@ -133,6 +135,16 @@ The pipeline also possess a subworkflow imported from scdownstream to perform fi

- Contains the cellbender filtered matrices results generated by the remove background functionality.

## VDJ concatenation
The per-sample `filtered_contig_annotations` tables are concatenated to generate a unified high-level annotation for each high-confidence cellular contig. This step takes one `filtered_contig_annotations.csv` file per sample and produces a single AnnData object saved as an `.h5ad` file.

**Output directory: `results/concatenate`**
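
For illustration, a minimal sketch of how this output could be inspected downstream; the file name `combined.vdj.h5ad` is assumed from the module's default output and may differ in practice:

```python
import scanpy as sc

# Path assumed from the output directory above plus the script's default file name.
adata_vdj = sc.read_h5ad("results/concatenate/combined.vdj.h5ad")
print(adata_vdj)
# The 'sample' column is only present when more than one file was concatenated.
print(adata_vdj.obs.get("sample"))
```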

## Multimodal Data implementation
This step handles multimodal data. It takes `.h5ad` AnnData objects from the Gene Expression (GEX), Cellular Indexing of Transcriptomes and Epitopes by sequencing (CITE-seq), and V(D)J modalities and constructs a MuData object, which is saved as an `.h5mu` file.

**Output directory: `results/convert`**
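
Similarly, a hedged sketch of loading the resulting MuData object; the directory comes from above, while the file name `matrix.mudata.h5mu` is assumed from the conversion script's default:

```python
import mudata as md

# Read the multimodal object and access its modalities.
mdata = md.read_h5mu("results/convert/matrix.mudata.h5mu")
print(mdata)          # lists the stored modalities, e.g. gex / pro / airr as assembled by convert_mudata.py
gex = mdata["gex"]    # the Gene Expression AnnData, if present
```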

## Other output data

**Output directory: `results/reference_genome`**
41 changes: 41 additions & 0 deletions modules/local/concatenate_vdj/main.nf
@@ -0,0 +1,41 @@
process CONCATENATE_VDJ {
tag "$meta.id"
label 'process_single'

container = 'quay.io/biocontainers/scirpy:0.20.1--pyhdfd78af_0'

input:
tuple val(meta), path(input_vdj, stageAs: '?/*')
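    // note: stageAs '?/*' stages each incoming CSV in its own numbered subdirectory (1/, 2/, ...), so same-named files from different samples do not collide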

output:
tuple val(meta), path("*.vdj.h5ad") , emit: h5ad, optional: true
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
"""
export NUMBA_CACHE_DIR=/tmp
export MPLCONFIGDIR=/tmp
export XDG_CONFIG_HOME=/tmp

concatenate_vdj.py -ai ${input_vdj.join(' ')} -id ${meta.collect{ it.id }.join(' ')}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
    concatenate_vdj.py: \$(concatenate_vdj.py --version)
END_VERSIONS

"""

stub:
"""
touch combined_vdj.h5ad

cat <<-END_VERSIONS > versions.yml
"${task.process}":
    concatenate_vdj.py: \$(concatenate_vdj.py --version)
END_VERSIONS
"""
}
127 changes: 127 additions & 0 deletions modules/local/concatenate_vdj/resources/usr/bin/concatenate_vdj.py
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# ====================================================================================================================
# PRELIMINARIES
# ====================================================================================================================

# MODULE IMPORT
import warnings
import argparse # command line arguments parser
import pathlib # library for handling filesystem paths
import glob
import os
import scanpy as sc # single-cell data processing
import scirpy as ir # single-cell AIRR-data
import anndata as ad # store annotated matrix as anndata object


warnings.filterwarnings("ignore")
Review comment (Member):
It would be better to filter only specific (expected) warnings, e.g. by category or message.
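
As a sketch of the narrower filtering the reviewer suggests (the categories and message pattern below are assumptions, not taken from the PR):

```python
import warnings

# Silence only the warning types expected from the scanpy/scirpy imports,
# rather than suppressing everything.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message=".*Transforming to str index.*")
```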


# PARAMETERS
# set script version number
VERSION = "0.0.1"


# ====================================================================================================================
# MAIN FUNCTION
# ====================================================================================================================

def main():
"""
This function concatenates csv files from the vdj modality.
"""
# --------------------------------------------------------------------------------------------------------------------
# LIBRARY CONFIG
# --------------------------------------------------------------------------------------------------------------------

sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()

# --------------------------------------------------------------------------------------------------------------------
# INPUT FROM COMMAND LINE
# --------------------------------------------------------------------------------------------------------------------

# Define command line arguments with argparse

parser = argparse.ArgumentParser(prog='Concatenate_vdj', usage='%(prog)s [options]', description="VDJ data concatenation",
                                 epilog="This script concatenates vdj filtered contig annotation files into a single AnnData object.")
parser.add_argument('-ai', '--input-vdj-dir', metavar='VDJ_INPUT_FILES',nargs='+',type=pathlib.Path, dest='input_vdj_files',
help="paths of existing directory containing vdj matrix files in csv format (including file names)")
parser.add_argument('-id', '--input-run-id', metavar='INPUT_RUN_ID', nargs='+', dest='input_run_id',
help="names of the run-id corresponding to the input adata")
parser.add_argument('-o', '--out', metavar='H5AD_OUTPUT_FILE', type=pathlib.Path, default="combined.vdj.h5ad",
help="name of the h5ad object containing the concatenated vdj table")
parser.add_argument('-v', '--version', action='version', version=VERSION)
args = parser.parse_args()

# --------------------------------------------------------------------------------------------------------------------
# DEFINE SAMPLES AND MTX PATHS
# --------------------------------------------------------------------------------------------------------------------

print("\n===== VDJ FILES =====")
input_vdj_file = args.input_vdj_files
input_run_id = args.input_run_id
output = args.out

# print info on the available matrices
print("Reading vdj matrix from the following files:")
for run, mtx in zip(input_run_id, input_vdj_file):
print(f"Run: {run:15s} - File: {mtx}")


# --------------------------------------------------------------------------------------------------------------------
# READ VDJ FILES
# --------------------------------------------------------------------------------------------------------------------
vdj_files = []
for folder in glob.glob("*/filtered_contig_annotations.csv"):
vdj_files.append(folder)

adata_vdj_list = []
if vdj_files:

for run, vdj in zip(input_run_id,vdj_files):
# Read folders with the filtered contigue annotation and store datasets in a dictionary
print("\n===== READING CONTIGUE ANNOTATION MATRIX =====")
print("\nProcessing filtered contigue table in folder ... ", end ='')
Review comment (Member):
Suggested change:
-            print("\nProcessing filtered contigue table in folder ... ", end ='')
+            print("\nProcessing filtered contig table in folder ... ", end ='')

if os.path.getsize(vdj) == 0:
print(f"Warning: {vdj} is empty and will be skipped.")
else:
adata_vdj= ir.io.read_10x_vdj(vdj)
print("Done!")
adata_vdj_list.append(adata_vdj)
else:
print("No valid input file provided. Skipping reading of the vdj annotation.")

# --------------------------------------------------------------------------------------------------------------------
# VDJ TABLE CONCATENATION
# --------------------------------------------------------------------------------------------------------------------

print("\n===== CONCATENATING VDJ TABLES =====")

if len(adata_vdj_list) == 0:
print("No valid files were found. Nothing to save.")
else:
if len(adata_vdj_list) == 1:
adata_vdj_concatenated = adata_vdj_list[0]
print("Only one non-empty file found. Saving the file as is without concatenation.")
else:
Comment on lines +103 to +106
Review comment (Member):
is it necessary to special-case this? i.e. wouldn't ad.concat just work fine with a single file?
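
A quick, self-contained check of the reviewer's point on toy data (whether scirpy-specific slots such as `.obsm["airr"]` survive unchanged would still need verifying):

```python
import anndata as ad
import numpy as np

# anndata.concat also accepts a single-element list, so the one-file
# branch above may not need special-casing.
a = ad.AnnData(X=np.zeros((3, 2)))
combined = ad.concat([a], join="outer", merge="same", label="sample", keys=["run1"], index_unique="_")
print(combined.shape)                    # (3, 2)
print(combined.obs["sample"].unique())   # ['run1']
```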

adata_vdj_concatenated = ad.concat(adata_vdj_list, join= "outer", merge ="same", label="sample",
keys= input_run_id, index_unique="_")

print(f"Concatenated vdj table for {len(input_run_id)} batched has {adata_vdj_concatenated.shape[0]} cells")
print("Done!")
# --------------------------------------------------------------------------------------------------------------------
# SAVE OUTPUT FILE
# --------------------------------------------------------------------------------------------------------------------

print("\n===== SAVING OUTPUT FILE =====")

print(f"Saving vdj table data in {output}")
adata_vdj_concatenated.write(output)
print("Done!")


#####################################################################################################


if __name__ == '__main__':
main()
44 changes: 44 additions & 0 deletions modules/local/convert_mudata/main.nf
@@ -0,0 +1,44 @@
process CONVERT_MUDATA {
tag "$meta.id"
label 'process_single'

container = 'quay.io/biocontainers/scirpy:0.20.1--pyhdfd78af_0'

input:
tuple val(meta), path(input_h5ad)
tuple val(meta), path (input_vdj)

output:
tuple val(meta), path("*.mudata.h5mu") , emit: h5mu
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
def ai = input_vdj ? "-ai $input_vdj" : ''

"""
export NUMBA_CACHE_DIR=/tmp
export MPLCONFIGDIR=/tmp
export XDG_CONFIG_HOME=/tmp

convert_mudata.py -ad $input_h5ad $ai

cat <<-END_VERSIONS > versions.yml
"${task.process}":
    convert_mudata.py: \$(convert_mudata.py --version)
END_VERSIONS

"""

stub:
"""
touch matrix.mudata.h5mu

cat <<-END_VERSIONS > versions.yml
"${task.process}":
    convert_mudata.py: \$(convert_mudata.py --version)
END_VERSIONS
"""
}
134 changes: 134 additions & 0 deletions modules/local/convert_mudata/resources/usr/bin/convert_mudata.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# ====================================================================================================================
# PRELIMINARIES
# ====================================================================================================================

# MODULE IMPORT
import warnings
import argparse # command line arguments parser
import pathlib # library for handling filesystem paths
import scanpy as sc # single-cell data processing
from mudata import MuData


warnings.filterwarnings("ignore")
Review comment (Member):
It would be better to filter only specific (expected) warnings, e.g. by category or message.


# PARAMETERS
# set script version number
VERSION = "0.0.1"


# ====================================================================================================================
# MAIN FUNCTION
# ====================================================================================================================

def main():
"""
This function creates a MuData object.
"""
# --------------------------------------------------------------------------------------------------------------------
# LIBRARY CONFIG
# --------------------------------------------------------------------------------------------------------------------

sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()

# --------------------------------------------------------------------------------------------------------------------
# INPUT FROM COMMAND LINE
# --------------------------------------------------------------------------------------------------------------------

# Define command line arguments with argparse

parser = argparse.ArgumentParser(prog='Create MuData object', usage='%(prog)s [options]', description="MuData object conversion",
                                 epilog="This script creates a MuData object for storing GEX, VDJ and CITE-seq data.")
parser.add_argument('-ad','--input-gex-file',metavar= 'GEX_INPUT_FILES', type=pathlib.Path, dest='input_gex_files',
help="paths of existing count matrix files in h5ad format (including file names)")
parser.add_argument('-ai', '--input-vdj-file', metavar='VDJ_INPUT_FILES',type=pathlib.Path, dest='input_vdj_files',
default=pathlib.Path(''),help="paths of existing vdj matrix files in h5ad format (including file names)")
parser.add_argument('-o', '--out', metavar='MUDATA_OUTPUT_FILE', type=pathlib.Path, default="matrix.mudata.h5mu",
help="name of the muData object")
parser.add_argument('-v', '--version', action='version', version=VERSION)
args = parser.parse_args()

# --------------------------------------------------------------------------------------------------------------------
# DEFINE SAMPLES AND MTX PATHS
# --------------------------------------------------------------------------------------------------------------------

print("\n===== INPUT GEX and VDJ FILES =====")
input_gex_file = args.input_gex_files
input_vdj_file = args.input_vdj_files
output = args.out

# print info on the available matrices
print("Reading combined gex count matrix from the following file:")
print(f"-File {input_gex_file}")

print("Reading filtered annotation table from the following file:")
print(f"-File {input_vdj_file}")

# --------------------------------------------------------------------------------------------------------------------
# READ GEX AND AB FILES
# --------------------------------------------------------------------------------------------------------------------
if input_gex_file:
# Read folders with the MTX combined count matrix and store datasets in a dictionary
print("\n===== READING COMBINED MATRIX =====")
# read the gex count matrix for the combined samples and print some initial info
print("\nProcessing count matrix in folder ... ", end ='')
adata= sc.read_h5ad(input_gex_file)
print("Done!")
print(f"Gex count matrix for combined samples has {adata.shape[0]} cells and {adata.shape[1]} genes")
else:
print("No valid input file provided. Skipping reading of the count matrix.")

# --------------------------------------------------------------------------------------------------------------------
# READ VDJ FILES
# --------------------------------------------------------------------------------------------------------------------
if input_vdj_file and input_vdj_file != pathlib.Path(''):
# Read folders with the filtered contig annotation and store datasets in a dictionary
print("\n===== READING CONTIG ANNOTATION MATRIX =====")
# read the filtered contig annotation file for the combined samples and print some initial info
print("\nProcessing filtered contig table in folder ... ", end ='')
adata_vdj= sc.read_h5ad(input_vdj_file)
print("Done!")
else:
print("No valid input file provided. Skipping reading of the vdj annotation.")

# --------------------------------------------------------------------------------------------------------------------
# CREATE MUDATA OBJECT
# --------------------------------------------------------------------------------------------------------------------
#Creates dictionary to store all modalities
modalities = {}
try:
# Add 'gex' modality if defined
if adata[:, adata.var["feature_types"] == "Gene Expression"].shape[1] > 0:
Review comment (Member):
Suggested change:
-    if adata[:, adata.var["feature_types"] == "Gene Expression"].shape[1] > 0:
+    if any(adata.var["feature_types"] == "Gene Expression"):
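
A small sanity check of that equivalence on toy data (sketch only; the AnnData below is made up):

```python
import anndata as ad
import numpy as np
import pandas as pd

var = pd.DataFrame(
    {"feature_types": ["Gene Expression", "Antibody Capture", "Gene Expression"]},
    index=["GENE1", "CD3_TotalSeq", "GENE2"],
)
adata = ad.AnnData(X=np.zeros((2, 3)), var=var)

for feature in ["Gene Expression", "Antibody Capture", "CRISPR Guide Capture"]:
    mask = adata.var["feature_types"] == feature
    # subsetting-then-checking the shape and any() over the mask agree
    assert (adata[:, mask].shape[1] > 0) == bool(mask.any())
```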

modalities["gex"] = adata[:, adata.var["feature_types"] == "Gene Expression"]
# Add 'pro' modality if defined
if adata[:, adata.var["feature_types"] == "Antibody Capture"].shape[1] > 0:
modalities["pro"] = adata[:, adata.var["feature_types"] == "Antibody Capture"]
Review comment (Member):
Suggested change:
-        modalities["pro"] = adata[:, adata.var["feature_types"] == "Antibody Capture"]
+        modalities["protein"] = adata[:, adata.var["feature_types"] == "Antibody Capture"]

Review comment (Member):
Maybe more explicit?

except NameError:
pass

try:
# Add 'airr' modality if defined
if adata_vdj is not None:
modalities["airr"] = adata_vdj
except NameError:
pass

# Creates MuData object
mdata = MuData(modalities)

# --------------------------------------------------------------------------------------------------------------------
# SAVE OUTPUT FILE
# --------------------------------------------------------------------------------------------------------------------

print("\n===== SAVING OUTPUT FILE =====")

print(f"Saving MuData object to file {output}")
mdata.write(output)
print("Done!")

#####################################################################################################

if __name__ == '__main__':
main()
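
For reference, a toy construction mirroring what the script assembles (all names and data below are made up, not taken from the PR):

```python
import anndata as ad
import numpy as np
from mudata import MuData

# Toy modalities; in the pipeline these come from the GEX/CITE and V(D)J steps.
gex = ad.AnnData(X=np.random.rand(4, 6))
gex.var_names = [f"gene{i}" for i in range(6)]
protein = ad.AnnData(X=np.random.rand(4, 2))
protein.var_names = ["CD3", "CD19"]

mdata = MuData({"gex": gex, "pro": protein})
print(mdata)                       # MuData object with modalities 'gex' and 'pro'
mdata.write("toy.mudata.h5mu")     # same write call the script uses for its output
```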
Empty file modified: modules/local/templates/mtx_to_h5ad_simpleaf.py (file mode changed 100755 → 100644)
2 changes: 1 addition & 1 deletion nextflow.config
@@ -5,7 +5,7 @@
Default config options for all compute environments
----------------------------------------------------------------------------------------
*/

nextflow.enable.moduleBinaries = true
Review comment (Member):
I think this is not supposed to be used in nf-core pipelines, because it's not compatible with all platforms.
Therefore, either template or a global script in bin is still the way to go.

But I don't recall the details, maybe also ask again on slack if this is still a current requirement and what's the reasoning.

// Global default params, used in configs
params {
