-
Notifications
You must be signed in to change notification settings - Fork 191
Added VDJ concatenation modules and added MuData object #435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
5080be6
7852c46
4741d10
9020e9f
2a0331b
a19b49e
56cd8cc
7e2e38a
8e903af
2ceb3e9
654e65a
6a03158
755a69d
25519c4
71695ac
096e080
5c2ea4b
f53418d
2302b58
c783c0f
a303817
3b7d9a8
e311262
c0ad49c
1d773cc
3ccfae8
f334e76
5902d3d
af4576b
cb74297
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
process CONCATENATE_VDJ { | ||
tag "$meta.id" | ||
label 'process_single' | ||
|
||
container = 'quay.io/biocontainers/scirpy:0.20.1--pyhdfd78af_0' | ||
|
||
input: | ||
tuple val(meta), path(input_vdj, stageAs: '?/*') | ||
|
||
output: | ||
tuple val(meta), path("*.vdj.h5ad") , emit: h5ad, optional: true | ||
path "versions.yml", emit: versions | ||
|
||
when: | ||
task.ext.when == null || task.ext.when | ||
|
||
script: | ||
""" | ||
export NUMBA_CACHE_DIR=/tmp | ||
export MPLCONFIGDIR=/tmp | ||
export XDG_CONFIG_HOME=/tmp | ||
|
||
concatenate_vdj.py -ai ${input_vdj.join(' ')} -id ${meta.collect{ it.id }.join(' ')} | ||
|
||
cat <<-END_VERSIONS >> versions.yml | ||
"${task.process}": | ||
concatenate_vdj.py --version >> versions.yml | ||
END_VERSIONS | ||
|
||
""" | ||
|
||
stub: | ||
""" | ||
touch combined_vdj.h5ad | ||
|
||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
concatenate_vdj.py --version >> versions.yml | ||
END_VERSIONS | ||
""" | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,127 @@ | ||||||
#!/usr/bin/env python3 | ||||||
# ==================================================================================================================== | ||||||
# PRELIMINARIES | ||||||
# ==================================================================================================================== | ||||||
|
||||||
# MODULE IMPORT | ||||||
import warnings | ||||||
import argparse # command line arguments parser | ||||||
import pathlib # library for handle filesystem paths | ||||||
import glob | ||||||
import os | ||||||
import scanpy as sc # single-cell data processing | ||||||
import scirpy as ir # single-cell AIRR-data | ||||||
import anndata as ad # store annotated matrix as anndata object | ||||||
|
||||||
|
||||||
warnings.filterwarnings("ignore") | ||||||
|
||||||
# PARAMETERS | ||||||
# set script version number | ||||||
VERSION = "0.0.1" | ||||||
|
||||||
|
||||||
# ==================================================================================================================== | ||||||
# MAIN FUNCTION | ||||||
# ==================================================================================================================== | ||||||
|
||||||
def main(): | ||||||
""" | ||||||
This function concatenates csv files from vdj modality. | ||||||
""" | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# LIBRARY CONFIG | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3) | ||||||
sc.logging.print_header() | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# INPUT FROM COMMAND LINE | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
# Define command line arguments with argparse | ||||||
|
||||||
parser = argparse.ArgumentParser(prog='Concatenate_vdj', usage='%(prog)s [options]',description = "VDJ data concatenation", | ||||||
epilog = "This function concatenated vdj filtered contig annotation files into a single csv files.") | ||||||
parser.add_argument('-ai', '--input-vdj-dir', metavar='VDJ_INPUT_FILES',nargs='+',type=pathlib.Path, dest='input_vdj_files', | ||||||
help="paths of existing directory containing vdj matrix files in csv format (including file names)") | ||||||
parser.add_argument('-id', '--input-run-id', metavar='INPUT_RUN_ID', nargs='+', dest='input_run_id', | ||||||
help="names of the run-id corresponding to the input adata") | ||||||
parser.add_argument('-o', '--out', metavar='H5AD_OUTPUT_FILE', type=pathlib.Path, default="combined.vdj.h5ad", | ||||||
help="name of the h5ad object containing the concatenated vdj table") | ||||||
parser.add_argument('-v', '--version', action='version', version=VERSION) | ||||||
args = parser.parse_args() | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# DEFINE SAMPLES AND MTX PATHS | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
print("\n===== VDJ FILES =====") | ||||||
input_vdj_file = args.input_vdj_files | ||||||
input_run_id = args.input_run_id | ||||||
output = args.out | ||||||
|
||||||
# print info on the available matrices | ||||||
print("Reading vdj matrix from the following files:") | ||||||
for run, mtx in zip(input_run_id, input_vdj_file): | ||||||
print(f"Run: {run:15s} - File: {mtx}") | ||||||
|
||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# READ VDJ FILES | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
vdj_files = [] | ||||||
for folder in glob.glob("*/filtered_contig_annotations.csv"): | ||||||
vdj_files.append(folder) | ||||||
|
||||||
adata_vdj_list = [] | ||||||
if vdj_files: | ||||||
|
||||||
for run, vdj in zip(input_run_id,vdj_files): | ||||||
# Read folders with the filtered contigue annotation and store datasets in a dictionary | ||||||
print("\n===== READING CONTIGUE ANNOTATION MATRIX =====") | ||||||
print("\nProcessing filtered contigue table in folder ... ", end ='') | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
if os.path.getsize(vdj) == 0: | ||||||
print(f"Warning: {vdj} is empty and will be skipped.") | ||||||
else: | ||||||
adata_vdj= ir.io.read_10x_vdj(vdj) | ||||||
print("Done!") | ||||||
adata_vdj_list.append(adata_vdj) | ||||||
else: | ||||||
print("No valid input file provided. Skipping reading of the vdj annotation.") | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# VDJ TABLE CONCATENATION | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
print("\n===== CONCATENATING VDJ TABLES =====") | ||||||
|
||||||
if len(adata_vdj_list) == 0: | ||||||
print("No valid files were found. Nothing to save.") | ||||||
else: | ||||||
if len(adata_vdj_list) == 1: | ||||||
adata_vdj_concatenated = adata_vdj_list[0] | ||||||
print("Only one non-empty file found. Saving the file as is without concatenation.") | ||||||
else: | ||||||
Comment on lines
+103
to
+106
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it necessary to special-case this? i.e. wouldn't ad.concat just work fine with a single file? |
||||||
adata_vdj_concatenated = ad.concat(adata_vdj_list, join= "outer", merge ="same", label="sample", | ||||||
keys= input_run_id, index_unique="_") | ||||||
|
||||||
print(f"Concatenated vdj table for {len(input_run_id)} batched has {adata_vdj_concatenated.shape[0]} cells") | ||||||
print("Done!") | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# SAVE OUTPUT FILE | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
print("\n===== SAVING OUTPUT FILE =====") | ||||||
|
||||||
print(f"Saving vdj table data in {output}") | ||||||
adata_vdj_concatenated.write(output) | ||||||
print("Done!") | ||||||
|
||||||
|
||||||
##################################################################################################### | ||||||
|
||||||
|
||||||
if __name__ == '__main__': | ||||||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
process CONVERT_MUDATA { | ||
tag "$meta.id" | ||
label 'process_single' | ||
|
||
container = 'quay.io/biocontainers/scirpy:0.20.1--pyhdfd78af_0' | ||
|
||
input: | ||
tuple val(meta), path(input_h5ad) | ||
tuple val(meta), path (input_vdj) | ||
|
||
output: | ||
tuple val(meta), path("*.mudata.h5mu") , emit: h5mu | ||
path "versions.yml", emit: versions | ||
|
||
when: | ||
task.ext.when == null || task.ext.when | ||
|
||
script: | ||
def ai = input_vdj ? "-ai $input_vdj" : '' | ||
|
||
""" | ||
export NUMBA_CACHE_DIR=/tmp | ||
export MPLCONFIGDIR=/tmp | ||
export XDG_CONFIG_HOME=/tmp | ||
|
||
convert_mudata.py -ad $input_h5ad $ai | ||
|
||
cat <<-END_VERSIONS >> versions.yml | ||
"${task.process}": | ||
convert_mudata.py --version >> versions.yml | ||
END_VERSIONS | ||
|
||
""" | ||
|
||
stub: | ||
""" | ||
touch matrix.mudata.h5mu | ||
|
||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
convert_mudata.py --version >> versions.yml | ||
END_VERSIONS | ||
""" | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,134 @@ | ||||||
#!/usr/bin/env python3 | ||||||
# ==================================================================================================================== | ||||||
# PRELIMINARIES | ||||||
# ==================================================================================================================== | ||||||
|
||||||
# MODULE IMPORT | ||||||
import warnings | ||||||
import argparse # command line arguments parser | ||||||
import pathlib # library for handle filesystem paths | ||||||
import scanpy as sc # single-cell data processing | ||||||
from mudata import MuData | ||||||
|
||||||
|
||||||
warnings.filterwarnings("ignore") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be better to filter only specific (expected) warnings, e.g. by category or message. |
||||||
|
||||||
# PARAMETERS | ||||||
# set script version number | ||||||
10000 VERSION = "0.0.1" | ||||||
|
||||||
|
||||||
# ==================================================================================================================== | ||||||
# MAIN FUNCTION | ||||||
# ==================================================================================================================== | ||||||
|
||||||
def main(): | ||||||
""" | ||||||
This function creates a MuData object. | ||||||
""" | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# LIBRARY CONFIG | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3) | ||||||
sc.logging.print_header() | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# INPUT FROM COMMAND LINE | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
# Define command line arguments with argparse | ||||||
|
||||||
parser = argparse.ArgumentParser(prog='Create MuData object', usage='%(prog)s [options]',description = "MuData object convertion", | ||||||
epilog = "This function creates a MuData object for storing GEX,VDJ and CITE-seq data.") | ||||||
parser.add_argument('-ad','--input-gex-file',metavar= 'GEX_INPUT_FILES', type=pathlib.Path, dest='input_gex_files', | ||||||
help="paths of existing count matrix files in h5ad format (including file names)") | ||||||
parser.add_argument('-ai', '--input-vdj-file', metavar='VDJ_INPUT_FILES',type=pathlib.Path, dest='input_vdj_files', | ||||||
default=pathlib.Path(''),help="paths of existing vdj matrix files in h5ad format (including file names)") | ||||||
parser.add_argument('-o', '--out', metavar='MUDATA_OUTPUT_FILE', type=pathlib.Path, default="matrix.mudata.h5mu", | ||||||
help="name of the muData object") | ||||||
parser.add_argument('-v', '--version', action='version', version=VERSION) | ||||||
args = parser.parse_args() | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# DEFINE SAMPLES AND MTX PATHS | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
print("\n===== INPUT GEX and VDJ FILES =====") | ||||||
input_gex_file = args.input_gex_files | ||||||
input_vdj_file = args.input_vdj_files | ||||||
output = args.out | ||||||
|
||||||
# print info on the available matrices | ||||||
print("Reading combined gex count matrix from the following file:") | ||||||
print(f"-File {input_gex_file}") | ||||||
|
||||||
print("Reading filtered annotation table from the following file:") | ||||||
print(f"-File {input_vdj_file}") | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# READ GEX AND AB FILES | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
if input_gex_file: | ||||||
# Read folders with the MTX combined count matrice and store datasets in a dictionary | ||||||
print("\n===== READING COMBINED MATRIX =====") | ||||||
# read the gex count matrix for the combined samples and print some initial info | ||||||
print("\nProcessing count matrix in folder ... ", end ='') | ||||||
adata= sc.read_h5ad(input_gex_file) | ||||||
print("Done!") | ||||||
print(f"Gex count matrix for combined samples has {adata.shape[0]} cells and {adata.shape[1]} genes") | ||||||
else: | ||||||
print("No valid input file provided. Skipping reading of the count matrix.") | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# READ VDJ FILES | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
if input_vdj_file and input_vdj_file != pathlib.Path(''): | ||||||
# Read folders with the filtered contigue annotation and store datasets in a dictionary | ||||||
print("\n===== READING CONTIGUE ANNOTATION MATRIX =====") | ||||||
# read the filtered contigue annotation file for the combined samples and print some initial info | ||||||
print("\nProcessing filtered contigue table in folder ... ", end ='') | ||||||
adata_vdj= sc.read_h5ad(input_vdj_file) | ||||||
print("Done!") | ||||||
else: | ||||||
print("No valid input file provided. Skipping reading of the vdj annotation.") | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# CREATE MUDATA OBJECT | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
#Creates dictionary to store all modalities | ||||||
modalities = {} | ||||||
try: | ||||||
# Add 'gex' modality if defined | ||||||
if adata[:, adata.var["feature_types"] == "Gene Expression"].shape[1] > 0: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
modalities["gex"] = adata[:, adata.var["feature_types"] == "Gene Expression"] | ||||||
# Add 'pro' modality if defined | ||||||
if adata[:, adata.var["feature_types"] == "Antibody Capture"].shape[1] > 0: | ||||||
modalities["pro"] = adata[:, adata.var["feature_types"] == "Antibody Capture"] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe more explicit? |
||||||
except NameError: | ||||||
pass | ||||||
|
||||||
try: | ||||||
# Add 'airr' modality if defined | ||||||
if adata_vdj is not None: | ||||||
modalities["airr"] = adata_vdj | ||||||
except NameError: | ||||||
pass | ||||||
|
||||||
# Creates MuData object | ||||||
mdata = MuData(modalities) | ||||||
|
||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
# SAVE OUTPUT FILE | ||||||
# -------------------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
print("\n===== SAVING OUTPUT FILE =====") | ||||||
|
||||||
print(f"Saving MuData object to file {output}") | ||||||
mdata.write(output) | ||||||
print("Done!") | ||||||
|
||||||
##################################################################################################### | ||||||
|
||||||
if __name__ == '__main__': | ||||||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
Default config options for all compute environments | ||
---------------------------------------------------------------------------------------- | ||
*/ | ||
|
||
nextflow.enable.moduleBinaries = true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is not supposed to be used in nf-core pipelines, because it's not compatible with all platforms. But I don't recall the details, maybe also ask again on slack if this is still a current requirement and what's the reasoning. |
||
// Global default params, used in configs | ||
params { | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be better to filter only specific (expected) warnings, e.g. by category or message.