From b892321bb9f6eca2cf5a5a837f14ed081b212f32 Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Mon, 9 Mar 2020 16:49:40 +0100 Subject: [PATCH 1/4] Add spliting of peptides with python. --- main.nf | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index d787621..d64acd1 100644 --- a/main.nf +++ b/main.nf @@ -259,11 +259,33 @@ process splitPeptides { when: !params.input - // @TODO - // splitting mechanism missing script: """ - cat ${peptides} > "${peptides.fileName}.tsv" + #!/usr/bin/python + + import math + + with open("${peptides}", 'r') as infile: + tot_size = sum([1 for _ in infile]) + + # min. number of peptides in one chunk + min_size=5000 + # max. number of files that should be created + max_chunks=100 + + n = int(min(math.ceil(float(tot_size)/min_size), max_chunks)) + h = int(max(min_size, math.ceil(float(tot_size)/n))) + + with open("${peptides}", "r") as infile: + header = next(infile) + for chunk in range(n): + with open("${peptides.baseName}"+".chunk_"+str(chunk)+".tsv", "w") as outfile: + outfile.write(header) + for _ in range(h): + try: + outfile.write(next(infile)) + except StopIteration: + break """ } From 22dcaa34badbe782731923df6d50e2b8921477eb Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Mon, 16 Mar 2020 18:19:59 +0100 Subject: [PATCH 2/4] Moved python code from main.nf to bin/ --- bin/split_peptides.py | 29 +++++++++++++++++++++++++++++ main.nf | 28 ++-------------------------- 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100755 bin/split_peptides.py diff --git a/bin/split_peptides.py b/bin/split_peptides.py new file mode 100755 index 0000000..a39a0be --- /dev/null +++ b/bin/split_peptides.py @@ -0,0 +1,29 @@ +#!/usr/bin/python + +import math +import argparse + + +parser = argparse.ArgumentParser("Split peptides input file.") +parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file contain peptides.') +parser.add_argument('-o', '--output_base', type=str, help='Base filename for output files.') +parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Min. number of peptides that should be into one file.') +parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Max. number of chunks that should be created.') +args = parser.parse_args() + +with open(args.input, 'r') as infile: + tot_size = sum([1 for _ in infile]) + +n = int(min(math.ceil(float(tot_size)/args.min_size), args.max_chunks)) +h = int(max(args.min_size, math.ceil(float(tot_size)/n))) + +with open(args.input, "r") as infile: + header = next(infile) + for chunk in range(n): + with open(args.output_base+".chunk_"+str(chunk)+".tsv", "w") as outfile: + outfile.write(header) + for _ in range(h): + try: + outfile.write(next(infile)) + except StopIteration: + break diff --git a/main.nf b/main.nf index d64acd1..ac08885 100644 --- a/main.nf +++ b/main.nf @@ -255,37 +255,13 @@ process splitPeptides { file peptides from ch_split_peptides output: - file '*.tsv' into ch_splitted_peptides + file '*.chunk_*.tsv' into ch_splitted_peptides when: !params.input script: """ - #!/usr/bin/python - - import math - - with open("${peptides}", 'r') as infile: - tot_size = sum([1 for _ in infile]) - - # min. number of peptides in one chunk - min_size=5000 - # max. number of files that should be created - max_chunks=100 - - n = int(min(math.ceil(float(tot_size)/min_size), max_chunks)) - h = int(max(min_size, math.ceil(float(tot_size)/n))) - - with open("${peptides}", "r") as infile: - header = next(infile) - for chunk in range(n): - with open("${peptides.baseName}"+".chunk_"+str(chunk)+".tsv", "w") as outfile: - outfile.write(header) - for _ in range(h): - try: - outfile.write(next(infile)) - except StopIteration: - break + split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size 5000 --max_chunks 100 """ } From 12f19c6ae095e003f14fdc3d9656f650cd42c681 Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Tue, 24 Mar 2020 12:16:24 +0100 Subject: [PATCH 3/4] Fixed total peptide count. --- bin/split_peptides.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/split_peptides.py b/bin/split_peptides.py index a39a0be..d4ae00c 100755 --- a/bin/split_peptides.py +++ b/bin/split_peptides.py @@ -12,7 +12,7 @@ args = parser.parse_args() with open(args.input, 'r') as infile: - tot_size = sum([1 for _ in infile]) + tot_size = sum([1 for _ in infile]) - 1 n = int(min(math.ceil(float(tot_size)/args.min_size), args.max_chunks)) h = int(max(args.min_size, math.ceil(float(tot_size)/n))) From 234fa48776d8ed9b193584a14700024add890a79 Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Tue, 24 Mar 2020 12:17:07 +0100 Subject: [PATCH 4/4] Added user parameters to control splitting of peptides for parallelization. --- bin/split_peptides.py | 8 +++---- main.nf | 50 +++++++++++++++++++++++-------------------- nextflow.config | 2 ++ 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/bin/split_peptides.py b/bin/split_peptides.py index d4ae00c..1ef879a 100755 --- a/bin/split_peptides.py +++ b/bin/split_peptides.py @@ -5,10 +5,10 @@ parser = argparse.ArgumentParser("Split peptides input file.") -parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file contain peptides.') +parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file containing peptides.') parser.add_argument('-o', '--output_base', type=str, help='Base filename for output files.') -parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Min. number of peptides that should be into one file.') -parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Max. number of chunks that should be created.') +parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Minimum number of peptides that should be written into one file.') +parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Maximum number of chunks that should be created.') args = parser.parse_args() with open(args.input, 'r') as infile: @@ -26,4 +26,4 @@ try: outfile.write(next(infile)) except StopIteration: - break + break diff --git a/main.nf b/main.nf index ac08885..ce44a4f 100644 --- a/main.nf +++ b/main.nf @@ -23,36 +23,38 @@ def helpMessage() { nextflow run nf-core/epitopeprediction -profile --input "*.vcf.gz" Mandatory arguments: - --input [file] Path to input data (must be surrounded with quotes) - --alleles [file] Path to the file containing the MHC alleles - -profile [str] Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, test, awsbatch, and more + --input [file] Path to input data (must be surrounded with quotes) + --alleles [file] Path to the file containing the MHC alleles + -profile [str] Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, test, awsbatch, and more Alternative inputs: - --peptides [file] Path to TSV file containing peptide sequences (minimum required: id and sequence column) + --peptides [file] Path to TSV file containing peptide sequences (minimum required: id and sequence column) Pipeline options: - --filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references Default: false - --wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false - --mhc_class [1,2] Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1 - --max_peptide_length [int] Specifies the maximum peptide length Default: MHC class I: 11 aa, MHC class II: 16 aa - --min_peptide_length [int] Specifies the minimum peptide length Default: MCH class I: 8 aa, MHC class II: 15 aa - --tools [str] Specifies a list of tool(s) to use. Available are: 'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'. Can be combined in a list separated by comma. - - References If not specified in the configuration file or you wish to overwrite any of the references - --genome [str] Specifies the ensembl reference genome version (GRCh37, GRCh38) Default: GRCh37 - --proteome [path/file] Specifies the reference proteome files that are used for self-filtering. Should be either a folder of FASTA files or a single FASTA file containing the reference proteome(s). + --filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references Default: false + --wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false + --mhc_class [1,2] Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1 + --max_peptide_length [int] Specifies the maximum peptide length Default: MHC class I: 11 aa, MHC class II: 16 aa + --min_peptide_length [int] Specifies the minimum peptide length Default: MCH class I: 8 aa, MHC class II: 15 aa + --tools [str] Specifies a list of tool(s) to use. Available are: 'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'. Can be combined in a list separated by comma. + --peptides_split_maxchunks [int] Used in combination with '--peptides' or '--proteins': maximum number of peptide chunks that will be created for parallelization. Default: 100 + --peptides_split_minchunksize [int] Used in combination with '--peptides' or '--proteins': minimum number of peptides that should be written into one chunk. Default: 5000 + + References If not specified in the configuration file or you wish to overwrite any of the references + --genome [str] Specifies the ensembl reference genome version (GRCh37, GRCh38) Default: GRCh37 + --proteome [path/file] Specifies the reference proteome files that are used for self-filtering. Should be either a folder of FASTA files or a single FASTA file containing the reference proteome(s). Other options: - --outdir [path] The output directory where the results will be saved - --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - --max_multiqc_email_size Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + --outdir [path] The output directory where the results will be saved + --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + --max_multiqc_email_size Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool + --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion [str] The AWS Region for your AWS Batch job to run on + --awscli [str] Path to the AWS CLI tool """.stripIndent() } @@ -140,6 +142,8 @@ summary['Self-Filter'] = params.filter_self summary['Tools'] = params.tools if ( params.input ) summary['Variants'] = params.input summary['Wild-types'] = params.wild_type +if ( params.peptides || params.proteins ) summary['Max. number of chunks for parallelization'] = params.peptides_split_maxchunks +if ( params.peptides || params.proteins ) summary['Min. number of peptides in one chunk'] = params.peptides_split_minchunksize //Standard Params for nf-core pipelines summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" @@ -261,7 +265,7 @@ process splitPeptides { script: """ - split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size 5000 --max_chunks 100 + split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size ${params.peptides_split_minchunksize} --max_chunks ${params.peptides_split_maxchunks} """ } diff --git a/nextflow.config b/nextflow.config index a859d88..3d336a0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,8 @@ params { genome = 'GRCh37' input = false wild_type = false + peptides_split_maxchunks = 100 + peptides_split_minchunksize = 5000 // Additional annotation files proteome = false