Add splitting of peptides with python. by skrakau · Pull Request #29 · nf-core/epitopeprediction · GitHub

Add splitting of peptides with python. #29

Merged · 5 commits · Apr 14, 2020
29 changes: 29 additions & 0 deletions bin/split_peptides.py
@@ -0,0 +1,29 @@
#!/usr/bin/python

import math
import argparse


parser = argparse.ArgumentParser("Split peptides input file.")
parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file containing peptides.')
parser.add_argument('-o', '--output_base', type=str, help='Base filename for output files.')
parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Minimum number of peptides that should be written into one file.')
parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Maximum number of chunks that should be created.')
args = parser.parse_args()

with open(args.input, 'r') as infile:
    tot_size = sum([1 for _ in infile]) - 1

n = int(min(math.ceil(float(tot_size)/args.min_size), args.max_chunks))
h = int(max(args.min_size, math.ceil(float(tot_size)/n)))

with open(args.input, "r") as infile:
    header = next(infile)
    for chunk in range(n):
        with open(args.output_base+".chunk_"+str(chunk)+".tsv", "w") as outfile:
            outfile.write(header)
            for _ in range(h):
                try:
                    outfile.write(next(infile))
                except StopIteration:
                    break
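For clarity, the script sizes the chunks before writing anything: it creates n = min(ceil(total/min_size), max_chunks) output files holding up to h = max(min_size, ceil(total/n)) peptides each, where total is the number of data rows after the header. A minimal sketch of that arithmetic, using hypothetical input sizes (the 12,000-peptide figure is purely illustrative):

import math

def chunk_layout(tot_size, min_size, max_chunks):
    # Mirrors the n/h computation in split_peptides.py above.
    n = int(min(math.ceil(float(tot_size) / min_size), max_chunks))  # number of chunk files
    h = int(max(min_size, math.ceil(float(tot_size) / n)))           # peptides per chunk file
    return n, h

# With the pipeline defaults (min chunk size 5000, at most 100 chunks),
# 12,000 peptides are split into 3 files of up to 5000 peptides each:
# 5000 + 5000 + 2000.
print(chunk_layout(12000, 5000, 100))  # -> (3, 5000)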
57 changes: 29 additions & 28 deletions main.nf
@@ -23,38 +23,39 @@ def helpMessage() {
nextflow run nf-core/epitopeprediction -profile <docker/singularity/conda/institute> --input "*.vcf.gz"

Mandatory arguments:
--input [file] Path to input data (must be surrounded with quotes)
--alleles [file] Path to the file containing the MHC alleles
-profile [str] Configuration profile to use. Can use multiple (comma separated)
Available: conda, docker, singularity, test, awsbatch, <institute> and more

Alternative inputs:
--peptides [file] Path to TSV file containing peptide sequences (minimum required: id and sequence column)
--proteins [file] Path to FASTA file containing protein sequences

Pipeline options:
--filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references. Default: false
--wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well. Default: false
--mhc_class [1,2] Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1
--max_peptide_length [int] Specifies the maximum peptide length. Default: MHC class I: 11 aa, MHC class II: 16 aa
--min_peptide_length [int] Specifies the minimum peptide length. Default: MHC class I: 8 aa, MHC class II: 15 aa
--tools [str] Specifies a list of tool(s) to use. Available are: 'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'. Can be combined in a list separated by comma.
--peptides_split_maxchunks [int] Used in combination with '--peptides' or '--proteins': maximum number of peptide chunks that will be created for parallelization. Default: 100
--peptides_split_minchunksize [int] Used in combination with '--peptides' or '--proteins': minimum number of peptides that should be written into one chunk. Default: 5000

References If not specified in the configuration file or you wish to overwrite any of the references
--genome [str] Specifies the Ensembl reference genome version (GRCh37, GRCh38). Default: GRCh37
--proteome [path/file] Specifies the reference proteome files that are used for self-filtering. Should be either a folder of FASTA files or a single FASTA file containing the reference proteome(s).

Other options:
--outdir [path] The output directory where the results will be saved
--email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
-name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.
--max_multiqc_email_size Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)

AWSBatch options:
--awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch
--awsregion [str] The AWS Region for your AWS Batch job to run on
--awscli [str] Path to the AWS CLI tool
""".stripIndent()
}

@@ -153,6 +154,8 @@ summary['Self-Filter'] = params.filter_self
summary['Tools'] = params.tools
if ( params.input ) summary['Variants'] = params.input
summary['Wild-types'] = params.wild_type
if ( params.peptides || params.proteins ) summary['Max. number of chunks for parallelization'] = params.peptides_split_maxchunks
if ( params.peptides || params.proteins ) summary['Min. number of peptides in one chunk'] = params.peptides_split_minchunksize
//Standard Params for nf-core pipelines
summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -290,15 +293,13 @@ process splitPeptides {
file peptides from ch_split_peptides

output:
- file '*.tsv' into ch_splitted_peptides
+ file '*.chunk_*.tsv' into ch_splitted_peptides

when: !params.input

- // @TODO
- // splitting mechanism missing
script:
"""
- cat ${peptides} > "${peptides.fileName}.tsv"
+ split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size ${params.peptides_split_minchunksize} --max_chunks ${params.peptides_split_maxchunks}
"""
}
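The process now emits one file per chunk (matched by '*.chunk_*.tsv'), each repeating the header line of the original peptide table. As a rough illustration, a local sanity check of the split output might look like the sketch below; the 'peptides' base name is hypothetical, while the .chunk_N.tsv suffix follows the script's output_base naming:

import glob

# Hypothetical check: all chunk files together should contain every data row
# of the original input exactly once (each chunk repeats the header line).
original = "peptides.tsv"                       # assumed input file name
chunks = sorted(glob.glob("peptides.chunk_*.tsv"))

with open(original) as f:
    expected = sum(1 for _ in f) - 1            # data rows, excluding the header

total = 0
for path in chunks:
    with open(path) as f:
        total += sum(1 for _ in f) - 1          # subtract each repeated header

assert total == expected, f"{total} rows in chunks vs {expected} in {original}"
print(f"{len(chunks)} chunk file(s), {total} peptides in total")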

2 changes: 2 additions & 0 deletions nextflow.config
@@ -19,6 +19,8 @@ params {
genome = 'GRCh37'
input = false
wild_type = false
peptides_split_maxchunks = 100
peptides_split_minchunksize = 5000

// Additional annotation files
proteome = false