From b892321bb9f6eca2cf5a5a837f14ed081b212f32 Mon Sep 17 00:00:00 2001
From: Sabrina Krakau <sabrina.krakau.qbic@gmail.com>
Date: Mon, 9 Mar 2020 16:49:40 +0100
Subject: [PATCH 1/4] Add splitting of peptides with python.

---
 main.nf | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index d787621..d64acd1 100644
--- a/main.nf
+++ b/main.nf
@@ -259,11 +259,33 @@ process splitPeptides {
 
     when: !params.input
 
-    // @TODO
-    // splitting mechanism missing
     script:
     """
-    cat ${peptides} > "${peptides.fileName}.tsv"
+    #!/usr/bin/python
+
+    import math
+
+    with open("${peptides}", 'r') as infile:
+        tot_size = sum([1 for _ in infile])
+
+    # min. number of peptides in one chunk
+    min_size=5000
+    # max. number of files that should be created
+    max_chunks=100
+
+    n = int(min(math.ceil(float(tot_size)/min_size), max_chunks))
+    h = int(max(min_size, math.ceil(float(tot_size)/n)))
+
+    with open("${peptides}", "r") as infile:
+        header = next(infile)
+        for chunk in range(n):
+            with open("${peptides.baseName}"+".chunk_"+str(chunk)+".tsv", "w") as outfile:
+                outfile.write(header)
+                for _ in range(h):
+                    try:
+                        outfile.write(next(infile))
+                    except StopIteration:
+                        break	
     """
 }
 

From 22dcaa34badbe782731923df6d50e2b8921477eb Mon Sep 17 00:00:00 2001
From: Sabrina Krakau <sabrina.krakau.qbic@gmail.com>
Date: Mon, 16 Mar 2020 18:19:59 +0100
Subject: [PATCH 2/4] Moved python code from main.nf to bin/

---
 bin/split_peptides.py | 29 +++++++++++++++++++++++++++++
 main.nf               | 28 ++--------------------------
 2 files changed, 31 insertions(+), 26 deletions(-)
 create mode 100755 bin/split_peptides.py

diff --git a/bin/split_peptides.py b/bin/split_peptides.py
new file mode 100755
index 0000000..a39a0be
--- /dev/null
+++ b/bin/split_peptides.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+
+import math
+import argparse
+
+
+parser = argparse.ArgumentParser("Split peptides input file.")
+parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file contain peptides.')
+parser.add_argument('-o', '--output_base', type=str, help='Base filename for output files.')
+parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Min. number of peptides that should be into one file.')
+parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Max. number of chunks that should be created.')
+args = parser.parse_args()
+
+with open(args.input, 'r') as infile:
+    tot_size = sum([1 for _ in infile])
+
+n = int(min(math.ceil(float(tot_size)/args.min_size), args.max_chunks))
+h = int(max(args.min_size, math.ceil(float(tot_size)/n)))
+
+with open(args.input, "r") as infile:
+    header = next(infile)
+    for chunk in range(n):
+        with open(args.output_base+".chunk_"+str(chunk)+".tsv", "w") as outfile:
+            outfile.write(header)
+            for _ in range(h):
+                try:
+                    outfile.write(next(infile))
+                except StopIteration:
+                    break	
diff --git a/main.nf b/main.nf
index d64acd1..ac08885 100644
--- a/main.nf
+++ b/main.nf
@@ -255,37 +255,13 @@ process splitPeptides {
     file peptides from ch_split_peptides
 
     output:
-    file '*.tsv' into ch_splitted_peptides
+    file '*.chunk_*.tsv' into ch_splitted_peptides
 
     when: !params.input
 
     script:
     """
-    #!/usr/bin/python
-
-    import math
-
-    with open("${peptides}", 'r') as infile:
-        tot_size = sum([1 for _ in infile])
-
-    # min. number of peptides in one chunk
-    min_size=5000
-    # max. number of files that should be created
-    max_chunks=100
-
-    n = int(min(math.ceil(float(tot_size)/min_size), max_chunks))
-    h = int(max(min_size, math.ceil(float(tot_size)/n)))
-
-    with open("${peptides}", "r") as infile:
-        header = next(infile)
-        for chunk in range(n):
-            with open("${peptides.baseName}"+".chunk_"+str(chunk)+".tsv", "w") as outfile:
-                outfile.write(header)
-                for _ in range(h):
-                    try:
-                        outfile.write(next(infile))
-                    except StopIteration:
-                        break	
+    split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size 5000 --max_chunks 100
     """
 }
 

From 12f19c6ae095e003f14fdc3d9656f650cd42c681 Mon Sep 17 00:00:00 2001
From: Sabrina Krakau <sabrina.krakau.qbic@gmail.com>
Date: Tue, 24 Mar 2020 12:16:24 +0100
Subject: [PATCH 3/4] Fixed total peptide count.

---
 bin/split_peptides.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/split_peptides.py b/bin/split_peptides.py
index a39a0be..d4ae00c 100755
--- a/bin/split_peptides.py
+++ b/bin/split_peptides.py
@@ -12,7 +12,7 @@
 args = parser.parse_args()
 
 with open(args.input, 'r') as infile:
-    tot_size = sum([1 for _ in infile])
+    tot_size = sum([1 for _ in infile]) - 1
 
 n = int(min(math.ceil(float(tot_size)/args.min_size), args.max_chunks))
 h = int(max(args.min_size, math.ceil(float(tot_size)/n)))

From 234fa48776d8ed9b193584a14700024add890a79 Mon Sep 17 00:00:00 2001
From: Sabrina Krakau <sabrina.krakau.qbic@gmail.com>
Date: Tue, 24 Mar 2020 12:17:07 +0100
Subject: [PATCH 4/4] Added user parameters to control splitting of peptides
 for parallelization.

---
 bin/split_peptides.py |  8 +++----
 main.nf               | 50 +++++++++++++++++++++++--------------------
 nextflow.config       |  2 ++
 3 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/bin/split_peptides.py b/bin/split_peptides.py
index d4ae00c..1ef879a 100755
--- a/bin/split_peptides.py
+++ b/bin/split_peptides.py
@@ -5,10 +5,10 @@
 
 
 parser = argparse.ArgumentParser("Split peptides input file.")
-parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file contain peptides.')
+parser.add_argument('-i', '--input', metavar='FILE', type=str, help = 'Input file containing peptides.')
 parser.add_argument('-o', '--output_base', type=str, help='Base filename for output files.')
-parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Min. number of peptides that should be into one file.')
-parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Max. number of chunks that should be created.')
+parser.add_argument('-s', '--min_size', metavar='N', type=int, help = 'Minimum number of peptides that should be written into one file.')
+parser.add_argument('-c', '--max_chunks', metavar='N', type=int, help = 'Maximum number of chunks that should be created.')
 args = parser.parse_args()
 
 with open(args.input, 'r') as infile:
@@ -26,4 +26,4 @@
                 try:
                     outfile.write(next(infile))
                 except StopIteration:
-                    break	
+                    break
diff --git a/main.nf b/main.nf
index ac08885..ce44a4f 100644
--- a/main.nf
+++ b/main.nf
@@ -23,36 +23,38 @@ def helpMessage() {
     nextflow run nf-core/epitopeprediction -profile <docker/singularity/conda/institute> --input "*.vcf.gz"
 
     Mandatory arguments:
-      --input [file]                Path to input data (must be surrounded with quotes)
-      --alleles [file]              Path to the file containing the MHC alleles
-      -profile [str]                Configuration profile to use. Can use multiple (comma separated)
-                                    Available: conda, docker, singularity, test, awsbatch, <institute> and more
+      --input [file]                        Path to input data (must be surrounded with quotes)
+      --alleles [file]                      Path to the file containing the MHC alleles
+      -profile [str]                        Configuration profile to use. Can use multiple (comma separated)
+                                            Available: conda, docker, singularity, test, awsbatch, <institute> and more
 
     Alternative inputs:
-      --peptides [file]             Path to TSV file containing peptide sequences (minimum required: id and sequence column)
+      --peptides [file]                     Path to TSV file containing peptide sequences (minimum required: id and sequence column)
     
     Pipeline options:
-      --filter_self [bool]          Specifies that peptides should be filtered against the specified human proteome references Default: false
-      --wild_type  [bool]           Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false
-      --mhc_class [1,2]             Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1
-      --max_peptide_length [int]    Specifies the maximum peptide length Default: MHC class I: 11 aa, MHC class II: 16 aa 
-      --min_peptide_length [int]    Specifies the minimum peptide length Default: MCH class I: 8 aa, MHC class II: 15 aa
-      --tools [str]                 Specifies a list of tool(s) to use. Available are: 'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'. Can be combined in a list separated by comma.
-
-    References                      If not specified in the configuration file or you wish to overwrite any of the references
-      --genome [str]                Specifies the ensembl reference genome version (GRCh37, GRCh38) Default: GRCh37
-      --proteome [path/file]        Specifies the reference proteome files that are used for self-filtering. Should be either a folder of FASTA files or a single FASTA file containing the reference proteome(s).
+      --filter_self [bool]                  Specifies that peptides should be filtered against the specified human proteome references Default: false
+      --wild_type  [bool]                   Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false
+      --mhc_class [1,2]                     Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1
+      --max_peptide_length [int]            Specifies the maximum peptide length Default: MHC class I: 11 aa, MHC class II: 16 aa
+      --min_peptide_length [int]            Specifies the minimum peptide length Default: MHC class I: 8 aa, MHC class II: 15 aa
+      --tools [str]                         Specifies a list of tool(s) to use. Available are: 'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'. Can be combined in a list separated by comma.
+      --peptides_split_maxchunks [int]      Used in combination with '--peptides' or '--proteins': maximum number of peptide chunks that will be created for parallelization. Default: 100
+      --peptides_split_minchunksize [int]   Used in combination with '--peptides' or '--proteins': minimum number of peptides that should be written into one chunk. Default: 5000
+
+    References                              If not specified in the configuration file or you wish to overwrite any of the references
+      --genome [str]                        Specifies the ensembl reference genome version (GRCh37, GRCh38) Default: GRCh37
+      --proteome [path/file]                Specifies the reference proteome files that are used for self-filtering. Should be either a folder of FASTA files or a single FASTA file containing the reference proteome(s).
        
     Other options:
-      --outdir [path]               The output directory where the results will be saved
-      --email [email]               Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
-      -name [str]                   Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.
-      --max_multiqc_email_size      Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
+      --outdir [path]                       The output directory where the results will be saved
+      --email [email]                       Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
+      -name [str]                           Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.
+      --max_multiqc_email_size              Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
 
     AWSBatch options:
-      --awsqueue [str]                The AWSBatch JobQueue that needs to be set when running on AWSBatch
-      --awsregion [str]               The AWS Region for your AWS Batch job to run on
-      --awscli [str]                  Path to the AWS CLI tool
+      --awsqueue [str]                      The AWSBatch JobQueue that needs to be set when running on AWSBatch
+      --awsregion [str]                     The AWS Region for your AWS Batch job to run on
+      --awscli [str]                        Path to the AWS CLI tool
     """.stripIndent()
 }
 
@@ -140,6 +142,8 @@ summary['Self-Filter'] = params.filter_self
 summary['Tools'] = params.tools
 if ( params.input ) summary['Variants'] = params.input
 summary['Wild-types'] = params.wild_type
+if ( params.peptides || params.proteins ) summary['Max. number of chunks for parallelization'] = params.peptides_split_maxchunks
+if ( params.peptides || params.proteins ) summary['Min. number of peptides in one chunk'] = params.peptides_split_minchunksize
 //Standard Params for nf-core pipelines
 summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
 if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -261,7 +265,7 @@ process splitPeptides {
 
     script:
     """
-    split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size 5000 --max_chunks 100
+    split_peptides.py --input ${peptides} --output_base ${peptides.baseName} --min_size ${params.peptides_split_minchunksize} --max_chunks ${params.peptides_split_maxchunks}
     """
 }
 
diff --git a/nextflow.config b/nextflow.config
index a859d88..3d336a0 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -18,6 +18,8 @@ params {
   genome = 'GRCh37'
   input = false
   wild_type = false
+  peptides_split_maxchunks = 100
+  peptides_split_minchunksize = 5000
 
   // Additional annotation files
   proteome = false