Add option to use FASTA files with protein sequences using Fred2 by skrakau · Pull Request #42 · nf-core/epitopeprediction

Add option to use FASTA files with protein sequences using Fred2 #42

Merged · 1 commit · Apr 14, 2020
2 changes: 2 additions & 0 deletions bin/epaa.py
@@ -330,6 +330,8 @@ def read_peptide_input(filename):

    '''expected columns (min required): id sequence'''
    with open(filename, 'r') as peptide_input:
        # enable listing of protein names for each peptide
        csv.field_size_limit(600000)
        reader = csv.DictReader(peptide_input, delimiter='\t')
        for row in reader:
            pep = Peptide(row['sequence'])
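A note on the added lines: a peptide generated from whole protein sets can map to many proteins, so the tab-separated protein_ids column may exceed CPython's default CSV field limit of 131072 characters, which is why csv.field_size_limit(600000) is raised here. A minimal, self-contained Python 3 sketch of the effect; the synthetic row below is illustrative only, not pipeline data:

import csv
import io

# Build one peptide row whose protein_ids column is far longer than the
# default CSV field limit (131072 characters in CPython); without raising
# the limit, DictReader raises "_csv.Error: field larger than field limit".
protein_ids = ",".join("PROT{}".format(i) for i in range(20000))
tsv = "id\tsequence\tprotein_ids\n" + "0\tSYFPEITHI\t" + protein_ids + "\n"

csv.field_size_limit(600000)  # same value as used in epaa.py
reader = csv.DictReader(io.StringIO(tsv), delimiter='\t')
for row in reader:
    print(len(row['protein_ids']))  # parses fine with the raised limit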
53 changes: 53 additions & 0 deletions bin/gen_peptides.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python

import sys
import argparse
import pandas as pd

from Bio.SeqIO.FastaIO import SimpleFastaParser
from Fred2.Core import Allele, Peptide, Protein, generate_peptides_from_proteins


parser = argparse.ArgumentParser("Generating peptides from protein sequences.")
parser.add_argument('-i', '--input', metavar='FILE', type=argparse.FileType('r'), help = 'FASTA filename containing proteins.')
parser.add_argument('-o', '--output', metavar='FILE', type=argparse.FileType('w'), help='Output file containing peptides.')
parser.add_argument('-min', '--min_length', metavar='N', type=int, help='Minimal length of peptides that will be generated.')
parser.add_argument('-max', '--max_length', metavar='N', type=int, help='Maximum length of peptides that will be generated.')
args = parser.parse_args()



def read_protein_fasta(file):
    # split at first whitespace and use short ID

    collect = set()
    # iterate over all FASTA entries:
    for _id, seq in SimpleFastaParser(file):
        # generate element:
        _id = _id.split(" ")[0]

        try:
            collect.add(Protein(seq.strip().upper(), transcript_id=_id))
        except TypeError:
            collect.add(Protein(seq.strip().upper()))
    return list(collect)


proteins = read_protein_fasta(args.input)

c = 0
for k in range(args.min_length, args.max_length+1):
    peptides = generate_peptides_from_proteins(proteins, k)
    # get proteins and corresponding counts
    pd_peptides = pd.DataFrame(
        [ (str(pep), ','.join([ prot.transcript_id.split(' ')[0] for prot in pep.get_all_proteins() ]), ','.join([ str(len(pep.proteinPos[prot.transcript_id])) for prot in pep.get_all_proteins() ])) for pep in peptides ],
        columns = ['sequence', 'protein_ids', 'counts']
    )
    # assign id
    pd_peptides = pd_peptides.assign(id=[str(c+id) for id in pd_peptides.index])
    c += len(pd_peptides['sequence'])

    if k == args.min_length:
        pd_peptides[['sequence','id','protein_ids','counts']].to_csv(args.output, sep='\t', index=False)
    else:
        pd_peptides[['sequence','id','protein_ids','counts']].to_csv(args.output, sep='\t', index=False, mode='a', header=False)
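For readers without a Fred2 installation, the following Python 3 sketch mimics what gen_peptides.py does: enumerate every k-mer of the requested lengths from each protein and record, per peptide, the source protein IDs and per-protein occurrence counts, mirroring the sequence/id/protein_ids/counts columns written above. The toy proteins and helper names are illustrative assumptions, not part of the pipeline.

from collections import defaultdict

# toy protein records; in the pipeline these come from the input FASTA
proteins = {"protA": "MKTAYIAKQR", "protB": "TAYIAKQRQI"}

def kmer_table(prots, min_len, max_len):
    # peptide sequence -> {protein_id: number of occurrences in that protein}
    hits = defaultdict(lambda: defaultdict(int))
    for k in range(min_len, max_len + 1):
        for pid, seq in prots.items():
            for i in range(len(seq) - k + 1):
                hits[seq[i:i + k]][pid] += 1
    return hits

# columns mirror peptides.tsv: sequence, id, protein_ids, counts
print("sequence\tid\tprotein_ids\tcounts")
for idx, (pep, per_prot) in enumerate(kmer_table(proteins, 8, 9).items()):
    print("\t".join([pep, str(idx),
                     ",".join(per_prot),
                     ",".join(str(c) for c in per_prot.values())]))

The actual script additionally relies on Fred2's Protein objects and peptide position maps (pep.proteinPos) to derive the counts column, as visible in the diff above.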
18 changes: 18 additions & 0 deletions conf/test_proteins.config
@@ -0,0 +1,18 @@
/*
 * -------------------------------------------------
 *  Nextflow config file for running tests
 * -------------------------------------------------
 * Defines bundled input files and everything required
 * to run a fast and simple test. Use as follows:
 *   nextflow run nf-core/epitopeprediction -profile test_proteins
 */

params {
  max_cpus = 2
  max_memory = 6.GB
  max_time = 4.h

  // Input data
  proteins = 'https://github.com/nf-core/test-datasets/raw/epitopeprediction/testdata/proteins/proteins.fasta'
  alleles = 'https://github.com/nf-core/test-datasets/raw/epitopeprediction/testdata/alleles/alleles.txt'
}
51 changes: 43 additions & 8 deletions main.nf
@@ -30,7 +30,9 @@ def helpMessage() {

    Alternative inputs:
      --peptides [file]                 Path to TSV file containing peptide sequences (minimum required: id and sequence column)
      --proteins [file]                 Path to FASTA file containing protein sequences

    Pipeline options:
      --filter_self [bool]              Specifies that peptides should be filtered against the specified human proteome references Default: false
      --wild_type [bool]                Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false
@@ -62,10 +64,11 @@ if (params.help) {
    exit 0
}

//Generate empty channels for peptides and variants
ch_split_peptides = Channel.empty()
ch_split_variants = Channel.empty()

//Generate empty channels for peptides, proteins and variants
ch_peptides = Channel.empty()
ch_proteins = Channel.empty()
ch_split_variants = Channel.empty()


if ( params.peptides ) {
@@ -75,7 +78,16 @@ if ( params.peptides ) {
    Channel
        .fromPath(params.peptides)
        .ifEmpty { exit 1, "Peptide input not found: ${params.peptides}" }
        .set { ch_split_peptides }
        .set { ch_peptides }
}
else if ( params.proteins ) {
    if ( params.wild_type ) {
        exit 1, "Protein input not compatible with wild-type sequence generation."
    }
    Channel
        .fromPath(params.proteins)
        .ifEmpty { exit 1, "Protein input not found: ${params.proteins}" }
        .set { ch_proteins }
}
else if (params.input) {
    Channel
@@ -84,7 +96,7 @@ else if (params.input) {
        .set { ch_split_variants }
}
else {
    exit 1, "Please specify a file that contains annotated variants OR a file that contains peptide sequences."
    exit 1, "Please specify a file that contains annotated variants, protein sequences OR peptide sequences."
}

if ( !params.alleles ) {
@@ -134,6 +146,7 @@ if ( params.alleles ) summary['Alleles'] = params.alleles
summary['Max. Peptide Length'] = params.max_peptide_length
summary['MHC Class'] = params.mhc_class
if ( params.peptides ) summary['Peptides'] = params.peptides
if ( params.proteins ) summary['Proteins'] = params.proteins
if ( !params.peptides && !params.proteins ) summary['Reference Genome'] = params.genome
if ( params.proteome ) summary['Reference proteome'] = params.proteome
summary['Self-Filter'] = params.filter_self
@@ -247,8 +260,30 @@ process splitVariants {
}
}


/*
 * STEP 0b - Process FASTA file and generate peptides
 */
if (params.proteins) {
    process genPeptides {
        input:
        file proteins from ch_proteins

        output:
        file 'peptides.tsv' into ch_split_peptides

        when: !params.peptides

        script:
        """
        gen_peptides.py --input ${proteins} --output 'peptides.tsv' --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length}
        """
    }
} else {
    ch_peptides.set{ch_split_peptides}
}
/*
 * STEP 1b - Split peptide data
 * STEP 1b- Split peptide data
 */
process splitPeptides {
    input:
@@ -282,7 +317,7 @@ process peptidePrediction {
file "*.json" into ch_json_reports

script:
def input_type = params.peptides ? "--peptides ${inputs}" : "--somatic_mutations ${inputs}"
def input_type = params.peptides ? "--peptides ${inputs}" : params.proteins ? "--peptides ${inputs}" : "--somatic_mutations ${inputs}"
def ref_prot = params.proteome ? "--proteome ${params.proteome}" : ""
def wt = params.wild_type ? "--wild_type" : ""
"""
1 change: 1 addition & 0 deletions nextflow.config
@@ -12,6 +12,7 @@ params {
  filter_self = false
  mhc_class = 1
  peptides = false
  proteins = false
  max_peptide_length = (mhc_class == 1) ? 11 : 16
  min_peptide_length = (mhc_class == 1) ? 8 : 15
  tools = 'syfpeithi'