Add functionality to write fasta with protein sequences by christopher-mohr · Pull Request #57 · nf-core/epitopeprediction · GitHub
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#57](https://github.com/nf-core/epitopeprediction/pull/57) - Add option (`--fasta_output`) to write out FASTA file with protein sequences
- [#45](https://github.com/nf-core/epitopeprediction/pull/45) - Add test for FASTA input
- [#44](https://github.com/nf-core/epitopeprediction/pull/44) - Add parameter (`--show_supported_models`) to write out supported models in files
- [#44](https://github.com/nf-core/epitopeprediction/pull/44) - Add check if requested models for specified tools are supported by `FRED2`
@@ -25,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Fixed`

- [#56](https://github.com/nf-core/epitopeprediction/pull/56) - Fix result output for more than one prediction method [#55](https://github.com/nf-core/epitopeprediction/issues/55)
- [#53](https://github.com/nf-core/epitopeprediction/pull/53) - Fix score and affinity output of MHCnuggets and MHCflurry [#32](https://github.com/nf-core/epitopeprediction/issues/32)
- [#52](https://github.com/nf-core/epitopeprediction/pull/52) - Fix MHCflurry permission problem when run with docker profile [#51](https://github.com/nf-core/epitopeprediction/issues/51)
- [#39](https://github.com/nf-core/epitopeprediction/pull/39) - Fix display of prediction tool version [#36](https://github.com/nf-core/epitopeprediction/issues/36)
31 changes: 23 additions & 8 deletions bin/epaa.py
@@ -224,6 +224,7 @@ def read_vcf(filename, pass_only=True):
r = str(record.REF)
v_list = record.ALT
f = record.FILTER

if pass_only and f:
continue

@@ -587,6 +588,7 @@ def create_ligandomics_column_value_for_result(row, lig_id, val, wild_type):
else:
return ''


def get_protein_ids_for_transcripts(idtype, transcripts, ensembl_url, reference):
result = {}
result_swissProt = {}
@@ -836,9 +838,9 @@ def make_predictions_from_variants(variants_all, methods, alleles, minlength, ma
df = df.rename(columns={'Method': 'method'})
pred_dataframes.append(df)

statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ] ,'number_of_variants': len(variants_all), 'number_of_peptides': len(all_peptides), 'number_of_peptides_after_filtering': len(all_peptides_filtered)}
statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ] ,'number_of_variants': len(variants_all), 'number_of_unique_peptides': [str(p) for p in all_peptides], 'number_of_unique_peptides_after_filtering': [str(p) for p in all_peptides_filtered]}

return pred_dataframes, statistics, all_peptides_filtered
return pred_dataframes, statistics, all_peptides_filtered, prots


def make_predictions_from_peptides(peptides, methods, alleles, protein_db, identifier, metadata):
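The statistics dicts now store the peptide sequences themselves rather than plain counts, so that `merge_jsons.py` can deduplicate peptides across split inputs before counting; see the sketch after the `bin/merge_jsons.py` diff below.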
@@ -913,11 +915,9 @@ def make_predictions_from_peptides(peptides, methods, alleles, protein_db, ident
pred_dataframes.append(df)

# write prediction statistics
statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ],'number_of_variants': '-', 'number_of_peptides': len(peptides), 'number_of_peptides_after_filtering': len(peptides_filtered)}

statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ],'number_of_variants': '-', 'number_of_unique_peptides': [str(p) for p in peptides], 'number_of_unique_peptides_after_filtering': [str(p) for p in peptides_filtered]}
return pred_dataframes, statistics


def __main__():
parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles.
Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
@@ -934,6 +934,7 @@ def __main__():
parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
@@ -1001,12 +1002,12 @@ def __main__():
if args.peptides:
pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
else:
pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
else:
if args.peptides:
pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
else:
pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

# concat dataframes for all peptide lengths
try:
@@ -1098,11 +1099,25 @@ def __main__():
complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

# write mutated protein sequences to fasta file
if args.fasta_output:
with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
for p in proteins:
variants = []
for v in p.vars:
variants = variants + p.vars[v]
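# gather the coding annotations of every variant attached to this protein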
c = [x.coding.values() for x in variants]
cf = list(itertools.chain.from_iterable(c))
cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
aas = ','.join([y.aaMutationSyntax for y in set(cf)])
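# FASTA header: transcript ID plus the amino-acid and CDS mutation syntax of all contained variants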
protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
protein_outfile.write('{}\n'.format(str(p)))

# write dataframe to tsv
complete_df = complete_df.fillna('')
complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False)

statistics['number_of_predictions'] = complete_df.shape[0]
statistics['number_of_predictions'] = len(complete_df)
statistics['number_of_binders'] = len(pos_predictions)
statistics['number_of_nonbinders'] = len(neg_predictions)
statistics['number_of_unique_binders'] = list(set(binders))
18 changes: 13 additions & 5 deletions bin/merge_jsons.py
@@ -9,7 +9,9 @@

def __main__():
parser = argparse.ArgumentParser(description="")
parser.add_argument('-s', "--single_input", help='Single input JSON report')
parser.add_argument('-i', "--input", help='Input directory with JSON reports')
parser.add_argument('-p', "--prefix", help="Prefix for output")

args = parser.parse_args()

@@ -19,18 +21,24 @@ def __main__():

# read in json reports
data = []
for file in os.listdir(args.input):
if file.endswith(".json"):
with open(file, "r") as infile:
data.append(Counter(json.load(infile)))
if args.single_input:
with open(args.single_input, 'r') as infile:
data.append(Counter(json.load(infile)))
else:
for file in os.listdir(args.input):
if file.endswith(".json"):
with open(file, "r") as infile:
data.append(Counter(json.load(infile)))

# merge and write json report
merged_data = sum(data, Counter())
merged_data['prediction_methods'] = ''.join(set(merged_data['prediction_methods']))
merged_data['number_of_unique_peptides'] = len(set(merged_data['number_of_unique_peptides']))
merged_data['number_of_unique_peptides_after_filtering'] = len(set(merged_data['number_of_unique_peptides_after_filtering']))
merged_data['number_of_unique_nonbinders'] = len(set(merged_data['number_of_unique_nonbinders']))
merged_data['number_of_unique_binders'] = len(set(merged_data['number_of_unique_binders']))

with open('prediction_report.json', 'w') as outfile:
with open('{}_prediction_report.json'.format(args.prefix), 'w') as outfile:
json.dump(merged_data, outfile)

if __name__ == "__main__":
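For intuition, the merging relies on `Counter` addition: the list-valued fields of the per-chunk reports are concatenated key by key and only then reduced to unique counts. A minimal sketch with two hypothetical chunk reports (key names as in the statistics dicts above; note the pipeline's scripts run under Python 2, where the list-against-integer comparison inside `Counter.__add__` is valid):

```python
from collections import Counter

# two hypothetical per-chunk reports, as epaa.py would write them
report_a = {'number_of_unique_peptides': ['AAAGLVVSL', 'HTHVYLFLS']}
report_b = {'number_of_unique_peptides': ['HTHVYLFLS', 'HVYLFLSNL']}

# Counter addition concatenates the peptide lists key by key ...
merged = sum([Counter(report_a), Counter(report_b)], Counter())

# ... and the final report keeps only the deduplicated count
merged['number_of_unique_peptides'] = len(set(merged['number_of_unique_peptides']))
print(merged['number_of_unique_peptides'])  # 3
```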
13 changes: 10 additions & 3 deletions docs/output.md
@@ -16,11 +16,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

[FRED2](https://github.com/FRED-2) is used to perform the epitope prediction on the given data, independent of the chosen `tools`.

**Output directory: `results/`**
**Output directory: `predictions/`**

* `prediction_report.json`
* `[input_base_name]_prediction_report.json`
* The predicted epitopes in JSON format for downstream analysis tasks
* `prediction_report.tsv`
* `[input_base_name]_prediction_results.tsv`
* The predicted epitopes in TSV format for further processing.

An example report looks like this in TSV format:
@@ -33,6 +33,13 @@
HTHVYLFLS 9 17 3336962 ENSG00000127780 ENST00000248384 ENSP00000248384 SNP syfpe
HVYLFLSNL 9 17 3336962 ENSG00000127780 ENST00000248384 ENSP00000248384 SNP syfpeithi-1.0 0.0 0.0 False False False c.173C>A p.Pro58His
```

When the parameter `--fasta_output` is specified, a `FASTA` file is generated that contains the sequences of proteins affected by the provided genomic variants. The resulting `FASTA` file contains both the wild-type and the mutated protein sequences.

**Output directory: `predictions/`**

* `[input_base_name]_prediction_proteins.fasta`
* The sequences of proteins affected by the provided variants, in FASTA format.
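For illustration, each record follows the header pattern `>transcript_id:aa_mutation_syntax:cds_mutation_syntax` written by `epaa.py`. Using the variant from the example report above, an entry could look like this (the sequence shown is shortened and hypothetical):

```
>ENST00000248384:p.Pro58His:c.173C>A
MAVLWRLSAVHTHVYLFLSNL...
```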

### Supported models

When running the pipeline using the `--show_supported_models` parameter, the information about supported models for the available predictor tool versions will be written to the results folder as well.
73 changes: 55 additions & 18 deletions main.nf
@@ -34,8 +34,9 @@ def helpMessage() {

Main options:
--show_supported_models [str] Writes out supported models. Does not run actual prediction pipeline. Default: false.
--filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references Default: false
--wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false
--filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references. Default: false
--wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well. Default: false
--fasta_output [bool] Specifies that sequences of proteins, affected by provided variants, will be written to a FASTA file. Default: false
--mhc_class [1,2] Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1
--max_peptide_length [int] Specifies the maximum peptide length (not applied when '--peptides' is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa
--min_peptide_length [int] Specifies the minimum peptide length (not applied when '--peptides' is specified). Default: MHC class I: 8 aa, MHC class II: 15 aa
@@ -79,25 +80,37 @@ ch_split_variants = Channel.empty()
ch_alleles = Channel.empty()
ch_check_alleles = Channel.empty()

// Store input base name for later
def input_base_name = ''

if ( !params.show_supported_models ){
if ( params.peptides ) {
if ( params.fasta_output ) {
exit 1, "Peptide input not compatible with protein sequence FASTA output."
}
if ( params.wild_type ) {
exit 1, "Peptide input not compatible with wild-type sequence generation."
}
input_base_name = file(params.peptides).baseName
Channel
.fromPath(params.peptides, checkIfExists: true)
.set { ch_peptides }
(ch_peptides, ch_check_peptides) = ch_peptides.into(2)
}
else if ( params.proteins ) {
if ( params.fasta_output ) {
exit 1, "Protein input not compatible with protein sequence FASTA output."
}
if ( params.wild_type ) {
exit 1, "Protein input not compatible with wild-type sequence generation."
}
input_base_name = file(params.proteins).baseName
Channel
.fromPath(params.proteins, checkIfExists: true)
.set { ch_proteins }
}
else if (params.input) {
input_base_name = file(params.input).baseName
Channel
.fromPath(params.input, checkIfExists: true)
.set { ch_split_variants }
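As a usage illustration, a variant-based run that also writes the protein FASTA could be launched as follows (the file names are placeholders, and the allele option is assumed from the pipeline's standard parameters, not shown in this diff):

```bash
nextflow run nf-core/epitopeprediction \
    --input 'variants.vcf' \
    --alleles 'alleles.txt' \
    --fasta_output \
    -profile docker
```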
@@ -175,7 +188,7 @@ summary['Run Name'] = custom_runName ?: workflow.runName
summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
//Pipeline Parameters
if ( params.show_supported_models ) {
summary['Show supported models'] = params.show_supported_models
summary['Show Supported Models'] = params.show_supported_models
} else {
if ( params.input ) summary['Variants'] = params.input
if ( params.peptides ) summary['Peptides'] = params.peptides
@@ -187,19 +200,20 @@ if ( params.show_supported_models ) {
}
summary['MHC Class'] = params.mhc_class
if ( !params.peptides && !params.proteins ) summary['Reference Genome'] = params.genome_version
if ( params.proteome ) summary['Reference proteome'] = params.proteome
if ( params.proteome ) summary['Reference Proteome'] = params.proteome
summary['Self-Filter'] = params.filter_self
summary['Tools'] = params.tools
summary['Wild-types'] = params.wild_type
if ( params.peptides || params.proteins ) summary['Max. number of chunks for parallelization'] = params.peptides_split_maxchunks
if ( params.peptides || params.proteins ) summary['Min. number of peptides in one chunk'] = params.peptides_split_minchunksize
summary['Protein FASTA Output'] = params.fasta_output
if ( params.peptides || params.proteins ) summary['Max. Number of Chunks for Parallelization'] = params.peptides_split_maxchunks
if ( params.peptides || params.proteins ) summary['Min. Number of Peptides in One Chunk'] = params.peptides_split_minchunksize
}
summary['Memory mode'] = params.mem_mode
summary['Memory Mode'] = params.mem_mode
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir'] = params.outdir
summary['Launch dir'] = workflow.launchDir
summary['Working dir'] = workflow.workDir
summary['Script dir'] = workflow.projectDir
summary['Output Dir'] = params.outdir
summary['Launch Dir'] = workflow.launchDir
summary['Working Dir'] = workflow.workDir
summary['Script Dir'] = workflow.projectDir
summary['User'] = workflow.userName
if (workflow.profile.contains('awsbatch')) {
summary['AWS Region'] = params.awsregion
@@ -403,11 +417,13 @@ process peptidePrediction {
output:
file "*.tsv" into ch_predicted_peptides
file "*.json" into ch_json_reports
file "*.fasta" optional true into ch_protein_fastas

script:
def input_type = params.peptides ? "--peptides ${inputs}" : params.proteins ? "--peptides ${inputs}" : "--somatic_mutations ${inputs}"
def ref_prot = params.proteome ? "--proteome ${params.proteome}" : ""
def wt = params.wild_type ? "--wild_type" : ""
def fasta_output = params.fasta_output ? "--fasta_output" : ""
"""
# create folder for MHCflurry downloads to avoid permission problems when running pipeline with docker profile and mhcflurry selected
mkdir -p mhcflurry-data
@@ -424,28 +440,49 @@ process peptidePrediction {
--versions ${software_versions} \
--reference ${params.genome_version} \
${ref_prot} \
${wt}
${wt} \
${fasta_output}
"""
}

/*
* STEP 3 - Combine epitope prediction results
*/
process mergeResults {
publishDir "${params.outdir}/results", mode: 'copy'
publishDir "${params.outdir}/predictions", mode: 'copy'

input:
file predictions from ch_predicted_peptides.collect()

output:
file 'prediction_result.tsv'
file "${input_base_name}_prediction_result.tsv"

script:
def single = predictions instanceof Path ? 1 : predictions.size()
def merge = (single == 1) ? 'cat' : 'csvtk concat -t'

"""
$merge $predictions > prediction_result.tsv
$merge $predictions > ${input_base_name}_prediction_result.tsv
"""
}

/*
* STEP 3(2) optional - Combine protein sequences
*/
process mergeFastas {
publishDir "${params.outdir}/predictions", mode: 'copy'

input:
file proteins from ch_protein_fastas.collect()

output:
file "${input_base_name}_prediction_proteins.fasta"

when:
params.fasta_output

"""
cat $proteins > ${input_base_name}_prediction_proteins.fasta
"""
}
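Plain `cat` is sufficient here: FASTA is a record-delimited text format, so concatenating the per-chunk files already yields a valid multi-record FASTA without any header reconciliation.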

@@ -454,17 +491,17 @@ process mergeResults {
*/

process mergeReports {
publishDir "${params.outdir}/results", mode: 'copy'
publishDir "${params.outdir}/predictions", mode: 'copy'

input:
file jsons from ch_json_reports.collect()

output:
file 'prediction_report.json'
file "${input_base_name}_prediction_report.json"

script:
def single = jsons instanceof Path ? 1 : jsons.size()
def command = (single == 1) ? "cat ${jsons} > prediction_report.json" : "merge_jsons.py --input \$PWD"
def command = (single == 1) ? "merge_jsons.py --single_input ${jsons} --prefix ${input_base_name}" : "merge_jsons.py --input \$PWD --prefix ${input_base_name}"

"""
$command
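Routing the single-report case through `merge_jsons.py --single_input` instead of a plain `cat` presumably keeps the output consistent: the prefix is applied and the list-valued fields are reduced to unique counts either way.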
1 change: 1 addition & 0 deletions nextflow.config
@@ -18,6 +18,7 @@ params {
tools = 'syfpeithi'
genome_version = 'GRCh37'
wild_type = false
fasta_output = false
peptides_split_maxchunks = 100
peptides_split_minchunksize = 5000
show_supported_models = false