Add functionality to write fasta with protein sequences by christopher-mohr · Pull Request #57 · nf-core/epitopeprediction · GitHub
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#57](https://github.com/nf-core/epitopeprediction/pull/57) - Add option (`--fasta_output`) to write out FASTA file with protein sequences
- [#45](https://github.com/nf-core/epitopeprediction/pull/45) - Add test for FASTA input
- [#44](https://github.com/nf-core/epitopeprediction/pull/44) - Add parameter (`--show_supported_models`) to write out supported models in files
- [#44](https://github.com/nf-core/epitopeprediction/pull/44) - Add check if requested models for specified tools are supported by `FRED2`
@@ -25,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Fixed`

- [#56](https://github.com/nf-core/epitopeprediction/pull/56) - Fix result output for more than one prediction method [#55](https://github.com/nf-core/epitopeprediction/issues/55)
- [#53](https://github.com/nf-core/epitopeprediction/pull/53) - Fix score and affinity output of MHCnuggets and MHCflurry [#32](https://github.com/nf-core/epitopeprediction/issues/32)
- [#52](https://github.com/nf-core/epitopeprediction/pull/52) - Fix MHCflurry permission problem when run with docker profile [#51](https://github.com/nf-core/epitopeprediction/issues/51)
- [#39](https://github.com/nf-core/epitopeprediction/pull/39) - Fix display of prediction tool version [#36](https://github.com/nf-core/epitopeprediction/issues/36)
31 changes: 23 additions & 8 deletions bin/epaa.py
@@ -224,6 +224,7 @@ def read_vcf(filename, pass_only=True):
r = str(record.REF)
v_list = record.ALT
f = record.FILTER

if pass_only and f:
continue

@@ -587,6 +588,7 @@ def create_ligandomics_column_value_for_result(row, lig_id, val, wild_type):
else:
return ''


def get_protein_ids_for_transcripts(idtype, transcripts, ensembl_url, reference):
result = {}
result_swissProt = {}
@@ -836,9 +838,9 @@ def make_predictions_from_variants(variants_all, methods, alleles, minlength, ma
df = df.rename(columns={'Method': 'method'})
pred_dataframes.append(df)

statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ] ,'number_of_variants': len(variants_all), 'number_of_peptides': len(all_peptides), 'number_of_peptides_after_filtering': len(all_peptides_filtered)}
statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ] ,'number_of_variants': len(variants_all), 'number_of_unique_peptides': [str(p) for p in all_peptides], 'number_of_unique_peptides_after_filtering': [str(p) for p in all_peptides_filtered]}

return pred_dataframes, statistics, all_peptides_filtered
return pred_dataframes, statistics, all_peptides_filtered, prots


def make_predictions_from_peptides(peptides, methods, alleles, protein_db, identifier, metadata):
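The statistics dicts now store the peptide sequences themselves rather than plain counts, so that `merge_jsons.py` can deduplicate peptides across split inputs before counting; see the sketch after the `bin/merge_jsons.py` diff below.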
@@ -913,11 +915,9 @@ def make_predictions_from_peptides(peptides, methods, alleles, protein_db, ident
pred_dataframes.append(df)

# write prediction statistics
statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ],'number_of_variants': '-', 'number_of_peptides': len(peptides), 'number_of_peptides_after_filtering': len(peptides_filtered)}

statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ],'number_of_variants': '-', 'number_of_unique_peptides': [str(p) for p in peptides], 'number_of_unique_peptides_after_filtering': [str(p) for p in peptides_filtered]}
return pred_dataframes, statistics


def __main__():
parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles.
Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
@@ -934,6 +934,7 @@ def __main__():
parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
@@ -1001,12 +1002,12 @@ def __main__():
if args.peptides:
pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
else:
pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
else:
if args.peptides:
pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
else:
pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

# concat dataframes for all peptide lengths
try:
@@ -1098,11 +1099,25 @@ def __main__():
complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

# write mutated protein sequences to fasta file
if args.fasta_output:
with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
for p in proteins:
variants = []
for v in p.vars:
variants = variants + p.vars[v]
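# gather the coding annotations of every variant attached to this protein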
c = [x.coding.values() for x in variants]
cf = list(itertools.chain.from_iterable(c))
cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
aas = ','.join([y.aaMutationSyntax for y in set(cf)])
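# FASTA header: transcript ID plus the amino-acid and CDS mutation syntax of all contained variants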
protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
protein_outfile.write('{}\n'.format(str(p)))

# write dataframe to tsv
complete_df = complete_df.fillna('')
complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False)

statistics['number_of_predictions'] = complete_df.shape[0]
statistics['number_of_predictions'] = len(complete_df)
statistics['number_of_binders'] = len(pos_predictions)
statistics['number_of_nonbinders'] = len(neg_predictions)
statistics['number_of_unique_binders'] = list(set(binders))
18 changes: 13 additions & 5 deletions bin/merge_jsons.py
@@ -9,7 +9,9 @@

def __main__():
parser = argparse.ArgumentParser(description="")
parser.add_argument('-s', "--single_input", help='Single input JSON report')
parser.add_argument('-i', "--input", help='Input directory with JSON reports')
parser.add_argument('-p', "--prefix", help="Prefix for output")

args = parser.parse_args()

@@ -19,18 +21,24 @@ def __main__():

# read in json reports
data = []
for file in os.listdir(args.input):
if file.endswith(".json"):
with open(file, "r") as infile:
data.append(Counter(json.load(infile)))
if args.single_input:
with open(args.single_input, 'r') as infile:
data.append(Counter(json.load(infile)))
else:
for file in os.listdir(args.input):
if file.endswith(".json"):
with open(file, "r") as infile:
data.append(Counter(json.load(infile)))

# merge and write json report
merged_data = sum(data, Counter())
merged_data['prediction_methods'] = ''.join(set(merged_data['prediction_methods']))
merged_data['number_of_unique_peptides'] = len(set(merged_data['number_of_unique_peptides']))
merged_data['number_of_unique_peptides_after_filtering'] = len(set(merged_data['number_of_unique_peptides_after_filtering']))
merged_data['number_of_unique_nonbinders'] = len(set(merged_data['number_of_unique_nonbinders']))
merged_data['number_of_unique_binders'] = len(set(merged_data['number_of_unique_binders']))

with open('prediction_report.json', 'w') as outfile:
with open('{}_prediction_report.json'.format(args.prefix), 'w') as outfile:
json.dump(merged_data, outfile)

if __name__ == "__main__":
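For intuition, the merging relies on `Counter` addition: the list-valued fields of the per-chunk reports are concatenated key by key and only then reduced to unique counts. A minimal sketch with two hypothetical chunk reports (key names as in the statistics dicts above; note the pipeline's scripts run under Python 2, where the list-against-integer comparison inside `Counter.__add__` is valid):

```python
from collections import Counter

# two hypothetical per-chunk reports, as epaa.py would write them
report_a = {'number_of_unique_peptides': ['AAAGLVVSL', 'HTHVYLFLS']}
report_b = {'number_of_unique_peptides': ['HTHVYLFLS', 'HVYLFLSNL']}

# Counter addition concatenates the peptide lists key by key ...
merged = sum([Counter(report_a), Counter(report_b)], Counter())

# ... and the final report keeps only the deduplicated count
merged['number_of_unique_peptides'] = len(set(merged['number_of_unique_peptides']))
print(merged['number_of_unique_peptides'])  # 3
```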
13 changes: 10 additions & 3 deletions docs/output.md
@@ -16,11 +16,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

[FRED2](https://github.com/FRED-2) is used to perform the epitope prediction on the given data, independent of the chosen `tools`.

**Output directory: `results/`**
**Output directory: `predictions/`**

* `prediction_report.json`
* `[input_base_name]_prediction_report.json`
* The predicted epitopes in JSON format for downstream analysis tasks
* `prediction_report.tsv`
* `[input_base_name]_prediction_results.tsv`
* The predicted epitopes in TSV format for further processing.

An example report looks like this in TSV format:
@@ -33,6 +33,13 @@
HTHVYLFLS 9 17 3336962 ENSG00000127780 ENST00000248384 ENSP00000248384 SNP syfpe
HVYLFLSNL 9 17 3336962 ENSG00000127780 ENST00000248384 ENSP00000248384 SNP syfpeithi-1.0 0.0 0.0 False False False c.173C>A p.Pro58His
```

When the parameter `--fasta_output` is specified, a `FASTA` file is generated that contains the sequences of proteins affected by the provided genomic variants. The resulting `FASTA` file contains both the wild-type and the mutated protein sequences.

**Output directory: `predictions/`**

* `[input_base_name]_prediction_proteins.fasta`
* The sequences of proteins affected by the provided variants, in FASTA format.
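For illustration, each record follows the header pattern `>transcript_id:aa_mutation_syntax:cds_mutation_syntax` written by `epaa.py`. Using the variant from the example report above, an entry could look like this (the sequence shown is shortened and hypothetical):

```
>ENST00000248384:p.Pro58His:c.173C>A
MAVLWRLSAVHTHVYLFLSNL...
```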

### Supported models

When running the pipeline using the `--show_supported_models` parameter, the information about supported models for the available predictor tool versions will be written to the results folder as well.
73 changes: 55 additions & 18 deletions main.nf
@@ -34,8 +34,9 @@ def helpMessage() {

Main options:
--show_supported_models [str] Writes out supported models. Does not run actual prediction pipeline. Default: false.
--filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references Default: false
--wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false
--filter_self [bool] Specifies that peptides should be filtered against the specified human proteome references. Default: false
--wild_type [bool] Specifies that wild-type sequences of mutated peptides should be predicted as well. Default: false
--fasta_output [bool] Specifies that sequences of proteins, affected by provided variants, will be written to a FASTA file. Default: false
--mhc_class [1,2] Specifies whether the predictions should be done for MHC class I (1) or class II (2). Default: 1
--max_peptide_length [int] Specifies the maximum peptide length (not applied when '--peptides' is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa
--min_peptide_length [int] Specifies the minimum peptide length (not applied when '--peptides' is specified). Default: MHC class I: 8 aa, MHC class II: 15 aa
@@ -79,25 +80,37 @@ ch_split_variants = Channel.empty()
ch_alleles = Channel.empty()
ch_check_alleles = Channel.empty()

// Store input base name for later
def input_base_name = ''

if ( !params.show_supported_models ){
if ( params.peptides ) {
if ( params.fasta_output ) {
exit 1, "Peptide input not compatible with protein sequence FASTA output."
}
if ( params.wild_type ) {
exit 1, "Peptide input not compatible with wild-type sequence generation."
}
input_base_name = file(params.peptides).baseName
Channel
.fromPath(params.peptides, checkIfExists: true)
.set { ch_peptides }
(ch_peptides, ch_check_peptides) = ch_peptides.into(2)
}
else if ( params.proteins ) {
if ( params.fasta_output ) {
exit 1, "Protein input not compatible with protein sequence FASTA output."
}
if ( params.wild_type ) {
exit 1, "Protein input not compatible with wild-type sequence generation."
}
input_base_name = file(params.proteins).baseName
Channel
.fromPath(params.proteins, checkIfExists: true)
.set { ch_proteins }
}
else if (params.input) {
input_base_name = file(params.input).baseName
Channel
.fromPath(params.input, checkIfExists: true)
.set { ch_split_variants }
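As a usage illustration, a variant-based run that also writes the protein FASTA could be launched as follows (the file names are placeholders, and the allele option is assumed from the pipeline's standard parameters, not shown in this diff):

```bash
nextflow run nf-core/epitopeprediction \
    --input 'variants.vcf' \
    --alleles 'alleles.txt' \
    --fasta_output \
    -profile docker
```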
@@ -175,7 +188,7 @@ summary['Run Name'] = custom_runName ?: workflow.runName
summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
//Pipeline Parameters
if ( params.show_supported_models ) {
summary['Show supported models'] = params.show_supported_models
summary['Show Supported Models'] = params.show_supported_models
} else {
if ( params.input ) summary['Variants'] = params.input
if ( params.peptides ) summary['Peptides'] = params.peptides
@@ -187,19 +200,20 @@ if ( params.show_supported_models ) {
}
summary['MHC Class'] = params.mhc_class
if ( !params.peptides && !params.proteins ) summary['Reference Genome'] = params.genome_version
if ( params.proteome ) summary['Reference proteome'] = params.proteome
if ( params.proteome ) summary['Reference Proteome'] = params.proteome
summary['Self-Filter'] = params.filter_self
summary['Tools'] = params.tools
summary['Wild-types'] = params.wild_type
if ( params.peptides || params.proteins ) summary['Max. number of chunks for parallelization'] = params.peptides_split_maxchunks
if ( params.peptides || params.proteins ) summary['Min. number of peptides in one chunk'] = params.peptides_split_minchunksize
summary['Protein FASTA Output'] = params.fasta_output
if ( params.peptides || params.proteins ) summary['Max. Number of Chunks for Parallelization'] = params.peptides_split_maxchunks
if ( params.peptides || params.proteins ) summary['Min. Number of Peptides in One Chunk'] = params.peptides_split_minchunksize
}
summary['Memory mode'] = params.mem_mode
summary['Memory Mode'] = params.mem_mode
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir'] = params.outdir
summary['Launch dir'] = workflow.launchDir
summary['Working dir'] = workflow.workDir
summary['Script dir'] = workflow.projectDir
summary['Output Dir'] = params.outdir
summary['Launch Dir'] = workflow.launchDir
summary['Working Dir'] = workflow.workDir
summary['Script Dir'] = workflow.projectDir
summary['User'] = workflow.userName
if (workflow.profile.contains('awsbatch')) {
summary['AWS Region'] = params.awsregion
@@ -403,11 +417,13 @@ process peptidePrediction {
output:
file "*.tsv" into ch_predicted_peptides
file "*.json" into ch_json_reports
file "*.fasta" optional true into ch_protein_fastas

script:
def input_type = params.peptides ? "--peptides ${inputs}" : params.proteins ? "--peptides ${inputs}" : "--somatic_mutations ${inputs}"
def ref_prot = params.proteome ? "--proteome ${params.proteome}" : ""
def wt = params.wild_type ? "--wild_type" : ""
def fasta_output = params.fasta_output ? "--fasta_output" : ""
"""
# create folder for MHCflurry downloads to avoid permission problems when running pipeline with docker profile and mhcflurry selected
mkdir -p mhcflurry-data
@@ -424,28 +440,49 @@ process peptidePrediction {
--versions ${software_versions} \
--reference ${params.genome_version} \
${ref_prot} \
${wt}
${wt} \
${fasta_output}
"""
}

/*
* STEP 3 - Combine epitope prediction results
*/
process mergeResults {
publishDir "${params.outdir}/results", mode: 'copy'
publishDir "${params.outdir}/predictions", mode: 'copy'

input:
file predictions from ch_predicted_peptides.collect()

output:
file 'prediction_result.tsv'
file "${input_base_name}_prediction_result.tsv"

script:
def single = predictions instanceof Path ? 1 : predictions.size()
def merge = (single == 1) ? 'cat' : 'csvtk concat -t'

"""
$merge $predictions > prediction_result.tsv
$merge $predictions > ${input_base_name}_prediction_result.tsv
"""
}

/*
* STEP 3(2) optional - Combine protein sequences
*/
process mergeFastas {
publishDir "${params.outdir}/predictions", mode: 'copy'

input:
file proteins from ch_protein_fastas.collect()

output:
file "${input_base_name}_prediction_proteins.fasta"

when:
params.fasta_output

"""
cat $proteins > ${input_base_name}_prediction_proteins.fasta
"""
}
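Plain `cat` is sufficient here: FASTA is a record-delimited text format, so concatenating the per-chunk files already yields a valid multi-record FASTA without any header reconciliation.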

@@ -454,17 +491,17 @@ process mergeResults {
*/

process mergeReports {
publishDir "${params.outdir}/results", mode: 'copy'
publishDir "${params.outdir}/predictions", mode: 'copy'

input:
file jsons from ch_json_reports.collect()

output:
file 'prediction_report.json'
file "${input_base_name}_prediction_report.json"

script:
def single = jsons instanceof Path ? 1 : jsons.size()
def command = (single == 1) ? "cat ${jsons} > prediction_report.json" : "merge_jsons.py --input \$PWD"
def command = (single == 1) ? "merge_jsons.py --single_input ${jsons} --prefix ${input_base_name}" : "merge_jsons.py --input \$PWD --prefix ${input_base_name}"

"""
$command
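Routing the single-report case through `merge_jsons.py --single_input` instead of a plain `cat` presumably keeps the output consistent: the prefix is applied and the list-valued fields are reduced to unique counts either way.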
1 change: 1 addition & 0 deletions nextflow.config
@@ -18,6 +18,7 @@ params {
tools = 'syfpeithi'
genome_version = 'GRCh37'
wild_type = false
fasta_output = false
peptides_split_maxchunks = 100
peptides_split_minchunksize = 5000
show_supported_models = false