Add option to use FASTA files with protein sequences using Fred2 by skrakau · Pull Request #42 · nf-core/epitopeprediction

Add option to use FASTA files with protein sequences using Fred2 #42

Merged · 1 commit · Apr 14, 2020
2 changes: 2 additions & 0 deletions bin/epaa.py
@@ -330,6 +330,8 @@ def read_peptide_input(filename):

    '''expected columns (min required): id sequence'''
    with open(filename, 'r') as peptide_input:
        # enable listing of protein names for each peptide
        csv.field_size_limit(600000)
        reader = csv.DictReader(peptide_input, delimiter='\t')
        for row in reader:
            pep = Peptide(row['sequence'])
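A note on the added lines: a peptide generated from whole protein sets can map to many proteins, so the tab-separated protein_ids column may exceed CPython's default CSV field limit of 131072 characters, which is why csv.field_size_limit(600000) is raised here. A minimal, self-contained Python 3 sketch of the effect; the synthetic row below is illustrative only, not pipeline data:

import csv
import io

# Build one peptide row whose protein_ids column is far longer than the
# default CSV field limit (131072 characters in CPython); without raising
# the limit, DictReader raises "_csv.Error: field larger than field limit".
protein_ids = ",".join("PROT{}".format(i) for i in range(20000))
tsv = "id\tsequence\tprotein_ids\n" + "0\tSYFPEITHI\t" + protein_ids + "\n"

csv.field_size_limit(600000)  # same value as used in epaa.py
reader = csv.DictReader(io.StringIO(tsv), delimiter='\t')
for row in reader:
    print(len(row['protein_ids']))  # parses fine with the raised limit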
53 changes: 53 additions & 0 deletions bin/gen_peptides.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python

import sys
import argparse
import pandas as pd

from Bio.SeqIO.FastaIO import SimpleFastaParser
from Fred2.Core import Allele, Peptide, Protein, generate_peptides_from_proteins


parser = argparse.ArgumentParser("Generating peptides from protein sequences.")
parser.add_argument('-i', '--input', metavar='FILE', type=argparse.FileType('r'), help = 'FASTA filename containing proteins.')
parser.add_argument('-o', '--output', metavar='FILE', type=argparse.FileType('w'), help='Output file containing peptides.')
parser.add_argument('-min', '--min_length', metavar='N', type=int, help='Minimal length of peptides that will be generated.')
parser.add_argument('-max', '--max_length', metavar='N', type=int, help='Maximum length of peptides that will be generated.')
args = parser.parse_args()



def read_protein_fasta(file):
    # split at first whitespace and use short ID

    collect = set()
    # iterate over all FASTA entries:
    for _id, seq in SimpleFastaParser(file):
        # generate element:
        _id = _id.split(" ")[0]

        try:
            collect.add(Protein(seq.strip().upper(), transcript_id=_id))
        except TypeError:
            collect.add(Protein(seq.strip().upper()))
    return list(collect)


proteins = read_protein_fasta(args.input)

c = 0
for k in range(args.min_length, args.max_length+1):
    peptides = generate_peptides_from_proteins(proteins, k)
    # get proteins and corresponding counts
    pd_peptides = pd.DataFrame(
        [ (str(pep), ','.join([ prot.transcript_id.split(' ')[0] for prot in pep.get_all_proteins() ]), ','.join([ str(len(pep.proteinPos[prot.transcript_id])) for prot in pep.get_all_proteins() ])) for pep in peptides ],
        columns = ['sequence', 'protein_ids', 'counts']
    )
    # assign id
    pd_peptides = pd_peptides.assign(id=[str(c+id) for id in pd_peptides.index])
    c += len(pd_peptides['sequence'])

    if k == args.min_length:
        pd_peptides[['sequence','id','protein_ids','counts']].to_csv(args.output, sep='\t', index=False)
    else:
        pd_peptides[['sequence','id','protein_ids','counts']].to_csv(args.output, sep='\t', index=False, mode='a', header=False)
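For readers without a Fred2 installation, the following Python 3 sketch mimics what gen_peptides.py does: enumerate every k-mer of the requested lengths from each protein and record, per peptide, the source protein IDs and per-protein occurrence counts, mirroring the sequence/id/protein_ids/counts columns written above. The toy proteins and helper names are illustrative assumptions, not part of the pipeline.

from collections import defaultdict

# toy protein records; in the pipeline these come from the input FASTA
proteins = {"protA": "MKTAYIAKQR", "protB": "TAYIAKQRQI"}

def kmer_table(prots, min_len, max_len):
    # peptide sequence -> {protein_id: number of occurrences in that protein}
    hits = defaultdict(lambda: defaultdict(int))
    for k in range(min_len, max_len + 1):
        for pid, seq in prots.items():
            for i in range(len(seq) - k + 1):
                hits[seq[i:i + k]][pid] += 1
    return hits

# columns mirror peptides.tsv: sequence, id, protein_ids, counts
print("sequence\tid\tprotein_ids\tcounts")
for idx, (pep, per_prot) in enumerate(kmer_table(proteins, 8, 9).items()):
    print("\t".join([pep, str(idx),
                     ",".join(per_prot),
                     ",".join(str(c) for c in per_prot.values())]))

The actual script additionally relies on Fred2's Protein objects and peptide position maps (pep.proteinPos) to derive the counts column, as visible in the diff above.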
18 changes: 18 additions & 0 deletions conf/test_proteins.config
@@ -0,0 +1,18 @@
/*
 * -------------------------------------------------
 *  Nextflow config file for running tests
 * -------------------------------------------------
 * Defines bundled input files and everything required
 * to run a fast and simple test. Use as follows:
 *   nextflow run nf-core/epitopeprediction -profile test_proteins
 */

params {
  max_cpus = 2
  max_memory = 6.GB
  max_time = 4.h

  // Input data
  proteins = 'https://github.com/nf-core/test-datasets/raw/epitopeprediction/testdata/proteins/proteins.fasta'
  alleles = 'https://github.com/nf-core/test-datasets/raw/epitopeprediction/testdata/alleles/alleles.txt'
}
51 changes: 43 additions & 8 deletions main.nf
@@ -30,7 +30,9 @@ def helpMessage() {

    Alternative inputs:
      --peptides [file]                 Path to TSV file containing peptide sequences (minimum required: id and sequence column)
      --proteins [file]                 Path to FASTA file containing protein sequences

    Pipeline options:
      --filter_self [bool]              Specifies that peptides should be filtered against the specified human proteome references Default: false
      --wild_type [bool]                Specifies that wild-type sequences of mutated peptides should be predicted as well Default: false
@@ -62,10 +64,11 @@ if (params.help) {
    exit 0
}

//Generate empty channels for peptides and variants
ch_split_peptides = Channel.empty()
ch_split_variants = Channel.empty()

//Generate empty channels for peptides, proteins and variants
ch_peptides = Channel.empty()
ch_proteins = Channel.empty()
ch_split_variants = Channel.empty()


if ( params.peptides ) {
@@ -75,7 +78,16 @@ if ( params.peptides ) {
    Channel
        .fromPath(params.peptides)
        .ifEmpty { exit 1, "Peptide input not found: ${params.peptides}" }
        .set { ch_split_peptides }
        .set { ch_peptides }
}
else if ( params.proteins ) {
    if ( params.wild_type ) {
        exit 1, "Protein input not compatible with wild-type sequence generation."
    }
    Channel
        .fromPath(params.proteins)
        .ifEmpty { exit 1, "Protein input not found: ${params.proteins}" }
        .set { ch_proteins }
}
else if (params.input) {
    Channel
@@ -84,7 +96,7 @@ else if (params.input) {
        .set { ch_split_variants }
}
else {
    exit 1, "Please specify a file that contains annotated variants OR a file that contains peptide sequences."
    exit 1, "Please specify a file that contains annotated variants, protein sequences OR peptide sequences."
}

if ( !params.alleles ) {
@@ -134,6 +146,7 @@ if ( params.alleles ) summary['Alleles'] = params.alleles
summary['Max. Peptide Length'] = params.max_peptide_length
summary['MHC Class'] = params.mhc_class
if ( params.peptides ) summary['Peptides'] = params.peptides
if ( params.proteins ) summary['Proteins'] = params.proteins
if ( !params.peptides && !params.proteins ) summary['Reference Genome'] = params.genome
if ( params.proteome ) summary['Reference proteome'] = params.proteome
summary['Self-Filter'] = params.filter_self
@@ -247,8 +260,30 @@ process splitVariants {
}
}


/*
 * STEP 0b - Process FASTA file and generate peptides
 */
if (params.proteins) {
    process genPeptides {
        input:
        file proteins from ch_proteins

        output:
        file 'peptides.tsv' into ch_split_peptides

        when: !params.peptides

        script:
        """
        gen_peptides.py --input ${proteins} --output 'peptides.tsv' --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length}
        """
    }
} else {
    ch_peptides.set{ch_split_peptides}
}
/*
 * STEP 1b - Split peptide data
 * STEP 1b- Split peptide data
 */
process splitPeptides {
    input:
@@ -282,7 +317,7 @@ process peptidePrediction {
file "*.json" into ch_json_reports

script:
def input_type = params.peptides ? "--peptides ${inputs}" : "--somatic_mutations ${inputs}"
def input_type = params.peptides ? "--peptides ${inputs}" : params.proteins ? "--peptides ${inputs}" : "--somatic_mutations ${inputs}"
def ref_prot = params.proteome ? "--proteome ${params.proteome}" : ""
def wt = params.wild_type ? "--wild_type" : ""
"""
1 change: 1 addition & 0 deletions nextflow.config
@@ -12,6 +12,7 @@ params {
  filter_self = false
  mhc_class = 1
  peptides = false
  proteins = false
  max_peptide_length = (mhc_class == 1) ? 11 : 16
  min_peptide_length = (mhc_class == 1) ? 8 : 15
  tools = 'syfpeithi'