8000 FDR filtering in DBSuitability Tool by Waschi97 · Pull Request #4814 · OpenMS/OpenMS · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

FDR filtering in DBSuitability Tool #4814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/tests/topp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -218,15 +218,15 @@ set_tests_properties("TOPP_MapNormalizer_1_out1" PROPERTIES DEPENDS "TOPP_MapNor
#------------------------------------------------------------------------------
# DatabaseSuitability tests
# test default
add_test("TOPP_DatabaseSuitability_1" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -out DatabaseSuitability_1.tmp)
add_test("TOPP_DatabaseSuitability_1" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -FDR 0.8 -out DatabaseSuitability_1.tmp)
add_test("TOPP_DatabaseSuitability_1_out" ${DIFF} -in1 DatabaseSuitability_1.tmp -in2 ${DATA_DIR_TOPP}/DatabaseSuitability_out_1.tsv )
set_tests_properties("TOPP_DatabaseSuitability_1_out" PROPERTIES DEPENDS "TOPP_DatabaseSuitability_1")
# test with custom novor_fract
add_test("TOPP_DatabaseSuitability_2" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -novor_fract 0.9 -out DatabaseSuitability_2.tmp)
add_test("TOPP_DatabaseSuitability_2" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_s 8000 pec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -FDR 1 -novor_fract 0.9 -out DatabaseSuitability_2.tmp)
add_test("TOPP_DatabaseSuitability_2_out" ${DIFF} -whitelist ${INDEX_WHITELIST} -in1 DatabaseSuitability_2.tmp -in2 ${DATA_DIR_TOPP}/DatabaseSuitability_out_2.tsv )
set_tests_properties("TOPP_DatabaseSuitability_2_out" PROPERTIES DEPENDS "TOPP_DatabaseSuitability_2")
# test without re-ranking
add_test("TOPP_DatabaseSuitability_3" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -force_no_re_rank -out DatabaseSuitability_3.tmp)
add_test("TOPP_DatabaseSuitability_3" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -FDR 0.9 -force_no_re_rank -out DatabaseSuitability_3.tmp)
add_test("TOPP_DatabaseSuitability_3_out" ${DIFF} -whitelist ${INDEX_WHITELIST} -in1 DatabaseSuitability_3.tmp -in2 ${DATA_DIR_TOPP}/DatabaseSuitability_out_3.tsv )
set_tests_properties("TOPP_DatabaseSuitability_3_out" PROPERTIES DEPENDS "TOPP_DatabaseSuitability_3")

Expand Down
6 changes: 3 additions & 3 deletions src/tests/topp/DatabaseSuitability_out_1.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
key value
#top_db_hits 438
#top_novo_hits 1
db_suitability 0.997722095671982
#top_db_hits 158
#top_novo_hits 2
db_suitability 0.9875
#total_novo_seqs 1118
#unique_novo_seqs 1111
#ms2_spectra 1120
Expand Down
4 changes: 2 additions & 2 deletions src/tests/topp/DatabaseSuitability_out_3.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
key value
#top_db_hits 436
#top_db_hits 297
#top_novo_hits 3
db_suitability 0.993166287015945
db_suitability 0.99
#total_novo_seqs 1118
#unique_novo_seqs 1111
#ms2_spectra 1120
Expand Down
94 changes: 74 additions & 20 deletions src/topp/DatabaseSuitability.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ To generate the de novo "database":
- @ref TOPP_IDFilter can filter out unwanted ones.
- @ref TOPP_IDFileConverter generates the de novo fasta file.

For re-ranking all cases where a peptide hit only found in the de novo "database" scores above a peptide hit found in the actual database are checked. In all these cases the cross-correlation scores of those peptide hits are compared. If they are similar enough, the database hit will be re-ranked to be on top of the de novo hit. You can control how much of cases with similar scores will be re-ranked by using the @p novor_fract option.
For re-ranking all cases where a peptide hit only found in the de novo "database" scores above a peptide hit found in the actual database are checked. In all these cases the cross-correlation scores of those peptide hits are compared. If they are similar enough, the database hit will be re-ranked to be on top of the de novo hit. You can control how much of cases with similar scores 8000 will be re-ranked by using the @p novor_fract option.@n
For this to work it is important that FDR filtering is done in this tool and not beforehand by @ref TOPP_FalseDiscoveryRate. You can provide the wanted FDR using the corresponding flag.

@note For identification search the only supported search engine for the time being is Comet because the Comet cross-correlation score is needed for re-ranking.@n
You can still uses other search engines and disable the re-ranking via the @p force_no_re_rank flag in this tool. This will probably result in an underestimated suitability though.@n
Expand Down Expand Up @@ -145,6 +146,9 @@ class DatabaseSuitability :
registerDoubleOption_("novor_fract", "<double>", 1, "Set the fraction of how many cases, where a de novo peptide scores just higher than the database peptide, you wish to re-rank.", false, true);
setMinFloat_("novor_fract", 0);
setMaxFloat_("novor_fract", 1);
registerDoubleOption_("FDR", "<double>", 0.01, "Filter peptide hits based on this q-value. (e.g., 0.05 = 5 % FDR)", false, true);
setMinFloat_("FDR", 0);
setMaxFloat_("FDR", 1);
registerFlag_("force_no_re_rank", "Use this flag if you want to disable re-ranking. Cases, where a de novo peptide scores just higher than the database peptide, are overlooked and counted as a de novo hit. This might underestimate the database quality.", true);
}

Expand All @@ -159,33 +163,34 @@ class DatabaseSuitability :
String in_novo = getStringOption_("in_novo");
String out = getStringOption_("out");
double novo_fract = getDoubleOption_("novor_fract");
double FDR = getDoubleOption_("FDR");
bool no_re_rank = getFlag_("force_no_re_rank");

//-------------------------------------------------------------
// reading input
//-------------------------------------------------------------
IdXMLFile x;
vector<ProteinIdentification> prot_ids;
vector<PeptideIdentification> pep_ids;
x.load(in_id, prot_ids, pep_ids);

vector<ProteinIdentification> novo_prots;
vector<PeptideIdentification> novo_peps;
x.load(in_novo, novo_prots, novo_peps);

// load mzML file in scope because we only need the number of ms2 spectra and no data
// this saves some memory
Size count_ms2_lvl;
{
MzMLFile m;
PeakFileOptions op;
op.setMSLevels({2}); // only ms2
op.setMSLevels({ 2 }); // only ms2
op.setFillData(false); // no data
m.setOptions(op);
PeakMap exp;
m.load(in_spec, exp);
count_ms2_lvl = exp.size();
}

IdXMLFile x;
vector<ProteinIdentification> prot_ids;
vector<PeptideIdentification> pep_ids;
x.load(in_id, prot_ids, pep_ids);

vector<ProteinIdentification> novo_prots;
vector<PeptideIdentification> novo_peps;
x.load(in_novo, novo_prots, novo_peps);

//-------------------------------------------------------------
// calculations
Expand All @@ -209,35 +214,64 @@ class DatabaseSuitability :
Size count_re_ranked = 0;
Size count_interest = 0;

for (const auto& pep_id : pep_ids)
for (auto& pep_id : pep_ids)
{
const vector<PeptideHit>& hits = pep_id.getHits();
vector<PeptideHit>& hits = pep_id.getHits();
bool q_value_score = (pep_id.getScoreType() == "q-value");

if (hits.empty()) continue;

// sort hits by q-value
if (q_value_score)
{
sort(hits.begin(), hits.end(),
[](const PeptideHit& a, const PeptideHit& b)
{
return a.getScore() < b.getScore();
});
}
else
{
if (!hits[0].metaValueExists("q-value"))
{
throw(Exception::Precondition(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No q-value found at peptide identification nor at peptide hits. Make sure 'False Discovery Rate' is run beforehand."));
}

sort(hits.begin(), hits.end(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes you want to sort by q-value.. I think the ranking should be the same with the normal score. Then you can use pep_id.sort(). Which is more concise and a bit faster (since it does not access meta values). But yes, maybe this is a bit safer. Up to you since it is probably not a bottleneck.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While the ranking should be the same for the score from which the q-value was calculated, I'm not sure that this is necessarily the case for any other score. That's why I decided to make sure the sorting is done by q-value.

[](const PeptideHit& a, const PeptideHit& b)
{
return float(a.getMetaValue("q-value")) < float(b.getMetaValue("q-value"));
});
}


const PeptideHit& top_hit = hits[0];

// skip if the top hit is a decoy hit
if (!top_hit.metaValueExists("target_decoy"))
{
throw(Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No target/decoy information found! Make sure 'PeptideIndexer' is run beforehand."));
}
if (top_hit.getMetaValue("target_decoy") == "decoy") continue;

// skip if top hit is out ouf FDR
if (scoreHigherThanFDR_(top_hit, FDR, q_value_score)) continue;

// check if top hit is found in de novo protein
if (!isNovoHit_(top_hit)) // top hit is db hit
{
++count_db;
continue;
}
// top hit is novo hit
if (hits.size() == 1)
{
++count_novo;
continue;
}

// find the second target hit, skip all decoy or novo hits inbetween
const PeptideHit* second_hit = nullptr;
String target = "target";
for (UInt i = 1; i < hits.size(); ++i)
{
// check for FDR
if (scoreHigherThanFDR_(hits[i], FDR, q_value_score)) break;

if (target.find(String(hits[i].getMetaValue("target_decoy"), 0)) == 0) // also check for "target+decoy" value
{
// check if hit is novo hit
Expand All @@ -247,7 +281,7 @@ class DatabaseSuitability :
break;
}
}
if (second_hit == nullptr)
if (second_hit == nullptr) // no second target hit with given FDR found
{
++count_novo;
continue;
Expand Down Expand Up @@ -428,6 +462,26 @@ class DatabaseSuitability :
}
return true;
}

// Checks if the q-value of a peptide hit is higher than a given FDR.
// Throws an error if no q-value is found.
bool scoreHigherThanFDR_(const PeptideHit& hit, double FDR, bool q_value_score)
{
if (q_value_score) // score type is q-value
{
if (hit.getScore() > FDR) return true;
return false;
}

if (hit.metaValueExists("q-value")) // look for q-value at metavalues
{
if (float(hit.getMetaValue("q-value")) > FDR) return true;
return false;
}

// no q-value found
throw(Exception::Precondition(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No q-value found at peptide identification nor at peptide hits. Make sure 'False Discovery Rate' is run beforehand."));
}
};


Expand Down
0