OpenMS · jpfeuffer · Jul 16, 2020 · Jul 15, 2020 · Jul 16, 2020 · Jul 16, 2020
diff --git a/src/tests/topp/CMakeLists.txt b/src/tests/topp/CMakeLists.txt
@@ -218,15 +218,15 @@ set_tests_properties("TOPP_MapNormalizer_1_out1" PROPERTIES DEPENDS "TOPP_MapNor
 #------------------------------------------------------------------------------
 # DatabaseSuitability tests
 # test default
-add_test("TOPP_DatabaseSuitability_1" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -out DatabaseSuitability_1.tmp)
+add_test("TOPP_DatabaseSuitability_1" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -FDR 0.8 -out DatabaseSuitability_1.tmp)
 add_test("TOPP_DatabaseSuitability_1_out" ${DIFF} -in1 DatabaseSuitability_1.tmp -in2 ${DATA_DIR_TOPP}/DatabaseSuitability_out_1.tsv )
 set_tests_properties("TOPP_DatabaseSuitability_1_out" PROPERTIES DEPENDS "TOPP_DatabaseSuitability_1")
 # test with custom novor_fract
-add_test("TOPP_DatabaseSuitability_2" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -novor_fract 0.9 -out DatabaseSuitability_2.tmp)
+add_test("TOPP_DatabaseSuitability_2" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_s
8000
pec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -FDR 1 -novor_fract 0.9 -out DatabaseSuitability_2.tmp)
 add_test("TOPP_DatabaseSuitability_2_out" ${DIFF} -whitelist ${INDEX_WHITELIST} -in1 DatabaseSuitability_2.tmp -in2 ${DATA_DIR_TOPP}/DatabaseSuitability_out_2.tsv )
 set_tests_properties("TOPP_DatabaseSuitability_2_out" PROPERTIES DEPENDS "TOPP_DatabaseSuitability_2")
 # test without re-ranking
-add_test("TOPP_DatabaseSuitability_3" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -force_no_re_rank -out DatabaseSuitability_3.tmp)
+add_test("TOPP_DatabaseSuitability_3" ${TOPP_BIN_PATH}/DatabaseSuitability -test -in_id ${DATA_DIR_TOPP}/DatabaseSuitability_in_id.idXML -in_spec ${DATA_DIR_TOPP}/DatabaseSuitability_in_spec.mzML -in_novo ${DATA_DIR_TOPP}/DatabaseSuitability_in_novo.idXML -FDR 0.9 -force_no_re_rank -out DatabaseSuitability_3.tmp)
 add_test("TOPP_DatabaseSuitability_3_out" ${DIFF} -whitelist ${INDEX_WHITELIST} -in1 DatabaseSuitability_3.tmp -in2 ${DATA_DIR_TOPP}/DatabaseSuitability_out_3.tsv )
 set_tests_properties("TOPP_DatabaseSuitability_3_out" PROPERTIES DEPENDS "TOPP_DatabaseSuitability_3")
 

diff --git a/src/tests/topp/DatabaseSuitability_out_1.tsv b/src/tests/topp/DatabaseSuitability_out_1.tsv
@@ -1,7 +1,7 @@
 key	value
-#top_db_hits	438
-#top_novo_hits	1
-db_suitability	0.997722095671982
+#top_db_hits	158
+#top_novo_hits	2
+db_suitability	0.9875
 #total_novo_seqs	1118
 #unique_novo_seqs	1111
 #ms2_spectra	1120

diff --git a/src/tests/topp/DatabaseSuitability_out_3.tsv b/src/tests/topp/DatabaseSuitability_out_3.tsv
@@ -1,7 +1,7 @@
 key	value
-#top_db_hits	436
+#top_db_hits	297
 #top_novo_hits	3
-db_suitability	0.993166287015945
+db_suitability	0.99
 #total_novo_seqs	1118
 #unique_novo_seqs	1111
 #ms2_spectra	1120

diff --git a/src/topp/DatabaseSuitability.cpp b/src/topp/DatabaseSuitability.cpp
@@ -94,7 +94,8 @@ To generate the de novo "database":
       - @ref TOPP_IDFilter can filter out unwanted ones.
       - @ref TOPP_IDFileConverter generates the de novo fasta file.
 
-For re-ranking all cases where a peptide hit only found in the de novo "database" scores above a peptide hit found in the actual database are checked. In all these cases the cross-correlation scores of those peptide hits are compared. If they are similar enough, the database hit will be re-ranked to be on top of the de novo hit. You can control how much of cases with similar scores will be re-ranked by using the @p novor_fract option.
+For re-ranking all cases where a peptide hit only found in the de novo "database" scores above a peptide hit found in the actual database are checked. In all these cases the cross-correlation scores of those peptide hits are compared. If they are similar enough, the database hit will be re-ranked to be on top of the de novo hit. You can control how much of cases with similar scores
8000
 will be re-ranked by using the @p novor_fract option.@n
+For this to work it is important that FDR filtering is done in this tool and not beforehand by @ref TOPP_FalseDiscoveryRate. You can provide the wanted FDR using the corresponding flag.
 
 @note For identification search the only supported search engine for the time being is Comet because the Comet cross-correlation score is needed for re-ranking.@n
 You can still uses other search engines and disable the re-ranking via the @p force_no_re_rank flag in this tool. This will probably result in an underestimated suitability though.@n
@@ -145,6 +146,9 @@ class DatabaseSuitability :
     registerDoubleOption_("novor_fract", "<double>", 1, "Set the fraction of how many cases, where a de novo peptide scores just higher than the database peptide, you wish to re-rank.", false, true);
     setMinFloat_("novor_fract", 0);
     setMaxFloat_("novor_fract", 1);
+    registerDoubleOption_("FDR", "<double>", 0.01, "Filter peptide hits based on this q-value. (e.g., 0.05 = 5 % FDR)", false, true);
+    setMinFloat_("FDR", 0);
+    setMaxFloat_("FDR", 1);
     registerFlag_("force_no_re_rank", "Use this flag if you want to disable re-ranking. Cases, where a de novo peptide scores just higher than the database peptide, are overlooked and counted as a de novo hit. This might underestimate the database quality.", true);
   }
 
@@ -159,33 +163,34 @@ class DatabaseSuitability :
     String in_novo = getStringOption_("in_novo");
     String out = getStringOption_("out");
     double novo_fract = getDoubleOption_("novor_fract");
+    double FDR = getDoubleOption_("FDR");
     bool no_re_rank = getFlag_("force_no_re_rank");
 
     //-------------------------------------------------------------
     // reading input
     //-------------------------------------------------------------
-    IdXMLFile x;
-    vector<ProteinIdentification> prot_ids;
-    vector<PeptideIdentification> pep_ids;
-    x.load(in_id, prot_ids, pep_ids);
-
-    vector<ProteinIdentification> novo_prots;
-    vector<PeptideIdentification> novo_peps;
-    x.load(in_novo, novo_prots, novo_peps);
-
     // load mzML file in scope because we only need the number of ms2 spectra and no data
     // this saves some memory
     Size count_ms2_lvl;
     {
       MzMLFile m;
       PeakFileOptions op;
-      op.setMSLevels({2}); // only ms2
+      op.setMSLevels({ 2 }); // only ms2
       op.setFillData(false); // no data
       m.setOptions(op);
       PeakMap exp;
       m.load(in_spec, exp);
       count_ms2_lvl = exp.size();
     }
+
+    IdXMLFile x;
+    vector<ProteinIdentification> prot_ids;
+    vector<PeptideIdentification> pep_ids;
+    x.load(in_id, prot_ids, pep_ids);
+
+    vector<ProteinIdentification> novo_prots;
+    vector<PeptideIdentification> novo_peps;
+    x.load(in_novo, novo_prots, novo_peps);
 
     //-------------------------------------------------------------
     // calculations
@@ -209,35 +214,64 @@ class DatabaseSuitability :
     Size count_re_ranked = 0;
     Size count_interest = 0;
 
-    for (const auto& pep_id : pep_ids)
+    for (auto& pep_id : pep_ids)
     {
-      const vector<PeptideHit>& hits = pep_id.getHits();
+      vector<PeptideHit>& hits = pep_id.getHits();
+      bool q_value_score = (pep_id.getScoreType() == "q-value");
 
       if (hits.empty()) continue;
 
+      // sort hits by q-value
+      if (q_value_score)
+      {
+        sort(hits.begin(), hits.end(),
+          [](const PeptideHit& a, const PeptideHit& b)
+          {
+            return a.getScore() < b.getScore();
+          });
+      }
+      else
+      {
+        if (!hits[0].metaValueExists("q-value"))
+        {
+          throw(Exception::Precondition(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No q-value found at peptide identification nor at peptide hits. Make sure 'False Discovery Rate' is run beforehand."));
+        }
+
+        sort(hits.begin(), hits.end(),
+          [](const PeptideHit& a, const PeptideHit& b)
+          {
+            return float(a.getMetaValue("q-value")) < float(b.getMetaValue("q-value"));
+          });
+      }
+
+
       const PeptideHit& top_hit = hits[0];
 
       // skip if the top hit is a decoy hit
+      if (!top_hit.metaValueExists("target_decoy"))
+      {
+        throw(Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No target/decoy information found! Make sure 'PeptideIndexer' is run beforehand."));
+      }
       if (top_hit.getMetaValue("target_decoy") == "decoy") continue;
 
+      // skip if top hit is out ouf FDR
+      if (scoreHigherThanFDR_(top_hit, FDR, q_value_score)) continue;
+
       // check if top hit is found in de novo protein
       if (!isNovoHit_(top_hit)) // top hit is db hit
       {
         ++count_db;
         continue;
       }
-      // top hit is novo hit
-      if (hits.size() == 1)
-      {
-        ++count_novo;
-        continue;
-      }
 
       // find the second target hit, skip all decoy or novo hits inbetween
       const PeptideHit* second_hit = nullptr;
       String target = "target";
       for (UInt i = 1; i < hits.size(); ++i)
         {
+        // check for FDR
+        if (scoreHigherThanFDR_(hits[i], FDR, q_value_score)) break;
+
         if (target.find(String(hits[i].getMetaValue("target_decoy"), 0)) == 0) // also check for "target+decoy" value
         {
           // check if hit is novo hit
@@ -247,7 +281,7 @@ class DatabaseSuitability :
           break;
         }
       }
-      if (second_hit == nullptr)
+      if (second_hit == nullptr) // no second target hit with given FDR found
       {
         ++count_novo;
         continue;
@@ -428,6 +462,26 @@ class DatabaseSuitability :
     }
     return true;
   }
+
+  // Checks if the q-value of a peptide hit is higher than a given FDR.
+  // Throws an error if no q-value is found.
+  bool scoreHigherThanFDR_(const PeptideHit& hit, double FDR, bool q_value_score)
+  {
+    if (q_value_score) // score type is q-value
+    {
+      if (hit.getScore() > FDR) return true;
+      return false;
+    }
+
+    if (hit.metaValueExists("q-value")) // look for q-value at metavalues
+    {
+      if (float(hit.getMetaValue("q-value")) > FDR) return true;
+      return false;
+    }
+
+    // no q-value found
+    throw(Exception::Precondition(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No q-value found at peptide identification nor at peptide hits. Make sure 'False Discovery Rate' is run beforehand."));
+  }
 };