diff --git a/CHANGELOG.md b/CHANGELOG.md
index e8a279b..ad12218 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.9.0] - 2024-05-01
+
+### Added
+
+- `io`: Read and write support for writing PSMs to Apache Parquet for efficient storage of PSM lists.
+- `io.sage`: Support for Sage results in Parquet format (new `SageParquetReader`, renamed `SageReader` to `SageTSVReader`).
+
+### Changed
+
+- Upgrade Pydantic dependency to v2. The PSM `spectrum_id` field is now always coerced to a string.
+- `io.proteoscape`: Use pyarrow to iteratively read from Parquet instead of first reading an entire dataframe with Pandas.
+- `io.sage`: Update compatibility to Sage v0.14
+
## [0.8.3] - 2024-04-16
### Added
diff --git a/README.rst b/README.rst
index 3f4c2d1..fede4a6 100644
--- a/README.rst
+++ b/README.rst
@@ -94,11 +94,13 @@ Supported file formats
`MaxQuant msms.txt `_ ``msms`` ✅ ❌
`MS Amanda CSV `_ ``msamanda`` ✅ ❌
`mzIdentML `_ ``mzid`` ✅ ✅
+ `Parquet ` ``parquet`` ✅ ✅
`Peptide Record `_ ``peprec`` ✅ ✅
`pepXML `_ ``pepxml`` ✅ ❌
`Percolator tab `_ ``percolator`` ✅ ✅
Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌
- `Sage `_ ``sage`` ✅ ❌
+ `Sage Parquet `_ ``sage_parquet`` ✅ ❌
+ `Sage TSV `_ ``sage_tsv`` ✅ ❌
ProteoScape Parquet ``proteoscape`` ✅ ❌
`TSV `_ ``tsv`` ✅ ✅
`X!Tandem XML `_ ``xtandem`` ✅ ❌
diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
index 3922589..7e9087d 100644
--- a/psm_utils/__init__.py
+++ b/psm_utils/__init__.py
@@ -1,6 +1,6 @@
"""Common utilities for parsing and handling PSMs, and search engine results."""
-__version__ = "0.8.3"
+__version__ = "0.9.0"
__all__ = ["Peptidoform", "PSM", "PSMList"]
from functools import lru_cache
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index a0c2d42..45b3d7a 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -13,6 +13,7 @@
import psm_utils.io.maxquant as maxquant
import psm_utils.io.msamanda as msamanda
import psm_utils.io.mzid as mzid
+import psm_utils.io.parquet as parquet
import psm_utils.io.peptide_record as peptide_record
import psm_utils.io.pepxml as pepxml
import psm_utils.io.percolator as percolator
@@ -75,12 +76,6 @@
"extension": ".parquet",
"filename_pattern": r"^.*\.candidates\.parquet$",
},
- "tsv": {
- "reader": tsv.TSVReader,
- "writer": tsv.TSVWriter,
- "extension": ".tsv",
- "filename_pattern": r"^.*\.tsv$",
- },
"xtandem": {
"reader": xtandem.XTandemReader,
"writer": None,
@@ -93,19 +88,40 @@
"extension": ".csv",
"filename_pattern": r"^.*(?:_|\.)msamanda.csv$",
},
- "sage": {
- "reader": sage.SageReader,
+ "sage_tsv": {
+ "reader": sage.SageTSVReader,
"writer": None,
"extension": ".tsv",
"filename_pattern": r"^.*(?:_|\.).sage.tsv$",
},
+ "sage_parquet": {
+ "reader": sage.SageParquetReader,
+ "writer": None,
+ "extension": ".parquet",
+ "filename_pattern": r"^.*(?:_|\.).sage.parquet$",
+ },
"ionbot": {
"reader": ionbot.IonbotReader,
"writer": None,
"extension": "ionbot.first.csv",
"filename_pattern": r"^ionbot.first.csv$",
},
+ "parquet": { # List after proteoscape and sage to avoid extension matching conflicts
+ "reader": parquet.ParquetReader,
+ "writer": parquet.ParquetWriter,
+ "extension": ".parquet",
+ "filename_pattern": r"^.*\.parquet$",
+ },
+ "tsv": { # List after sage to avoid extension matching conflicts
+ "reader": tsv.TSVReader,
+ "writer": tsv.TSVWriter,
+ "extension": ".tsv",
+ "filename_pattern": r"^.*\.tsv$",
+ },
}
+
+FILETYPES["sage"] = FILETYPES["sage_tsv"] # Alias for backwards compatibility
+
READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]}
WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]}
@@ -124,10 +140,10 @@ def _supports_write_psm(writer: WriterBase):
with NamedTemporaryFile(delete=False) as temp_file:
temp_file.close()
Path(temp_file.name).unlink()
- example_psm = PSM(peptidoform="ACDE", spectrum_id=0)
+ example_psm = PSM(peptidoform="ACDE", spectrum_id="0")
try:
with writer(temp_file.name, example_psm=example_psm) as writer_instance:
- writer_instance.write_psm(None)
+ writer_instance.write_psm(example_psm)
except NotImplementedError:
supports_write_psm = False
except AttributeError: # `None` is not valid PSM
diff --git a/psm_utils/io/_utils.py b/psm_utils/io/_utils.py
index 01c175f..e1572c2 100644
--- a/psm_utils/io/_utils.py
+++ b/psm_utils/io/_utils.py
@@ -10,7 +10,6 @@ def set_csv_field_size_limit():
This function should be called before reading any CSV files to ensure that the field size
limit is properly set.
-
"""
max_int = sys.maxsize
diff --git a/psm_utils/io/parquet.py b/psm_utils/io/parquet.py
new file mode 100644
index 0000000..755cfcc
--- /dev/null
+++ b/psm_utils/io/parquet.py
@@ -0,0 +1,130 @@
+"""
+Reader and writer for a simple, lossless psm_utils Parquet format.
+
+Similar to the :py:mod:`psm_utils.io.tsv` module, this module provides a reader and writer
+for :py:class:`~psm_utils.psm_list.PSMList` objects in a lossless manner. However, Parquet provides
+better performance and storage efficiency compared to TSV, and is recommended for large datasets.
+
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Union
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+from pydantic import ValidationError
+
+from psm_utils.io._base_classes import ReaderBase, WriterBase
+from psm_utils.io.exceptions import PSMUtilsIOException
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+
+class ParquetReader(ReaderBase):
+ def __init__(self, path: Union[str, Path], *args, **kwargs):
+ """
+ Reader for Parquet files.
+
+ Parameters
+ ----------
+ path : Union[str, Path]
+ Path to the Parquet file.
+
+ """
+ self.path = path
+
+ def __iter__(self):
+ with pq.ParquetFile(self.path) as reader:
+ for batch in reader.iter_batches():
+ for row in batch.to_pylist():
+ # Convert map columns (rendered as lists of tuples) to dictionaries
+ row["metadata"] = dict(row["metadata"] or {})
+ row["provenance_data"] = dict(row["provenance_data"] or {})
+ row["rescoring_features"] = dict(row["rescoring_features"] or {})
+
+ # Convert to PSM object and yield
+ try:
+ yield PSM(**row)
+ except ValidationError as e:
+ raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}")
+
+
+class ParquetWriter(WriterBase):
+ def __init__(self, path: Union[str, Path], chunk_size: int = 1e6, *args, **kwargs):
+ """
+ Writer for Parquet files.
+
+ Parameters
+ ----------
+ path : Union[str, Path]
+ Path to the Parquet file.
+ chunk_size : int
+ Number of PSMs to write in a single batch. Default is 1e6.
+
+ """
+ self.path = path
+ self.chunk_size = chunk_size
+
+ self._writer = None
+ self._psm_cache = []
+
+ def __enter__(self):
+ self._writer = pq.ParquetWriter(self.path, schema=SCHEMA)
+ return self
+
+ def __exit__(self, *args, **kwargs):
+ self._flush()
+ self._writer.close()
+
+ def write_psm(self, psm: PSM):
+ """Write a single PSM to the Parquet file."""
+ self._psm_cache.append(self._psm_to_entry(psm))
+ if len(self._psm_cache) > self.chunk_size:
+ self._flush()
+
+ def write_file(self, psm_list: PSMList):
+ """Write a list of PSMs to the Parquet file."""
+ with self:
+ for psm in psm_list:
+ self.write_psm(psm)
+
+ @staticmethod
+ def _psm_to_entry(psm: PSM) -> dict:
+ """Convert a PSM object to a dictionary suitable for writing to Parquet."""
+ psm_dict = dict(psm)
+ psm_dict["peptidoform"] = str(psm.peptidoform)
+ return psm_dict
+
+ def _flush(self):
+ """Write the cached PSMs to the Parquet file."""
+ if not self._psm_cache:
+ return
+ table = pa.Table.from_pylist(self._psm_cache, schema=SCHEMA)
+ self._writer.write_table(table)
+ self._psm_cache = []
+
+
+SCHEMA = pa.schema(
+ [
+ ("peptidoform", pa.string()),
+ ("spectrum_id", pa.string()),
+ ("run", pa.string()),
+ ("collection", pa.string()),
+ ("spectrum", pa.string()),
+ ("is_decoy", pa.bool_()),
+ ("score", pa.float32()),
+ ("qvalue", pa.float32()),
+ ("pep", pa.float32()),
+ ("precursor_mz", pa.float32()),
+ ("retention_time", pa.float32()),
+ ("ion_mobility", pa.float32()),
+ ("protein_list", pa.list_(pa.string())),
+ ("rank", pa.int32()),
+ ("source", pa.string()),
+ ("provenance_data", pa.map_(pa.string(), pa.string())),
+ ("metadata", pa.map_(pa.string(), pa.string())),
+ ("rescoring_features", pa.map_(pa.string(), pa.float32())),
+ ]
+)
diff --git a/psm_utils/io/proteoscape.py b/psm_utils/io/proteoscape.py
index cc0e834..ddc4386 100644
--- a/psm_utils/io/proteoscape.py
+++ b/psm_utils/io/proteoscape.py
@@ -4,13 +4,15 @@
import re
from pathlib import Path
from typing import Union
-from collections import namedtuple
import numpy as np
import pandas as pd
+import pyarrow.parquet as pq
-from psm_utils import PSM
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.peptidoform import format_number_as_string
logger = logging.getLogger(__name__)
@@ -36,31 +38,31 @@ def __init__(
Path to MSF file.
"""
- if isinstance(filename, pd.DataFrame):
- self.data = filename
- else:
- super().__init__(filename, *args, **kwargs)
- self.data = pd.read_parquet(self.filename)
-
- self._Row = namedtuple("Row", self.data.columns)
+ self.filename = filename
def __len__(self):
"""Return number of PSMs in file."""
- return len(self.data)
+ return pq.read_metadata(self.filename).num_rows
def __iter__(self):
"""Iterate over file and return PSMs one-by-one."""
- for entry in self.data.itertuples():
- yield _parse_entry(entry)
-
- def __getitem__(self, index):
- """Return PSM at index."""
- return _parse_entry(self._Row(*self.data.iloc[index]))
+ with pq.ParquetFile(self.filename) as reader:
+ for batch in reader.iter_batches():
+ for row in batch.to_pylist():
+ try:
+ yield _parse_entry(row)
+ except Exception as e:
+ raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}") from e
@classmethod
- def from_dataframe(cls, dataframe: pd.DataFrame, *args, **kwargs):
- """Create a ProteoScapeReader from a DataFrame."""
- return cls(dataframe, *args, **kwargs)
+ def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList:
+ """Create a PSMList from a ProteoScape Pandas DataFrame."""
+ return PSMList(
+ psm_list=[
+ cls._get_peptide_spectrum_match(cls(""), entry)
+ for entry in dataframe.to_dict(orient="records")
+ ]
+ )
def _parse_peptidoform(
@@ -81,40 +83,43 @@ def _parse_peptidoform(
return f"{n_term}{''.join(peptidoform)}{c_term}/{precursor_charge}"
-def _parse_entry(entry) -> PSM:
+def _parse_entry(entry: dict) -> PSM:
"""Parse a single entry from ProteoScape Parquet file to PSM object."""
return PSM(
peptidoform=_parse_peptidoform(
- entry.stripped_peptide, entry.ptms, entry.ptm_locations, entry.precursor_charge
+ entry["stripped_peptide"],
+ entry["ptms"],
+ entry["ptm_locations"],
+ entry["precursor_charge"],
),
- spectrum_id=entry.ms2_id,
- run=getattr(entry, "run", None),
- is_decoy=all(DECOY_PATTERN.match(p) for p in entry.locus_name),
- score=entry.x_corr_score,
- precursor_mz=entry.precursor_mz,
- retention_time=entry.rt,
- ion_mobility=entry.ook0,
- protein_list=list(entry.locus_name),
- rank=entry.rank,
+ spectrum_id=entry["ms2_id"],
+ run=entry.get("run", None),
+ is_decoy=all(DECOY_PATTERN.match(p) for p in entry["locus_name"]),
+ score=entry["x_corr_score"],
+ precursor_mz=entry["precursor_mz"],
+ retention_time=entry["rt"],
+ ion_mobility=entry["ook0"],
+ protein_list=list(entry["locus_name"]),
+ rank=entry["rank"],
source="ProteoScape",
provenance_data={
- "candidate_id": str(entry.candidate_id),
- "ms2_id": str(entry.ms2_id),
- "parent_id": str(entry.parent_id),
+ "candidate_id": str(entry["candidate_id"]),
+ "ms2_id": str(entry["ms2_id"]),
+ "parent_id": str(entry["parent_id"]),
},
metadata={
- "leading_aa": str(entry.leading_aa),
- "trailing_aa": str(entry.trailing_aa),
- "corrected_ook0": str(entry.corrected_ook0),
+ "leading_aa": str(entry["leading_aa"]),
+ "trailing_aa": str(entry["trailing_aa"]),
+ "corrected_ook0": str(entry["corrected_ook0"]),
},
rescoring_features={
- "tims_score": float(entry.tims_score),
- "x_corr_score": float(entry.x_corr_score),
- "delta_cn_score": float(entry.delta_cn_score),
- "ppm_error": float(entry.ppm_error),
- "number_matched_ions": float(entry.number_matched_ions),
- "number_expected_ions": float(entry.number_expected_ions),
- "ion_proportion": float(entry.ion_proportion),
- "spectrum_total_ion_intensity": float(entry.spectrum_total_ion_intensity),
+ "tims_score": float(entry["tims_score"]),
+ "x_corr_score": float(entry["x_corr_score"]),
+ "delta_cn_score": float(entry["delta_cn_score"]),
+ "ppm_error": float(entry["ppm_error"]),
+ "number_matched_ions": float(entry["number_matched_ions"]),
+ "number_expected_ions": float(entry["number_expected_ions"]),
+ "ion_proportion": float(entry["ion_proportion"]),
+ "spectrum_total_ion_intensity": float(entry["spectrum_total_ion_intensity"]),
},
)
diff --git a/psm_utils/io/sage.py b/psm_utils/io/sage.py
index e0cc0d1..4cc23e3 100644
--- a/psm_utils/io/sage.py
+++ b/psm_utils/io/sage.py
@@ -2,26 +2,29 @@
Reader for PSM files from the Sage search engine.
Reads the ``results.sage.tsv`` file as defined on the
-`Sage documentation page `_.
+`Sage documentation page `_.
"""
from __future__ import annotations
import csv
+from abc import ABC, abstractmethod
from pathlib import Path
from typing import Iterable, Optional
+import pyarrow.parquet as pq
from pyteomics import mass
from psm_utils.io._base_classes import ReaderBase
-from psm_utils.psm import PSM
from psm_utils.io._utils import set_csv_field_size_limit
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
set_csv_field_size_limit()
-class SageReader(ReaderBase):
+class SageReaderBase(ReaderBase, ABC):
def __init__(
self, filename, score_column: str = "sage_discriminant_score", *args, **kwargs
) -> None:
@@ -41,42 +44,15 @@ def __init__(
self.filename = filename
self.score_column = score_column
+ @abstractmethod
def __iter__(self) -> Iterable[PSM]:
"""Iterate over file and return PSMs one-by-one."""
- with open(self.filename) as open_file:
- reader = csv.DictReader(open_file, delimiter="\t")
- for row in reader:
- psm = self._get_peptide_spectrum_match(row)
- yield psm
+ raise NotImplementedError("Use `SageTSVReader` or `SageParquetReader` instead.")
def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
"""Parse a single PSM from a sage PSM file."""
rescoring_features = {}
- for ft in [
- "expmass",
- "calcmass",
- "delta_mass",
- "peptide_len",
- "missed_cleavages",
- "isotope_error",
- "precursor_ppm",
- "fragment_ppm",
- "hyperscore",
- "delta_next",
- "delta_best",
- "delta_rt_model",
- "aligned_rt",
- "predicted_rt",
- "matched_peaks",
- "longest_b",
- "longest_y",
- "longest_y_pct",
- "matched_intensity_pct",
- "scored_candidates",
- "poisson",
- "ms1_intensity",
- "ms2_intensity",
- ]:
+ for ft in RESCORING_FEATURES:
try:
rescoring_features[ft] = psm_dict[ft]
except KeyError:
@@ -89,9 +65,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
),
spectrum_id=psm_dict["scannr"],
run=Path(psm_dict["filename"]).stem,
- is_decoy=(
- True if psm_dict["label"] == "-1" else False if psm_dict["label"] == "1" else None
- ),
+ is_decoy=psm_dict["is_decoy"],
qvalue=psm_dict["spectrum_q"],
score=float(psm_dict[self.score_column]),
precursor_mz=self._parse_precursor_mz(psm_dict["expmass"], psm_dict["charge"]),
@@ -118,3 +92,64 @@ def _parse_precursor_mz(expmass: str, charge: Optional[str]) -> Optional[float]:
return (expmass + (mass.nist_mass["H"][1][0] * charge)) / charge
else:
return None
+
+ @classmethod
+ def from_dataframe(cls, dataframe) -> PSMList:
+ """Create a PSMList from a Sage Pandas DataFrame."""
+ return PSMList(
+ psm_list=[
+ cls._get_peptide_spectrum_match(cls(""), entry)
+ for entry in dataframe.to_dict(orient="records")
+ ]
+ )
+
+
+class SageTSVReader(SageReaderBase):
+ def __iter__(self) -> Iterable[PSM]:
+ """Iterate over file and return PSMs one-by-one."""
+ with open(self.filename, "r") as open_file:
+ reader = csv.DictReader(open_file, delimiter="\t")
+ for row in reader:
+ row["is_decoy"] = (
+ True if row["label"] == "-1" else False if row["label"] == "1" else None
+ )
+
+ yield self._get_peptide_spectrum_match(row)
+
+SageReader = SageTSVReader # Alias for backwards compatibility
+
+
+class SageParquetReader(SageReaderBase):
+ def __iter__(self) -> Iterable[PSM]:
+ """Iterate over file and return PSMs one-by-one."""
+ with pq.ParquetFile(self.filename) as pq_file:
+ for batch in pq_file.iter_batches():
+ for row in batch.to_pylist():
+ yield self._get_peptide_spectrum_match(row)
+
+
+RESCORING_FEATURES = [
+ "expmass",
+ "calcmass",
+ "delta_mass",
+ "peptide_len",
+ "missed_cleavages",
+ "isotope_error",
+ "precursor_ppm",
+ "fragment_ppm",
+ "hyperscore",
+ "delta_next",
+ "delta_best",
+ "delta_rt_model",
+ "aligned_rt",
+ "predicted_rt",
+ "matched_peaks",
+ "longest_b",
+ "longest_y",
+ "longest_y_pct",
+ "matched_intensity_pct",
+ "scored_candidates",
+ "poisson",
+ # "ms1_intensity", # Removed in Sage v0.14
+ "ms2_intensity",
+]
diff --git a/psm_utils/psm.py b/psm_utils/psm.py
index 6053937..0ed97d9 100644
--- a/psm_utils/psm.py
+++ b/psm_utils/psm.py
@@ -2,7 +2,7 @@
from typing import Any, Dict, List, Optional, Union
-from pydantic import BaseModel
+from pydantic import ConfigDict, BaseModel
from psm_utils.peptidoform import Peptidoform
@@ -11,7 +11,7 @@ class PSM(BaseModel):
"""Data class representing a peptide-spectrum match (PSM)."""
peptidoform: Union[Peptidoform, str]
- spectrum_id: Union[int, str]
+ spectrum_id: Union[str]
run: Optional[str] = None
collection: Optional[str] = None
spectrum: Optional[Any] = None
@@ -28,9 +28,7 @@ class PSM(BaseModel):
provenance_data: Optional[Dict[str, str]] = dict()
metadata: Optional[Dict[str, str]] = dict()
rescoring_features: Optional[Dict[str, float]] = dict()
-
- class Config:
- arbitrary_types_allowed = True # Allows non-pydantic class Peptidoform
+ model_config = ConfigDict(arbitrary_types_allowed=True, coerce_numbers_to_str=True)
def __init__(self, **data):
"""
diff --git a/pyproject.toml b/pyproject.toml
index b6923bc..a2f3629 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,29 +20,30 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.7"
dependencies = [
- "pyteomics >= 4, <4.7",
- "pyopenms",
+ "click",
"lxml",
- "psims",
- "pandas",
"numpy",
- "click",
+ "pandas",
+ "psims",
+ "pyarrow",
+ "pydantic >= 2",
+ "pyopenms",
+ "pyteomics >= 4, <4.7",
"rich",
- "pydantic",
"sqlalchemy",
]
[project.optional-dependencies]
dev = ["ruff", "isort>5", "pytest", "pytest-cov"]
docs = [
- "sphinx",
"numpydoc>=1,<2",
"recommonmark",
- "sphinx-mdinclude",
- "toml",
"semver>=2",
"sphinx_rtd_theme",
"sphinx-autobuild",
+ "sphinx-mdinclude",
+ "sphinx",
+ "toml",
]
online = ["streamlit", "plotly"]
@@ -62,7 +63,6 @@ build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
include = ["psm_utils*"]
-
[tool.setuptools.dynamic]
version = { attr = "psm_utils.__version__" }
diff --git a/tests/test_data/results.sage.parquet b/tests/test_data/results.sage.parquet
new file mode 100644
index 0000000..8060fe6
Binary files /dev/null and b/tests/test_data/results.sage.parquet differ
diff --git a/tests/test_data/results.sage.tsv b/tests/test_data/results.sage.tsv
index 9d6ec16..6543ec3 100644
--- a/tests/test_data/results.sage.tsv
+++ b/tests/test_data/results.sage.tsv
@@ -1,2 +1,2 @@
-peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity
-LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 71.78844460255384 71.78844460255384 0.0 108.2854 0.0 0.0 0.0 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0
+psm_id peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages semi_enzymatic isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model ion_mobility predicted_mobility delta_mobility matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms2_intensity
+1 LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0 0.0 0.8239083 0.503857 72.26591573806016 72.26591573806016 0.0 108.2854 0.993444 0.0 0.993444 0.0 0.0 0.0 22 9 12 0.6315789 64.770966 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 72609170.0
diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index 075b12f..7bf1eac 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -3,9 +3,9 @@
import hashlib
from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
-from psm_utils.io.sage import SageReader
-from psm_utils.psm import PSM
+from psm_utils.io.sage import SageTSVReader
from psm_utils.peptidoform import Peptidoform
+from psm_utils.psm import PSM
class TestIdXMLReader:
@@ -103,8 +103,8 @@ def test_write_file_with_pyopenms_objects(self):
assert sha == expected_sha
def test_write_file_without_pyopenms_objects(self):
- expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b"
- reader = SageReader("./tests/test_data/results.sage.tsv")
+ expected_sha = "148889926276fbe391e23ed7952c3a8410fc67ffb099bbf1a72df75f8d727ccd"
+ reader = SageTSVReader("./tests/test_data/results.sage.tsv")
psm_list = reader.read_file()
writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")
writer.write_file(psm_list)
diff --git a/tests/test_io/test_parquet.py b/tests/test_io/test_parquet.py
new file mode 100644
index 0000000..20bb0b0
--- /dev/null
+++ b/tests/test_io/test_parquet.py
@@ -0,0 +1,71 @@
+"""Tests for psm_utils.io.tsv."""
+
+import hashlib
+import os
+
+from psm_utils.io.parquet import ParquetReader, ParquetWriter
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+test_cases = [
+ {"peptidoform": "ACDE", "spectrum_id": "1"},
+ {
+ "peptidoform": "ACDE",
+ "spectrum_id": "2",
+ "run": None,
+ "collection": None,
+ "ion_mobility": None,
+ "is_decoy": None,
+ "pep": None,
+ "precursor_mz": None,
+ "protein_list": None,
+ "qvalue": None,
+ "rank": None,
+ "retention_time": None,
+ "score": None,
+ "source": None,
+ "spectrum": None,
+ "provenance_data": {"source": "test"},
+ "metadata": {},
+ "rescoring_features": {"feature": 2.0},
+ },
+]
+
+
+def compute_checksum(filename):
+ hash_func = hashlib.sha256()
+ with open(filename, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_func.update(chunk)
+ return hash_func.hexdigest()
+
+
+class TestParquetWriter:
+ expected_checksum = "cf3f2e9f073be58612ce81f240da9f4109e1c76eea25f1b7881e09c0a8fdee16"
+
+ def test_write_psm(self):
+ with ParquetWriter("test.pq") as writer:
+ for test_case in test_cases:
+ writer.write_psm(PSM(**test_case))
+ actual_checksum = compute_checksum("test.pq")
+ assert actual_checksum == self.expected_checksum, "Checksums do not match"
+ os.remove("test.pq")
+
+ def test_write_file(self):
+ with ParquetWriter("test.pq") as writer:
+ writer.write_file(PSMList(psm_list=[PSM(**t) for t in test_cases]))
+ actual_checksum = compute_checksum("test.pq")
+ assert actual_checksum == self.expected_checksum, "Checksums do not match"
+ # os.remove("test.pq")
+
+
+class TestParquetReader:
+ def test_iter(self):
+ # Write test cases to file
+ ParquetWriter("test.pq").write_file(PSMList(psm_list=[PSM(**t) for t in test_cases]))
+
+ # Read test cases from file
+ for i, psm in enumerate(ParquetReader("test.pq")):
+ assert psm == PSM(**test_cases[i])
+
+ os.remove("test.pq")
diff --git a/tests/test_io/test_sage.py b/tests/test_io/test_sage.py
index 10d2bcc..60d87ba 100644
--- a/tests/test_io/test_sage.py
+++ b/tests/test_io/test_sage.py
@@ -1,6 +1,8 @@
"""Tests for psm_utils.io.sage."""
-from psm_utils.io.sage import SageReader
+import pytest
+
+from psm_utils.io.sage import SageParquetReader, SageTSVReader
from psm_utils.psm import PSM
test_psm = PSM(
@@ -27,29 +29,52 @@
"missed_cleavages": 0.0,
"isotope_error": 0.0,
"precursor_ppm": 0.8239083,
- "fragment_ppm": 0.5347518,
- "hyperscore": 71.78844460255384,
- "delta_next": 71.78844460255384,
+ "fragment_ppm": 0.503857,
+ "hyperscore": 72.26591573806016,
+ "delta_next": 72.26591573806016,
"delta_best": 0.0,
- "delta_rt_model": 0.0,
- "aligned_rt": 0.0,
+ "delta_rt_model": 0.993444,
+ "aligned_rt": 0.993444,
"predicted_rt": 0.0,
"matched_peaks": 22.0,
"longest_b": 9.0,
"longest_y": 12.0,
"longest_y_pct": 0.6315789,
- "matched_intensity_pct": 50.785,
+ "matched_intensity_pct": 64.770966,
"scored_candidates": 1.0,
"poisson": -1.9562811911083433,
- "ms1_intensity": 306146180.0,
- "ms2_intensity": 56930696.0,
+ "ms2_intensity": 72609170.0,
},
)
-class TestSageReader:
+class TestSageTSVReader:
def test_iter(self):
- with SageReader("./tests/test_data/results.sage.tsv") as reader:
+ with SageTSVReader("./tests/test_data/results.sage.tsv") as reader:
for psm in reader:
psm.provenance_data = {}
assert psm == test_psm
+
+
+class TestSageParquetReader:
+ def test_iter(self):
+ with SageParquetReader("./tests/test_data/results.sage.parquet") as reader:
+ # Parquet results in float precision differences, so pytest.approx is used, which does
+ # not support objects with nested dicts.
+ for psm in reader:
+ psm_dict = dict(psm)
+ test_psm_dict = dict(test_psm)
+
+ # Nested dicts
+ assert psm_dict.pop("rescoring_features", {}) == pytest.approx(
+ test_psm_dict.pop("rescoring_features", {})
+ )
+ assert psm_dict.pop("metadata", {}) == test_psm_dict.pop("metadata", {})
+ psm_dict.pop("provenance_data", {})
+
+ # Remaining keys
+ for k, v in psm_dict.items():
+ if isinstance(v, float):
+ assert v == pytest.approx(test_psm_dict[k])
+ else:
+ assert v == test_psm_dict[k]
diff --git a/tests/test_psm_list.py b/tests/test_psm_list.py
index 21fa709..cff86aa 100644
--- a/tests/test_psm_list.py
+++ b/tests/test_psm_list.py
@@ -26,30 +26,30 @@ def test___get_item__(self):
psm_list = PSMList(psm_list=sample_psm_list)
# Single index
- assert psm_list[0] == PSM(peptidoform="ACDK", spectrum_id=1, score=140.2)
+ assert psm_list[0] == PSM(peptidoform="ACDK", spectrum_id="1", score=140.2)
# Slice
assert psm_list[0:2] == PSMList(
psm_list=[
- PSM(peptidoform="ACDK", spectrum_id=1, score=140.2),
- PSM(peptidoform="CDEFR", spectrum_id=2, score=132.9),
+ PSM(peptidoform="ACDK", spectrum_id="1", score=140.2),
+ PSM(peptidoform="CDEFR", spectrum_id="2", score=132.9),
]
)
# PSM property as array
- np.testing.assert_equal(psm_list["spectrum_id"], np.array([1, 2, 3]))
+ np.testing.assert_equal(psm_list["spectrum_id"], np.array(["1", "2", "3"]))
# Multiple PSM properties as 2D array
np.testing.assert_equal(
psm_list[["spectrum_id", "score"]],
- np.array([[1, 140.2], [2, 132.9], [3, 55.7]]),
+ np.array([["1", 140.2], ["2", 132.9], ["3", 55.7]]),
)
# Index by multiple indices
psm_list[0, 2] == PSMList(
psm_list=[
- PSM(peptidoform="ACDK", spectrum_id=1, score=140.2),
- PSM(peptidoform="DEM[Oxidation]K", spectrum_id=3, score=55.7),
+ PSM(peptidoform="ACDK", spectrum_id="1", score=140.2),
+ PSM(peptidoform="DEM[Oxidation]K", spectrum_id="3", score=55.7),
]
)