From 41cb3fe30986a41ccd89c8433c7459721c79d077 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Tue, 16 Apr 2024 22:55:47 +0200
Subject: [PATCH 01/10] Update publish workflow to use build and setuptools
 instead of flit.

---
 .github/workflows/publish.yml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 6f0ae07..d02c1d5 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -17,21 +17,22 @@ jobs:
       - uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.8"

       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip flit pytest
+          python -m pip install --upgrade pip
+          pip install build

-      - name: Build
-        run: flit build
+      - name: Build
+        run: python -m build --sdist --wheel .

-      - name: Install
+      - name: Install wheel
         run: pip install dist/psm_utils-*.whl

-      - name: Test package
+      - name: Test wheel
         run: |
           pytest

From bfa12248bed7f7017ee6363e082eb5c71713bf61 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Sun, 28 Apr 2024 11:36:56 +0200
Subject: [PATCH 02/10] Add generic Parquet reading and writing.

---
 psm_utils/io/__init__.py      |  23 ++++---
 psm_utils/io/_utils.py        |   1 -
 psm_utils/io/parquet.py       | 124 ++++++++++++++++++++++++++++++++++
 tests/test_io/test_parquet.py |  69 +++++++++++++++++++
 4 files changed, 208 insertions(+), 9 deletions(-)
 create mode 100644 psm_utils/io/parquet.py
 create mode 100644 tests/test_io/test_parquet.py

diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index a0c2d42..0df6529 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -13,6 +13,7 @@
 import psm_utils.io.maxquant as maxquant
 import psm_utils.io.msamanda as msamanda
 import psm_utils.io.mzid as mzid
+import psm_utils.io.parquet as parquet
 import psm_utils.io.peptide_record as peptide_record
 import psm_utils.io.pepxml as pepxml
 import psm_utils.io.percolator as percolator
@@ -75,12 +76,6 @@
         "extension": ".parquet",
         "filename_pattern": r"^.*\.candidates\.parquet$",
     },
-    "tsv": {
-        "reader": tsv.TSVReader,
-        "writer": tsv.TSVWriter,
-        "extension": ".tsv",
-        "filename_pattern": r"^.*\.tsv$",
-    },
     "xtandem": {
         "reader": xtandem.XTandemReader,
         "writer": None,
@@ -105,6 +100,18 @@
         "extension": "ionbot.first.csv",
         "filename_pattern": r"^ionbot.first.csv$",
     },
+    "parquet": {  # List after proteoscape to avoid extension matching conflicts
+        "reader": parquet.ParquetReader,
+        "writer": parquet.ParquetWriter,
+        "extension": ".parquet",
+        "filename_pattern": r"^.*\.parquet$",
+    },
+    "tsv": {  # List after sage to avoid extension matching conflicts
+        "reader": tsv.TSVReader,
+        "writer": tsv.TSVWriter,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*\.tsv$",
+    },
 }
 READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]}
 WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]}
@@ -124,10 +131,10 @@ def _supports_write_psm(writer: WriterBase):
     with NamedTemporaryFile(delete=False) as temp_file:
         temp_file.close()
         Path(temp_file.name).unlink()
-    example_psm = PSM(peptidoform="ACDE", spectrum_id=0)
+    example_psm = PSM(peptidoform="ACDE", spectrum_id="0")
     try:
         with writer(temp_file.name, example_psm=example_psm) as writer_instance:
-            writer_instance.write_psm(None)
+            writer_instance.write_psm(example_psm)
     except NotImplementedError:
         supports_write_psm = False
     except AttributeError:  # `None` is not valid PSM
diff --git a/psm_utils/io/_utils.py b/psm_utils/io/_utils.py
index 01c175f..e1572c2 100644
--- a/psm_utils/io/_utils.py
+++ b/psm_utils/io/_utils.py
@@ -10,7 +10,6 @@ def set_csv_field_size_limit():

     This function should be called before reading any CSV files to ensure that the field size
     limit is properly set.
-
     """
     max_int = sys.maxsize

diff --git a/psm_utils/io/parquet.py b/psm_utils/io/parquet.py
new file mode 100644
index 0000000..38e59ad
--- /dev/null
+++ b/psm_utils/io/parquet.py
@@ -0,0 +1,124 @@
+"""
+Reader and writer for a simple, lossless psm_utils Parquet format.
+
+Similar to the :py:mod:`psm_utils.io.tsv` module, this module provides a reader and writer
+for :py:class:`~psm_utils.psm_list.PSMList` objects in a lossless manner. However, Parquet provides
+better performance and storage efficiency compared to TSV, and is recommended for large datasets.
+
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Union
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+from pydantic import ValidationError
+
+from psm_utils.io._base_classes import ReaderBase, WriterBase
+from psm_utils.io.exceptions import PSMUtilsIOException
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+
+class ParquetReader(ReaderBase):
+    def __init__(self, path: Union[str, Path], *args, **kwargs):
+        """
+        Reader for Parquet files.
+
+        Parameters
+        ----------
+        path : Union[str, Path]
+            Path to the Parquet file.
+
+        """
+        self.path = path
+
+    def __iter__(self):
+        with pq.ParquetFile(self.path) as reader:
+            for batch in reader.iter_batches():
+                for row in batch.to_pylist():
+                    try:
+                        yield PSM(**row)
+                    except ValidationError as e:
+                        raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}") from e
+
+
+class ParquetWriter(WriterBase):
+    def __init__(self, path: Union[str, Path], chunk_size: int = int(1e6), *args, **kwargs):
+        """
+        Writer for Parquet files.
+
+        Parameters
+        ----------
+        path : Union[str, Path]
+            Path to the Parquet file.
+        chunk_size : int
+            Number of PSMs to write in a single batch. Default is 1e6.
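+
+        Examples
+        --------
+        A minimal usage sketch; the file name and PSM values are illustrative
+        only:
+
+        >>> from psm_utils.psm import PSM
+        >>> with ParquetWriter("psms.parquet") as writer:
+        ...     writer.write_psm(PSM(peptidoform="ACDE", spectrum_id="1"))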
+
+        """
+        self.path = path
+        self.chunk_size = chunk_size
+
+        self._writer = None
+        self._psm_cache = []
+
+    def __enter__(self):
+        self._writer = pq.ParquetWriter(self.path, schema=SCHEMA)
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self._flush()
+        self._writer.close()
+
+    def write_psm(self, psm: PSM):
+        """Write a single PSM to the Parquet file."""
+        self._psm_cache.append(self._psm_to_entry(psm))
+        if len(self._psm_cache) > self.chunk_size:
+            self._flush()
+
+    def write_file(self, psm_list: PSMList):
+        """Write a list of PSMs to the Parquet file."""
+        with self:
+            for psm in psm_list:
+                self.write_psm(psm)
+
+    @staticmethod
+    def _psm_to_entry(psm: PSM) -> dict:
+        """Convert a PSM object to a dictionary suitable for writing to Parquet."""
+        psm_dict = dict(psm)
+        psm_dict["peptidoform"] = str(psm.peptidoform)
+        return psm_dict
+
+    def _flush(self):
+        """Write the cached PSMs to the Parquet file."""
+        if not self._psm_cache:
+            return
+        table = pa.Table.from_pylist(self._psm_cache, schema=SCHEMA)
+        self._writer.write_table(table)
+        self._psm_cache = []
+
+
+SCHEMA = pa.schema(
+    [
+        ("peptidoform", pa.string()),
+        ("spectrum_id", pa.string()),
+        ("run", pa.string()),
+        ("collection", pa.string()),
+        ("spectrum", pa.string()),
+        ("is_decoy", pa.bool_()),
+        ("score", pa.float32()),
+        ("qvalue", pa.float32()),
+        ("pep", pa.float32()),
+        ("precursor_mz", pa.float32()),
+        ("retention_time", pa.float32()),
+        ("ion_mobility", pa.float32()),
+        ("protein_list", pa.list_(pa.string())),
+        ("rank", pa.int32()),
+        ("source", pa.string()),
+        ("provenance_data", pa.map_(pa.string(), pa.string())),
+        ("metadata", pa.map_(pa.string(), pa.string())),
+        ("rescoring_features", pa.map_(pa.string(), pa.float32())),
+    ]
+)
diff --git a/tests/test_io/test_parquet.py b/tests/test_io/test_parquet.py
new file mode 100644
index 0000000..b1a1e09
--- /dev/null
+++ b/tests/test_io/test_parquet.py
@@ -0,0 +1,69 @@
+"""Tests for psm_utils.io.parquet."""
+
+import os
+import hashlib
+
+from psm_utils.io.parquet import ParquetReader, ParquetWriter
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+test_cases = [
+    {"peptidoform": "ACDE", "spectrum_id": "1"},
+    {
+        "peptidoform": "ACDE",
+        "spectrum_id": "2",
+        "run": None,
+        "collection": None,
+        "ion_mobility": None,
+        "is_decoy": None,
+        "pep": None,
+        "precursor_mz": None,
+        "protein_list": None,
+        "qvalue": None,
+        "rank": None,
+        "retention_time": None,
+        "score": None,
+        "source": None,
+        "spectrum": None,
+        "provenance_data": {"source": "test"},
+        "metadata": {},
+        "rescoring_features": {"feature": 2.0},
+    },
+]
+
+
+
+def compute_checksum(filename):
+    hash_func = hashlib.sha256()
+    with open(filename, 'rb') as f:
+        for chunk in iter(lambda: f.read(4096), b''):
+            hash_func.update(chunk)
+    return hash_func.hexdigest()
+
+class TestParquetWriter:
+    expected_checksum = "c0782793f8c6fd52e39d5ec1cf5567fb0a7e7e245d795f4f1f720337f756b44c"
+    def test_write_psm(self):
+        with ParquetWriter("test.pq") as writer:
+            for test_case in test_cases:
+                writer.write_psm(PSM(**test_case))
+        actual_checksum = compute_checksum("test.pq")
+        assert actual_checksum == self.expected_checksum, "Checksums do not match"
+        os.remove("test.pq")
+
+    def test_write_file(self):
+        with ParquetWriter("test.pq") as writer:
+            writer.write_file(PSMList(psm_list=[PSM(**t) for t in test_cases]))
+        actual_checksum = compute_checksum("test.pq")
+        assert actual_checksum == self.expected_checksum, "Checksums do not match"
+        # os.remove("test.pq")
+
+class TestParquetReader:
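+    # Round-trip check: PSMs written by ParquetWriter should be read back
+    # unchanged by ParquetReader.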
+    def test_iter(self):
+        # Write test cases to file
+        ParquetWriter("test.pq").write_file(PSMList(psm_list=[PSM(**t) for t in test_cases]))
+
+        # Read test cases from file
+        for i, psm in enumerate(ParquetReader("test.pq")):
+            assert psm == PSM(**test_cases[i])
+
+        os.remove("test.pq")

From 3ddabac90c568acec4cf38171402ea275a8492fd Mon Sep 17 00:00:00 2001
From: RalfG
Date: Sun, 28 Apr 2024 12:06:44 +0200
Subject: [PATCH 03/10] Fully upgrade to Pydantic 2.0

---
 psm_utils/io/parquet.py |  6 ++++++
 psm_utils/psm.py        |  8 +++-----
 pyproject.toml          | 20 ++++++++++----------
 tests/test_psm_list.py  | 14 +++++++-------
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/psm_utils/io/parquet.py b/psm_utils/io/parquet.py
index 38e59ad..755cfcc 100644
--- a/psm_utils/io/parquet.py
+++ b/psm_utils/io/parquet.py
@@ -39,6 +39,12 @@ def __iter__(self):
         with pq.ParquetFile(self.path) as reader:
             for batch in reader.iter_batches():
                 for row in batch.to_pylist():
+                    # Convert map columns (rendered as lists of tuples) to dictionaries
+                    row["metadata"] = dict(row["metadata"] or {})
+                    row["provenance_data"] = dict(row["provenance_data"] or {})
+                    row["rescoring_features"] = dict(row["rescoring_features"] or {})
+
+                    # Convert to PSM object and yield
                     try:
                         yield PSM(**row)
                     except ValidationError as e:
diff --git a/psm_utils/psm.py b/psm_utils/psm.py
index 6053937..0ed97d9 100644
--- a/psm_utils/psm.py
+++ b/psm_utils/psm.py
@@ -2,7 +2,7 @@

 from typing import Any, Dict, List, Optional, Union

-from pydantic import BaseModel
+from pydantic import ConfigDict, BaseModel

 from psm_utils.peptidoform import Peptidoform

@@ -11,7 +11,7 @@ class PSM(BaseModel):
     """Data class representing a peptide-spectrum match (PSM)."""

     peptidoform: Union[Peptidoform, str]
-    spectrum_id: Union[int, str]
+    spectrum_id: str
     run: Optional[str] = None
     collection: Optional[str] = None
     spectrum: Optional[Any] = None
@@ -28,9 +28,7 @@ class PSM(BaseModel):
     provenance_data: Optional[Dict[str, str]] = dict()
     metadata: Optional[Dict[str, str]] = dict()
     rescoring_features: Optional[Dict[str, float]] = dict()
-
-    class Config:
-        arbitrary_types_allowed = True  # Allows non-pydantic class Peptidoform
+    model_config = ConfigDict(arbitrary_types_allowed=True, coerce_numbers_to_str=True)

     def __init__(self, **data):
         """
diff --git a/pyproject.toml b/pyproject.toml
index b6923bc..a2f3629 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,29 +20,30 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.7"
 dependencies = [
-    "pyteomics >= 4, <4.7",
-    "pyopenms",
+    "click",
     "lxml",
-    "psims",
-    "pandas",
     "numpy",
-    "click",
+    "pandas",
+    "psims",
+    "pyarrow",
+    "pydantic >= 2",
+    "pyopenms",
+    "pyteomics >= 4, <4.7",
     "rich",
-    "pydantic",
     "sqlalchemy",
 ]

 [project.optional-dependencies]
 dev = ["ruff", "isort>5", "pytest", "pytest-cov"]
 docs = [
-    "sphinx",
     "numpydoc>=1,<2",
     "recommonmark",
-    "sphinx-mdinclude",
-    "toml",
     "semver>=2",
     "sphinx_rtd_theme",
     "sphinx-autobuild",
+    "sphinx-mdinclude",
+    "sphinx",
+    "toml",
 ]
 online = ["streamlit", "plotly"]

@@ -62,7 +63,6 @@ build-backend = "setuptools.build_meta"

 [tool.setuptools.packages.find]
 include = ["psm_utils*"]
-
 [tool.setuptools.dynamic]
 version = { attr = "psm_utils.__version__" }

diff --git a/tests/test_psm_list.py b/tests/test_psm_list.py
index 21fa709..cff86aa 100644
--- a/tests/test_psm_list.py
+++ b/tests/test_psm_list.py
@@ -26,30 +26,30 @@ def test___get_item__(self):
         psm_list = PSMList(psm_list=sample_psm_list)

         # Single index
-        assert psm_list[0] == PSM(peptidoform="ACDK", spectrum_id=1, score=140.2)
+        assert psm_list[0] == PSM(peptidoform="ACDK", spectrum_id="1", score=140.2)

         # Slice
         assert psm_list[0:2] == PSMList(
             psm_list=[
-                PSM(peptidoform="ACDK", spectrum_id=1, score=140.2),
-                PSM(peptidoform="CDEFR", spectrum_id=2, score=132.9),
+                PSM(peptidoform="ACDK", spectrum_id="1", score=140.2),
+                PSM(peptidoform="CDEFR", spectrum_id="2", score=132.9),
             ]
         )

         # PSM property as array
-        np.testing.assert_equal(psm_list["spectrum_id"], np.array([1, 2, 3]))
+        np.testing.assert_equal(psm_list["spectrum_id"], np.array(["1", "2", "3"]))

         # Multiple PSM properties as 2D array
         np.testing.assert_equal(
             psm_list[["spectrum_id", "score"]],
-            np.array([[1, 140.2], [2, 132.9], [3, 55.7]]),
+            np.array([["1", 140.2], ["2", 132.9], ["3", 55.7]]),
         )

         # Index by multiple indices
         psm_list[0, 2] == PSMList(
             psm_list=[
-                PSM(peptidoform="ACDK", spectrum_id=1, score=140.2),
-                PSM(peptidoform="DEM[Oxidation]K", spectrum_id=3, score=55.7),
+                PSM(peptidoform="ACDK", spectrum_id="1", score=140.2),
+                PSM(peptidoform="DEM[Oxidation]K", spectrum_id="3", score=55.7),
             ]
         )

From f70a905eb1faa840a1cc58391ddbec2350132cb5 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 13:28:36 +0200
Subject: [PATCH 04/10] Add support for Sage Parquet files, next to TSV.
 SageReader is now SageTSVReader, with aliases for backwards compatibility;
 Update compatibility to Sage v0.14

---
 psm_utils/io/__init__.py             |  15 +++-
 psm_utils/io/sage.py                 | 107 ++++++++++++++++++---------
 tests/test_data/results.sage.parquet | Bin 0 -> 12270 bytes
 tests/test_data/results.sage.tsv     |   4 +-
 tests/test_io/test_idxml.py          |   6 +-
 tests/test_io/test_sage.py           |  47 +++++++++---
 6 files changed, 124 insertions(+), 55 deletions(-)
 create mode 100644 tests/test_data/results.sage.parquet

diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index 0df6529..45b3d7a 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -88,19 +88,25 @@
         "extension": ".csv",
         "filename_pattern": r"^.*(?:_|\.)msamanda.csv$",
     },
-    "sage": {
-        "reader": sage.SageReader,
+    "sage_tsv": {
+        "reader": sage.SageTSVReader,
         "writer": None,
         "extension": ".tsv",
-        "filename_pattern": r"^.*(?:_|\.).sage.tsv$",
+        "filename_pattern": r"^.*(?:_|\.)sage\.tsv$",
+    },
+    "sage_parquet": {
+        "reader": sage.SageParquetReader,
+        "writer": None,
+        "extension": ".parquet",
+        "filename_pattern": r"^.*(?:_|\.)sage\.parquet$",
     },
     "ionbot": {
         "reader": ionbot.IonbotReader,
         "writer": None,
         "extension": "ionbot.first.csv",
         "filename_pattern": r"^ionbot.first.csv$",
     },
-    "parquet": {  # List after proteoscape to avoid extension matching conflicts
+    "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
         "extension": ".parquet",
         "filename_pattern": r"^.*\.parquet$",
@@ -113,6 +119,9 @@
         "filename_pattern": r"^.*\.tsv$",
     },
 }
+
+FILETYPES["sage"] = FILETYPES["sage_tsv"]  # Alias for backwards compatibility
+
 READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]}
 WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]}

diff --git a/psm_utils/io/sage.py b/psm_utils/io/sage.py
index e0cc0d1..4cc23e3 100644
--- a/psm_utils/io/sage.py
+++ b/psm_utils/io/sage.py
@@ -2,26 +2,29 @@
 Reader for PSM files from the Sage search engine.

 Reads the ``results.sage.tsv`` file as defined on the
-`Sage documentation page `_.
+`Sage documentation page `_.
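+
+Illustrative usage sketch (file names are hypothetical)::
+
+    from psm_utils.io.sage import SageTSVReader, SageParquetReader
+
+    psm_list = SageTSVReader("results.sage.tsv").read_file()
+    psm_list = SageParquetReader("results.sage.parquet").read_file()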
""" from __future__ import annotations import csv +from abc import ABC, abstractmethod from pathlib import Path from typing import Iterable, Optional +import pyarrow.parquet as pq from pyteomics import mass from psm_utils.io._base_classes import ReaderBase -from psm_utils.psm import PSM from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList set_csv_field_size_limit() -class SageReader(ReaderBase): +class SageReaderBase(ReaderBase, ABC): def __init__( self, filename, score_column: str = "sage_discriminant_score", *args, **kwargs ) -> None: @@ -41,42 +44,15 @@ def __init__( self.filename = filename self.score_column = score_column + @abstractmethod def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename) as open_file: - reader = csv.DictReader(open_file, delimiter="\t") - for row in reader: - psm = self._get_peptide_spectrum_match(row) - yield psm + raise NotImplementedError("Use `SageTSVReader` or `SageParquetReader` instead.") def _get_peptide_spectrum_match(self, psm_dict) -> PSM: """Parse a single PSM from a sage PSM file.""" rescoring_features = {} - for ft in [ - "expmass", - "calcmass", - "delta_mass", - "peptide_len", - "missed_cleavages", - "isotope_error", - "precursor_ppm", - "fragment_ppm", - "hyperscore", - "delta_next", - "delta_best", - "delta_rt_model", - "aligned_rt", - "predicted_rt", - "matched_peaks", - "longest_b", - "longest_y", - "longest_y_pct", - "matched_intensity_pct", - "scored_candidates", - "poisson", - "ms1_intensity", - "ms2_intensity", - ]: + for ft in RESCORING_FEATURES: try: rescoring_features[ft] = psm_dict[ft] except KeyError: @@ -89,9 +65,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: ), spectrum_id=psm_dict["scannr"], run=Path(psm_dict["filename"]).stem, - is_decoy=( - True if psm_dict["label"] == "-1" else False if psm_dict["label"] == "1" else None - ), + is_decoy=psm_dict["is_decoy"], qvalue=psm_dict["spectrum_q"], score=float(psm_dict[self.score_column]), precursor_mz=self._parse_precursor_mz(psm_dict["expmass"], psm_dict["charge"]), @@ -118,3 +92,64 @@ def _parse_precursor_mz(expmass: str, charge: Optional[str]) -> Optional[float]: return (expmass + (mass.nist_mass["H"][1][0] * charge)) / charge else: return None + + @classmethod + def from_dataframe(cls, dataframe) -> PSMList: + """Create a PSMList from a Sage Pandas DataFrame.""" + return PSMList( + psm_list=[ + cls._get_peptide_spectrum_match(cls(""), entry) + for entry in dataframe.to_dict(orient="records") + ] + ) + + +class SageTSVReader(SageReaderBase): + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with open(self.filename, "r") as open_file: + reader = csv.DictReader(open_file, delimiter="\t") + for row in reader: + row["is_decoy"] = ( + True if row["label"] == "-1" else False if row["label"] == "1" else None + ) + + yield self._get_peptide_spectrum_match(row) + +SageReader = SageTSVReader # Alias for backwards compatibility + + +class SageParquetReader(SageReaderBase): + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with pq.ParquetFile(self.filename) as pq_file: + for batch in pq_file.iter_batches(): + for row in batch.to_pylist(): + yield self._get_peptide_spectrum_match(row) + + +RESCORING_FEATURES = [ + "expmass", + "calcmass", + "delta_mass", + "peptide_len", + "missed_cleavages", + "isotope_error", + "precursor_ppm", + "fragment_ppm", + 
"hyperscore", + "delta_next", + "delta_best", + "delta_rt_model", + "aligned_rt", + "predicted_rt", + "matched_peaks", + "longest_b", + "longest_y", + "longest_y_pct", + "matched_intensity_pct", + "scored_candidates", + "poisson", + # "ms1_intensity", # Removed in Sage v0.14 + "ms2_intensity", +] diff --git a/tests/test_data/results.sage.parquet b/tests/test_data/results.sage.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8060fe64f7069d4f51d763e19b51abe94fc5e2c2 GIT binary patch literal 12270 zcmc&)4~P`k8J}H;^VBo(-@MG)ZF<`HVy!tf_RQMEn6qN6z36E)o^6`#x;J{G`_I|c z#*l=oNRdPAVMQbjN09W8L#`68;VQwVp&TM2se~gO60RXgNJS0_M-dzPeQ$Q&dvi0h z`K;I=5!dgm>+dXhk(O)~M9Xi0L(jVm4{EX>}w@4n;^$gs!8taZylM z2~H+3q=m{8+6K3}e=CX()iiH@aZ5{6Q%g%z%kwSIZ*Jb&yrm)g+6&Dd#d^@2df;C? zEvhTGsCJi`F>72F-NH8_%He1Ov@xB}m8^Uw zV^~`c7K~L(>o@Mt?li1b%j%2iRBqMsrAt?=j40-xyp3+~6cvCEi9W&i13Rb&h2<>P zJvXD|=p=e#GCczGW`Y$|BM7>(L3_^Km48!|+5VHoGf!sw%oE)Y3~c=5f>9`$ZT952 z93i6-dV3n}o)&NQqY31G;O&@EELmouV6-K@4eETD_B}N51})XrxCiDRiZTZVrdW7= z^U9Tt%U8VqQq$ID$@SY_Xxiu*nK^!M>Aq;-*whxRe916#MK(3BM#xx%_RXRn&y2VF z5(U%4@YHH^-L08tub3X?Dz26~yLyyG%WFzqZhtoEQn!a3i_p^=y)h?lm$k2nactkV z5Y02b-kXRs|Ou|AR*Zm zbU;-;IuO-Wyawu+ku{4&Sk=-QBXuCP+mI&5*nIlod=Mj85|YitOk}b@R5Y??(#XAb zFq%dD|IaN-i#>#}~@I+#&TcqYL;1&-`c0YVXuTR9EpDtfXKW z>HStQZzT(bteAY)7SWzXphQq4B)dYN4Bk00CM5=3MC9XtDl<6n7ebZG|Do1pQ2x{02!iuLb5B! zsCqE!;A?F&)1`4j23FETE2|lfs{7+1dsbMbWHt|`n4p)QrYD}hzkOonbD(0UnQ<-Z zBdcids%rYA>cN>M2FEMs)@u5}>ia6EJVah8U2EuYQ#Dcubd7fnK~z`q>iK#c7i&6f z^67SVTuSYgoQj^cbo6JSL{KCoyTS)#`_1E>Jw$aCuYpRYWb!%iYf8zTV#=OhM-Q(9 z6{2`TvMZ>lgPOekfvB$HHC%<{gbc2yUF$)GD4vk)3Mv-UwnpeR~5a5fll@uAtU)v? zcME6|WC_Wxph;0Wd^9179!5v7riTmpV#%;fa9+nb(nH(mJKI2uAVCL3CJb<-@g?sV zxr@4Dxz5)#wOBCHB@03o`^3_6b2}Z_eqUuwagLIbiaR^#;EwyO!VV_MeR1Svdi7Uf zDim!Np6#4adp*Y4;d!pM7^5gG#O{RBO^Bq!=N&Cy6|FoH31S(Ayan?jiJk#pQq+uD zEKN2t2A(n45Vexm>Cso{XTOfG0H)Z?CfsX`>OhIuL|B_&g8v%?r z`E88l0E{;o#?T7Dc#~mxL>O-p?A3q)19*=GBqR+`tixNJ9iIVaM|J6KJlFz^H>omq zxCEG;Vx_n7*#{1&5)Vl5Hr`}zbo&8j-oL~hKr*qn14sg3^!z{x-eiI3iUEu_8Ajg- zV7$pNIw}C;O@`5102psFj9)mwc#~n=Ndd;24C7`7Fy3SsHw1v$ZsF>|dxRr6&7G+p zRbp|nPHShK6eqotv@?;iePXiK5iQ%dr)eh~yz^mU_s!ByIrwQ!Yc0Rt{|GlbV%ml> zdSxz0hvM8Ed6b*i>$y2LpPR#vafW`%(TfYYd2JClM;CMR&=PKTE#qeQa&De&{Dg1b>_s_pIgS`E}eJT+hv`8@PGBnVX%PxY@Irn|&{Gb7&hk zZ*J%2ogLgf@-jD%zM^f!u7*i;=ZW=@1#CC6DQzo+)E6jD9$3-p;DM)57r@&Hlv9Sz zqmx8)o;}GZ=MOxigFqCBT!|DLVkr&SVXI{R%0Nz9nodsrMtf7CC#Hy+9^`KVNFQoCnbL+L z3OzPmh;o)0<|$X7t0#lVe`BY%N1@#_1ifd~lIp1@g-$-hdWO+CLUO5gVd7vDAnS$Kk%Tt~8DMmF#R z(JxgbYRMg>I7oO51A0gdad$74<%RO`xP0f){k?ta+JG?^BO)$kqZX(SXqdAli zG*za`$l(J(_wHV7WY_?M7=_;}*uqmZq#N0;Gq&!Z`>;(Y)k6A_?L-b++xrXAfo`8$ z=nGGokW0w;5o5gYf?)Lb*adTV>Vu3R^C)Bfe3NLo*WVYc;VBF4q5HC%+VQ;*=!V!bA1s3uHUI55s_r48^O9O&YT R@&Nx={s_NpY=K|;{~u}aqh0_2 literal 0 HcmV?d00001 diff --git a/tests/test_data/results.sage.tsv b/tests/test_data/results.sage.tsv index 9d6ec16..6543ec3 100644 --- a/tests/test_data/results.sage.tsv +++ b/tests/test_data/results.sage.tsv @@ -1,2 +1,2 @@ -peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity -LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 
71.78844460255384 71.78844460255384 0.0 108.2854 0.0 0.0 0.0 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0 +psm_id peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages semi_enzymatic isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model ion_mobility predicted_mobility delta_mobility matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms2_intensity +1 LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0 0.0 0.8239083 0.503857 72.26591573806016 72.26591573806016 0.0 108.2854 0.993444 0.0 0.993444 0.0 0.0 0.0 22 9 12 0.6315789 64.770966 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 72609170.0 diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index 075b12f..adb6e96 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -3,9 +3,9 @@ import hashlib from psm_utils.io.idxml import IdXMLReader, IdXMLWriter -from psm_utils.io.sage import SageReader -from psm_utils.psm import PSM +from psm_utils.io.sage import SageTSVReader from psm_utils.peptidoform import Peptidoform +from psm_utils.psm import PSM class TestIdXMLReader: @@ -104,7 +104,7 @@ def test_write_file_with_pyopenms_objects(self): def test_write_file_without_pyopenms_objects(self): expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b" - reader = SageReader("./tests/test_data/results.sage.tsv") + reader = SageTSVReader("./tests/test_data/results.sage.tsv") psm_list = reader.read_file() writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML") writer.write_file(psm_list) diff --git a/tests/test_io/test_sage.py b/tests/test_io/test_sage.py index 10d2bcc..60d87ba 100644 --- a/tests/test_io/test_sage.py +++ b/tests/test_io/test_sage.py @@ -1,6 +1,8 @@ """Tests for psm_utils.io.sage.""" -from psm_utils.io.sage import SageReader +import pytest + +from psm_utils.io.sage import SageParquetReader, SageTSVReader from psm_utils.psm import PSM test_psm = PSM( @@ -27,29 +29,52 @@ "missed_cleavages": 0.0, "isotope_error": 0.0, "precursor_ppm": 0.8239083, - "fragment_ppm": 0.5347518, - "hyperscore": 71.78844460255384, - "delta_next": 71.78844460255384, + "fragment_ppm": 0.503857, + "hyperscore": 72.26591573806016, + "delta_next": 72.26591573806016, "delta_best": 0.0, - "delta_rt_model": 0.0, - "aligned_rt": 0.0, + "delta_rt_model": 0.993444, + "aligned_rt": 0.993444, "predicted_rt": 0.0, "matched_peaks": 22.0, "longest_b": 9.0, "longest_y": 12.0, "longest_y_pct": 0.6315789, - "matched_intensity_pct": 50.785, + "matched_intensity_pct": 64.770966, "scored_candidates": 1.0, "poisson": -1.9562811911083433, - "ms1_intensity": 306146180.0, - "ms2_intensity": 56930696.0, + "ms2_intensity": 72609170.0, }, ) -class TestSageReader: +class TestSageTSVReader: def test_iter(self): - with SageReader("./tests/test_data/results.sage.tsv") as reader: + with SageTSVReader("./tests/test_data/results.sage.tsv") as reader: for psm in reader: psm.provenance_data = {} assert psm == test_psm + + +class TestSageParquetReader: + def test_iter(self): + with SageParquetReader("./tests/test_data/results.sage.parquet") as reader: + # Parquet results in float precision differences, so pytest.approx is used, which does + # not support objects 
with nested dicts.
+            for psm in reader:
+                psm_dict = dict(psm)
+                test_psm_dict = dict(test_psm)
+
+                # Nested dicts
+                assert psm_dict.pop("rescoring_features", {}) == pytest.approx(
+                    test_psm_dict.pop("rescoring_features", {})
+                )
+                assert psm_dict.pop("metadata", {}) == test_psm_dict.pop("metadata", {})
+                psm_dict.pop("provenance_data", {})
+
+                # Remaining keys
+                for k, v in psm_dict.items():
+                    if isinstance(v, float):
+                        assert v == pytest.approx(test_psm_dict[k])
+                    else:
+                        assert v == test_psm_dict[k]

From d4854f4637772e15ffce6150ef60aae0e5e6ab67 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 13:31:44 +0200
Subject: [PATCH 05/10] Use pyarrow for iterative Parquet reading instead of
 reading into a pandas DataFrame intermediate

---
 psm_utils/io/proteoscape.py | 93 +++++++++++++++++++------------------
 1 file changed, 49 insertions(+), 44 deletions(-)

diff --git a/psm_utils/io/proteoscape.py b/psm_utils/io/proteoscape.py
index cc0e834..ddc4386 100644
--- a/psm_utils/io/proteoscape.py
+++ b/psm_utils/io/proteoscape.py
@@ -4,13 +4,15 @@
 import re
 from pathlib import Path
 from typing import Union
-from collections import namedtuple

 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq

-from psm_utils import PSM
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
 from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.peptidoform import format_number_as_string

 logger = logging.getLogger(__name__)
@@ -36,31 +38,31 @@ def __init__(
             Path to ProteoScape Parquet file.

         """
-        if isinstance(filename, pd.DataFrame):
-            self.data = filename
-        else:
-            super().__init__(filename, *args, **kwargs)
-            self.data = pd.read_parquet(self.filename)
-
-        self._Row = namedtuple("Row", self.data.columns)
+        self.filename = filename

     def __len__(self):
         """Return number of PSMs in file."""
-        return len(self.data)
+        return pq.read_metadata(self.filename).num_rows

     def __iter__(self):
         """Iterate over file and return PSMs one-by-one."""
-        for entry in self.data.itertuples():
-            yield _parse_entry(entry)
-
-    def __getitem__(self, index):
-        """Return PSM at index."""
-        return _parse_entry(self._Row(*self.data.iloc[index]))
+        with pq.ParquetFile(self.filename) as reader:
+            for batch in reader.iter_batches():
+                for row in batch.to_pylist():
+                    try:
+                        yield _parse_entry(row)
+                    except Exception as e:
+                        raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}") from e

     @classmethod
-    def from_dataframe(cls, dataframe: pd.DataFrame, *args, **kwargs):
-        """Create a ProteoScapeReader from a DataFrame."""
-        return cls(dataframe, *args, **kwargs)
+    def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList:
+        """Create a PSMList from a ProteoScape Pandas DataFrame."""
+        return PSMList(
+            psm_list=[_parse_entry(entry) for entry in dataframe.to_dict(orient="records")]
+        )


 def _parse_peptidoform(
@@ -81,40 +83,43 @@ def _parse_peptidoform(
     return f"{n_term}{''.join(peptidoform)}{c_term}/{precursor_charge}"


-def _parse_entry(entry) -> PSM:
+def _parse_entry(entry: dict) -> PSM:
     """Parse a single entry from ProteoScape Parquet file to PSM object."""
     return PSM(
         peptidoform=_parse_peptidoform(
-            entry.stripped_peptide, entry.ptms, entry.ptm_locations, entry.precursor_charge
+            entry["stripped_peptide"],
+            entry["ptms"],
+            entry["ptm_locations"],
+            entry["precursor_charge"],
         ),
-        spectrum_id=entry.ms2_id,
-        run=getattr(entry, "run", None),
-        is_decoy=all(DECOY_PATTERN.match(p) for p in entry.locus_name),
-        score=entry.x_corr_score,
-        precursor_mz=entry.precursor_mz,
-        retention_time=entry.rt,
-        ion_mobility=entry.ook0,
-        protein_list=list(entry.locus_name),
-        rank=entry.rank,
+        spectrum_id=entry["ms2_id"],
+        run=entry.get("run", None),
+        is_decoy=all(DECOY_PATTERN.match(p) for p in entry["locus_name"]),
+        score=entry["x_corr_score"],
+        precursor_mz=entry["precursor_mz"],
+        retention_time=entry["rt"],
+        ion_mobility=entry["ook0"],
+        protein_list=list(entry["locus_name"]),
+        rank=entry["rank"],
         source="ProteoScape",
         provenance_data={
-            "candidate_id": str(entry.candidate_id),
-            "ms2_id": str(entry.ms2_id),
-            "parent_id": str(entry.parent_id),
+            "candidate_id": str(entry["candidate_id"]),
+            "ms2_id": str(entry["ms2_id"]),
+            "parent_id": str(entry["parent_id"]),
         },
         metadata={
-            "leading_aa": str(entry.leading_aa),
-            "trailing_aa": str(entry.trailing_aa),
-            "corrected_ook0": str(entry.corrected_ook0),
+            "leading_aa": str(entry["leading_aa"]),
+            "trailing_aa": str(entry["trailing_aa"]),
+            "corrected_ook0": str(entry["corrected_ook0"]),
         },
         rescoring_features={
-            "tims_score": float(entry.tims_score),
-            "x_corr_score": float(entry.x_corr_score),
-            "delta_cn_score": float(entry.delta_cn_score),
-            "ppm_error": float(entry.ppm_error),
-            "number_matched_ions": float(entry.number_matched_ions),
-            "number_expected_ions": float(entry.number_expected_ions),
-            "ion_proportion": float(entry.ion_proportion),
-            "spectrum_total_ion_intensity": float(entry.spectrum_total_ion_intensity),
+            "tims_score": float(entry["tims_score"]),
+            "x_corr_score": float(entry["x_corr_score"]),
+            "delta_cn_score": float(entry["delta_cn_score"]),
+            "ppm_error": float(entry["ppm_error"]),
+            "number_matched_ions": float(entry["number_matched_ions"]),
+            "number_expected_ions": float(entry["number_expected_ions"]),
+            "ion_proportion": float(entry["ion_proportion"]),
+            "spectrum_total_ion_intensity": float(entry["spectrum_total_ion_intensity"]),
         },
     )

From 30e2490be136d7ccde7fe50c55cf90d299178c7e Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 13:52:56 +0200
Subject: [PATCH 06/10] Update readme; changelog; version bump

---
 CHANGELOG.md          | 13 +++++++++++++
 README.rst            |  4 +++-
 psm_utils/__init__.py |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e8a279b..ad12218 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.9.0] - 2024-05-01
+
+### Added
+
+- `io`: Read and write support for PSMs in Apache Parquet format for efficient storage of PSM lists.
+- `io.sage`: Support for Sage results in Parquet format (new `SageParquetReader`, renamed `SageReader` to `SageTSVReader`).
+
+### Changed
+
+- Upgrade Pydantic dependency to v2. The PSM `spectrum_id` field is now always coerced to a string.
+- `io.proteoscape`: Use pyarrow to iteratively read from Parquet instead of first reading an entire dataframe with Pandas.
+- `io.sage`: Update compatibility to Sage v0.14.
+
 ## [0.8.3] - 2024-04-16

 ### Added
diff --git a/README.rst b/README.rst
index 3f4c2d1..fede4a6 100644
--- a/README.rst
+++ b/README.rst
@@ -94,11 +94,13 @@ Supported file formats
    `MaxQuant msms.txt `_          ``msms``                  ✅          ❌
    `MS Amanda CSV `_              ``msamanda``              ✅          ❌
    `mzIdentML `_                  ``mzid``                  ✅          ✅
+   `Parquet `_                    ``parquet``               ✅          ✅
    `Peptide Record `_             ``peprec``                ✅          ✅
    `pepXML `_                     ``pepxml``                ✅          ❌
    `Percolator tab `_             ``percolator``            ✅          ✅
    Proteome Discoverer MSF        ``proteome_discoverer``   ✅          ❌
-   `Sage `_                       ``sage``                  ✅          ❌
+   `Sage Parquet `_               ``sage_parquet``          ✅          ❌
+   `Sage TSV `_                   ``sage_tsv``              ✅          ❌
    ProteoScape Parquet            ``proteoscape``           ✅          ❌
    `TSV `_                        ``tsv``                   ✅          ✅
    `X!Tandem XML `_               ``xtandem``               ✅          ❌
diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
index 3922589..7e9087d 100644
--- a/psm_utils/__init__.py
+++ b/psm_utils/__init__.py
@@ -1,6 +1,6 @@
 """Common utilities for parsing and handling PSMs, and search engine results."""

-__version__ = "0.8.3"
+__version__ = "0.9.0"

 __all__ = ["Peptidoform", "PSM", "PSMList"]

 from functools import lru_cache

From 6a8b51f26626f7efcbd90853211093380df7ed66 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 14:31:04 +0200
Subject: [PATCH 07/10] Fix test checksums

---
 tests/test_io/test_idxml.py   |  2 +-
 tests/test_io/test_parquet.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index adb6e96..7bf1eac 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -103,7 +103,7 @@ def test_write_file_with_pyopenms_objects(self):
         assert sha == expected_sha

     def test_write_file_without_pyopenms_objects(self):
-        expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b"
+        expected_sha = "148889926276fbe391e23ed7952c3a8410fc67ffb099bbf1a72df75f8d727ccd"
         reader = SageTSVReader("./tests/test_data/results.sage.tsv")
         psm_list = reader.read_file()
         writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")
diff --git a/tests/test_io/test_parquet.py b/tests/test_io/test_parquet.py
index b1a1e09..20bb0b0 100644
--- a/tests/test_io/test_parquet.py
+++ b/tests/test_io/test_parquet.py
@@ -1,7 +1,7 @@
 """Tests for psm_utils.io.parquet."""

-import os
 import hashlib
+import os

 from psm_utils.io.parquet import ParquetReader, ParquetWriter
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
@@ -32,16 +32,17 @@
 ]


-
 def compute_checksum(filename):
     hash_func = hashlib.sha256()
-    with open(filename, 'rb') as f:
-        for chunk in iter(lambda: f.read(4096), b''):
+    with open(filename, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
             hash_func.update(chunk)
     return hash_func.hexdigest()

+
 class TestParquetWriter:
-    expected_checksum = "c0782793f8c6fd52e39d5ec1cf5567fb0a7e7e245d795f4f1f720337f756b44c"
+    expected_checksum = "cf3f2e9f073be58612ce81f240da9f4109e1c76eea25f1b7881e09c0a8fdee16"
+
     def test_write_psm(self):
         with ParquetWriter("test.pq") as writer:
             for test_case in test_cases:
@@ -57,6 +58,7 @@ def test_write_file(self):
         actual_checksum = compute_checksum("test.pq")
         assert actual_checksum == self.expected_checksum, "Checksums do not match"
         # os.remove("test.pq")

+
 class TestParquetReader:
     def test_iter(self):
         # Write test cases to file

From 0f2643cbce30f285ca93a90b9a6f4a5e2ed3a486 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 14:49:54 +0200
Subject: [PATCH 08/10] Remove temporary patch for caching Proforma
 modification resolvers

---
 CHANGELOG.md          |  1 +
 psm_utils/__init__.py | 12 ------------
 pyproject.toml        |  2 +-
 3 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad12218..f54d0fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Upgrade Pydantic dependency to v2. The PSM `spectrum_id` field is now always coerced to a string.
 - `io.proteoscape`: Use pyarrow to iteratively read from Parquet instead of first reading an entire dataframe with Pandas.
 - `io.sage`: Update compatibility to Sage v0.14.
+- Remove temporary patch for caching Proforma modification resolvers (now in Pyteomics v4.7.2).

 ## [0.8.3] - 2024-04-16

diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
index 7e9087d..4cf394e 100644
--- a/psm_utils/__init__.py
+++ b/psm_utils/__init__.py
@@ -3,7 +3,6 @@
 __version__ = "0.9.0"

 __all__ = ["Peptidoform", "PSM", "PSMList"]

-from functools import lru_cache
 from warnings import filterwarnings

 # mzmlb is not used, so hdf5plugin is not needed
@@ -14,17 +13,6 @@
     module="psims.mzmlb",
 )

-from pyteomics.proforma import (  # noqa: E402
-    GenericResolver,
-    PSIModResolver,
-    UnimodResolver,
-)
-
 from psm_utils.peptidoform import Peptidoform  # noqa: E402
 from psm_utils.psm import PSM  # noqa: E402
 from psm_utils.psm_list import PSMList  # noqa: E402
-
-# Temporary patch until released in pyteomics (see levitsky/pyteomics#147)
-UnimodResolver.resolve = lru_cache(maxsize=None)(UnimodResolver.resolve)
-PSIModResolver.resolve = lru_cache(maxsize=None)(PSIModResolver.resolve)
-GenericResolver.resolve = lru_cache(maxsize=None)(GenericResolver.resolve)
diff --git a/pyproject.toml b/pyproject.toml
index a2f3629..edaf8b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "pyarrow",
     "pydantic >= 2",
     "pyopenms",
-    "pyteomics >= 4, <4.7",
+    "pyteomics >= 4.7.2",
     "rich",
     "sqlalchemy",
 ]

From 301ae766b36d5c658d7f52e1fa208dd746e85637 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 15:08:42 +0200
Subject: [PATCH 09/10] Add dev dependencies for wheel test

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index d02c1d5..4bd65db 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -30,7 +30,7 @@ jobs:
         run: python -m build --sdist --wheel .

       - name: Install wheel
-        run: pip install dist/psm_utils-*.whl
+        run: pip install dist/psm_utils-*.whl[dev]

       - name: Test wheel
         run: |

From 6e518962bd9f050318bfaf332070e867167e9502 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 15:12:42 +0200
Subject: [PATCH 10/10] Fix publish workflow

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 4bd65db..6b5e117 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -24,13 +24,13 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install build
+          pip install build pytest

       - name: Build
         run: python -m build --sdist --wheel .

       - name: Install wheel
-        run: pip install dist/psm_utils-*.whl[dev]
+        run: pip install dist/psm_utils-*.whl

       - name: Test wheel
         run: |