From 41cb3fe30986a41ccd89c8433c7459721c79d077 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Tue, 16 Apr 2024 22:55:47 +0200
Subject: [PATCH 01/10] Update publish workflow to use build and setuptools
 instead of flit.

---
 .github/workflows/publish.yml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 6f0ae07..d02c1d5 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -17,21 +17,22 @@ jobs:
       - uses: actions/checkout@v4

       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.8"

       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip flit pytest
+          python -m pip install --upgrade pip
+          pip install build

-      - name: Build
-        run: flit build
+      - name: Build
+        run: python -m build --sdist --wheel .

-      - name: Install
+      - name: Install wheel
         run: pip install dist/psm_utils-*.whl

-      - name: Test package
+      - name: Test wheel
         run: |
           pytest

From bfa12248bed7f7017ee6363e082eb5c71713bf61 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Sun, 28 Apr 2024 11:36:56 +0200
Subject: [PATCH 02/10] Add generic Parquet reading and writing.

---
 psm_utils/io/__init__.py      |  23 ++++---
 psm_utils/io/_utils.py        |   1 -
 psm_utils/io/parquet.py       | 124 ++++++++++++++++++++++++++++++++++
 tests/test_io/test_parquet.py |  69 +++++++++++++++++++
 4 files changed, 208 insertions(+), 9 deletions(-)
 create mode 100644 psm_utils/io/parquet.py
 create mode 100644 tests/test_io/test_parquet.py

diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index a0c2d42..0df6529 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -13,6 +13,7 @@
 import psm_utils.io.maxquant as maxquant
 import psm_utils.io.msamanda as msamanda
 import psm_utils.io.mzid as mzid
+import psm_utils.io.parquet as parquet
 import psm_utils.io.peptide_record as peptide_record
 import psm_utils.io.pepxml as pepxml
 import psm_utils.io.percolator as percolator
@@ -75,12 +76,6 @@
         "extension": ".parquet",
         "filename_pattern": r"^.*\.candidates\.parquet$",
     },
-    "tsv": {
-        "reader": tsv.TSVReader,
-        "writer": tsv.TSVWriter,
-        "extension": ".tsv",
-        "filename_pattern": r"^.*\.tsv$",
-    },
     "xtandem": {
         "reader": xtandem.XTandemReader,
         "writer": None,
@@ -105,6 +100,18 @@
         "extension": "ionbot.first.csv",
         "filename_pattern": r"^ionbot.first.csv$",
     },
+    "parquet": {  # List after proteoscape to avoid extension matching conflicts
+        "reader": parquet.ParquetReader,
+        "writer": parquet.ParquetWriter,
+        "extension": ".parquet",
+        "filename_pattern": r"^.*\.parquet$",
+    },
+    "tsv": {  # List after sage to avoid extension matching conflicts
+        "reader": tsv.TSVReader,
+        "writer": tsv.TSVWriter,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*\.tsv$",
+    },
 }
 READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]}
 WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]}
@@ -124,10 +131,10 @@ def _supports_write_psm(writer: WriterBase):
     with NamedTemporaryFile(delete=False) as temp_file:
         temp_file.close()
         Path(temp_file.name).unlink()
-    example_psm = PSM(peptidoform="ACDE", spectrum_id=0)
+    example_psm = PSM(peptidoform="ACDE", spectrum_id="0")
     try:
         with writer(temp_file.name, example_psm=example_psm) as writer_instance:
-            writer_instance.write_psm(None)
+            writer_instance.write_psm(example_psm)
     except NotImplementedError:
         supports_write_psm = False
     except AttributeError:  # `None` is not valid PSM
diff --git a/psm_utils/io/_utils.py b/psm_utils/io/_utils.py
index 01c175f..e1572c2 100644
--- a/psm_utils/io/_utils.py
+++ b/psm_utils/io/_utils.py
@@ -10,7 +10,6 @@ def set_csv_field_size_limit():

     This function should be called before reading any CSV files to ensure that the field size
     limit is properly set.
-
     """
     max_int = sys.maxsize

diff --git a/psm_utils/io/parquet.py b/psm_utils/io/parquet.py
new file mode 100644
index 0000000..38e59ad
--- /dev/null
+++ b/psm_utils/io/parquet.py
@@ -0,0 +1,124 @@
+"""
+Reader and writer for a simple, lossless psm_utils Parquet format.
+
+Similar to the :py:mod:`psm_utils.io.tsv` module, this module provides a reader and writer
+for :py:class:`~psm_utils.psm_list.PSMList` objects in a lossless manner. However, Parquet provides
+better performance and storage efficiency compared to TSV, and is recommended for large datasets.
+
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Union
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+from pydantic import ValidationError
+
+from psm_utils.io._base_classes import ReaderBase, WriterBase
+from psm_utils.io.exceptions import PSMUtilsIOException
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+
+class ParquetReader(ReaderBase):
+    def __init__(self, path: Union[str, Path], *args, **kwargs):
+        """
+        Reader for Parquet files.
+
+        Parameters
+        ----------
+        path : Union[str, Path]
+            Path to the Parquet file.
+
+        """
+        self.path = path
+
+    def __iter__(self):
+        with pq.ParquetFile(self.path) as reader:
+            for batch in reader.iter_batches():
+                for row in batch.to_pylist():
+                    try:
+                        yield PSM(**row)
+                    except ValidationError as e:
+                        raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}") from e
+
+
+class ParquetWriter(WriterBase):
+    def __init__(self, path: Union[str, Path], chunk_size: int = int(1e6), *args, **kwargs):
+        """
+        Writer for Parquet files.
+
+        Parameters
+        ----------
+        path : Union[str, Path]
+            Path to the Parquet file.
+        chunk_size : int
+            Number of PSMs to write in a single batch. Default is 1e6.
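+
+        Examples
+        --------
+        A minimal usage sketch; the file name and PSM values are illustrative
+        only:
+
+        >>> from psm_utils.psm import PSM
+        >>> with ParquetWriter("psms.parquet") as writer:
+        ...     writer.write_psm(PSM(peptidoform="ACDE", spectrum_id="1"))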
+
+        """
+        self.path = path
+        self.chunk_size = chunk_size
+
+        self._writer = None
+        self._psm_cache = []
+
+    def __enter__(self):
+        self._writer = pq.ParquetWriter(self.path, schema=SCHEMA)
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self._flush()
+        self._writer.close()
+
+    def write_psm(self, psm: PSM):
+        """Write a single PSM to the Parquet file."""
+        self._psm_cache.append(self._psm_to_entry(psm))
+        if len(self._psm_cache) > self.chunk_size:
+            self._flush()
+
+    def write_file(self, psm_list: PSMList):
+        """Write a list of PSMs to the Parquet file."""
+        with self:
+            for psm in psm_list:
+                self.write_psm(psm)
+
+    @staticmethod
+    def _psm_to_entry(psm: PSM) -> dict:
+        """Convert a PSM object to a dictionary suitable for writing to Parquet."""
+        psm_dict = dict(psm)
+        psm_dict["peptidoform"] = str(psm.peptidoform)
+        return psm_dict
+
+    def _flush(self):
+        """Write the cached PSMs to the Parquet file."""
+        if not self._psm_cache:
+            return
+        table = pa.Table.from_pylist(self._psm_cache, schema=SCHEMA)
+        self._writer.write_table(table)
+        self._psm_cache = []
+
+
+SCHEMA = pa.schema(
+    [
+        ("peptidoform", pa.string()),
+        ("spectrum_id", pa.string()),
+        ("run", pa.string()),
+        ("collection", pa.string()),
+        ("spectrum", pa.string()),
+        ("is_decoy", pa.bool_()),
+        ("score", pa.float32()),
+        ("qvalue", pa.float32()),
+        ("pep", pa.float32()),
+        ("precursor_mz", pa.float32()),
+        ("retention_time", pa.float32()),
+        ("ion_mobility", pa.float32()),
+        ("protein_list", pa.list_(pa.string())),
+        ("rank", pa.int32()),
+        ("source", pa.string()),
+        ("provenance_data", pa.map_(pa.string(), pa.string())),
+        ("metadata", pa.map_(pa.string(), pa.string())),
+        ("rescoring_features", pa.map_(pa.string(), pa.float32())),
+    ]
+)
diff --git a/tests/test_io/test_parquet.py b/tests/test_io/test_parquet.py
new file mode 100644
index 0000000..b1a1e09
--- /dev/null
+++ b/tests/test_io/test_parquet.py
@@ -0,0 +1,69 @@
+"""Tests for psm_utils.io.parquet."""
+
+import os
+import hashlib
+
+from psm_utils.io.parquet import ParquetReader, ParquetWriter
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+test_cases = [
+    {"peptidoform": "ACDE", "spectrum_id": "1"},
+    {
+        "peptidoform": "ACDE",
+        "spectrum_id": "2",
+        "run": None,
+        "collection": None,
+        "ion_mobility": None,
+        "is_decoy": None,
+        "pep": None,
+        "precursor_mz": None,
+        "protein_list": None,
+        "qvalue": None,
+        "rank": None,
+        "retention_time": None,
+        "score": None,
+        "source": None,
+        "spectrum": None,
+        "provenance_data": {"source": "test"},
+        "metadata": {},
+        "rescoring_features": {"feature": 2.0},
+    },
+]
+
+
+
+def compute_checksum(filename):
+    hash_func = hashlib.sha256()
+    with open(filename, 'rb') as f:
+        for chunk in iter(lambda: f.read(4096), b''):
+            hash_func.update(chunk)
+    return hash_func.hexdigest()
+
+class TestParquetWriter:
+    expected_checksum = "c0782793f8c6fd52e39d5ec1cf5567fb0a7e7e245d795f4f1f720337f756b44c"
+    def test_write_psm(self):
+        with ParquetWriter("test.pq") as writer:
+            for test_case in test_cases:
+                writer.write_psm(PSM(**test_case))
+        actual_checksum = compute_checksum("test.pq")
+        assert actual_checksum == self.expected_checksum, "Checksums do not match"
+        os.remove("test.pq")
+
+    def test_write_file(self):
+        with ParquetWriter("test.pq") as writer:
+            writer.write_file(PSMList(psm_list=[PSM(**t) for t in test_cases]))
+        actual_checksum = compute_checksum("test.pq")
+        assert actual_checksum == self.expected_checksum, "Checksums do not match"
+        # os.remove("test.pq")
+
+class TestParquetReader:
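+    # Round-trip check: PSMs written by ParquetWriter should be read back
+    # unchanged by ParquetReader.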
+    def test_iter(self):
+        # Write test cases to file
+        ParquetWriter("test.pq").write_file(PSMList(psm_list=[PSM(**t) for t in test_cases]))
+
+        # Read test cases from file
+        for i, psm in enumerate(ParquetReader("test.pq")):
+            assert psm == PSM(**test_cases[i])
+
+        os.remove("test.pq")

From 3ddabac90c568acec4cf38171402ea275a8492fd Mon Sep 17 00:00:00 2001
From: RalfG
Date: Sun, 28 Apr 2024 12:06:44 +0200
Subject: [PATCH 03/10] Fully upgrade to Pydantic 2.0

---
 psm_utils/io/parquet.py |  6 ++++++
 psm_utils/psm.py        |  8 +++-----
 pyproject.toml          | 20 ++++++++++----------
 tests/test_psm_list.py  | 14 +++++++-------
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/psm_utils/io/parquet.py b/psm_utils/io/parquet.py
index 38e59ad..755cfcc 100644
--- a/psm_utils/io/parquet.py
+++ b/psm_utils/io/parquet.py
@@ -39,6 +39,12 @@ def __iter__(self):
         with pq.ParquetFile(self.path) as reader:
             for batch in reader.iter_batches():
                 for row in batch.to_pylist():
+                    # Convert map columns (rendered as lists of tuples) to dictionaries
+                    row["metadata"] = dict(row["metadata"] or {})
+                    row["provenance_data"] = dict(row["provenance_data"] or {})
+                    row["rescoring_features"] = dict(row["rescoring_features"] or {})
+
+                    # Convert to PSM object and yield
                     try:
                         yield PSM(**row)
                     except ValidationError as e:
diff --git a/psm_utils/psm.py b/psm_utils/psm.py
index 6053937..0ed97d9 100644
--- a/psm_utils/psm.py
+++ b/psm_utils/psm.py
@@ -2,7 +2,7 @@

 from typing import Any, Dict, List, Optional, Union

-from pydantic import BaseModel
+from pydantic import ConfigDict, BaseModel

 from psm_utils.peptidoform import Peptidoform

@@ -11,7 +11,7 @@ class PSM(BaseModel):
     """Data class representing a peptide-spectrum match (PSM)."""

     peptidoform: Union[Peptidoform, str]
-    spectrum_id: Union[int, str]
+    spectrum_id: str
     run: Optional[str] = None
     collection: Optional[str] = None
     spectrum: Optional[Any] = None
@@ -28,9 +28,7 @@ class PSM(BaseModel):
     provenance_data: Optional[Dict[str, str]] = dict()
     metadata: Optional[Dict[str, str]] = dict()
     rescoring_features: Optional[Dict[str, float]] = dict()
-
-    class Config:
-        arbitrary_types_allowed = True  # Allows non-pydantic class Peptidoform
+    model_config = ConfigDict(arbitrary_types_allowed=True, coerce_numbers_to_str=True)

     def __init__(self, **data):
         """
diff --git a/pyproject.toml b/pyproject.toml
index b6923bc..a2f3629 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,29 +20,30 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.7"
 dependencies = [
-    "pyteomics >= 4, <4.7",
-    "pyopenms",
+    "click",
     "lxml",
-    "psims",
-    "pandas",
     "numpy",
-    "click",
+    "pandas",
+    "psims",
+    "pyarrow",
+    "pydantic >= 2",
+    "pyopenms",
+    "pyteomics >= 4, <4.7",
     "rich",
-    "pydantic",
     "sqlalchemy",
 ]

 [project.optional-dependencies]
 dev = ["ruff", "isort>5", "pytest", "pytest-cov"]
 docs = [
-    "sphinx",
     "numpydoc>=1,<2",
     "recommonmark",
-    "sphinx-mdinclude",
-    "toml",
     "semver>=2",
     "sphinx_rtd_theme",
     "sphinx-autobuild",
+    "sphinx-mdinclude",
+    "sphinx",
+    "toml",
 ]
 online = ["streamlit", "plotly"]

@@ -62,7 +63,6 @@ build-backend = "setuptools.build_meta"

 [tool.setuptools.packages.find]
 include = ["psm_utils*"]
-
 [tool.setuptools.dynamic]
 version = { attr = "psm_utils.__version__" }

diff --git a/tests/test_psm_list.py b/tests/test_psm_list.py
index 21fa709..cff86aa 100644
--- a/tests/test_psm_list.py
+++ b/tests/test_psm_list.py
@@ -26,30 +26,30 @@ def test___get_item__(self):
         psm_list = PSMList(psm_list=sample_psm_list)

         # Single index
-        assert psm_list[0] == PSM(peptidoform="ACDK", spectrum_id=1, score=140.2)
+        assert psm_list[0] == PSM(peptidoform="ACDK", spectrum_id="1", score=140.2)

         # Slice
         assert psm_list[0:2] == PSMList(
             psm_list=[
-                PSM(peptidoform="ACDK", spectrum_id=1, score=140.2),
-                PSM(peptidoform="CDEFR", spectrum_id=2, score=132.9),
+                PSM(peptidoform="ACDK", spectrum_id="1", score=140.2),
+                PSM(peptidoform="CDEFR", spectrum_id="2", score=132.9),
             ]
         )

         # PSM property as array
-        np.testing.assert_equal(psm_list["spectrum_id"], np.array([1, 2, 3]))
+        np.testing.assert_equal(psm_list["spectrum_id"], np.array(["1", "2", "3"]))

         # Multiple PSM properties as 2D array
         np.testing.assert_equal(
             psm_list[["spectrum_id", "score"]],
-            np.array([[1, 140.2], [2, 132.9], [3, 55.7]]),
+            np.array([["1", 140.2], ["2", 132.9], ["3", 55.7]]),
         )

         # Index by multiple indices
         psm_list[0, 2] == PSMList(
             psm_list=[
-                PSM(peptidoform="ACDK", spectrum_id=1, score=140.2),
-                PSM(peptidoform="DEM[Oxidation]K", spectrum_id=3, score=55.7),
+                PSM(peptidoform="ACDK", spectrum_id="1", score=140.2),
+                PSM(peptidoform="DEM[Oxidation]K", spectrum_id="3", score=55.7),
             ]
         )

From f70a905eb1faa840a1cc58391ddbec2350132cb5 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 13:28:36 +0200
Subject: [PATCH 04/10] Add support for Sage Parquet files, next to TSV.
 SageReader is now SageTSVReader, with aliases for backwards compatibility;
 Update compatibility to Sage v0.14

---
 psm_utils/io/__init__.py             |  15 +++-
 psm_utils/io/sage.py                 | 107 ++++++++++++++++++---------
 tests/test_data/results.sage.parquet | Bin 0 -> 12270 bytes
 tests/test_data/results.sage.tsv     |   4 +-
 tests/test_io/test_idxml.py          |   6 +-
 tests/test_io/test_sage.py           |  47 +++++++++---
 6 files changed, 124 insertions(+), 55 deletions(-)
 create mode 100644 tests/test_data/results.sage.parquet

diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index 0df6529..45b3d7a 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -88,19 +88,25 @@
         "extension": ".csv",
         "filename_pattern": r"^.*(?:_|\.)msamanda.csv$",
     },
-    "sage": {
-        "reader": sage.SageReader,
+    "sage_tsv": {
+        "reader": sage.SageTSVReader,
         "writer": None,
         "extension": ".tsv",
-        "filename_pattern": r"^.*(?:_|\.).sage.tsv$",
+        "filename_pattern": r"^.*(?:_|\.)sage\.tsv$",
+    },
+    "sage_parquet": {
+        "reader": sage.SageParquetReader,
+        "writer": None,
+        "extension": ".parquet",
+        "filename_pattern": r"^.*(?:_|\.)sage\.parquet$",
     },
     "ionbot": {
         "reader": ionbot.IonbotReader,
         "writer": None,
         "extension": "ionbot.first.csv",
         "filename_pattern": r"^ionbot.first.csv$",
     },
-    "parquet": {  # List after proteoscape to avoid extension matching conflicts
+    "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
         "extension": ".parquet",
         "filename_pattern": r"^.*\.parquet$",
@@ -113,6 +119,9 @@
         "filename_pattern": r"^.*\.tsv$",
     },
 }
+
+FILETYPES["sage"] = FILETYPES["sage_tsv"]  # Alias for backwards compatibility
+
 READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]}
 WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]}

diff --git a/psm_utils/io/sage.py b/psm_utils/io/sage.py
index e0cc0d1..4cc23e3 100644
--- a/psm_utils/io/sage.py
+++ b/psm_utils/io/sage.py
@@ -2,26 +2,29 @@
 Reader for PSM files from the Sage search engine.

 Reads the ``results.sage.tsv`` file as defined on the
-`Sage documentation page `_.
+`Sage documentation page `_.
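+
+Illustrative usage sketch (file names are hypothetical)::
+
+    from psm_utils.io.sage import SageTSVReader, SageParquetReader
+
+    psm_list = SageTSVReader("results.sage.tsv").read_file()
+    psm_list = SageParquetReader("results.sage.parquet").read_file()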
""" from __future__ import annotations import csv +from abc import ABC, abstractmethod from pathlib import Path from typing import Iterable, Optional +import pyarrow.parquet as pq from pyteomics import mass from psm_utils.io._base_classes import ReaderBase -from psm_utils.psm import PSM from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList set_csv_field_size_limit() -class SageReader(ReaderBase): +class SageReaderBase(ReaderBase, ABC): def __init__( self, filename, score_column: str = "sage_discriminant_score", *args, **kwargs ) -> None: @@ -41,42 +44,15 @@ def __init__( self.filename = filename self.score_column = score_column + @abstractmethod def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename) as open_file: - reader = csv.DictReader(open_file, delimiter="\t") - for row in reader: - psm = self._get_peptide_spectrum_match(row) - yield psm + raise NotImplementedError("Use `SageTSVReader` or `SageParquetReader` instead.") def _get_peptide_spectrum_match(self, psm_dict) -> PSM: """Parse a single PSM from a sage PSM file.""" rescoring_features = {} - for ft in [ - "expmass", - "calcmass", - "delta_mass", - "peptide_len", - "missed_cleavages", - "isotope_error", - "precursor_ppm", - "fragment_ppm", - "hyperscore", - "delta_next", - "delta_best", - "delta_rt_model", - "aligned_rt", - "predicted_rt", - "matched_peaks", - "longest_b", - "longest_y", - "longest_y_pct", - "matched_intensity_pct", - "scored_candidates", - "poisson", - "ms1_intensity", - "ms2_intensity", - ]: + for ft in RESCORING_FEATURES: try: rescoring_features[ft] = psm_dict[ft] except KeyError: @@ -89,9 +65,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: ), spectrum_id=psm_dict["scannr"], run=Path(psm_dict["filename"]).stem, - is_decoy=( - True if psm_dict["label"] == "-1" else False if psm_dict["label"] == "1" else None - ), + is_decoy=psm_dict["is_decoy"], qvalue=psm_dict["spectrum_q"], score=float(psm_dict[self.score_column]), precursor_mz=self._parse_precursor_mz(psm_dict["expmass"], psm_dict["charge"]), @@ -118,3 +92,64 @@ def _parse_precursor_mz(expmass: str, charge: Optional[str]) -> Optional[float]: return (expmass + (mass.nist_mass["H"][1][0] * charge)) / charge else: return None + + @classmethod + def from_dataframe(cls, dataframe) -> PSMList: + """Create a PSMList from a Sage Pandas DataFrame.""" + return PSMList( + psm_list=[ + cls._get_peptide_spectrum_match(cls(""), entry) + for entry in dataframe.to_dict(orient="records") + ] + ) + + +class SageTSVReader(SageReaderBase): + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with open(self.filename, "r") as open_file: + reader = csv.DictReader(open_file, delimiter="\t") + for row in reader: + row["is_decoy"] = ( + True if row["label"] == "-1" else False if row["label"] == "1" else None + ) + + yield self._get_peptide_spectrum_match(row) + +SageReader = SageTSVReader # Alias for backwards compatibility + + +class SageParquetReader(SageReaderBase): + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with pq.ParquetFile(self.filename) as pq_file: + for batch in pq_file.iter_batches(): + for row in batch.to_pylist(): + yield self._get_peptide_spectrum_match(row) + + +RESCORING_FEATURES = [ + "expmass", + "calcmass", + "delta_mass", + "peptide_len", + "missed_cleavages", + "isotope_error", + "precursor_ppm", + "fragment_ppm", + 
"hyperscore", + "delta_next", + "delta_best", + "delta_rt_model", + "aligned_rt", + "predicted_rt", + "matched_peaks", + "longest_b", + "longest_y", + "longest_y_pct", + "matched_intensity_pct", + "scored_candidates", + "poisson", + # "ms1_intensity", # Removed in Sage v0.14 + "ms2_intensity", +] diff --git a/tests/test_data/results.sage.parquet b/tests/test_data/results.sage.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8060fe64f7069d4f51d763e19b51abe94fc5e2c2 GIT binary patch literal 12270 zcmc&)4~P`k8J}H;^VBo(-@MG)ZF<`HVy!tf_RQMEn6qN6z36E)o^6`#x;J{G`_I|c z#*l=oNRdPAVMQbjN09W8L#`68;VQwVp&TM2se~gO60RXgNJS0_M-dzPeQ$Q&dvi0h z`K;I=5!dgm>+dXhk(O)~M9Xi0L(jVm4{EX>}w@4n;^$gs!8taZylM z2~H+3q=m{8+6K3}e=CX()iiH@aZ5{6Q%g%z%kwSIZ*Jb&yrm)g+6&Dd#d^@2df;C? zEvhTGsCJi`F>72F-NH8_%He1Ov@xB}m8^Uw zV^~`c7K~L(>o@Mt?li1b%j%2iRBqMsrAt?=j40-xyp3+~6cvCEi9W&i13Rb&h2<>P zJvXD|=p=e#GCczGW`Y$|BM7>(L3_^Km48!|+5VHoGf!sw%oE)Y3~c=5f>9`$ZT952 z93i6-dV3n}o)&NQqY31G;O&@EELmouV6-K@4eETD_B}N51})XrxCiDRiZTZVrdW7= z^U9Tt%U8VqQq$ID$@SY_Xxiu*nK^!M>Aq;-*whxRe916#MK(3BM#xx%_RXRn&y2VF z5(U%4@YHH^-L08tub3X?Dz26~yLyyG%WFzqZhtoEQn!a3i_p^=y)h?lm$k2nactkV z5Y02b-kXRs|Ou|AR*Zm zbU;-;IuO-Wyawu+ku{4&Sk=-QBXuCP+mI&5*nIlod=Mj85|YitOk}b@R5Y??(#XAb zFq%dD|IaN-i#>#}~@I+#&TcqYL;1&-`c0YVXuTR9EpDtfXKW z>HStQZzT(bteAY)7SWzXphQq4B)dYN4Bk00CM5=3MC9XtDl<6n7ebZG|Do1pQ2x{02!iuLb5B! zsCqE!;A?F&)1`4j23FETE2|lfs{7+1dsbMbWHt|`n4p)QrYD}hzkOonbD(0UnQ<-Z zBdcids%rYA>cN>M2FEMs)@u5}>ia6EJVah8U2EuYQ#Dcubd7fnK~z`q>iK#c7i&6f z^67SVTuSYgoQj^cbo6JSL{KCoyTS)#`_1E>Jw$aCuYpRYWb!%iYf8zTV#=OhM-Q(9 z6{2`TvMZ>lgPOekfvB$HHC%<{gbc2yUF$)GD4vk)3Mv-UwnpeR~5a5fll@uAtU)v? zcME6|WC_Wxph;0Wd^9179!5v7riTmpV#%;fa9+nb(nH(mJKI2uAVCL3CJb<-@g?sV zxr@4Dxz5)#wOBCHB@03o`^3_6b2}Z_eqUuwagLIbiaR^#;EwyO!VV_MeR1Svdi7Uf zDim!Np6#4adp*Y4;d!pM7^5gG#O{RBO^Bq!=N&Cy6|FoH31S(Ayan?jiJk#pQq+uD zEKN2t2A(n45Vexm>Cso{XTOfG0H)Z?CfsX`>OhIuL|B_&g8v%?r z`E88l0E{;o#?T7Dc#~mxL>O-p?A3q)19*=GBqR+`tixNJ9iIVaM|J6KJlFz^H>omq zxCEG;Vx_n7*#{1&5)Vl5Hr`}zbo&8j-oL~hKr*qn14sg3^!z{x-eiI3iUEu_8Ajg- zV7$pNIw}C;O@`5102psFj9)mwc#~n=Ndd;24C7`7Fy3SsHw1v$ZsF>|dxRr6&7G+p zRbp|nPHShK6eqotv@?;iePXiK5iQ%dr)eh~yz^mU_s!ByIrwQ!Yc0Rt{|GlbV%ml> zdSxz0hvM8Ed6b*i>$y2LpPR#vafW`%(TfYYd2JClM;CMR&=PKTE#qeQa&De&{Dg1b>_s_pIgS`E}eJT+hv`8@PGBnVX%PxY@Irn|&{Gb7&hk zZ*J%2ogLgf@-jD%zM^f!u7*i;=ZW=@1#CC6DQzo+)E6jD9$3-p;DM)57r@&Hlv9Sz zqmx8)o;}GZ=MOxigFqCBT!|DLVkr&SVXI{R%0Nz9nodsrMtf7CC#Hy+9^`KVNFQoCnbL+L z3OzPmh;o)0<|$X7t0#lVe`BY%N1@#_1ifd~lIp1@g-$-hdWO+CLUO5gVd7vDAnS$Kk%Tt~8DMmF#R z(JxgbYRMg>I7oO51A0gdad$74<%RO`xP0f){k?ta+JG?^BO)$kqZX(SXqdAli zG*za`$l(J(_wHV7WY_?M7=_;}*uqmZq#N0;Gq&!Z`>;(Y)k6A_?L-b++xrXAfo`8$ z=nGGokW0w;5o5gYf?)Lb*adTV>Vu3R^C)Bfe3NLo*WVYc;VBF4q5HC%+VQ;*=!V!bA1s3uHUI55s_r48^O9O&YT R@&Nx={s_NpY=K|;{~u}aqh0_2 literal 0 HcmV?d00001 diff --git a/tests/test_data/results.sage.tsv b/tests/test_data/results.sage.tsv index 9d6ec16..6543ec3 100644 --- a/tests/test_data/results.sage.tsv +++ b/tests/test_data/results.sage.tsv @@ -1,2 +1,2 @@ -peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity -LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 
71.78844460255384 71.78844460255384 0.0 108.2854 0.0 0.0 0.0 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0 +psm_id peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages semi_enzymatic isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model ion_mobility predicted_mobility delta_mobility matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms2_intensity +1 LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0 0.0 0.8239083 0.503857 72.26591573806016 72.26591573806016 0.0 108.2854 0.993444 0.0 0.993444 0.0 0.0 0.0 22 9 12 0.6315789 64.770966 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 72609170.0 diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index 075b12f..adb6e96 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -3,9 +3,9 @@ import hashlib from psm_utils.io.idxml import IdXMLReader, IdXMLWriter -from psm_utils.io.sage import SageReader -from psm_utils.psm import PSM +from psm_utils.io.sage import SageTSVReader from psm_utils.peptidoform import Peptidoform +from psm_utils.psm import PSM class TestIdXMLReader: @@ -104,7 +104,7 @@ def test_write_file_with_pyopenms_objects(self): def test_write_file_without_pyopenms_objects(self): expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b" - reader = SageReader("./tests/test_data/results.sage.tsv") + reader = SageTSVReader("./tests/test_data/results.sage.tsv") psm_list = reader.read_file() writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML") writer.write_file(psm_list) diff --git a/tests/test_io/test_sage.py b/tests/test_io/test_sage.py index 10d2bcc..60d87ba 100644 --- a/tests/test_io/test_sage.py +++ b/tests/test_io/test_sage.py @@ -1,6 +1,8 @@ """Tests for psm_utils.io.sage.""" -from psm_utils.io.sage import SageReader +import pytest + +from psm_utils.io.sage import SageParquetReader, SageTSVReader from psm_utils.psm import PSM test_psm = PSM( @@ -27,29 +29,52 @@ "missed_cleavages": 0.0, "isotope_error": 0.0, "precursor_ppm": 0.8239083, - "fragment_ppm": 0.5347518, - "hyperscore": 71.78844460255384, - "delta_next": 71.78844460255384, + "fragment_ppm": 0.503857, + "hyperscore": 72.26591573806016, + "delta_next": 72.26591573806016, "delta_best": 0.0, - "delta_rt_model": 0.0, - "aligned_rt": 0.0, + "delta_rt_model": 0.993444, + "aligned_rt": 0.993444, "predicted_rt": 0.0, "matched_peaks": 22.0, "longest_b": 9.0, "longest_y": 12.0, "longest_y_pct": 0.6315789, - "matched_intensity_pct": 50.785, + "matched_intensity_pct": 64.770966, "scored_candidates": 1.0, "poisson": -1.9562811911083433, - "ms1_intensity": 306146180.0, - "ms2_intensity": 56930696.0, + "ms2_intensity": 72609170.0, }, ) -class TestSageReader: +class TestSageTSVReader: def test_iter(self): - with SageReader("./tests/test_data/results.sage.tsv") as reader: + with SageTSVReader("./tests/test_data/results.sage.tsv") as reader: for psm in reader: psm.provenance_data = {} assert psm == test_psm + + +class TestSageParquetReader: + def test_iter(self): + with SageParquetReader("./tests/test_data/results.sage.parquet") as reader: + # Parquet results in float precision differences, so pytest.approx is used, which does + # not support objects 
with nested dicts.
+            for psm in reader:
+                psm_dict = dict(psm)
+                test_psm_dict = dict(test_psm)
+
+                # Nested dicts
+                assert psm_dict.pop("rescoring_features", {}) == pytest.approx(
+                    test_psm_dict.pop("rescoring_features", {})
+                )
+                assert psm_dict.pop("metadata", {}) == test_psm_dict.pop("metadata", {})
+                psm_dict.pop("provenance_data", {})
+
+                # Remaining keys
+                for k, v in psm_dict.items():
+                    if isinstance(v, float):
+                        assert v == pytest.approx(test_psm_dict[k])
+                    else:
+                        assert v == test_psm_dict[k]

From d4854f4637772e15ffce6150ef60aae0e5e6ab67 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 13:31:44 +0200
Subject: [PATCH 05/10] Use pyarrow for iterative Parquet reading instead of
 reading into a pandas DataFrame intermediate

---
 psm_utils/io/proteoscape.py | 93 +++++++++++++++++++------------------
 1 file changed, 49 insertions(+), 44 deletions(-)

diff --git a/psm_utils/io/proteoscape.py b/psm_utils/io/proteoscape.py
index cc0e834..ddc4386 100644
--- a/psm_utils/io/proteoscape.py
+++ b/psm_utils/io/proteoscape.py
@@ -4,13 +4,15 @@
 import re
 from pathlib import Path
 from typing import Union
-from collections import namedtuple

 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq

-from psm_utils import PSM
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
 from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.peptidoform import format_number_as_string

 logger = logging.getLogger(__name__)
@@ -36,31 +38,31 @@ def __init__(
             Path to ProteoScape Parquet file.

         """
-        if isinstance(filename, pd.DataFrame):
-            self.data = filename
-        else:
-            super().__init__(filename, *args, **kwargs)
-            self.data = pd.read_parquet(self.filename)
-
-        self._Row = namedtuple("Row", self.data.columns)
+        self.filename = filename

     def __len__(self):
         """Return number of PSMs in file."""
-        return len(self.data)
+        return pq.read_metadata(self.filename).num_rows

     def __iter__(self):
         """Iterate over file and return PSMs one-by-one."""
-        for entry in self.data.itertuples():
-            yield _parse_entry(entry)
-
-    def __getitem__(self, index):
-        """Return PSM at index."""
-        return _parse_entry(self._Row(*self.data.iloc[index]))
+        with pq.ParquetFile(self.filename) as reader:
+            for batch in reader.iter_batches():
+                for row in batch.to_pylist():
+                    try:
+                        yield _parse_entry(row)
+                    except Exception as e:
+                        raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}") from e

     @classmethod
-    def from_dataframe(cls, dataframe: pd.DataFrame, *args, **kwargs):
-        """Create a ProteoScapeReader from a DataFrame."""
-        return cls(dataframe, *args, **kwargs)
+    def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList:
+        """Create a PSMList from a ProteoScape Pandas DataFrame."""
+        return PSMList(
+            psm_list=[_parse_entry(entry) for entry in dataframe.to_dict(orient="records")]
+        )


 def _parse_peptidoform(
@@ -81,40 +83,43 @@ def _parse_peptidoform(
     return f"{n_term}{''.join(peptidoform)}{c_term}/{precursor_charge}"


-def _parse_entry(entry) -> PSM:
+def _parse_entry(entry: dict) -> PSM:
     """Parse a single entry from ProteoScape Parquet file to PSM object."""
     return PSM(
         peptidoform=_parse_peptidoform(
-            entry.stripped_peptide, entry.ptms, entry.ptm_locations, entry.precursor_charge
+            entry["stripped_peptide"],
+            entry["ptms"],
+            entry["ptm_locations"],
+            entry["precursor_charge"],
         ),
-        spectrum_id=entry.ms2_id,
-        run=getattr(entry, "run", None),
-        is_decoy=all(DECOY_PATTERN.match(p) for p in entry.locus_name),
-        score=entry.x_corr_score,
-        precursor_mz=entry.precursor_mz,
-        retention_time=entry.rt,
-        ion_mobility=entry.ook0,
-        protein_list=list(entry.locus_name),
-        rank=entry.rank,
+        spectrum_id=entry["ms2_id"],
+        run=entry.get("run", None),
+        is_decoy=all(DECOY_PATTERN.match(p) for p in entry["locus_name"]),
+        score=entry["x_corr_score"],
+        precursor_mz=entry["precursor_mz"],
+        retention_time=entry["rt"],
+        ion_mobility=entry["ook0"],
+        protein_list=list(entry["locus_name"]),
+        rank=entry["rank"],
         source="ProteoScape",
         provenance_data={
-            "candidate_id": str(entry.candidate_id),
-            "ms2_id": str(entry.ms2_id),
-            "parent_id": str(entry.parent_id),
+            "candidate_id": str(entry["candidate_id"]),
+            "ms2_id": str(entry["ms2_id"]),
+            "parent_id": str(entry["parent_id"]),
         },
         metadata={
-            "leading_aa": str(entry.leading_aa),
-            "trailing_aa": str(entry.trailing_aa),
-            "corrected_ook0": str(entry.corrected_ook0),
+            "leading_aa": str(entry["leading_aa"]),
+            "trailing_aa": str(entry["trailing_aa"]),
+            "corrected_ook0": str(entry["corrected_ook0"]),
         },
         rescoring_features={
-            "tims_score": float(entry.tims_score),
-            "x_corr_score": float(entry.x_corr_score),
-            "delta_cn_score": float(entry.delta_cn_score),
-            "ppm_error": float(entry.ppm_error),
-            "number_matched_ions": float(entry.number_matched_ions),
-            "number_expected_ions": float(entry.number_expected_ions),
-            "ion_proportion": float(entry.ion_proportion),
-            "spectrum_total_ion_intensity": float(entry.spectrum_total_ion_intensity),
+            "tims_score": float(entry["tims_score"]),
+            "x_corr_score": float(entry["x_corr_score"]),
+            "delta_cn_score": float(entry["delta_cn_score"]),
+            "ppm_error": float(entry["ppm_error"]),
+            "number_matched_ions": float(entry["number_matched_ions"]),
+            "number_expected_ions": float(entry["number_expected_ions"]),
+            "ion_proportion": float(entry["ion_proportion"]),
+            "spectrum_total_ion_intensity": float(entry["spectrum_total_ion_intensity"]),
         },
     )

From 30e2490be136d7ccde7fe50c55cf90d299178c7e Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 13:52:56 +0200
Subject: [PATCH 06/10] Update readme; changelog; version bump

---
 CHANGELOG.md          | 13 +++++++++++++
 README.rst            |  4 +++-
 psm_utils/__init__.py |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e8a279b..ad12218 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.9.0] - 2024-05-01
+
+### Added
+
+- `io`: Read and write support for PSMs in Apache Parquet format for efficient storage of PSM lists.
+- `io.sage`: Support for Sage results in Parquet format (new `SageParquetReader`, renamed `SageReader` to `SageTSVReader`).
+
+### Changed
+
+- Upgrade Pydantic dependency to v2. The PSM `spectrum_id` field is now always coerced to a string.
+- `io.proteoscape`: Use pyarrow to iteratively read from Parquet instead of first reading an entire dataframe with Pandas.
+- `io.sage`: Update compatibility to Sage v0.14.
+
 ## [0.8.3] - 2024-04-16

 ### Added
diff --git a/README.rst b/README.rst
index 3f4c2d1..fede4a6 100644
--- a/README.rst
+++ b/README.rst
@@ -94,11 +94,13 @@ Supported file formats
    `MaxQuant msms.txt `_          ``msms``                  ✅          ❌
    `MS Amanda CSV `_              ``msamanda``              ✅          ❌
    `mzIdentML `_                  ``mzid``                  ✅          ✅
+   `Parquet `_                    ``parquet``               ✅          ✅
    `Peptide Record `_             ``peprec``                ✅          ✅
    `pepXML `_                     ``pepxml``                ✅          ❌
    `Percolator tab `_             ``percolator``            ✅          ✅
    Proteome Discoverer MSF        ``proteome_discoverer``   ✅          ❌
-   `Sage `_                       ``sage``                  ✅          ❌
+   `Sage Parquet `_               ``sage_parquet``          ✅          ❌
+   `Sage TSV `_                   ``sage_tsv``              ✅          ❌
    ProteoScape Parquet            ``proteoscape``           ✅          ❌
    `TSV `_                        ``tsv``                   ✅          ✅
    `X!Tandem XML `_               ``xtandem``               ✅          ❌
diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
index 3922589..7e9087d 100644
--- a/psm_utils/__init__.py
+++ b/psm_utils/__init__.py
@@ -1,6 +1,6 @@
 """Common utilities for parsing and handling PSMs, and search engine results."""

-__version__ = "0.8.3"
+__version__ = "0.9.0"

 __all__ = ["Peptidoform", "PSM", "PSMList"]

 from functools import lru_cache

From 6a8b51f26626f7efcbd90853211093380df7ed66 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 14:31:04 +0200
Subject: [PATCH 07/10] Fix test checksums

---
 tests/test_io/test_idxml.py   |  2 +-
 tests/test_io/test_parquet.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py
index adb6e96..7bf1eac 100644
--- a/tests/test_io/test_idxml.py
+++ b/tests/test_io/test_idxml.py
@@ -103,7 +103,7 @@ def test_write_file_with_pyopenms_objects(self):
         assert sha == expected_sha

     def test_write_file_without_pyopenms_objects(self):
-        expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b"
+        expected_sha = "148889926276fbe391e23ed7952c3a8410fc67ffb099bbf1a72df75f8d727ccd"
         reader = SageTSVReader("./tests/test_data/results.sage.tsv")
         psm_list = reader.read_file()
         writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")
diff --git a/tests/test_io/test_parquet.py b/tests/test_io/test_parquet.py
index b1a1e09..20bb0b0 100644
--- a/tests/test_io/test_parquet.py
+++ b/tests/test_io/test_parquet.py
@@ -1,7 +1,7 @@
 """Tests for psm_utils.io.parquet."""

-import os
 import hashlib
+import os

 from psm_utils.io.parquet import ParquetReader, ParquetWriter
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
@@ -32,16 +32,17 @@
 ]


-
 def compute_checksum(filename):
     hash_func = hashlib.sha256()
-    with open(filename, 'rb') as f:
-        for chunk in iter(lambda: f.read(4096), b''):
+    with open(filename, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
             hash_func.update(chunk)
     return hash_func.hexdigest()

+
 class TestParquetWriter:
-    expected_checksum = "c0782793f8c6fd52e39d5ec1cf5567fb0a7e7e245d795f4f1f720337f756b44c"
+    expected_checksum = "cf3f2e9f073be58612ce81f240da9f4109e1c76eea25f1b7881e09c0a8fdee16"
+
     def test_write_psm(self):
         with ParquetWriter("test.pq") as writer:
             for test_case in test_cases:
@@ -57,6 +58,7 @@ def test_write_file(self):
         actual_checksum = compute_checksum("test.pq")
         assert actual_checksum == self.expected_checksum, "Checksums do not match"
         # os.remove("test.pq")

+
 class TestParquetReader:
     def test_iter(self):
         # Write test cases to file

From 0f2643cbce30f285ca93a90b9a6f4a5e2ed3a486 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 14:49:54 +0200
Subject: [PATCH 08/10] Remove temporary patch for caching Proforma
 modification resolvers

---
 CHANGELOG.md          |  1 +
 psm_utils/__init__.py | 12 ------------
 pyproject.toml        |  2 +-
 3 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad12218..f54d0fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Upgrade Pydantic dependency to v2. The PSM `spectrum_id` field is now always coerced to a string.
 - `io.proteoscape`: Use pyarrow to iteratively read from Parquet instead of first reading an entire dataframe with Pandas.
 - `io.sage`: Update compatibility to Sage v0.14.
+- Remove temporary patch for caching Proforma modification resolvers (now in Pyteomics v4.7.2).

 ## [0.8.3] - 2024-04-16

diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
index 7e9087d..4cf394e 100644
--- a/psm_utils/__init__.py
+++ b/psm_utils/__init__.py
@@ -3,7 +3,6 @@
 __version__ = "0.9.0"

 __all__ = ["Peptidoform", "PSM", "PSMList"]

-from functools import lru_cache
 from warnings import filterwarnings

 # mzmlb is not used, so hdf5plugin is not needed
@@ -14,17 +13,6 @@
     module="psims.mzmlb",
 )

-from pyteomics.proforma import (  # noqa: E402
-    GenericResolver,
-    PSIModResolver,
-    UnimodResolver,
-)
-
 from psm_utils.peptidoform import Peptidoform  # noqa: E402
 from psm_utils.psm import PSM  # noqa: E402
 from psm_utils.psm_list import PSMList  # noqa: E402
-
-# Temporary patch until released in pyteomics (see levitsky/pyteomics#147)
-UnimodResolver.resolve = lru_cache(maxsize=None)(UnimodResolver.resolve)
-PSIModResolver.resolve = lru_cache(maxsize=None)(PSIModResolver.resolve)
-GenericResolver.resolve = lru_cache(maxsize=None)(GenericResolver.resolve)
diff --git a/pyproject.toml b/pyproject.toml
index a2f3629..edaf8b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "pyarrow",
     "pydantic >= 2",
     "pyopenms",
-    "pyteomics >= 4, <4.7",
+    "pyteomics >= 4.7.2",
     "rich",
     "sqlalchemy",
 ]

From 301ae766b36d5c658d7f52e1fa208dd746e85637 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 15:08:42 +0200
Subject: [PATCH 09/10] Add dev dependencies for wheel test

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index d02c1d5..4bd65db 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -30,7 +30,7 @@ jobs:
         run: python -m build --sdist --wheel .

       - name: Install wheel
-        run: pip install dist/psm_utils-*.whl
+        run: pip install dist/psm_utils-*.whl[dev]

       - name: Test wheel
         run: |

From 6e518962bd9f050318bfaf332070e867167e9502 Mon Sep 17 00:00:00 2001
From: RalfG
Date: Wed, 1 May 2024 15:12:42 +0200
Subject: [PATCH 10/10] Fix publish workflow

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 4bd65db..6b5e117 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -24,13 +24,13 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install build
+          pip install build pytest

       - name: Build
         run: python -m build --sdist --wheel .

       - name: Install wheel
-        run: pip install dist/psm_utils-*.whl[dev]
+        run: pip install dist/psm_utils-*.whl

       - name: Test wheel
         run: |