8000 [ENH] Add Subsequence Extraction Transformer to `sktime.transformations` module by wirrywoo · Pull Request #6967 · sktime/sktime · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

[ENH] Add Subsequence Extraction Transformer to sktime.transformations module #6967

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .all-contributorsrc
Original file line number Diff line number Diff line change
Expand Up @@ -2978,6 +2978,15 @@
"maintenance"
]
},
{
"login": "wirrywoo",
"name": "Wilson Cheung",
"avatar_url": "https://avatars.githubusercontent.com/u/148647848?v=4?s=100",
"profile": "https://github.com/wirrywoo",
"contributions": [
"code"
]
},
{
"login": "janasberger",
"name": "Jana Schmidberger",
Expand Down
10 changes: 9 additions & 1 deletion docs/source/api_reference/transformations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,15 @@ These transformers create a series based on a sequence of sliding windows.

HOG1DTransformer

.. currentmodule:: sktime.transformations.series.subsequence_extraction

.. autosummary::
:toctree: auto_generated/
:template: class.rst

SubsequenceExtractionTransformer


Multivariate-to-univariate
~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -702,7 +711,6 @@ These transformations ensure all series in a panel have equal length

TruncationTransformer


Dimension reduction
~~~~~~~~~~~~~~~~~~~

Expand Down
197 changes: 197 additions & 0 deletions sktime/transformations/series/subsequence_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""Subsequence extraction transformer.

A transformer for the extraction of contiguous subsequences of specified
length based on maximal/minimal rolling window aggregates.
"""

# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)

import warnings
from functools import partial

import numpy as np
import pandas as pd
from numpy._core._multiarray_umath import _ArrayFunctionDispatcher

from sktime.transformations.base import BaseTransformer

__all__ = ["SubsequenceExtractionTransformer"]
__author__ = ["wirrywoo"]


class SubsequenceExtractionTransformer(BaseTransformer):
r"""
Extract contiguous subsequences of specified length based on rolling aggregates.

A transformer for the extraction of contiguous subsequences of specified
length based on maximal/minimal rolling window aggregates.

Given a sequence :math:`\\{x_1, x_2, \cdots, x_n \\}` and ``subseq_len`` integer
:math:`k` such that :math:`0 < k \leq n`, the transformer's task is to find index
:math:`i` satisfying :math:`1 \leq i \leq i + k - 1 \leq n` such that for given
``aggregate_fn`` :math:`A: \mathbb{R}^k \longrightarrow \mathbb{R}`:

1. :math:`A(x_{i}, \cdots, x_{i+k-1})` is maximal when ``selector = 'max'``, and
2. :math:`A(x_{i}, \cdots, x_{i+k-1})` is minimal when ``selector = 'min'``.

The `maximum sum subarray problem <https://en.wikipedia.org/wiki/Maximum_subarray_problem>`_
is a special case and can be obtained by setting ``aggregate_fn = np.sum`` and
``selector = 'max'``.

Parameters
----------
aggregate_fn : callable of signature ``np.ndarray -> float``
Callable function in ``numpy`` used to aggregate values in contiguous
subsequence to a scalar.
subseq_len : int
Length of the subsequence in .iloc units. Must be less than the lengths of all
input series.
kwargs : dict, default: None
Dictionary of additional keyword arguments to pass to aggregate_fn.
selector : {'max', 'min'}, default: 'max'
Function used to decide which subsequence to return from the set of scalars or
primitives.

Examples
--------
>>> from sktime.transformations.series.subsequence_extraction import (
>>> SubsequenceExtractionTransformer
>>> )
>>> from sktime.utils._testing.hierarchical import _make_hierarchical
>>> X = _make_hierarchical(same_cutoff=False)
>>> subseq_extract = SubsequenceExtractionTransformer(
>>> aggregate_fn = np.sum, subseq_len = 3)
>>> subseq_extract.fit(X)
>>> X_transformed = subseq_extract.transform(X)

References
----------
Jon Bentley. 1984. Programming pearls: algorithm design techniques.
Commun. ACM 27, 9 (Sept. 1984), 865-873. https://doi.org/10.1145/358234.381162
"""

_tags = {
"univariate-only": False,
"authors": ["wirrywoo"],
"maintainers": ["wirrywoo"],
"scitype:transform-input": "Series",
"scitype:transform-output": "Series",
"scitype:instancewise": False,
"scitype:transform-labels": "None",
"X_inner_mtype": "pd.DataFrame",
"fit_is_empty": False,
"capability:inverse_transform": False,
"capability:unequal_length:removes": True,
"handles-missing-data": False,
}

def __init__(self, aggregate_fn, subseq_len, kwargs=None, selector="max"):
self.aggregate_fn = aggregate_fn
self.subseq_len = subseq_len
self.kwargs = kwargs
self.selector = selector

super().__init__()

def _fit(self, X, y=None):
"""Fit transformer to X and y.

private _fit containing the core logic, called from fit

Parameters
----------
X : nested pandas DataFrame of shape [n_instances, n_features]
each cell of X must contain pandas.Series
Data to fit transform to
y : ignored argument for interface compatibility
Additional data, e.g., labels for transformation

Returns
-------
self : reference to self
"""
if self.subseq_len > len(X):
raise ValueError(
f"Subsequence length parameter ({self.subseq_len}) is not less \
than or equal to the minimum sequence length of X ({len(X)})."
)

if not (isinstance(self.aggregate_fn, _ArrayFunctionDispatcher)):
raise ValueError(
f"{self.aggregate_fn} is not supported for parameter aggregate_fn"
)

if self.selector not in ["max", "min"]:
raise ValueError(f"{self.selector} is not supported for parameter selector")

return self

def _transform(self, X, y=None):
"""Transform X and return a transformed version.

private _transform containing core logic, called from transform

Parameters
----------
X : nested pandas DataFrame of shape [n_instances, n_features]
each cell of X must contain pandas.Series
Data to transform
y : ignored argument for interface compatibility

Returns
-------
Xt : nested pandas DataFrame of shape [n_instances, n_features]
each cell of Xt contains pandas.Series
transformed version of X
"""
index_list = X.index.get_level_values(X.index.names[-1])

with warnings.catch_warnings():
warnings.simplefilter(action="ignore", category=FutureWarning)
fnc = partial(self.aggregate_fn, **(self.kwargs or {}))
X_aggregate = getattr(X.rolling(window=self.subseq_len), "agg")(
fnc.func, **fnc.keywords
).dropna()

indices = getattr(X_aggregate, f"idx{self.selector}")()

upper = pd.Categorical(indices, categories=index_list, ordered=True).codes + 1
lower = upper - self.subseq_len

dfs = [
X[col].iloc[l:u].reset_index(drop=True)
for col, l, u in zip(X.columns, lower, upper)
]

return pd.concat(dfs, axis=1, ignore_index=False)

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return ``"default"`` set.

Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
instance.
``create_test_instance`` uses the first (or only) dictionary in ``params``
"""
params = [
{
"subseq_len": 3,
"aggregate_fn": np.average,
"kwargs": {"weights": [0.5, 0.3, 0.2], "axis": 0},
"selector": "max",
},
{"subseq_len": 5, "aggregate_fn": np.median, "selector": "min"},
{"subseq_len": 8, "aggregate_fn": np.mean},
]
return params
Loading
0