From 96c029dd367d44f11eeef86df794df3b5d9cca7b Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Tue, 13 Aug 2024 23:28:53 +0100 Subject: [PATCH 01/12] Create subsequence_extraction.py --- .../panel/subsequence_extraction.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 sktime/transformations/panel/subsequence_extraction.py diff --git a/sktime/transformations/panel/subsequence_extraction.py b/sktime/transformations/panel/subsequence_extraction.py new file mode 100644 index 00000000000..861432512f7 --- /dev/null +++ b/sktime/transformations/panel/subsequence_extraction.py @@ -0,0 +1,148 @@ +"""Subsequence extraction transformer - extract subsequences of specified length that +meet some criterion with respect to an aggregate function.""" + +# copyright: sktime developers, BSD-3-Clause License (see LICENSE file) + +import numpy as np +import pandas as pd + +from sktime.transformations.base import BaseTransformer +from sktime.transformations.panel.padder import PaddingTransformer + +__all__ = ["SubsequenceExtractionTransformer"] +__author__ = ['wirrywoo'] + + +class SubsequenceExtractionTransformer(BaseTransformer): + """ + Parameters + ---------- + subsequence_len : int + Length of the subsequence. Must be less than the lengths of all input series. + aggregate : {'mean', 'median'}, default 'mean' + Function used to aggregate all values in subsequence to a scalar or primitive. + method : {'max', 'min'}, default 'max' + Function used to decide which subsequence to return from the set of scalars or primitives. + + Examples + -------- + >>> from sktime.transformations.panel.subsequence_extraction import SubsequenceExtractionTransformer + >>> from sktime.utils._testing.hierarchical import _make_hierarchical + >>> X = _make_hierarchical(same_cutoff=False) + >>> subseq_extract = SubsequenceExtractionTransformer(subsequence_len = 3) + >>> subseq_extract.fit(X) + >>> X_transformed = subseq_extract.transform(X) + """ + + _tags = { + "univariate-only": False, + "authors": ["wirrywoo"], + "maintainers": ["wirrywoo"], + "scitype:transform-input": "Series", + "scitype:transform-output": "Series", + "scitype:instancewise": False, + "scitype:transform-labels": "None", + "X_inner_mtype": "pd.DataFrame", + "fit_is_empty": False, + "capability:inverse_transform": False, + "capability:unequal_length:removes": True, + "handles-missing-data": False, + } + + def __init__(self, subsequence_len, aggregate="mean", method="max"): + self.subsequence_len = subsequence_len + self.aggregate = aggregate + self.method = method + + super().__init__() + + def _fit(self, X, y=None): + """Fit transformer to X and y. + + private _fit containing the core logic, called from fit + + Parameters + ---------- + X : nested pandas DataFrame of shape [n_instances, n_features] + each cell of X must contain pandas.Series + Data to fit transform to + y : ignored argument for interface compatibility + Additional data, e.g., labels for transformation + + Returns + ------- + self : reference to self + """ + + self.X_padded = PaddingTransformer(fill_value=np.nan).fit_transform(X) + self.X_aggregate = self.X_padded.rolling(window = self.subsequence_len) + + if self.aggregate == "mean": + self.X_aggregate = self.X_aggregate.mean() + elif self.aggregate == "median": + self.X_aggregate = self.X_aggregate.median() + else: + raise ValueError(f"{self.aggregate} is currently not supported for parameter aggregate") + + try: + if self.method == "max": + self.indices = self.X_aggregate.dropna().idxmax() + elif self.method == "min": + self.indices = self.X_aggregate.dropna().idxmin() + else: + raise ValueError(f"{self.method} is currently not supported for parameter method") + except ValueError: + raise ValueError(f"Subsequence length parameter ({self.subsequence_len}) is not less than minimum sequence length of X ({len(X)}).") + + return self + + def _transform(self, X, y=None): + """Transform X and return a transformed version. + + private _transform containing core logic, called from transform + + Parameters + ---------- + X : nested pandas DataFrame of shape [n_instances, n_features] + each cell of X must contain pandas.Series + Data to transform + y : ignored argument for interface compatibility + + Returns + ------- + Xt : nested pandas DataFrame of shape [n_instances, n_features] + each cell of Xt contains pandas.Series + transformed version of X + """ + + upper = self.indices + 1 + lower = upper - self.subsequence_len + + dfs = [X[col].iloc[l:u].reset_index(drop=True) + for col, l, u in zip(X.columns, lower, upper)] + + return pd.concat(dfs, axis=1, ignore_index=False) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return ``"default"`` set. + + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test + instance. + ``create_test_instance`` uses the first (or only) dictionary in ``params`` + """ + + params = {"subsequence_len": 3} + return params From ade59a1df772463133e77f584636b69eb5d7ee97 Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Tue, 13 Aug 2024 23:51:33 +0100 Subject: [PATCH 02/12] updated docs --- .all-contributorsrc | 11 ++++++++++- docs/source/api_reference/transformations.rst | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index e44e1806a95..c381eb1c445 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -2965,6 +2965,15 @@ "code", "maintenance" ] - } + }, + { + "login": "wirrywoo", + "name": "Wilson Cheung", + "avatar_url": "https://avatars.githubusercontent.com/u/148647848?v=4?s=100", + "profile": "https://github.com/wirrywoo", + "contributions": [ + "code" + ] + } ] } diff --git a/docs/source/api_reference/transformations.rst b/docs/source/api_reference/transformations.rst index 58204f34e4a..4b454c086ca 100644 --- a/docs/source/api_reference/transformations.rst +++ b/docs/source/api_reference/transformations.rst @@ -702,6 +702,14 @@ These transformations ensure all series in a panel have equal length TruncationTransformer +.. currentmodule:: sktime.transformations.panel.subsequence_extraction + +.. autosummary:: + :toctree: auto_generated/ + :template: class.rst + + SubsequenceExtractionTransformer + Dimension reduction ~~~~~~~~~~~~~~~~~~~ From 02d431559f84e56aeffb6472fae7231dfcd8b42f Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Wed, 14 Aug 2024 11:49:22 +0100 Subject: [PATCH 03/12] removed PaddingTransformer and addressed lint --- .../panel/subsequence_extraction.py | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/sktime/transformations/panel/subsequence_extraction.py b/sktime/transformations/panel/subsequence_extraction.py index 861432512f7..292177c3c8a 100644 --- a/sktime/transformations/panel/subsequence_extraction.py +++ b/sktime/transformations/panel/subsequence_extraction.py @@ -1,20 +1,20 @@ """Subsequence extraction transformer - extract subsequences of specified length that -meet some criterion with respect to an aggregate function.""" +meet some criterion with respect to an aggregate function. +""" # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) -import numpy as np import pandas as pd from sktime.transformations.base import BaseTransformer -from sktime.transformations.panel.padder import PaddingTransformer __all__ = ["SubsequenceExtractionTransformer"] -__author__ = ['wirrywoo'] +__author__ = ["wirrywoo"] class SubsequenceExtractionTransformer(BaseTransformer): """ + Parameters ---------- subsequence_len : int @@ -22,11 +22,14 @@ class SubsequenceExtractionTransformer(BaseTransformer): aggregate : {'mean', 'median'}, default 'mean' Function used to aggregate all values in subsequence to a scalar or primitive. method : {'max', 'min'}, default 'max' - Function used to decide which subsequence to return from the set of scalars or primitives. + Function used to decide which subsequence to return from the set of scalars or + primitives. Examples -------- - >>> from sktime.transformations.panel.subsequence_extraction import SubsequenceExtractionTransformer + >>> from sktime.transformations.panel.subsequence_extraction import ( + >>> SubsequenceExtractionTransformer + >>> ) >>> from sktime.utils._testing.hierarchical import _make_hierarchical >>> X = _make_hierarchical(same_cutoff=False) >>> subseq_extract = SubsequenceExtractionTransformer(subsequence_len = 3) @@ -73,16 +76,16 @@ def _fit(self, X, y=None): ------- self : reference to self """ - - self.X_padded = PaddingTransformer(fill_value=np.nan).fit_transform(X) - self.X_aggregate = self.X_padded.rolling(window = self.subsequence_len) + self.X_aggregate = X.rolling(window=self.subsequence_len) if self.aggregate == "mean": self.X_aggregate = self.X_aggregate.mean() elif self.aggregate == "median": self.X_aggregate = self.X_aggregate.median() else: - raise ValueError(f"{self.aggregate} is currently not supported for parameter aggregate") + raise ValueError( + f"{self.aggregate} is currently not supported for parameter aggregate" + ) try: if self.method == "max": @@ -90,9 +93,14 @@ def _fit(self, X, y=None): elif self.method == "min": self.indices = self.X_aggregate.dropna().idxmin() else: - raise ValueError(f"{self.method} is currently not supported for parameter method") + raise ValueError( + f"{self.method} is currently not supported for parameter method" + ) except ValueError: - raise ValueError(f"Subsequence length parameter ({self.subsequence_len}) is not less than minimum sequence length of X ({len(X)}).") + raise ValueError( + f"Subsequence length parameter ({self.subsequence_len}) is not less \ + than minimum sequence length of X ({len(X)})." + ) return self @@ -114,13 +122,17 @@ def _transform(self, X, y=None): each cell of Xt contains pandas.Series transformed version of X """ - - upper = self.indices + 1 + index_list = X.index.get_level_values(X.index.names[-1]) + upper = ( + pd.Categorical(self.indices, categories=index_list, ordered=True).codes + 1 + ) lower = upper - self.subsequence_len - dfs = [X[col].iloc[l:u].reset_index(drop=True) - for col, l, u in zip(X.columns, lower, upper)] - + dfs = [ + X[col].iloc[l:u].reset_index(drop=True) + for col, l, u in zip(X.columns, lower, upper) + ] + return pd.concat(dfs, axis=1, ignore_index=False) @classmethod @@ -143,6 +155,5 @@ def get_test_params(cls, parameter_set="default"): instance. ``create_test_instance`` uses the first (or only) dictionary in ``params`` """ - - params = {"subsequence_len": 3} + params = [{"subsequence_len": 3}, {"subsequence_len": 5}] return params From 4a947a05bfecf368bcbb709e21e4acb67c1dcab7 Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Wed, 14 Aug 2024 12:11:16 +0100 Subject: [PATCH 04/12] resolved lint issues --- sktime/transformations/panel/subsequence_extraction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sktime/transformations/panel/subsequence_extraction.py b/sktime/transformations/panel/subsequence_extraction.py index 292177c3c8a..69b37a0d277 100644 --- a/sktime/transformations/panel/subsequence_extraction.py +++ b/sktime/transformations/panel/subsequence_extraction.py @@ -1,6 +1,4 @@ -"""Subsequence extraction transformer - extract subsequences of specified length that -meet some criterion with respect to an aggregate function. -""" +"""Subsequence extraction - extract subsequences of equal length based on conditions.""" # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) @@ -14,6 +12,10 @@ class SubsequenceExtractionTransformer(BaseTransformer): """ + Extract subsequences of specified length based on various conditions. + + Aims to identify subsequences of specified length that yields the maximal/minimal + rolling mean/median. Parameters ---------- From 33e20d44454005dc642f34745086c20bc0f3c14c Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Wed, 14 Aug 2024 14:19:35 +0100 Subject: [PATCH 05/12] refactored error handling --- .../panel/subsequence_extraction.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/sktime/transformations/panel/subsequence_extraction.py b/sktime/transformations/panel/subsequence_extraction.py index 69b37a0d277..6730bab633b 100644 --- a/sktime/transformations/panel/subsequence_extraction.py +++ b/sktime/transformations/panel/subsequence_extraction.py @@ -1,4 +1,8 @@ -"""Subsequence extraction - extract subsequences of equal length based on conditions.""" +"""Subsequence extraction transformer. + +A transformer for the extraction of subsequences of specified length based on +maximal/minimal rolling aggregates. +""" # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) @@ -12,10 +16,10 @@ class SubsequenceExtractionTransformer(BaseTransformer): """ - Extract subsequences of specified length based on various conditions. + Extract subsequences of specified length based on rolling aggregatess. - Aims to identify subsequences of specified length that yields the maximal/minimal - rolling mean/median. + A transformer for the extraction of subsequences of specified length based on + maximal/minimal rolling aggregates. Parameters ---------- @@ -78,32 +82,28 @@ def _fit(self, X, y=None): ------- self : reference to self """ - self.X_aggregate = X.rolling(window=self.subsequence_len) + if self.subsequence_len > len(X): + raise ValueError( + f"Subsequence length parameter ({self.subsequence_len}) is not less \ + than or equal to the minimum sequence length of X ({len(X)})." + ) - if self.aggregate == "mean": - self.X_aggregate = self.X_aggregate.mean() - elif self.aggregate == "median": - self.X_aggregate = self.X_aggregate.median() - else: + if self.aggregate not in ["mean", "median"]: raise ValueError( f"{self.aggregate} is currently not supported for parameter aggregate" ) - try: - if self.method == "max": - self.indices = self.X_aggregate.dropna().idxmax() - elif self.method == "min": - self.indices = self.X_aggregate.dropna().idxmin() - else: - raise ValueError( - f"{self.method} is currently not supported for parameter method" - ) - except ValueError: + if self.method not in ["max", "min"]: raise ValueError( - f"Subsequence length parameter ({self.subsequence_len}) is not less \ - than minimum sequence length of X ({len(X)})." + f"{self.method} is currently not supported for parameter method" ) + self.X_aggregate = getattr( + X.rolling(window=self.subsequence_len), self.aggregate + )().dropna() + + self.indices = getattr(self.X_aggregate, f"idx{self.method}")() + return self def _transform(self, X, y=None): From 9391fdcbb3897c6866aeed710f24ce0ce79c6b3f Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Wed, 14 Aug 2024 14:51:36 +0100 Subject: [PATCH 06/12] moved to series folder from panel --- docs/source/api_reference/transformations.rst | 18 +++++++++--------- .../subsequence_extraction.py | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) rename sktime/transformations/{panel => series}/subsequence_extraction.py (97%) diff --git a/docs/source/api_reference/transformations.rst b/docs/source/api_reference/transformations.rst index 4b454c086ca..42373d54093 100644 --- a/docs/source/api_reference/transformations.rst +++ b/docs/source/api_reference/transformations.rst @@ -597,6 +597,15 @@ These transformers create a series based on a sequence of sliding windows. HOG1DTransformer +.. currentmodule:: sktime.transformations.series.subsequence_extraction + +.. autosummary:: + :toctree: auto_generated/ + :template: class.rst + + SubsequenceExtractionTransformer + + Multivariate-to-univariate ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -702,15 +711,6 @@ These transformations ensure all series in a panel have equal length TruncationTransformer -.. currentmodule:: sktime.transformations.panel.subsequence_extraction - -.. autosummary:: - :toctree: auto_generated/ - :template: class.rst - - SubsequenceExtractionTransformer - - Dimension reduction ~~~~~~~~~~~~~~~~~~~ diff --git a/sktime/transformations/panel/subsequence_extraction.py b/sktime/transformations/series/subsequence_extraction.py similarity index 97% rename from sktime/transformations/panel/subsequence_extraction.py rename to sktime/transformations/series/subsequence_extraction.py index 6730bab633b..4cb3b822568 100644 --- a/sktime/transformations/panel/subsequence_extraction.py +++ b/sktime/transformations/series/subsequence_extraction.py @@ -1,7 +1,7 @@ """Subsequence extraction transformer. A transformer for the extraction of subsequences of specified length based on -maximal/minimal rolling aggregates. +maximal/minimal rolling window aggregates. """ # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) @@ -19,7 +19,7 @@ class SubsequenceExtractionTransformer(BaseTransformer): Extract subsequences of specified length based on rolling aggregatess. A transformer for the extraction of subsequences of specified length based on - maximal/minimal rolling aggregates. + maximal/minimal rolling window aggregates. Parameters ---------- @@ -33,7 +33,7 @@ class SubsequenceExtractionTransformer(BaseTransformer): Examples -------- - >>> from sktime.transformations.panel.subsequence_extraction import ( + >>> from sktime.transformations.series.subsequence_extraction import ( >>> SubsequenceExtractionTransformer >>> ) >>> from sktime.utils._testing.hierarchical import _make_hierarchical From 71f768cc9db74b51610d23bcb65657b2fc2581f9 Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Wed, 14 Aug 2024 17:40:23 +0100 Subject: [PATCH 07/12] redefine fit function and improved test cases --- .../series/subsequence_extraction.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sktime/transformations/series/subsequence_extraction.py b/sktime/transformations/series/subsequence_extraction.py index 4cb3b822568..e337b9eb8e4 100644 --- a/sktime/transformations/series/subsequence_extraction.py +++ b/sktime/transformations/series/subsequence_extraction.py @@ -98,12 +98,6 @@ def _fit(self, X, y=None): f"{self.method} is currently not supported for parameter method" ) - self.X_aggregate = getattr( - X.rolling(window=self.subsequence_len), self.aggregate - )().dropna() - - self.indices = getattr(self.X_aggregate, f"idx{self.method}")() - return self def _transform(self, X, y=None): @@ -125,8 +119,15 @@ def _transform(self, X, y=None): transformed version of X """ index_list = X.index.get_level_values(X.index.names[-1]) + + X_aggregate = getattr( + X.rolling(window=self.subsequence_len), self.aggregate + )().dropna() + + indices = getattr(X_aggregate, f"idx{self.method}")() + upper = ( - pd.Categorical(self.indices, categories=index_list, ordered=True).codes + 1 + pd.Categorical(indices, categories=index_list, ordered=True).codes + 1 ) lower = upper - self.subsequence_len @@ -157,5 +158,14 @@ def get_test_params(cls, parameter_set="default"): instance. ``create_test_instance`` uses the first (or only) dictionary in ``params`` """ - params = [{"subsequence_len": 3}, {"subsequence_len": 5}] + params = [ + { + "subsequence_len": 3 + }, + { + "subsequence_len": 5, + "aggregate": "median", + "method": "min" + } + ] return params From 3e843f3e40ea5558235fb2d2dddf49adf4e9d1c7 Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Thu, 15 Aug 2024 10:10:05 +0100 Subject: [PATCH 08/12] resolve failed ruff-format test --- .../series/subsequence_extraction.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/sktime/transformations/series/subsequence_extraction.py b/sktime/transformations/series/subsequence_extraction.py index e337b9eb8e4..67eb1700956 100644 --- a/sktime/transformations/series/subsequence_extraction.py +++ b/sktime/transformations/series/subsequence_extraction.py @@ -126,9 +126,7 @@ def _transform(self, X, y=None): indices = getattr(X_aggregate, f"idx{self.method}")() - upper = ( - pd.Categorical(indices, categories=index_list, ordered=True).codes + 1 - ) + upper = pd.Categorical(indices, categories=index_list, ordered=True).codes + 1 lower = upper - self.subsequence_len dfs = [ @@ -159,13 +157,7 @@ def get_test_params(cls, parameter_set="default"): ``create_test_instance`` uses the first (or only) dictionary in ``params`` """ params = [ - { - "subsequence_len": 3 - }, - { - "subsequence_len": 5, - "aggregate": "median", - "method": "min" - } + {"subsequence_len": 3}, + {"subsequence_len": 5, "aggregate": "median", "method": "min"}, ] return params From eef8b96b0d41a97e73dff459d9ab87336cdbbba5 Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Mon, 19 Aug 2024 11:08:54 +0100 Subject: [PATCH 09/12] addressed all fkiraly's suggestions for improvement --- .../series/subsequence_extraction.py | 93 ++++++++++++------- 1 file changed, 62 insertions(+), 31 deletions(-) diff --git a/sktime/transformations/series/subsequence_extraction.py b/sktime/transformations/series/subsequence_extraction.py index 67eb1700956..c943fc9f579 100644 --- a/sktime/transformations/series/subsequence_extraction.py +++ b/sktime/transformations/series/subsequence_extraction.py @@ -1,12 +1,17 @@ """Subsequence extraction transformer. -A transformer for the extraction of subsequences of specified length based on -maximal/minimal rolling window aggregates. +A transformer for the extraction of contiguous subsequences of specified +length based on maximal/minimal rolling window aggregates. """ # copyright: sktime developers, BSD-3-Clause License (see LICENSE file) +import warnings +from functools import partial + +import numpy as np import pandas as pd +from numpy._core._multiarray_umath import _ArrayFunctionDispatcher from sktime.transformations.base import BaseTransformer @@ -15,19 +20,33 @@ class SubsequenceExtractionTransformer(BaseTransformer): - """ - Extract subsequences of specified length based on rolling aggregatess. + r""" + Extract contiguous subsequences of specified length based on rolling aggregates. + + A transformer for the extraction of contiguous subsequences of specified + length based on maximal/minimal rolling window aggregates. + + Given a sequence :math:`\\{x_1, x_2, \cdots, x_n \\}` and `subseq_len` integer + :math:`k` such that :math:`0 < k \leq n`, the transformer's task is to find index + :math:`i` satisfying :math:`1 \leq i \leq i + k - 1 \leq n` such that for given + `aggregate_fn` :math:`A: \mathbb{R}^k \longrightarrow \mathbb{R}`: + + 1. :math:`A(x_{i}, \cdots, x_{i+k-1})` is maximal when `selector = 'max'`, and + 2. :math:`A(x_{i}, \cdots, x_{i+k-1})` is minimal when `selector = 'min'`. - A transformer for the extraction of subsequences of specified length based on - maximal/minimal rolling window aggregates. + When `aggregate_fn = np.sum` and `selector = 'max'`, the problem degenerates to the + `maximum sum subarray problem `_. Parameters ---------- - subsequence_len : int - Length of the subsequence. Must be less than the lengths of all input series. - aggregate : {'mean', 'median'}, default 'mean' - Function used to aggregate all values in subsequence to a scalar or primitive. - method : {'max', 'min'}, default 'max' + subseq_len : int + Length of the subsequence in .iloc units. Must be less than the lengths of all + input series. + aggregate_fn : np._core._multiarray_umath._ArrayFunctionDispatcher, default: np.sum + NumPy callable used to aggregate values in contiguous subsequence to a scalar. + kwargs : dict, default: {} + Dictionary of additional keyword arguments to pass to aggregate_fn. + selector : {'max', 'min'}, default: 'max' Function used to decide which subsequence to return from the set of scalars or primitives. @@ -38,9 +57,14 @@ class SubsequenceExtractionTransformer(BaseTransformer): >>> ) >>> from sktime.utils._testing.hierarchical import _make_hierarchical >>> X = _make_hierarchical(same_cutoff=False) - >>> subseq_extract = SubsequenceExtractionTransformer(subsequence_len = 3) + >>> subseq_extract = SubsequenceExtractionTransformer(subseq_len = 3) >>> subseq_extract.fit(X) >>> X_transformed = subseq_extract.transform(X) + + References + ---------- + Jon Bentley. 1984. Programming pearls: algorithm design techniques. + Commun. ACM 27, 9 (Sept. 1984), 865-873. https://doi.org/10.1145/358234.381162 """ _tags = { @@ -58,10 +82,11 @@ class SubsequenceExtractionTransformer(BaseTransformer): "handles-missing-data": False, } - def __init__(self, subsequence_len, aggregate="mean", method="max"): - self.subsequence_len = subsequence_len - self.aggregate = aggregate - self.method = method + def __init__(self, subseq_len, aggregate_fn=np.sum, kwargs={}, selector="max"): + self.subseq_len = subseq_len + self.aggregate_fn = aggregate_fn + self.kwargs = kwargs + self.selector = selector super().__init__() @@ -82,21 +107,19 @@ def _fit(self, X, y=None): ------- self : reference to self """ - if self.subsequence_len > len(X): + if self.subseq_len > len(X): raise ValueError( - f"Subsequence length parameter ({self.subsequence_len}) is not less \ + f"Subsequence length parameter ({self.subseq_len}) is not less \ than or equal to the minimum sequence length of X ({len(X)})." ) - if self.aggregate not in ["mean", "median"]: + if not (isinstance(self.aggregate_fn, _ArrayFunctionDispatcher)): raise ValueError( - f"{self.aggregate} is currently not supported for parameter aggregate" + f"{self.aggregate_fn} is not supported for parameter aggregate" ) - if self.method not in ["max", "min"]: - raise ValueError( - f"{self.method} is currently not supported for parameter method" - ) + if self.selector not in ["max", "min"]: + raise ValueError(f"{self.selector} is not supported for parameter selector") return self @@ -120,14 +143,17 @@ def _transform(self, X, y=None): """ index_list = X.index.get_level_values(X.index.names[-1]) - X_aggregate = getattr( - X.rolling(window=self.subsequence_len), self.aggregate - )().dropna() + with warnings.catch_warnings(): + warnings.simplefilter(action="ignore", category=FutureWarning) + fnc = partial(self.aggregate_fn, **self.kwargs) + X_aggregate = getattr(X.rolling(window=self.subseq_len), "agg")( + fnc.func, **fnc.keywords + ).dropna() - indices = getattr(X_aggregate, f"idx{self.method}")() + indices = getattr(X_aggregate, f"idx{self.selector}")() upper = pd.Categorical(indices, categories=index_list, ordered=True).codes + 1 - lower = upper - self.subsequence_len + lower = upper - self.subseq_len dfs = [ X[col].iloc[l:u].reset_index(drop=True) @@ -157,7 +183,12 @@ def get_test_params(cls, parameter_set="default"): ``create_test_instance`` uses the first (or only) dictionary in ``params`` """ params = [ - {"subsequence_len": 3}, - {"subsequence_len": 5, "aggregate": "median", "method": "min"}, + { + "subseq_len": 3, + "aggregate_fn": np.average, + "kwargs": {"weights": [0.5, 0.3, 0.2], "axis": 0}, + "selector": "max", + }, + {"subseq_len": 5, "aggregate_fn": np.median, "selector": "min"}, ] return params From 6d2c976c0ed3e06325aff1f9658525dee060cd76 Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Mon, 19 Aug 2024 13:19:01 +0100 Subject: [PATCH 10/12] addressed fkiraly's code review --- .../series/subsequence_extraction.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/sktime/transformations/series/subsequence_extraction.py b/sktime/transformations/series/subsequence_extraction.py index c943fc9f579..b64612a26ae 100644 --- a/sktime/transformations/series/subsequence_extraction.py +++ b/sktime/transformations/series/subsequence_extraction.py @@ -26,25 +26,26 @@ class SubsequenceExtractionTransformer(BaseTransformer): A transformer for the extraction of contiguous subsequences of specified length based on maximal/minimal rolling window aggregates. - Given a sequence :math:`\\{x_1, x_2, \cdots, x_n \\}` and `subseq_len` integer + Given a sequence :math:`\\{x_1, x_2, \cdots, x_n \\}` and ``subseq_len`` integer :math:`k` such that :math:`0 < k \leq n`, the transformer's task is to find index :math:`i` satisfying :math:`1 \leq i \leq i + k - 1 \leq n` such that for given - `aggregate_fn` :math:`A: \mathbb{R}^k \longrightarrow \mathbb{R}`: + ``aggregate_fn`` :math:`A: \mathbb{R}^k \longrightarrow \mathbb{R}`: - 1. :math:`A(x_{i}, \cdots, x_{i+k-1})` is maximal when `selector = 'max'`, and - 2. :math:`A(x_{i}, \cdots, x_{i+k-1})` is minimal when `selector = 'min'`. + 1. :math:`A(x_{i}, \cdots, x_{i+k-1})` is maximal when ``selector = 'max'``, and + 2. :math:`A(x_{i}, \cdots, x_{i+k-1})` is minimal when ``selector = 'min'``. - When `aggregate_fn = np.sum` and `selector = 'max'`, the problem degenerates to the - `maximum sum subarray problem `_. + The `maximum sum subarray problem `_ + is a special case and can be obtained by setting ``aggregate_fn = np.sum`` and + ``selector = 'max'``. Parameters ---------- + aggregate_fn : np._core._multiarray_umath._ArrayFunctionDispatcher + NumPy callable used to aggregate values in contiguous subsequence to a scalar. subseq_len : int Length of the subsequence in .iloc units. Must be less than the lengths of all input series. - aggregate_fn : np._core._multiarray_umath._ArrayFunctionDispatcher, default: np.sum - NumPy callable used to aggregate values in contiguous subsequence to a scalar. - kwargs : dict, default: {} + kwargs : dict, default: None Dictionary of additional keyword arguments to pass to aggregate_fn. selector : {'max', 'min'}, default: 'max' Function used to decide which subsequence to return from the set of scalars or @@ -82,9 +83,9 @@ class SubsequenceExtractionTransformer(BaseTransformer): "handles-missing-data": False, } - def __init__(self, subseq_len, aggregate_fn=np.sum, kwargs={}, selector="max"): - self.subseq_len = subseq_len + def __init__(self, aggregate_fn, subseq_len, kwargs=None, selector="max"): self.aggregate_fn = aggregate_fn + self.subseq_len = subseq_len self.kwargs = kwargs self.selector = selector @@ -115,7 +116,7 @@ def _fit(self, X, y=None): if not (isinstance(self.aggregate_fn, _ArrayFunctionDispatcher)): raise ValueError( - f"{self.aggregate_fn} is not supported for parameter aggregate" + f"{self.aggregate_fn} is not supported for parameter aggregate_fn" ) if self.selector not in ["max", "min"]: @@ -145,7 +146,7 @@ def _transform(self, X, y=None): with warnings.catch_warnings(): warnings.simplefilter(action="ignore", category=FutureWarning) - fnc = partial(self.aggregate_fn, **self.kwargs) + fnc = partial(self.aggregate_fn, **(self.kwargs or {})) X_aggregate = getattr(X.rolling(window=self.subseq_len), "agg")( fnc.func, **fnc.keywords ).dropna() @@ -172,7 +173,6 @@ def get_test_params(cls, parameter_set="default"): Name of the set of test parameters to return, for use in tests. If no special parameters are defined for a value, will return ``"default"`` set. - Returns ------- params : dict or list of dict, default = {} @@ -190,5 +190,6 @@ def get_test_params(cls, parameter_set="default"): "selector": "max", }, {"subseq_len": 5, "aggregate_fn": np.median, "selector": "min"}, + {"subseq_len": 8, "aggregate_fn": np.mean}, ] return params From 83e37f7007d23c637a1fbed06aa75c4fcca12fda Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Mon, 19 Aug 2024 16:36:51 +0100 Subject: [PATCH 11/12] updated aggregate_fn description and updated example --- sktime/transformations/series/subsequence_extraction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sktime/transformations/series/subsequence_extraction.py b/sktime/transformations/series/subsequence_extraction.py index b64612a26ae..01b49d8485d 100644 --- a/sktime/transformations/series/subsequence_extraction.py +++ b/sktime/transformations/series/subsequence_extraction.py @@ -40,8 +40,9 @@ class SubsequenceExtractionTransformer(BaseTransformer): Parameters ---------- - aggregate_fn : np._core._multiarray_umath._ArrayFunctionDispatcher - NumPy callable used to aggregate values in contiguous subsequence to a scalar. + aggregate_fn : callable of signature ``np.ndarray -> float`` + Callable function in ``numpy`` used to aggregate values in contiguous + subsequence to a scalar. subseq_len : int Length of the subsequence in .iloc units. Must be less than the lengths of all input series. @@ -58,7 +59,8 @@ class SubsequenceExtractionTransformer(BaseTransformer): >>> ) >>> from sktime.utils._testing.hierarchical import _make_hierarchical >>> X = _make_hierarchical(same_cutoff=False) - >>> subseq_extract = SubsequenceExtractionTransformer(subseq_len = 3) + >>> subseq_extract = SubsequenceExtractionTransformer( + >>> aggregate_fn = np.sum, subseq_len = 3) >>> subseq_extract.fit(X) >>> X_transformed = subseq_extract.transform(X) From 70af559b78cd78d595f7fbbd67a80cfc1953ae5e Mon Sep 17 00:00:00 2001 From: Wilson Cheung Date: Tue, 20 Aug 2024 12:47:53 +0100 Subject: [PATCH 12/12] resolved formatting in .all-contributorsrc --- .all-contributorsrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index ba557ee5cec..6c124f58b05 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -2984,7 +2984,7 @@ "avatar_url": "https://avatars.githubusercontent.com/u/148647848?v=4?s=100", "profile": "https://github.com/wirrywoo", "contributions": [ - "code" + "code" ] }, {