# Source code for gluonts.dataset.hierarchical

# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.


# Standard library imports
from typing import Optional

# Third-party imports
import numpy as np
import pandas as pd

# First-party imports
from gluonts.dataset.pandas import PandasDataset


class HierarchicalTimeSeries:
    r"""
    Class for representing hierarchical time series.

    The hierarchy is represented by the standard aggregation matrix `S`.
    The time series at the bottom (leaf) level of the hierarchy
    (`ts_at_bottom_level`) are assumed to be given by the columns of
    a single pandas dataframe.

    The ordering of columns of `ts_at_bottom_level` should be consistent
    with the ordering of the columns of `S`.

    On construction the aggregated series are computed from the leaves and
    stored in `ts_aggregated`; `ts_at_all_levels` holds the aggregated series
    followed by the leaf series, with columns relabelled `0 .. num_ts - 1`.

    Parameters
    ----------
    ts_at_bottom_level
        A single pandas dataframe whose columns are the time series
        corresponding to the leaves of the hierarchy. Its index must be a
        `pd.PeriodIndex`.
    S
        Summation or aggregation matrix of shape `(num_ts, k)`, where `k` is
        the number of leaves of the hierarchy. Its rows are ordered like the
        columns of `ts_at_all_levels` (aggregated series first, then leaves)
        and its columns like the columns of `ts_at_bottom_level`.
        In particular, the bottom `k x k` sub-matrix should be an identity
        matrix.
    """

    def __init__(
        self,
        ts_at_bottom_level: pd.DataFrame,
        S: np.ndarray,
    ):
        assert isinstance(ts_at_bottom_level.index, pd.PeriodIndex), (
            "Index of `ts_at_bottom_level` must be an instance of "
            "`pd.PeriodIndex`."
        )

        self._freq = ts_at_bottom_level.index.freqstr

        self._S = S
        self.ts_at_bottom_level = ts_at_bottom_level

        self.ts_aggregated = HierarchicalTimeSeries.aggregate_ts(
            ts_at_bottom_level=self.ts_at_bottom_level,
            S=self._S,
        )

        # Aggregated series come first so that the column order matches the
        # row order of `S` (aggregation rows on top, identity at the bottom).
        self._ts_at_all_levels = pd.concat(
            [self.ts_aggregated, self.ts_at_bottom_level],
            axis=1,
        )
        # Replace the original labels by positional ones: 0 .. num_ts - 1.
        self._ts_at_all_levels.columns = list(range(self.num_ts))

    @property
    def freq(self) -> str:
        """Frequency string of the index of `ts_at_bottom_level`."""
        return self._freq

    @property
    def ts_at_all_levels(self) -> pd.DataFrame:
        """Aggregated series followed by the bottom-level series."""
        return self._ts_at_all_levels

    @property
    def S(self) -> np.ndarray:
        """The aggregation matrix passed to the constructor."""
        return self._S

    @property
    def num_ts(self) -> int:
        """Total number of time series, i.e. the number of rows of `S`."""
        return self._S.shape[0]

    @property
    def num_bottom_ts(self) -> int:
        """Number of bottom-level series, i.e. the number of columns of `S`."""
        return self._S.shape[1]

    @staticmethod
    def aggregate_ts(
        ts_at_bottom_level: pd.DataFrame,
        S: np.ndarray,
    ) -> pd.DataFrame:
        """
        Constructs aggregated time series according to the
        summation/aggregation matrix `S`.

        Parameters
        ----------
        ts_at_bottom_level
            A single pandas dataframe whose columns are the time series
            corresponding to the leaves of the hierarchy.
        S
            Summation or aggregation matrix of shape `(num_ts, k)`, where `k`
            is the number of leaves of the hierarchy. The first `num_ts - k`
            rows hold the aggregation weights and the bottom `k x k`
            sub-matrix should be an identity matrix.

        Returns
        -------
        pd.DataFrame
            A pandas dataframe consisting of aggregated time series
            (at all non-leaf levels). It shares the index of
            `ts_at_bottom_level` and has one column `agg_ts_{i}` per
            aggregation row of `S`; it has no columns when `S` contains no
            aggregation rows.

        Raises
        ------
        AssertionError
            If the number of columns of `S` differs from the number of
            columns of `ts_at_bottom_level`, or if the last `k` rows of `S`
            are not the identity matrix.
        """
        num_ts, num_bottom_ts = S.shape
        num_agg_ts = num_ts - num_bottom_ts

        assert ts_at_bottom_level.shape[1] == num_bottom_ts, (
            "Number of columns of the aggregation matrix `S` and "
            "the dataframe `ts_at_bottom_level` should be the same. "
            f"But shape of `S`: {S.shape} and shape of `ts_at_bottom_level`: "
            f"{ts_at_bottom_level.shape}."
        )

        # Last `num_bottom_ts` rows contain the identity matrix.
        assert (S[num_agg_ts:, :] == np.eye(num_bottom_ts)).all(), (
            f"The last {num_bottom_ts} rows of aggregation matrix `S`"
            f" should contain Identity matrix."
        )

        # First `num_agg_ts` rows contain the aggregation information.
        S_sum = S[:num_agg_ts, :]

        # Construct all aggregated time series with one matrix product:
        # entry (t, i) is the dot product of the leaf values at time `t` with
        # the i-th aggregation row. This avoids a Python-level loop over the
        # rows of the dataframe and also yields a well-defined, column-less
        # frame when `S` has no aggregation rows.
        return pd.DataFrame(
            ts_at_bottom_level.to_numpy() @ S_sum.T,
            index=ts_at_bottom_level.index,
            columns=[f"agg_ts_{i}" for i in range(num_agg_ts)],
        )

    def to_dataset(
        self,
        feat_dynamic_real: Optional[pd.DataFrame] = None,
    ):
        """
        Convert the hierarchical time series into
        `gluonts.dataset.PandasDataset`.

        Note: Currently only dynamic real features are used by the hierarchical
        model. However, the model internally creates a categorical feature
        to distinguish between different time series of the hierarchy.

        Parameters
        ----------
        feat_dynamic_real
            A pandas dataframe containing dynamic features as columns.
            Note that features of any (or all) time series in the hierarchy
            can be passed here, since all time series are considered together
            as a single multivariate time series.
            Its index must start at the same time point as the target and
            cover every time point of the target. The dataframe itself is
            not modified.

        Returns
        -------
        PandasDataset
            An instance of `PandasDataset` built from `ts_at_all_levels` and
            the features, whose columns are prefixed with
            `feat_dynamic_real_`. `future_length` is the number of rows by
            which the features exceed the target.

        Raises
        ------
        AssertionError
            If the features do not start at the same time point as the
            target or do not cover all time points of the target.
        """
        if feat_dynamic_real is None:
            features = pd.DataFrame()
            future_length = 0
        else:
            target_index = self.ts_at_all_levels.index

            assert target_index[0] == feat_dynamic_real.index[0], (
                "The starting time point of dynamic features should match "
                "with that of the hierarchical time series. "
                f"Start of `feat_dynamic_real`: "
                f"{feat_dynamic_real.index[0]} and "
                f"the start of hierarchical time series: "
                f"{self.ts_at_all_levels.index[0]}."
            )

            assert feat_dynamic_real.index.intersection(target_index).equals(
                target_index
            ), (
                "Dynamic features should be provided for all time "
                "points where the target is defined. "
                f"Index of `feat_dynamic_real`: {feat_dynamic_real.index}, \n"
                f"Index of `ts_at_all_levels` of `hts`: "
                f"{self.ts_at_all_levels.index}. \n "
                "Check if the periods of these indices also match. \n"
            )

            # Rename on a copy: relabelling the caller's dataframe in place
            # would leak the prefix to the caller and prefix the columns
            # again on every subsequent call.
            features = feat_dynamic_real.copy()
            features.columns = [
                f"feat_dynamic_real_{col}" for col in feat_dynamic_real.columns
            ]

            # Rows of the features beyond the target range are future values.
            future_length = len(feat_dynamic_real.index) - len(target_index)

        pandas_ds = PandasDataset(
            dataframes=pd.concat(
                [self.ts_at_all_levels, features],
                axis=1,
            ),
            target=list(self.ts_at_all_levels.columns),
            feat_dynamic_real=list(features.columns),
            future_length=future_length,
        )

        return pandas_ds