# Source code for gluonts.transform.sampler

# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

from typing import Tuple

import numpy as np
from pydantic import BaseModel

from gluonts.dataset.stat import ScaleHistogram


class InstanceSampler(BaseModel):
    """
    Base class for samplers that choose split indices along a time series.

    An instance is called with the time series ``ts`` and is expected to
    return the indices at which training instances will be generated.
    Subclasses supply the actual strategy by overriding ``__call__``.

    Sampled indices ``i`` lie in the closed range ``a <= i <= b``, where
    ``a = min_past`` and ``b = ts.shape[axis] - min_future``.
    """

    # Axis of ``ts`` whose length determines the upper index bound.
    axis: int = -1
    # Lower index bound ``a``.
    min_past: int = 0
    # Amount subtracted from the axis length to obtain the upper bound ``b``.
    min_future: int = 0

    class Config:
        arbitrary_types_allowed = True

    def _get_bounds(self, ts: np.ndarray) -> Tuple[int, int]:
        """Return the inclusive ``(lower, upper)`` index bounds for ``ts``."""
        lower = self.min_past
        upper = ts.shape[self.axis] - self.min_future
        return lower, upper

    def __call__(self, ts: np.ndarray) -> np.ndarray:
        """Sample indices for ``ts``; subclasses must override this."""
        raise NotImplementedError()


class UniformSplitSampler(InstanceSampler):
    """
    Selects every admissible time point independently with the same fixed
    probability.

    Parameters
    ----------
    p
        Probability of selecting a time point
    """

    p: float

    def __call__(self, ts: np.ndarray) -> np.ndarray:
        """Return the randomly selected indices within the bounds of ``ts``."""
        lower, upper = self._get_bounds(ts)

        # No admissible index: the series is too short for the bounds.
        if lower > upper:
            return np.array([], dtype=int)

        # One uniform draw per candidate index; keep the draws below ``p``.
        draws = np.random.random_sample(upper - lower + 1)
        selected = np.where(draws < self.p)[0]
        return selected + lower


class PredictionSplitSampler(InstanceSampler):
    """
    Sampler used for prediction.

    Always picks the upper bound of the admissible index range, i.e. the
    forecast point of the time series. When the range is empty, an empty
    array is returned if ``allow_empty_interval`` is set; otherwise an
    assertion fails.
    """

    allow_empty_interval: bool = False

    def __call__(self, ts: np.ndarray) -> np.ndarray:
        """Return a one-element array holding the upper bound, or an empty one."""
        lower, upper = self._get_bounds(ts)
        has_room = lower <= upper
        assert self.allow_empty_interval or has_room
        if has_room:
            return np.array([upper])
        return np.array([], dtype=int)


def ValidationSplitSampler(
    axis: int = -1, min_past: int = 0, min_future: int = 0
) -> PredictionSplitSampler:
    """
    Build a ``PredictionSplitSampler`` with ``allow_empty_interval=True``.

    Parameters
    ----------
    axis
        Forwarded to the sampler's ``axis`` field.
    min_past
        Forwarded to the sampler's ``min_past`` field.
    min_future
        Forwarded to the sampler's ``min_future`` field.
    """
    sampler_kwargs = dict(
        axis=axis,
        min_past=min_past,
        min_future=min_future,
        allow_empty_interval=True,
    )
    return PredictionSplitSampler(**sampler_kwargs)


def TestSplitSampler(
    axis: int = -1, min_past: int = 0
) -> PredictionSplitSampler:
    """
    Build a ``PredictionSplitSampler`` with ``allow_empty_interval=False``
    and ``min_future=0``.

    Parameters
    ----------
    axis
        Forwarded to the sampler's ``axis`` field.
    min_past
        Forwarded to the sampler's ``min_past`` field.
    """
    sampler_kwargs = dict(
        axis=axis,
        min_past=min_past,
        min_future=0,
        allow_empty_interval=False,
    )
    return PredictionSplitSampler(**sampler_kwargs)


class ExpectedNumInstanceSampler(InstanceSampler):
    """
    Tracks a running average of the admissible window length over the series
    seen so far, and sets the per-point selection probability so that, on
    average, `num_instances` training examples are generated per time series.

    Series whose admissible window is empty yield no indices and do not
    contribute to the running average.

    Parameters
    ----------

    num_instances
        number of training examples generated per time series on average
    """

    num_instances: float
    # Sum of the window sizes of all series counted so far.
    total_length: int = 0
    # Number of series counted so far.
    n: int = 0

    def __call__(self, ts: np.ndarray) -> np.ndarray:
        """Return the randomly selected indices within the bounds of ``ts``."""
        lower, upper = self._get_bounds(ts)
        num_candidates = upper - lower + 1

        if num_candidates <= 0:
            return np.array([], dtype=int)

        # Fold this series into the running statistics before computing the
        # probability, so the current series is part of the average.
        self.n += 1
        self.total_length += num_candidates
        mean_length = self.total_length / self.n

        if mean_length <= 0:
            return np.array([], dtype=int)

        threshold = self.num_instances / mean_length
        draws = np.random.random_sample(num_candidates)
        selected = np.where(draws < threshold)[0]
        return selected + lower


class BucketInstanceSampler(InstanceSampler):
    """
    This sampler can be used when working with a set of time series that has
    a skewed distribution. For instance, if the dataset contains many time
    series with small values and few with large values.

    Each admissible time point of ``ts`` is selected with probability
    ``1 / scale_histogram.count(ts)``, i.e. the inverse of the count the
    histogram reports for that series.

    Parameters
    ----------
    scale_histogram
        The histogram of scale for the time series. Here scale is the mean abs
        value of the time series.
    """

    scale_histogram: ScaleHistogram

    def __call__(self, ts: np.ndarray) -> np.ndarray:
        """
        Return the randomly selected indices within the bounds of ``ts``.

        An empty integer array is returned when the admissible window is
        empty, matching the behaviour of the other samplers in this module.
        """
        a, b = self._get_bounds(ts)
        window_size = b - a + 1

        # A series shorter than min_past + min_future gives a non-positive
        # window; passing a negative size to random_sample would raise.
        if window_size <= 0:
            return np.array([], dtype=int)

        # NOTE(review): assumes count(ts) is nonzero for every series passed
        # in; a zero count would raise ZeroDivisionError -- confirm with
        # ScaleHistogram.
        p = 1.0 / self.scale_histogram.count(ts)
        (indices,) = np.where(np.random.random_sample(window_size) < p)
        return indices + a


class ContinuousTimePointSampler(BaseModel):
    """
    Abstract class for "continuous time" samplers, which, given the length of
    an interval, sample "points" (events) in continuous time from within it.

    The admissible range is ``[a, b]`` with ``a = min_past`` and
    ``b = interval_length - min_future``.
    """

    # Lower bound ``a`` of the admissible range.
    min_past: float = 0.0
    # Amount subtracted from the interval length to obtain the upper bound.
    min_future: float = 0.0

    def _get_bounds(self, interval_length: float) -> Tuple[float, float]:
        """Return the ``(lower, upper)`` bounds for the given interval length."""
        return (
            self.min_past,
            interval_length - self.min_future,
        )

    def __call__(self, interval_length: float) -> np.ndarray:
        """
        Returns points in the real interval between :code:`a` and :code:`b`,
        where :code:`a = min_past` and
        :code:`b = interval_length - min_future`.

        Subclasses must override this method.

        Parameters
        ----------
        interval_length
            Length of the interval from which the bounds are derived.
        """
        raise NotImplementedError()


class ContinuousTimeUniformSampler(ContinuousTimePointSampler):
    """
    Simple random sampler that draws ``num_instances`` points uniformly from
    the continuous interval between :code:`a` and :code:`b`.
    """

    num_instances: int

    def __call__(self, interval_length: float) -> np.ndarray:
        """Return the sampled points, or an empty array if ``a > b``."""
        start, end = self._get_bounds(interval_length)
        if start <= end:
            # Rescale draws from [0, 1) to [start, end).
            unit_draws = np.random.rand(self.num_instances)
            return unit_draws * (end - start) + start
        return np.array([])


class ContinuousTimePredictionSampler(ContinuousTimePointSampler):
    """
    Continuous-time sampler that always returns the upper bound of the
    admissible interval.

    When the interval is empty, an empty array is returned if
    ``allow_empty_interval`` is set; otherwise an assertion fails.
    """

    allow_empty_interval: bool = False

    def __call__(self, interval_length: float) -> np.ndarray:
        """Return a one-element array holding the upper bound, or an empty one."""
        start, end = self._get_bounds(interval_length)
        non_empty = start <= end
        assert (
            self.allow_empty_interval or non_empty
        ), "Interval start time must be before interval end time."
        if non_empty:
            return np.array([end])
        return np.array([])