# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from copy import deepcopy
from dataclasses import dataclass, field
from typing import Any, cast, Dict, Iterator, List, Optional, Union
import pandas as pd
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
from toolz import valmap
from gluonts.dataset.common import DataEntry, ProcessDataEntry
from gluonts.dataset.field_names import FieldName
@dataclass
class PandasDataset:
    """
    A pandas.DataFrame-based dataset type.

    This class is constructed with a collection of pandas.DataFrame-objects
    where each ``DataFrame`` is representing one time series.
    A ``target`` column and time information (either a ``timestamp`` column
    or a time-like index) are essential. Furthermore, static/dynamic
    real/categorical features can be specified.

    The dataframes passed in are not modified; index conversion and sorting
    are applied to internal copies only.

    Parameters
    ----------
    dataframes
        Single ``pd.DataFrame``/``pd.Series`` or a collection as list or dict
        containing at least ``timestamp`` and ``target`` values.
        If a Dict is provided, the key will be the associated ``item_id``.
    target
        Name of the column that contains the ``target`` time series.
        For multivariate targets, a list of column names should be provided.
        A list with a single name is treated like a plain string.
    timestamp
        Name of the column that contains the timestamp information.
        If ``None``, the index of each dataframe is used as time axis.
    freq
        Frequency of observations in the time series. Must be a valid pandas
        frequency. If not given, it is taken from the index of the first
        dataframe after conversion to a ``pd.PeriodIndex``.
    feat_dynamic_real
        List of column names that contain dynamic real features.
    feat_dynamic_cat
        List of column names that contain dynamic categorical features.
    feat_static_real
        List of column names that contain static real features.
    feat_static_cat
        List of column names that contain static categorical features.
    past_feat_dynamic_real
        List of column names that contain dynamic real features only for the
        history.
    ignore_last_n_targets
        For target and past dynamic features last ``ignore_last_n_targets``
        elements are removed when iterating over the data set. This becomes
        important when the predictor is called.
    """

    dataframes: Union[
        pd.DataFrame,
        pd.Series,
        List[pd.DataFrame],
        List[pd.Series],
        Dict[str, pd.DataFrame],
        Dict[str, pd.Series],
    ]
    target: Union[str, List[str]] = "target"
    timestamp: Optional[str] = None
    freq: Optional[str] = None
    feat_dynamic_real: List[str] = field(default_factory=list)
    feat_dynamic_cat: List[str] = field(default_factory=list)
    feat_static_real: List[str] = field(default_factory=list)
    feat_static_cat: List[str] = field(default_factory=list)
    past_feat_dynamic_real: List[str] = field(default_factory=list)
    ignore_last_n_targets: int = 0

    def __post_init__(self) -> None:
        """
        Normalise the input into ``self._dataframes`` and build the
        ``ProcessDataEntry`` callable stored in ``self.process``.
        """
        # A one-element target list is handled as a univariate target.
        if isinstance(self.target, list) and len(self.target) == 1:
            self.target = self.target[0]
        self.one_dim_target = not isinstance(self.target, list)

        if is_series(self.dataframes):
            self.dataframes = series_to_dataframe(self.dataframes)

        # store data internally as
        # List[Tuple[Optional[str], pandas.DataFrame]];
        # if the first element is not None it will be set in ``DataEntry``
        # as ``item_id``.
        if isinstance(self.dataframes, dict):
            self._dataframes = list(self.dataframes.items())
        elif isinstance(self.dataframes, list):
            self._dataframes = [(None, df) for df in self.dataframes]
        else:  # case single dataframe
            self._dataframes = [(None, self.dataframes)]

        for i, (item_id, df) in enumerate(self._dataframes):
            if self.timestamp:
                df = df.set_index(keys=self.timestamp)

            if not isinstance(df.index, pd.PeriodIndex):
                # Shallow copy first: without ``timestamp`` ``df`` is still
                # the caller's object and assigning to ``df.index`` would
                # change it in place.
                df = df.copy(deep=False)
                df.index = pd.to_datetime(df.index)
                df = df.to_period(freq=self.freq)

            # Non-inplace sort for the same reason: a frame that already
            # carries a PeriodIndex is still the caller's object here.
            df = df.sort_index()

            assert is_uniform(df.index), (
                "Dataframe index is not uniformly spaced. "
                "If your dataframe contains data from multiple series in the "
                'same column ("long" format), consider constructing the '
                "dataset with `PandasDataset.from_long_dataframe` instead."
            )

            self._dataframes[i] = (item_id, df)

        if not self.freq:  # infer frequency from index
            self.freq = self._dataframes[0][1].index.freqstr

        self.process = ProcessDataEntry(
            cast(str, self.freq), one_dim_target=self.one_dim_target
        )

    def _dataentry(
        self, item_id: Optional[str], df: pd.DataFrame
    ) -> DataEntry:
        """
        Build the raw ``DataEntry`` for one dataframe using the column
        names configured on this dataset; ``item_id`` is added to the entry
        only when it is not ``None``.
        """
        dataentry = as_dataentry(
            data=df,
            target=self.target,
            feat_dynamic_real=self.feat_dynamic_real,
            feat_dynamic_cat=self.feat_dynamic_cat,
            feat_static_real=self.feat_static_real,
            feat_static_cat=self.feat_static_cat,
            past_feat_dynamic_real=self.past_feat_dynamic_real,
        )
        if item_id is not None:
            dataentry["item_id"] = item_id
        return dataentry

    def __iter__(self) -> Iterator[DataEntry]:
        """
        Yield one processed ``DataEntry`` per stored dataframe. When
        ``ignore_last_n_targets`` is non-zero, each entry is passed through
        ``prepare_prediction_data`` before being yielded.
        """
        for item_id, df in self._dataframes:
            dataentry = self.process(self._dataentry(item_id, df))
            if self.ignore_last_n_targets:
                dataentry = prepare_prediction_data(
                    dataentry, self.ignore_last_n_targets
                )
            yield dataentry

    def __len__(self) -> int:
        """Return the number of time series (dataframes) in the dataset."""
        return len(self._dataframes)

    @classmethod
    def from_long_dataframe(
        cls, dataframe: pd.DataFrame, item_id: str, **kwargs
    ) -> "PandasDataset":
        """
        Construct ``PandasDataset`` out of a long dataframe.

        A long dataframe uses the long format for each variable. Target time
        series values, for example, are stacked on top of each other rather
        than side-by-side. The same is true for other dynamic or categorical
        features.

        Parameters
        ----------
        dataframe
            pandas.DataFrame containing at least ``timestamp``, ``target`` and
            ``item_id`` columns.
        item_id
            Name of the column that, when grouped by, gives the different time
            series.
        **kwargs
            Additional arguments. Same as of PandasDataset class.

        Returns
        -------
        PandasDataset
            Gluonts dataset based on ``pandas.DataFrame``s.
        """
        # One dataframe per group, keyed by the group's ``item_id`` value.
        grouped = dict(list(dataframe.groupby(item_id)))
        return cls(dataframes=grouped, **kwargs)
def series_to_dataframe(
    series: Union[pd.Series, List[pd.Series], Dict[str, pd.Series]]
) -> Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]:
    """
    Convert a ``pd.Series`` (or a list/dict of them) into ``pd.DataFrame``
    objects with a single column named ``"target"``.

    Parameters
    ----------
    series
        A single series, a list of series, or a dict mapping keys to series.
        Every series must have a datetime-like index.

    Returns
    -------
    Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]
        The converted data in the same container shape as the input
        (single object, list, or dict with the same keys).

    Raises
    ------
    AssertionError
        If the index of a series is not an instance of
        ``DatetimeIndexOpsMixin``.
    """

    def to_df(single: pd.Series) -> pd.DataFrame:
        assert isinstance(
            single.index, DatetimeIndexOpsMixin
        ), "series index has to be a DatetimeIndex."
        return single.to_frame(name="target")

    if isinstance(series, list):
        return [to_df(single) for single in series]
    if isinstance(series, dict):
        return {key: to_df(single) for key, single in series.items()}
    return to_df(series)
def is_series(series: Any) -> bool:
    """
    Return True if ``series`` is ``pd.Series`` or a collection of
    ``pd.Series``.

    For a list only the first element is inspected; for a dict only the
    first value. Empty lists and dicts contain no series and therefore
    yield ``False`` instead of raising ``IndexError``.
    """
    if isinstance(series, dict):
        series = list(series.values())
    if isinstance(series, list):
        # Guard the ``series[0]`` lookup against empty collections.
        return bool(series) and is_series(series[0])
    return isinstance(series, pd.Series)
def as_dataentry(
    data: pd.DataFrame,
    target: Union[str, List[str]],
    timestamp: Optional[str] = None,
    feat_dynamic_real: Optional[List[str]] = None,
    feat_dynamic_cat: Optional[List[str]] = None,
    feat_static_real: Optional[List[str]] = None,
    feat_static_cat: Optional[List[str]] = None,
    past_feat_dynamic_real: Optional[List[str]] = None,
) -> DataEntry:
    """
    Convert a single time series (uni- or multi-variate) that is given in
    a pandas.DataFrame format to a DataEntry.

    Parameters
    ----------
    data
        pandas.DataFrame containing the ``target`` column(s), any feature
        columns named below and, if ``timestamp`` is given, that column.
    target
        Name of the column that contains the ``target`` time series.
        For multivariate targets ``target`` is expecting a list of column
        names.
    timestamp
        Name of the column that contains the timestamp information.
        If ``None`` the index of ``data`` is assumed to be the time.
    feat_dynamic_real
        List of column names that contain dynamic real features.
    feat_dynamic_cat
        List of column names that contain dynamic categorical features.
    feat_static_real
        List of column names that contain static real features.
    feat_static_cat
        List of column names that contain static categorical features.
    past_feat_dynamic_real
        List of column names that contain dynamic real features only for
        the history.

    Returns
    -------
    DataEntry
        A dictionary with at least ``target`` and ``start`` field. Feature
        fields are only present when the corresponding list of column
        names is non-empty.
    """
    # The series starts at the first timestamp value (or first index entry).
    start = data.loc[:, timestamp].iloc[0] if timestamp else data.index[0]
    dataentry = {FieldName.START: start}

    def set_field(fieldname, col_names, f=lambda x: x):
        # Skip the field entirely when no columns were requested
        # (``None`` or an empty list).
        if col_names:
            dataentry[fieldname] = [
                f(data.loc[:, n].to_list()) for n in col_names
            ]

    if isinstance(target, str):
        # Univariate target: a flat list of values.
        dataentry[FieldName.TARGET] = data.loc[:, target].to_list()
    else:
        # Multivariate target: one list of values per target column.
        set_field(FieldName.TARGET, target)

    set_field(FieldName.FEAT_DYNAMIC_REAL, feat_dynamic_real)
    set_field(FieldName.FEAT_DYNAMIC_CAT, feat_dynamic_cat)
    # Static features keep only the first value of each column.
    set_field(FieldName.FEAT_STATIC_REAL, feat_static_real, lambda x: x[0])
    set_field(FieldName.FEAT_STATIC_CAT, feat_static_cat, lambda x: x[0])
    set_field(FieldName.PAST_FEAT_DYNAMIC_REAL, past_feat_dynamic_real)
    return dataentry
def prepare_prediction_data(
    dataentry: DataEntry, ignore_last_n_targets: int
) -> DataEntry:
    """
    Remove ``ignore_last_n_targets`` values from ``target`` and
    ``past_feat_dynamic_real``. Works in univariate and multivariate case.

    The input ``dataentry`` is left untouched; a deep copy is returned.
    With ``ignore_last_n_targets=0`` the copy is returned unchanged.

    Parameters
    ----------
    dataentry
        Entry whose ``target`` / ``past_feat_dynamic_real`` values (when
        present) support indexing of the form ``[..., :-n]``.
    ignore_last_n_targets
        Number of trailing values to drop along the last axis. Expected to
        be non-negative.

    Returns
    -------
    DataEntry
        A copy of ``dataentry`` with the trailing values removed.

    >>> import numpy as np
    >>> prepare_prediction_data(
    ...     {"target": np.array([1., 2., 3., 4.])}, ignore_last_n_targets=2
    ... )
    {'target': array([1., 2.])}
    """
    entry = deepcopy(dataentry)
    # ``x[..., :-0]`` is ``x[..., :0]`` and would drop *every* value, so
    # slicing is skipped when there is nothing to remove.
    if ignore_last_n_targets:
        for fname in [FieldName.TARGET, FieldName.PAST_FEAT_DYNAMIC_REAL]:
            if fname in entry:
                entry[fname] = entry[fname][..., :-ignore_last_n_targets]
    return entry