[Download]

Synthetic Data Generation Tutorial

In [1]:
import json
from itertools import islice
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)
In [2]:
from gluonts.dataset.artificial import recipe as rcp
from gluonts.core.serde import dump_code, load_code
In [3]:
# plotting utils

def plot_recipe(recipe, length):
    output_dict = rcp.evaluate(recipe, length)
    K = len(output_dict)
    lct = MultipleLocator(288)
    minor = AutoMinorLocator(12)

    fig, axs = plt.subplots(K, 1, figsize=(16, 2 * len(recipe)))
    for i, k in enumerate(output_dict):
        axs[i].xaxis.set_major_locator(lct)
        axs[i].xaxis.set_minor_locator(minor)
        axs[i].plot(output_dict[k])
        axs[i].grid()
        axs[i].set_ylabel(k)


def plot_examples(target, length, num, anomaly_indicator=None):
    fix, axs = plt.subplots(num, 1, figsize=(16, num * 2))
    for i in range(num):
        xx = rcp.evaluate(
            dict(
                target=target,
                anomaly_indicator=anomaly_indicator
            ), length)
        axs[i].plot(xx['target'])
        axs[i].set_ylim(0, 1.1*np.max(xx['target']))
        axs[i].grid()
        if anomaly_indicator is not None:
            axs[i].fill_between(
                np.arange(len(xx['target'])),
                xx['anomaly_indicator'] * 1.1*np.max(xx['target']),
                np.zeros(len(xx['target'])),
                alpha=0.3,
                color="red")


def print_dicts(*dicts):
    for d in dicts:
        print("{")
        for k,v in d.items():
            print("\t", k, ": ", v)
        print("}\n")

Data Generation Recipes

To generate realistic artificial data, we describe the data generation process through a symbolic graph (this is akin to how mxnet symbol graphs work).

Your graph can contain python values as well as operators that correspond to random variables or random processes. The output of a recipe can be a list, dictionary or a value:

In [4]:
rcp.evaluate(rcp.RandomGaussian(), length=5)
Out[4]:
array([ 9.58790271e-04,  2.53943908e+00, -1.72219699e-01, -8.77149980e-01,
       -1.08723839e+00])
In [5]:
rcp.evaluate({
    'var1': rcp.RandomGaussian(),
    'var2': 3.0
}, length=5)
Out[5]:
{'var1': array([ 0.03387796,  0.23116965,  0.13772227, -0.98208679, -1.56309833]),
 'var2': 3.0}
In [6]:
rcp.evaluate(
    [3.0, rcp.RandomUniform()]
, length=5)
Out[6]:
[3.0, array([0.48531735, 0.04688332, 0.18475095, 0.69602717, 0.41166535])]
In [7]:
recipe = dict(
    myOutput1=rcp.RandomGaussian()
)

# multiple evaluations lead to different results, due to randomness
print_dicts(
    rcp.evaluate(recipe, length=5),
    rcp.evaluate(recipe, length=5),
)
{
         myOutput1 :  [-1.429189    0.89021697 -0.37033636 -0.84653731 -0.86422911]
}

{
         myOutput1 :  [ 1.73939576 -0.86715954 -0.90203556  0.62946362 -0.70510983]
}

Referencing variables

Each time you create a random variable such as RandomGaussian the variable refers to a new independent RV. You can re-use and refer to previously created random variables.

In [8]:
stddev1 = 2.0
stddev2 = rcp.RandomUniform(low=0, high=1, shape=(1, ))
x1 = rcp.RandomGaussian(stddev=stddev1)
x2 = rcp.RandomGaussian(stddev=stddev2)
x3 = 2 * x2

recipe = dict(
    x1=x1,
    x2=x2,
    x3=x3
)

# multiple evaluations lead to different results, due to randomness
print_dicts(
    rcp.evaluate(recipe, length=5),
    rcp.evaluate(recipe, length=5)
)
{
         x1 :  [ 1.01862547 -2.87200092 -1.84967283 -3.62099548 -3.11820883]
         x2 :  [ 0.57035031 -0.04441775  0.3929043  -0.4961036  -0.98404194]
         x3 :  [ 1.14070062 -0.0888355   0.7858086  -0.99220719 -1.96808388]
}

{
         x1 :  [-1.21940076 -2.02573529 -2.46268212  2.8217056  -1.30156845]
         x2 :  [-0.45961885  0.36095062 -0.33025855  0.78780716 -0.50341056]
         x3 :  [-0.9192377   0.72190124 -0.6605171   1.57561432 -1.00682111]
}

Note that you may create and use intermediate random varibles such as stddev2 in the above example without including them in the output.

In [9]:
recipe = dict(
    random_out=rcp.RandomGaussian(shape=(1,)),
    fixed_out=np.random.randn(1)
)

# note that fixed_out stays the same;
# it's evaluated only once when the recipe is created
print_dicts(
    rcp.evaluate(recipe, length=1),
    rcp.evaluate(recipe, length=1)
)
{
         random_out :  [-0.83550952]
         fixed_out :  [-2.28316656]
}

{
         random_out :  [-0.78650777]
         fixed_out :  [-2.28316656]
}

Length

Most operators in the recipe package have a length argument that is automatically passed when the expression is evaluated. The idea is that these recipes are used to generate fixed-length time series, and most operators produce individual components of the time series that have the same length.

In [10]:
recipe = dict(
    random_gaussian=rcp.RandomGaussian(),
    constant_vec=rcp.ConstantVec(42)

)

print_dicts(
    rcp.evaluate(recipe, length=3),
    rcp.evaluate(recipe, length=5)
)
{
         random_gaussian :  [ 0.5260809  -0.83401569 -2.28387345]
         constant_vec :  [42. 42. 42.]
}

{
         random_gaussian :  [-2.22905848 -0.67652594 -1.35647729  1.88702621  0.07152953]
         constant_vec :  [42. 42. 42. 42. 42.]
}

Operator Overloading

The operators defined in the recipe package overload the basic arithmetic operations (addition, subtraction, multiplication, division).

In [11]:
x1 = 42 * rcp.ConstantVec(1)
x2 = x1 * rcp.RandomUniform()
x3 = rcp.RandomGaussian() + rcp.RandomUniform()
result = x1 + x2 + x3

rcp.evaluate(result, 3)
Out[11]:
array([61.74653909, 51.61887988, 53.45712192])

SerDe

Recipes composed of serializable / representable components can easily be serialized / deserialized.

In [12]:
dumped = dump_code(result)
print(dumped)

reconstructed = load_code(dumped)

rcp.evaluate(reconstructed, 3)
gluonts.dataset.artificial.recipe._LiftedBinaryOp(left=gluonts.dataset.artificial.recipe._LiftedBinaryOp(left=gluonts.dataset.artificial.recipe._LiftedBinaryOp(left=42, op="*", right=gluonts.dataset.artificial.recipe.ConstantVec(constant=1)), op="+", right=gluonts.dataset.artificial.recipe._LiftedBinaryOp(left=gluonts.dataset.artificial.recipe._LiftedBinaryOp(left=42, op="*", right=gluonts.dataset.artificial.recipe.ConstantVec(constant=1)), op="*", right=gluonts.dataset.artificial.recipe.RandomUniform(high=1.0, low=0.0, shape=(0,)))), op="+", right=gluonts.dataset.artificial.recipe._LiftedBinaryOp(left=gluonts.dataset.artificial.recipe.RandomGaussian(shape=(0,), stddev=1.0), op="+", right=gluonts.dataset.artificial.recipe.RandomUniform(high=1.0, low=0.0, shape=(0,))))
Out[12]:
array([43.23656199, 77.28065399, 47.30667987])

Simple Examples

In [13]:
daily_smooth_seasonality = rcp.SmoothSeasonality(period=288, phase=-72)
noise = rcp.RandomGaussian(stddev=0.1)
signal = daily_smooth_seasonality + noise

recipe = dict(
    daily_smooth_seasonality=daily_smooth_seasonality,
    noise=noise,
    signal=signal
)

plot_recipe(recipe, 3 * 288)
../../_images/tutorials_data_manipulation_synthetic_data_generation_20_0.png
In [14]:
slope = rcp.RandomUniform(low=0, high=3, shape=(1,))
trend = rcp.LinearTrend(slope=slope)
daily_smooth_seasonality = rcp.SmoothSeasonality(period=288, phase=-72)
noise = rcp.RandomGaussian(stddev=0.1)
signal = trend + daily_smooth_seasonality + noise

plot_examples(signal, 3 * 288, 5)
../../_images/tutorials_data_manipulation_synthetic_data_generation_21_0.png

Composing Recipes

There are many ways to combine and extend generation recipes. For example using python functions.

In [15]:
def weekly_seasonal_unscaled():
    daily_smooth_seasonality = rcp.SmoothSeasonality(period=288, phase=-72)
    weekday_scale = rcp.RandomUniform(0.1, 10, shape=(1,))
    weekly_pattern = rcp.NormalizeMax(rcp.Concatenate([weekday_scale * np.ones(5), np.ones(2)]))
    day_of_week = rcp.Dilated(rcp.Repeated(weekly_pattern), 288)
    level = rcp.RandomUniform(low=0, high=10, shape=1)
    noise_level = rcp.RandomUniform(low=0.01, high=1, shape=1)
    noise = noise_level * rcp.RandomGaussian()
    signal = daily_smooth_seasonality * day_of_week
    unscaled = level + signal + noise

    return dict(
        daily_smooth_seasonality=daily_smooth_seasonality,
        weekday_scale=weekday_scale,
        weekly_pattern=weekly_pattern,
        day_of_week=day_of_week,
        level=level,
        noise_level=noise_level,
        noise=noise,
        signal=signal,
        unscaled=unscaled
    )

recipe = weekly_seasonal_unscaled()
plot_recipe(recipe, 10 * 288)

plot_examples(recipe['unscaled'], 10 * 288, 5)
../../_images/tutorials_data_manipulation_synthetic_data_generation_23_0.png
../../_images/tutorials_data_manipulation_synthetic_data_generation_23_1.png
In [16]:
def weekly_seasonal():
    c = weekly_seasonal_unscaled()
    unscaled = c['unscaled']

    scale = rcp.RandomUniform(low=0, high=1000, shape=1)
    z = scale * unscaled
    return z

plot_examples(weekly_seasonal(), 10 * 288, 5)
../../_images/tutorials_data_manipulation_synthetic_data_generation_24_0.png

Here is a more complex example

In [17]:
def scale(unscaled):
    s = rcp.RandomUniform(low=0, high=1000, shape=1)
    z = s * unscaled
    return z


def complex_weekly_seasonality():
    daily_pattern = rcp.RandomUniform(0, 1, shape=(24,))
    daily_seasonality = rcp.Dilated(rcp.Repeated(daily_pattern), 12)
    weekly_pattern = rcp.RandomUniform(0, 1, shape=(7,))
    weekly_seasonality = rcp.Dilated(rcp.Repeated(weekly_pattern), 288)
    unnormalized_seasonality = daily_seasonality * weekly_seasonality
    seasonality = rcp.NormalizeMax(unnormalized_seasonality)

    noise_level = rcp.RandomUniform(low=0.01, high=0.1, shape=1)
    noise = noise_level * rcp.RandomGaussian()

    level = rcp.RandomUniform(low=0, high=10, shape=1)
    signal = level + seasonality

    unscaled = signal + noise
    return scale(unscaled)

plot_examples(complex_weekly_seasonality(), 10 * 288, 5)
../../_images/tutorials_data_manipulation_synthetic_data_generation_26_0.png

Generating Anomalies

Anomalies are just another effect added/multiplied to a base time series. We can define a recipe for creating certain types of anomalies, and then compose it with a base recipe.

In [18]:
z = rcp.ConstantVec(1.0)

def inject_anomalies(z):
    normal_indicator = rcp.BinaryMarkovChain(one_to_zero=1/(288*7), zero_to_one=0.1)
    anomaly_indicator = 1 - normal_indicator
    anomaly_scale = 0.5 + rcp.RandomUniform(-1.0, 1.0, shape=1)
    anomaly_multiplier = 1 + anomaly_scale * anomaly_indicator
    target = z * anomaly_multiplier
    return target, anomaly_indicator

target, anomaly_indicator = inject_anomalies(z)
plot_examples(target, 10*288, 5, anomaly_indicator)
../../_images/tutorials_data_manipulation_synthetic_data_generation_28_0.png
In [19]:
target, anomaly_indicator = inject_anomalies(weekly_seasonal())
plot_examples(target, 288*7, 5, anomaly_indicator)
../../_images/tutorials_data_manipulation_synthetic_data_generation_29_0.png

Generating Changepoints

In [20]:
level = rcp.RandomUniform(0, 10, shape=1)
noise_level = rcp.RandomUniform(0.01, 1, shape=1)
noise =  rcp.RandomGaussian(noise_level)
homoskedastic_gaussian_noise = level + noise
In [21]:
z1 = homoskedastic_gaussian_noise
z2 = weekly_seasonal_unscaled()['unscaled']
z_stacked = rcp.Stack([z1, z2])
change = rcp.RandomChangepoints(1)
unscaled = rcp.Choose(z_stacked, change)

target = scale(unscaled)
target, anomaly_indicator = inject_anomalies(target)

In [22]:
plot_examples(target, 288*7, 10, anomaly_indicator)
../../_images/tutorials_data_manipulation_synthetic_data_generation_33_0.png

Generating several time series

In [23]:
rcp.take_as_list(rcp.generate(10, weekly_seasonal_unscaled(), "2018-01-01", {}), 2)
Out[23]:
[{'daily_smooth_seasonality': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'weekday_scale': array([5.53325369]),
  'weekly_pattern': array([1.        , 1.        , 1.        , 1.        , 1.        ,
         0.18072549, 0.18072549]),
  'day_of_week': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
  'level': array([7.15189366]),
  'noise_level': array([0.60673574]),
  'noise': array([-1.37627579,  0.80910965, -0.5113108 ,  1.19522357,  0.76819937,
         -0.30693338,  1.54426428,  0.65576722,  0.29384949,  0.35138523]),
  'signal': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'unscaled': array([5.77561787, 7.9611223 , 6.64105875, 8.34818777, 7.92199568,
         6.84793212, 8.70043552, 7.81348013, 7.45333928, 7.51288625]),
  'item_id': '0',
  'start': '2018-01-01'},
 {'daily_smooth_seasonality': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'weekday_scale': array([8.71312027]),
  'weekly_pattern': array([1.        , 1.        , 1.        , 1.        , 1.        ,
         0.11476945, 0.11476945]),
  'day_of_week': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
  'level': array([9.78618342]),
  'noise_level': array([0.80116698]),
  'noise': array([ 1.19700682, -0.16436603,  0.2508195 , -0.6842733 , -2.04537114,
          0.52365764,  0.69255774, -0.59459811,  1.81845245, -1.16518975]),
  'signal': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'unscaled': array([10.98319024,  9.62193638, 10.03747882,  9.10298066,  7.74271494,
         10.31281289, 10.48301873,  9.19740456, 11.612232  ,  8.63060103]),
  'item_id': '1',
  'start': '2018-01-01'}]

Saving to a file

In [24]:
def write_to_file(recipe, length, num_ts, fields, fn):
    with open(fn, 'w') as f, open(fn+"-all", 'w') as g:
        for x in islice(rcp.generate(length, recipe, "2019-01-07 00:00"), num_ts):
            z = {}
            for k in x:
                if type(x[k]) == np.ndarray:
                    z[k] = x[k].tolist()
                else:
                    z[k] = x[k]
            xx = {}
            for fi in fields:
                xx[fi] = z[fi]
            try:
                f.write(json.dumps(xx))
            except Exception as e:
                print(xx)
                print(z)
                raise e
            f.write('\n')
            g.write(json.dumps(z))
            g.write('\n')