Stochastic time-series modeling

How to Model & Predict Future Time-Steps Using ARIMA Statistical Model

Time-Series modeling using Natural Gas data

Image for post
Image for post
Image by author
print("....Data Loading...."); print();
print('\033[4mHenry Hub Natural Gas Price\033[0m');
data = web.DataReader('NNJ24.NYM', data_source = 'yahoo', start = '2000-01-01')
data = data.sort_index(ascending=True)
print(data.head())
df = data.resample('M').last()
print(df.tail())
Image for post
Image for post
df['monthly_return'] = df['price'].pct_change()
df['month'] = df.index.month
sns.set(rc={'figure.figsize':(18, 9)})
sns.boxplot(data=df, x='month', y='monthly_return')
plt.title("Natural Gas Monthly return 2011-2020")
plt.suptitle("")
plt.show()
Image for post
Image for post

Rolling statistics:

def plot_rolling_statistics_ts(ts, titletext,ytext, window_size=12):
ts.plot(color='red', label='Original', lw=0.5)
ts.rolling(window_size).mean().plot(color='blue',label='Rolling Mean')
ts.rolling(window_size).std().plot(color='black', label='Rolling Std')
plt.legend(loc='best')
plt.ylabel(ytext)
plt.title(titletext)
plt.show(block=False)
plot_rolling_statistics_ts(df['monthly_return'],
'Natural Gas prices: rolling mean and standard deviation',
'Monthly return')
plt.figure(figsize=(10,5))
plot_rolling_statistics_ts(data['Open'],
'Natural Gas prices: rolling mean and standard deviation',
'Daily prices', 365)
Image for post
Image for post
Image for post
Image for post

Decomposition

Decomposition will be performed by breaking down the series into multiple components. Objective is to have deeper understanding of the series. It provides insight in terms of modeling complexity and which approaches to follow in order to accurately capture each of the components.

  1. trend is associated with the slope (increasing/decreasing) of the series.
  2. seasonality which is the deviations from the mean caused by repeating short-term cycles and
  3. noise is the random variation in the series
decomp = seasonal_decompose(df['Adj Close'], model='multiplicative')
rcParams['figure.figsize'] = 10, 6
decomp.plot().suptitle('Multiplicative Decomposition', fontsize=14);
Image for post
Image for post

Stationarity test:

To confirm our observation, let us use statistical test:

  • The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
  • Plots of the (partial) auto-correlation function (PACF/ACF)
def test_stationarity(timeseries):
print('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries[1:], autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'pvalue',
'#Lags Used', 'Number of Observations Used'])
print (dfoutput)
print(test_stationarity(df['Adj Close'])); print()
print(test_stationarity(df['monthly_return']))
Image for post
Image for post
def kpss_test(x, h0_type='c'):
indices = ['Test Statistic', 'p-value', '# of Lags']
kpss_test = kpss(x, regression=h0_type)
results = pd.Series(kpss_test[0:3], index=indices)
for key, value in kpss_test[3].items():
results[f'Critical Value ({key})'] = value
return results
print(kpss_test(df['Adj Close'])); print()
print(kpss_test(df.monthly_return).dropna())
Image for post
Image for post
plt.figure(figsize=(15,5))
plt.subplot(211)
plot_acf(df['Adj Close'], ax=plt.gca(),lags=10)
plt.subplot(212)
plot_pacf(df['Adj Close'], ax=plt.gca(),lags=10)
plt.show()
Image for post
Image for post
plt.figure(figsize=(15,5))
plt.subplot(211)
plot_acf(df['monthly_return'].dropna(),
ax=plt.gca(),lags=10)
plt.subplot(212)
plot_pacf(df['monthly_return'].dropna(),
ax=plt.gca(),lags=10)
plt.show()
Image for post
Image for post
ret = df.monthly_return.dropna()lag_acf = acf(ret, nlags=20)
lag_pacf = pacf(ret, nlags=20, method = 'ols')
# plot acf
plt.subplot(121); plt.plot(lag_acf);
plt.axhline(y=1, linestyle='--', color = 'gray')
plt.axhline(y=-1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')
plt.axhline(y=1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')
plt.title('Auto correlation function')
# plot pacf
plt.subplot(122); plt.plot(lag_pacf);
plt.axhline(y=1, linestyle='--', color = 'gray')
plt.axhline(y=-1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')
plt.axhline(y=1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')
plt.title('Partial Auto correlation function'); plt.tight_layout();
Image for post
Image for post

ARIMA model

arima = ARIMA(df['Adj Close'], order=(2, 1, 2)).fit(disp=0)
arima.summary()
Image for post
Image for post
def arima_diagnostics(resids, n_lags=40):
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2);
r = resids; resids = (r - np.nanmean(r)) / np.nanstd(r);
resids_nonmissing = resids[~(np.isnan(resids))];
sns.lineplot(x=np.arange(len(resids)), y=resids, ax=ax1);
ax1.set_title('Standardized residuals');
x_lim = (-1.96 * 2, 1.96 * 2); r_range = np.linspace(x_lim[0], x_lim[1]); norm_pdf = scs.norm.pdf(r_range);
sns.distplot(resids_nonmissing, hist=True, kde=True, norm_hist =True, ax=ax2);
ax2.plot(r_range, norm_pdf, 'g', lw=2, label='N(0,1)');
ax2.set_title('Distribution of standardized residuals');
ax2.set_xlim(x_lim); ax2.legend();
# Q-Q plot
qq = sm.qqplot(resids_nonmissing, line='s', ax=ax3);
ax3.set_title('Q-Q plot');
# ACF plot
plot_acf(resids, ax=ax4, lags=n_lags, alpha=0.05);
ax4.set_title('ACF plot');
return fig
arima_diagnostics(arima.resid, 40); plt.tight_layout();
Image for post
Image for post
ljung_box_results = acorr_ljungbox(arima.resid)
fig, ax = plt.subplots(1, figsize=[10, 5])
sns.scatterplot(x=range(len(ljung_box_results[1])), y=ljung_box_results[1], ax=ax)
ax.axhline(0.05, ls='--', c='r')
ax.set(title="Ljung-Box test's results", xlabel='Lag', ylabel='p-value')
plt.show()
Image for post
Image for post

Prediction based on ARIMA model

forecast = int(12)
arima_pred, std, ci = (arima.forecast(steps=forecast))
arima_pred = DataFrame(arima_pred)
d = DataFrame(df['Adj Close'].tail(len(arima_pred))); d.reset_index(inplace = True)
d = d.append(DataFrame({'Date': pd.date_range(start = d.Date.iloc[-1],
periods = (len(d)+1), freq = 'm', closed = 'right')}))
d = d.tail(forecast); d.set_index('Date', inplace = True)
arima_pred.index = d.index
arima_pred.rename(columns = {0: 'arima_fcast'}, inplace=True)
# 95% prediction interval
ci = DataFrame(ci)
ci.rename(columns = {0: 'lower95', 1:'upper95'}, inplace=True)
ci.index = arima_pred.index
ARIMA = concat([arima_pred, ci], axis=1)
ARIMA
Image for post
Image for post

Auto-ARIMA

To re-validate the manual selection of ARIMA parameters, let us run through auto-arima.

model = pm.auto_arima(df['Adj Close'], error_action='ignore', suppress_warnings=True,
seasonal=False)
model.summary()
Image for post
Image for post

Prediction (auto-arima)

auto_arima_pred = model.predict(n_periods=forecast, return_conf_int=True,alpha=0.05)
auto_arima_pred = [DataFrame(auto_arima_pred[0],
columns=['prediction']),
DataFrame(auto_arima_pred[1],
columns=['ci_lower', 'ci_upper'])]
auto_arima_pred = concat(auto_arima_pred,axis=1).set_index(ARIMA.index)
auto_arima_pred
Image for post
Image for post

Visualization

plt.set_cmap('cubehelix'); sns.set_palette('cubehelix')
COLORS = [plt.cm.cubehelix(x) for x in [0.1, 0.3, 0.5, 0.7]]; fig, ax = plt.subplots(1)
ax = sns.lineplot(data=df['Adj Close'], color=COLORS[0], label='Actual')
ax.plot(ARIMA.arima_fcast, c=COLORS[1], label='ARIMA(2,1,1)')
ax.fill_between(ARIMA.index, ARIMA.lower95, ARIMA.upper95, alpha=0.3, facecolor=COLORS[1])
ax.plot(auto_arima_pred.prediction, c=COLORS[2], label='ARIMA(0,1,0)')
ax.fill_between(auto_arima_pred.index, auto_arima_pred.ci_lower, auto_arima_pred.ci_upper,
alpha=0.2, facecolor=COLORS[2])
ax.set(title="Natural Gas price - historical and forecast", xlabel='Date', ylabel='Price ($)')
ax.legend(loc='best')
Image for post
Image for post

Conclusion

The approach applied here to forecast the future trends of price movements based on its past behavior using stochastic time-series modeling . ARIMA model is designated by ARIMA (p, d, q), where ‘p’ is the auto regressive process order, ‘d’ corresponds to stationary data order and ‘q’ denotes the moving average process order. Validity of the developed ARIMA models can be testified via statistical parameters such as RMSE, MAPE, AIC, AICc and BIC which was not shown here. ARIMA with GARCH volatility model can be tried too to capture the volatility in the series.

Written by

Data Science Practice Lead at KSG Analytics Pvt. Ltd.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store