# How to Model & Predict Future Time-Steps Using ARIMA Statistical Model

## Time-Series modeling using Natural Gas data

`print("....Data Loading...."); print();print('\033[4mHenry Hub Natural Gas Price\033[0m');data = web.DataReader('NNJ24.NYM', data_source = 'yahoo', start = '2000-01-01')data = data.sort_index(ascending=True)print(data.head())df = data.resample('M').last()print(df.tail())`
`df['monthly_return'] = df['price'].pct_change()df['month'] = df.index.monthsns.set(rc={'figure.figsize':(18, 9)})sns.boxplot(data=df, x='month', y='monthly_return')plt.title("Natural Gas Monthly return 2011-2020")plt.suptitle("")plt.show()`

## Rolling statistics:

`def plot_rolling_statistics_ts(ts, titletext,ytext, window_size=12):    ts.plot(color='red', label='Original', lw=0.5)    ts.rolling(window_size).mean().plot(color='blue',label='Rolling Mean')    ts.rolling(window_size).std().plot(color='black', label='Rolling Std')    plt.legend(loc='best')    plt.ylabel(ytext)    plt.title(titletext)    plt.show(block=False)plot_rolling_statistics_ts(df['monthly_return'],                           'Natural Gas prices: rolling mean and standard deviation',                           'Monthly return')plt.figure(figsize=(10,5))plot_rolling_statistics_ts(data['Open'],                           'Natural Gas prices: rolling mean and standard deviation',                           'Daily prices', 365)`

## Decomposition

Decomposition will be performed by breaking down the series into multiple components. Objective is to have deeper understanding of the series. It provides insight in terms of modeling complexity and which approaches to follow in order to accurately capture each of the components.

1. trend is associated with the slope (increasing/decreasing) of the series.
2. seasonality which is the deviations from the mean caused by repeating short-term cycles and
3. noise is the random variation in the series
`decomp = seasonal_decompose(df['Adj Close'], model='multiplicative')rcParams['figure.figsize'] = 10, 6decomp.plot().suptitle('Multiplicative Decomposition', fontsize=14);`

## Stationarity test:

To confirm our observation, let us use statistical test:

• The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
• Plots of the (partial) auto-correlation function (PACF/ACF)
`def test_stationarity(timeseries):    print('Results of Dickey-Fuller Test:')    dftest = adfuller(timeseries[1:], autolag='AIC')    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'pvalue',                                             '#Lags Used', 'Number of Observations Used'])    print (dfoutput)print(test_stationarity(df['Adj Close'])); print()print(test_stationarity(df['monthly_return']))`
`def kpss_test(x, h0_type='c'):    indices = ['Test Statistic', 'p-value', '# of Lags']    kpss_test = kpss(x, regression=h0_type)    results = pd.Series(kpss_test[0:3], index=indices)    for key, value in kpss_test.items():        results[f'Critical Value ({key})'] = value        return resultsprint(kpss_test(df['Adj Close'])); print()print(kpss_test(df.monthly_return).dropna())`
`plt.figure(figsize=(15,5))plt.subplot(211)plot_acf(df['Adj Close'], ax=plt.gca(),lags=10)plt.subplot(212)plot_pacf(df['Adj Close'], ax=plt.gca(),lags=10)plt.show()`
`plt.figure(figsize=(15,5))plt.subplot(211)plot_acf(df['monthly_return'].dropna(),          ax=plt.gca(),lags=10)plt.subplot(212)plot_pacf(df['monthly_return'].dropna(),           ax=plt.gca(),lags=10)plt.show()`
`ret = df.monthly_return.dropna()lag_acf = acf(ret, nlags=20)lag_pacf = pacf(ret, nlags=20, method = 'ols')# plot acfplt.subplot(121); plt.plot(lag_acf);plt.axhline(y=1, linestyle='--', color = 'gray')plt.axhline(y=-1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')plt.axhline(y=1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')plt.title('Auto correlation function')# plot pacfplt.subplot(122); plt.plot(lag_pacf);plt.axhline(y=1, linestyle='--', color = 'gray')plt.axhline(y=-1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')plt.axhline(y=1.96/np.sqrt(len(ret)), linestyle='--', color = 'gray')plt.title('Partial Auto correlation function'); plt.tight_layout();`

# ARIMA model

`arima = ARIMA(df['Adj Close'], order=(2, 1, 2)).fit(disp=0)arima.summary()`
`def arima_diagnostics(resids, n_lags=40):  fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2);  r = resids;  resids = (r - np.nanmean(r)) / np.nanstd(r);  resids_nonmissing = resids[~(np.isnan(resids))];  sns.lineplot(x=np.arange(len(resids)), y=resids, ax=ax1);  ax1.set_title('Standardized residuals');  x_lim = (-1.96 * 2, 1.96 * 2);  r_range = np.linspace(x_lim, x_lim);  norm_pdf = scs.norm.pdf(r_range);  sns.distplot(resids_nonmissing, hist=True, kde=True,   norm_hist     =True, ax=ax2);  ax2.plot(r_range, norm_pdf, 'g', lw=2, label='N(0,1)');  ax2.set_title('Distribution of standardized residuals');  ax2.set_xlim(x_lim); ax2.legend();  # Q-Q plot  qq = sm.qqplot(resids_nonmissing, line='s', ax=ax3);  ax3.set_title('Q-Q plot');  # ACF plot  plot_acf(resids, ax=ax4, lags=n_lags, alpha=0.05);  ax4.set_title('ACF plot');  return figarima_diagnostics(arima.resid, 40); plt.tight_layout();`
`ljung_box_results = acorr_ljungbox(arima.resid)fig, ax = plt.subplots(1, figsize=[10, 5])sns.scatterplot(x=range(len(ljung_box_results)), y=ljung_box_results, ax=ax)ax.axhline(0.05, ls='--', c='r')ax.set(title="Ljung-Box test's results", xlabel='Lag', ylabel='p-value')plt.show()`

## Prediction based on ARIMA model

`forecast = int(12)arima_pred, std, ci = (arima.forecast(steps=forecast))arima_pred = DataFrame(arima_pred)d = DataFrame(df['Adj Close'].tail(len(arima_pred))); d.reset_index(inplace = True)d = d.append(DataFrame({'Date': pd.date_range(start = d.Date.iloc[-1],                                              periods = (len(d)+1), freq = 'm', closed = 'right')}))d = d.tail(forecast); d.set_index('Date', inplace = True)arima_pred.index = d.indexarima_pred.rename(columns = {0: 'arima_fcast'}, inplace=True)# 95% prediction intervalci = DataFrame(ci)ci.rename(columns = {0: 'lower95', 1:'upper95'}, inplace=True)ci.index = arima_pred.indexARIMA = concat([arima_pred, ci], axis=1)ARIMA`

## Auto-ARIMA

To re-validate the manual selection of ARIMA parameters, let us run through auto-arima.

`model = pm.auto_arima(df['Adj Close'], error_action='ignore', suppress_warnings=True,                      seasonal=False)model.summary()`

## Prediction (auto-arima)

`auto_arima_pred = model.predict(n_periods=forecast, return_conf_int=True,alpha=0.05)auto_arima_pred = [DataFrame(auto_arima_pred,                             columns=['prediction']),                   DataFrame(auto_arima_pred,                              columns=['ci_lower', 'ci_upper'])]auto_arima_pred = concat(auto_arima_pred,axis=1).set_index(ARIMA.index)auto_arima_pred`

## Visualization

`plt.set_cmap('cubehelix'); sns.set_palette('cubehelix')COLORS = [plt.cm.cubehelix(x) for x in [0.1, 0.3, 0.5, 0.7]]; fig, ax = plt.subplots(1)ax = sns.lineplot(data=df['Adj Close'], color=COLORS, label='Actual')ax.plot(ARIMA.arima_fcast, c=COLORS, label='ARIMA(2,1,1)')ax.fill_between(ARIMA.index, ARIMA.lower95, ARIMA.upper95, alpha=0.3, facecolor=COLORS)ax.plot(auto_arima_pred.prediction, c=COLORS, label='ARIMA(0,1,0)')ax.fill_between(auto_arima_pred.index, auto_arima_pred.ci_lower, auto_arima_pred.ci_upper,                alpha=0.2, facecolor=COLORS)ax.set(title="Natural Gas price - historical and forecast", xlabel='Date', ylabel='Price (\$)') ax.legend(loc='best')`

# Conclusion

The approach applied here to forecast the future trends of price movements based on its past behavior using stochastic time-series modeling . ARIMA model is designated by ARIMA (p, d, q), where ‘p’ is the auto regressive process order, ‘d’ corresponds to stationary data order and ‘q’ denotes the moving average process order. Validity of the developed ARIMA models can be testified via statistical parameters such as RMSE, MAPE, AIC, AICc and BIC which was not shown here. ARIMA with GARCH volatility model can be tried too to capture the volatility in the series.

Written by