import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
register_matplotlib_converters()
from time import time

8: ARMa Model#

def parser(s):
    return datetime.strptime(s, '%Y-%m-%d')
# Catfish Sales Data
#read data
catfish_sales = pd.read_csv('../data/catfish.csv', parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
/tmp/ipykernel_3775839/2935818357.py:2: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze("columns") to the call to squeeze.


  catfish_sales = pd.read_csv('../data/catfish.csv', parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
#infer the frequency of the data
catfish_sales = catfish_sales.asfreq(pd.infer_freq(catfish_sales.index))
start_date = datetime(2000,1,1)
end_date = datetime(2004,1,1)
lim_catfish_sales = catfish_sales[start_date:end_date]
plt.figure(figsize=(10,4))
plt.plot(lim_catfish_sales)
plt.title('Catfish Sales in 1000s of Pounds', fontsize=20)
plt.ylabel('Sales', fontsize=16)
for year in range(start_date.year,end_date.year):
    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)
plt.axhline(lim_catfish_sales.mean(), color='r', alpha=0.2, linestyle='--')
<matplotlib.lines.Line2D at 0x7f1f08706820>
../../../_images/cab6cb52251f29da4be54dea3a248c455afa54e8975533ea4d76cbbd22dc8331.png
first_diff = lim_catfish_sales.diff()[1:]
acf_vals.shape
(17,)
acf_vals = acf(first_diff)
num_lags = 10
plt.bar(range(num_lags), acf_vals[:num_lags])
<BarContainer object of 10 artists>
../../../_images/e254e57808b131b4754ec03852acbb0f846321bc233a8136db5d44a76aa02a1c.png
plt.figure(figsize=(10,4))
plt.plot(first_diff)
plt.title('First Difference of Catfish Sales', fontsize=20)
plt.ylabel('Sales', fontsize=16)
for year in range(start_date.year,end_date.year):
    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)
plt.axhline(first_diff.mean(), color='r', alpha=0.2, linestyle='--')
<matplotlib.lines.Line2D at 0x7f1efe8b65b0>
../../../_images/b6c07555790c8fb7f04e03d1ac7b8f35383d1aeb80756037ca8d9b74b64e8324.png

ACF#

Based on ACF, we should start with a MA(1) process#

PACF#

pacf_vals = pacf(first_diff)
plt.bar(range(num_lags), pacf_vals[:num_lags])
<BarContainer object of 10 artists>
../../../_images/f9d7c6011df00d12d9fbe4293d571d3915697c43c292560be7c00befff71dcb8.png

Based on PACF, we should start with a AR(4) process#

Get training and testing sets#

train_end = datetime(2003,7,1)
test_end = datetime(2004,1,1)

train_data = first_diff[:train_end]
test_data = first_diff[train_end + timedelta(days=1):test_end]

Fit the ARMA Model#

# define model
model = ARIMA(train_data, order=(4,1,0))
#fit the model
start = time()
model_fit = model.fit()
end = time()
print('Model Fitting Time:', end - start)
Model Fitting Time: 0.036299705505371094
#summary of the model
print(model_fit.summary())
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                  Total   No. Observations:                   42
Model:                 ARIMA(4, 1, 0)   Log Likelihood                -377.949
Date:                Wed, 22 Mar 2023   AIC                            765.898
Time:                        13:02:56   BIC                            774.465
Sample:                    02-01-2000   HQIC                           769.018
                         - 07-01-2003                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.9224      0.146     -6.315      0.000      -1.209      -0.636
ar.L2         -0.6459      0.165     -3.905      0.000      -0.970      -0.322
ar.L3         -0.6608      0.177     -3.724      0.000      -1.009      -0.313
ar.L4         -0.6099      0.134     -4.558      0.000      -0.872      -0.348
sigma2      5.563e+06   1.45e+06      3.848      0.000    2.73e+06     8.4e+06
===================================================================================
Ljung-Box (L1) (Q):                   0.50   Jarque-Bera (JB):                 0.57
Prob(Q):                              0.48   Prob(JB):                         0.75
Heteroskedasticity (H):               1.23   Skew:                             0.28
Prob(H) (two-sided):                  0.70   Kurtosis:                         2.90
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

So the ARMA(4,1) model is:#

\(\hat{y_t} = -0.87y_{t-1} - 0.42y_{t-2} - 0.56y_{t-3} - 0.61y_{t-4} + 0.52\varepsilon_{t-1}\)#

#get prediction start and end dates
pred_start_date = test_data.index[0]
pred_end_date = test_data.index[-1]
#get the predictions and residuals
predictions = model_fit.predict(start=pred_start_date, end=pred_end_date)
residuals = test_data - predictions
plt.figure(figsize=(10,4))
plt.plot(residuals)
plt.title('Residuals from AR Model', fontsize=20)
plt.ylabel('Error', fontsize=16)
plt.axhline(0, color='r', linestyle='--', alpha=0.2)
<matplotlib.lines.Line2D at 0x7f1efe8357f0>
../../../_images/c3b6f51abc772bd85548646066e7063e0326c1b9090c7f77f270e74ab4c192a0.png
plt.figure(figsize=(10,4))

plt.plot(test_data)
plt.plot(predictions)

plt.legend(('Data', 'Predictions'), fontsize=16)

plt.title('First Difference of Catfish Sales', fontsize=20)
plt.ylabel('Sales', fontsize=16)
Text(0, 0.5, 'Sales')
../../../_images/3627d241a7b6973fbccddff8094e2d19151874e91ec195e61ae34a198922561e.png
print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2)))
Root Mean Squared Error: 2350.1523951992594