SARIMAX

SARIMAX#

import os
os.chdir("../../")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from statsmodels.tsa.seasonal import seasonal_decompose, STL
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm
from pmdarima import model_selection
from pmdarima import auto_arima
from scripts.python.tsa.ts_utils import * 
from scripts.python.tsa.utsmodel import * 
from scripts.python.tsa.ts_eval import *

import warnings
warnings.filterwarnings('ignore')
Hide code cell source
countries = ["solomon_islands", "palau", "samoa", "vanuatu", "tonga"]
# Set parameter range
p, d, q = range(0, 3), range(0, 2), range(0, 3)
P, D, Q, s = range(0, 3), range(0, 2), range(0, 3), [12]

# list of all parameter combos
pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P, D, Q, s))
all_param = list(itertools.product(pdq, seasonal_pdq))

for country in countries:

    mod = SARIMAXPipeline(country=country, y_var="total",
                          data=None,
                          exog_var=["covid", "stringency_index",
                                    str(country)+"_travel"],
                          transform_method="scaledlogit",
                          training_ratio=1,
                          verbose=False)
    mod.read_and_merge()
    display(mod.data.head(5))
    
    
    mod.transform()
    print(f"The Benchmark Evaluation for {country}".upper(), "\n")
    mod.get_benchmark_evaluation()
    print(f"Started to conduct stepwise searching for {country}".upper(), "\n")
    mod.stepwise_search()

    print(f"Started to conduct Grid searching for {country}".upper(), "\n")
    mod_msres = mod.manual_search(params=all_param)
    mod_msres.sort(key=lambda x: x[1])

    mod_swm = mod.stepwise_model
    if mod_msres[0][-1] == (mod_swm["order"], mod_swm["seasonal_order"]):
        print(mod_msres[0][-1])
    else:
        cv_models = []
        cv_models.append(pm.ARIMA(
            mod_swm["order"], mod_swm["seasonal_order"],  exog=mod.exog[:mod.training_size]))

        # Append top five GridSearch results
        for res in mod_msres[:5]:
            info_criteria = res[1]
            if info_criteria < 100:
                pass
            else:
                order, seasonal_order = res[-1]
                model = pm.ARIMA(order, seasonal_order,
                                 exog=mod.exog[:mod.training_size])
                cv_models.append(model)

        print(
            f"Started to conduct Cross-validation for {country}".upper(), "\n")
        mod_cv_comp = mod.compare_models(
            y=mod.transformed_y, exog=mod.exog, models=cv_models)
        best_cv_idx = np.nanargmin(mod_cv_comp["avg_error"])
        print(
            f"Best Models from Cross-validation is {cv_models[best_cv_idx]}", "\n")

        if best_cv_idx > 0:
            best_mod = mod_msres[best_cv_idx-1][0]
            best_mod_pred = mod.get_prediction_df(
                best_mod, mod.test_size, mod.exog[-mod.test_size:])
        
        lower = mod.data["total"].min() - 1
        upper = mod.data["total"].max() + 1
        for col_idx, col in enumerate(best_mod_pred.columns):
            for row_idx, _ in enumerate(best_mod_pred[col]):
                best_mod_pred.iloc[row_idx, col_idx] = mod.inverse_scaledlogit(
                    best_mod_pred.iloc[row_idx, col_idx], upper, lower)

        # Merge the prediction with actual values
        best_mod_pred.columns.name = None
        best_mod_pred = pd.concat(
            [mod.data[["date", "total"]], best_mod_pred], axis=1)
        
        save_pred_path = mod.country_data_folder + "/model/sarimax_" + str(country) + ".csv"
        best_mod_pred.to_csv(save_pred_path, encoding="utf-8")

        if mod.test_size != 0:
            pred_series = (best_mod_pred["train_pred"].fillna(0)
                           + best_mod_pred["test_pred"].fillna(0))
        else:
            pred_series = best_mod_pred["train_pred"]
        
        mod_eval = (pd.DataFrame(calculate_evaluation(
            best_mod_pred["total"], pred_series), index=["SARIMAX"]))
        mod_eval = pd.concat([mod.benchmark, mod_eval], axis=0)
        display(mod_eval)
        
        mod_eval.to_csv(mod.country_data_folder + "/model/sarimax_eval_" + str(country) + ".csv",
                        encoding="utf-8")
date total stringency_index covid solomon_islands_flights solomon_islands_hotel solomon_islands_travel
0 2009-01-01 1602 0.0 0.0 0.000000 0.0 0.000000
1 2009-02-01 1422 0.0 0.0 0.484667 0.0 0.203672
2 2009-03-01 1249 0.0 0.0 0.000000 0.0 0.000000
3 2009-04-01 1499 0.0 0.0 0.000000 0.0 0.000000
4 2009-05-01 1393 0.0 0.0 0.000000 0.0 0.478020
training size : 144, testing size : 0
THE BENCHMARK EVALUATION FOR SOLOMON_ISLANDS 

STARTED TO CONDUCT STEPWISE SEARCHING FOR SOLOMON_ISLANDS 

                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:                                    y   No. Observations:                  144
Model:             SARIMAX(2, 1, 2)x(1, 0, [1], 12)   Log Likelihood                -170.543
Date:                              Wed, 14 Jun 2023   AIC                            361.087
Time:                                      15:49:31   BIC                            390.715
Sample:                                           0   HQIC                           373.126
                                              - 144                                         
Covariance Type:                                opg                                         
==========================================================================================
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
covid                     -3.0401      1.491     -2.039      0.041      -5.962      -0.118
stringency_index          -0.0648      0.026     -2.497      0.013      -0.116      -0.014
solomon_islands_travel    -0.3945      1.574     -0.251      0.802      -3.479       2.690
ar.L1                     -0.7511      0.172     -4.375      0.000      -1.088      -0.415
ar.L2                      0.1471      0.125      1.180      0.238      -0.097       0.391
ma.L1                      0.0672      0.105      0.640      0.522      -0.138       0.273
ma.L2                     -0.8797      0.082    -10.683      0.000      -1.041      -0.718
ar.S.L12                   0.9434      0.127      7.457      0.000       0.695       1.191
ma.S.L12                  -0.8220      0.243     -3.383      0.001      -1.298      -0.346
sigma2                     0.6149      0.054     11.411      0.000       0.509       0.721
===================================================================================
Ljung-Box (L1) (Q):                   0.07   Jarque-Bera (JB):             16100.35
Prob(Q):                              0.79   Prob(JB):                         0.00
Heteroskedasticity (H):               0.33   Skew:                             5.80
Prob(H) (two-sided):                  0.00   Kurtosis:                        53.67
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
../_images/ba79e2e0401cccdc68cf0d2acd7a6171b91a6fbd825be7a0733ccff8ffdc4c15.png
STARTED TO CONDUCT GRID SEARCHING FOR SOLOMON_ISLANDS 

STARTED TO CONDUCT CROSS-VALIDATION FOR SOLOMON_ISLANDS 

Best Models from Cross-validation is  ARIMA(2,0,2)(0,1,1)[12] intercept 
MSE RMSE MAE SMAPE
naive 130146.048951 360.757604 273.083916 19.605133
mean 373388.932822 611.055589 428.780575 28.510498
seasonal naive 719201.125000 848.057265 537.375000 42.210038
SARIMAX 81169.100331 284.901914 201.117080 14.616283
date japan south_korea taiwan china usa/canada europe others total stringency_index covid palau_flights palau_hotel palau_travel
0 2007-06-01 856.0 1291.0 3245.0 86.0 669.0 99.0 463.0 6709.0 0.0 0.0 1.068426 8.334537 0.320004
1 2007-07-01 2119.0 1366.0 3269.0 33.0 653.0 144.0 437.0 8021.0 0.0 0.0 0.000000 9.667343 0.952482
2 2007-08-01 3476.0 1354.0 3046.0 46.0 580.0 256.0 438.0 9196.0 0.0 0.0 0.704914 6.906282 0.453967
3 2007-09-01 3022.0 910.0 2497.0 61.0 559.0 145.0 401.0 7595.0 0.0 0.0 0.798284 3.591027 0.000000
4 2007-10-01 1807.0 1082.0 2298.0 49.0 774.0 390.0 395.0 6795.0 0.0 0.0 0.431326 4.111283 0.000000
training size : 184, testing size : 0
THE BENCHMARK EVALUATION FOR PALAU 

STARTED TO CONDUCT STEPWISE SEARCHING FOR PALAU 

                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:                                  y   No. Observations:                  184
Model:             SARIMAX(1, 0, 1)x(1, 0, 1, 12)   Log Likelihood                -231.068
Date:                            Wed, 14 Jun 2023   AIC                            478.136
Time:                                    15:54:50   BIC                            503.855
Sample:                                         0   HQIC                           488.560
                                            - 184                                         
Covariance Type:                              opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
covid               -5.9728      0.833     -7.170      0.000      -7.605      -4.340
stringency_index    -0.0172      0.018     -0.950      0.342      -0.053       0.018
palau_travel         0.3199      0.571      0.560      0.575      -0.800       1.439
ar.L1                0.9569      0.033     28.662      0.000       0.891       1.022
ma.L1               -0.5431      0.052    -10.518      0.000      -0.644      -0.442
ar.S.L12             0.9414      0.132      7.137      0.000       0.683       1.200
ma.S.L12            -0.8473      0.210     -4.026      0.000      -1.260      -0.435
sigma2               0.7038      0.028     24.753      0.000       0.648       0.760
===================================================================================
Ljung-Box (L1) (Q):                   0.07   Jarque-Bera (JB):             19273.54
Prob(Q):                              0.80   Prob(JB):                         0.00
Heteroskedasticity (H):               6.06   Skew:                             4.88
Prob(H) (two-sided):                  0.00   Kurtosis:                        52.18
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
../_images/4027bf1f2eac20ce84caabccbb8172e44c632e3465fb87de462decc6b45e97f9.png
STARTED TO CONDUCT GRID SEARCHING FOR PALAU 

STARTED TO CONDUCT CROSS-VALIDATION FOR PALAU 

Best Models from Cross-validation is  ARIMA(0,1,1)(0,1,1)[12] intercept 
MSE RMSE MAE SMAPE
naive 2.096767e+06 1448.021907 1092.759563 23.272616
mean 1.639656e+07 4049.266859 3178.648757 51.716972
seasonal naive 1.123704e+07 3352.169852 2405.935201 56.509474
SARIMAX 1.826282e+06 1351.399848 947.467373 21.701883
date total american_samoa australia europe new_zealand usa other_countries cruise_ships stringency_index covid samoa_flights samoa_hotel samoa_travel
0 2002-08-01 8260.0 2877.0 1115.0 429.0 2035.0 933.0 871.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2002-09-01 6708.0 2119.0 972.0 372.0 1879.0 612.0 754.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2002-10-01 5737.0 1892.0 737.0 468.0 1492.0 577.0 571.0 0.0 0.0 0.0 0.0 0.0 0.0
3 2002-11-01 6653.0 2530.0 864.0 393.0 1675.0 483.0 708.0 0.0 0.0 0.0 0.0 0.0 0.0
4 2002-12-01 13042.0 4678.0 1918.0 272.0 4401.0 1007.0 766.0 0.0 0.0 0.0 0.0 0.0 0.0
training size : 242, testing size : 0
THE BENCHMARK EVALUATION FOR SAMOA 

STARTED TO CONDUCT STEPWISE SEARCHING FOR SAMOA 

                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:                                    y   No. Observations:                  242
Model:             SARIMAX(0, 1, 2)x(2, 0, [1], 12)   Log Likelihood                -335.391
Date:                              Wed, 14 Jun 2023   AIC                            690.782
Time:                                      16:00:58   BIC                            725.630
Sample:                                           0   HQIC                           704.822
                                              - 242                                         
Covariance Type:                                opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
intercept            0.0097      0.010      0.961      0.337      -0.010       0.029
covid               -7.2406      0.737     -9.827      0.000      -8.685      -5.796
stringency_index    -0.0750      0.010     -7.871      0.000      -0.094      -0.056
samoa_travel        -0.0038      0.268     -0.014      0.989      -0.529       0.521
ma.L1               -0.6578      0.037    -17.626      0.000      -0.731      -0.585
ma.L2               -0.3261      0.067     -4.854      0.000      -0.458      -0.194
ar.S.L12            -0.3483      0.400     -0.871      0.384      -1.132       0.435
ar.S.L24             0.3509      0.118      2.975      0.003       0.120       0.582
ma.S.L12             0.6056      0.406      1.492      0.136      -0.190       1.401
sigma2               0.9498      0.045     21.045      0.000       0.861       1.038
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):             23312.56
Prob(Q):                              0.96   Prob(JB):                         0.00
Heteroskedasticity (H):              25.64   Skew:                             5.65
Prob(H) (two-sided):                  0.00   Kurtosis:                        49.84
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
../_images/e623aeed9e2756e5c55a0384198d8cbf6aa07dfe344f01cbf9b5feb4fa6a7866.png
STARTED TO CONDUCT GRID SEARCHING FOR SAMOA 

STARTED TO CONDUCT CROSS-VALIDATION FOR SAMOA 

Best Models from Cross-validation is  ARIMA(2,1,1)(0,1,1)[12] intercept 
MSE RMSE MAE SMAPE
naive 1.180276e+07 3435.514859 2313.560166 23.936179
mean 2.193805e+07 4683.807205 3367.534731 43.918693
seasonal naive 1.643507e+07 4054.018920 2003.082645 31.466659
SARIMAX 2.952049e+06 1718.152711 1020.766988 33.573105
date australia new_zealand new_caledonia other_pic europe north_america japan china other_countries total fileyear not_stated stringency_index covid vanuatu_flights vanuatu_hotel vanuatu_travel
0 2003-12-01 2938.0 580.0 646.0 136.0 201.0 88.0 42.0 0.0 103.0 4734.0 2007.0 0.0 0.0 0.0 0.000000 0.000000 0.000000
1 2004-01-01 2864.0 439.0 717.0 122.0 186.0 67.0 36.0 0.0 61.0 4492.0 2005.0 0.0 0.0 0.0 2.232933 3.102047 2.453441
2 2004-02-01 2158.0 194.0 416.0 282.0 229.0 82.0 34.0 0.0 93.0 3488.0 2005.0 0.0 0.0 0.0 0.000000 0.000000 0.000000
3 2004-03-01 2733.0 420.0 255.0 160.0 195.0 115.0 53.0 0.0 83.0 4014.0 2005.0 0.0 0.0 0.0 0.000000 0.000000 0.000000
4 2004-04-01 2377.0 438.0 469.0 206.0 225.0 115.0 45.0 0.0 80.0 3955.0 2005.0 0.0 0.0 0.0 0.000000 0.000000 1.694580
training size : 224, testing size : 0
THE BENCHMARK EVALUATION FOR VANUATU 

STARTED TO CONDUCT STEPWISE SEARCHING FOR VANUATU 

                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:                                  y   No. Observations:                  224
Model:             SARIMAX(1, 1, 1)x(1, 0, 1, 12)   Log Likelihood                -278.286
Date:                            Wed, 14 Jun 2023   AIC                            572.571
Time:                                    16:10:25   BIC                            599.829
Sample:                                         0   HQIC                           583.575
                                            - 224                                         
Covariance Type:                              opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
covid               -9.1747      0.708    -12.966      0.000     -10.562      -7.788
stringency_index    -0.0419      0.011     -3.973      0.000      -0.063      -0.021
vanuatu_travel       0.0298      0.213      0.140      0.888      -0.387       0.447
ar.L1                0.0483      0.102      0.475      0.635      -0.151       0.248
ma.L1               -0.8991      0.072    -12.570      0.000      -1.039      -0.759
ar.S.L12             0.9723      0.042     22.969      0.000       0.889       1.055
ma.S.L12            -0.8100      0.102     -7.956      0.000      -1.009      -0.610
sigma2               0.6688      0.034     19.531      0.000       0.602       0.736
===================================================================================
Ljung-Box (L1) (Q):                   0.02   Jarque-Bera (JB):             18394.33
Prob(Q):                              0.89   Prob(JB):                         0.00
Heteroskedasticity (H):              17.69   Skew:                             5.20
Prob(H) (two-sided):                  0.00   Kurtosis:                        46.26
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
../_images/20cc1845f0360d20f6eaab25d62abb0c104e734153181fc0dc2563de4a4acd4f.png
STARTED TO CONDUCT GRID SEARCHING FOR VANUATU 

STARTED TO CONDUCT CROSS-VALIDATION FOR VANUATU 

Best Models from Cross-validation is  ARIMA(1,0,1)(2,1,2)[12] intercept 
MSE RMSE MAE SMAPE
naive 2.915180e+06 1707.389827 1292.829596 20.537447
mean 1.125218e+07 3354.426412 2628.520727 47.767194
seasonal naive 8.396013e+06 2897.587484 1568.424107 34.617696
SARIMAX 1.238855e+06 1113.038657 795.607455 35.718702
date yacht all total ship stringency_index covid tonga_flights tonga_hotel tonga_travel
0 2010-01-01 4 3808 3158 646 0.0 0.0 1.118195 1.296399 0.212187
1 2010-02-01 5 2384 2379 0 0.0 0.0 0.811451 1.013375 0.480398
2 2010-03-01 5 3992 3134 853 0.0 0.0 0.872967 1.359485 0.233651
3 2010-04-01 30 5650 2818 2802 0.0 0.0 0.583649 1.142021 0.400232
4 2010-05-01 177 9150 3670 5303 0.0 0.0 0.906579 1.746620 0.749327
training size : 144, testing size : 0
THE BENCHMARK EVALUATION FOR TONGA 

STARTED TO CONDUCT STEPWISE SEARCHING FOR TONGA 

                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                                   y   No. Observations:                  144
Model:             SARIMAX(0, 0, 1)x(2, 0, [], 12)   Log Likelihood                -196.126
Date:                             Wed, 14 Jun 2023   AIC                            406.251
Time:                                     16:17:52   BIC                            427.040
Sample:                                          0   HQIC                           414.699
                                             - 144                                         
Covariance Type:                               opg                                         
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
covid               -4.4685      0.936     -4.775      0.000      -6.303      -2.634
stringency_index    -0.0572      0.020     -2.866      0.004      -0.096      -0.018
tonga_travel        -0.3096      0.880     -0.352      0.725      -2.034       1.414
ma.L1                0.2152      0.056      3.831      0.000       0.105       0.325
ar.S.L12             0.2147      0.054      3.988      0.000       0.109       0.320
ar.S.L24             0.2993      0.058      5.150      0.000       0.185       0.413
sigma2               0.8666      0.036     24.334      0.000       0.797       0.936
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):             10347.59
Prob(Q):                              0.96   Prob(JB):                         0.00
Heteroskedasticity (H):               6.03   Skew:                             4.58
Prob(H) (two-sided):                  0.00   Kurtosis:                        43.51
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
../_images/99ac3f3d294f63729539c4e7b77f91adf412168c51ed431f0c85a0c6537b1180.png
STARTED TO CONDUCT GRID SEARCHING FOR TONGA 

STARTED TO CONDUCT CROSS-VALIDATION FOR TONGA 

Best Models from Cross-validation is  ARIMA(0,0,1)(0,1,1)[12] intercept 
MSE RMSE MAE SMAPE
naive 2.543522e+06 1594.842251 1043.874126 35.445486
mean 4.529700e+06 2128.309258 1637.978395 53.528230
seasonal naive 4.760284e+06 2181.807619 1256.819444 53.730230
SARIMAX 7.153640e+05 845.791954 556.793463 30.118200