library("forecast"); library("ggplot2"); library("readxl"); library(tseries); theme_set(theme_light())
sales_data <- read_excel("GER_retail_sales.xlsx")
library(anomalize)
decomposed <- time_decompose(data = sales_data, target = SALES, message = FALSE)
anomaly <- anomalize(data = decomposed, target = remainder)
series_recomposed <- time_recompose(data = anomaly)
plot_anomalies(data = series_recomposed, time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)

No anomalies or outliers were detected, so we can proceed with the original data.
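
As a quick programmatic cross-check of the plot (a minimal sketch, assuming the standard anomaly column that anomalize() adds, with values "Yes"/"No"):

# Count observations flagged as anomalous; 0 would match the plot above
sum(series_recomposed$anomaly == "Yes")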

sales_ts <- ts(sales_data[, 2], start = 2000, frequency = 12)
sales_train <- window(sales_ts, start = c(2000, 1), end = c(2015, 12))
autoplot(sales_train) + xlab("Time (in months)") + ylab("Retail sales")

The plot immediately shows a clear seasonal pattern and no particular trend. The mean is centered around 42,500, and the series is clearly non-stationary.
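
We can support this impression with forecast's unit-root heuristics (a minimal sketch; nsdiffs() and ndiffs() estimate how many seasonal and ordinary differences are required):

nsdiffs(sales_train)  # suggested number of seasonal differences
ndiffs(sales_train)   # suggested number of ordinary differences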

transformed_sales <- log(sales_train)  # log-transform to stabilize the variance
ggtsdisplay(transformed_sales)

The ACF shows that many lags are insignificant, with the exception of lag 12 and its multiples, which is explained by the seasonality.

diff_sales <- diff(transformed_sales, 12)  # seasonal (lag-12) difference
ggtsdisplay(diff_sales)

ndiffs(diff_sales)
## [1] 1

The seasonally differenced series still appears to be non-stationary, and the ndiffs() function also tells us to difference once more.

diff_sales_again <- diff(diff_sales, 1)  # additional first difference
adf.test(diff_sales_again)
## Warning in adf.test(diff_sales_again): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_sales_again
## Dickey-Fuller = -7.4782, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary

The null hypothesis of a unit root is rejected, so we can now treat the series as stationary and begin the model identification.
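
As a complementary check (an addition on my part, not in the original analysis), the KPSS test from tseries takes stationarity as its null hypothesis, so a large p-value here would agree with the ADF result:

kpss.test(diff_sales_again)  # null hypothesis: level stationarity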

ggtsdisplay(diff_sales_again)

Guided by the ACF and PACF above, we will first try to fit an ARIMA(0,1,2)(0,1,1)[12] model.

fitArima_012_011 <- Arima(sales_train, order = c(0,1,2), seasonal = c(0,1,1), lambda = 0)  # lambda = 0 applies the log transform inside Arima
summary(fitArima_012_011)
## Series: sales_train 
## ARIMA(0,1,2)(0,1,1)[12] 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##           ma1     ma2     sma1
##       -1.0732  0.3158  -0.7072
## s.e.   0.0722  0.0748   0.0663
## 
## sigma^2 estimated as 0.0005322:  log likelihood=418.82
## AIC=-829.65   AICc=-829.42   BIC=-816.9
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE     MASE       ACF1
## Training set 35.55084 915.1288 719.2133 0.08012758 1.726788 0.743759 0.04857844
residuals(object = fitArima_012_011) %>% ggtsdisplay(main = "Arima(0,1,2)(0,1,1)[12]", theme = theme_light())

The residual ACF and PACF show significant spikes, indicating that further terms should be included in the model. Specifically, lag 3 is significant in both plots, and the PACF has an additional significant spike at lag 4.
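
We can quantify this leftover autocorrelation with a Ljung-Box test on the residuals (a sketch; the lag of 24 and fitdf = 3, one degree of freedom per estimated ma1, ma2 and sma1 coefficient, are my choices):

Box.test(residuals(fitArima_012_011), lag = 24, type = "Ljung-Box", fitdf = 3)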

Next we’ll try to fit an ARIMA(1,1,2)(0,1,1)[12], as the ACF offers no other substantial information.

fitArima_112_011 <- Arima(sales_train, order = c(1,1,2), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_112_011)
## Series: sales_train 
## ARIMA(1,1,2)(0,1,1)[12] 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##          ar1      ma1     ma2     sma1
##       0.1219  -1.1719  0.3909  -0.7070
## s.e.  0.1844   0.1616  0.1278   0.0671
## 
## sigma^2 estimated as 0.0005338:  log likelihood=419.06
## AIC=-828.13   AICc=-827.78   BIC=-812.19
## 
## Training set error measures:
##                    ME     RMSE      MAE     MPE     MAPE      MASE       ACF1
## Training set 34.00523 913.7639 719.2714 0.07571 1.727659 0.7438191 0.03669854
residuals(fitArima_112_011) %>% ggtsdisplay(main = "Arima(1,1,2)(0,1,1)[12]", theme = theme_light()) 

The ar1 coefficient is insignificant (0.1219 with s.e. 0.1844) and the AICc worsens slightly, so we increase the AR order instead. Note that Arima() cannot fit a drift term when d + D > 1, so include.constant would have no effect here and is omitted.

fitArima_212_011 <- Arima(sales_train, order = c(2,1,2), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_212_011)
## Series: sales_train 
## ARIMA(2,1,2)(0,1,1)[12] 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##           ar1      ar2     ma1      ma2     sma1
##       -0.9352  -0.5169  0.0038  -0.3948  -0.6838
## s.e.   0.1162   0.0780  0.1292   0.1239   0.0672
## 
## sigma^2 estimated as 0.0004723:  log likelihood=430.91
## AIC=-849.83   AICc=-849.34   BIC=-830.7
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 32.63036 850.3358 656.8036 0.07080255 1.584083 0.6792194
##                    ACF1
## Training set 0.02353749
residuals(fitArima_212_011) %>% ggtsdisplay(main = "Arima(2,1,2)(0,1,1)[12]", theme = theme_light())

The ARIMA(2,1,2)(0,1,1)[12] fit improves the AICc to -849.34. To check whether one more AR or MA term helps, we also fit ARIMA(3,1,2)(0,1,1)[12] and ARIMA(2,1,3)(0,1,1)[12].

fitArima_312_011 <- Arima(sales_train, order = c(3,1,2), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_312_011)
## Series: sales_train 
## ARIMA(3,1,2)(0,1,1)[12] 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##           ar1      ar2      ar3     ma1      ma2     sma1
##       -1.0809  -0.6393  -0.1030  0.1413  -0.4153  -0.6729
## s.e.   0.2514   0.2337   0.1782  0.2387   0.1015   0.0711
## 
## sigma^2 estimated as 0.0004751:  log likelihood=431.05
## AIC=-848.11   AICc=-847.45   BIC=-825.8
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE    MAPE      MASE       ACF1
## Training set 30.22619 850.4559 659.3693 0.06517316 1.58995 0.6818727 0.03196716
residuals(fitArima_312_011) %>% ggtsdisplay(main = "Arima(3,1,2)(0,1,1)[12]", theme = theme_light())

fitArima_213_011 <- Arima(sales_train, order = c(2,1,3), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_213_011)
## Series: sales_train 
## ARIMA(2,1,3)(0,1,1)[12] 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##           ar1      ar2      ma1      ma2     ma3     sma1
##       -0.8877  -0.4564  -0.0509  -0.4179  0.0742  -0.6737
## s.e.   0.1506   0.1462   0.1627   0.1159  0.1457   0.0713
## 
## sigma^2 estimated as 0.0004752:  log likelihood=431.04
## AIC=-848.07   AICc=-847.42   BIC=-825.76
## 
## Training set error measures:
##                   ME     RMSE     MAE        MPE    MAPE      MASE       ACF1
## Training set 30.0438 850.5609 659.196 0.06487023 1.58945 0.6816934 0.03069566
residuals(fitArima_213_011) %>% ggtsdisplay(main = "Arima(2,1,3)(0,1,1)[12]", theme = theme_light())

Both ARIMA(3,1,2)(0,1,1)[12] and ARIMA(2,1,3)(0,1,1)[12] produce marginally worse information criteria than the previously fitted ARIMA(2,1,2)(0,1,1)[12] (AICc -847.45 and -847.42 versus -849.34), and the added coefficients are insignificant.
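
For a compact side-by-side comparison (a sketch; forecast stores the AICc on each fitted model object as aicc):

sapply(list("ARIMA(0,1,2)(0,1,1)" = fitArima_012_011,
            "ARIMA(1,1,2)(0,1,1)" = fitArima_112_011,
            "ARIMA(2,1,2)(0,1,1)" = fitArima_212_011,
            "ARIMA(3,1,2)(0,1,1)" = fitArima_312_011,
            "ARIMA(2,1,3)(0,1,1)" = fitArima_213_011),
       function(fit) fit$aicc)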


Thus, the ARIMA(2,1,2)(0,1,1)[12] model is the best fit for the data among the candidates.
checkresiduals(fitArima_212_011)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(0,1,1)[12]
## Q* = 40.108, df = 19, p-value = 0.003168
## 
## Model df: 5.   Total lags used: 24

Note that the Ljung-Box test rejects the null of uncorrelated residuals (p ≈ 0.003), so some residual autocorrelation remains even in the best candidate. We therefore also validate the model out of sample.

Checking rolling forecasts for the model on the hold-out data:

# Passing model = fitArima_212_011 makes forecast() apply the coefficients
# estimated on the training set at each rolling origin instead of re-estimating them.
forecastarima <- function(x, h) { forecast(x, model = fitArima_212_011, h = h) }
sales_ts %>% tsCV(forecastfunction = forecastarima, h = 1) %>% window(start = 2016) -> testfce
mean(testfce^2, na.rm = TRUE) %>% sqrt() -> testrmse
print(testrmse)
## [1] 1081.64
# Seasonal naive benchmark
forecastSnaive <- function(x, h) { forecast(snaive(x), h = h) }
sales_ts %>% tsCV(forecastfunction = forecastSnaive, h = 1) %>% window(start = 2016) -> testfce
mean(testfce^2, na.rm = TRUE) %>% sqrt() -> testrmse
print(testrmse)
## [1] 1604.916
# Naive benchmark
forecastnaive <- function(x, h) { forecast(naive(x), h = h) }
sales_ts %>% tsCV(forecastfunction = forecastnaive, h = 1) %>% window(start = 2016) -> testfce
mean(testfce^2, na.rm = TRUE) %>% sqrt() -> testrmse
print(testrmse)
## [1] 4008.185

Since the ARIMA model's hold-out RMSE (1081.64) is well below both the seasonal naive (1604.92) and the naive (4008.19) benchmarks, we have evidence that the ARIMA model is a good fit for the data.
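
The three evaluations above repeat the same pipeline, so they can be folded into a small helper (a sketch reusing the forecast functions defined above):

# One-step hold-out RMSE for any forecast function, evaluated from 2016 onwards
holdout_rmse <- function(ffun) {
  e <- window(tsCV(sales_ts, forecastfunction = ffun, h = 1), start = 2016)
  sqrt(mean(e^2, na.rm = TRUE))
}
c(ARIMA = holdout_rmse(forecastarima),
  SNaive = holdout_rmse(forecastSnaive),
  Naive = holdout_rmse(forecastnaive))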


Finally, we refit the chosen model on the full series and forecast from April 2020 until December 2020:
sales_ts %>%
  Arima(order = c(2, 1, 2), seasonal = c(0, 1, 1), lambda = 0) %>%
  forecast(h = 9) %>%
  autoplot() + xlab("Year") + ylab("Sales")
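
To read off the numeric point forecasts and intervals rather than only the plot (a sketch; the same refit as above, stored in a variable):

fit_full <- Arima(sales_ts, order = c(2, 1, 2), seasonal = c(0, 1, 1), lambda = 0)
fc <- forecast(fit_full, h = 9)
print(fc)  # point forecasts with 80% and 95% intervals, back-transformed from the log scale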