library("forecast"); library("ggplot2"); library("readxl"); library(tseries); theme_set(theme_light())
sales_data <- read_excel("GER_retail_sales.xlsx")
library(anomalize)
decomposed <- time_decompose(data = sales_data, target = SALES, message = FALSE)
anomaly <- anomalize(data = decomposed, target = remainder)
series_recomposed <- time_recompose(data = anomaly)
plot_anomalies(data = series_recomposed, time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
No anomalies or outliers were detected, so we can proceed with the original data.
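As a quick numeric check of that visual conclusion (a sketch that assumes the standard anomalize output, where the recomposed tibble carries an anomaly column coded "Yes"/"No"), the flagged rows can be counted directly:
sum(series_recomposed$anomaly == "Yes")  # 0 would confirm that no observations were flagged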
sales_ts <- ts(sales_data[,2],start = 2000,frequency = 12)
sales_train <- window(sales_ts, start=c(2000,1),end=c(2015,12))
autoplot(sales_train) + xlab("Time (in months)") + ylab("Retail sales")
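A common next step is to stabilise the variance before modelling. As a cross-check that is not part of the original analysis, forecast::BoxCox.lambda() estimates the transformation parameter; a value close to 0 would support the log transform applied below.
# Estimate the Box-Cox parameter on the training set; lambda near 0 corresponds to a log transform
BoxCox.lambda(sales_train)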
transformed_sales <- log(sales_train)
ggtsdisplay(transformed_sales)
The ACF shows that most lags are insignificant, with the exception of lag 12 and its multiples, which is explained by the seasonality of the series.
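Before differencing by hand, nsdiffs() can be used as a cross-check (an addition, not in the original workflow) to confirm how many seasonal differences the series requires.
# Suggested number of seasonal differences for the log-transformed series
nsdiffs(transformed_sales)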
diff_sales <- diff(transformed_sales,12)
ggtsdisplay(diff_sales)
ndiffs(diff_sales)
## [1] 1
The seasonally differenced series still appears to be non-stationary, and the ndiffs() function also suggests differencing once more.
diff_sales_again <- diff(diff_sales, 1)
adf.test(diff_sales_again)
## Warning in adf.test(diff_sales_again): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff_sales_again
## Dickey-Fuller = -7.4782, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
The null hypothesis is rejected, so we can treat the differenced series as stationary and begin the modelling.
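As a complementary check (an addition using the already loaded tseries package), the KPSS test takes stationarity as its null hypothesis, so failing to reject it would agree with the ADF result.
# KPSS test: the null is (level) stationarity, so a large p-value is consistent with the ADF conclusion
kpss.test(diff_sales_again)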
ggtsdisplay(diff_sales_again)
Guided by these plots, we will first try to fit an ARIMA(0,1,2)(0,1,1)[12] model.
fitArima_012_011 <- Arima(sales_train, order = c(0,1,2), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_012_011)
## Series: sales_train
## ARIMA(0,1,2)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ma1 ma2 sma1
## -1.0732 0.3158 -0.7072
## s.e. 0.0722 0.0748 0.0663
##
## sigma^2 estimated as 0.0005322: log likelihood=418.82
## AIC=-829.65 AICc=-829.42 BIC=-816.9
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 35.55084 915.1288 719.2133 0.08012758 1.726788 0.743759 0.04857844
residuals(object = fitArima_012_011) %>% ggtsdisplay(main = "Arima(0,1,2)(0,1,1)[12]", theme = theme_light())
There are significant spikes in both the ACF and PACF, indicating that further terms should be included in the model. Specifically, lag 3 is significant in both plots, and the PACF also has a significant spike at lag 4.
Next we’ll try to fit an ARIMA(1,1,2)(0,1,1)[12], as the ACF offers no other substantial information.
fitArima_112_011 <- Arima(sales_train, order = c(1,1,2), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_112_011)
## Series: sales_train
## ARIMA(1,1,2)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ma1 ma2 sma1
## 0.1219 -1.1719 0.3909 -0.7070
## s.e. 0.1844 0.1616 0.1278 0.0671
##
## sigma^2 estimated as 0.0005338: log likelihood=419.06
## AIC=-828.13 AICc=-827.78 BIC=-812.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 34.00523 913.7639 719.2714 0.07571 1.727659 0.7438191 0.03669854
residuals(fitArima_112_011) %>% ggtsdisplay(main = "Arima(1,1,2)(0,1,1)[12]", theme = theme_light())
The AR(1) coefficient is not significantly different from zero and the information criteria do not improve, so next we increase the AR order and fit an ARIMA(2,1,2)(0,1,1)[12].
fitArima_212_011 <- Arima(sales_train, order = c(2,1,2), seasonal = c(0,1,1), lambda = 0, include.constant = TRUE)
summary(fitArima_212_011)
## Series: sales_train
## ARIMA(2,1,2)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.9352 -0.5169 0.0038 -0.3948 -0.6838
## s.e. 0.1162 0.0780 0.1292 0.1239 0.0672
##
## sigma^2 estimated as 0.0004723: log likelihood=430.91
## AIC=-849.83 AICc=-849.34 BIC=-830.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 32.63036 850.3358 656.8036 0.07080255 1.584083 0.6792194
## ACF1
## Training set 0.02353749
residuals(fitArima_212_011) %>% ggtsdisplay(main = "Arima(2,1,2)(0,1,1)[12]", theme = theme_light())
The ARIMA(2,1,2)(0,1,1)[12] clearly improves the information criteria. To check whether additional terms help further, we also fit ARIMA(3,1,2)(0,1,1)[12] and ARIMA(2,1,3)(0,1,1)[12].
fitArima_312_011 <- Arima(sales_train, order = c(3,1,2), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_312_011)
## Series: sales_train
## ARIMA(3,1,2)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 sma1
## -1.0809 -0.6393 -0.1030 0.1413 -0.4153 -0.6729
## s.e. 0.2514 0.2337 0.1782 0.2387 0.1015 0.0711
##
## sigma^2 estimated as 0.0004751: log likelihood=431.05
## AIC=-848.11 AICc=-847.45 BIC=-825.8
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 30.22619 850.4559 659.3693 0.06517316 1.58995 0.6818727 0.03196716
residuals(fitArima_312_011) %>% ggtsdisplay(main = "Arima(3,1,2)(0,1,1)[12]", theme = theme_light())
fitArima_213_011 <- Arima(sales_train, order = c(2,1,3), seasonal = c(0,1,1), lambda = 0)
summary(fitArima_213_011)
## Series: sales_train
## ARIMA(2,1,3)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 sma1
## -0.8877 -0.4564 -0.0509 -0.4179 0.0742 -0.6737
## s.e. 0.1506 0.1462 0.1627 0.1159 0.1457 0.0713
##
## sigma^2 estimated as 0.0004752: log likelihood=431.04
## AIC=-848.07 AICc=-847.42 BIC=-825.76
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 30.0438 850.5609 659.196 0.06487023 1.58945 0.6816934 0.03069566
residuals(fitArima_213_011) %>% ggtsdisplay(main = "Arima(2,1,3)(0,1,1)[12]", theme = theme_light())
Both ARIMA(3,1,2)(0,1,1)[12] and ARIMA(2,1,3)(0,1,1)[12] produce marginally worse information criteria than the previously fitted ARIMA(2,1,2)(0,1,1)[12], and their extra coefficients are not significant, so we keep the (2,1,2)(0,1,1) model.
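As a sanity check on this manual search (an addition, not part of the original model selection), auto.arima() can search the order space automatically; its pick can then be compared with the hand-selected ARIMA(2,1,2)(0,1,1)[12] on AICc.
# Automatic order selection on the same training data and log transform, for comparison only
fit_auto <- auto.arima(sales_train, lambda = 0)
summary(fit_auto)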
checkresiduals(fitArima_212_011)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(0,1,1)[12]
## Q* = 40.108, df = 19, p-value = 0.003168
##
## Model df: 5. Total lags used: 24
The Ljung-Box test still rejects the null hypothesis of uncorrelated residuals (p ≈ 0.003), so some residual autocorrelation remains. We now check rolling one-step forecasts for the model on the hold-out data.
forecastarima <- function(x, h){forecast(x, model=fitArima_212_011, h=h)}
sales_ts %>% tsCV(forecastfunction = forecastarima, h = 1) %>% window (start=2016) -> testfce
mean(testfce^2, na.rm = TRUE) %>% sqrt() -> testrmse
print(testrmse)
## [1] 1081.64
forecastSnaive <- function(x, h){forecast(snaive(x), h=h)}
sales_ts %>% tsCV(forecastfunction = forecastSnaive, h = 1) %>% window (start=2016) -> testfce
mean(testfce^2, na.rm = TRUE) %>% sqrt() -> testrmse
print(testrmse)
## [1] 1604.916
forecastnaive <- function(x, h){forecast(naive(x), h=h)}
sales_ts %>% tsCV(forecastfunction = forecastnaive, h = 1) %>% window (start=2016) -> testfce
mean(testfce^2, na.rm = TRUE) %>% sqrt() -> testrmse
print(testrmse)
## [1] 4008.185
Since the RMSE of the ARIMA model is lower than the RMSEs of both the naive and seasonal naive benchmarks, this is evidence that the ARIMA model is a good fit for the data.
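The three rolling-forecast RMSE computations above follow the same pattern; as a sketch (the helper name cv_rmse is mine), they could be wrapped in a small function:
# One-step rolling-forecast RMSE on the hold-out period for any forecast function
cv_rmse <- function(fcfun) {
  e <- window(tsCV(sales_ts, forecastfunction = fcfun, h = 1), start = 2016)
  sqrt(mean(e^2, na.rm = TRUE))
}
cv_rmse(forecastarima)   # ARIMA(2,1,2)(0,1,1)[12]
cv_rmse(forecastSnaive)  # seasonal naive benchmark
cv_rmse(forecastnaive)   # naive benchmark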
sales_ts %>% Arima(order=c(2,1,2),seasonal = c(0,1,1),lambda = 0) %>% forecast(h=9) %>% autoplot() + xlab('Year') + ylab('Sales')
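To inspect the point forecasts and prediction intervals behind this plot (an addition for illustration), the forecast object can be stored and printed:
# Refit on the full series and print the 9-month-ahead forecasts with 80% and 95% intervals
final_fit <- Arima(sales_ts, order = c(2,1,2), seasonal = c(0,1,1), lambda = 0)
final_fc <- forecast(final_fit, h = 9)
print(final_fc)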