setwd("~/School/Prediction/Week_4") library(forecast) intel <- read.table("data/intel.txt", header = TRUE) sunspot <- read.table("data/sunspot.txt", header = TRUE) sales <- read.table("data/sales.txt", header = TRUE, row.names = 4) intel_ts <- ts(intel$Intel_Close) sunspot_ts <- ts(sunspot$Spots, start = 1749) sales_ts <- ts(sales$Sales, start = 1970, frequency = 12) ts_plotter <- function(ts, lag = 10) { par(mfrow = c(2,2)) plot(ts) acf(ts, lag = lag) pacf(ts, lag = lag) spectr <- spectrum(ts, plot = F) periods <- 1/spectr$freq plot(periods, spectr$spec, xlab = 'Period', ylab = 'Spectral density') sprintf("Period with maximum spectral density is %.1f", periods[which.max(spectr$spec)]) } ljung_box_p_vals <- function(series, n_params) { sapply(seq(n_params + 1, length(series) - 1, 1), function(lag) Box.test(series, lag, fitdf = n_params, type = 'Ljung')$p.value) } plot_model <- function(data_ts, model_vals) { plot(model_vals, type = "b", col = "blue", ylab = "Value", xlab = "Time", cex = 0.5, pch = 16, main = "") lines(data_ts, col = "red", type = "b", cex = 0.5, pch = 16) legend("topleft", legend = c("Time series", "Model"), col = c("red", "blue"), lty = c(1, 1), cex = 0.5) } plot_pred <- function(data_ts, prediction, title = "", xlim = NULL, ylim = NULL) { plot(data_ts, col = "red", type = "b", cex = 0.5, pch = 16, ylab = "Value", xlab = "Time", main = title, xlim = xlim, ylim = ylim) lines(prediction, col = "blue", type = "b", cex = 0.5, pch = 16) legend("topleft", legend = c("Time series", "Prediction"), col = c("red", "blue"), lty = c(1, 1), cex = 0.5) } # Box-Jenkins method for SARIMA models # A workflow/framework for SARIMA modeling # (1) Model identification # 1.1 Transform the time series to a stationary one with differencing # and other transformations # 1.2 Identify the orders of the SARMA process using the correlation plots # (2) Model estimation # Estimate the identified SARMA model using e.g. maximum likelihood # (3) Model diagnostics # The residual process of a SARIMA process should be WN(0, Sigma^2) # Thus we analyze the estimated residual process by looking at # the autocorrelation functions and performing statistical # testing using e.g. Ljung-Box. # The model is sufficient if autocorrelation is not present # in the estimated residual process # Otherwise, return to step (1) # One can also check how well the model predicts future observations # by predicting the tail of the process. This can be a more practical # performance test than hypothesis testing. # 4.1 ts_plotter(intel_ts) # The autocorrelation functions decay quite quickly -> the series could # be close to stationarity. One reasonable ARMA process could be # AR(2) since the PACF vanishes after two lags. ?Arima # order takes in the pure orders of the process i.e. ARIMA # seasonal takes in the seasonal orders of the process i.e. SARIMA and # the period, by default period is equal to frequency intel_ar_2 <- Arima(intel_ts, order = c(2,0,0)) summary(intel_ar_2) ts_plotter(intel_ar_2$residuals) # Seem to be uncorrelated # Since we don't really care about individual autocorrelations, # we can obtain better sensitivity by using the Ljung-Box test # H_0: The first k autocorrelations are zero, H_1: There exists a # non-zero autocorrelation among the first k. 
intel_ar_2_box <- ljung_box_p_vals(intel_ar_2$residuals, 2)
intel_ar_2_box
which(intel_ar_2_box < 0.05)
# None are significant at the 5 % level -> the model is sufficient.

plot_model(intel_ts, intel_ar_2$fitted, ylim = c(60, 70))
# The fit looks good, but how well does the model predict?

?forecast
pred_model_intel <- Arima(intel_ts[1:16], order = c(2, 0, 0))
plot_pred(intel_ts, forecast(pred_model_intel, h = 4)$mean)
# The predictions start to deviate substantially from the data.
# Since the series is short, the AR(2) model is probably overfitting somewhat.

ts_plotter(sunspot_ts, lag = 50)
# Notably slow decay in the ACF with oscillations; the period length is
# around 11 years.
ts_plotter(diff(sunspot_ts, lag = 11), lag = 50)
# Still pretty bad since the season length is unstable. Let's try AR(2) with
# an explicit 11-year period anyway (seasonal differencing would correspond
# to seasonal order = c(0, 1, 0)).
sunspot_ar_2 <- Arima(sunspot_ts, order = c(2, 0, 0),
                      seasonal = list(order = c(0, 0, 0), period = 11))
summary(sunspot_ar_2)
ts_plotter(sunspot_ar_2$residuals, lag = 50)
# Some individual lags are significant.
sunspot_ar_2_box <- ljung_box_p_vals(sunspot_ar_2$residuals, 2)
which(sunspot_ar_2_box < 0.05) + 2

# Automatic search: greedily minimize AIC.
?auto.arima
sunspot_auto <- auto.arima(sunspot_ts)
summary(sunspot_auto)
ts_plotter(sunspot_auto$residuals, lag = 50)
# Essentially identical to the AR(2) result. Thus one should not trust the
# automatic search blindly, since it can produce unnecessarily complicated
# models that are only marginally better.
sunspot_auto_box <- ljung_box_p_vals(sunspot_auto$residuals, 4)
which(sunspot_auto_box < 0.05) + 4
# Again, the null is rejected for many lags.
# In this case, the instability of the period length makes ARIMA models
# unsuitable.

plot_model(sunspot_ts, sunspot_ar_2$fitted, ylim = c(0, 200))
# Fitted values are consistent, but what about predictions?
n_step <- 43
pred_model_sunspot <- Arima(sunspot_ts[1:(215 - n_step)], order = c(2, 0, 0))
plot_pred(sunspot_ts,
          ts(forecast(pred_model_sunspot, h = n_step)$mean, end = 1964))
# Few-step predictions are okay, but long-horizon predictions decay to a
# constant (the mean of the process). This is characteristic of ARIMA models,
# so they can't really be used for long-term forecasts.

ts_plotter(sales_ts, lag = 50)
# This series was more stationary after the transformations shown in the
# previous HW.
ts_plotter(diff(diff(log(sales_ts), lag = 12)), lag = 50)
# AR(2)-style behavior at the start of the PACF, perhaps SAR(1) as well.
# lambda = 0 -> log transform before differencing.
sales_model <- Arima(sales_ts, order = c(2, 1, 0), seasonal = c(1, 1, 0),
                     lambda = 0)
ts_plotter(sales_model$residuals, lag = 50)
sales_model_box <- ljung_box_p_vals(sales_model$residuals, 3)
which(sales_model_box < 0.05) + 3
# Not satisfactory.
plot_model(sales_ts, sales_model$fitted)
n_step <- 29
pred_model_sales <- Arima(ts(sales_ts[1:(144 - n_step)], start = 1970,
                             frequency = 12),
                          order = c(2, 1, 0), seasonal = c(1, 1, 0),
                          lambda = 0)
plot_pred(sales_ts, forecast(pred_model_sales, h = n_step)$mean)
# The fit and the predictions are quite good.

# HW 4.2
# Instead of trying to find the ideal model, think about plausible models
# according to the correlation plots. Perhaps you could select a couple of
# them and compare them using e.g. AIC.
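# A minimal sketch of such a comparison (the candidate orders below are
# illustrative assumptions suggested for demonstration, not the intended
# answer): fit a few plausible SARIMA specifications and compare their AIC
# values. Smaller is better, but prefer the simpler model when the
# difference is marginal, and always check the residual diagnostics too.
candidates <- list(
  c(2, 1, 0),   # AR(2) after differencing
  c(0, 1, 1),   # MA(1) after differencing
  c(1, 1, 1)    # mixed ARMA(1, 1) after differencing
)
candidate_fits <- lapply(candidates, function(ord)
  Arima(sales_ts, order = ord, seasonal = c(1, 1, 0), lambda = 0))
sapply(candidate_fits, function(fit) fit$aic)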