# Exercise 4.1 # Attach package forecast library(forecast) # Read the data intel <- read.table("data/intel.txt", header = TRUE) sunspot <- read.table("data/sunspot.txt", header = TRUE) sales <- read.table("data/sales.txt", header = TRUE, row.names = 4) # Create ts objects intel_ts <- ts(intel$Intel_Close) sunspot_ts <- ts(sunspot$Spots, start = 1749) sales_ts <- ts(sales$Sales, start = 1970, frequency = 12) # Define some helper functions #' Plot autocorrelation and partial autocorrelation of time series #' #' @param data_ts Time series as a ts object. #' @param lagmax Provides value for lag.max argument of acf and pacf. plot_acf <- function(data_ts, lagmax = NULL) { par(mfrow = c(1, 2)) acf(data_ts, main = "ACF", lag.max = lagmax) pacf(data_ts, main = "PACF", lag.max = lagmax) par(mfrow = c(1, 1)) } #' Perform Ljung-Box test #' #' Calculates p-values for Ljung-Box test for lags {k+1, ..., n}, where k #' is the number of model parameters. #' #' @param model Object of class Arima, represents the fitted model. #' @param k Number of model parameters. #' @param n Sample size (length of the time series). #' #' @return Vector of p-values. box_test <- function(model, k, n) { pvalues <- c(rep(NA, n - k - 1)) for (i in 1:(n - k - 1)) { pvalues[i] <- Box.test(model$res, lag = i + k, fitdf = k, type = "Ljung-Box")$p.value } pvalues } #' Plot p-values from Ljung-box test for different lags #' #' @param pvalues Vector of p-values. #' @param alpha Significance level. #' @param k Number of model parameters. #' @param n Sample size (length of the time series). plot_pvalues <- function(pvalues, alpha, k, n) { plot((k + 1):(n - 1), pvalues, pch = 16, col = "midnightblue", ylim = c(0, max(pvalues)), xlab = "lag", ylab = "p-value") abline(h = alpha, lty = 2, lwd = 2) } #' Plot original time series and fitted model. #' #' @param data_ts Time series as ts object. #' @param model Fitted time series model as Arima object. #' plot_fit <- function(data_ts, model) { fit <- model$fitted plot(fit, type = "b", col = "blue", ylab = "Value", xlab = "Time", cex = 0.5, pch = 16, main = "") lines(data_ts, col = "red", type = "b", cex = 0.5, pch = 16) legend("topleft", legend = c("Time series", "Fit"), col = c("red", "blue"), lty = c(1, 1), cex = 0.5) } #' Plot original time series and s-step prediction. #' #' @param data_ts Time series as ts object. #' @param prediction S-step prediction as ts object. plot_pred <- function(data_ts, prediction, title = "", xlim = NULL, ylim = NULL) { plot(data_ts, col = "red", type = "b", cex = 0.5, pch = 16, ylab = "Value", xlab = "Time", main = title, xlim = xlim, ylim = ylim) lines(prediction, col = "blue", type = "b", cex = 0.5, pch = 16) legend("topleft", legend = c("Time series", "Prediction"), col = c("red", "blue"), lty = c(1, 1), cex = 0.5) } # Intel_Close # Plot time series plot(intel_ts) # ACF and PACF plot_acf(intel_ts) # Based on time series plot and ACF/PACF time series Intel_Close might be # stationary. Additionally, ACF decays exponentially and PACF cuts off after # lag 2. Thus we fit AR(2) model. model_intel <- Arima(intel_ts, order = c(2, 0, 0)) model_intel # Next, study if residuals are white noise # There are no significant spikes in ACF/PACF plots plot_acf(model_intel$residuals) # Ljung-Box test # There is no evidence against H_0 for any lag k+1, ..., n-1, where k is the # number of model parameters. k <- 2 n <- length(intel_ts) pvalues <- box_test(model_intel, k, n) pvalues which(pvalues > 0.05) + k plot_pvalues(pvalues, 0.05, k, n) # => Residuals seem to be white noise and thus model is satisfactory. # Fit seems good visually plot_fit(intel_ts, model_intel) # Goodness of prediction prediction <- forecast(Arima(intel_ts[1:16], order = c(2, 0, 0)), h = 4)$mean plot_pred(intel_ts, prediction) # The first 16 observations do not provide a good prediction for the last four # observations. One reason might be a small sample (short time series). # Spots # Time series plot and ACF/PACF plot(sunspot_ts) plot_acf(sunspot_ts, lagmax = 50) # Cyclical component of approximately 11 year. # => Time series Spots is not stationary. # PACF cuts off after lag 2. # => We try to fit AR(2) model anyway. # Fit model and study residuals model_spots1 <- Arima(sunspot_ts, order = c(2, 0, 0)) plot_acf(model_spots1$residuals) k <- 2 n <- length(sunspot_ts) pvalues <- box_test(model_spots1, k, n) plot_pvalues(pvalues, 0.05, k, n) # There are significant spikes in ACF/PACF plots and H_0 of Ljung-Box test is # rejected for multiple lags. # => AR(2) model is not satisfactory # Try to fit model with auto.arima model_spots2 <- auto.arima(sunspot_ts) model_spots2 # ARMA(3, 1) # Study residuals plot_acf(model_spots2$residuals) k <- 4 pvalues <- box_test(model_spots2, k, n) plot_pvalues(pvalues, 0.05, k, n) # There are significant spikes in ACF/PACF plots and H_0 of Ljung-Box test is # rejected for multiple lags. # => Neither ARMA(3, 1) model is satisfactory. # Time series Spots turned out to be hard to model, since the period # (around 11 years) of the cyclical component is not constant. # Study fits and goodness of predictions anyway visually # AR(2) fit plot_fit(sunspot_ts, model_spots1) # ARMA(3, 1) fit plot_fit(sunspot_ts, model_spots2) # => Both fits look good # AR(2) goodness of prediction prediction1 <- forecast(Arima(sunspot_ts[1:210], order = c(2, 0, 0)), h = 5)$mean prediction2 <- forecast(Arima(sunspot_ts[1:172], order = c(2, 0, 0)), h = 43)$mean prediction1 <- ts(prediction1, end = 1964) prediction2 <- ts(prediction2, end = 1964) par(mfrow = c(1, 2)) plot_pred(sunspot_ts, prediction1, title = "5-step prediction") plot_pred(sunspot_ts, prediction2, title = "43-step prediction") par(mfrow = c(1, 1)) # ARMA(3, 1) goodness of prediction prediction1 <- forecast(Arima(sunspot_ts[1:210], order = c(3, 0, 1)), h = 5)$mean prediction2 <- forecast(Arima(sunspot_ts[1:172], order = c(3, 0, 1)), h = 43)$mean prediction1 <- ts(prediction1, end = 1964) prediction2 <- ts(prediction2, end = 1964) par(mfrow = c(1, 2)) plot_pred(sunspot_ts, prediction1, title = "5-step prediction") plot_pred(sunspot_ts, prediction2, title = "43-step prediction") par(mfrow = c(1, 1)) # For both models short-term predictions look good but long-term predictions # do not. # It is no surprise that models are not suited for long term predictions, # since they were labeled as unsatisfactory by the earlier diagnostics. # Sales # Time series plot and ACF/PACF of stationarized time series # (remember homework 3) plot(sales_ts) plot_acf(diff(diff(log(sales_ts), lag = 12)), lagmax = 50) # Based on ACF and PACF one possible model could be # SARIMA(2,1,0)(1,1,0)_[12] after log transformation. # We can have log transformation before differencing by setting lambda = 0 # in function Arima model_sales <- Arima(sales_ts, order = c(2, 1, 0), seasonal = c(1, 1, 0), lambda = 0) model_sales # Are residuals white noise? plot_acf(model_sales$residuals, lagmax = 50) n <- length(sales_ts) k <- 3 pvalues <- box_test(model_sales, k, n) plot_pvalues(pvalues, 0.05, k, n) # There are significant spikes in ACF/PACF plots and H_0 of Ljung-Box test is # rejected for multiple lags. # => SARIMA(2,1,0)(1,1,0)_[12] model is satisfactory. # Fit look good visually anyway plot_fit(sales_ts, model_sales) # So do the predictions sales_ts_pred <- ts(sales_ts[1:115], start = 1970, frequency = 12) model_sales_pred <- Arima(sales_ts_pred, order = c(2, 1, 0), seasonal = c(1, 1, 0), lambda = 0) prediction <- forecast(model_sales_pred, h = 29)$mean plot_pred(sales_ts, prediction) # Also time series Sales turned out to be hard to model. # It is quite common that in practice, it is hard to find fitted models # with all model assumptions satisfied. However, something is better than # nothing. Even in this case, predictions seem visually good even though # residual are not white noise. # Remember, also regarding Exercise 4.2, that there is no single "correct" # model. Many different models can be sufficient. However, you have to be # able to justify your choices of model construction.