# Exercise 4.1

# Attach package forecast
library(forecast)

# Read the data
intel <- read.table("data/intel.txt", header = TRUE)
sunspot <- read.table("data/sunspot.txt", header = TRUE)
sales <- read.table("data/sales.txt", header = TRUE, row.names = 4)

# Create ts objects
intel_ts <- ts(intel$Intel_Close)
sunspot_ts <- ts(sunspot$Spots, start = 1749)
sales_ts <- ts(sales$Sales, start = 1970, frequency = 12)

# Define some helper functions

#' Plot autocorrelation and partial autocorrelation of time series
#'
#' @param data_ts Time series as a ts object.
#' @param lagmax Provides value for lag.max argument of acf and pacf.
plot_acf <- function(data_ts, lagmax = NULL) {
  par(mfrow = c(1, 2))
  acf(data_ts, main = "ACF", lag.max = lagmax)
  pacf(data_ts, main = "PACF", lag.max = lagmax)
  par(mfrow = c(1, 1))
}

#' Perform Ljung-Box test
#'
#' Calculates p-values for Ljung-Box test for lags {k+1, ..., n}, where k
#' is the number of model parameters.
#'
#' @param model Object of class Arima, represents the fitted model.
#' @param k Number of model parameters.
#' @param n Sample size (length of the time series).
#'
#' @return Vector of p-values.
box_test <- function(model, k, n) {
  pvalues <- c(rep(NA, n - k - 1))
  for (i in 1:(n - k - 1)) {
    pvalues[i] <- Box.test(model$res, lag = i + k, fitdf = k,
                           type = "Ljung-Box")$p.value
  }
  pvalues
}

#' Plot p-values from Ljung-box test for different lags
#'
#' @param pvalues Vector of p-values.
#' @param alpha Significance level.
#' @param k Number of model parameters.
#' @param n Sample size (length of the time series).
plot_pvalues <- function(pvalues, alpha, k, n) {
  plot((k + 1):(n - 1), pvalues, pch = 16, col = "midnightblue",
       ylim = c(0, max(pvalues)), xlab = "lag", ylab = "p-value")
  abline(h = alpha, lty = 2, lwd = 2)
}

#' Plot original time series and fitted model.
#'
#' @param data_ts Time series as ts object.
#' @param model Fitted time series model as Arima object.
#'
plot_fit <- function(data_ts, model) {
  fit <- model$fitted
  plot(fit, type = "b", col = "blue", ylab = "Value", xlab = "Time", cex = 0.5,
       pch = 16, main = "")
  lines(data_ts, col = "red", type = "b", cex = 0.5, pch = 16)
  legend("topleft", legend = c("Time series", "Fit"), col = c("red", "blue"),
         lty = c(1, 1), cex = 0.5)
}

#' Plot original time series and s-step prediction.
#'
#' @param data_ts Time series as ts object.
#' @param prediction S-step prediction as ts object.
plot_pred <- function(data_ts, prediction, title = "", xlim = NULL,
                      ylim = NULL) {
  plot(data_ts, col = "red", type = "b", cex = 0.5, pch = 16, ylab = "Value",
       xlab = "Time", main = title, xlim = xlim, ylim = ylim)
  lines(prediction, col = "blue", type = "b", cex = 0.5, pch = 16)
  legend("topleft", legend = c("Time series", "Prediction"),
         col = c("red", "blue"), lty = c(1, 1), cex = 0.5)
}

# Intel_Close

# Plot time series
plot(intel_ts)

# ACF and PACF
plot_acf(intel_ts)

# Based on time series plot and ACF/PACF time series Intel_Close might be
# stationary. Additionally, ACF decays exponentially and PACF cuts off after
# lag 2. Thus we fit AR(2) model.
model_intel <- Arima(intel_ts, order = c(2, 0, 0))
model_intel

# Next, study if residuals are white noise
# There are no significant spikes in ACF/PACF plots
plot_acf(model_intel$residuals)

# Ljung-Box test
# There is no evidence against H_0 for any lag k+1, ..., n-1, where k is the
# number of model parameters.
k <- 2
n <- length(intel_ts)
pvalues <- box_test(model_intel, k, n)
pvalues
which(pvalues > 0.05) + k
plot_pvalues(pvalues, 0.05, k, n)

# => Residuals seem to be white noise and thus model is satisfactory.

# Fit seems good visually
plot_fit(intel_ts, model_intel)

# Goodness of prediction
prediction <- forecast(Arima(intel_ts[1:16], order = c(2, 0, 0)), h = 4)$mean
plot_pred(intel_ts, prediction)

# The first 16 observations do not provide a good prediction for the last four
# observations. One reason might be a small sample (short time series).

# Spots

# Time series plot and ACF/PACF
plot(sunspot_ts)
plot_acf(sunspot_ts, lagmax = 50)

# Cyclical component of approximately 11 year.
# => Time series Spots is not stationary.

# PACF cuts off after lag 2.
# => We try to fit AR(2) model anyway.

# Fit model and study residuals
model_spots1 <- Arima(sunspot_ts, order = c(2, 0, 0))
plot_acf(model_spots1$residuals)

k <- 2
n <- length(sunspot_ts)
pvalues <- box_test(model_spots1, k, n)
plot_pvalues(pvalues, 0.05, k, n)

# There are significant spikes in ACF/PACF plots and H_0 of Ljung-Box test is
# rejected for multiple lags.
# => AR(2) model is not satisfactory

# Try to fit model with auto.arima
model_spots2 <- auto.arima(sunspot_ts)
model_spots2 # ARMA(3, 1)

# Study residuals
plot_acf(model_spots2$residuals)

k <- 4
pvalues <- box_test(model_spots2, k, n)
plot_pvalues(pvalues, 0.05, k, n)

# There are significant spikes in ACF/PACF plots and H_0 of Ljung-Box test is
# rejected for multiple lags.
# => Neither ARMA(3, 1) model is satisfactory.

# Time series Spots turned out to be hard to model, since the period
# (around 11 years) of the cyclical component is not constant.

# Study fits and goodness of predictions anyway visually

# AR(2) fit
plot_fit(sunspot_ts, model_spots1)

# ARMA(3, 1) fit
plot_fit(sunspot_ts, model_spots2)

# => Both fits look good

# AR(2) goodness of prediction
prediction1 <- forecast(Arima(sunspot_ts[1:210], order = c(2, 0, 0)),
                        h = 5)$mean
prediction2 <- forecast(Arima(sunspot_ts[1:172], order = c(2, 0, 0)),
                        h = 43)$mean

prediction1 <- ts(prediction1, end = 1964)
prediction2 <- ts(prediction2, end = 1964)

par(mfrow = c(1, 2))
plot_pred(sunspot_ts, prediction1, title = "5-step prediction")
plot_pred(sunspot_ts, prediction2, title = "43-step prediction")
par(mfrow = c(1, 1))

# ARMA(3, 1) goodness of prediction
prediction1 <- forecast(Arima(sunspot_ts[1:210], order = c(3, 0, 1)),
                        h = 5)$mean
prediction2 <- forecast(Arima(sunspot_ts[1:172], order = c(3, 0, 1)),
                        h = 43)$mean

prediction1 <- ts(prediction1, end = 1964)
prediction2 <- ts(prediction2, end = 1964)

par(mfrow = c(1, 2))
plot_pred(sunspot_ts, prediction1, title = "5-step prediction")
plot_pred(sunspot_ts, prediction2, title = "43-step prediction")
par(mfrow = c(1, 1))

# For both models short-term predictions look good but long-term predictions
# do not.

# It is no surprise that models are not suited for long term predictions,
# since they were labeled as unsatisfactory by the earlier diagnostics.

# Sales

# Time series plot and ACF/PACF of stationarized time series
# (remember homework 3)
plot(sales_ts)
plot_acf(diff(diff(log(sales_ts), lag = 12)), lagmax = 50)

# Based on ACF and PACF one possible model could be
# SARIMA(2,1,0)(1,1,0)_[12] after log transformation.

# We can have log transformation before differencing by setting lambda = 0
# in function Arima
model_sales <- Arima(sales_ts, order = c(2, 1, 0), seasonal = c(1, 1, 0),
                     lambda = 0)
model_sales

# Are residuals white noise?
plot_acf(model_sales$residuals, lagmax = 50)

n <- length(sales_ts)
k <- 3
pvalues <- box_test(model_sales, k, n)
plot_pvalues(pvalues, 0.05, k, n)

# There are significant spikes in ACF/PACF plots and H_0 of Ljung-Box test is
# rejected for multiple lags.
# => SARIMA(2,1,0)(1,1,0)_[12] model is satisfactory.

# Fit look good visually anyway
plot_fit(sales_ts, model_sales)

# So do the predictions
sales_ts_pred <- ts(sales_ts[1:115], start = 1970, frequency = 12)
model_sales_pred <- Arima(sales_ts_pred, order = c(2, 1, 0),
                          seasonal = c(1, 1, 0), lambda = 0)
prediction <- forecast(model_sales_pred, h = 29)$mean
plot_pred(sales_ts, prediction)

# Also time series Sales turned out to be hard to model.
# It is quite common that in practice, it is hard to find fitted models
# with all model assumptions satisfied. However, something is better than
# nothing. Even in this case, predictions seem visually good even though
# residual are not white noise.

# Remember, also regarding Exercise 4.2, that there is no single "correct"
# model. Many different models can be sufficient. However, you have to be
# able to justify your choices of model construction.