setwd("~/School/Prediction/Week_4") library(forecast) intel <- read.table("data/intel.txt", header = TRUE) sunspot <- read.table("data/sunspot.txt", header = TRUE) sales <- read.table("data/sales.txt", header = TRUE, row.names = 4) intel_ts <- ts(intel$Intel_Close) sunspot_ts <- ts(sunspot$Spots, start = 1749) sales_ts <- ts(sales$Sales, start = 1970, frequency = 12) ts_plotter <- function(ts, lag = 10) { par(mfrow = c(2,2)) plot(ts) acf(ts, lag = lag) pacf(ts, lag = lag) spectr <- spectrum(ts, plot = F) periods <- 1/spectr$freq plot(periods, spectr$spec, xlab = 'Period', ylab = 'Spectral density') sprintf("Period with maximum spectral density is %.1f", periods[which.max(spectr$spec)]) } ljung_box_p_vals <- function(series, n_params) { sapply(seq(n_params + 1, length(series) - 1, 1), function(lag) Box.test(series, lag, fitdf = n_params, type = 'Ljung')$p.value) } plot_model <- function(data_ts, model_vals) { plot(model_vals, type = "b", col = "blue", ylab = "Value", xlab = "Time", cex = 0.5, pch = 16, main = "") lines(data_ts, col = "red", type = "b", cex = 0.5, pch = 16) legend("topleft", legend = c("Time series", "Model"), col = c("red", "blue"), lty = c(1, 1), cex = 0.5) } plot_pred <- function(data_ts, prediction, title = "", xlim = NULL, ylim = NULL) { plot(data_ts, col = "red", type = "b", cex = 0.5, pch = 16, ylab = "Value", xlab = "Time", main = title, xlim = xlim, ylim = ylim) lines(prediction, col = "blue", type = "b", cex = 0.5, pch = 16) legend("topleft", legend = c("Time series", "Prediction"), col = c("red", "blue"), lty = c(1, 1), cex = 0.5) } # Box-Jenkins method for SARIMA models # A workflow/framework for SARIMA modeling # (1) Model identification # 1.1 Transform the time series to a stationary one with differencing # and other transformations # 1.2 Identify the orders of the SARMA process using the correlation plots # (2) Model estimation # Estimate the identified SARMA model using e.g. maximum likelihood # (3) Model diagnostics # The residual process of a SARIMA process should be WN(0, Sigma^2) # Thus we analyze the estimated residual process by looking at # the autocorrelation functions and performing statistical # testing using e.g. Ljung-Box. # The model is sufficient if autocorrelation is not present # in the estimated residual process # Otherwise, return to step (1) # One can also check how well the model predicts future observations # by predicting the tail of the process. This can be a more practical # performance test than hypothesis testing. # 4.1 ts_plotter(intel_ts) # The autocorrelation functions decay quite quickly -> the series could # be close to stationarity. One reasonable ARMA process could be # AR(2) since the PACF vanishes after two lags. ?Arima # order takes in the pure orders of the process i.e. ARIMA # seasonal takes in the seasonal orders of the process i.e. SARIMA and # the period, by default period is equal to frequency intel_ar_2 <- Arima(intel_ts, order = c(2,0,0)) summary(intel_ar_2) ts_plotter(intel_ar_2$residuals) # Seem to be uncorrelated # Since we don't really care about individual autocorrelations, # we can obtain better sensitivity by using the Ljung-Box test # H_0: The first k autocorrelations are zero, H_1: There exists a # non-zero autocorrelation among the first k. 
intel_ar_2_box <- ljung_box_p_vals(intel_ar_2$residuals, 2)
intel_ar_2_box
which(intel_ar_2_box < 0.05)
# None are significant at the 5 % level -> the model is sufficient.

plot_model(intel_ts, intel_ar_2$fitted, ylim = c(60, 70))
# The fit looks good, but how well does the model predict?

?forecast
pred_model_intel <- Arima(intel_ts[1:16], order = c(2, 0, 0))
plot_pred(intel_ts, forecast(pred_model_intel, h = 4)$mean)
# The predictions start to deviate substantially from the data.
# Since the series is short, the AR(2) model is probably overfitting somewhat.

ts_plotter(sunspot_ts, lag = 50)
# Notably slow decay in the ACF with oscillations; the period length is
# around 11 years.
ts_plotter(diff(sunspot_ts, lag = 11), lag = 50)
# Still pretty bad since the season length is unstable. Let's try AR(2) with
# an explicit 11-year period anyway (seasonal differencing would correspond
# to seasonal order = c(0, 1, 0)).
sunspot_ar_2 <- Arima(sunspot_ts, order = c(2, 0, 0),
                      seasonal = list(order = c(0, 0, 0), period = 11))
summary(sunspot_ar_2)
ts_plotter(sunspot_ar_2$residuals, lag = 50)
# Some individual lags are significant.
sunspot_ar_2_box <- ljung_box_p_vals(sunspot_ar_2$residuals, 2)
which(sunspot_ar_2_box < 0.05) + 2

# Automatic search: greedily minimize AIC.
?auto.arima
sunspot_auto <- auto.arima(sunspot_ts)
summary(sunspot_auto)
ts_plotter(sunspot_auto$residuals, lag = 50)
# Essentially identical to the AR(2) result. Thus one should not trust the
# automatic search blindly, since it can produce unnecessarily complicated
# models that are only marginally better.
sunspot_auto_box <- ljung_box_p_vals(sunspot_auto$residuals, 4)
which(sunspot_auto_box < 0.05) + 4
# Again, the null is rejected for many lags.
# In this case, the instability of the period length makes ARIMA models
# unsuitable.

plot_model(sunspot_ts, sunspot_ar_2$fitted, ylim = c(0, 200))
# Fitted values are consistent, but what about predictions?
n_step <- 43
pred_model_sunspot <- Arima(sunspot_ts[1:(215 - n_step)], order = c(2, 0, 0))
plot_pred(sunspot_ts,
          ts(forecast(pred_model_sunspot, h = n_step)$mean, end = 1964))
# Few-step predictions are okay, but long-horizon predictions decay to a
# constant (the mean of the process). This is characteristic of ARIMA models,
# so they can't really be used for long-term forecasts.

ts_plotter(sales_ts, lag = 50)
# This series was more stationary after the transformations shown in the
# previous HW.
ts_plotter(diff(diff(log(sales_ts), lag = 12)), lag = 50)
# AR(2)-style behavior at the start of the PACF, perhaps SAR(1) as well.
# lambda = 0 -> log transform before differencing.
sales_model <- Arima(sales_ts, order = c(2, 1, 0), seasonal = c(1, 1, 0),
                     lambda = 0)
ts_plotter(sales_model$residuals, lag = 50)
sales_model_box <- ljung_box_p_vals(sales_model$residuals, 3)
which(sales_model_box < 0.05) + 3
# Not satisfactory.
plot_model(sales_ts, sales_model$fitted)
n_step <- 29
pred_model_sales <- Arima(ts(sales_ts[1:(144 - n_step)], start = 1970,
                             frequency = 12),
                          order = c(2, 1, 0), seasonal = c(1, 1, 0),
                          lambda = 0)
plot_pred(sales_ts, forecast(pred_model_sales, h = n_step)$mean)
# The fit and the predictions are quite good.

# HW 4.2
# Instead of trying to find the ideal model, think about plausible models
# according to the correlation plots. Perhaps you could select a couple of
# them and compare them using e.g. AIC.
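# A minimal sketch of such a comparison (the candidate orders below are
# illustrative assumptions suggested for demonstration, not the intended
# answer): fit a few plausible SARIMA specifications and compare their AIC
# values. Smaller is better, but prefer the simpler model when the
# difference is marginal, and always check the residual diagnostics too.
candidates <- list(
  c(2, 1, 0),   # AR(2) after differencing
  c(0, 1, 1),   # MA(1) after differencing
  c(1, 1, 1)    # mixed ARMA(1, 1) after differencing
)
candidate_fits <- lapply(candidates, function(ord)
  Arima(sales_ts, order = ord, seasonal = c(1, 1, 0), lambda = 0))
sapply(candidate_fits, function(fit) fit$aic)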