setwd('~/Prediction/Week_4')
library(forecast)

INTEL <- read.table("INTEL.txt", header = T)
SUNSPOT <- read.table("SUNSPOT.txt", header = T, row.names = 1)
SALES <- read.table("SALES.txt", header = T)

Intel_Close <- ts(INTEL$Intel_Close)
Spots <- ts(SUNSPOT, start = 1749)
Sales <- ts(SALES$Sales, frequency = 12, start = 1970)

ts_plotter <- function(ts, lag_max) {
  par(mfrow = c(2,2))
  plot(ts, main = 'Run Plot')
  acf(ts, main = 'Autocorrelation', lag.max = lag_max)
  pacf(ts, main = 'Partial Autocorrelation', lag.max = lag_max)
  spec <- spectrum(ts, plot = F)
  plot(spec$freq, spec$spec, main = 'Spectrum', xlab = 'Frequency', ylab = 'Energy density')
}

# Box-Jenkins method: A three-stage approach to SARIMA(p,h,q)(P,H,Q)_s model selection
# (1) Model Identification: Determine the orders of the model
#     (1a) Determine the order of differencing h and the order of seasonal differencing H
#          (and the season length s) needed to stationarize the time series (SARIMA -> SARMA).
#          You might also need to consider other transformations, such as log or sqrt,
#          depending on the behavior of the trend and the variance.
#     (1b) Identify the orders p, q, P, Q.
# (2) Model Estimation: Estimate the parameters of the SARMA model.
#     We do this by using the maximum likelihood estimating equations of the normal model,
#     even if the residuals are not assumed to be normally distributed.
# (3) Model Diagnostics: Determine whether the estimated model is sufficient.
#     For SARMA processes, the residual process should be a white noise (WN) process.
#     WN process e_t:
#     (a) E[e_t] = 0 for all t
#     (b) Var[e_t] = sigma^2 for all t
#     (c) Cov[e_t, e_j] = 0 for all t != j
#     These can be analyzed by examining the estimated residuals the same way we examined the original time series.
#     We can also perform a significance test on the autocorrelations of the residual process; more about this later.
#     If there is substantial autocorrelation present in the residual process, some part of the model is
#     misspecified and we need to return to step (1) and identify a better candidate model.
#     The model is deemed sufficient if no autocorrelation is present in the estimated residual process.

# 4.1
ts_plotter(Intel_Close, lag_max = 19)
# The autocorrelation function seems to decay exponentially.
# The PACF vanishes after two lags.
# No seasonality is observed in the run plot or the spectrum.
# These observations indicate an autoregressive process of order 2 (an AR(2) process).
?arima
intel_ar_2 <- arima(Intel_Close, order = c(2,0,0))
summary(intel_ar_2)

# For diagnostics, we analyze the residual process.
ts_plotter(intel_ar_2$residuals, lag_max = 19)
# The autocorrelations are zero according to the individual tests (H_0: rho_k = 0, H_1: rho_k != 0)
# depicted by the blue lines.
# No periodicity is present.

# We can also use an aggregated test for the autocorrelations, the Ljung-Box test.
# The hypotheses are H_0: e_t ~ WN and H_1: e_t !~ WN, where the test statistic aggregates
# multiple sample autocorrelations.
# The idea is that pooling the statistics can (1) give us a test with higher power (the probability
# of detecting an effect if one exists) and (2) control the Type I error rate (the probability of
# rejecting a true null hypothesis) compared to testing all the autocorrelations individually,
# which inflates the Type I error rate. Also, we are not really interested in the individual
# autocorrelations, only in whether there exists one that is non-zero.
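# As a minimal sketch (not part of the original demo): the Ljung-Box statistic for the first m
# sample autocorrelations r_1, ..., r_m of a series of length n is
#   Q = n * (n + 2) * sum_{k=1}^{m} r_k^2 / (n - k),
# and under H_0 it is approximately chi^2 distributed with m - fitdf degrees of freedom, where
# fitdf is the number of estimated ARMA parameters. The helper below (hypothetical, for
# illustration only) recomputes the p-value by hand; it should agree with Box.test().
manual_ljung_box <- function(res, m, n_param) {
  n <- length(res)
  r <- acf(res, lag.max = m, plot = FALSE)$acf[-1]   # sample autocorrelations r_1, ..., r_m
  Q <- n * (n + 2) * sum(r^2 / (n - seq_len(m)))     # Ljung-Box statistic
  pchisq(Q, df = m - n_param, lower.tail = FALSE)    # p-value from the chi^2 approximation
}
# manual_ljung_box(intel_ar_2$residuals, 10, 2) should match
# Box.test(intel_ar_2$residuals, lag = 10, type = 'Ljung', fitdf = 2)$p.value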
?Box.test
ljung_box_tester <- function(res, n_param, max_lag) {
  lags <- seq(n_param + 1, max_lag)
  sapply(lags, function(lag) Box.test(res, lag = lag, type = 'Ljung', fitdf = n_param)$p.value)
}
ljung_box_tester(intel_ar_2$residuals, 2, 19)
# The null hypothesis is not rejected for any lag, thus it is reasonable to believe that the residual
# process is white noise.

fit <- fitted(intel_ar_2)
plot(fit, type = "b", col = "blue", ylim = c(60,68), ylab = "Price", xlab = "Time")
lines(Intel_Close, col = "red", type = "b")
legend(16, 68, legend = c("Observed", "Fitted"), col = c("red","blue"), lty = c(1,1), cex = 0.8)
# The fit does seem quite reasonable. To analyze the forecasting capacity of the model, we fit the
# model to the first 16 observations and use the forecast package.
intel_ar_2_pred <- arima(Intel_Close[1:16], order = c(2,0,0))
?forecast
# We consider how predictions are constructed later in this course.
intel_price_pred <- forecast(intel_ar_2_pred, h = 15, level = F)$mean
plot(Intel_Close, col = "red", type = "b", ylim = c(60,68), xlim = c(0,30), ylab = "Price", xlab = "Time")
lines(intel_price_pred, col = "blue", type = "b")
legend(16, 68, legend = c("Observed", "Predicted"), col = c("red","blue"), lty = c(1,1), cex = 0.8)
# While the previous diagnostics implied a sufficient fit, the predictions start to deviate rapidly
# from the observed values. Since the time series is relatively short, even a simple model can overfit,
# and the predictions can deviate substantially when we forecast too far into the future.
# Later in this course, we consider certain fundamental issues with ARMA models and long-term predictions.

ts_plotter(Spots, lag_max = 50)
# The autocorrelation seems to decay relatively slowly and clearly oscillates due to the seasonal
# component in the time series. The spectrum also identifies this seasonal component.
# The PACF seems to mostly vanish after lag 2, so it is at least in some sense similar to the
# Intel_Close data. For the sake of illustration, let's examine the AR(2) model here.
spots_ar_2 <- arima(Spots, order = c(2,0,0))
summary(spots_ar_2)

ts_plotter(spots_ar_2$residuals, lag_max = 50)
# The correlations corresponding to lags that are close to the period length seem to be statistically significant.
ljung_box_tester(spots_ar_2$residuals, 2, max_lag = 50) < 0.05
# The lags between 10 and 18 are statistically significant at the 5 % level.
# Thus AR(2) is not satisfactory.

# To find a suitable model, we experiment with an automatic model search method.
?auto.arima
spots_auto <- auto.arima(Spots) # stepwise = F, approximation = F
summary(spots_auto)
# A rather complicated model emerges, whereas the AR(2) actually has a slightly better AIC value.
# Always consider whether the criterion being optimized serves your goals; optimizing AIC might
# introduce undesirable uncertainty in the model estimates.
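# As a quick check of the AIC claim above (a sketch, not part of the original demo): AIC() works for
# both arima() and auto.arima() fits, so the criteria can be compared directly.
AIC(spots_ar_2, spots_auto)                      # smaller is better
c(AR2 = spots_ar_2$aic, auto = spots_auto$aic)   # same values via the stored components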
ts_plotter(spots_auto$residuals, lag_max = 50)
# Relatively small changes compared to AR(2).
ljung_box_tester(spots_auto$residuals, 4, max_lag = 50) < 0.05
# Similar conclusions as previously for AR(2).
# See the official model solution for comments.
ts_plotter(diff(Spots, lag = 12), lag_max = 50)
# The period length seems to be unstable.

# Let's observe the fit.
fit.sun <- fitted(spots_ar_2)
fit.auto <- fitted(spots_auto)
plot(fit.sun, type = "b", col = "blue", ylab = "Spots", xlab = "Time")
lines(Spots, col = "red", type = "b")
lines(fit.auto, col = "green", type = "b")
legend("topleft", legend = c("Spots time series", "AR(2)-fit", "ARMA(3,1)-fit"),
       col = c("red","blue","green"), lty = 1, cex = 0.8)
# The models are relatively reasonable, though their fitted values are biased upwards for certain valleys.
# The observed valleys seem to sustain a lower level and do not recover as quickly as the fits suggest.

# Let's try forecasting with the AR(2).
spots_ar_2_pred <- arima(Spots[1:172], order = c(2,0,0))
spots_pred <- ts(forecast(spots_ar_2_pred, h = 43)$mean, start = 1921)
plot(Spots, col = "red", type = "l", ylim = c(0,200))
lines(spots_pred, col = "blue", type = "l")
# The predictions a few steps ahead are relatively reasonable, but the long-term predictions converge
# to a fixed value corresponding to the intercept (the estimated mean of the process).
# This is a general limitation of the ARMA family of models.
# The problem is likely exacerbated by the lack of fit observed in the diagnostics.

ts_plotter(Sales, lag_max = 50)
# The ACF decays rather slowly (the upward trend) and peaks at the period length of 1 year.
# According to the previous homework, we could try differencing to eliminate the trend and seasonality.
# (A log-transformation can also be used.)
# Let's see what auto.arima has to say.
sales_auto <- auto.arima(Sales) # stepwise = F, approximation = F
summary(sales_auto)
# The order of the differences definitely makes sense.
ts_plotter(diff(diff(Sales), lag = 12), lag_max = 50)
# The AR component sort of makes sense due to the PACF vanishing after lag 2, but the MA part is rather strange.
# Perhaps the periodic component that remains after differencing is not that stable.

ts_plotter(sales_auto$residuals, lag_max = 50)
# There are some peaks in the correlation plots, especially around the period length and its multiples.
# Maybe some periodicity remains in the residual process?
ljung_box_tester(sales_auto$residuals, n_param = 3, max_lag = 50) < 0.05
# Clearly some of the autocorrelations are statistically significant.
# Thus the model is not satisfactory from this perspective.

fit_sales <- fitted(sales_auto)
plot(fit_sales, type = "b", col = "blue", ylab = "Sales", xlab = "Time")
lines(Sales, col = "red", type = "b")
legend("topleft", legend = c("Sales time series", "Fitted"), col = c("red","blue"), lty = c(1,1), cex = 0.8)
# The fit is relatively reasonable, though the near-exact fit at the beginning is quite interesting.
# Clearly at least some part of the model is on the right track.

prediction_sales <- forecast(sales_auto, h = 48)$mean
plot(Sales, col = "red", type = "b", ylim = c(100,340), xlim = c(1970,1987), ylab = "Sales", xlab = "Time")
lines(prediction_sales, col = "blue", type = "b")
# The predictions are quite reasonable as well.
# Thus, even though the Ljung-Box test rejected the null, this model could be useful.

# HW
# a) Consider the differencing (both standard and seasonal) needed to obtain a stationary time series.
#    Any slow decay in the autocorrelation functions indicates that the series is not likely to be stationary.
#    After obtaining a relatively stationary time series, consider the properties of SARMA models and
#    select some reasonable model.
#    Use the Ljung-Box test to assess the sufficiency of the model.
#    While you can use auto.arima, the results might not be as good as what one can obtain by analyzing
#    the correlation plots of the stationarized time series.
# b) Use forecast as in the demo exercises.
# A rough sketch of this workflow is given below.
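# The helper below is a sketch of the workflow only (the function name and the example orders are
# placeholders, not the model solution). ndiffs() and nsdiffs() from the forecast package suggest the
# regular and seasonal differencing orders; these suggestions should still be checked against the
# correlation plots of the differenced series before fixing p, q, P, Q.
box_jenkins_sketch <- function(x, order, seasonal_order, max_lag = 50) {
  s <- frequency(x)
  # Suggested differencing orders (unit root / seasonal strength tests).
  cat("Suggested differences: d =", ndiffs(x),
      ", D =", if (s > 1) nsdiffs(x) else 0, "\n")
  # Fit the candidate SARIMA model with the supplied orders.
  model <- arima(x, order = order,
                 seasonal = list(order = seasonal_order, period = s))
  # Number of estimated ARMA parameters (p + q + P + Q), used as fitdf in the Ljung-Box test.
  n_param <- sum(order[c(1,3)], seasonal_order[c(1,3)])
  print(ljung_box_tester(model$residuals, n_param, max_lag) < 0.05)
  model
}
# Example usage on the demo data (orders chosen only to show the mechanics):
# sales_candidate <- box_jenkins_sketch(Sales, order = c(0,1,1), seasonal_order = c(0,1,1))
# plot(forecast(sales_candidate, h = 24))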