setwd('~/Prediction/Week_6')

# 6.1
# a)
CONST <- read.table("Const.txt", header = T, sep = ",", row.names = 1)
const <- ts(CONST$V1, start = 1966, frequency = 12)

# Draw a run plot, the (partial) autocorrelation functions and a raw periodogram of a time series.
ts_plotter <- function(ts, lag_max) {
  par(mfrow = c(2, 2))
  plot(ts, main = 'Run Plot')
  acf(ts, main = 'Autocorrelation', lag.max = lag_max)
  pacf(ts, main = 'Partial Autocorrelation', lag.max = lag_max)
  spec <- spectrum(ts, plot = F)
  plot(spec$freq, spec$spec, main = 'Spectrum', xlab = 'Frequency', ylab = 'Energy density')
}

ts_plotter(const, 50)
# Some sort of seasonal component modulates the shape of the time series.
# The expected value also seems to evolve as a function of time, though the trend is not
# monotonic (i.e. not simply increasing or decreasing over the whole range).
# Perhaps some kind of polynomial trend would suffice for us.
# In any case, the time series does not seem to be stationary.
# We can think of the time series as being composed of three components,
# x_t = m_t + s_t + e_t,
# where m_t is the deterministic trend, s_t the deterministic seasonal component and
# e_t the stochastic component.

# b)
?stl
# The basic structure of this decomposition method:
# (1) Extract the seasonal component by taking the mean of each seasonal sub-series
#     (when s.window = "periodic").
# (2) Remove the extracted seasonal component from the series and use local polynomial regression,
#     i.e. fit low-degree polynomials in a small window of the data, to extract the trend.
#     (LOESS = "Locally Estimated Scatterplot Smoothing")
# (3) Remove the extracted trend from the seasonal component and add it to the trend component.
# (4) Repeat the above for a couple of iterations. What remains is the stochastic component.
stl_decomp <- stl(const, s.window = "per")
plot(stl_decomp)
# The bars on the right-hand side provide a reference range for evaluating the relative contribution
# of each component: the smaller a component's bar is relative to the remainder's bar, the larger
# that component's contribution relative to the noise. So in this case the seasonal component is not
# as substantial in explaining the variation of the time series as the trend component.

# c)
# The linear filter given in the exercise is the Savitzky-Golay filter
# https://en.wikipedia.org/wiki/Savitzky%E2%80%93Golay_filter
# using a window of 13 observations and a low-degree polynomial.
# The coefficients are obtained by minimizing a least squares loss function.
?filter
const_filt <- filter(const, c(1, rep(2, 11), 1) * 1/24)
stl_trend <- stl_decomp$time.series[, 'trend']

plot(const, lty = 3)
lines(stl_trend, col = "blue")
lines(const_filt, lty = 2, col = "red")
legend("topleft", legend = c("Time series", "STL", "Filter"),
       col = c(1, "blue", "red"), lty = c(3, 1, 2))
# The overall behavior of the two trend estimates is similar, but the filter trend is somewhat
# rougher than the STL trend. This is perhaps caused by the use of higher-order polynomials,
# whereas STL uses first-order (locally linear) fits to perform the smoothing.

# d)
const_diff <- diff(diff(const, lag = 12))
const_diff_stl <- stl(const_diff, s.window = 'per')
plot(const_diff_stl)
# After differencing, the contributions of the seasonal and trend components have become minor
# relative to the stochastic component. Such behavior indicates that the differenced time
# series could be stationary. This kind of descriptive analysis is probably the most reasonable
# way to use these model-free smoothing methods, since it is very difficult to infer anything
# about the precision or significance of the components.
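# As an additional check (my own addition, not part of the exercise), the claim above can be made
# slightly more concrete: compare the standard deviations of the decomposed components of the
# differenced series, and re-draw the descriptive plots. A quickly decaying ACF and a stable level
# in the run plot would further support the stationarity interpretation.
apply(const_diff_stl$time.series, 2, sd)
ts_plotter(const_diff, 50)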
# 6.2
# a)
library(forecast)
simu <- read.table("arsimulation.txt", header = TRUE)
data_6_2 <- apply(simu, 2, ts, simplify = F)
lapply(data_6_2, function(x) ts_plotter(x, 50))
# The kurtosis of the distribution decreases as the degrees of freedom increase.
# Thus large deviations are observed substantially more often when dof = 1.

# b)
models <- lapply(data_6_2, function(x) arima(x, order = c(1, 0, 0), include.mean = F, method = 'ML'))
sapply(models, coef)
# The estimated AR(1) parameters are quite close to the true parameter value in all cases.

# c)
set.seed(3141)
# Fit an AR(1) model to a randomly chosen contiguous sub-interval of at least min_len observations
# and return the estimated coefficient.
interval_boot_helper <- function(data, min_len) {
  start <- sample(1:(length(data) - min_len + 1), size = 1)
  end <- sample((start + min_len - 1):length(data), size = 1)
  coef(arima(data[start:end], order = c(1, 0, 0), include.mean = F, method = 'ML'))
}
boot_replicator <- function(n_boot, data, min_len) {
  replicate(n_boot, interval_boot_helper(data, min_len))
}

n_boot <- 5000
alpha <- 0.05
X_boot <- boot_replicator(n_boot, data_6_2$X, 10)
Y_boot <- boot_replicator(n_boot, data_6_2$Y, 10)
Z_boot <- boot_replicator(n_boot, data_6_2$Z, 10)
boot_samples <- cbind(X_boot, Y_boot, Z_boot)
apply(boot_samples, 2, function(x) quantile(x, probs = c(alpha/2, 1 - alpha/2)))
# boot_samples <- sapply(data_6_2, function(x) boot_replicator(n_boot, x, 10))

# d)
# An important observation regarding the residual processes is that the moments of the t-distribution
# with n degrees of freedom are defined only up to order n - 1. Thus process (1) has residuals with an
# undefined expected value and variance. Since the variance has to be well-defined (i.e. finite),
# process (1) is not weakly stationary.
# The remaining processes have finite variance. Since the AR(1) coefficient has absolute value
# less than one and the residuals are i.i.d. and independent of the past values of the series,
# these processes are weakly stationary.

# 6.3
library(KFAS)
# Linear State Space Model (LSSM)
# A flexible generalization of the SARIMA models (i.e. all SARIMA models can be represented as
# linear state-space models). LSSM processes do not need to be stationary.
# Basic definition:
# Hidden state x_t, observation y_t
# x_{t+1} = T x_t + R n_t,  n_t ~ N(0, Q)   (state process)
# y_t     = Z x_t + e_t,    e_t ~ N(0, H)   (observation process)
# x_1 ~ N(a_1, P_1)
# The noise terms and the initial state distribution are mutually independent.

# In the lecture slides, we are concerned with inferring properties of the hidden states x_t
# given the model parameters, mainly their expected values and variances.
# There are various ways of incorporating the observations into the inference. The two main ones are:
# Filtering distribution  P(x_t | y_t, y_{t-1}, ..., y_1)
# Smoothing distribution  P(x_t | y_T, y_{T-1}, ..., y_1), where T is the final observed time point.
# We also consider the predictive distribution P(x_t | y_{t-1}, ..., y_1).
# Which one of them is useful depends on the use case. Smoothing is probably useful
# if you are performing retrospective analysis, i.e. all the data have already been collected.
# Filtering is more useful in online settings where observations are received sequentially,
# especially if there are control inputs to the system.

# The Kalman filter is an algorithm which can be used to calculate the distributions of the hidden
# state given above. The recursive update equations show that we only need to keep track of the
# current state prediction and its variance, which is again useful for online applications.
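# To make the recursion concrete, here is a minimal sketch (my own illustration, not from the
# course material) of the filtering recursion for a scalar local-level model
# x_{t+1} = x_t + n_t, y_t = x_t + e_t, with known variances Q and H. The function name and the
# large diffuse-like initial variance are my own choices; KFS() below does all of this (and more)
# for the general model.
kalman_local_level <- function(y, Q, H, a1 = 0, P1 = 1e7) {
  n <- length(y)
  a <- numeric(n + 1)  # predicted state means E(x_t | y_1, ..., y_{t-1})
  P <- numeric(n + 1)  # predicted state variances
  a[1] <- a1
  P[1] <- P1
  for (t in 1:n) {
    v  <- y[t] - a[t]          # innovation (one-step prediction error)
    Ft <- P[t] + H             # innovation variance
    Kt <- P[t] / Ft            # Kalman gain
    a[t + 1] <- a[t] + Kt * v  # filtered mean, propagated forward (here T = 1)
    P[t + 1] <- P[t] * (1 - Kt) + Q
  }
  list(a = a, P = P)
}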
# Check the Wikipedia article, where you can find derivations and other interesting material about
# the update equations. Roughly speaking, the filter incorporates the current observation into the
# running state estimate using the optimal Kalman gain, which minimizes the mean square error
# of the state estimate.
# The estimation of the model parameters is not really discussed in this course.
# The approach is maximum likelihood, but it is complicated by the fact that the model involves
# hidden states. The traditional way of dealing with this is the Expectation-Maximization (EM)
# algorithm, which is discussed at least in the course Advanced Probabilistic Methods.

alko <- ts(read.table("alcoholdeaths.txt"), start = 1969)

# Our model:
# mu_{t+1} = mu_t + v + n_t, n_t ~ N(0, Q)
# y_t      = mu_t + e_t,     e_t ~ N(0, H)
# Construct the hidden state from mu_t and the drift v.
a1 <- c(0, 0)                      # Mean vector of the initial state
Zt <- matrix(c(1, 0), 1, 2)        # Emission matrix
Ht <- matrix(NA)                   # Observation noise variance (estimated below)
Tt <- matrix(c(1, 0, 1, 1), 2, 2)  # Transition matrix
Rt <- matrix(c(1, 0), 2, 1)        # State noise matrix
Qt <- matrix(NA)                   # State noise variance (estimated below)
P1 <- matrix(0, 2, 2)              # Non-diffuse part of the initial state covariance matrix
P1inf <- diag(2)                   # Diffuse initial state indicator matrix

?SSModel
# Specify the model
model_gaussian <- SSModel(alko ~ -1 + SSMcustom(a1 = a1, Z = Zt, T = Tt, R = Rt, Q = Qt,
                                                P1 = P1, P1inf = P1inf), H = Ht)
?fitSSM
# ML estimation of the variance parameters
fit_gaussian <- fitSSM(model_gaussian, inits = c(0, 0))
# ML estimates
fit_gaussian$model$Q # State noise variance
fit_gaussian$model$H # Observation noise variance

?KFS
# Filtering, smoothing, whatever distribution you want to know.
hidden_state_infer <- KFS(fit_gaussian$model)

plot(alko, ylab = 'Deaths per 100 000')
lines(hidden_state_infer$a[, 1], col = "red")         # One-step-ahead predictions of the state
lines(hidden_state_infer$alphahat[, 1], col = 'blue') # Smoothed states
legend('topleft', legend = c('Original', 'Prediction', 'Smoothed'),
       col = c('black', 'red', 'blue'), lty = 1)

# One-step-ahead state predictions with +-1 standard deviation error bars. Note that KFS()
# returns predictions up to time n + 1, so we keep only the first n values to match time(alko).
pred_states <- hidden_state_infer$a[1:length(alko), 1]
std_pred_states <- sqrt(hidden_state_infer$P[1, 1, 1:length(alko)])
arrows(x0 = time(alko), y0 = pred_states - std_pred_states,
       x1 = time(alko), y1 = pred_states + std_pred_states,
       code = 3, angle = 90, length = 0.1)

# For the drift, we can check how the predictions evolve.
hidden_state_infer$a[, 2]        # The last value is 0.84
hidden_state_infer$alphahat[, 2] # Equal everywhere, as expected (the smoothed drift is constant)
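# As a follow-up (my own addition, assuming the V component documented in ?KFS, which holds the
# error covariance matrices of the smoothed states), analogous uncertainty bands can be added
# for the smoothed trend on the same plot.
smooth_states <- hidden_state_infer$alphahat[, 1]
std_smooth_states <- sqrt(hidden_state_infer$V[1, 1, ])
lines(time(alko), smooth_states + std_smooth_states, col = 'blue', lty = 2)
lines(time(alko), smooth_states - std_smooth_states, col = 'blue', lty = 2)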