###########################
# MS-C1620
# Statistical inference
# Lecture 11

library(MASS)
library(KernSmooth)

##############################################################
# Kernel smoothing

# Motorcycle data
x <- mcycle$times
y <- mcycle$accel
plot(x, y)

# A linear regression model does not seem very good
linmodel <- lm(y ~ x)
linmodel
abline(linmodel)

# It is difficult to claim that any parametric model would fit well.
# Try nonparametric ones instead.

# Nadaraya-Watson = a simple weighted average of nearby points
kernelmodel5 <- ksmooth(x, y, kernel = "normal", bandwidth = 5)
lines(kernelmodel5, col = "red")

# Try different bandwidths
kernelmodel20 <- ksmooth(x, y, kernel = "normal", bandwidth = 20)
lines(kernelmodel20, col = "blue")
kernelmodel200 <- ksmooth(x, y, kernel = "normal", bandwidth = 200)
lines(kernelmodel200, col = "cyan")
kernelmodel1 <- ksmooth(x, y, kernel = "normal", bandwidth = 1)
lines(kernelmodel1, col = "magenta")
kernelmodel02 <- ksmooth(x, y, kernel = "normal", bandwidth = 0.2)
lines(kernelmodel02, col = "black")

# Use cross-validation to select the bandwidth that predicts
# unseen points with the smallest error.
# Split the data randomly (half and half) into training and test sets.
set.seed(12345)
n <- length(x)
training <- sort(sample(seq_len(n), size = round(n / 2)))

# We try several bandwidths between 1 and 15.
# This creates many plots (use the <- -> arrows in the plot window to browse).
nh <- 100
h_vals <- exp(seq(log(1), log(15), length = nh))
mse_vals <- rep(0, nh)
for (i in 1:nh) {
  h <- h_vals[i]
  ytestpred <- ksmooth(x[training], y[training], kernel = "normal",
                       bandwidth = h, x.points = x[-training])$y
  # Calculate the MSE (mean squared error) on the test set
  mse <- mean((y[-training] - ytestpred)^2)
  mse_vals[i] <- mse
  # Demonstrate what happens with this bandwidth
  plot(x[training], y[training], col = "black",
       main = paste("h =", round(h, digits = 3), "MSE =", round(mse, digits = 2)))
  lines(x[-training], ytestpred, col = "black")
  points(x[-training], y[-training], col = "blue")
  segments(x[-training], y[-training], x[-training], ytestpred, col = "red")
}

# Show how the bandwidth affects the MSE on the test set.
plot(h_vals, mse_vals)

# Select the bandwidth that minimizes the MSE
imin <- which.min(mse_vals)
h <- h_vals[imin]
h

# Demonstrate what happens with the selected bandwidth
ytestpred <- ksmooth(x[training], y[training], kernel = "normal",
                     bandwidth = h, x.points = x[-training])$y
plot(x[training], y[training], col = "black",
     main = paste("h =", round(h, digits = 3), "MSE =", round(mse_vals[imin], digits = 3)))
lines(x[-training], ytestpred, col = "black")
points(x[-training], y[-training], col = "blue")
segments(x[-training], y[-training], x[-training], ytestpred, col = "red")

# Look at the plot critically.
# We selected a "globally best" bandwidth.
# Does it seem too narrow in the rightmost portion of the data?
# Perhaps we could use an adaptive bandwidth instead
# (one possibility is sketched below).
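# A minimal sketch, not part of the original lecture code: a hand-rolled
# Nadaraya-Watson estimate with a k-nearest-neighbour adaptive bandwidth.
# At each grid point the kernel's standard deviation is set to the distance
# to the k-th nearest observation, so sparse regions are smoothed more.
# The function name nw_adaptive and the choice k = 10 are our own. Note that
# this scaling differs from ksmooth's, whose kernels have quartiles at
# +-0.25*bandwidth, so the bandwidth values are not directly comparable.
nw_adaptive <- function(x, y, xgrid, k = 10) {
  sapply(xgrid, function(x0) {
    d <- abs(x - x0)
    h <- sort(d)[k]        # local bandwidth: distance to k-th nearest point
    w <- dnorm(d / h)      # Gaussian weights with the local bandwidth
    sum(w * y) / sum(w)
  })
}
xgrid <- seq(min(x), max(x), length.out = 200)
plot(x, y)
lines(xgrid, nw_adaptive(x, y, xgrid), col = "red")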
# Another data set: Old Faithful waiting times, predicted from eruption length
F <- faithful[order(faithful$eruptions), ]
x <- F$eruptions
y <- F$waiting
n <- length(x)
plot(x, y)
model <- ksmooth(x, y, kernel = "normal", bandwidth = 0.1)
lines(model$x, model$y)
# Seems too jumpy?

# Split the data randomly (half and half) into training and test sets.
set.seed(12345)
n <- length(x)
training <- sort(sample(seq_len(n), size = round(n / 2)))

# We try several bandwidths between 0.1 and 5.
# This creates many plots (use the <- -> arrows in the plot window to browse).
nh <- 50
h_vals <- exp(seq(log(0.1), log(5), length = nh))
mse_vals <- rep(0, nh)
for (i in 1:nh) {
  h <- h_vals[i]
  ytestpred <- ksmooth(x[training], y[training], kernel = "normal",
                       bandwidth = h, x.points = x[-training])$y
  # Calculate the MSE (mean squared error) on the test set
  mse <- mean((y[-training] - ytestpred)^2)
  mse_vals[i] <- mse
  # Demonstrate what happens with this bandwidth
  plot(x[training], y[training], col = "black",
       main = paste("h =", round(h, digits = 3), "MSE =", round(mse, digits = 2)))
  lines(x[-training], ytestpred, col = "black")
  points(x[-training], y[-training], col = "blue")
  segments(x[-training], y[-training], x[-training], ytestpred, col = "red")
}

# Show how the bandwidth affects the MSE on the test set.
plot(h_vals, mse_vals)

# Select the bandwidth that minimizes the MSE
imin <- which.min(mse_vals)
h <- h_vals[imin]
h

# Demonstrate what happens with the selected bandwidth
ytestpred <- ksmooth(x[training], y[training], kernel = "normal",
                     bandwidth = h, x.points = x[-training])$y
plot(x[training], y[training], col = "black",
     main = paste("h =", round(h, digits = 3)))
lines(x[-training], ytestpred, col = "black")
points(x[-training], y[-training], col = "blue")
segments(x[-training], y[-training], x[-training], ytestpred, col = "red")

##############################################################
# Local linear regression

# Again using the motorcycle data
x <- mcycle$times
y <- mcycle$accel
plot(x, y)
loclinmodel5 <- locpoly(x, y, degree = 1, bandwidth = 5)
lines(loclinmodel5, col = "red")
# The smoothing window (bandwidth) seems too wide;
# try a smaller bandwidth
loclinmodel2 <- locpoly(x, y, degree = 1, bandwidth = 2)
lines(loclinmodel2, col = "magenta")

# Demonstrating edge effects: Nadaraya-Watson versus local linear.
# Using different data, generated from a sine curve.
x <- seq(from = 0, to = 5.5, length.out = 20)
y <- sin(x)
h <- 0.5
range <- c(-1, 7)
plot(x, y, xlim = range)

# Nadaraya-Watson corresponds to a zero-degree (constant) local polynomial fit
nwmodel <- locpoly(x, y, degree = 0, bandwidth = h, range.x = range)
lines(nwmodel, col = "blue")

# Compare to a degree-1 (linear) local polynomial fit
llmodel <- locpoly(x, y, degree = 1, bandwidth = h, range.x = range)
lines(llmodel, col = "red")

# What about a degree-2 (quadratic) local polynomial fit?
lqmodel <- locpoly(x, y, degree = 2, bandwidth = h, range.x = range)
lines(lqmodel, col = "green")

# A higher-degree polynomial traces the bends better. But how does it handle noise?
yerr <- y + rnorm(length(x), 0, 0.2)
plot(x, yerr, xlim = range)

# Nadaraya-Watson
nwmodel <- locpoly(x, yerr, degree = 0, bandwidth = h, range.x = range)
lines(nwmodel, col = "blue")

# Degree 1
llmodel <- locpoly(x, yerr, degree = 1, bandwidth = h, range.x = range)
lines(llmodel, col = "red")

# Degree 2
lqmodel <- locpoly(x, yerr, degree = 2, bandwidth = h, range.x = range)
lines(lqmodel, col = "green")

##############################################################
# Kernel density estimation

# KDE example 1: a very small sample from the uniform distribution on [10, 20]
n <- 5
range <- c(10, 20)
x <- runif(n, 10, 20)
h <- 0.3
densitymodel <- bkde(x, bandwidth = h, range.x = range)
plot(x, rep(0, n), xlim = range, ylim = c(0, max(densitymodel$y)))
lines(densitymodel)
# Try increasing the bandwidth to e.g. 1.
# Try increasing n to e.g. 20 or 300 (see the sketch below).
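# An added sketch, not part of the original script: overlay the true density,
# which is 1/10 on [10, 20], to make the boundary bias of the KDE visible
# (the estimate leaks probability mass outside the support near 10 and 20).
segments(10, 1/10, 20, 1/10, col = "red", lty = 2)
# With a larger sample the interior flattens out, but the boundary bias remains:
x <- runif(300, 10, 20)
densitymodel <- bkde(x, bandwidth = 1, range.x = range)
plot(x, rep(0, 300), xlim = range, ylim = c(0, max(densitymodel$y, 1/10)))
lines(densitymodel)
segments(10, 1/10, 20, 1/10, col = "red", lty = 2)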
# KDE example 2: a sample from an exponential distribution
n <- 200
x <- rexp(n, rate = 5)
h <- 0.05
densitymodel <- bkde(x, bandwidth = h)
plot(x, rep(0, n), ylim = c(0, max(densitymodel$y)))
lines(densitymodel)

# KDE example 3: Old Faithful eruption lengths (a bimodal distribution)
x <- faithful$eruptions
n <- length(x)
h <- 0.1
densitymodel <- bkde(x, bandwidth = h)
plot(x, rep(0, n), ylim = c(0, max(densitymodel$y)))
lines(densitymodel)
# Try changing the bandwidth, e.g. the values 0.01, 0.1, 1, 3.

# Can we use CV here to choose the bandwidth?
# Yes (example coming here later...)
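# In the meantime, a minimal sketch of one standard approach (ours, not the
# example referred to above): leave-one-out likelihood cross-validation.
# For each candidate h, estimate the density at each point x[i] from all the
# other points with a Gaussian kernel of standard deviation h, and choose the
# h that maximizes the mean log-density of the left-out points.
loo_loglik <- function(x, h) {
  n <- length(x)
  ll <- sapply(seq_len(n), function(i) {
    # leave-one-out KDE at x[i]: mean kernel weight over the other points
    log(mean(dnorm((x[i] - x[-i]) / h)) / h)
  })
  mean(ll)
}
h_cand <- exp(seq(log(0.01), log(1), length = 30))
cv_vals <- sapply(h_cand, function(h) loo_loglik(x, h))
plot(h_cand, cv_vals, log = "x")
h_cv <- h_cand[which.max(cv_vals)]
h_cv
# bkde's normal kernel is scaled so that bandwidth is its standard deviation,
# matching the dnorm scaling used above.
densitymodel <- bkde(x, bandwidth = h_cv)
plot(x, rep(0, n), ylim = c(0, max(densitymodel$y)))
lines(densitymodel)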