# Graphics preset:
pcol <- c("#FFFFFFD9", '#1f2828', '#e85836', '#6c8d8d', '#ccb100')
op.web <- par(family = 'serif', bg = '#ffcd00',
cex.axis = 1.25, cex.lab = 1.5, cex.main = 1.5,
col.axis = pcol[2], col.lab = pcol[2], col.main = pcol[2], col.sub = pcol[2],
font.lab = 3, font.axis = 3, fg = pcol[2], tck = 0.015)
####
# 1:
####
# a)
rivers
hist(rivers, density = 10, xlab = 'Length (miles)', ylim = c(0,100))
boxplot(rivers)
# Pro tip: axis tick labels horizontally
hist(rivers, density = 10, xlab = 'Length (miles)', ylim = c(0,100),
yaxt = 'n')
axis(2, las = 2)
# b)
# This can be explained, or a exponential distribution PDF can be shown, or
# Show it on R:
# Extra demonstration (code does not need to be explained deeply):
# Drawing some exponential distribution probability density function
# Histogram with probability densities (freq = F):
hist(rivers, density = 10, xlab = 'Length (miles)', ylab = '',
yaxt = 'n', freq = F);axis(2, las = 2)
# Exponential distribution PDF with rate parameter lambda = 0.0015
lines(seq(0,4000,length.out = 1000),
dexp(seq(0,4000,length.out = 1000), rate = 0.0015), lwd = 2, col = pcol[1])
# c)
a <- cut(rivers,
breaks = c(min(rivers),250,500,750,1000,1250,max(rivers)),
include.lowest = TRUE, right = TRUE, dig.lab = 4)
a
# Words of warning: include.lowest is set to FALSE by default, therefore
# it would 'drop' the minimum value when the breaks are defined like this.
# Moreover, dig.lab can be use to adjust the number of digits if
# some lables are given in scientific notation, for example
# d)
barplot(table(a), xlab = 'Hello world', names.arg = c('min','mid','med','semilarge','large','extra large'))
# We can leave the figure like that, or with some inishing touches.
# Y-axis tick labels, x-axis label size, and bar colors:
barplot(table(a), yaxt = 'n',
cex.names = 0.75, col = pcol[2]);axis(2, las = 2)
title(xlab = 'River length (miles)', ylab = 'Frequency')
# OR
pie(table(a))
# With sprinkles on top
pie(table(a), col = colorRampPalette(c(pcol[2],pcol[1]))(6))
# colorRampPalette(x)(y) can be used to generate color gradients
# where x is a vector of color names or hexcode
# and y is an integer number on how many colors should be outputted
colorRampPalette(c('red','blue'))(10)
# e)
# - Histogram begins from 0 which is below the minimum, i.e., empty class
# - Barplot and piechart classes begins from the actual miminum value thus is more 'defined'
# - Barplot is here, perhaps, the most informative: 1. does not show empty classes,
# 2. the distribution (theoretical) can still be guessed (in piechart not too well)
# 3. displays the 'mode' well.
####
# 2:
####
# Find out the source of the data:
?islands
# Look at the data:
islands
# a)
hist(islands, density = 10,
xlab = 'Area (1000 x miles^2)',
ylim = c(0,50), xlim = c(0,20000),
yaxt = 'n');axis(2, las = 2)
# certainly there are more small islands than large islands.
# Moreover, some empty classes / intervals can be seen; try different breaks:
hist(islands, density = 10,
xlab = 'Area (1000 x miles^2)',
ylim = c(0,50), xlim = c(0,20000),
yaxt = 'n', breaks = 5);axis(2, las = 2)
# Nevertheless, exponential distribution, most likely.
# Since we have so few data points (observations in terms of statistics)
# this can be visualized point-by-point:
plot(islands)
# We can do better that this!
dotchart(sort(islands), pch = 16, lcol = 1)
# Note that, dotchart() function can be frustrating sometimes with long label names.
# b)
# From the dotchart, we can immediately see that the seven largest are the continents
# whereas the rest are, well, large islands.
# If we would have more information on, for example, the latitudes / longnitudes
# we could further analyze how the island landmass is distributed by the lat/long.
# c)
mean(islands)
median(islands)
mean(islands) / median(islands)
# Hold up, 30x difference! Shouldn't these measures be more or less the same or at least much closer?
# Well, since our data is NOT from a symmetric distribution, therefore
# Mean and median are NOT the same, and the more the data is 'skewed' the more
# they deviate, typically.
sd(islands)
mad(islands)
# Visualize:
dotchart(sort(islands), pch = 16, lcol = 1)
abline(v = c(mean(islands), median(islands)))
# Essentially, mean is NOT a robust location estimator whereas median is.
# This means, that it takes only one observation taken into the infinity
# and the mean explodes, whereas median is not.
# d)
# Perhaps the justification we can come up with is to remove the continents
# from the data. How to remove data points in practice in R? There are many ways
# and here is one:
# - To find the continents, sort from largest to smallest
sort(islands, decreasing = T)
# - Remove the continents:
b <- sort(islands, decreasing = T)[-c(1:7)]
# Make the calculations:
mean(b)
median(b)
mean(b) / median(b)
# Now these estimates are at least in the same ballpark!
# And visualize:
dotchart(sort(b), pch = 16, lcol = 1)
abline(v = c(mean(b), median(b)))
# Note that, by delting 'outliers' by visual 'evidence' can quite easily
# lead into you delting all the points, except those you like!
# Therefore, you should have an excellent justification to remove
# outliers! Typically, the justifications should stem from the context
# not from the numbers!
# There are of course numerical ways to determine outliers, one is Tukey's
# boxplot:
boxplot(islands)
# The outliers (points outside the box) are defined:
# - top outlier > 3rd quartile + 1.5 * IQR (points above the top whisker)
# - bot outlier < 1st quartile - 1.5 * IQR (not in this data)
# - Interquartile range (IQR) is Q3 - Q1
# See
IQR(islands)
quantile(islands)
# Q3 - Q1 = IQR:
diff(quantile(islands)[c(2,4)])
# To use this to remove the ourliers for the islands data:
b2 <- islands[islands <= quantile(islands)[4] + 1.5 * IQR(islands)]
# And the stats and vizs:
mean(b2)
median(b2)
mean(b2) / median(b2)
dotchart(sort(b2), pch = 16, lcol = pcol[2])
abline(v = c(mean(b2), median(b2)))
####
# 3:
####
# Look at the data source:
?Nile
# Aah, time-series!
Nile
# a)
# Histogram:
hist(Nile, density = 10,
xlab = 'Water Flow volume (10^8 m^3)', ylim = c(0,30),
yaxt = 'n');axis(2, las = 2)
# Looks like the century water flow of the river Nile is normally distributed
# Line plot
plot(Nile)
# Some adjustments:
plot(Nile, type = 'b',
yaxt = 'n', ylab = 'Water Flow volume (10^8 m^3)',
pch = 16);axis(2, las = 2)
# b)
# - Perhaps a more dry season in the middle of the follow-up time.
# - Perhaps there is seasonal u-shaped variation spanning ~ 100 years
# - There is also a clear change-point in ~ 1898
# c)
# We could achieve this by calculating these values one-by-one, BUT
# there are easier ways to do this:
summary(Nile)
# The base package summary() function is quite useful
# and can understand (at least tries to) multiple
# input types, i.e., where the object is from
# (linear regression model, time-series, etc.)
# A package called 'psych' has a function called
# 'describe' which would be the most simple to use here
#install.packages('psych')
library(psych)
describe(Nile)
var(Nile)
sort(table(Nile), decreasing = T)
# Kurtosis can be defined in (at least) three ways and
# different packages may implement different methods.
# This has caused some mix-ups and frustration in the previous years so let's
# look into this.
#install.packages('e1071')
#install.packages('moments')
library(e1071)
library(moments)
moments::kurtosis(Nile) # This is kurtosis
e1071::kurtosis(Nile) # This is so called excess kurtosis
# Excess kurtosis is defined as kurtosis - 3
moments::kurtosis(Nile) - 3
e1071::kurtosis(Nile)
# w00000t but it isn't!!
# There are also some minor differences in the way the kurtosis
# is calculated. In psych package kurtosi() function
# you can look and define different ways to calculate kurtosis:
kurtosi(Nile, type = 1) # How moments package implements:
moments::kurtosis(Nile) - 3
kurtosi(Nile, type = 2) # How someone can implement
kurtosi(Nile, type = 3) # how package e1071 implements
e1071::kurtosis(Nile)
# Perhaps the way how e1071 defines the excess kurtosis is the most used.
# Finally, mode, i.e., the most frequent single (or multiple if ties) observation:
table(Nile)
# More conveniently
sort(table(Nile), decreasing = T)
# The data has four modes (the first four observations in the ranked observations)
# d)
# Discuss and visualize:
plot(Nile, type = 'b', yaxt = 'n',
ylab = '', pch = 16);axis(2, las = 2)
# Mean = the average level around which the data fluctuates
# SD & Var = the average size of these fluctuations
# Min = the lowest point of the curve
# Max = the lowest point of the curve
# Median = the average level around which the data fluctuates (robust)
# MAD = the average size of these fluctuations (robust)
# (Mode = difficult to see...)
# (Skewness = difficult to see...)
# (Kurtosis = difficult to see...)
# Visualize the stats:
mtext(c('Mean','Median','Sd-','Sd+','Max','Min','MAD-','MAD+'),
at = c(mean(Nile), median(Nile), mean(Nile) - sd(Nile),
mean(Nile) + sd(Nile), max(Nile), min(Nile),
median(Nile) - mad(Nile), median(Nile) + mad(Nile)),
side = 2, las = 2)
abline(h = c(mean(Nile), median(Nile), mean(Nile) - sd(Nile),
mean(Nile) + sd(Nile), max(Nile), min(Nile),
median(Nile) - mad(Nile), median(Nile) + mad(Nile)), lty = 3)