- Histograms
- Glivenko-Cantelli theorem
- Error for density estimates
- Kernel density estimates
- Bivariate density estimates
data(faithful); x0 <- 0; x1 <- 8; h <- .5
my.breaks<-seq(from=x0, to=x1, by=h)
myhist<-hist(faithful$eruptions, breaks=my.breaks, right=F, plot=F)
plot(myhist$mids, myhist$density, type="s", xlab="Eruption length",
ylab="Frequency", main="Histogram of Eruption Lengths")
Bin width primarily controls the amount of smoothing, lots of guidance available
my.w<-function(x){
if (abs(x) < 1)
w <- 1/2
else
w <- 0
w
}
x <- seq(0,6,0.2)
m <- length(x)
n <- length(faithful$eruptions)
h <- .5
fhat <- rep(0,m)
for (i in 1:m){
S <- 0
for (j in 1:n){
S <- S+(1/h)*my.w((faithful$eruptions[j]-x[i])/h)
}
fhat[i] <- (1/n)*S
}
plot(x,fhat, type="s", xlab="Eruption length", ylab="Density Estimate",
main="Naive Density Estimator")
Most popular choices for the kernel \(K\) are
| Family | Kernel |
|---|---|
| Gaussian | \(K(t)=\frac{1}{\sqrt{2\pi}}e^{-t^2/2}\) |
| Rectangular | \(K(t) = 1/2\) for $ |
| Triangular | $K(t) = 1- |
| Epanechnikov | \(K(t)=\frac{3}{4}(1-(1/5) t^2)\) for $ |
my.w<-function(x, type="gaussian"){
if(type=="gaussian"){
w <- dnorm(x)
return(w)
}
if(type=="naive"){
if (abs(x) < 1)
w <- 1/2
else
w <- 0
return(w)
}
print("You have asked for an undefined kernel.")
return(NULL)
}
x <- seq(0,6,0.02)
m <- length(x)
n <- length(faithful$eruptions)
h <- .1
fhat <- rep(0,m)
for (i in 1:m){
S <- 0
for (j in 1:n){
S <- S+(1/h)*my.w((faithful$eruptions[j]-x[i])/h)
}
fhat[i] <- (1/n)*S
}
plot(x,fhat, type="l", xlab="Eruption length", ylab="Density Estimate",
main="Naive Density Estimator")
density in R, this is basically what it doesrnorm(1,x[i],h), or rnorm(q,sample(x,q,replace=TRUE),h) for \(q\) drawsdensity() function is the most commondensity(x, ...)
## Default S3 method:
density(x, bw = "nrd0", adjust = 1,
kernel = c("gaussian", "epanechnikov", "rectangular",
"triangular", "biweight",
"cosine", "optcosine"),
weights = NULL, window = kernel, width,
give.Rkern = FALSE,
n = 512, from, to, cut = 3, na.rm = FALSE, ...)
ASH and KernSmooth are both fast, accurate, and well-maintained (Deng and Wickham, 2011)bin2d function in R will bin a bivariate data setbin2d <- function(x, breaks1 = "Sturges", breaks2 = "Sturges"){
histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE)
histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE)
brx <- histg1$breaks
bry <- histg2$breaks
freq <- table(cut(x[,1], brx), cut(x[,2], bry))
return(list(call = match.call(), freq = freq,
breaks1 = brx, breaks2 = bry,
mids1 = histg1$mids, mids2 = histg2$mids))
}
persp function plots the density histogramdata(iris) fit1=bin2d(iris[1:50, 1:2]) persp(x=fit1$mids1, y=fit1$mids2, z=fit1$freq, shade=T, theta=45, phi=30, ltheta=60)
library(MASS)
n <- 2000
p <- c(.2, .3, .5)
mu <- matrix(c(0, 1, 4, 0, 3, -1), 3, 2)
Sigma <- diag(2)
i <- sample(1:3, replace = TRUE, prob = p, size = n)
k <- table(i)
x1 <- mvrnorm(k[1], mu = mu[1,], Sigma)
x2 <- mvrnorm(k[2], mu = mu[2,], Sigma)
x3 <- mvrnorm(k[3], mu = mu[3,], Sigma)
X <- rbind(x1, x2, x3) #the mixture data
x <- X[,1]
y <- X[,2]
fhat <- kde2d(x, y, h=c(1.87, 1.84))
par(mfrow = c(1, 2)) contour(fhat) persp(fhat, phi = 30, theta = 20, d = 5, xlab = "x")
par(mfrow = c(1, 1))