- Histograms
- Glivenko-Cantelli theorem
- Error for density estimates
- Kernel density estimates
- Bivariate density estimates
data(faithful); x0 <- 0; x1 <- 8; h <- .5 my.breaks<-seq(from=x0, to=x1, by=h) myhist<-hist(faithful$eruptions, breaks=my.breaks, right=F, plot=F) plot(myhist$mids, myhist$density, type="s", xlab="Eruption length", ylab="Frequency", main="Histogram of Eruption Lengths")
Bin width primarily controls the amount of smoothing, lots of guidance available
my.w<-function(x){ if (abs(x) < 1) w <- 1/2 else w <- 0 w } x <- seq(0,6,0.2) m <- length(x) n <- length(faithful$eruptions) h <- .5 fhat <- rep(0,m) for (i in 1:m){ S <- 0 for (j in 1:n){ S <- S+(1/h)*my.w((faithful$eruptions[j]-x[i])/h) } fhat[i] <- (1/n)*S }
plot(x,fhat, type="s", xlab="Eruption length", ylab="Density Estimate", main="Naive Density Estimator")
Most popular choices for the kernel \(K\) are
Family | Kernel |
---|---|
Gaussian | \(K(t)=\frac{1}{\sqrt{2\pi}}e^{-t^2/2}\) |
Rectangular | \(K(t) = 1/2\) for $ |
Triangular | $K(t) = 1- |
Epanechnikov | \(K(t)=\frac{3}{4}(1-(1/5) t^2)\) for $ |
my.w<-function(x, type="gaussian"){ if(type=="gaussian"){ w <- dnorm(x) return(w) } if(type=="naive"){ if (abs(x) < 1) w <- 1/2 else w <- 0 return(w) } print("You have asked for an undefined kernel.") return(NULL) }
x <- seq(0,6,0.02) m <- length(x) n <- length(faithful$eruptions) h <- .1 fhat <- rep(0,m) for (i in 1:m){ S <- 0 for (j in 1:n){ S <- S+(1/h)*my.w((faithful$eruptions[j]-x[i])/h) } fhat[i] <- (1/n)*S }
plot(x,fhat, type="l", xlab="Eruption length", ylab="Density Estimate", main="Naive Density Estimator")
density
in R, this is basically what it doesrnorm(1,x[i],h)
, or rnorm(q,sample(x,q,replace=TRUE),h)
for \(q\) drawsdensity()
function is the most commondensity(x, ...) ## Default S3 method: density(x, bw = "nrd0", adjust = 1, kernel = c("gaussian", "epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"), weights = NULL, window = kernel, width, give.Rkern = FALSE, n = 512, from, to, cut = 3, na.rm = FALSE, ...)
ASH
and KernSmooth
are both fast, accurate, and well-maintained (Deng and Wickham, 2011)bin2d
function in R will bin a bivariate data setbin2d <- function(x, breaks1 = "Sturges", breaks2 = "Sturges"){ histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE) histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE) brx <- histg1$breaks bry <- histg2$breaks freq <- table(cut(x[,1], brx), cut(x[,2], bry)) return(list(call = match.call(), freq = freq, breaks1 = brx, breaks2 = bry, mids1 = histg1$mids, mids2 = histg2$mids)) }
persp
function plots the density histogramdata(iris) fit1=bin2d(iris[1:50, 1:2]) persp(x=fit1$mids1, y=fit1$mids2, z=fit1$freq, shade=T, theta=45, phi=30, ltheta=60)
library(MASS) n <- 2000 p <- c(.2, .3, .5) mu <- matrix(c(0, 1, 4, 0, 3, -1), 3, 2) Sigma <- diag(2) i <- sample(1:3, replace = TRUE, prob = p, size = n) k <- table(i) x1 <- mvrnorm(k[1], mu = mu[1,], Sigma) x2 <- mvrnorm(k[2], mu = mu[2,], Sigma) x3 <- mvrnorm(k[3], mu = mu[3,], Sigma) X <- rbind(x1, x2, x3) #the mixture data x <- X[,1] y <- X[,2] fhat <- kde2d(x, y, h=c(1.87, 1.84))
par(mfrow = c(1, 2)) contour(fhat) persp(fhat, phi = 30, theta = 20, d = 5, xlab = "x")
par(mfrow = c(1, 1))