- Random number generation
- Distributions in R
- Distributional Models
- Moments, generalized moments, likelihood
- Visual comparisons and basic testing
How does R get “random” numbers?
runif()
runif(1:10)
## [1] 0.9473717 0.7757503 0.7620696 0.6479452 0.6269060 0.4509551 0.9870357 ## [8] 0.1537041 0.2177579 0.9817671
set.seed(10) runif(1:10)
## [1] 0.50747820 0.30676851 0.42690767 0.69310208 0.08513597 0.22543662 ## [7] 0.27453052 0.27230507 0.61582931 0.42967153
set.seed(10) runif(1:10)
## [1] 0.50747820 0.30676851 0.42690767 0.69310208 0.08513597 0.22543662 ## [7] 0.27453052 0.27230507 0.61582931 0.42967153
seed <- 10 new.random <- function (a=5, c=12, m=16) { out <- (a*seed + c) %% m seed <<- out return(out) } out.length <- 20 variates <- rep (NA, out.length) for (kk in 1:out.length) variates[kk] <- new.random() variates
## [1] 14 2 6 10 14 2 6 10 14 2 6 10 14 2 6 10 14 2 6 10
Unfortunately, this sequence has period 8
variates <- rep (NA, out.length) for (kk in 1:out.length) variates[kk] <- new.random(a=131, c=7, m=16) variates
## [1] 5 6 9 2 13 14 1 10 5 6 9 2 13 14 1 10 5 6 9 2
variates <- rep (NA, out.length) for (kk in 1:out.length) variates[kk] <- new.random(a=129, c=7, m=16) variates
## [1] 9 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9 0 7 14
variates <- rep (NA, out.length) for (kk in 1:out.length) variates[kk] <- new.random(a=1664545, c=1013904223, m=2^32) variates
## [1] 1037207853 2090831916 4106096907 768378826 3835752553 1329121000 ## [7] 2125006663 2668506502 3581687205 2079234980 2067291011 2197025090 ## [13] 3748878561 2913996384 758844863 4029469438 2836748829 1458315036 ## [19] 2399149563 2766656186
Every distribution that R handles has four functions. There is a root name, for example, the root name for the normal distribution is norm
. This root is prefixed by one of the letters
p
for "probability", the cumulative distribution function (c. d. f.)q
for "quantile", the inverse c. d. f.d
for "density", the density function (p. f. or p. d. f.)r
for "random", a random variable having the specified distributionBeta, Binomial, Cauchy, Chi-Square, Exponential, F, Gamma, Geometric, Hypergeometric, Logistic, Log Normal, Negative Binomial, Normal, Poisson, Student t, Studentized Range, Uniform, Weibull, Wilcoxon Rank Sum Statistic, Wilcoxon Signed Rank Statistic
beta
, binom
, cauchy
, chisq
, exp
, f
, gamma
, geom
, hyper
, logis
, lnorm
, nbinom
, norm
, pois
, t
, tukey
, unif
, weibull
, wilcox
, signrank
Exponential distribution has pexp
, qexp
, dexp
, and rexp
this.range <- seq(0, 8, .05) plot (this.range, dexp(this.range), ty="l", main="Exponential Distributions", xlab="x", ylab="f(x)") lines (this.range, dexp(this.range, rate=0.5), col="red") lines (this.range, dexp(this.range, rate=0.2), col="blue")
this.range <- seq(0, 8, .05) plot (this.range, pexp(this.range), ty="l", main="Exponential Distributions", xlab="x", ylab="P(X<x)") lines (this.range, pexp(this.range, rate=0.5), col="red") lines (this.range, pexp(this.range, rate=0.2), col="blue")
gamma.est_MM <- function(x) { m <- mean(x); v <- var(x) return(c(shape=m^2/v, scale=v/m)) }
gamma.mean <- function(shape,scale) { return(shape*scale) } gamma.var <- function(shape,scale) { return(shape*scale^2) } gamma.discrepancy <- function(shape,scale,x) { return((mean(x)-gamma.mean(shape,scale))^2 + (var(x)-gamma.mean(shape,scale))^2) }
loglike.foo <- function(params, x) { sum(dfoo(x=x,params,log=TRUE)) }
fitdistr
in MASS
packagelibrary(MASS) data("cats",package="MASS") hist(cats$Hwt, xlab="Heart Weight", main="Histogram of Cat Heart Weights")
fitdistr(cats$Hwt, densfun="gamma")
## shape rate ## 20.2998092 1.9095724 ## ( 2.3729250) ( 0.2259942)
cats.gamma <- gamma.est_MM(cats$Hwt) qqplot(cats$Hwt, qgamma(ppoints(500), shape=cats.gamma["shape"], scale=cats.gamma["scale"]), xlab="Observed", ylab="Theoretical", main="QQ Plot")
plot(density(cats$Hwt)) # more on this later curve(dgamma(x,shape=cats.gamma["shape"],scale=cats.gamma["scale"]),add=TRUE,col="blue")
plot(ecdf(pgamma(cats$Hwt, shape=cats.gamma["shape"], scale=cats.gamma["scale"])), main="Calibration Plot for Cat Heart Weights", verticals = T, pch = "") abline(0,1,col="grey")
Exercise: Write a general function making a calibration plot that inputs a data vector, cumulative probability function, and parameter vector
ks.test(cats$Hwt, pgamma, shape=cats.gamma["shape"], scale=cats.gamma["scale"])
## Warning in ks.test(cats$Hwt, pgamma, shape = cats.gamma["shape"], scale = ## cats.gamma["scale"]): ties should not be present for the Kolmogorov-Smirnov ## test
## ## One-sample Kolmogorov-Smirnov test ## ## data: cats$Hwt ## D = 0.068637, p-value = 0.5062 ## alternative hypothesis: two-sided
x <- rnorm(100, mean=0, sd=1) y <- rnorm(100, mean=.5, sd=1) ks.test(x, y)
## ## Two-sample Kolmogorov-Smirnov test ## ## data: x and y ## D = 0.28, p-value = 0.0007873 ## alternative hypothesis: two-sided
M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477))) dimnames(M) <- list(gender = c("F", "M"), party = c("Democrat","Independent", "Republican")) M
## party ## gender Democrat Independent Republican ## F 762 327 468 ## M 484 239 477
Xsq <- chisq.test(M) Xsq
## ## Pearson's Chi-squared test ## ## data: M ## X-squared = 30.07, df = 2, p-value = 2.954e-07
chisq.test()
assumes that all the probabilities in p
were fixed, not estimated from the data used for testing, so df = number of cells in the table −1q
parameters, we need to subtract q
degrees of freedomx
in chisq.test()
p
foo
to calculate the theoretical probability of each bin; this is p
chisq.test
hist()
gives us break points and counts:
cats.hist <- hist(cats$Hwt,plot=FALSE) cats.hist$breaks
## [1] 6 8 10 12 14 16 18 20 22
cats.hist$counts
## [1] 20 45 42 23 11 2 0 1
Use these for a \(\chi^2\) test
# Why the padding by -Inf and Inf? p <- diff(pgamma(c(-Inf,cats.hist$breaks,Inf),shape=cats.gamma["shape"], scale=cats.gamma["scale"])) # Why the padding by 0 and 0? x2 <- chisq.test(c(0,cats.hist$counts,0),p=p)$statistic
## Warning in chisq.test(c(0, cats.hist$counts, 0), p = p): Chi-squared ## approximation may be incorrect
# Why +2? Why -length(cats.gamma)? pchisq(x2,df=length(cats.hist$counts)+2 - length(cats.gamma))
## X-squared ## 0.854616
Don’t need to run hist
first; can also use cut
to discretize
There are better ways to do this
chisq.test()
ks.test
, "bootstrap" testing, "smooth tests of goodness of fit", …