\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflics with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Density Estimation}
%%\VignetteDepends{flexmix,KernSmooth,boot}
\setcounter{chapter}{7}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
%% lower png resolution for vignettes
\SweaveOpts{resolution = 100}
<>=
x <- library("KernSmooth")
x <- library("flexmix")
x <- library("boot")
@
\chapter[Density Estimation]{Density Estimation: Erupting Geysers and Star
Clusters \label{DE}}
\section{Introduction}
\section{Density Estimation}
The three kernel functions are implemented in \R{} as shown in lines 1--3 of
Figure~\ref{DE-kernel-fig}. For some grid \Robject{x}, the kernel functions
are plotted using the \R{} statements in lines 5--11 (Figure~\ref{DE-kernel-fig}).
\numberSinput
\begin{figure}
\begin{center}
<>=
rec <- function(x) (abs(x) < 1) * 0.5
tri <- function(x) (abs(x) < 1) * (1 - abs(x))
gauss <- function(x) 1/sqrt(2*pi) * exp(-(x^2)/2)
x <- seq(from = -3, to = 3, by = 0.001)
plot(x, rec(x), type = "l", ylim = c(0,1), lty = 1,
ylab = expression(K(x)))
lines(x, tri(x), lty = 2)
lines(x, gauss(x), lty = 3)
legend(-3, 0.8, legend = c("Rectangular", "Triangular",
"Gaussian"), lty = 1:3, title = "kernel functions",
bty = "n")
@
\caption{Three commonly used kernel functions. \label{DE-kernel-fig}}
\end{center}
\end{figure}
\rawSinput
<>=
w <- options("width")$w
options(width = 66)
@
The kernel estimator $\hat{f}$ is a sum of `bumps' placed at the observations. %'
The kernel function determines the shape of the bumps while the
window width $h$ determines their width.
\index{Windows, in kernel density estimation}
Figure~\ref{DE-bumps} \citep[redrawn from a similar plot in][]{HSAUR:Silverman1986}
shows the individual bumps $n^{-1}h^{-1} K((x - x_i) / h)$, as well as the estimate $\hat{f}$
obtained by adding them up for an artificial set of data points
<>=
x <- c(0, 1, 1.1, 1.5, 1.9, 2.8, 2.9, 3.5)
n <- length(x)
@
For a grid
<>=
xgrid <- seq(from = min(x) - 1, to = max(x) + 1, by = 0.01)
@
on the real line, we can compute the contribution of each measurement in
\Robject{x}, with $h = 0.4$, by the Gaussian kernel (defined in
Figure~\ref{DE-kernel-fig}, line 3) as follows;
<>=
h <- 0.4
bumps <- sapply(x, function(a) gauss((xgrid - a)/h)/(n * h))
@
A plot of the individual bumps and their sum, the kernel density estimate
$\hat{f}$, is shown in Figure~\ref{DE-bumps}.
<>=
options(width = w)
@
\numberSinput
\begin{figure}
\begin{center}
<>=
plot(xgrid, rowSums(bumps), ylab = expression(hat(f)(x)),
type = "l", xlab = "x", lwd = 2)
rug(x, lwd = 2)
out <- apply(bumps, 2, function(b) lines(xgrid, b))
@
\caption{Kernel estimate showing the contributions of Gaussian
kernels evaluated for the individual observations with bandwidth $h =
0.4$. \label{DE-bumps}}
\end{center}
\end{figure}
\rawSinput
\begin{figure}
\begin{center}
<>=
epa <- function(x, y)
((x^2 + y^2) < 1) * 2/pi * (1 - x^2 - y^2)
x <- seq(from = -1.1, to = 1.1, by = 0.05)
epavals <- sapply(x, function(a) epa(a, x))
persp(x = x, y = x, z = epavals, xlab = "x", ylab = "y",
zlab = expression(K(x, y)), theta = -35, axes = TRUE,
box = TRUE)
@
\caption{Epanechnikov kernel for a grid between $(-1.1, -1.1)$ and $(1.1, 1.1)$.
\label{DE-epakernel-fig}}
\end{center}
\end{figure}
\section{Analysis Using \R{}}
\numberSinput
\begin{figure}
\begin{center}
<>=
data("faithful", package = "datasets")
x <- faithful$waiting
layout(matrix(1:3, ncol = 3))
hist(x, xlab = "Waiting times (in min.)", ylab = "Frequency",
probability = TRUE, main = "Gaussian kernel",
border = "gray")
lines(density(x, width = 12), lwd = 2)
rug(x)
hist(x, xlab = "Waiting times (in min.)", ylab = "Frequency",
probability = TRUE, main = "Rectangular kernel",
border = "gray")
lines(density(x, width = 12, window = "rectangular"), lwd = 2)
rug(x)
hist(x, xlab = "Waiting times (in min.)", ylab = "Frequency",
probability = TRUE, main = "Triangular kernel",
border = "gray")
lines(density(x, width = 12, window = "triangular"), lwd = 2)
rug(x)
@
\caption{Density estimates of the geyser eruption data imposed on a histogram
of the data. \label{DE:faithfuldens}}
\end{center}
\end{figure}
\rawSinput
\begin{figure}
\begin{center}
<>=
library("KernSmooth")
data("CYGOB1", package = "HSAUR3")
CYGOB1d <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1, dpik))
contour(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat,
xlab = "log surface temperature",
ylab = "log light intensity")
@
\caption{A contour plot of the bivariate density estimate of the \Robject{CYGOB1} data,
i.e., a two-dimensional graphical display for a three-dimensional problem.
\label{DE:CYGOB12Dcontour}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
persp(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat,
xlab = "log surface temperature",
ylab = "log light intensity",
zlab = "estimated density",
theta = -35, axes = TRUE, box = TRUE)
@
\caption{The bivariate density estimate of the \Robject{CYGOB1} data, here shown in a
three-dimensional fashion using the \Rcmd{persp} function.
\label{DE:CYGOB12Dpersp}}
\end{center}
\end{figure}
\subsection{A Parametric Density Estimate for the Old Faithful Data
\label{DE-waiting}}
<>=
logL <- function(param, x) {
d1 <- dnorm(x, mean = param[2], sd = param[3])
d2 <- dnorm(x, mean = param[4], sd = param[5])
-sum(log(param[1] * d1 + (1 - param[1]) * d2))
}
startparam <- c(p = 0.5, mu1 = 50, sd1 = 3, mu2 = 80, sd2 = 3)
opp <- optim(startparam, logL, x = faithful$waiting,
method = "L-BFGS-B",
lower = c(0.01, rep(1, 4)),
upper = c(0.99, rep(200, 4)))
@
\newpage
<>=
opp
@
<>=
print(opp[names(opp) != "message"])
@
Of course, optimizing the appropriate likelihood `by hand' %'
is not very convenient. In fact, (at least) two packages offer high-level
functionality for estimating mixture models. The first one is package
\Rpackage{mclust} \citep{PKG:mclust} implementing the methodology described
in \cite{HSAUR:FraleyRaftery2002}. Here, a Bayesian information criterion
(BIC) is applied to choose the form of the mixture model:
\index{Bayesian Information Criterion (BIC)}
<>=
library("mclust")
@
<>=
library("mclust")
mc <- Mclust(faithful$waiting)
mc
@
and the estimated means are
<>=
mc$parameters$mean
@
with estimated standard deviation (found to be equal within both groups)
<>=
sqrt(mc$parameters$variance$sigmasq)
@
The proportion is $\hat{p} = \Sexpr{round(mc$parameters$pro[1], 2)}$. The second package is called
\Rpackage{flexmix} whose functionality is described by
\cite{HSAUR:Leisch2004}.
A mixture of two normals can be fitted using
<>=
library("flexmix")
fl <- flexmix(waiting ~ 1, data = faithful, k = 2)
@
with $\hat{p} = \Sexpr{round(fl@prior, 2)}$ and estimated parameters
<>=
parameters(fl, component = 1)
parameters(fl, component = 2)
@
\begin{figure}
\begin{center}
<>=
opar <- as.list(opp$par)
rx <- seq(from = 40, to = 110, by = 0.1)
d1 <- dnorm(rx, mean = opar$mu1, sd = opar$sd1)
d2 <- dnorm(rx, mean = opar$mu2, sd = opar$sd2)
f <- opar$p * d1 + (1 - opar$p) * d2
hist(x, probability = TRUE, xlab = "Waiting times (in min.)",
border = "gray", xlim = range(rx), ylim = c(0, 0.06),
main = "")
lines(rx, f, lwd = 2)
lines(rx, dnorm(rx, mean = mean(x), sd = sd(x)), lty = 2,
lwd = 2)
legend(50, 0.06, lty = 1:2, bty = "n",
legend = c("Fitted two-component mixture density",
"Fitted single normal density"))
@
\caption{Fitted normal density and two-component normal mixture for geyser
eruption data. \label{DE:2Dplot}}
\end{center}
\end{figure}
\index{Bootstrap approach|(}
We can get standard errors for the five parameter estimates
by using a bootstrap approach \citep[see][]{HSAUR:EfronTibshirani1993}.
The original data are slightly perturbed by drawing $n$ out of $n$
observations \stress{with replacement} and those artificial replications of
the original data are called \stress{bootstrap samples}. Now, we can fit the
mixture for each bootstrap sample and assess the variability of the
estimates, for example using confidence intervals.
\index{Confidence interval!derived from bootstrap samples}
Some suitable \R{} code based on the \Rcmd{Mclust}
function follows. First, we define a function that,
for a bootstrap sample \Robject{indx}, fits a two-component mixture
model and returns $\hat{p}$ and the estimated means (note that we need to make
sure that we always get an estimate of $p$, not $1 - p$):
<>=
library("boot")
fit <- function(x, indx) {
a <- Mclust(x[indx], minG = 2, maxG = 2,
modelNames="E")$parameters
if (a$pro[1] < 0.5)
return(c(p = a$pro[1], mu1 = a$mean[1],
mu2 = a$mean[2]))
return(c(p = 1 - a$pro[1], mu1 = a$mean[2],
mu2 = a$mean[1]))
}
@
The function \Rcmd{fit} can now be fed into the \Rcmd{boot} function \citep{PKG:boot}
for bootstrapping (here $1000$ bootstrap samples are drawn)
\begin{Schunk}
\begin{Sinput}
R> bootpara <- boot(faithful$waiting, fit, R = 1000)
\end{Sinput}
\end{Schunk}
<>=
bootparafile <- system.file("cache", "DE-bootpara.rda", package = "HSAUR3")
if (file.exists(bootparafile)) {
load(bootparafile)
} else {
bootpara <- boot(faithful$waiting, fit, R = 1000)
}
@
We assess the variability of our estimates $\hat{p}$ by
means of adjusted bootstrap percentile (BCa) confidence intervals, which for
$\hat{p}$ can be obtained from
<>=
boot.ci(bootpara, type = "bca", index = 1)
@
We see that there is a reasonable variability in the mixture model; however,
the means in the two components are rather stable, as can be seen from
<>=
boot.ci(bootpara, type = "bca", index = 2)
@
for $\hat{\mu}_1$ and for $\hat{\mu}_2$ from
<>=
boot.ci(bootpara, type = "bca", index = 3)
@
Finally, we show a graphical representation of both the bootstrap
distribution of the mean estimates \stress{and} the corresponding confidence
intervals. For convenience, we define a function for plotting, namely
<>=
bootplot <- function(b, index, main = "") {
dens <- density(b$t[,index])
ci <- boot.ci(b, type = "bca", index = index)$bca[4:5]
est <- b$t0[index]
plot(dens, main = main)
y <- max(dens$y) / 10
segments(ci[1], y, ci[2], y, lty = 2)
points(ci[1], y, pch = "(")
points(ci[2], y, pch = ")")
points(est, y, pch = 19)
}
@
The element \Robject{t} of an object created by \Rcmd{boot} contains the
bootstrap replications of our estimates, i.e., the values computed by
\Rcmd{fit} for each of the $1000$ bootstrap samples of the geyser data.
First, we plot a simple density estimate and then construct a line
representing the confidence interval.
We apply this function to the bootstrap distributions of our estimates
$\hat{\mu}_1$ and $\hat{\mu}_2$ in Figure~\ref{DE-bootplot}.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, ncol = 2))
bootplot(bootpara, 2, main = expression(mu[1]))
bootplot(bootpara, 3, main = expression(mu[2]))
@
\caption{Bootstrap distribution and confidence intervals for the mean estimates
of a two-component mixture for the geyser data. \label{DE-bootplot}}
\end{center}
\end{figure}
\index{Bootstrap approach|)}
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}