\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}} \newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{\textit{#1}} %%' %%% Math symbols \usepackage{amstext} \usepackage{amsmath} \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\L}{L} \renewcommand{\P}{\mathsf{P}} \newcommand{\K}{\mathbf{K}} \newcommand{\m}{\mathbf{m}} \newcommand{\argmin}{\operatorname{argmin}\displaylimits} \newcommand{\argmax}{\operatorname{argmax}\displaylimits} \newcommand{\bx}{\mathbf{x}} \newcommand{\bbeta}{\mathbf{\beta}} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)}, pdfsubject = {Book}, pdfauthor = {Torsten Hothorn and Brian S. Everitt}, colorlinks = {black}, linkcolor = {black}, citecolor = {black}, urlcolor = {black}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% : conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% \usepackage{longtable} \usepackage[figuresright]{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} \hyphenation{mar-gi-nal} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} %\usepackage{setspace} \definecolor{sidebox_todo}{rgb}{1,1,0.2} \newcommand{\todo}[1]{ \hspace{0pt}% \marginpar{% \fcolorbox{black}{sidebox_todo}{% \parbox{\marginparwidth} { \raggedright\sffamily\footnotesize{TODO: #1}% } }% } } \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition} \author{Torsten Hothorn and Brian S. Everitt} \maketitle %%\VignetteIndexEntry{Chapter Density Estimation} %%\VignetteDepends{flexmix,KernSmooth,boot} \setcounter{chapter}{7} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, show.signif.stars = FALSE, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)), bigleftpar = function() par(mai = par("mai") * c(1, 1.7, 1, 1)))) HSAURpkg <- require("HSAUR3") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3")) rm(HSAURpkg) ### hm, R-2.4.0 --vanilla seems to need this a <- Sys.setlocale("LC_ALL", "C") ### book <- TRUE refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA", "MDS", "CA"), 1:18) ch <- function(x) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~", ch[2], sep = "")) } } if (file.exists("deparse.R")) source("deparse.R") setHook(packageEvent("lattice", "attach"), function(...) { lattice.options(default.theme = function() standard.theme("pdf", color = FALSE)) }) @ \pagestyle{headings} <>= book <- FALSE @ %% lower png resolution for vignettes \SweaveOpts{resolution = 100} <>= x <- library("KernSmooth") x <- library("flexmix") x <- library("boot") @ \chapter[Density Estimation]{Density Estimation: Erupting Geysers and Star Clusters \label{DE}} \section{Introduction} \section{Density Estimation} The three kernel functions are implemented in \R{} as shown in lines 1--3 of Figure~\ref{DE-kernel-fig}. For some grid \Robject{x}, the kernel functions are plotted using the \R{} statements in lines 5--11 (Figure~\ref{DE-kernel-fig}). \numberSinput \begin{figure} \begin{center} <>= rec <- function(x) (abs(x) < 1) * 0.5 tri <- function(x) (abs(x) < 1) * (1 - abs(x)) gauss <- function(x) 1/sqrt(2*pi) * exp(-(x^2)/2) x <- seq(from = -3, to = 3, by = 0.001) plot(x, rec(x), type = "l", ylim = c(0,1), lty = 1, ylab = expression(K(x))) lines(x, tri(x), lty = 2) lines(x, gauss(x), lty = 3) legend(-3, 0.8, legend = c("Rectangular", "Triangular", "Gaussian"), lty = 1:3, title = "kernel functions", bty = "n") @ \caption{Three commonly used kernel functions. \label{DE-kernel-fig}} \end{center} \end{figure} \rawSinput <>= w <- options("width")$w options(width = 66) @ The kernel estimator $\hat{f}$ is a sum of `bumps' placed at the observations. %' The kernel function determines the shape of the bumps while the window width $h$ determines their width. \index{Windows, in kernel density estimation} Figure~\ref{DE-bumps} \citep[redrawn from a similar plot in][]{HSAUR:Silverman1986} shows the individual bumps $n^{-1}h^{-1} K((x - x_i) / h)$, as well as the estimate $\hat{f}$ obtained by adding them up for an artificial set of data points <>= x <- c(0, 1, 1.1, 1.5, 1.9, 2.8, 2.9, 3.5) n <- length(x) @ For a grid <>= xgrid <- seq(from = min(x) - 1, to = max(x) + 1, by = 0.01) @ on the real line, we can compute the contribution of each measurement in \Robject{x}, with $h = 0.4$, by the Gaussian kernel (defined in Figure~\ref{DE-kernel-fig}, line 3) as follows; <>= h <- 0.4 bumps <- sapply(x, function(a) gauss((xgrid - a)/h)/(n * h)) @ A plot of the individual bumps and their sum, the kernel density estimate $\hat{f}$, is shown in Figure~\ref{DE-bumps}. <>= options(width = w) @ \numberSinput \begin{figure} \begin{center} <>= plot(xgrid, rowSums(bumps), ylab = expression(hat(f)(x)), type = "l", xlab = "x", lwd = 2) rug(x, lwd = 2) out <- apply(bumps, 2, function(b) lines(xgrid, b)) @ \caption{Kernel estimate showing the contributions of Gaussian kernels evaluated for the individual observations with bandwidth $h = 0.4$. \label{DE-bumps}} \end{center} \end{figure} \rawSinput \begin{figure} \begin{center} <>= epa <- function(x, y) ((x^2 + y^2) < 1) * 2/pi * (1 - x^2 - y^2) x <- seq(from = -1.1, to = 1.1, by = 0.05) epavals <- sapply(x, function(a) epa(a, x)) persp(x = x, y = x, z = epavals, xlab = "x", ylab = "y", zlab = expression(K(x, y)), theta = -35, axes = TRUE, box = TRUE) @ \caption{Epanechnikov kernel for a grid between $(-1.1, -1.1)$ and $(1.1, 1.1)$. \label{DE-epakernel-fig}} \end{center} \end{figure} \section{Analysis Using \R{}} \numberSinput \begin{figure} \begin{center} <>= data("faithful", package = "datasets") x <- faithful$waiting layout(matrix(1:3, ncol = 3)) hist(x, xlab = "Waiting times (in min.)", ylab = "Frequency", probability = TRUE, main = "Gaussian kernel", border = "gray") lines(density(x, width = 12), lwd = 2) rug(x) hist(x, xlab = "Waiting times (in min.)", ylab = "Frequency", probability = TRUE, main = "Rectangular kernel", border = "gray") lines(density(x, width = 12, window = "rectangular"), lwd = 2) rug(x) hist(x, xlab = "Waiting times (in min.)", ylab = "Frequency", probability = TRUE, main = "Triangular kernel", border = "gray") lines(density(x, width = 12, window = "triangular"), lwd = 2) rug(x) @ \caption{Density estimates of the geyser eruption data imposed on a histogram of the data. \label{DE:faithfuldens}} \end{center} \end{figure} \rawSinput \begin{figure} \begin{center} <>= library("KernSmooth") data("CYGOB1", package = "HSAUR3") CYGOB1d <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1, dpik)) contour(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat, xlab = "log surface temperature", ylab = "log light intensity") @ \caption{A contour plot of the bivariate density estimate of the \Robject{CYGOB1} data, i.e., a two-dimensional graphical display for a three-dimensional problem. \label{DE:CYGOB12Dcontour}} \end{center} \end{figure} \begin{figure} \begin{center} <>= persp(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat, xlab = "log surface temperature", ylab = "log light intensity", zlab = "estimated density", theta = -35, axes = TRUE, box = TRUE) @ \caption{The bivariate density estimate of the \Robject{CYGOB1} data, here shown in a three-dimensional fashion using the \Rcmd{persp} function. \label{DE:CYGOB12Dpersp}} \end{center} \end{figure} \subsection{A Parametric Density Estimate for the Old Faithful Data \label{DE-waiting}} <>= logL <- function(param, x) { d1 <- dnorm(x, mean = param[2], sd = param[3]) d2 <- dnorm(x, mean = param[4], sd = param[5]) -sum(log(param[1] * d1 + (1 - param[1]) * d2)) } startparam <- c(p = 0.5, mu1 = 50, sd1 = 3, mu2 = 80, sd2 = 3) opp <- optim(startparam, logL, x = faithful$waiting, method = "L-BFGS-B", lower = c(0.01, rep(1, 4)), upper = c(0.99, rep(200, 4))) @ \newpage <>= opp @ <>= print(opp[names(opp) != "message"]) @ Of course, optimizing the appropriate likelihood `by hand' %' is not very convenient. In fact, (at least) two packages offer high-level functionality for estimating mixture models. The first one is package \Rpackage{mclust} \citep{PKG:mclust} implementing the methodology described in \cite{HSAUR:FraleyRaftery2002}. Here, a Bayesian information criterion (BIC) is applied to choose the form of the mixture model: \index{Bayesian Information Criterion (BIC)} <>= library("mclust") @ <>= library("mclust") mc <- Mclust(faithful$waiting) mc @ and the estimated means are <>= mc$parameters$mean @ with estimated standard deviation (found to be equal within both groups) <>= sqrt(mc$parameters$variance$sigmasq) @ The proportion is $\hat{p} = \Sexpr{round(mc$parameters$pro[1], 2)}$. The second package is called \Rpackage{flexmix} whose functionality is described by \cite{HSAUR:Leisch2004}. A mixture of two normals can be fitted using <>= library("flexmix") fl <- flexmix(waiting ~ 1, data = faithful, k = 2) @ with $\hat{p} = \Sexpr{round(fl@prior, 2)}$ and estimated parameters <>= parameters(fl, component = 1) parameters(fl, component = 2) @ \begin{figure} \begin{center} <>= opar <- as.list(opp$par) rx <- seq(from = 40, to = 110, by = 0.1) d1 <- dnorm(rx, mean = opar$mu1, sd = opar$sd1) d2 <- dnorm(rx, mean = opar$mu2, sd = opar$sd2) f <- opar$p * d1 + (1 - opar$p) * d2 hist(x, probability = TRUE, xlab = "Waiting times (in min.)", border = "gray", xlim = range(rx), ylim = c(0, 0.06), main = "") lines(rx, f, lwd = 2) lines(rx, dnorm(rx, mean = mean(x), sd = sd(x)), lty = 2, lwd = 2) legend(50, 0.06, lty = 1:2, bty = "n", legend = c("Fitted two-component mixture density", "Fitted single normal density")) @ \caption{Fitted normal density and two-component normal mixture for geyser eruption data. \label{DE:2Dplot}} \end{center} \end{figure} \index{Bootstrap approach|(} We can get standard errors for the five parameter estimates by using a bootstrap approach \citep[see][]{HSAUR:EfronTibshirani1993}. The original data are slightly perturbed by drawing $n$ out of $n$ observations \stress{with replacement} and those artificial replications of the original data are called \stress{bootstrap samples}. Now, we can fit the mixture for each bootstrap sample and assess the variability of the estimates, for example using confidence intervals. \index{Confidence interval!derived from bootstrap samples} Some suitable \R{} code based on the \Rcmd{Mclust} function follows. First, we define a function that, for a bootstrap sample \Robject{indx}, fits a two-component mixture model and returns $\hat{p}$ and the estimated means (note that we need to make sure that we always get an estimate of $p$, not $1 - p$): <>= library("boot") fit <- function(x, indx) { a <- Mclust(x[indx], minG = 2, maxG = 2, modelNames="E")$parameters if (a$pro[1] < 0.5) return(c(p = a$pro[1], mu1 = a$mean[1], mu2 = a$mean[2])) return(c(p = 1 - a$pro[1], mu1 = a$mean[2], mu2 = a$mean[1])) } @ The function \Rcmd{fit} can now be fed into the \Rcmd{boot} function \citep{PKG:boot} for bootstrapping (here $1000$ bootstrap samples are drawn) \begin{Schunk} \begin{Sinput} R> bootpara <- boot(faithful$waiting, fit, R = 1000) \end{Sinput} \end{Schunk} <>= bootparafile <- system.file("cache", "DE-bootpara.rda", package = "HSAUR3") if (file.exists(bootparafile)) { load(bootparafile) } else { bootpara <- boot(faithful$waiting, fit, R = 1000) } @ We assess the variability of our estimates $\hat{p}$ by means of adjusted bootstrap percentile (BCa) confidence intervals, which for $\hat{p}$ can be obtained from <>= boot.ci(bootpara, type = "bca", index = 1) @ We see that there is a reasonable variability in the mixture model; however, the means in the two components are rather stable, as can be seen from <>= boot.ci(bootpara, type = "bca", index = 2) @ for $\hat{\mu}_1$ and for $\hat{\mu}_2$ from <>= boot.ci(bootpara, type = "bca", index = 3) @ Finally, we show a graphical representation of both the bootstrap distribution of the mean estimates \stress{and} the corresponding confidence intervals. For convenience, we define a function for plotting, namely <>= bootplot <- function(b, index, main = "") { dens <- density(b$t[,index]) ci <- boot.ci(b, type = "bca", index = index)$bca[4:5] est <- b$t0[index] plot(dens, main = main) y <- max(dens$y) / 10 segments(ci[1], y, ci[2], y, lty = 2) points(ci[1], y, pch = "(") points(ci[2], y, pch = ")") points(est, y, pch = 19) } @ The element \Robject{t} of an object created by \Rcmd{boot} contains the bootstrap replications of our estimates, i.e., the values computed by \Rcmd{fit} for each of the $1000$ bootstrap samples of the geyser data. First, we plot a simple density estimate and then construct a line representing the confidence interval. We apply this function to the bootstrap distributions of our estimates $\hat{\mu}_1$ and $\hat{\mu}_2$ in Figure~\ref{DE-bootplot}. \begin{figure} \begin{center} <>= layout(matrix(1:2, ncol = 2)) bootplot(bootpara, 2, main = expression(mu[1])) bootplot(bootpara, 3, main = expression(mu[2])) @ \caption{Bootstrap distribution and confidence intervals for the mean estimates of a two-component mixture for the geyser data. \label{DE-bootplot}} \end{center} \end{figure} \index{Bootstrap approach|)} \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}