\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
%% Verbatim environments that wrap the contents of code chunks.
%% Sinput and Scode use fancyvrb's Verbatim without extra options.
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
%% Soutput: italic Courier, one step smaller than the surrounding text.
\DefineVerbatimEnvironment{Soutput}{Verbatim}{%
  fontfamily=courier, fontshape=it, fontsize=\relsize{-1}}
%% Schunk encloses a whole chunk; it does nothing until \SchunkLabel
%% redefines it.
\newenvironment{Schunk}{}{}
%%% environment for raw output
%% \SchunkRaw: return to plain chunk typesetting.  Schunk becomes a no-op
%% again, Soutput is set in \small italic Courier, and Sinput is reset via
%% \rawSinput.  Every line end inside the body is protected with % so that
%% calling the macro in the middle of a paragraph adds no stray spaces.
\newcommand{\SchunkRaw}{%
  \renewenvironment{Schunk}{}{}%
  \DefineVerbatimEnvironment{Soutput}{Verbatim}{%
    fontfamily=courier, fontshape=it, fontsize=\small}%
  \rawSinput
}
%%% environment for labeled output
%% \nextcaption holds the caption (and label) of the next labelled chunk;
%% set it with \renewcommand{\nextcaption}{...} before \SchunkLabel.
\newcommand{\nextcaption}{}
%% \SchunkLabel: typeset the following chunks as captioned figure floats.
%% Schunk becomes a figure whose caption is \nextcaption; Sinput gets a rule
%% on top and Soutput a rule at the bottom.  Line ends are protected with %
%% so that no stray spaces reach the text, in particular after \end{figure}.
\newcommand{\SchunkLabel}{%
  \renewenvironment{Schunk}%
    {\begin{figure}[ht]}%
    {\caption{\nextcaption}\end{figure}}%
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame=topline}%
  \DefineVerbatimEnvironment{Soutput}{Verbatim}{%
    frame=bottomline, samepage=true,
    fontfamily=courier, fontshape=it, fontsize=\relsize{-1}}%
}
%%% S code with line numbers
%% Default Sinput: plain Verbatim.  Line numbering stays off here
%% (numbers=left is what \numberSinput switches on).
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
%% \numberSinput: number the lines of subsequent Sinput environments on the
%% left.  Line ends are protected with % so the switch adds no spaces when
%% issued inside a paragraph.
\newcommand{\numberSinput}{%
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}%
}
%% \rawSinput: back to Sinput without any options (no line numbers).
\newcommand{\rawSinput}{%
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{}%
}
%%% R / System symbols
%% Names of software systems, set in sans serif.  The r-prefixed variants
%% (\rR, \rSPLUS) give the bare name without a font change.
\newcommand*{\R}{\textsf{R}}
\newcommand*{\rR}{{R}}
%% \S already exists (hence \renewcommand) and is replaced by the system name.
\renewcommand*{\S}{\textsf{S}}
\newcommand*{\SPLUS}{\textsf{S-PLUS}}
\newcommand*{\rSPLUS}{{S-PLUS}}
\newcommand*{\SPSS}{\textsf{SPSS}}
\newcommand*{\EXCEL}{\textsf{Excel}}
\newcommand*{\ACCESS}{\textsf{Access}}
\newcommand*{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
%% Markup for R entities.  \Rpackage, \Rclass and \Rcmd typeset their
%% argument and also write an index entry "<name> package", "<name> class"
%% and "<name> function", respectively.
%% \Rpackage: bold package name.  There must be no space between \index{...}
%% and the printed name, otherwise the text gets two interword spaces around
%% the (invisible) index entry.
\newcommand{\Rpackage}[1]{%
  \index{#1 package@{\fontseries{b}\selectfont #1} package}%
  {\fontseries{b}\selectfont #1}}
%% \rpackage: bold package name without an index entry.
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
%% \Robject: typewriter type, not indexed.
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
%% Operators, argument names and factor levels: typewriter type, not indexed.
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
%% \file{name}: file name in upright typewriter type inside an unbreakable
%% box.  \mbox is used instead of a bare \hbox so that the macro also works
%% at the very start of a paragraph, and \normalfont\ttfamily replaces the
%% obsolete two-letter switch \rm followed by \texttt.
\newcommand{\file}[1]{\mbox{\normalfont\ttfamily #1}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
%% \stress{text}: italic emphasis (the indexing variant above is disabled).
\newcommand{\stress}[1]{\textit{#1}}
%% \booktitle{title}: titles in italics.
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
%% Operators for expectation, variance, covariance and correlation,
%% set in sans serif.
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
%% Bold upright letters.  Note that \a, \H, \L and \P already exist
%% (hence \renewcommand) and are replaced by these math symbols.
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
%% argmin/argmax: subscripts go underneath in display style only.
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
%% \mathbf has no effect on lowercase Greek letters; \boldsymbol (available
%% because amsmath is loaded above) gives a bold beta.
\newcommand{\bbeta}{\boldsymbol{\beta}}
%%% links
\usepackage{hyperref}
%% PDF metadata and link appearance.  colorlinks is a boolean switch
%% (true/false); with every link colour set to black, colorlinks=true gives
%% links that look like ordinary text instead of framed boxes.
\hypersetup{%
  pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
  pdfsubject = {Book},
  pdfauthor = {Torsten Hothorn and Brian S. Everitt},
  colorlinks = {true},
  linkcolor = {black},
  citecolor = {black},
  urlcolor = {black},
  hyperindex = {true},
  linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
%% Title the reference list "References" and, whenever that title is
%% typeset, add a chapter-level "References" line to the table of contents.
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
%% NOTE(review): presumably natbib's switch for writing citations to the
%% index, turned off here -- confirm.
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
%% \def is used, so any existing definition of \newblock is overwritten
%% without a check.
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
%% Exercise counter; it is reset whenever the chapter counter is stepped.
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
%% \exercise: advance the counter and open a new list item that starts with
%% "Ex. <chapter>.<exercise>".
\newcommand{\exercise}{\stepcounter{exercise}
  \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
%% \curl{address}: a URL centred on a line of its own.
\newcommand{\curl}[1]{\begin{center}
  \url{#1}
\end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
%% Background colour of the TODO margin boxes (light yellow).
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
%% \todo{text}: put "TODO: text" into a framed, coloured margin note.
%% Every line end is protected with % so the macro leaves no stray spaces in
%% the running text or inside the box.  The inner \parbox is narrowed by the
%% frame padding and rule so that the whole box is exactly \marginparwidth
%% wide instead of sticking out of the margin.
\newcommand{\todo}[1]{%
  \hspace{0pt}%
  \marginpar{%
    \fcolorbox{black}{sidebox_todo}{%
      \parbox{\dimexpr\marginparwidth-2\fboxsep-2\fboxrule\relax}{%
        \raggedright\sffamily\footnotesize TODO: #1%
      }%
    }%
  }%
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Missing Values}
%%\VignetteDepends{mice}
\setcounter{chapter}{15}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
### start from an empty workspace
rm(list = ls())
### detach everything except the entries of a default R session.
### setdiff() also works when one of the listed entries is not on the
### search path; negative indexing with match() breaks in that case
### because match() returns NA for the missing entry.
s <- setdiff(search()[-1],
    c("package:base", "package:stats", "package:graphics",
      "package:grDevices", "package:utils", "package:datasets",
      "package:methods", "Autoloads"))
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
### output directories for tables and figures
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
### fixed seed and console layout
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
    width = 63, # digits = 4,
    show.signif.stars = FALSE,
    ### hooks that widen the left figure margin by 5% / 70%
    SweaveHooks = list(leftpar = function()
        par(mai = par("mai") * c(1, 1.05, 1, 1)),
    bigleftpar = function()
        par(mai = par("mai") * c(1, 1.7, 1, 1))))
### the chapter cannot be built without the HSAUR3 package
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
### book: should ch() produce \ref cross-references (TRUE) or numbers (FALSE)?
book <- TRUE
### chapter short names (column 1) and chapter numbers (column 2)
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
    "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
    "MDS", "CA"), 1:18)
### ch(x): text referring to the chapter whose short name is x.
### Looks x up in the first column of `refs`; returns "Chapter~" followed
### by a \ref to the short name when `book` is TRUE, and by the chapter
### number from the second column otherwise.
ch <- function(x) {
    entry <- refs[which(refs[, 1] == x), ]
    target <- if (book) {
        paste0("\\\\ref{", entry[1], "}")
    } else {
        entry[2]
    }
    paste0("Chapter~", target)
}
### read deparse.R when it is present in the working directory
if (file.exists("deparse.R")) {
    source("deparse.R")
}
### whenever lattice is attached, make the black-and-white "pdf" theme
### its default theme
setHook(packageEvent("lattice", "attach"),
    function(...) {
        bw <- function() standard.theme("pdf", color = FALSE)
        lattice.options(default.theme = bw)
    })
@
\pagestyle{headings}
<>=
### overrides the book <- TRUE of the setup chunk: ch() now returns the
### chapter number instead of a \ref
book <- FALSE
@
\chapter[Missing Values]{Missing Values:
Lowering Blood Pressure During Surgery \label{MV}}
\section{Introduction}
\index{Blood pressure}
It is sometimes necessary to lower a patient's blood pressure during
surgery, using a hypotensive drug. Such drugs are administered continuously
during the relevant phase of the operation; because the duration of this
phase varies so does the total amount of drug administered. Patients also
vary in the extent to which the drugs succeed in lowering blood pressure.
The sooner the blood pressure rises again to normal after the drug is
discontinued, the better. The data in Table~\ref{MV-bp-tab} \citep[a
missing-value version of the data presented
by][]{HSAUR:RobertsonArmitage1959} relate to a particular hypotensive drug
and give the time in minutes before the patient's systolic blood pressure
returned to 100mm of mercury (the recovery time), the logarithm (base 10) of
the dose of drug in milligrams, and the average systolic blood pressure
achieved while the drug was being administered. The question of interest is
how is the recovery time related to the other two variables? For some
patients the recovery time was not recorded and the missing values are
indicated as NA in Table~\ref{MV-bp-tab}.
<>=
### LaTeX table of the bp data
data("bp", package = "HSAUR3")
toLatex(HSAURtable(bp),
        label = "MV-bp-tab",
        caption = "Blood pressure data.",
        pcol = 2)
@
\section{Analyzing Multiply Imputed Data} \label{MI:ana}
From the analysis of each data set we need to look at the estimates of the
quantity of interest, say $Q$, and the variance of the estimates. We let
$\hat{Q}_i$ be the estimate from the $i$th data set and $S_i$ its
corresponding variance. The combined estimate of the quantity of interest
is
\begin{eqnarray*}
\bar{Q} = \frac{1}{m}\sum_{i = 1}^m \hat{Q}_i.
\end{eqnarray*}
To find the combined variance involves first calculating the
within-imputation variance,
\begin{eqnarray*}
\bar{S} = \frac{1}{m}\sum_{i = 1}^m S_i
\end{eqnarray*}
followed by the between-imputation variance,
\begin{eqnarray*}
B = \frac{1}{m - 1} \sum_{i = 1}^m (\hat{Q}_i - \bar{Q})^2
\end{eqnarray*}
then the required total variance can now be found from
\begin{eqnarray*}
T = \bar{S} + (1 + m^{-1}) B
\end{eqnarray*}
This total variance is made up of two components; the first which preserves
the natural variability, $\bar{S}$, is simply the average of the variance
estimates for each imputed data set and is analogous to the variance that
would be suitable if we did not need to account for missing data; the second
component, $B$, estimates uncertainty caused by missing data by measuring
how the point estimates vary from data set to data set. More explanation of
how the formula for $T$ arises is given in
\cite{HSAUR:vanBuuren2012}.
The overall standard error is simply the square root of $T$. A significance
test for $Q$ and a confidence interval are found from the usual test statistic,
($\bar{Q}$ $-$ hypothesized value of $Q$)/$\sqrt{T}$, the value of which is referred to a
Student's $t$-distribution. The question arises however as to what is the appropriate value for the degrees of
freedom of the test, say $v_0$? \cite{HSAUR:Rubin1987} suggests that the answer to this
question is given by;
\begin{eqnarray*}
v_0 = (m - 1) (1 + 1/r)^2
\end{eqnarray*}
where
\begin{eqnarray*}
r = \frac{B + B / m}{\bar{S}}
\end{eqnarray*}
But \cite{HSAUR:BarnardRubin1999} noted that using this value of $v_0$ can produce values that are
larger than the degrees of freedom in the complete data, a result which they
considered `clearly inappropriate'. Consequently they developed an adapted
version that does not lead to the same problem. Barnard and Rubin's revised
value for the degrees of freedom of the $t$-test in which we are interested is $v_1$
given by;
\begin{eqnarray*}
v_1 = \frac{v_0 v_2}{v_0 + v_2}
\end{eqnarray*}
where
\begin{eqnarray*}
v_2 = \frac{n(n-1)(1 - \lambda)}{n + 2}
\end{eqnarray*}
and
\begin{eqnarray*}
\lambda = \frac{r}{r + 1}.
\end{eqnarray*}
The quantity $v_1$ is always less than or equal to the degrees of freedom of
the test applied to the hypothetically complete data. \citep[For more
details see][]{HSAUR:vanBuuren2012}.
\index{Imputation|)}
\section{Analysis Using \R{}}
To begin we shall analyze the blood pressure data in Table~\ref{MV-bp-tab}
using the complete-case approach, i.e., by simply removing the data for
patients where the recovery time is missing. First we might simply count
the number of missing values using the \Rcmd{sapply} function as follows:
<>=
sapply(bp, function(x) sum(is.na(x)))
@
So there are ten missing values of recovery time but no missing values
amongst the other two variables. Now we use the \Rcmd{summary} function to look at
some basic statistics of the complete data for recovery time:
<>=
summary(bp$recovtime, na.rm = TRUE)
@
And next we can calculate the complete data estimate of the standard deviation
of recovery time:
<>=
sd(bp$recovtime, na.rm = TRUE)
@
The final numerical results we might be interested in are the correlations
of recovery time with blood pressure and of recovery time with logdose.
These can be found as follows:
<>=
with(bp, cor(bloodp, recovtime, use = "complete.obs"))
with(bp, cor(logdose, recovtime, use = "complete.obs"))
@
And a useful graphic of the data is a set of pairwise scatterplots which we can
construct using \Rcmd{plot}. The scatterplots are given in
Figure~\ref{MV-bp-pairs-cc}.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:3, nrow = 1))
plot(bloodp ~ logdose, data = bp)
plot(recovtime ~ bloodp, data = bp)
plot(recovtime ~ logdose, data = bp)
@
\caption{Scatterplots of the complete cases of the \Robject{bp} data.
\label{MV-bp-pairs-cc}}
\end{center}
\end{figure}
To investigate how recovery time is related to blood pressure and logdose we
might begin by fitting a multiple linear regression model (see
Chapter~\ref{MLR}). The relevant command and the summary of the results is
shown in Figure~\ref{MV-bp-lm-cc}. Note that this summary output reports
that ten observations with missing values were removed prior to the
analysis; this is default for many models in \R.
\renewcommand{\nextcaption}{\R{} output of the complete-case linear model
for the \Robject{bp} data.
\label{MV-bp-lm-cc}}
\SchunkLabel
<>=
summary(lm(recovtime ~ bloodp + logdose, data = bp))
@
\SchunkRaw
Now let us see what happens when we impute the missing values of the
recovery time variable simply by the mean of the complete case; for this we
will use the \Rpackage{mice} \citep{PKG:mice} package;
<>=
library("mice")
@
We begin by creating a new data set, \Robject{imp}, which will contain the three
variables log-dose, blood pressure, and recovery time with the missing values
in the latter replaced by the mean recovery time of the complete cases;
<>=
imp <- mice(bp, method = "mean", m = 1, maxit = 1)
@
So now we can find the summary statistics of recovery time to compare with
those given previously
<>=
with(imp, summary(recovtime))
@
Making the comparison we see that only the values of the first and third
quantile and the median have changed. The minimum and maximum values are
the same and so, of course, is the mean. But of more interest is what
happens to the sample standard deviation; its value for the imputed data can
be found using:
<>=
with(imp, sd(recovtime))
@
The value for the imputed data, $\Sexpr{round(with(imp, sd(recovtime))[["analyses"]][[1]], 2)}$
is, as we would expect, lower than that for the complete data,
$\Sexpr{round(with(bp, sd(recovtime, na.rm = TRUE)), 2)}$.
What about the correlations?
<>=
with(imp, cor(bloodp, recovtime))
with(imp, cor(logdose, recovtime))
@
The correlations of blood pressure and recovery time are very similar before
($\Sexpr{round(with(bp, cor(bloodp, recovtime, use = "complete.obs")), 2)}$)
and after
($\Sexpr{round(with(imp, cor(bloodp, recovtime))[["analyses"]][[1]], 2)}$)
imputation. For log-dose, imputation changes the correlation from
$\Sexpr{round(with(bp, cor(logdose, recovtime, use = "complete.obs")), 2)}$
to
$\Sexpr{round(with(imp, cor(logdose, recovtime))[["analyses"]][[1]], 2)}$.
The scatterplot of the imputed data is found as given by the code displayed
with Figure~\ref{MV-bp-pairs-imp}. For mean imputation, the imputed value
of the recovery time is constant for all observations and so they appear as
a series of points along the value of the mean value of the observed
recovery times namely, $\Sexpr{round(with(bp, mean(recovtime, na.rm = TRUE)), 2)}$.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, nrow = 1))
plot(recovtime ~ bloodp, data = complete(imp),
pch = is.na(bp$recovtime) + 1)
plot(recovtime ~ logdose, data = complete(imp),
pch = is.na(bp$recovtime) + 1)
legend("topleft", pch = 1:2, bty = "n",
legend = c("original", "imputed"))
@
\caption{Scatterplots of the imputed \Robject{bp} data. Imputed
observations are depicted as triangles.
\label{MV-bp-pairs-imp}}
\end{center}
\end{figure}
\renewcommand{\nextcaption}{\R{} output of the mean imputation linear model
for the \Robject{bp} data.
\label{MV-bp-lm-imp}}
\SchunkLabel
<>=
with(imp, summary(lm(recovtime ~ bloodp + logdose)))
@
\SchunkRaw
Comparison of the multiple linear regression results in
Figure~\ref{MV-bp-lm-imp} with those in Figure~\ref{MV-bp-lm-cc} shows some
interesting differences; for example, the standard errors of the regression
coefficients are somewhat lower for the mean imputed data but the
conclusions drawn from the results in each figure would be broadly similar.
\index{Predictive mean matching}
The single imputation of a sample mean is not to be recommended and so we
will move on to using a more sophisticated multiple imputation procedure
known as \stress{predictive mean matching}. The method is described in
detail in \cite{HSAUR:vanBuuren2012} who considers it both easy-to-use and versatile.
And imputations outside the observed data range will not occur so that
problems with meaningless imputations, for example, a negative recovery
time, will not occur. The method is labeled \Robject{pmm} in the
\Rpackage{mice} package and here we will apply it to the blood pressure data
with $m = 10$ (we need to fix the seed in order to make the result
reproducible):
<>=
imp_ppm <- mice(bp, m = 10, method = "pmm",
print = FALSE, seed = 1)
@
The scatterplot of the imputed data is found as given by the code displayed
with Figure~\ref{MV-bp-pairs-imp-mice}. We only show the imputed recovery
times from the first iteration ($m = 1$). The imputed recovery times now take
different values.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, nrow = 1))
plot(recovtime ~ bloodp, data = complete(imp_ppm),
pch = is.na(bp$recovtime) + 1)
plot(recovtime ~ logdose, data = complete(imp_ppm),
pch = is.na(bp$recovtime) + 1)
legend("topleft", pch = 1:2, bty = "n",
legend = c("original", "imputed"))
@
\caption{Scatterplots of the multiple imputed \Robject{bp} data (first iteration).
Imputed observations are depicted as triangles.
\label{MV-bp-pairs-imp-mice}}
\end{center}
\end{figure}
From the resulting object we can compute the mean and standard deviations of
recovery time for each of the $m = 10$ iterations. We first extract these
numbers from the \Robject{analyses} element of the returned object, convert
this list to a vector, and use the \Rcmd{summary} function to compute the
usual summary statistics:
<>=
summary(unlist(with(imp_ppm, mean(recovtime))$analyses))
summary(unlist(with(imp_ppm, sd(recovtime))$analyses))
@
We do the same with the correlations as follows
<>=
summary(unlist(with(imp_ppm,
cor(bloodp, recovtime))$analyses))
summary(unlist(with(imp_ppm,
cor(logdose, recovtime))$analyses))
@
The estimate of the mean recovery time from the multiply
imputed results is $\Sexpr{round(mean(unlist(with(imp_ppm, mean(recovtime))$analyses)) , 2)}$,
very similar to the values found previously.
Similarly the estimate of the standard deviation of the data is
$\Sexpr{round(mean(unlist(with(imp_ppm, sd(recovtime))$analyses)) , 2)}$ which
lies between the complete data estimate and the \emph{mean-imputed}
value. The two correlation estimates are also
very close to the previous values. The variation in the estimates of mean,
standard deviation, and correlations across the ten imputations is relatively
small apart from that for the correlation between log-dose and recovery
time -- here there is considerable variation in the values for the ten
imputations.
Finally, we will fit a linear model to each of the imputed samples and then
find the summary statistics for the ten sets of regression coefficients: the
results are given in Figure~\ref{MV-bp-lm-cc-mice}:
<>=
fit <- with(imp_ppm, lm(recovtime ~ bloodp + logdose))
@
\renewcommand{\nextcaption}{\R{} output of the multiple imputed linear model
for the \Robject{bp} data.
\label{MV-bp-lm-cc-mice}}
\SchunkLabel
<>=
summary(pool(fit))
@
\SchunkRaw
The result for blood pressure is similar to the previous complete data and
mean-imputed results with the regression coefficient for this variable being
highly significant
$(p = \Sexpr{round(summary(pool(fit))["bloodp", 5], 3)})$.
But the result for log dose differs from
those found previously; for the multiply imputed data the regression
coefficient for log dose is not significant at the $5\%$ level
$(p = \Sexpr{round(summary(pool(fit))["logdose", 5], 3)})$
whereas in both of the previous two analyses it was significant. This
finding reflects the greater variation of the value of the correlation
between log dose and recovery time in the ten imputations noted
above. (Remember that the standard errors in Figure~\ref{MV-bp-lm-cc-mice}
computed by \Rcmd{pool} arise from the formulae given in Section~\ref{MI:ana}.)
Now suppose we wish to test the hypothesis that the population from which
the sample data in Table~\ref{MV-bp-tab} arise has a mean recovery time of $27$ minutes.
We will test this hypothesis in the usual way using Student's $t$-test applied to the
complete data, the singly imputed data, and the multiply imputed data:
<>=
with(bp, t.test(recovtime, mu = 27))
with(imp, t.test(recovtime, mu = 27))$analyses[[1]]
@
For the multiply imputed data we need to use the \Rcmd{lm} function to get the
equivalent of the $t$-test by modeling recovery time minus $27$ with an
intercept only and testing for zero intercept. So the code needed is:
<>=
fit <- with(imp_ppm, lm(I(recovtime - 27) ~ 1))
summary(pool(fit))
@
Looking at the results of the three analyses we see that the complete-case
analysis fails to reject the hypothesis at the $5\%$ level whereas the other
two analyses lead to results that are statistically significant at this
level. This simple (and perhaps rather artificial) example demonstrates
that different conclusions can be reached by the different approaches.
\section{Summary of Findings}
The estimated standard deviation of the recovery time is lower when
computed from the mean-imputed data than from the complete data. The
corresponding value from the multiply imputed data lies between these two
values.
The estimate of the mean from the multiply imputed data is very similar
to the value obtained in the complete data analysis. (The value from the
singly imputed data is, of course, the same as from the complete data.)
The estimates of the correlations between blood pressure and recovery
time and log dose and recovery time are very similar in all three analyses
but the variation in the latter across the ten multiple imputations is
considerable and this results in the regression coefficient for log dose being less significant than in the other two analyses.
Testing the hypothesis that the population mean of recovery time is $27$
minutes using complete-case analysis leads to a different conclusion than is
arrived at by the two imputation approaches.
\section{Final Comments}
Missing values are an ever-present possibility in all types of studies
although everything possible should be done to avoid them. But when data
contain missing values multiple imputation can be used to provide valid
inferences for parameter estimates from the incomplete data. If carefully
handled, multiple imputation can cope with missing data in all types of
variables. In this chapter we have given only a brief account of dealing
with missing values; a detailed account is available in the issue of
\stress{Statistical Methods in Medical Research} entitled \stress{Multiple Imputation:
Current Perspectives} (Volume 16, Number 3, 2007) and in \cite{HSAUR:vanBuuren2012}.
\section*{Exercises}
\begin{description}
\exercise
The data in Table~\ref{MI-UStemp-tab} give the lowest temperatures (in Fahrenheit) recorded in
various months for cities in the US; missing values are indicated by NA.
Calculate the correlation matrix of the data using
\begin{enumerate}
\item the complete-case approach,
\item the available-data approach, and
\item a multiple-imputation approach.
\end{enumerate}
Find the principal components of the data using each of three
correlation matrices and plot the cities in the space of the first two
components of each solution.
<>=
### LaTeX table of the UStemp data, including row names
data("UStemp", package = "HSAUR3")
toLatex(HSAURtable(UStemp),
        rownames = TRUE,
        label = "MI-UStemp-tab",
        caption = "Lowest temperatures in Fahrenheit recorded in various months for cities in the US.")
@
\exercise
Find $95\%$ confidence intervals for the population means of the lowest
temperature in each month using
\begin{enumerate}
\item the complete-case approach,
\item the mean value imputation, and
\item a multiple-imputation approach.
\end{enumerate}
\exercise
Find the correlation matrix for the four months in Table~\ref{MI-UStemp-tab} using complete-case analysis,
listwise deletion, and multiple imputation.
\end{description}
%%\bibliographystyle{LaTeXBibTeX/refstyle}
%%\bibliography{LaTeXBibTeX/HSAUR}
\end{document}