\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}} \newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{\textit{#1}} %%' %%% Math symbols \usepackage{amstext} \usepackage{amsmath} \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\L}{L} \renewcommand{\P}{\mathsf{P}} \newcommand{\K}{\mathbf{K}} \newcommand{\m}{\mathbf{m}} \newcommand{\argmin}{\operatorname{argmin}\displaylimits} \newcommand{\argmax}{\operatorname{argmax}\displaylimits} \newcommand{\bx}{\mathbf{x}} \newcommand{\bbeta}{\mathbf{\beta}} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)}, pdfsubject = {Book}, pdfauthor = {Torsten Hothorn and Brian S. Everitt}, colorlinks = {black}, linkcolor = {black}, citecolor = {black}, urlcolor = {black}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% : conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% \usepackage{longtable} \usepackage[figuresright]{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} \hyphenation{mar-gi-nal} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} %\usepackage{setspace} \definecolor{sidebox_todo}{rgb}{1,1,0.2} \newcommand{\todo}[1]{ \hspace{0pt}% \marginpar{% \fcolorbox{black}{sidebox_todo}{% \parbox{\marginparwidth} { \raggedright\sffamily\footnotesize{TODO: #1}% } }% } } \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition} \author{Torsten Hothorn and Brian S. Everitt} \maketitle %%\VignetteIndexEntry{Chapter Simple Inference} %%\VignetteDepends{vcd} \setcounter{chapter}{2} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, show.signif.stars = FALSE, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)), bigleftpar = function() par(mai = par("mai") * c(1, 1.7, 1, 1)))) HSAURpkg <- require("HSAUR3") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3")) rm(HSAURpkg) ### hm, R-2.4.0 --vanilla seems to need this a <- Sys.setlocale("LC_ALL", "C") ### book <- TRUE refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA", "MDS", "CA"), 1:18) ch <- function(x) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~", ch[2], sep = "")) } } if (file.exists("deparse.R")) source("deparse.R") setHook(packageEvent("lattice", "attach"), function(...) { lattice.options(default.theme = function() standard.theme("pdf", color = FALSE)) }) @ \pagestyle{headings} <>= book <- FALSE @ \chapter[Simple Inference]{Simple Inference: Guessing Lengths, Wave Energy, Water Hardness, Piston Rings, and Rearrests of Juveniles \label{SI}} \section{Introduction} <>= library("vcd") if (!interactive()) { print.htest <- function (x, digits = 4, quote = TRUE, prefix = "", ...) { cat("\n") cat(strwrap(x$method, prefix = "\t"), sep = "\n") cat("\n") cat("data: ", x$data.name, "\n") out <- character() if (!is.null(x$statistic)) out <- c(out, paste(names(x$statistic), "=", format(round(x$statistic, 4)))) if (!is.null(x$parameter)) out <- c(out, paste(names(x$parameter), "=", format(round(x$parameter, 3)))) if (!is.null(x$p.value)) { fp <- format.pval(x$p.value, digits = digits) out <- c(out, paste("p-value", if (substr(fp, 1, 1) == "<") fp else paste("=", fp))) } cat(strwrap(paste(out, collapse = ", ")), sep = "\n") if (!is.null(x$conf.int)) { cat(format(100 * attr(x$conf.int, "conf.level")), "percent confidence interval:\n", format(c(x$conf.int[1], x$conf.int[2])), "\n") } if (!is.null(x$estimate)) { cat("sample estimates:\n") print(x$estimate, ...) } cat("\n") invisible(x) } } @ \section{Statistical Tests} \section{Analysis Using \R{}} \subsection{Estimating the Width of a Room} The data shown in Table~\ref{SI-rw-tab} are available as \Robject{roomwidth} \Rclass{data.frame} from the \Rpackage{HSAUR3} package and can be attached by using <>= data("roomwidth", package = "HSAUR3") @ If we convert the estimates of the room width in meters into feet by multiplying each by $3.28$ then we would like to test the hypothesis that the mean of the population of `metre' estimates is equal to the mean %' of the population of `feet' estimates. We shall do this first %' by using an independent samples $t$-test, but first it is good practice to check, informally at least, the normality and equal variance assumptions. Here we can use a combination of numerical and graphical approaches. The first step should be to convert the meter estimates into feet by a factor <>= convert <- ifelse(roomwidth$unit == "feet", 1, 3.28) @ which equals one for all feet measurements and $3.28$ for the measurements in meters. Now, we get the usual summary statistics and standard deviations of each set of estimates using <>= tapply(roomwidth$width * convert, roomwidth$unit, summary) tapply(roomwidth$width * convert, roomwidth$unit, sd) @ where \Rcmd{tapply} applies \Rcmd{summary}, or \Rcmd{sd}, to the converted widths for both groups of measurements given by \Robject{roomwidth\$unit}. A boxplot of each set of estimates might be useful and is depicted in Figure~\ref{SI-rw-bxp}. The \Rcmd{layout} function (line 1 in Figure~\ref{SI-rw-bxp}) divides the plotting area into three parts. The \Rcmd{boxplot} function produces a boxplot in the upper part and the two \Rcmd{qqnorm} statements in lines 7 and 10 set up the normal probability plots that can be used to assess the normality assumption of the $t$-test. \index{Normal probability plot} \numberSinput \begin{figure} \begin{center} <>= layout(matrix(c(1,2,1,3), nrow = 2, ncol = 2, byrow = FALSE)) boxplot(I(width * convert) ~ unit, data = roomwidth, ylab = "Estimated width (feet)", varwidth = TRUE, names = c("Estimates in feet", "Estimates in meters (converted to feet)")) feet <- roomwidth$unit == "feet" qqnorm(roomwidth$width[feet], ylab = "Estimated width (feet)") qqline(roomwidth$width[feet]) qqnorm(roomwidth$width[!feet], ylab = "Estimated width (meters)") qqline(roomwidth$width[!feet]) @ \caption{Boxplots of estimates of room width in feet and meters (after conversion to feet) and normal probability plots of estimates of room width made in feet and in meters. \label{SI-rw-bxp}} \end{center} \end{figure} \rawSinput The boxplots indicate that both sets of estimates contain a number of outliers and also that the estimates made in meters are skewed and more variable than those made in feet, a point underlined by the numerical summary statistics above. Both normal probability plots depart from linearity, suggesting that the distributions of both sets of estimates are not normal. The presence of outliers, the apparently different variances and the evidence of non-normality all suggest caution in applying the $t$-test, but for the moment we shall apply the usual version of the test using the \Rcmd{t.test} function in \R{}. The two-sample test problem is specified by a \Rclass{formula}, here by <>= I(width * convert) ~ unit @ where the response, \Robject{width}, on the left-hand side needs to be converted first and, because the star has a special meaning in formulae as will be explained in \Sexpr{ch("ANOVA")}, the conversion needs to be embedded by \texttt{I}. The factor \Robject{unit} on the right-hand side specifies the two groups to be compared. <>= tt <- t.test(I(width * convert) ~ unit, data = roomwidth, var.equal = TRUE) @ \renewcommand{\nextcaption}{\R{} output of the independent samples $t$-test for the \Robject{roomwidth} data. \label{SI-roomwidth-tt-fig}} \SchunkLabel <>= t.test(I(width * convert) ~ unit, data = roomwidth, var.equal = TRUE) @ \SchunkRaw \renewcommand{\nextcaption}{\R{} output of the independent samples Welch test for the \Robject{roomwidth} data. \label{SI-roomwidth-welch-fig}} \SchunkLabel <>= t.test(I(width * convert) ~ unit, data = roomwidth, var.equal = FALSE) @ \SchunkRaw \renewcommand{\nextcaption}{\R{} output of the Wilcoxon rank sum test for the \Robject{roomwidth} data. \label{SI-roomwidth-wilcox-fig}} \SchunkLabel <>= wilcox.test(I(width * convert) ~ unit, data = roomwidth, conf.int = TRUE) @ \SchunkRaw <>= pwt <- round(wilcox.test(I(width * convert) ~ unit, data = roomwidth)$p.value, 3) @ \subsection{Wave Energy Device Mooring} The data from Table~\ref{SI-m-tab} are available as \Rclass{data.frame} \Robject{waves} <>= data("waves", package = "HSAUR3") @ and requires the use of a matched pairs $t$-test to answer the question of interest. This test assumes that the differences between the matched observations have a normal distribution so we can begin by checking this assumption by constructing a boxplot and a normal probability plot -- see Figure~\ref{SI-w-bxp}. \begin{figure} \begin{center} <>= mooringdiff <- waves$method1 - waves$method2 layout(matrix(1:2, ncol = 2)) boxplot(mooringdiff, ylab = "Differences (Newton meters)", main = "Boxplot") abline(h = 0, lty = 2) qqnorm(mooringdiff, ylab = "Differences (Newton meters)") qqline(mooringdiff) @ \caption{Boxplot and normal probability plot for differences between the two mooring methods. \label{SI-w-bxp}} \end{center} \end{figure} \renewcommand{\nextcaption}{\R{} output of the paired $t$-test for the \Robject{waves} data. \label{SI-waves-tt-fig}} \SchunkLabel <>= t.test(mooringdiff) @ \SchunkRaw <>= pwt <- round(wilcox.test(mooringdiff)$p.value, 3) @ \renewcommand{\nextcaption}{\R{} output of the Wilcoxon signed rank test for the \Robject{waves} data. \label{SI-waves-ws-fig}} \SchunkLabel <>= wilcox.test(mooringdiff) @ \SchunkRaw \subsection{Mortality and Water Hardness} There is a wide range of analyses we could apply to the data in Table~\ref{SI-w-tab} available from <>= data("water", package = "HSAUR3") @ But to begin we will construct a scatterplot of the data enhanced somewhat by the addition of information about the marginal distributions of water hardness (calcium concentration) and mortality, and by adding the estimated linear regression fit (see \Sexpr{ch("MLR")}) for mortality on hardness. The plot and the required \R{} code are given along with Figure~\ref{SI-water-sp}. In line 1 of Figure~\ref{SI-water-sp}, we divide the plotting region into four areas of different size. The scatterplot (line 3) uses a plotting symbol depending on the location of the city (by the \Rarg{pch} argument); a legend for the location is added in line 6. We add a least squares fit (see \Sexpr{ch("MLR")}) to the scatterplot and, finally, depict the marginal distributions by means of a boxplot and a histogram. The scatterplot shows that as hardness increases mortality decreases, and the histogram for the water hardness shows it has a rather skewed distribution. \numberSinput \begin{figure} \begin{center} <>= nf <- layout(matrix(c(2, 0, 1, 3), 2, 2, byrow = TRUE), c(2, 1), c(1, 2), TRUE) psymb <- as.numeric(water$location) plot(mortality ~ hardness, data = water, pch = psymb) abline(lm(mortality ~ hardness, data = water)) legend("topright", legend = levels(water$location), pch = c(1,2), bty = "n") hist(water$hardness) boxplot(water$mortality) @ \caption{Enhanced scatterplot of water hardness and mortality, showing both the joint and the marginal distributions and, in addition, the location of the city by different plotting symbols. \label{SI-water-sp}} \end{center} \end{figure} \rawSinput \renewcommand{\nextcaption}{\R{} output of Pearsons' correlation coefficient %' for the \Robject{water} data. \label{SI-water-c-fig}} \SchunkLabel <>= cor.test(~ mortality + hardness, data = water) @ \SchunkRaw <>= cr <- round(cor.test(~ mortality + hardness, data = water)$estimate, 3) @ \subsection{Piston-ring Failures} <>= chisqt <- chisq.test(pistonrings) @ \renewcommand{\nextcaption}{\R{} output of the chi-squared test for the \Robject{pistonrings} data. \label{SI-pr-x2-fig}} \SchunkLabel <>= data("pistonrings", package = "HSAUR3") chisq.test(pistonrings) @ \SchunkRaw Rather than looking at the simple differences of observed and expected values for each cell which would be unsatisfactory since a difference of fixed size is clearly more important for smaller samples, it is preferable to consider a \stress{standardized residual} \index{Standardized residual, for chi-squared tests} given by dividing the observed minus the expected difference by the square root of the appropriate expected value. The $X^2$ statistic for assessing independence is simply the sum, over all the cells in the table, of the squares of these terms. We can find these values extracting the \Robject{residuals} element of the object returned by the \Rcmd{chisq.test} function <>= chisq.test(pistonrings)$residuals @ A graphical representation of these residuals is called an \stress{association plot} \index{Association plot} and is available via the \Rcmd{assoc} function from package \Rpackage{vcd} \citep{PKG:vcd} applied to the contingency table of the two categorical variables. Figure~\ref{SI-assoc-plot} depicts the residuals for the piston ring data. The deviations from independence are largest for C1 and C4 compressors in the center and south leg. \begin{figure} \begin{center} <>= library("vcd") assoc(pistonrings) @ \caption{Association plot of the residuals for the \Robject{pistonrings} data. \label{SI-assoc-plot}} \end{center} \end{figure} \subsection{Rearrests of Juveniles} The data in Table~\ref{SI-r-tab} are available as \Rclass{table} object via <>= data("rearrests", package = "HSAUR3") rearrests @ <>= mcs <- round(mcnemar.test(rearrests, correct = FALSE)$statistic, 2) @ and in \Robject{rearrests} the counts in the four cells refer to the matched pairs of subjects; for example, in $\Sexpr{rearrests[1,1]}$ pairs both members of the pair were rearrested. Here we need to use McNemar's %' test to assess whether rearrest is associated with the type of court where the juvenile was tried. We can use the \R{} function \Rcmd{mcnemar.test}. The test statistic shown in Figure~\ref{SI-ra-mc-fig} is $\Sexpr{mcs}$ with a single degree of freedom -- the associated $p$-value is extremely small and there is strong evidence that type of court and the probability of rearrest are related. It appears that trial at a juvenile court is less likely to result in rearrest (see Exercise~3.4). % An exact version of McNemar's test %%' can be obtained by testing whether $b$ and $c$ are equal using a binomial test (see Figure~\ref{SI-ra-mcbin-fig}). \renewcommand{\nextcaption}{\R{} output of McNemar's test %' for the \Robject{rearrests} data. \label{SI-ra-mc-fig}} \SchunkLabel <>= mcnemar.test(rearrests, correct = FALSE) @ \SchunkRaw \renewcommand{\nextcaption}{\R{} output of an exact version of McNemar's test %' for the \Robject{rearrests} data computed via a binomial test. \label{SI-ra-mcbin-fig}} \SchunkLabel <>= binom.test(rearrests[2], n = sum(rearrests[c(2,3)])) @ \SchunkRaw \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}