\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}} \newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{\textit{#1}} %%' %%% Math symbols \usepackage{amstext} \usepackage{amsmath} \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\L}{L} \renewcommand{\P}{\mathsf{P}} \newcommand{\K}{\mathbf{K}} \newcommand{\m}{\mathbf{m}} \newcommand{\argmin}{\operatorname{argmin}\displaylimits} \newcommand{\argmax}{\operatorname{argmax}\displaylimits} \newcommand{\bx}{\mathbf{x}} \newcommand{\bbeta}{\mathbf{\beta}} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)}, pdfsubject = {Book}, pdfauthor = {Torsten Hothorn and Brian S. Everitt}, colorlinks = {black}, linkcolor = {black}, citecolor = {black}, urlcolor = {black}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% : conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% \usepackage{longtable} \usepackage[figuresright]{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} \hyphenation{mar-gi-nal} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} %\usepackage{setspace} \definecolor{sidebox_todo}{rgb}{1,1,0.2} \newcommand{\todo}[1]{ \hspace{0pt}% \marginpar{% \fcolorbox{black}{sidebox_todo}{% \parbox{\marginparwidth} { \raggedright\sffamily\footnotesize{TODO: #1}% } }% } } \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition} \author{Torsten Hothorn and Brian S. Everitt} \maketitle %%\VignetteIndexEntry{Multiple Linear Regression} %%\VignetteDepends{wordcloud} \setcounter{chapter}{5} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, show.signif.stars = FALSE, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)), bigleftpar = function() par(mai = par("mai") * c(1, 1.7, 1, 1)))) HSAURpkg <- require("HSAUR3") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3")) rm(HSAURpkg) ### hm, R-2.4.0 --vanilla seems to need this a <- Sys.setlocale("LC_ALL", "C") ### book <- TRUE refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA", "MDS", "CA"), 1:18) ch <- function(x) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~", ch[2], sep = "")) } } if (file.exists("deparse.R")) source("deparse.R") setHook(packageEvent("lattice", "attach"), function(...) { lattice.options(default.theme = function() standard.theme("pdf", color = FALSE)) }) @ \pagestyle{headings} <>= book <- FALSE @ <>= library("wordcloud") @ \chapter[Simple and Multiple Linear Regression]{Simple and Multiple Linear Regression: \\ How Old is the Universe and Cloud Seeding \label{MLR}} \section{Introduction} \index{Age of the Universe} \cite{HSAUR:Freedmanetal2001} give the relative velocity and the distance of $24$ galaxies, according to measurements made using the Hubble Space Telescope -- the data are contained in the \Rpackage{gamair} package accompanying \cite{HSAUR:Wood2006}, see Table~\ref{MLR-hubble-tab}. Velocities are assessed by measuring the Doppler red shift in the spectrum of light observed from the galaxies concerned, although some correction for `local' velocity components is required. Distances are measured using the known relationship between the period of Cepheid variable stars and their luminosity. How can these data be used to estimate the age of the universe? Here we shall show how this can be done using simple linear regression. <>= data("hubble", package = "gamair") names(hubble) <- c("galaxy", "velocity", "distance") toLatex(HSAURtable(hubble, package = "gamair"), pcol = 2, caption = paste("Distance and velocity for 24 galaxies."), label = "MLR-hubble-tab") @ \vspace*{-1cm} \textit{Source}: From Freedman W. L., et al., \textit{The Astrophysical Journal}, 553, 47--72, 2001. With permission. \vspace*{1cm} \index{Cloud seeding} {\tabcolsep3.5pt <>= data("clouds", package = "HSAUR3") names(clouds) <- c("seeding", "time", "sne", "cloudc", "prewet", "EM", "rain") toLatex(HSAURtable(clouds), pcol = 1, caption = paste("Cloud seeding experiments in Florida -- see text for", "explanations of the variables. Note that the \\Robject{clouds} data set has slightly different variable names."), label = "MLR-clouds-tab") @ } Weather modification, or cloud seeding, is the treatment of individual clouds or storm systems with various inorganic and organic materials in the hope of achieving an increase in rainfall. Introduction of such material into a cloud that contains supercooled water, that is, liquid water colder than zero degrees Celsius, has the aim of inducing freezing, with the consequent ice particles growing at the expense of liquid droplets and becoming heavy enough to fall as rain from clouds that otherwise would produce none. The data shown in Table~\ref{MLR-clouds-tab} were collected in the summer of 1975 from an experiment to investigate the use of massive amounts of silver iodide ($100$ to $1000$ grams per cloud) in cloud seeding to increase rainfall \citep{HSAUR:Woodleyetal1977}. In the experiment, which was conducted in an area of Florida, 24 days were judged suitable for seeding on the basis that a measured suitability criterion, denoted \stress{S-Ne}, was not less than $1.5$. Here \stress{S} is the `seedability', %' the difference between the maximum height of a cloud if seeded and the same cloud if not seeded predicted by a suitable cloud model, and \stress{Ne} is the number of hours between $1300$ and $1600$ G.M.T. with $10$ centimeter echoes in the target; this quantity biases the decision for experimentation against naturally rainy days. Consequently, optimal days for seeding are those on which seedability is large and the natural rainfall early in the day is small. On suitable days, a decision was taken at random as to whether to seed or not. For each day the following variables were measured: \begin{description} \item[\Robject{seeding}] a factor indicating whether seeding action occurred (yes or no), \item[\Robject{time}] number of days after the first day of the experiment, \item[\Robject{cloudc}] the percentage cloud cover in the experimental area, measured using radar, \item[\Robject{prewet}] the total rainfall in the target area one hour before seeding (in cubic meters $\times 10^{7}$), \item[\Robject{EM}] a factor showing whether the radar echo was moving or stationary, \item[\Robject{rain}] the amount of rain in cubic meters $\times 10^{7}$, \item[\Robject{sne}] suitability criterion, see above. \end{description} The objective in analyzing these data is to see how rainfall is related to the explanatory variables and, in particular, to determine the effectiveness of seeding. The method to be used is \stress{multiple linear regression}. \section{Simple Linear Regression} \section{Multiple Linear Regression \label{MLR-MLR}} \subsection{Regression Diagnostics} \section{Analysis Using \R{}} \subsection{Estimating the Age of the Universe} Prior to applying a simple regression to the data it will be useful to look at a plot to assess their major features. The \R{} code given in Figure~\ref{MLR-hubble-plot} produces a scatterplot of velocity and distance. \begin{figure} \begin{center} <>= plot(velocity ~ distance, data = hubble) @ \caption{Scatterplot of velocity and distance. \label{MLR-hubble-plot}} \end{center} \end{figure} The diagram shows a clear, strong relationship between velocity and distance. The next step is to fit a simple linear regression model to the data, but in this case the nature of the data requires a model without intercept because if distance is zero so is relative speed. So the model to be fitted to these data is \begin{eqnarray*} \text{velocity} = \beta_1 \text{distance} + \varepsilon. \end{eqnarray*} This is essentially what astronomers call Hubble's Law and $\beta_1$ is known as Hubble's constant; $\beta_1^{-1}$ gives an approximate age of the universe. To fit this model we are estimating $\beta_1$ using formula (\ref{MLR:beta1}). Although this operation is rather easy <>= sum(hubble$distance * hubble$velocity) / sum(hubble$distance^2) @ it is more convenient to apply \R's linear modeling function <>= hmod <- lm(velocity ~ distance - 1, data = hubble) @ Note that the model formula specifies a model without intercept. We can now extract the estimated model coefficients via <>= coef(hmod) @ and add this estimated regression line to the scatterplot; the result is shown in Figure~\ref{MLR-hubble-lmplot}. In addition, we produce a scatterplot of the residuals $y_i - \hat{y}_i$ against fitted values $\hat{y}_i$ to assess the quality of the model fit. It seems that for higher distance values the variance of velocity increases; however, we are interested in only the estimated parameter $\hat{\beta}_1$ which remains valid under variance heterogeneity (in contrast to $t$-tests and associated $p$-values). Now we can use the estimated value of $\beta_1$ to find an approximate value for the age of the universe. The Hubble constant itself has units of $\text{km} \times \text{sec}^{-1} \times \text{Mpc}^{-1}$. A mega-parsec (Mpc) is $3.09 \times 10^{19}$km, so we need to divide the estimated value of $\beta_1$ by this amount in order to obtain Hubble's constant with units of $\text{sec}^{-1}$. The approximate age of the universe in seconds will then be the inverse of this calculation. Carrying out the necessary computations <>= Mpc <- 3.09 * 10^19 ysec <- 60^2 * 24 * 365.25 Mpcyear <- Mpc / ysec 1 / (coef(hmod) / Mpcyear) @ gives an estimated age of roughly $12.8$ billion years. \begin{figure} \begin{center} <>= layout(matrix(1:2, ncol = 2)) plot(velocity ~ distance, data = hubble) abline(hmod) plot(hmod, which = 1) @ \caption{Scatterplot of velocity and distance with estimated regression line (left) and plot of residuals against fitted values (right). \label{MLR-hubble-lmplot}} \end{center} \end{figure} \subsection{Cloud Seeding} Again, a graphical display highlighting the most important aspects of the data will be helpful. Here we will construct boxplots of the rainfall in each category of the dichotomous explanatory variables and scatterplots of rainfall against each of the continuous explanatory variables. \begin{figure} \begin{center} <>= data("clouds", package = "HSAUR3") layout(matrix(1:2, nrow = 2)) bxpseeding <- boxplot(rain ~ seeding, data = clouds, ylab = "Rainfall", xlab = "Seeding") bxpecho <- boxplot(rain ~ EM, data = clouds, ylab = "Rainfall", xlab = "Echo Motion") @ <>= layout(matrix(1:2, nrow = 2)) bxpseeding <- boxplot(rain ~ seeding, data = clouds, ylab = "Rainfall", xlab = "Seeding") bxpecho <- boxplot(rain ~ EM, data = clouds, ylab = "Rainfall", xlab = "Echo Motion") @ \caption{Boxplots of \Robject{rain}. \label{MLR-rainfall-boxplot}} \end{center} \end{figure} \begin{figure} \begin{center} <>= layout(matrix(1:4, nrow = 2)) plot(rain ~ time, data = clouds) plot(rain ~ cloudc, data = clouds) plot(rain ~ sne, data = clouds, xlab="S-Ne criterion") plot(rain ~ prewet, data = clouds) @ \caption{Scatterplots of \Robject{rain} against the continuous covariates. \label{MLR-rainfall-scplot}} \end{center} \end{figure} Both the boxplots (Figure~\ref{MLR-rainfall-boxplot}) and the scatterplots (Figure~\ref{MLR-rainfall-scplot}) show some evidence of outliers. The row names of the extreme observations in the \Robject{clouds} \Rclass{data.frame} can be identified via <>= rownames(clouds)[clouds$rain %in% c(bxpseeding$out, bxpecho$out)] @ where \Robject{bxpseeding} and \Robject{bxpecho} are variables created by \Rcmd{boxplot} in Figure~\ref{MLR-rainfall-boxplot}. Now we shall not remove these observations but bear in mind during the modeling process that they may cause problems. In this example it is sensible to assume that the effect of some of the other explanatory variables is modified by seeding and therefore consider a model that includes seeding as covariate and, furthermore, allows interaction terms \index{Interaction} for \Robject{seeding} with each of the covariates except \Robject{time}. This model can be described by the \Rclass{formula} <>= clouds_formula <- rain ~ seeding + seeding:(sne + cloudc + prewet + EM) + time @ and the design matrix $\X^\star$ can be computed via <>= Xstar <- model.matrix(clouds_formula, data = clouds) @ By default, treatment contrasts have been applied to the dummy codings of the factors \Robject{seeding} and \Robject{EM} as can be seen from the inspection of the \Robject{contrasts} attribute of the model matrix <>= attr(Xstar, "contrasts") @ The default contrasts can be changed via the \Rarg{contrasts.arg} argument to \Rcmd{model.matrix} or the \Robject{contrasts} argument to the fitting function, for example \Rcmd{lm} or \Rcmd{aov} as shown in \Sexpr{ch("ANOVA")}. However, such internals are hidden and performed by high-level model-fitting functions such as \Rcmd{lm} which will be used to fit the linear model defined by the \Rclass{formula} \Robject{clouds\_formula}: <>= clouds_lm <- lm(clouds_formula, data = clouds) class(clouds_lm) @ The result of the model fitting is an object of class \Rclass{lm} for which a \Rcmd{summary} method showing the conventional regression analysis output is available. The output in Figure~\ref{MLR-clouds-summary} shows the estimates $\hat{\beta}^\star$ with corresponding standard errors and $t$-statistics as well as the $F$-statistic with associated $p$-value. \renewcommand{\nextcaption}{\R{} output of the linear model fit for the \Robject{clouds} data. \label{MLR-clouds-summary}} \SchunkLabel <>= summary(clouds_lm) @ \SchunkRaw Many methods are available for extracting components of the fitted model. The estimates $\hat{\beta}^\star$ can be assessed via \newpage <>= betastar <- coef(clouds_lm) betastar @ and the corresponding covariance matrix $\Cov(\hat{\beta}^\star)$ is available from the \Rcmd{vcov} method <>= Vbetastar <- vcov(clouds_lm) @ where the square roots of the diagonal elements are the standard errors as shown in Figure~\ref{MLR-clouds-summary} <>= sqrt(diag(Vbetastar)) @ \begin{figure} \begin{center} <>= psymb <- as.numeric(clouds$seeding) plot(rain ~ sne, data = clouds, pch = psymb, xlab = "S-Ne criterion") abline(lm(rain ~ sne, data = clouds, subset = seeding == "no")) abline(lm(rain ~ sne, data = clouds, subset = seeding == "yes"), lty = 2) legend("topright", legend = c("No seeding", "Seeding"), pch = 1:2, lty = 1:2, bty = "n") @ \caption{Regression relationship between S-Ne criterion and rainfall with and without seeding. \label{MLR-clouds-lmplot}} \end{center} \end{figure} In order to investigate the quality of the model fit, we need access to the residuals and the fitted values. The residuals can be found by the \Rcmd{residuals} method and the fitted values of the response from the \Rcmd{fitted} (or \Rcmd{predict}) method <>= clouds_resid <- residuals(clouds_lm) clouds_fitted <- fitted(clouds_lm) @ Now the residuals and the fitted values can be used to construct diagnostic plots; for example the residual plot in Figure~\ref{MLR-resid} where each observation is labelled by its number (using \Rcmd{textplot} from package \Rpackage{wordclouds}). Observations $1$ and $15$ give rather large residual values and the data should perhaps be reanalysed after these two observations are removed. The normal probability plot of the residuals shown in Figure~\ref{MLR-qqplot} shows a reasonable agreement between theoretical and sample quantiles, however, observations $1$ and $15$ are extreme again. \begin{figure} \begin{center} <>= plot(clouds_fitted, clouds_resid, xlab = "Fitted values", ylab = "Residuals", type = "n", ylim = max(abs(clouds_resid)) * c(-1, 1)) abline(h = 0, lty = 2) textplot(clouds_fitted, clouds_resid, words = rownames(clouds), new = FALSE) @ \caption{Plot of residuals against fitted values for \Robject{clouds} seeding data. \label{MLR-resid}} \end{center} \end{figure} \begin{figure} \begin{center} <>= qqnorm(clouds_resid, ylab = "Residuals") qqline(clouds_resid) @ \caption{Normal probability plot of residuals from cloud seeding model \Robject{clouds\_lm}. \label{MLR-qqplot}} \end{center} \end{figure} An index plot of the Cook's distances for each observation %' (and many other plots including those constructed above from using the basic functions) can be found from applying the \Rcmd{plot} method to the object that results from the application of the \Rcmd{lm} function. \begin{figure} \begin{center} <>= plot(clouds_lm) @ <>= plot(clouds_lm, which = 4, sub.caption = NULL) @ \caption{Index plot of Cook's distances for cloud seeding data. %' \label{MLR-cook}} \end{center} \end{figure} Figure~\ref{MLR-cook} suggests that observations 2 and 18 have undue influence on the estimated regression coefficients, but the two outliers identified previously do not. Again it may be useful to look at the results after these two observations have been removed (see Exercise 6.2). %% \ref{MLR-ex2}) \index{Regression diagnostics|)} %%\bibliographystyle{LaTeXBibTeX/refstyle} %%\bibliography{LaTeXBibTeX/HSAUR} \end{document}