%%% Preamble: document class, Sweave verbatim environments, R-related text
%%% macros, math shortcuts, hyperref setup and assorted helper commands.
\documentclass{chapman}

%%% copy Sweave.sty definitions

%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}

\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}

%% Default look of R input, R output and plain code chunks.
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                              fontshape=it,
                                              fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}

%%% environment for raw output
%% \SchunkRaw resets Schunk to a no-op wrapper, switches Soutput to \small
%% and restores the plain Sinput via \rawSinput (defined further below).
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
    \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                                  fontshape=it,
                                                  fontsize=\small}
    \rawSinput
}

%%% environment for labeled output
%% \SchunkLabel turns every Schunk into a figure float whose caption is
%% whatever \nextcaption currently expands to.
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
  \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
  \end{figure} }
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
  \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
                                                samepage = true,
                                                fontfamily=courier,
                                                fontshape=it,
                                                fontsize=\relsize{-1}}
}

%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}

\newcommand{\numberSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}

%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
%% \Rpackage, \Rclass and \Rcmd typeset their argument AND write an index
%% entry; \rpackage, \Robject, \Roperator, \Rarg and \Rlevel only typeset.
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package}
                          {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}

%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'

%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
%% \mathbf does not embolden lowercase Greek letters; \boldsymbol (made
%% available by amsmath) does.
\newcommand{\bbeta}{\boldsymbol{\beta}}

%%% links
\usepackage{hyperref}

\hypersetup{%
  pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
  pdfsubject = {Book},
  pdfauthor = {Torsten Hothorn and Brian S. Everitt},
  %% colorlinks is a boolean switch; the colours themselves are chosen by
  %% linkcolor / citecolor / urlcolor below (all black, i.e. links are not
  %% visually marked).
  colorlinks = {true},
  linkcolor = {black},
  citecolor = {black},
  urlcolor = {black},
  hyperindex = {true},
  linktocpage = {true},
}

%%% captions & tables
%% conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}

%%% R symbol in chapter 1
\usepackage{wrapfig}

%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse

%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}

%%% Example sections
%% \exercise is meant to be used inside a list environment: it advances the
%% per-chapter exercise counter and starts an item "Ex. <chapter>.<number>".
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}

%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}

%%% for manual corrections
%\renewcommand{\baselinestretch}{2}

%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}

%%% color
\usepackage{color}

%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}

%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}

%\usepackage{setspace}
%% \todo{text}: yellow framed "TODO: text" note in the margin.
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
        \hspace{0pt}%
        \marginpar{%
                \fcolorbox{black}{sidebox_todo}{%
                        \parbox{\marginparwidth} {
                                \raggedright\sffamily\footnotesize{TODO: #1}%
                        }
                }%
        }
}

\begin{document}

%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
%% NOTE: the argument of \author is completed on the next source line.
\author{Torsten Hothorn and Brian S.
Everitt}
\maketitle

%%\VignetteIndexEntry{Chapter Data Analysis using Graphical Displays}
%%\VignetteDepends{lattice, maps, sf, sp}
\setcounter{chapter}{1}

\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}

%% NOTE(review): the chunk headers in this file read "<>=", which Sweave does
%% not recognise as the start of a code chunk (it expects doubled angle
%% brackets around an optional label/option list). The label and options seem
%% to have been lost; restore them from the original source -- TODO confirm.
<>=
### Session setup: clean workspace, default search path, output directories,
### fixed seed, print options, data package, locale, cross-reference helper.
rm(list = ls())
s <- search()[-1]
### setdiff() rather than s[-match(...)]: match() returns NA for every entry
### that is not on the search path, and NA is not allowed in a negative
### subscript, so the old form broke as soon as one of these was not attached.
s <- setdiff(s, c("package:base", "package:stats", "package:graphics",
                  "package:grDevices", "package:utils", "package:datasets",
                  "package:methods", "Autoloads"))
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
### make sure the output directories exist (figures/ is the prefix.string
### target set via \SweaveOpts)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
    width = 63, # digits = 4,
    show.signif.stars = FALSE,
    SweaveHooks = list(
        ### both hooks widen the left figure margin (second entry of "mai")
        leftpar = function()
            par(mai = par("mai") * c(1, 1.05, 1, 1)),
        bigleftpar = function()
            par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
### chapter keys (column 1) and their chapter numbers (column 2)
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
                "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
                "MDS", "CA"), 1:18)
### ch(key): text used to refer to another chapter. With book == TRUE a
### LaTeX \ref to the chapter label is emitted, otherwise the hard-coded
### chapter number from `refs'.
ch <- function(x) {
    entry <- refs[which(refs[,1] == x),]
    if (book) {
        return(paste("Chapter~\\\\ref{", entry[1], "}", sep = ""))
    } else {
        return(paste("Chapter~", entry[2], sep = ""))
    }
}
if (file.exists("deparse.R"))
    source("deparse.R")

### switch lattice to a black-and-white default theme whenever it is attached
setHook(packageEvent("lattice", "attach"), function(...) {
    lattice.options(default.theme =
        function()
            standard.theme("pdf", color = FALSE))
    })
@

\pagestyle{headings}

<>=
book <- FALSE
@

%% lower png resolution for vignettes
\SweaveOpts{resolution = 100}

\chapter[Data Analysis Using Graphical Displays]{Data Analysis Using Graphical Displays: Malignant Melanoma in the US and Chinese Health and \\ Family Life \label{DAGD}}

\section{Introduction}

\section{Initial Data Analysis}

\section{Analysis Using \R{}}

\subsection{Malignant Melanoma}

\index{Boxplot|(}
\index{Histogram|(}
\index{Scatterplot|(}
We might begin to examine the malignant melanoma data in
Table~\ref{DAGD-USmelanoma-tab} by constructing a histogram or boxplot
for \stress{all} the mortality rates in Figure~\ref{DAGD-USmelanoma-histbox}.
The \Rcmd{plot}, \Rcmd{hist} and \Rcmd{boxplot} functions have already been
introduced in \Sexpr{ch("AItR")} and we want to produce a plot where both
techniques are applied at once.
The \Rcmd{layout} function organizes two independent plots on one plotting
device, for example on top of each other.
Using this relatively simple technique (more advanced methods will be
introduced later) we have to make sure that the $x$-axis is the same in both
graphs.
This can be done by computing a plausible range of the data, later to be
specified in a plot via the \Rcmd{xlim} argument:
<>=
xr <- range(USmelanoma$mortality) * c(0.9, 1.1)
xr
@
Now, plotting both the histogram and the boxplot requires setting up the
plotting device with equal space for two independent plots on top of each
other.
Calling the \Rcmd{layout} function on a matrix with two cells in two rows,
containing the numbers one and two, leads to such a partitioning.
The \Rcmd{boxplot} function is called first on the mortality data and then
the \Rcmd{hist} function, where the range of the $x$-axis in both plots is
defined by $(\Sexpr{xr[1]}, \Sexpr{xr[2]})$.
One tiny problem to solve is the size of the margins; their defaults are too
large for such a plot.
As with many other graphical parameters, one can adjust their value for a specific plot using function \Rcmd{par}. The \R{} code and the resulting display are given in Figure~\ref{DAGD-USmelanoma-histbox}. \begin{figure} \begin{center} <>= layout(matrix(1:2, nrow = 2)) par(mar = par("mar") * c(0.8, 1, 1, 1)) boxplot(USmelanoma$mortality, ylim = xr, horizontal = TRUE, xlab = "Mortality") hist(USmelanoma$mortality, xlim = xr, xlab = "", main = "", axes = FALSE, ylab = "") axis(1) @ \caption{Boxplot (top) and histogram (bottom) of malignant melanoma mortality rates. \label{DAGD-USmelanoma-histbox}} \end{center} \end{figure} Both the histogram and the boxplot in Figure~\ref{DAGD-USmelanoma-histbox} indicate a certain skewness of the mortality distribution. Looking at the characteristics of all the mortality rates is a useful beginning but for these data we might be more interested in comparing mortality rates for ocean and non-ocean states. So we might construct two histograms or two boxplots. Such \stress{parallel boxplots}, visualizing the conditional distribution of a numeric variable in groups as given by a categorical variable, are easily computed using the \Rcmd{boxplot} function. The continuous response variable and the categorical independent variable are specified via a \Rclass{formula} as described in \Sexpr{ch("AItR")}. Figure~\ref{DAGD-USmelanoma-boxocean} shows such parallel boxplots, as produced by default by the \Rcmd{plot} function for such data, for the mortality in ocean and non-ocean states and leads to the impression that the mortality is increased in east or west coast states compared to the rest of the country. \begin{figure} \begin{center} <>= plot(mortality ~ ocean, data = USmelanoma, xlab = "Contiguity to an ocean", ylab = "Mortality") @ \caption{Parallel boxplots of malignant melanoma mortality rates by contiguity to an ocean. 
\label{DAGD-USmelanoma-boxocean}} \end{center} \end{figure} Histograms are generally used for two purposes: counting and displaying the distribution of a variable; according to \cite{HSAUR:Wilkinson1992}, `they are effective for neither'. Histograms can often be misleading for displaying distributions because of their dependence on the number of classes chosen. An alternative is to formally estimate the density function of a variable and then plot the resulting estimate; details of density estimation are given in \Sexpr{ch("DE")} but for the ocean and non-ocean states the two density estimates can be produced and plotted as shown in Figure~\ref{DAGD-USmelanoma-dens} which supports the impression from Figure~\ref{DAGD-USmelanoma-boxocean}. For more details on such density estimates we refer to \Sexpr{ch("DE")}. \begin{figure} \begin{center} <>= dyes <- with(USmelanoma, density(mortality[ocean == "yes"])) dno <- with(USmelanoma, density(mortality[ocean == "no"])) plot(dyes, lty = 1, xlim = xr, main = "", ylim = c(0, 0.018), xlab = "Mortality") lines(dno, lty = 2) legend("topleft", lty = 1:2, legend = c("Coastal State", "Land State"), bty = "n") @ \caption{Estimated densities of malignant melanoma mortality rates by contiguity to an ocean. \label{DAGD-USmelanoma-dens}} \end{center} \end{figure} Now we might move on to look at how mortality rates are related to the geographic location of a state as represented by the latitude and longitude of the center of the state. Here the main graphic will be the scatterplot. The simple $xy$ scatterplot has been in use since at least the eighteenth century and has many virtues -- indeed according to \cite{HSAUR:Tufte1983}: \begin{quote} The relational graphic -- in its barest form the scatterplot and its variants -- is the greatest of all graphical designs. It links at least two variables, encouraging and even imploring the viewer to assess the possible causal relationship between the plotted variables. 
It confronts causal theories that $x$ causes $y$ with empirical evidence as to the actual relationship between $x$ and $y$. \end{quote} Let's begin with simple scatterplots of mortality rate against longitude %%' and mortality rate against latitude which can be produced by the code preceding Figure~\ref{DAGD-USmelanoma-xy}. Again, the \Rcmd{layout} function is used for partitioning the plotting device, now resulting in two side-by-side plots. The argument to \Rcmd{layout} is now a matrix with only one row but two columns containing the numbers one and two. In each cell, the \Rcmd{plot} function is called for producing a scatterplot of the variables given in the \Rclass{formula}. \begin{figure} \begin{center} <>= layout(matrix(1:2, ncol = 2)) plot(mortality ~ longitude, data = USmelanoma, ylab = "Mortality", xlab = "Longitude") plot(mortality ~ latitude, data = USmelanoma, ylab = "Mortality", xlab = "Latitude") @ \caption{Scatterplot of malignant melanoma mortality rates by geographical location. \label{DAGD-USmelanoma-xy}} \end{center} \end{figure} Since mortality rate is clearly related only to latitude we can now produce scatterplots of mortality rate against latitude separately for ocean and non-ocean states. Instead of producing two displays, one can choose different plotting symbols for the two groups of states. This can be achieved by specifying a vector of integers or characters to the \Rcmd{pch} argument, where the $i$th element of this vector defines the plot symbol of the $i$th observation in the data to be plotted. For the sake of simplicity, we convert the \Robject{ocean} factor to an \Rclass{integer} vector containing the numbers one for land states and two for ocean states. As a consequence, land states can be identified by the dot symbol and ocean states by triangles. It is useful to add a legend to such a plot, most conveniently by using the \Rcmd{legend} function. 
This function takes three arguments: a string indicating the position of the legend in the plot, a character vector of labels to be printed and the corresponding plotting symbols (referred to by integers). In addition, the display of a bounding box is suppressed (\Rcmd{bty = "n"}). \begin{figure} \begin{center} <>= plot(mortality ~ latitude, data = USmelanoma, pch = (1:2)[ocean], ylab = "Mortality", xlab = "Latitude") legend("topright", legend = c("Land state", "Coast state"), pch = 1:2, bty = "n") @ \caption{Scatterplot of malignant melanoma mortality rates against latitude. \label{DAGD-USmelanoma-lat}} \end{center} \end{figure} The scatterplot in Figure~\ref{DAGD-USmelanoma-lat} highlights that the mortality is lowest in the northern land states. Coastal states show a higher mortality than land states at roughly the same latitude. The highest mortalities can be observed for the south coastal states with latitude less than $32^\circ$, say, that is <>= subset(USmelanoma, latitude < 32) @ Alternatively, we also may simply want to look at a color-coded map of the United States, where each state is plotted in a color that corresponds to its mortality rate. It is fairly simple to set up such a plot using the \Rpackage{sp} family of packages \citep{PKG:sp}. We start with loading a map of the mainland states, basically a number of polygons: <>= library("sp") library("sf") library("maps") states <- map("state", plot = FALSE, fill = TRUE) @ It is of course important to match the mortality rates to the corresponding state. We therefore create unique names of the states in lower-case letters for both the polygons and the mortality data <>= IDs <- sapply(strsplit(states$names, ":"), function(x) x[1]) rownames(USmelanoma) <- tolower(rownames(USmelanoma)) @ Now we are ready to merge these two objects into a so-called \Rclass{SpatialPolygonsDataFrame} object. 
We first create a \Rclass{SpatialPolygons} object from the map in the correct reference system (WGS84, in our case) and then merge the polygons with the data <>= us1 <- merge(st_as_sf(states), USmelanoma) us2 <- as(us1, "Spatial") @ The resulting object \Robject{us2} can now be plotted using the \Rcmd{spplot} function, see Figure~\ref{DAGD-USmelanoma-long-lat}. The colors correspond to the mortality rate, as shown in the color legend to the right of the map. We see that darker grey values corresponding to higher mortality rates appear in the southern coastal states, both on the east and the west coast in good agreement with our earlier results. \begin{figure} \begin{center} <>= spplot(us2, "mortality", col.regions = rev(grey.colors(100))) @ \caption{Map of the United States of America showing malignant melanoma mortality rates. \label{DAGD-USmelanoma-long-lat}} \end{center} \end{figure} Up to now we have primarily focused on the visualization of continuous variables. We now extend our focus to the visualization of categorical variables. \index{Boxplot|)} \index{Histogram|)} \index{Scatterplot|)} \subsection{Chinese Health and Family Life} \index{Barchart|(} \index{Spineplot|(} \index{Spinogram|(} One part of the questionnaire the Chinese Health and Family Life Survey focuses on is the self-reported health status. Two questions are interesting for us. The first one is `Generally speaking, do you consider the condition of your health to be excellent, good, fair, not good, or poor?'. The second question is `Generally speaking, in the past twelve months, how happy were you?'. The distribution of such variables is commonly visualized using barcharts where for each category the total or relative number of observations is displayed. Such a barchart can conveniently be produced by applying the \Rcmd{barplot} function to a tabulation of the data. 
The empirical density of the variable \Robject{R\_happy} is computed by the \Rcmd{xtabs} function for producing (contingency) tables; the resulting barchart is given in Figure~\ref{DAGD-CHFLS-happy}. \begin{figure} <>= barplot(xtabs(~ R_happy, data = CHFLS)) @ \caption{Bar chart of happiness. \label{DAGD-CHFLS-happy}} \end{figure} The visualization of two categorical variables could be done by conditional barcharts, i.e., barcharts of the first variable within the categories of the second variable. Attractive alternatives for displaying such two-way tables are \stress{spineplots} \citep{HSAUR:Friendly1994,HSAUR:HofmannTheus2005,HSAUR:Chenetal2008}; the meaning of the name will become clear when looking at such a plot in Figure~\ref{DAGD-CHFLS-health_happy}. Before constructing such a plot, we produce a two-way table of the health status and self-reported happiness using the \Rcmd{xtabs} function: <>= xtabs(~ R_happy + R_health, data = CHFLS) @ <>= hh <- xtabs(~ R_health + R_happy, data = CHFLS) @ A \stress{spineplot} is a group of rectangles, each representing one cell in the two-way contingency table. The area of the rectangle is proportional to the number of observations in the cell. Here, we produce a mosaic plot of health status and happiness in Figure~\ref{DAGD-CHFLS-health_happy}. \begin{figure} <>= plot(R_happy ~ R_health, data = CHFLS, ylab = "Happiness", xlab = "Health") @ \caption{Spineplot of health status and happiness. \label{DAGD-CHFLS-health_happy}} \end{figure} Consider the right upper cell in Figure~\ref{DAGD-CHFLS-health_happy}, i.e., the $\Sexpr{hh["Excellent", "Very happy"]}$ very happy women with excellent health status. The width of the right-most bar corresponds to the frequency of women with excellent health status. The length of the top-right rectangle corresponds to the conditional frequency of very happy women given their health status is excellent. 
Multiplying these two quantities gives the area of this cell which corresponds to the frequency of women who are both very happy and enjoy an excellent health status. The conditional frequency of very happy women increases with increasing health status, whereas the conditional frequency of very unhappy or not too happy women decreases. When the association of a categorical and a continuous variable is of interest, say the monthly income and self-reported happiness, one might use parallel boxplots to visualize the distribution of the income depending on happiness. If we were studying self-reported happiness as response and income as independent variable, however, this would give a representation of the conditional distribution of income given happiness, but we are interested in the conditional distribution of happiness given income. One possibility to produce a more appropriate plot is called \stress{spinogram}. Here, the continuous $x$-variable is categorized first. Within each of these categories, the conditional frequencies of the response variable are given by stacked barcharts, in a way similar to spineplots. For happiness depending on log-income (since income is naturally skewed we use a log-transformation of the income) it seems that the proportion of unhappy and not too happy women decreases with increasing income whereas the proportion of very happy women stays rather constant. In contrast to spinograms, where bins, as in a histogram, are given on the $x$-axis, a \stress{conditional density plot} uses the original $x$-axis for a display of the conditional density of the categorical response given the independent variable. \begin{figure} <>= layout(matrix(1:2, ncol = 2)) plot(R_happy ~ log(R_income + 1), data = CHFLS, ylab = "Happiness", xlab = "log(Income + 1)") cdplot(R_happy ~ log(R_income + 1), data = CHFLS, ylab = "Happiness", xlab = "log(Income + 1)") @ \caption{Spinogram (left) and conditional density plot (right) of happiness depending on log-income. 
\label{DAGD-CHFLS-happy_income}} \end{figure} \index{Barchart|)} \index{Spineplot|)} \index{Spinogram|)} \index{Trellis plot|(} For our last example we return to scatterplots for inspecting the association between a woman's monthly income and the income of her partner. Both income variables have been computed and partially imputed from other self-reported variables and are only rough assessments of the real income. Moreover, the data itself is numeric but heavily tied, making it difficult to produce `correct' scatterplots because points will overlap. A relatively easy trick is to jitter the observation by adding a small random noise to each point in order to avoid overlapping plotting symbols. In addition, we want to study the relationship between both monthly incomes conditional on the woman's education. Such conditioning plots are called \stress{trellis} plots and are implemented in the package \Rpackage{lattice} \citep{PKG:lattice, HSAUR:Sarkar2008}. We utilize the \Rcmd{xyplot} function from package \Rpackage{lattice} to produce a scatterplot. The formula reads as already explained with the exception that a third \stress{conditioning} variable, \Robject{R\_edu} in our case, is present. For each level of education, a separate scatterplot will be produced. The plots are directly comparable since the axes remain the same for all plots. 
\begin{figure} <>= library("lattice") xyplot(jitter(log(R_income + 0.5)) ~ jitter(log(A_income + 0.5)) | R_edu, data = CHFLS, pch = 19, col = rgb(.1, .1, .1, .1), ylab = "log(Wife's income + .5)", xlab = "log(Husband's income + .5)") @ <>= library("lattice") trellis.par.set(list(plot.symbol = list(col=1,pch=20, cex=0.7), box.rectangle = list(col=1), plot.line = list(col = 1, lwd = 1), box.umbrella = list(lty=1, col=1), strip.background = list(col = "white"))) ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme ltheme$strip.background$col <- "transparent" ## change strip bg lattice.options(default.theme = ltheme) xyplot(jitter(log(R_income + 0.5)) ~ jitter(log(A_income + 0.5)) | R_edu, data = CHFLS, pch = 19, col = rgb(.1, .1, .1, .1), ylab = "log(Wife's income + .5)", xlab = "log(Husband's income + .5)") @ \caption{Scatterplot of jittered log-income of wife and husband, conditional on the wife's education. \label{DAGD-CHFLS-RAincome3}} \end{figure} The plot shown in Figure~\ref{DAGD-CHFLS-RAincome3} reveals several interesting issues. Some observations are positioned on a straight line with slope one, most probably an artifact of missing value imputation by linear models (as described in the data dictionary, see the documentation \texttt{?CHFLS}). Four constellations can be identified: both partners have zero income, the partner has no income, the woman has no income or both partners have a positive income. For couples where the woman has a university degree, the income of both partners is relatively high (except for two couples where only the woman has income). A small number of former junior college students live in relationships where only the man has income, the income of both partners seems only slightly positively correlated for the remaining couples. For lower levels of education, all four constellations are present. The frequency of couples where only the man has some income seems larger than the other way around. 
Ignoring the observations on the straight line, there is almost no association between the income of both partners. \index{Trellis plot|)} \section{Summary of Findings} Using relatively straightforward graphical techniques only on the two sets of data considered in this chapter we have been able to uncover a number of important features of each data set: \begin{description} \item[Melanoma mortality] Mortality is related only to the latitude of a state, not to its longitude; mortality is higher for coastal states than for land states, and the highest mortality is observed in the south coastal states with latitude less than 32 degrees. \item[Health and family life] We saw that happiness depends on health status. Women reported to be very happy more often when they also reported a good or excellent health status. The dependency of happiness on the income of the women seems to be less clear, but we conclude that, conditional on education, the income of wives and their husbands is highly correlated. \end{description} \section{Final Comments} Producing publication-quality graphics is one of the major strengths of the \R{} system and almost anything is possible since graphics are programmable in \R{}. Naturally, this chapter can be only a very brief introduction to some commonly used displays and the reader is referred to specialized books, most importantly \cite{HSAUR:Murrell2005}, \cite{HSAUR:Sarkar2008}, and \cite{HSAUR:Chenetal2008}. Interactive 3D-graphics are available from package \Rpackage{rgl} \citep{PKG:rgl}. \section*{Exercises} \begin{description} \exercise The data in Table~\ref{DAGD-household-tab} are part of a data set collected from a survey of household expenditure and give the expenditure of $20$ single men and $20$ single women on four commodity groups. 
The units of expenditure are Hong Kong dollars, and the four commodity groups are \begin{description} \item[\Robject{housing}] housing, including fuel and light, \item[\Robject{food}] foodstuffs, including alcohol and tobacco, \item[\Robject{goods}] other goods, including clothing, footwear, and durable goods, \item[\Robject{service}] services, including transport and vehicles. \end{description} The aim of the survey was to investigate how the division of household expenditure between the four commodity groups depends on total expenditure and to find out whether this relationship differs for men and women. Use appropriate graphical methods to answer these questions and state your conclusions. <>= data("household", package = "HSAUR3") toLatex(HSAURtable(household), caption = paste("Household expenditure for single men and women."), label = "DAGD-household-tab") @ \exercise The data set shown in Table~\ref{DAGD-USstates-tab} contains values of seven variables for ten states in the US. The seven variables are \begin{description} \item[\Robject{Population}] population size divided by $1000$, \item[\Robject{Income}] average per capita income, \item[\Robject{Illiteracy}] illiteracy rate (\% population), \item[\Robject{Life.Expectancy}] life expectancy (years), \item[\Robject{Homicide}] homicide rate (per $1000$), \item[\Robject{Graduates}] percentage of high school graduates, \item[\Robject{Freezing}] average number of days per year below freezing. \end{description} With these data \begin{enumerate} \item Construct a scatterplot matrix of the data labeling the points by state name (using function \Rcmd{text}). \item Construct a plot of life expectancy and homicide rate conditional on average per capita income. 
\end{enumerate} \begin{sidewaystable} \vspace*{12.5cm} \begin{center} <>= data("USstates", package = "HSAUR3") toLatex(HSAURtable(USstates), caption = paste("Socio-demographic variables for ten US states."), label = "DAGD-USstates-tab") @ \end{center} \end{sidewaystable} \exercise Mortality rates per $100,000$ from male suicides for a number of age groups and a number of countries are given in Table~\ref{DAGD-suicides2-tab}. Construct side-by-side box plots for the data from different age groups, and comment on what the graphic tells us about the data. <>= data("suicides2", package = "HSAUR3") toLatex(HSAURtable(suicides2), caption = paste("Mortality rates per $100,000$ from male suicides."), label = "DAGD-suicides2-tab", rownames = TRUE) @ \exercise \cite{HSAUR:FluryRiedwyl1988} report data that give various length measurements on $200$ Swiss bank notes. The data are available from package \Rpackage{mclust} \citep{PKG:mclust}; a sample of ten bank notes is given in Table~\ref{DAGD-banknote-tab}. <>= data("banknote", package = "mclust") banknote$Status <- NULL banknote <- banknote[c(1:5, 101:200),] toLatex(HSAURtable(banknote, pkg = "mclust", nrow = 10), caption = paste("Swiss bank note data."), label = "DAGD-banknote-tab", rownames = FALSE) @ Use whatever graphical techniques you think are appropriate to investigate whether there is any `pattern' or structure in the data. Do you observe something suspicious? \exercise The data in Table~\ref{DAGD-birds-tab} were originally derived from a study reported in \cite{HSAUR:Vuilleumier1970} which investigated numbers of bird species in isolated `islands' of paramo vegetation in the northern Andes. The aim of the study was to investigate how the number of species (\Robject{N}) is related to four other variables, \Robject{AR} (area of `island' in thousands of square km), \Robject{EL} (elevation in thousands of m), \Robject{Dec} (distance from Ecuador in km) and \Robject{DNI} (distance to the nearest `island' in km). 
Begin by constructing a scatterplot matrix of the data differentiating the islands on each panel by a different plotting symbol and on each diagonal panel showing the histogram of the associated variable. What can you conclude from this plot about how N is related to the other four variables? <>= data("birds", package = "HSAUR3") toLatex(HSAURtable(birds), caption = paste("Birds in paramo vegetation."), label = "DAGD-birds-tab", rownames = TRUE) @ \end{description} \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}