# These examples were done in R 1.3.1 on an i686 running debian linux. I've
# checked most of the syntax. Everything should work under most
# versions of S-PLUS although the output is slightly different.
#
# Let's work with 2-way tables first
#
# The first thing we need to do is enter our data. Suppose we find the
# following contingency table relating hair color and eye color for a
# number of men:
# Eye
# Hair Brown Blue Hazel Green
# Black 32 11 10 3
# Brown 38 50 25 15
# Red 10 10 7 7
# Blond 3 30 5 8
#
# to enter these data into a data frame we could do the following:
> hair <- factor(rep(c("Black", "Brown", "Red", "Blond"), c(4,4,4,4)))
> eye <- factor(rep(c("Brown", "Blue", "Hazel", "Green"), 4))
> freq <- c(32,11,10,3, 38,50,25,15, 10,10,7,7, 3,30,5,8)
> haireye.data <- data.frame(hair, eye, freq)
#
# Let's take a look at this data frame:
#
> haireye.data
hair eye freq
1 Black Brown 32
2 Black Blue 11
3 Black Hazel 10
4 Black Green 3
5 Brown Brown 38
6 Brown Blue 50
7 Brown Hazel 25
8 Brown Green 15
9 Red Brown 10
10 Red Blue 10
11 Red Hazel 7
12 Red Green 7
13 Blond Brown 3
14 Blond Blue 30
15 Blond Hazel 5
16 Blond Green 8
#
# We may also want to put these data into a table. To do this we do
# the following:
> haireye.table <- table(hair, eye)
> haireye.table[cbind(hair,eye)] <- freq
#
# Let's look at the table:
#
> haireye.table
eye
hair Blue Brown Green Hazel
Black 11 32 3 10
Blond 30 3 8 5
Brown 50 38 15 25
Red 10 10 7 7
attr(,"class")
[1] "table"
#
# Note that the factor levels have been rearranged alphabetically but
# the information is the same
#
# OK, let's start fitting some models. First we'll use the glm()
# function to fit our models. Here we'll make use of the data as
# organized in the haireye.data dataframe
#
# First we'll switch to sum-to-zero contrasts
#
> options(contrasts=c("contr.sum", "contr.poly"))
#
# now let's fit a simple independence model
#
> glm.out1 <- glm(freq~hair+eye, family=poisson, data=haireye.data)
> summary(glm.out1)
Call:
glm(formula = freq ~ hair + eye, family = poisson, data = haireye.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.6724 -0.9527 -0.1018 0.5635 3.0744
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.57730 0.07489 34.416 < 2e-16 ***
hair1 -0.03274 0.11717 -0.279 0.77989
hair2 -0.22945 0.12515 -1.833 0.06675 .
hair3 0.79393 0.09331 8.509 < 2e-16 ***
eye1 0.51997 0.09770 5.322 1.03e-07 ***
eye2 0.32369 0.10304 3.141 0.00168 **
eye3 -0.59865 0.14052 -4.260 2.04e-05 ***
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 163.492 on 15 degrees of freedom
Residual deviance: 44.315 on 9 degrees of freedom
AIC: 127.46
Number of Fisher Scoring iterations: 4
#
# Looking at the residual deviance of 44.315 on 9 degrees of freedom
# suggests that we can reject this model in favor of the saturated
# model at any conventional significance level. To get the p-value we
# type
#
> 1-pchisq(44.315,9)
[1] 1.234755e-06
#
# OK, so let's fit the saturated model:
#
> glm.out2 <- glm(freq~hair*eye, family=poisson, data=haireye.data)
> summary(glm.out2)
Call:
glm(formula = freq ~ hair * eye, family = poisson, data = haireye.data)
Deviance Residuals:
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.464190 0.085819 28.714 < 2e-16 ***
hair1 -0.147983 0.157220 -0.941 0.346578
hair2 -0.417018 0.170213 -2.450 0.014286 *
hair3 0.904944 0.110207 8.211 < 2e-16 ***
eye1 0.539235 0.122463 4.403 1.07e-05 ***
eye2 0.161940 0.150664 1.075 0.282448
eye3 -0.506187 0.168015 -3.013 0.002589 **
hair1:eye1 -0.457547 0.234685 -1.950 0.051221 .
hair2:eye1 0.814790 0.211986 3.844 0.000121 ***
hair3:eye1 0.003654 0.157410 0.023 0.981479
hair1:eye2 0.987589 0.218782 4.514 6.36e-06 ***
hair2:eye2 -1.110500 0.357206 -3.109 0.001878 **
hair3:eye2 0.106513 0.184552 0.577 0.563843
hair1:eye3 -0.711408 0.358987 -1.982 0.047512 *
hair2:eye3 0.538456 0.284753 1.891 0.058630 .
hair3:eye3 -0.154897 0.222881 -0.695 0.487071
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 1.6349e+02 on 15 degrees of freedom
Residual deviance: 1.1742e-13 on 0 degrees of freedom
AIC: 101.15
Number of Fisher Scoring iterations: 2
#
# We can fit the same models using the loglin() function and the
# tabular form of the data.
#
# First the independence model:
#
> loglin.out1 <- loglin(haireye.table, margin=list(1,2), param=TRUE)
2 iterations: deviation 0
> loglin.out1
$lrt
[1] 44.31537
$pearson
[1] 42.16325
$df
[1] 9
$margin
$margin[[1]]
[1] "hair"
$margin[[2]]
[1] "eye"
$param
$param$"(Intercept)"
[1] 2.577301
$param$hair
Black Blond Brown Red
-0.03274428 -0.22945457 0.79393429 -0.53173544
$param$eye
Blue Brown Green Hazel
0.5199664 0.3236865 -0.5986465 -0.2450065
#
# Now the saturated model
#
> loglin.out2 <- loglin(haireye.table, margin=list(c(1,2)), param=TRUE)
2 iterations: deviation 0
> loglin.out2
$lrt
[1] 0
$pearson
[1] 0
$df
[1] 0
$margin
$margin[[1]]
[1] "hair" "eye"
$param
$param$"(Intercept)"
[1] 2.46419
$param$hair
Black Blond Brown Red
-0.1479831 -0.4170179 0.9049436 -0.3399426
$param$eye
Blue Brown Green Hazel
0.5392350 0.1619397 -0.5061867 -0.1949880
$param$hair.eye
eye
hair Blue Brown Green Hazel
Black -0.457546845 0.98758911 -0.7114082 0.18136592
Blond 0.814790122 -1.11049964 0.5384559 -0.24274640
Brown 0.003654229 0.10651271 -0.1548969 0.04472999
Red -0.360897506 0.01639782 0.3278492 0.01665049
#
# Note the estimates are the same regardless of whether we use the
# glm() function or the loglin() function to fit the models.
#
# OK, now suppose we have hair and eyecolor data for women as well as
# men. This gives us a 3-way table. Suppose this 3-way table is the
# following:
# , , Sex = Male
#
# Eye
# Hair Brown Blue Hazel Green
# Black 32 11 10 3
# Brown 38 50 25 15
# Red 10 10 7 7
# Blond 3 30 5 8
#
# , , Sex = Female
#
# Eye
# Hair Brown Blue Hazel Green
# Black 36 9 5 2
# Brown 81 34 29 14
# Red 16 7 7 7
# Blond 4 64 5 8
#
#
# To enter these data into a dataframe we could do the following:
#
> hair <- factor(rep(rep(c("Black", "Brown", "Red", "Blond"), c(4,4,4,4)),2))
> eye <- factor(rep(c("Brown", "Blue", "Hazel", "Green"), 8))
> sex <- factor(rep(c("Male", "Female"), c(16,16)))
> freq <- c(32,11,10,3, 38,50,25,15, 10,10,7,7, 3,30,5,8,
36,9,5,2, 81,34,29,14, 16,7,7,7, 4,64,5,8)
> haireye2.data <- data.frame(hair, eye, sex, freq)
> haireye2.data
hair eye sex freq
1 Black Brown Male 32
2 Black Blue Male 11
3 Black Hazel Male 10
4 Black Green Male 3
5 Brown Brown Male 38
6 Brown Blue Male 50
7 Brown Hazel Male 25
8 Brown Green Male 15
9 Red Brown Male 10
10 Red Blue Male 10
11 Red Hazel Male 7
12 Red Green Male 7
13 Blond Brown Male 3
14 Blond Blue Male 30
15 Blond Hazel Male 5
16 Blond Green Male 8
17 Black Brown Female 36
18 Black Blue Female 9
19 Black Hazel Female 5
20 Black Green Female 2
21 Brown Brown Female 81
22 Brown Blue Female 34
23 Brown Hazel Female 29
24 Brown Green Female 14
25 Red Brown Female 16
26 Red Blue Female 7
27 Red Hazel Female 7
28 Red Green Female 7
29 Blond Brown Female 4
30 Blond Blue Female 64
31 Blond Hazel Female 5
32 Blond Green Female 8
#
# OK, now lets put these data into table form
#
> haireye2.table <- table(hair, eye, sex)
> haireye2.table[cbind(hair,eye,sex)] <- freq
> haireye2.table
, , sex = Female
eye
hair Blue Brown Green Hazel
Black 9 36 2 5
Blond 64 4 8 5
Brown 34 81 14 29
Red 7 16 7 7
, , sex = Male
eye
hair Blue Brown Green Hazel
Black 11 32 3 10
Blond 30 3 8 5
Brown 50 38 15 25
Red 10 10 7 7
attr(,"class")
[1] "table"
#
# OK, let's fit the independence model using the glm() function
#
> glm.out3 <- glm(freq~hair+eye+sex, family=poisson, data=haireye2.data)
> summary(glm.out3)
Call:
glm(formula = freq ~ hair + eye + sex, family = poisson, data = haireye2.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.4110 -1.5996 0.2315 0.9084 6.3734
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.64265 0.05217 50.657 < 2e-16 ***
hair1 -0.17912 0.08244 -2.173 0.02980 *
hair2 -0.01705 0.07803 -0.219 0.82699
hair3 0.79474 0.06257 12.701 < 2e-16 ***
eye1 0.50670 0.06742 7.516 5.67e-14 ***
eye2 0.52969 0.06702 7.904 2.70e-15 ***
eye3 -0.70505 0.10014 -7.040 1.92e-12 ***
sex1 0.10853 0.04132 2.626 0.00863 **
---
Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 489.59 on 31 degrees of freedom
Residual deviance: 175.79 on 24 degrees of freedom
AIC: 330.54
Number of Fisher Scoring iterations: 4
#
# Looking at the residual deviance it is easy to see that independence
# does not hold here.
#
# Let's fit the saturated model and then do an analysis of deviance
# to find the best submodel
#
> glm.out4 <- glm(freq~hair*eye*sex, family=poisson, data=haireye2.data)
> anova(glm.out4, test="Chisq")
Analysis of Deviance Table
Model: poisson, link: log
Response: freq
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev P(>|Chi|)
NULL 31 489.59
hair 3 165.59 28 324.00 1.138e-35
eye 3 141.27 25 182.73 2.010e-30
sex 1 6.93 24 175.79 0.01
hair:eye 9 146.44 15 29.35 4.806e-27
hair:sex 3 6.27 12 23.08 0.10
eye:sex 3 14.90 9 8.19 1.908e-03
hair:eye:sex 9 8.19 0 1.059e-12 0.52
# If we set our significance level at 0.10, this suggests that the
# model with all 2-way interactions included but no 3-way interactions
# is the best model. This means that no pair of variables are
# conditionally independent given the third but that the association
# between any two variables is identical at each level of the third
# variable.
#
# Given that we probably don't expect an interaction between sex and either
# hair or eye color and the fact that the hair:sex interaction is only
# significant at the 0.10 level we might want to look at a model with
# no interactions between sex and the other variables.
> glm.out5 <- glm(freq~sex+hair*eye, family=poisson, data=haireye2.data)
> anova(glm.out5, test="Chisq")
Analysis of Deviance Table
Model: poisson, link: log
Response: freq
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev P(>|Chi|)
NULL 31 489.59
sex 1 6.93 30 482.66 0.01
hair 3 165.59 27 317.07 1.138e-35
eye 3 141.27 24 175.79 2.010e-30
hair:eye 9 146.44 15 29.35 4.806e-27
# all of these terms should be in the model but with a residual
# deviance of 29.35 on 15 degrees of freedom we can reject this
# model in favor of the saturated model. The p-value is:
> 1 - pchisq(29.35, 15)
[1] 0.01449365
# including the eye:sex interaction but not the hair:sex interaction
# we get
> glm.out6 <- glm(freq~sex+eye:sex+hair*eye, family=poisson,
data=haireye2.data)
> anova(glm.out6, test="Chisq")
Analysis of Deviance Table
Model: poisson, link: log
Response: freq
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev P(>|Chi|)
NULL 31 489.59
sex 1 6.93 30 482.66 0.01
eye 3 141.27 27 341.39 2.010e-30
hair 3 165.59 24 175.79 1.138e-35
sex:eye 3 7.32 21 168.48 0.06
eye:hair 9 146.44 12 22.03 4.806e-27
# here we see that the sex:eye interaction is borderline significant
# with a p-value of 0.06. With a residual deviance of 22.03
# on 12 degrees of freedom we can reject this model in favor of the
# saturated model at significance levels above 0.037.
> 1 - pchisq(22.03, 12)
[1] 0.03718496
# Given some of the problems associated with using classical
# hypothesis test with contingency tables it might make sense to
# compare the models using BIC or AIC. Using BIC we get the following
> AIC(glm.out4, k=log(sum(freq)))
[1] 343.0238
> AIC(glm.out5, k=log(sum(freq)))
[1] 276.621
> AIC(glm.out6, k=log(sum(freq)))
[1] 288.4533
# Since smaller BIC values are better as calculated by R's AIC
# function, the best model appears to be model 5, the model with
# the eye:hair interaction but without sex:hair and sex:eye
# interactions which makes subtantive sense as well.