STT 430/530 R#7 Fall, 2007

#let's illustrate the Kruskal-Wallis test

#both by the R function kruskal.test & the permuation test.

#we'll illustrate it on the data in Example 3.2.2

#on page 87. . .

#call the factor trtm

trtm=c(rep("c",4),rep("p1",6),rep("p2",5),rep("p3",6))

trtm

#now put the log of the bacteria counts into a vector

lobact=scan()

#type in the data as we've done before. . .

lobact

#now apply the Kruskal-Wallis test via the R command

#and note the agreement with the textbook values . . .

kruskal.test(lobact~as.factor(trtm))

#don't forget to use the as.factor function so the

#formula will work properly. . .

#now let's do a permutation test based on the KW statistic

#first learn about a function in R called tapply

#use help.start() to check out the help file on this

#important function. Basically, tapply applies a function

#(the last argument) to a vector (the first argument)

#across the values of a categorical vector (the 2nd arg)

#e.g., the line below computes the mean of the values of

#of lobact by the values of trtm (it's like BY trtm in SAS)

#and the line following that computes the mean of the ranks

#of lobact across the treatment groups. . .

tapply(lobact,trtm,mean)

tapply(rank(lobact),trtm,mean)

#now compute the KW statistic "by hand" to get us ready

#for doing the permutation test based on the KW . . .

#we need to program the parts of the KW formula on p.86

paren=(tapply(rank(lobact),trtm,mean)-11)^2

#this gives the value inside the parentheses. . .

#and the following multiplies each component by the number

#of sample values in each level of the factor

term=tapply(trtm,trtm,length)*paren

#now sum these up and multiply by the coefficient in front

sum(term)*(12/(length(trtm)*(length(trtm)+1)))

#now let's work on doing a permutation test based on this

#statistic - follow the pattern we've done before.

#fill a vector with zeros

zz=numeric(5000)

#our observed value of the kw statistic is below . . .

kwobs= sum(term)*(12/(length(trtm)*(length(trtm)+1)))

#and the values in size are our sample sizes. . .

size=tapply(trtm,trtm,length)

#now let's do the permutations. . .

for (i in 1:5000) {

shufrank=rank(sample(lobact))

paren=(tapply(shufrank,trtm,mean)-11)^2

zz[i]=sum(size*paren)*(12/(length(trtm)*(length(trtm)+1)))

}