## An R program to implement the algorithms discussed ## in the paper "A simple implementation of a normal mixture ## approach to differential expression in multiclass microarrays" ## by McLachlan, Bean and Jones (Bioinformatics 2006) ## at the moment, this only runs on Windows, because it uses system() ## to call "em2.exe" or "emmix01.exe" (EMMIX). ## Unfortunately there is no DLL version ## of EMMIX which works with R at the moment. It seems EMMIX ## was intended to be used this way, but documentation is lacking. ## The functions work only for the two-group case ## "Permutation assessment of p-value" (Section 5.2) ## pooled and unpooled versions ## permpval uses pooled t-statistics ## then estimates p-values using the unpooled method ## nperm is the number of permutations ## tvals are the t-scores permpval <- function(file,len1,len2,nperm,tvals) { a <- read.table(file) test <- matrix(rep(0,nperm*nrow(a)),nrow=nrow(a),ncol=nperm) for (j in seq(1:nperm)) { r1 <- sample(1:length(a),len1) r2 <- setdiff(1:length(a),r1) mean1 <- apply(a[,r1],1,mean) mean2 <- apply(a[,r2],1,mean) sd1<-sqrt(apply(a[,r1],1,var)) sd2<-sqrt(apply(a[,r2],1,var)) sd <- sqrt( ( (len1-1)*sd1^2 + (len2-1)*sd2^2) / (len1+len2-2) ) test[,j] <- (mean1-mean2)/(sd*(1/len1+1/len2)^(1/2)) } for (i in 1:nrow(a)) pvals[i] <- length(which(abs(test[i,])>abs(tvals[i])))/nperm return(pvals) } ## permpvalpool uses pooled t-statistics ## then estimates p-values using the pooled method ## nperm is the number of permutations permpvalpool <- function(file,len1,len2,nperm,tvals) { a <- read.table(file) test <- matrix(rep(0,nperm*nrow(a)),nrow=nrow(a),ncol=nperm) for (j in seq(1:nperm)) { r1 <- sample(1:length(a),len1) r2 <- setdiff(1:length(a),r1) mean1 <- apply(a[,r1],1,mean) mean2 <- apply(a[,r2],1,mean) sd1<-sqrt(apply(a[,r1],1,var)) sd2<-sqrt(apply(a[,r2],1,var)) sd <- sqrt( ( (len1-1)*sd1^2 + (len2-1)*sd2^2) / (len1+len2-2) ) test[,j] <- (mean1-mean2)/(sd*(1/len1+1/len2)^(1/2)) } #return(test) for (i in 1:nrow(a)) pvals[i] <- length(which(abs(as.vector(test))>abs(tvals[i])))/nperm/nrow(a) return(pvals) } ## try ALL permutations as in Section 10 of the paper ## when we are examining the HIV data and use B = 35 ## permutations where len1 = len2 = 4 ## the next function is the one used in the paper ## to calculate estimated P-values pooled over all the genes permpvalall <- function(file,len1,len2) { library(sma) library(gregmisc) a <- read.table(file) tvals <- gettpooled(file,len1,len2) c <- combinations(ncol(a),len1) nperm <- nrow(c) test <- matrix(rep(0,nperm*nrow(a)),nrow=nrow(a),ncol=nperm) for (j in seq(1:nrow(c))) { r1 <- c[j,] r2 <- setdiff(1:ncol(a),r1) sd1<-sqrt(apply(a[,r1],1,var)) sd2<-sqrt(apply(a[,r2],1,var)) sd <- sqrt( ( (len1-1)*sd1^2 + (len2-1)*sd2^2) / (len1+len2-2) ) mean1 <- apply(a[,r1],1,mean) mean2 <- apply(a[,r2],1,mean) test[,j]<-(mean1-mean2)/(sd*(1/len1 + 1/len2)^(1/2)) } for (i in 1:nrow(a)) pvals[i] <- length(which(abs(test[i,])>abs(tvals[i])))/nperm return(pvals) } permpvalallpooled <- function(file,len1,len2) { library(sma) library(gregmisc) a <- read.table(file) tvals <- gettpooled(file,len1,len2) c<-combinations(ncol(a),len1) nperm<-nrow(c) test <- matrix(rep(0,nperm*nrow(a)),nrow=nrow(a),ncol=nperm) for (j in seq(1:nrow(c))) { r1 <- c[j,] r2 <- setdiff(1:ncol(a),r1) sd1<-sqrt(apply(a[,r1],1,var)) sd2<-sqrt(apply(a[,r2],1,var)) sd <- sqrt( ( (len1-1)*sd1^2 + (len2-1)*sd2^2) / (len1+len2-2) ) mean1 <- apply(a[,r1],1,mean) mean2 <- apply(a[,r2],1,mean) test[,j]<-(mean1-mean2)/(sd*(1/len1 + 1/len2)^(1/2)) } pvals <- rep(0,nrow(a)) for (i in 1:nrow(a)) pvals[i] <- length(which(abs(as.vector(test))>abs(tvals[i])))/nperm/nrow(a) return(pvals) } gettpooled <- function(file,len1,len2) { a <- read.table(file) t <- rep(0,nrow(a)) for (i in 1:nrow(a)) t[i]<-t.test(a[i,1:len1],a[i,(len1+1):(len1+len2)],var.equal=T)$s return (t) } get_theo_null <- function(file,len1,len2) { # # calculate the theoretical null # t <- gettpooled(file,len1,len2) p <- 1-pf(t^2,1,len1+len2-2) z <- qnorm(1-p) ## estimate \pi_0 by looking at the number of p-values between ## a certain number \xi and 1 -- an application of equation (21) in ## the paper for (i in c(1/2,2/3,3/4,4/5)) { a0 <- length(which(p>i)) pi0 <- a0/(length(p)*(1-i)) mu <- mean(z) var <- var(z) ## since m0 and v0 are 0, then we use equations (19) and (20) ## in the paper to calculate m1 and v1 m1<- mu/(1-pi0) v1<-(var-pi0-pi0*(1-pi0)*m1*m1)/(1-pi0) if (v1 < 0) v1=1 cat(i,a0/(length(p)*(1-i)),m1,v1,'\n') newfile <- paste("z",file,i,sep="") newfileout <- paste (newfile,".out",sep="") ## write these parameters to a file and call EMMIX ## to get the posterior probabilities (ie \tau_0 values) ## for each gene and the new estimates for ## \pi_0, \pi_1, \mu_0, \mu_1, \var_0, and \var_1 ## emmix01 is the same as em2, but the mean and variance ## of the first component are constrained to always be 0 and 1 write(z,newfile,ncolumns=1) write(0,newfile,ncolumns=1,append=TRUE) write(1,newfile,ncolumns=1,append=TRUE) write(m1,newfile,ncolumns=1,append=TRUE) write(v1,newfile,ncolumns=1,append=TRUE) write(pi0,newfile,ncolumns=1,append=TRUE) write(1-pi0,newfile,ncolumns=1,append=TRUE) inp <- paste (2,newfile,newfileout,length(p),1,1,2,2,2,"n",sep="\n") system('"emmix01.exe"',input=inp, i,show.output.on.console=T) } } get_empi_null <- function(file,len1,len2) { # # empirical null # # do the latest Geoff method, 20 October 2005 # assume g = 2 # t <- gettpooled(file,len1,len2) p <- 1-pf(t^2,1,len1+len2-2) z <- qnorm(1-p) # estimate \pi_0 using \xi as above for (i in c(1/2,2/3,3/4,4/5)) { a0 <- length(which(p>i)) a1 <- a0/(1-i) pi0 <- a0/(length(p)*(1-i)) zs <- sort(z) m0<-(mean(zs[1:a1])) v0<-(var(zs[1:a1])) m1<-(mean(zs[(a1+1):length(p)])) v1<-(var(zs[(a1+1):length(p)])) cat(i,pi0,1-pi0,m0,v0,m1,v1,'\n') newfile <- paste("z",file,i,sep="") newfileout <- paste (newfile,".out",sep="") write(z,newfile,ncolumns=1) write(m0,newfile,ncolumns=1,append=TRUE) write(v0,newfile,ncolumns=1,append=TRUE) write(m1,newfile,ncolumns=1,append=TRUE) write(v1,newfile,ncolumns=1,append=TRUE) write(pi0,newfile,ncolumns=1,append=TRUE) write(1-pi0,newfile,ncolumns=1,append=TRUE) # when covariance matrices are equal, the second 2 is changed to 1 inp <- paste (2,newfile,newfileout,length(p),1,1,2,2,2,"n",sep="\n") system('"em2.exe"',input=inp, i,show.output.on.console=T) } } get_theo_null_2 <- function(file,len1,len2) { # get the theoretical null # by trying various values of \pi_0 # t <- gettpooled(file,len1,len2) p <- 1-pf(t^2,1,len1+len2-2) z <- qnorm(1-p) for (pi0 in seq(0.025,0.975,0.025)) { mu <- mean(z) var <- var(z) m1<- mu/(1-pi0) v1<-(var-pi0-pi0*(1-pi0)*m1*m1)/(1-pi0) if (v1 < 0) v1=1 cat(pi0,m1,v1,'\n') newfile <- paste("theo",file,".",pi0,sep="") newfileout <- paste (newfile,".out",sep="") write(z,newfile,ncolumns=1) write(0,newfile,ncolumns=1,append=TRUE) write(1,newfile,ncolumns=1,append=TRUE) write(m1,newfile,ncolumns=1,append=TRUE) write(v1,newfile,ncolumns=1,append=TRUE) write(pi0,newfile,ncolumns=1,append=TRUE) write(1-pi0,newfile,ncolumns=1,append=TRUE) inp <- paste (2,newfile,newfileout,length(z),1,1,2,2,2,"n",sep="\n") system('"emmix01.exe"',input=inp, 1, show.output.on.console=T) } } get_empi_null_2 <- function(file,len1,len2) { # # empirical null # t <- gettpooled(file,len1,len2) p <- 1-pf(t^2,1,len1+len2-2) z <- qnorm(1-p) for (pi0 in seq(0.025,0.975,0.025)) { a1 <- pi0*length(z) zs <- sort(z) m0<-(mean(zs[1:a1])) v0<-(var(zs[1:a1])) m1<-(mean(zs[(a1+1):length(z)])) v1<-(var(zs[(a1+1):length(z)])) cat(pi0,1-pi0,m0,v0,m1,v1,'\n') newfile <- paste("empi",file,pi0,sep="") newfileout <- paste (newfile,".out",sep="") write(z,newfile,ncolumns=1) write(m0,newfile,ncolumns=1,append=TRUE) write(v0,newfile,ncolumns=1,append=TRUE) write(m1,newfile,ncolumns=1,append=TRUE) write(v1,newfile,ncolumns=1,append=TRUE) write(pi0,newfile,ncolumns=1,append=TRUE) write(1-pi0,newfile,ncolumns=1,append=TRUE) inp <- paste (2,newfile,newfileout,length(z),1,1,2,2,2,"n",sep="\n") system('"em2.exe"',input=inp, ,show.output.on.console=T) } }