# R code for permutation and classification
# Written by Ian Wood for a paper with Peter Visscher and Kerrie Mengersen.
# Specifically to perform 1-level and 2-level cross-validation
# and permutation experiments on datasets based on those used by
# Sharma et al. and Khan et al., and some random uninformative data.
# refs:
# P. Sharma et al., "Early detection of breast cancer based on gene-expression
# patterns in peripheral blood cells", Breast Cancer Research, 7:R634-R644, 2005.
# J. Khan et al., "Classification and diagnostic prediction of cancers using
# gene expression profiling and artificial neural networks",
# Nature Medicine, 7(6):673-679, 2001.
# started around 1/10/05, placed on permr web page 25/10/06.
#
#----------------------------------------------
# pamr data format: a list containing the following components:
# x - the covariates (usually gene expression levels)
# y - the responses (labels or classes)
# geneid - a vector of short strings each describing an id for the gene, e.g. "gene12"
# genenames - a vector of longer strings each giving some description of a gene,
#             e.g. "catenin (cadherin-a)"
# samplelabels - a vector of sample labels, i.e. some sort of descriptive
#                (statistically irrelevant) string for each sample.
# batchlabels - a vector of batch labels, if provided. These indicate which batch
#               a particular sample has come from. This is statistically relevant and
#               can be handled by pamr.batchadjust.
# Here I ignore the batch labels.
# There are no batch labels for the Khan data and our random data.
# There are batch labels for the Sharma data, but by using the
# batch-corrected data, these can be ignored.
#----------------------------------------------
#----------------------------------------------
# functions available here:
# function_name(parameters): return_type
# get ready to run other functions
#NSCstartup()
# read in Khan data
#KhanNSCread(): dataliste
# read in Sharma data
#SharmaNSCread(): dataliste
# create random data
#randomNSCcreate(sample_size): dataliste
# fit NSC to data and evaluate using 2-level (double) external cv
#propercvNSC(datalist): NSClist
# randomly permute the responses (class labels)
#permute(dl): datalist
# create a header row for the data frame for 2-level cv
#frameres(NSClist,K,numclasses): results dataframe
# write out NSClist as a list: 2-level cv
#listres(NSClist,K,numclasses): results list
# write results to a csv file
#writeresults(NSCtable,desc)
# fit NSC to data and evaluate using 1-level (single) external cv
#cv1NSC(dl): NSClist2
# data structures used in code:
#dataliste = (desc, trainin, trainout, testin, testout, gnames, gid)
#datalist  = (desc, trainin, trainout, testin, testout, gnames, gid, K, verbose)
#NSClist   = (tcerr, tcount, trcerr, simplesterate, oneovrtcerr, rcerr, r, ovrcerr2,
#             sercerr, ser, seovrcerr2, avgnogenes, nogenes, senogenes)
#NSClist2  = the same fields as NSClist, as returned by cv1NSC
#----------------------------------------------
#----------------------------------------------
# code for all cases:
# load up the PAMR library
# This needs to be installed and is available from CRAN.
library('pamr')
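#----------------------------------------------
# For reference, a minimal sketch of the pamr data format described above.
# This is illustrative only (hypothetical toy values, not part of any dataset
# used here): 3 genes x 4 samples, a binary response and no batch labels.
#example.data <- list(x = matrix(rnorm(12), nrow=3, ncol=4),
#                     y = c("0","1","0","1"),
#                     geneid = c("gene1","gene2","gene3"),
#                     genenames = c("catenin (cadherin-a)","gene two","gene three"),
#                     samplelabels = c("s1","s2","s3","s4"),
#                     batchlabels = NULL)
#----------------------------------------------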
# set the random number seed to be something repeatable.
# 521 is used in the R manual. Any other integer value can be used instead.
set.seed(521)
#----------------------------------------------
#----------------------------------------------
KhanNSCread <- function() {
# read in the Khan data set
#khan.data <- pamr.from.excel("data_Khan/khan.txt", 65, sample.labels=TRUE)
# the above works for the PAMR-supplied khan.txt file (training data only).
# The version below works with the full labelled dataset of 83 SRBCTs: khanc.txt,
# available on the permr website. The latter should be used for performing
# 2-level cv on the Khan dataset.
khan.data <- pamr.from.excel("data_Khan/khanc.txt", 85, sample.labels=TRUE)
# converts from excel csv format to pamr data format.
# the fields: filename, number of columns (# samples + 2), yes - includes sample labels.
# +2 because the first two columns of the data are geneid and genename.
desc <- "Khan"
trainin <- khan.data$x    # a matrix of the covariates for each sample point
trainout <- khan.data$y   # a vector of the response (label or class) for each sample point
# Have not used a separate test set here. 2-level external cv allows one to train
# and validly test using all possible labelled data.
testin <- NULL
testout <- NULL
# get the labels for the genes
gid <- khan.data$geneid
gnames <- khan.data$genenames
datalist <- list(desc=desc, trainin=trainin, trainout=trainout, testin=testin,
                 testout=testout, gnames=gnames, gid=gid)
return(datalist)
} # end of function KhanNSCread
#----------------------------------------------
#----------------------------------------------
SharmaNSCread <- function() {
# read in the Sharma60 data set
# this reads our cut-down version of the batch-corrected Sharma data with
# 1 data point per patient (60 data points in total) - randomly selected.
sharma.data <- pamr.from.excel("data_Sharma/sharmabatch1.txt", 62, sample.labels=TRUE,
                               batch.labels=TRUE)
# converts from excel csv format to pamr data format.
# the fields: filename, number of columns (# samples + 2), yes - includes sample labels.
# +2 because the first two columns of the data are geneid and genename.
desc <- "Sharma60"
trainin <- sharma.data$x    # a matrix of the covariates for each sample point
trainout <- sharma.data$y   # a vector of the response (label or class) for each sample point
testin <- NULL
testout <- NULL
# get the labels for the genes
gid <- sharma.data$geneid
gnames <- sharma.data$genenames
datalist <- list(desc=desc, trainin=trainin, trainout=trainout, testin=testin,
                 testout=testout, gnames=gnames, gid=gid)
return(datalist)
} # end of function SharmaNSCread
#----------------------------------------------
#----------------------------------------------
randomNSCcreate <- function(sample_size) {
# create a simulated random non-informative dataset, with:
# 2000 covariates (input variables) drawn independently from N(0,1)
# 1 binary response variable (output or label) which could mean e.g.
# "diseased" or "not diseased", drawn from a uniform distribution over {0,1},
# i.e. each from an independent Bernoulli trial with p = 0.5 .
# num_rand (labelled) sample points in the training set
# (like having num_rand patients with 1 sample point per patient)
# 2500 (labelled) sample points in the test set
# Note: these points are always the same if you use the same random number seed.
num_rand <- sample_size
desc <- paste("random", num_rand, sep="")
trainin <- array(rnorm(2000*num_rand,0,1), c(2000,num_rand))
testin <- array(rnorm(2000*2500,0,1), c(2000,2500))
A <- c(0,1)   # the set of possible responses
trainout <- sample(A,num_rand,replace=T)   # the set of training responses (outputs)
#trainout <- sample(rep(A,num_rand/2),num_rand,replace=F)
# the above (commented out) produces exactly the same number of responses from each class.
# Can be used to help check estimated error rate without the issue of imbalanced
# observed class sizes.
testout <- sample(A,2500,replace=T)   # the set of testing responses (outputs)
# get the labels for the genes ready (just 1:2000 numeric ids)
gid <- as.vector(1:2000,"integer")
# copy all the gene ids into the gene names: boring, but easy to follow.
gnames <- gid
datalist <- list(desc=desc, trainin=trainin, trainout=trainout, testin=testin,
                 testout=testout, gnames=gnames, gid=gid)
return(datalist)
} # end of function randomNSCcreate
#----------------------------------------------
#----------------------------------------------
# a function to run 2-level external cross-validation with the NSC classifier using PAMR
propercvNSC <- function(dl) {
# notes:
# dl = datalist = list(desc, trainin, trainout, testin, testout, gnames, gid, K, verbose).
# desc is a description of the data set being used.
# testin and testout can be NULL. If both are not null, the procedure tries to
# do a predictive performance check of the trained classifier on the test set.
# list the unique classes in a vector
classes <- unique(dl$trainout)
# Now replaced later with classes2 - has the same ordering as the splitrids list
# find the number of classes
numclasses <- length(classes)
N <- length(dl$trainout)   # number of observations in dataset.
# dl$K = the number of folds of cross-validation.
# with the balanced cv code, one can choose K = anything <= N and it should work.
if (dl$K>N) stop("You have set K>N - choose a smaller value for K (number of cv folds)")
sqrtK <- sqrt(dl$K)
r <- vector("numeric",dl$K)                # overall error rate for each fold
cerr <- matrix(0,dl$K,numclasses)          # number of class errors for each fold
counts_cerr <- matrix(0,dl$K,numclasses)   # number of class members for each fold
rcerr <- matrix(0,dl$K,numclasses)         # class error rates for each fold
# note: putting 0 first automatically initialises each to zero (in R)
nogenes <- vector("numeric",dl$K)          # optimal no of genes for each fold
foldsize <- vector("numeric",dl$K)         # size of each fold
####################################
# balanced cv: working out the fold elements
indices <- 1:N                             # indices for the data
rindices <- sample(indices, N, replace=F)  # a randomised version of the indices
# determine the number of data points in each class
class_size <- vector("numeric",numclasses)
splitrids <- split(rindices,dl$trainout[rindices])
# splits rindices into as many classes as there are, based on the value of trainout.
# The split order needs to be the same as the order of "classes":
# the result is stored in classes2, which is used throughout.
classes2 <- NULL
for (p in 1:numclasses) {
  class_size[p] <- length(splitrids[[p]])
  classes2 <- c(classes2, dl$trainout[splitrids[[p]][1]])
}
# sort the data by its class
# easiest way seems to be to join up the split data
sindices <- NULL
for (i in 1:numclasses) sindices <- c(sindices,splitrids[[i]])
# make the bins: numbins bins of size (exactly) dl$K each
# each bin will contribute 1 point to each fold (randomly)
numbins <- floor(N/dl$K)
binallocend <- dl$K*numbins
# now create the folds by sampling without replacement from the bins
# note that each bin (bindices[[j]]) has exactly dl$K elements
bindices <- list()
for (j in 1:numbins) {
  bindices[[j]] <- sindices[(((j-1)*dl$K)+1):(j*dl$K)]
}
# sample without replacement from the bins to create the folds
findices <- list()
findices[[dl$K+1]] <- 0   # a dummy value to set up an empty list of the right length
for (j in 1:numbins) {
  foldallocs <- sample(1:dl$K, replace=F)
  for (i in 1:dl$K) {
    findices[[foldallocs[i]]] <- c(findices[[foldallocs[i]]],bindices[[j]][i])
  }
}
# also effectively make 1 more bin (if necessary) of size N mod dl$K
# allocate the contents of this extra bin randomly to folds, with each fold
# receiving at most one point.
numleft <- N-binallocend
if (numleft>0) {
  # send the sindices[(binallocend+1):N] to folds at random
  # how to: select numleft points without replacement from 1:dl$K (the fold indices)
  leftalloc <- sample(1:dl$K,numleft, replace=F)
  for (i in 1:numleft) {
    findices[[leftalloc[i]]] <- c(findices[[leftalloc[i]]],sindices[binallocend+i])
    # add point (index) binallocend+i to fold[leftalloc[i]]
  }
}
######################
# estimating the misclassification rate of the NSC classifier
# note: the routine to create the final classifier would be simpler -
# it would not have any external cross-validation.
for (i in 1:dl$K) {
  foldsize[i] <- length(findices[[i]])
  b <- findices[[i]]
  # b is the subset of indices of the fold of data to be left out in the external cv
  # external cross validation: step i
  notb <- setdiff(rindices,b)
  # notb is the remaining data indices (folds): the data indexed by these
  # folds is to be the training set.
  # we train on this data by internal cross-validation to find the optimal threshold.
  cvtrainin <- dl$trainin[,notb]
  cvtrainout <- dl$trainout[notb]
  cvtestin <- dl$trainin[,b]
  cvtestin <- as.matrix(cvtestin)
  cvtestout <- dl$trainout[b]
  data <- list(x = cvtrainin, y = cvtrainout, genenames = dl$gnames, geneid = dl$gid,
               samplelabels = NULL, batchlabels = NULL)
  # sets up the data list of vectors and matrices as needed by PAMR
  # Note: I make no use of sample or batch labels, so I am ignoring any that
  # may have been read in.
  NSC.trained <- pamr.train(data)
  # this is the way this seems to work with PAMR: you give it all the training data
  # to set up with, then do (internal) CV to choose the best threshold
  # find optimal value of the threshold (optthresh)
  NSC.results <- pamr.cv(NSC.trained, data)
  minerr <- min(NSC.results$error)
  minindex <- 1
  numthresholds <- length(NSC.results$error)
  for (j in 1:numthresholds) {
    if (minerr==NSC.results$error[j]) minindex <- j
  }
  optthresh <- NSC.results$threshold[minindex]
  # so we have now used 1-level CV to find the optimal threshold on the training data
  # and have set all other parameters of the NSC classifier
  nogenes[i] <- NSC.results$size[minindex]
  # do a 2nd (outer) level CV test to get a predictive accuracy
  # predict response classes using this NSC classifier on the test data
  NSC.test <- pamr.predict(NSC.trained,cvtestin,optthresh)
  # find out the error rate on this test set
  # report 3 types of error: simple, per class and average across classes (as per Wessels05).
  # simple: r ; per class: rcerr ; avg across classes: oneovrtcerr
  rcv <- 0   # rcv will hold the number of errors made on this (i th) fold
  for (j in 1:foldsize[i]) {
    if (NSC.test[j] != cvtestout[j]) rcv <- rcv + 1
    # add 1 to rcv when NSC.test (the prediction) differs from cvtestout (the 'truth')
    # assign a class number to the output class (it will be unique)
    class <- which(classes2==cvtestout[j])
    if (NSC.test[j] != cvtestout[j]) cerr[i,class] <- cerr[i,class] + 1
    counts_cerr[i,class] <- counts_cerr[i,class] + 1
  }
  r[i] <- rcv/foldsize[i]
  for (l in 1:numclasses) {
    if (counts_cerr[i,l]>0) rcerr[i,l] <- cerr[i,l]/counts_cerr[i,l] else rcerr[i,l] <- NA
    # print(paste("i,l,counts_cerr,cerr =",i,l,counts_cerr[i,l],cerr[i,l]))
  }
}
# ravg is the simple estimate of classifier error obtained using
# (proper) external cross-validation
ravg <- mean(r)
# Didn't report this. Did use r for se calculations though.
ser <- sd(r)/sqrtK   # est of std error of the simple overall error rate
sercerr <- vector("numeric",numclasses)   # est of std error of the class error rates
for (l in 1:numclasses) {
  sercerr[l] <- sd(rcerr[,l],na.rm=TRUE)/sqrtK
}
# now get the mean of each class error rate (over the cv folds)
ovrcerr <- vector("numeric",numclasses)   # mean class error rates
for (j in 1:numclasses) {
  ovrcerr[j] <- mean(rcerr[,j],na.rm=TRUE)
}
# oneovrcerr is the overall average class error rate
# (averaging over the various classes after averaging over the folds
# to get each class average)
oneovrcerr <- mean(ovrcerr)
# Only used to obtain std err of avg of class error rates
# (by checking variation in these over the folds).
ovrcerr2 <- vector("numeric",dl$K)   # mean fold error rates
for (j in 1:dl$K) {
  ovrcerr2[j] <- mean(rcerr[j,],na.rm=TRUE)
}
# calculate std err of oneovrtcerr
seovrcerr2 <- sd(ovrcerr2)/sqrtK
trcerr <- vector("numeric",numclasses)   # error rate on each class
tcount <- vector("numeric",numclasses)   # count of observations in each class
tcerr <- vector("numeric",numclasses)    # count of errors made on each class
for (l in 1:numclasses) {
  tcount[l] <- sum(counts_cerr[,l])
  tcerr[l] <- sum(cerr[,l])
  if (tcount[l]>0) trcerr[l] <- tcerr[l]/tcount[l] else trcerr[l] <- NA
  # can only happen if there are no examples of this class: probably impossible by definition.
}
oneovrtcerr <- mean(trcerr,na.rm=TRUE)
avgnogenes <- mean(nogenes)
senogenes <- sd(nogenes)/sqrtK
total_errors <- sum(cerr)
simplesterate <- total_errors/N
if (dl$verbose) {
  print(dl$desc)   # give the description of the data
  print(paste("N (number of sample points) =",N))
  print(paste("K (number of cv folds) =",dl$K))
  print(paste("class_size =",class_size))
  print(paste("foldsize =",foldsize))
  print(paste("total number of errors =",total_errors))
  print(paste("simplest error rate =",simplesterate))
  print(paste("r =",r))
  print(paste("Average simple error rate (over cv folds): ravg =",ravg))
  print(paste("numclasses =",numclasses))
  print(paste("Class error rates avgd over cv folds: ovrcerr =",ovrcerr))
  print(paste("Avg of class error rate avgs: oneovrcerr =",oneovrcerr))
  print(paste("Class error rates (best method): trcerr =",trcerr))
  print(paste("Average class error rate (over cv folds - best method): oneovrtcerr =",oneovrtcerr))
  print(paste("Numbers of genes used =",nogenes))
  print(paste("Average number of genes used =",avgnogenes))
  print(paste("----------"))
}
# predictive part: the classifier is trained on the whole of the training data,
# then tested on the test data.
# current version: test results are not returned by propercvNSC().
test_total_errors <- NA
test_simplesterate <- NA
testr <- NA
test_rcerr <- NA
test_oneovrcerr <- NA
test_oneovrtcerr <- NA
test_nogenes <- NA
# return the above if there is no test set
if (!is.null(dl$testin) & !is.null(dl$testout)) {
  # do the predictive check
  test_class_size <- vector("numeric",numclasses)
  splittestout <- split(dl$testout,dl$testout)
  for (p in 1:numclasses) {
    test_class_size[p] <- length(splittestout[[p]])
  }
  datat <- list(x = dl$trainin, y = dl$trainout, genenames = dl$gnames, geneid = dl$gid,
                samplelabels = NULL, batchlabels = NULL)
  NSC.trained_test <- pamr.train(datat)
  NSC.results_test <- pamr.cv(NSC.trained_test, datat)
  minerr <- min(NSC.results_test$error)
  minindex <- 1
  numthresholds <- length(NSC.results_test$error)
  for (j in 1:numthresholds) {
    if (minerr==NSC.results_test$error[j]) minindex <- j
  }
  optthresh <- NSC.results_test$threshold[minindex]
  test_nogenes <- NSC.results_test$size[minindex]
  NSC.test_test <- pamr.predict(NSC.trained_test,dl$testin,optthresh)
  test_rcv <- 0   # test_rcv will hold the number of errors made on the test dataset
  testsetsize <- length(dl$testout)
  test_cerr <- vector("numeric",numclasses)
  test_counts_cerr <- vector("numeric",numclasses)
  for (j in 1:testsetsize) {
    if (NSC.test_test[j] != dl$testout[j]) test_rcv <- test_rcv + 1
    class <- which(classes2==dl$testout[j])
    # class <- which(classes==dl$testout[j])
    if (NSC.test_test[j] != dl$testout[j]) test_cerr[class] <- test_cerr[class] + 1
    test_counts_cerr[class] <- test_counts_cerr[class] + 1
  }
  testr <- test_rcv/testsetsize
  test_rcerr <- vector("numeric",numclasses)
  for (l in 1:numclasses) {
    if (test_counts_cerr[l]>0) test_rcerr[l] <- test_cerr[l]/test_counts_cerr[l] else test_rcerr[l] <- NA
    # print(paste("l,test_counts_cerr,test_cerr =",l,test_counts_cerr[l],test_cerr[l]))
  }
  # only need test_rcerr for class error rates
  # no need for ovrcerr (no folds to average over)
  # test_oneovrcerr is the overall average class error rate
  test_oneovrcerr <- mean(test_rcerr,na.rm=TRUE)
  test_trcerr <- vector("numeric",numclasses)
  for (l in 1:numclasses) {
    if (test_counts_cerr[l]>0) test_trcerr[l] <- test_cerr[l]/test_counts_cerr[l] else test_trcerr[l] <- NA
  }
  test_oneovrtcerr <- mean(test_trcerr,na.rm=TRUE)
  test_total_errors <- sum(test_cerr)
  test_simplesterate <- test_total_errors/testsetsize
  if (dl$verbose) {
    print(paste("testsetsize =",testsetsize))
    print(paste("test_class_size =",test_class_size))
    print(paste("test_nogenes =",test_nogenes))
    print(paste("total number of test errors =",test_total_errors))
    print(paste("simplest test error rate =",test_simplesterate))
    print(paste("test_rcv =",test_rcv))
    print(paste("testr =",testr))
    print(paste("test_rcerr =",test_rcerr))
    print(paste("test_oneovrcerr =",test_oneovrcerr))
    print(paste("test_oneovrtcerr =",test_oneovrtcerr))
    # note: these two produce the same results.
    print(paste("==============================="))
  }
}
return(list(tcerr=tcerr, tcount=tcount, trcerr=trcerr, simplesterate=simplesterate,
            oneovrtcerr=oneovrtcerr, rcerr=rcerr, r=r, ovrcerr2=ovrcerr2,
            sercerr=sercerr, ser=ser, seovrcerr2=seovrcerr2,
            avgnogenes=avgnogenes, nogenes=nogenes, senogenes=senogenes))
}
# needed improvement: add test set results
# end of function propercvNSC
#----------------------------------------
permute <- function(dl) {
# randomly permute the responses (class labels)
# dl: datalist
permtrainout <- sample(dl$trainout, replace=F)
dl$trainout <- permtrainout
return(dl)
}
frameres <- function(NSClist,K,numclasses) {
ll <- listres(NSClist,K,numclasses)
df <- as.data.frame(t(ll))
# make the names header row for the data frame
header <- c("simplesterate","ser","oneovrtcerr","seovrcerr2","avgnogenes","senogenes")
for (i in 1:numclasses) {
  header <- c(header,paste("tcount[",i,"]",sep=""))
}
for (i in 1:numclasses) {
  header <- c(header,paste("tcerr[",i,"]",sep=""))
}
for (i in 1:numclasses) {
  header <- c(header,paste("trcerr[",i,"]",sep=""))
}
for (i in 1:numclasses) {
  header <- c(header,paste("sercerr[",i,"]",sep=""))
}
for (i in 1:K) {
  header <- c(header,paste("r[",i,"]",sep=""))
}
for (i in 1:K) {
  header <- c(header,paste("ovrcerr2[",i,"]",sep=""))
}
for (i in 1:K) {
  header <- c(header,paste("nogenes[",i,"]",sep=""))
}
for (j in 1:numclasses) {
  for (i in 1:K) {
    header <- c(header,paste("rcerr[",i,",",j,"]",sep=""))
  }
}
# puts the header as the names - it will be the top row in a .csv file,
# with ll as the 2nd row.
# should be ok to put permutation rows below this somehow.
names(df) <- header
# add rows to the data frame: use rbind
return(df)
}
listres <- function(NSClist,K,numclasses) {
# writing out NSClist as a list
ll <- NULL
ll <- c(NSClist$simplesterate,NSClist$ser,NSClist$oneovrtcerr,NSClist$seovrcerr2,
        NSClist$avgnogenes,NSClist$senogenes)
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$tcount[i])
}
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$tcerr[i])
}
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$trcerr[i])
}
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$sercerr[i])
}
for (i in 1:K) {
  ll <- c(ll,NSClist$r[i])
}
for (i in 1:K) {
  ll <- c(ll,NSClist$ovrcerr2[i])
}
for (i in 1:K) {
  ll <- c(ll,NSClist$nogenes[i])
}
for (j in 1:numclasses) {
  for (i in 1:K) {
    ll <- c(ll,NSClist$rcerr[i,j])
  }
}
return(ll)
}
writeresults <- function(NSCtable,desc) {
# write results to a csv file
fname <- paste("NSCresults_",desc,"_",Sys.Date(),".csv",sep="")
write.table(NSCtable,file=fname,sep=",",col.names=NA)
}
cv1NSC <- function(dl) {
# notes:
# dl = datalist = list(desc, trainin, trainout, testin, testout, gnames, gid, K, verbose).
# desc is a description of the data set being used.
# list the unique classes in a vector
classes <- unique(dl$trainout)
numclasses <- length(classes)
N <- length(dl$trainout)   # total number of data points
if (dl$K>N) stop("You have set K>N - choose a smaller value for K (number of cv folds)")
sqrtK <- sqrt(dl$K)
cerr <- vector("numeric",numclasses)        # number of class errors
tcount <- vector("numeric",numclasses)      # number of class members
rcerr <- vector("numeric",numclasses)       # class error rates
finalnogenes <- NA                          # optimal number of genes (for final classifier)
nogenes <- vector("numeric",dl$K)           # optimal no of genes for each fold
r <- vector("numeric",dl$K)                 # overall error rate for each fold
fcerr <- matrix(0,dl$K,numclasses)          # number of class errors for each fold
fcounts_cerr <- matrix(0,dl$K,numclasses)   # number of class members for each fold
frcerr <- matrix(0,dl$K,numclasses)         # class error rates for each fold
####################################
indices <- 1:N                              # indices for the data
rindices <- sample(indices, N, replace=F)   # a randomised version of the indices
# determine the number of data points in each class
class_size <- vector("numeric",numclasses)
splitrids <- split(rindices,dl$trainout[rindices])
# splits rindices into as many classes as there are, based on the value of trainout.
# The split order needs to be the same as the order of "classes":
# the result is stored in classes2, which is used throughout.
classes2 <- NULL
for (p in 1:numclasses) {
  class_size[p] <- length(splitrids[[p]])
  classes2 <- c(classes2, dl$trainout[splitrids[[p]][1]])
}
######################
# estimating the misclassification rate for our NSC classifier
# using 1-level external cv (i.e. as given by simple use of PAMR)
data <- list(x = dl$trainin, y = dl$trainout, genenames = dl$gnames, geneid = dl$gid,
             samplelabels = NULL, batchlabels = NULL)
# sets up the data list of vectors and matrices as needed by PAMR
NSC.trained <- pamr.train(data)
# this is the way this seems to work with PAMR: you give it all the training data
# to set up with, then do CV to choose the best threshold
# find optimal value of the threshold (optthresh)
NSC.results <- pamr.cv(NSC.trained, data)
findices <- NSC.results$folds
minerr <- min(NSC.results$error)
minindex <- 1
numthresholds <- length(NSC.results$error)
for (j in 1:numthresholds) {
  if (minerr==NSC.results$error[j]) minindex <- j
}
optthresh <- NSC.results$threshold[minindex]
# so we have now used 1-level CV to find the optimal threshold on the training
# data and have set all other parameters of the NSC classifier
finalnogenes <- NSC.results$size[minindex]
for (j in 1:N) {
  class <- which(classes2==NSC.results$y[j])
  tcount[class] <- tcount[class] + 1
  if (NSC.results$y[j] != NSC.results$yhat[j,minindex]) {
    cerr[class] <- cerr[class] + 1
  }
}
for (l in 1:numclasses) {
  if (tcount[l]>0) rcerr[l] <- cerr[l]/tcount[l] else rcerr[l] <- NA
}
oneovrtcerr <- mean(rcerr,na.rm=TRUE)
# work out the standard errors of the cv error estimates based on the values
# on each fold.
foldsize <- vector("numeric",dl$K)   # size of each fold
for (i in 1:dl$K) {   # K iterations: over folds.
  rcv <- 0   # rcv = count of errors in a fold.
  foldsize[i] <- length(findices[[i]])
  for (j in 1:foldsize[i]) {
    # assign a class number to the output class (it will be unique)
    class <- which(classes2==NSC.results$y[findices[[i]][j]])
    if (NSC.results$y[findices[[i]][j]] != NSC.results$yhat[findices[[i]][j],minindex]) {
      rcv <- rcv + 1
      # add 1 to rcv when the prediction yhat[j] differs from the true label y[j]
      fcerr[i,class] <- fcerr[i,class] + 1
      # add 1 to the error count of class "class" in fold i
    }
    fcounts_cerr[i,class] <- fcounts_cerr[i,class] + 1
    # add 1 to the total count of class "class" in fold i
  }
  r[i] <- rcv/foldsize[i]
  for (l in 1:numclasses) {
    if (fcounts_cerr[i,l]>0) frcerr[i,l] <- fcerr[i,l]/fcounts_cerr[i,l] else frcerr[i,l] <- NA
  }
}
sercerr <- vector("numeric",numclasses)   # est of std error of the class error rates
for (l in 1:numclasses) {
  sercerr[l] <- sd(frcerr[,l],na.rm=TRUE)/sqrtK
}
# debugging
print(paste("fcerr = ",fcerr))
print(paste("fcounts_cerr = ",fcounts_cerr))
print(paste("frcerr = ",frcerr))
print(paste("rcerr = ",rcerr))
# est of std error of the simple overall error rate
ser <- sd(r)/sqrtK
# obtain std err of avg of class error rates by checking variation in these over the folds.
ovrcerr2 <- vector("numeric",dl$K)   # mean fold error rates
for (j in 1:dl$K) {
  ovrcerr2[j] <- mean(frcerr[j,],na.rm=TRUE)
}
# calculate std err of oneovrtcerr
seovrcerr2 <- sd(ovrcerr2)/sqrtK
for (i in 1:dl$K) {
  nogenes[i] <- NSC.results$cv.objects[[i]]$nonzero[minindex]
}
avgnogenes <- mean(nogenes)
senogenes <- sd(nogenes)/sqrtK
#return(list(simplesterate=minerr,oneovrtcerr=oneovrtcerr,avgnogenes=avgnogenes,numclasses=numclasses,tcount=tcount,cerr=cerr,rcerr=rcerr))
return(list(tcerr=cerr, tcount=tcount, trcerr=rcerr, simplesterate=minerr,
            oneovrtcerr=oneovrtcerr, rcerr=frcerr, r=r, ovrcerr2=ovrcerr2,
            sercerr=sercerr, ser=ser, seovrcerr2=seovrcerr2,
            avgnogenes=avgnogenes, nogenes=nogenes, senogenes=senogenes))
# end of function cv1NSC
}
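#----------------------------------------------
#----------------------------------------------
# A minimal usage sketch (an assumed workflow, not part of the original script):
# run 2-level external cv on the Khan data, then a few permutation replicates,
# and write everything to a csv file. The values of K, verbose and nperm below
# are illustrative choices only; cv1NSC(dl) could be substituted for propercvNSC(dl)
# to use 1-level cv instead.
#dl <- KhanNSCread()        # or SharmaNSCread() or randomNSCcreate(60)
#dl$K <- 10                 # number of external cv folds
#dl$verbose <- TRUE
#numclasses <- length(unique(dl$trainout))
#NSCres <- propercvNSC(dl)
#NSCtable <- frameres(NSCres, dl$K, numclasses)
#nperm <- 3                 # number of label permutations (illustrative only)
#for (p in 1:nperm) {
#  pdl <- permute(dl)       # randomly permute the class labels
#  permres <- propercvNSC(pdl)
#  NSCtable <- rbind(NSCtable, listres(permres, dl$K, numclasses))
#}
#writeresults(NSCtable, dl$desc)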