# R code for permutation and classification
# Written by Ian Wood for a paper with Peter Visscher and Kerrie Mengersen.
# Specifically to perform 1-level and 2-level cross-validation
# and permutation experiments on datasets based on those used by
# Sharma et al. and Khan et al., and some random uninformative data.
# refs:
# P. Sharma et al., "Early detection of breast cancer based on gene-expression
# patterns in peripheral blood cells", Breast Cancer Research, 7:R634-R644, 2005.
# J. Khan et al., "Classification and diagnostic prediction of cancers using
# gene expression profiling and artificial neural networks",
# Nature Medicine, 7(6):673-679, 2001.
# started around 1/10/05, placed on permr web page 25/10/06.
#
#----------------------------------------------
# pamr data format: a list containing the following components:
# x - the covariates (usually gene expression levels)
# y - the responses (labels or classes)
# geneid - a vector of short strings each describing an id for the gene, e.g. "gene12"
# genenames - a vector of longer strings each giving some description of a gene,
#             e.g. "catenin (cadherin-a)"
# samplelabels - a vector of sample labels, i.e. some sort of descriptive
#                (statistically irrelevant) string for each sample.
# batchlabels - a vector of batch labels, if provided. These indicate which batch
#               a particular sample has come from. This is statistically relevant and
#               can be handled by pamr.batchadjust.
# Here I ignore the batch labels.
# There are no batch labels for the Khan data and our random data.
# There are batch labels for the Sharma data, but by using the
# batch-corrected data, these can be ignored.
#----------------------------------------------
#----------------------------------------------
# functions available here:
# function_name(parameters): return_type
# get ready to run other functions
#NSCstartup()
# read in Khan data
#KhanNSCread(): dataliste
# read in Sharma data
#SharmaNSCread(): dataliste
# create random data
#randomNSCcreate(sample_size): dataliste
# fit NSC to data and evaluate using 2-level (double) external cv
#propercvNSC(datalist): NSClist
# randomly permute the responses (class labels)
#permute(dl): datalist
# create a header row for the data frame for 2-level cv
#frameres(NSClist,K,numclasses): results dataframe
# write out NSClist as a list: 2-level cv
#listres(NSClist,K,numclasses): results list
# write results to a csv file
#writeresults(NSCtable,desc)
# fit NSC to data and evaluate using 1-level (single) external cv
#cv1NSC(dl): NSClist2
# data structures used in code:
#dataliste = (desc, trainin, trainout, testin, testout, gnames, gid)
#datalist  = (desc, trainin, trainout, testin, testout, gnames, gid, K, verbose)
#NSClist   = (tcerr, tcount, trcerr, simplesterate, oneovrtcerr, rcerr, r, ovrcerr2,
#             sercerr, ser, seovrcerr2, avgnogenes, nogenes, senogenes)
#NSClist2  = the same fields as NSClist, as returned by cv1NSC
#----------------------------------------------
#----------------------------------------------
# code for all cases:
# load up the PAMR library
# This needs to be installed and is available from CRAN.
library('pamr')
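#----------------------------------------------
# For reference, a minimal sketch of the pamr data format described above.
# This is illustrative only (hypothetical toy values, not part of any dataset
# used here): 3 genes x 4 samples, a binary response and no batch labels.
#example.data <- list(x = matrix(rnorm(12), nrow=3, ncol=4),
#                     y = c("0","1","0","1"),
#                     geneid = c("gene1","gene2","gene3"),
#                     genenames = c("catenin (cadherin-a)","gene two","gene three"),
#                     samplelabels = c("s1","s2","s3","s4"),
#                     batchlabels = NULL)
#----------------------------------------------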
# set the random number seed to be something repeatable.
# 521 is used in the R manual. Any other integer value can be used instead.
set.seed(521)
#----------------------------------------------
#----------------------------------------------
KhanNSCread <- function() {
# read in the Khan data set
#khan.data <- pamr.from.excel("data_Khan/khan.txt", 65, sample.labels=TRUE)
# the above works for the PAMR-supplied khan.txt file (training data only).
# The version below works with the full labelled dataset of 83 SRBCTs: khanc.txt,
# available on the permr website. The latter should be used for performing
# 2-level cv on the Khan dataset.
khan.data <- pamr.from.excel("data_Khan/khanc.txt", 85, sample.labels=TRUE)
# converts from excel csv format to pamr data format.
# the fields: filename, number of columns (# samples + 2), yes - includes sample labels.
# +2 because the first two columns of the data are geneid and genename.
desc <- "Khan"
trainin <- khan.data$x    # a matrix of the covariates for each sample point
trainout <- khan.data$y   # a vector of the response (label or class) for each sample point
# Have not used a separate test set here. 2-level external cv allows one to train
# and validly test using all possible labelled data.
testin <- NULL
testout <- NULL
# get the labels for the genes
gid <- khan.data$geneid
gnames <- khan.data$genenames
datalist <- list(desc=desc, trainin=trainin, trainout=trainout, testin=testin,
                 testout=testout, gnames=gnames, gid=gid)
return(datalist)
} # end of function KhanNSCread
#----------------------------------------------
#----------------------------------------------
SharmaNSCread <- function() {
# read in the Sharma60 data set
# this reads our cut-down version of the batch-corrected Sharma data with
# 1 data point per patient (60 data points in total) - randomly selected.
sharma.data <- pamr.from.excel("data_Sharma/sharmabatch1.txt", 62, sample.labels=TRUE,
                               batch.labels=TRUE)
# converts from excel csv format to pamr data format.
# the fields: filename, number of columns (# samples + 2), yes - includes sample labels.
# +2 because the first two columns of the data are geneid and genename.
desc <- "Sharma60"
trainin <- sharma.data$x    # a matrix of the covariates for each sample point
trainout <- sharma.data$y   # a vector of the response (label or class) for each sample point
testin <- NULL
testout <- NULL
# get the labels for the genes
gid <- sharma.data$geneid
gnames <- sharma.data$genenames
datalist <- list(desc=desc, trainin=trainin, trainout=trainout, testin=testin,
                 testout=testout, gnames=gnames, gid=gid)
return(datalist)
} # end of function SharmaNSCread
#----------------------------------------------
#----------------------------------------------
randomNSCcreate <- function(sample_size) {
# create a simulated random non-informative dataset, with:
# 2000 covariates (input variables) drawn independently from N(0,1)
# 1 binary response variable (output or label) which could mean e.g.
# "diseased" or "not diseased", drawn from a uniform distribution over {0,1},
# i.e. each from an independent Bernoulli trial with p = 0.5 .
# num_rand (labelled) sample points in the training set
# (like having num_rand patients with 1 sample point per patient)
# 2500 (labelled) sample points in the test set
# Note: these points are always the same if you use the same random number seed.
num_rand <- sample_size
desc <- paste("random", num_rand, sep="")
trainin <- array(rnorm(2000*num_rand,0,1), c(2000,num_rand))
testin <- array(rnorm(2000*2500,0,1), c(2000,2500))
A <- c(0,1)   # the set of possible responses
trainout <- sample(A,num_rand,replace=T)   # the set of training responses (outputs)
#trainout <- sample(rep(A,num_rand/2),num_rand,replace=F)
# the above (commented out) produces exactly the same number of responses from each class.
# Can be used to help check estimated error rate without the issue of imbalanced
# observed class sizes.
testout <- sample(A,2500,replace=T)   # the set of testing responses (outputs)
# get the labels for the genes ready (just 1:2000 numeric ids)
gid <- as.vector(1:2000,"integer")
# copy all the gene ids into the gene names: boring, but easy to follow.
gnames <- gid
datalist <- list(desc=desc, trainin=trainin, trainout=trainout, testin=testin,
                 testout=testout, gnames=gnames, gid=gid)
return(datalist)
} # end of function randomNSCcreate
#----------------------------------------------
#----------------------------------------------
# a function to run 2-level external cross-validation with the NSC classifier using PAMR
propercvNSC <- function(dl) {
# notes:
# dl = datalist = list(desc, trainin, trainout, testin, testout, gnames, gid, K, verbose).
# desc is a description of the data set being used.
# testin and testout can be NULL. If both are not null, the procedure tries to
# do a predictive performance check of the trained classifier on the test set.
# list the unique classes in a vector
classes <- unique(dl$trainout)
# Now replaced later with classes2 - has the same ordering as the splitrids list
# find the number of classes
numclasses <- length(classes)
N <- length(dl$trainout)   # number of observations in dataset.
# dl$K = the number of folds of cross-validation.
# with the balanced cv code, one can choose K = anything <= N and it should work.
if (dl$K>N) stop("You have set K>N - choose a smaller value for K (number of cv folds)")
sqrtK <- sqrt(dl$K)
r <- vector("numeric",dl$K)                # overall error rate for each fold
cerr <- matrix(0,dl$K,numclasses)          # number of class errors for each fold
counts_cerr <- matrix(0,dl$K,numclasses)   # number of class members for each fold
rcerr <- matrix(0,dl$K,numclasses)         # class error rates for each fold
# note: putting 0 first automatically initialises each to zero (in R)
nogenes <- vector("numeric",dl$K)          # optimal no of genes for each fold
foldsize <- vector("numeric",dl$K)         # size of each fold
####################################
# balanced cv: working out the fold elements
indices <- 1:N                             # indices for the data
rindices <- sample(indices, N, replace=F)  # a randomised version of the indices
# determine the number of data points in each class
class_size <- vector("numeric",numclasses)
splitrids <- split(rindices,dl$trainout[rindices])
# splits rindices into as many classes as there are, based on the value of trainout.
# The split order needs to be the same as the order of "classes":
# the result is stored in classes2, which is used throughout.
classes2 <- NULL
for (p in 1:numclasses) {
  class_size[p] <- length(splitrids[[p]])
  classes2 <- c(classes2, dl$trainout[splitrids[[p]][1]])
}
# sort the data by its class
# easiest way seems to be to join up the split data
sindices <- NULL
for (i in 1:numclasses) sindices <- c(sindices,splitrids[[i]])
# make the bins: numbins bins of size (exactly) dl$K each
# each bin will contribute 1 point to each fold (randomly)
numbins <- floor(N/dl$K)
binallocend <- dl$K*numbins
# now create the folds by sampling without replacement from the bins
# note that each bin (bindices[[j]]) has exactly dl$K elements
bindices <- list()
for (j in 1:numbins) {
  bindices[[j]] <- sindices[(((j-1)*dl$K)+1):(j*dl$K)]
}
# sample without replacement from the bins to create the folds
findices <- list()
findices[[dl$K+1]] <- 0   # a dummy value to set up an empty list of the right length
for (j in 1:numbins) {
  foldallocs <- sample(1:dl$K, replace=F)
  for (i in 1:dl$K) {
    findices[[foldallocs[i]]] <- c(findices[[foldallocs[i]]],bindices[[j]][i])
  }
}
# also effectively make 1 more bin (if necessary) of size N mod dl$K
# allocate the contents of this extra bin randomly to folds, with each fold
# receiving at most one point.
numleft <- N-binallocend
if (numleft>0) {
  # send the sindices[(binallocend+1):N] to folds at random
  # how to: select numleft points without replacement from 1:dl$K (the fold indices)
  leftalloc <- sample(1:dl$K,numleft, replace=F)
  for (i in 1:numleft) {
    findices[[leftalloc[i]]] <- c(findices[[leftalloc[i]]],sindices[binallocend+i])
    # add point (index) binallocend+i to fold[leftalloc[i]]
  }
}
######################
# estimating the misclassification rate of the NSC classifier
# note: the routine to create the final classifier would be simpler -
# it would not have any external cross-validation.
for (i in 1:dl$K) {
  foldsize[i] <- length(findices[[i]])
  b <- findices[[i]]
  # b is the subset of indices of the fold of data to be left out in the external cv
  # external cross validation: step i
  notb <- setdiff(rindices,b)
  # notb is the remaining data indices (folds): the data indexed by these
  # folds is to be the training set.
  # we train on this data by internal cross-validation to find the optimal threshold.
  cvtrainin <- dl$trainin[,notb]
  cvtrainout <- dl$trainout[notb]
  cvtestin <- dl$trainin[,b]
  cvtestin <- as.matrix(cvtestin)
  cvtestout <- dl$trainout[b]
  data <- list(x = cvtrainin, y = cvtrainout, genenames = dl$gnames, geneid = dl$gid,
               samplelabels = NULL, batchlabels = NULL)
  # sets up the data list of vectors and matrices as needed by PAMR
  # Note: I make no use of sample or batch labels, so I am ignoring any that
  # may have been read in.
  NSC.trained <- pamr.train(data)
  # this is the way this seems to work with PAMR: you give it all the training data
  # to set up with, then do (internal) CV to choose the best threshold
  # find optimal value of the threshold (optthresh)
  NSC.results <- pamr.cv(NSC.trained, data)
  minerr <- min(NSC.results$error)
  minindex <- 1
  numthresholds <- length(NSC.results$error)
  for (j in 1:numthresholds) {
    if (minerr==NSC.results$error[j]) minindex <- j
  }
  optthresh <- NSC.results$threshold[minindex]
  # so we have now used 1-level CV to find the optimal threshold on the training data
  # and have set all other parameters of the NSC classifier
  nogenes[i] <- NSC.results$size[minindex]
  # do a 2nd (outer) level CV test to get a predictive accuracy
  # predict response classes using this NSC classifier on the test data
  NSC.test <- pamr.predict(NSC.trained,cvtestin,optthresh)
  # find out the error rate on this test set
  # report 3 types of error: simple, per class and average across classes (as per Wessels05).
  # simple: r ; per class: rcerr ; avg across classes: oneovrtcerr
  rcv <- 0   # rcv will hold the number of errors made on this (i th) fold
  for (j in 1:foldsize[i]) {
    if (NSC.test[j] != cvtestout[j]) rcv <- rcv + 1
    # add 1 to rcv when NSC.test (the prediction) differs from cvtestout (the 'truth')
    # assign a class number to the output class (it will be unique)
    class <- which(classes2==cvtestout[j])
    if (NSC.test[j] != cvtestout[j]) cerr[i,class] <- cerr[i,class] + 1
    counts_cerr[i,class] <- counts_cerr[i,class] + 1
  }
  r[i] <- rcv/foldsize[i]
  for (l in 1:numclasses) {
    if (counts_cerr[i,l]>0) rcerr[i,l] <- cerr[i,l]/counts_cerr[i,l] else rcerr[i,l] <- NA
    # print(paste("i,l,counts_cerr,cerr =",i,l,counts_cerr[i,l],cerr[i,l]))
  }
}
# ravg is the simple estimate of classifier error obtained using
# (proper) external cross-validation
ravg <- mean(r)
# Didn't report this. Did use r for se calculations though.
ser <- sd(r)/sqrtK   # est of std error of the simple overall error rate
sercerr <- vector("numeric",numclasses)   # est of std error of the class error rates
for (l in 1:numclasses) {
  sercerr[l] <- sd(rcerr[,l],na.rm=TRUE)/sqrtK
}
# now get the mean of each class error rate (over the cv folds)
ovrcerr <- vector("numeric",numclasses)   # mean class error rates
for (j in 1:numclasses) {
  ovrcerr[j] <- mean(rcerr[,j],na.rm=TRUE)
}
# oneovrcerr is the overall average class error rate
# (averaging over the various classes after averaging over the folds
# to get each class average)
oneovrcerr <- mean(ovrcerr)
# Only used to obtain std err of avg of class error rates
# (by checking variation in these over the folds).
ovrcerr2 <- vector("numeric",dl$K)   # mean fold error rates
for (j in 1:dl$K) {
  ovrcerr2[j] <- mean(rcerr[j,],na.rm=TRUE)
}
# calculate std err of oneovrtcerr
seovrcerr2 <- sd(ovrcerr2)/sqrtK
trcerr <- vector("numeric",numclasses)   # error rate on each class
tcount <- vector("numeric",numclasses)   # count of observations in each class
tcerr <- vector("numeric",numclasses)    # count of errors made on each class
for (l in 1:numclasses) {
  tcount[l] <- sum(counts_cerr[,l])
  tcerr[l] <- sum(cerr[,l])
  if (tcount[l]>0) trcerr[l] <- tcerr[l]/tcount[l] else trcerr[l] <- NA
  # can only happen if there are no examples of this class: probably impossible by definition.
}
oneovrtcerr <- mean(trcerr,na.rm=TRUE)
avgnogenes <- mean(nogenes)
senogenes <- sd(nogenes)/sqrtK
total_errors <- sum(cerr)
simplesterate <- total_errors/N
if (dl$verbose) {
  print(dl$desc)   # give the description of the data
  print(paste("N (number of sample points) =",N))
  print(paste("K (number of cv folds) =",dl$K))
  print(paste("class_size =",class_size))
  print(paste("foldsize =",foldsize))
  print(paste("total number of errors =",total_errors))
  print(paste("simplest error rate =",simplesterate))
  print(paste("r =",r))
  print(paste("Average simple error rate (over cv folds): ravg =",ravg))
  print(paste("numclasses =",numclasses))
  print(paste("Class error rates avgd over cv folds: ovrcerr =",ovrcerr))
  print(paste("Avg of class error rate avgs: oneovrcerr =",oneovrcerr))
  print(paste("Class error rates (best method): trcerr =",trcerr))
  print(paste("Average class error rate (over cv folds - best method): oneovrtcerr =",oneovrtcerr))
  print(paste("Numbers of genes used =",nogenes))
  print(paste("Average number of genes used =",avgnogenes))
  print(paste("----------"))
}
# predictive part: the classifier is trained on the whole of the training data,
# then tested on the test data.
# current version: test results are not returned by propercvNSC().
test_total_errors <- NA
test_simplesterate <- NA
testr <- NA
test_rcerr <- NA
test_oneovrcerr <- NA
test_oneovrtcerr <- NA
test_nogenes <- NA
# return the above if there is no test set
if (!is.null(dl$testin) & !is.null(dl$testout)) {
  # do the predictive check
  test_class_size <- vector("numeric",numclasses)
  splittestout <- split(dl$testout,dl$testout)
  for (p in 1:numclasses) {
    test_class_size[p] <- length(splittestout[[p]])
  }
  datat <- list(x = dl$trainin, y = dl$trainout, genenames = dl$gnames, geneid = dl$gid,
                samplelabels = NULL, batchlabels = NULL)
  NSC.trained_test <- pamr.train(datat)
  NSC.results_test <- pamr.cv(NSC.trained_test, datat)
  minerr <- min(NSC.results_test$error)
  minindex <- 1
  numthresholds <- length(NSC.results_test$error)
  for (j in 1:numthresholds) {
    if (minerr==NSC.results_test$error[j]) minindex <- j
  }
  optthresh <- NSC.results_test$threshold[minindex]
  test_nogenes <- NSC.results_test$size[minindex]
  NSC.test_test <- pamr.predict(NSC.trained_test,dl$testin,optthresh)
  test_rcv <- 0   # test_rcv will hold the number of errors made on the test dataset
  testsetsize <- length(dl$testout)
  test_cerr <- vector("numeric",numclasses)
  test_counts_cerr <- vector("numeric",numclasses)
  for (j in 1:testsetsize) {
    if (NSC.test_test[j] != dl$testout[j]) test_rcv <- test_rcv + 1
    class <- which(classes2==dl$testout[j])
    # class <- which(classes==dl$testout[j])
    if (NSC.test_test[j] != dl$testout[j]) test_cerr[class] <- test_cerr[class] + 1
    test_counts_cerr[class] <- test_counts_cerr[class] + 1
  }
  testr <- test_rcv/testsetsize
  test_rcerr <- vector("numeric",numclasses)
  for (l in 1:numclasses) {
    if (test_counts_cerr[l]>0) test_rcerr[l] <- test_cerr[l]/test_counts_cerr[l] else test_rcerr[l] <- NA
    # print(paste("l,test_counts_cerr,test_cerr =",l,test_counts_cerr[l],test_cerr[l]))
  }
  # only need test_rcerr for class error rates
  # no need for ovrcerr (no folds to average over)
  # test_oneovrcerr is the overall average class error rate
  test_oneovrcerr <- mean(test_rcerr,na.rm=TRUE)
  test_trcerr <- vector("numeric",numclasses)
  for (l in 1:numclasses) {
    if (test_counts_cerr[l]>0) test_trcerr[l] <- test_cerr[l]/test_counts_cerr[l] else test_trcerr[l] <- NA
  }
  test_oneovrtcerr <- mean(test_trcerr,na.rm=TRUE)
  test_total_errors <- sum(test_cerr)
  test_simplesterate <- test_total_errors/testsetsize
  if (dl$verbose) {
    print(paste("testsetsize =",testsetsize))
    print(paste("test_class_size =",test_class_size))
    print(paste("test_nogenes =",test_nogenes))
    print(paste("total number of test errors =",test_total_errors))
    print(paste("simplest test error rate =",test_simplesterate))
    print(paste("test_rcv =",test_rcv))
    print(paste("testr =",testr))
    print(paste("test_rcerr =",test_rcerr))
    print(paste("test_oneovrcerr =",test_oneovrcerr))
    print(paste("test_oneovrtcerr =",test_oneovrtcerr))
    # note: these two produce the same results.
    print(paste("==============================="))
  }
}
return(list(tcerr=tcerr, tcount=tcount, trcerr=trcerr, simplesterate=simplesterate,
            oneovrtcerr=oneovrtcerr, rcerr=rcerr, r=r, ovrcerr2=ovrcerr2,
            sercerr=sercerr, ser=ser, seovrcerr2=seovrcerr2,
            avgnogenes=avgnogenes, nogenes=nogenes, senogenes=senogenes))
}
# needed improvement: add test set results
# end of function propercvNSC
#----------------------------------------
permute <- function(dl) {
# randomly permute the responses (class labels)
# dl: datalist
permtrainout <- sample(dl$trainout, replace=F)
dl$trainout <- permtrainout
return(dl)
}
frameres <- function(NSClist,K,numclasses) {
ll <- listres(NSClist,K,numclasses)
df <- as.data.frame(t(ll))
# make the names header row for the data frame
header <- c("simplesterate","ser","oneovrtcerr","seovrcerr2","avgnogenes","senogenes")
for (i in 1:numclasses) {
  header <- c(header,paste("tcount[",i,"]",sep=""))
}
for (i in 1:numclasses) {
  header <- c(header,paste("tcerr[",i,"]",sep=""))
}
for (i in 1:numclasses) {
  header <- c(header,paste("trcerr[",i,"]",sep=""))
}
for (i in 1:numclasses) {
  header <- c(header,paste("sercerr[",i,"]",sep=""))
}
for (i in 1:K) {
  header <- c(header,paste("r[",i,"]",sep=""))
}
for (i in 1:K) {
  header <- c(header,paste("ovrcerr2[",i,"]",sep=""))
}
for (i in 1:K) {
  header <- c(header,paste("nogenes[",i,"]",sep=""))
}
for (j in 1:numclasses) {
  for (i in 1:K) {
    header <- c(header,paste("rcerr[",i,",",j,"]",sep=""))
  }
}
# puts the header as the names - it will be the top row in a .csv file,
# with ll as the 2nd row.
# should be ok to put permutation rows below this somehow.
names(df) <- header
# add rows to the data frame: use rbind
return(df)
}
listres <- function(NSClist,K,numclasses) {
# writing out NSClist as a list
ll <- NULL
ll <- c(NSClist$simplesterate,NSClist$ser,NSClist$oneovrtcerr,NSClist$seovrcerr2,
        NSClist$avgnogenes,NSClist$senogenes)
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$tcount[i])
}
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$tcerr[i])
}
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$trcerr[i])
}
for (i in 1:numclasses) {
  ll <- c(ll,NSClist$sercerr[i])
}
for (i in 1:K) {
  ll <- c(ll,NSClist$r[i])
}
for (i in 1:K) {
  ll <- c(ll,NSClist$ovrcerr2[i])
}
for (i in 1:K) {
  ll <- c(ll,NSClist$nogenes[i])
}
for (j in 1:numclasses) {
  for (i in 1:K) {
    ll <- c(ll,NSClist$rcerr[i,j])
  }
}
return(ll)
}
writeresults <- function(NSCtable,desc) {
# write results to a csv file
fname <- paste("NSCresults_",desc,"_",Sys.Date(),".csv",sep="")
write.table(NSCtable,file=fname,sep=",",col.names=NA)
}
cv1NSC <- function(dl) {
# notes:
# dl = datalist = list(desc, trainin, trainout, testin, testout, gnames, gid, K, verbose).
# desc is a description of the data set being used.
# list the unique classes in a vector
classes <- unique(dl$trainout)
numclasses <- length(classes)
N <- length(dl$trainout)   # total number of data points
if (dl$K>N) stop("You have set K>N - choose a smaller value for K (number of cv folds)")
sqrtK <- sqrt(dl$K)
cerr <- vector("numeric",numclasses)        # number of class errors
tcount <- vector("numeric",numclasses)      # number of class members
rcerr <- vector("numeric",numclasses)       # class error rates
finalnogenes <- NA                          # optimal number of genes (for final classifier)
nogenes <- vector("numeric",dl$K)           # optimal no of genes for each fold
r <- vector("numeric",dl$K)                 # overall error rate for each fold
fcerr <- matrix(0,dl$K,numclasses)          # number of class errors for each fold
fcounts_cerr <- matrix(0,dl$K,numclasses)   # number of class members for each fold
frcerr <- matrix(0,dl$K,numclasses)         # class error rates for each fold
####################################
indices <- 1:N                              # indices for the data
rindices <- sample(indices, N, replace=F)   # a randomised version of the indices
# determine the number of data points in each class
class_size <- vector("numeric",numclasses)
splitrids <- split(rindices,dl$trainout[rindices])
# splits rindices into as many classes as there are, based on the value of trainout.
# The split order needs to be the same as the order of "classes":
# the result is stored in classes2, which is used throughout.
classes2 <- NULL
for (p in 1:numclasses) {
  class_size[p] <- length(splitrids[[p]])
  classes2 <- c(classes2, dl$trainout[splitrids[[p]][1]])
}
######################
# estimating the misclassification rate for our NSC classifier
# using 1-level external cv (i.e. as given by simple use of PAMR)
data <- list(x = dl$trainin, y = dl$trainout, genenames = dl$gnames, geneid = dl$gid,
             samplelabels = NULL, batchlabels = NULL)
# sets up the data list of vectors and matrices as needed by PAMR
NSC.trained <- pamr.train(data)
# this is the way this seems to work with PAMR: you give it all the training data
# to set up with, then do CV to choose the best threshold
# find optimal value of the threshold (optthresh)
NSC.results <- pamr.cv(NSC.trained, data)
findices <- NSC.results$folds
minerr <- min(NSC.results$error)
minindex <- 1
numthresholds <- length(NSC.results$error)
for (j in 1:numthresholds) {
  if (minerr==NSC.results$error[j]) minindex <- j
}
optthresh <- NSC.results$threshold[minindex]
# so we have now used 1-level CV to find the optimal threshold on the training
# data and have set all other parameters of the NSC classifier
finalnogenes <- NSC.results$size[minindex]
for (j in 1:N) {
  class <- which(classes2==NSC.results$y[j])
  tcount[class] <- tcount[class] + 1
  if (NSC.results$y[j] != NSC.results$yhat[j,minindex]) {
    cerr[class] <- cerr[class] + 1
  }
}
for (l in 1:numclasses) {
  if (tcount[l]>0) rcerr[l] <- cerr[l]/tcount[l] else rcerr[l] <- NA
}
oneovrtcerr <- mean(rcerr,na.rm=TRUE)
# work out the standard errors of the cv error estimates based on the values
# on each fold.
foldsize <- vector("numeric",dl$K)   # size of each fold
for (i in 1:dl$K) {   # K iterations: over folds.
  rcv <- 0   # rcv = count of errors in a fold.
  foldsize[i] <- length(findices[[i]])
  for (j in 1:foldsize[i]) {
    # assign a class number to the output class (it will be unique)
    class <- which(classes2==NSC.results$y[findices[[i]][j]])
    if (NSC.results$y[findices[[i]][j]] != NSC.results$yhat[findices[[i]][j],minindex]) {
      rcv <- rcv + 1
      # add 1 to rcv when the prediction yhat[j] differs from the true label y[j]
      fcerr[i,class] <- fcerr[i,class] + 1
      # add 1 to the error count of class "class" in fold i
    }
    fcounts_cerr[i,class] <- fcounts_cerr[i,class] + 1
    # add 1 to the total count of class "class" in fold i
  }
  r[i] <- rcv/foldsize[i]
  for (l in 1:numclasses) {
    if (fcounts_cerr[i,l]>0) frcerr[i,l] <- fcerr[i,l]/fcounts_cerr[i,l] else frcerr[i,l] <- NA
  }
}
sercerr <- vector("numeric",numclasses)   # est of std error of the class error rates
for (l in 1:numclasses) {
  sercerr[l] <- sd(frcerr[,l],na.rm=TRUE)/sqrtK
}
# debugging
print(paste("fcerr = ",fcerr))
print(paste("fcounts_cerr = ",fcounts_cerr))
print(paste("frcerr = ",frcerr))
print(paste("rcerr = ",rcerr))
# est of std error of the simple overall error rate
ser <- sd(r)/sqrtK
# obtain std err of avg of class error rates by checking variation in these over the folds.
ovrcerr2 <- vector("numeric",dl$K)   # mean fold error rates
for (j in 1:dl$K) {
  ovrcerr2[j] <- mean(frcerr[j,],na.rm=TRUE)
}
# calculate std err of oneovrtcerr
seovrcerr2 <- sd(ovrcerr2)/sqrtK
for (i in 1:dl$K) {
  nogenes[i] <- NSC.results$cv.objects[[i]]$nonzero[minindex]
}
avgnogenes <- mean(nogenes)
senogenes <- sd(nogenes)/sqrtK
#return(list(simplesterate=minerr,oneovrtcerr=oneovrtcerr,avgnogenes=avgnogenes,numclasses=numclasses,tcount=tcount,cerr=cerr,rcerr=rcerr))
return(list(tcerr=cerr, tcount=tcount, trcerr=rcerr, simplesterate=minerr,
            oneovrtcerr=oneovrtcerr, rcerr=frcerr, r=r, ovrcerr2=ovrcerr2,
            sercerr=sercerr, ser=ser, seovrcerr2=seovrcerr2,
            avgnogenes=avgnogenes, nogenes=nogenes, senogenes=senogenes))
# end of function cv1NSC
}
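#----------------------------------------------
#----------------------------------------------
# A minimal usage sketch (an assumed workflow, not part of the original script):
# run 2-level external cv on the Khan data, then a few permutation replicates,
# and write everything to a csv file. The values of K, verbose and nperm below
# are illustrative choices only; cv1NSC(dl) could be substituted for propercvNSC(dl)
# to use 1-level cv instead.
#dl <- KhanNSCread()        # or SharmaNSCread() or randomNSCcreate(60)
#dl$K <- 10                 # number of external cv folds
#dl$verbose <- TRUE
#numclasses <- length(unique(dl$trainout))
#NSCres <- propercvNSC(dl)
#NSCtable <- frameres(NSCres, dl$K, numclasses)
#nperm <- 3                 # number of label permutations (illustrative only)
#for (p in 1:nperm) {
#  pdl <- permute(dl)       # randomly permute the class labels
#  permres <- propercvNSC(pdl)
#  NSCtable <- rbind(NSCtable, listres(permres, dl$K, numclasses))
#}
#writeresults(NSCtable, dl$desc)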