# $Id: fossils.R,v 1.20 2007/11/06 06:27:33 kaip Exp $ # # Copyright (c) 2007 Kai Puolamaki # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # # # T-61.3050 Machine Learning: Basic Principles # For problem session 7/2007 # Reference: Alpaydin, 2004. Introduction to Machine Learning. The MIT # Press. # We use function maColorBar to get a color bar. For that we # need BioConductor which can be installed as follows: # > source("http://bioconductor.org/biocLite.R",echo=TRUE) # > biocLite() # See http://bioconductor.org/docs/install-howto.html library(marray) D <- read.table("http://www.cis.hut.fi/Opinnot/T-61.3050/2007/fossils.txt") # Extract the taxa and sites: taxa <- colnames(D)[4:142] sites <- rownames(D) # Make a matrix, for convenience: X <- as.matrix(D[sites,taxa]) # Make a pretty image: #pdf("fossils.pdf") image(1:139,1:124, t(X), #take transpose col=gray(1:0), #ones in black xlab="taxa",ylab="fossil sites", main="Cenozoic Large Land Mammals") #dev.off() ######################################################################### # Naive Bayes binary classifier. # See Alpaydin (2004), subsection 5.7 (pages 99-100), for reference. # Training phase: # Input: # class vector # 0/1 covariate vectors, corresponding to classes # Output: # parameters of the classifier # Prediction phase: # Input: # 0/1 covariate vectors # parameteres of the classifier # Output: # predicted classes # Train the classifier: # Input: # C n-dimensional vector containing the K classes. # X nXm binary matrix, rows containing the covariates. # alpha Prior count parameter (alpha=1 by default). # Output: # pars list containing the parameters of the classifier # pars$p Kxm matrix, containing the Bernoulli parameters (eq. # 5.13 of Alpaydin (2004)) # pars$pc K-dimensional vector, containing the class probabilities. trainNB <- function(C,X,alpha=1) { n <- dim(X)[1] m <- dim(X)[2] classes <- unique(C) K <- length(classes) # number of classes # pc or $\hat P(C)$ is essentially the frequency of classes. # We add a prior observation count $\alpha$ for each class (see # the discussion below). pc <- (alpha+table(C))/(K*alpha+length(C)) # If there are no covariates: if(m==0) return(list(p=NA,pc=pc)) # p[i,j] of $\hat p_{ij}$ is the frequency of covariate j in class i. p <- matrix(NA, nrow=K,ncol=m, dimnames=list(classes,colnames(X))) for(i in 1:K) { for(j in 1:m) { # To avoid zero probabilities we modify the equation 5.31 of Alpaydin # (2004) as follows: # \[ # \hat p_{ij}=(\alpha+\sum_t{x^t_j r^t_i})/(2\alpha+\sum_t{r^t_i}), # \] # where $\alpha$ is a positive constant (for example, $\alpha=1$). # This makes the equation work also for no data, in which case we get # $\hat p_{ij}=1/2$. The $\alpha$ can be interpeted as coming from # a MAP solution, when posterior favours solution with probability # of one half. $\alpha$ can be also interpreted as prior count. # (Here p[i,j] is $\hat p_{ij}$.) p[i,j] <- (alpha+sum(X[C==classes[i],j]))/(2*alpha+sum(C==classes[i])) } } list(p=p,pc=pc) } # Use the classifier: # Input: # X n'Xm binary matrix, rows containing the covariates # pars list containing the parameters of the classifier (see trainNB). # Output: # C predicted classes. classifyNB <- function(X,pars) { # Coerce X to matrix: n <- dim(X)[1] m <- dim(X)[2] K <- length(pars$pc) C <- rep(NA,n) p <- pars$p pc <- pars$pc classes <- rownames(pc) # If there are no covariates return the largest class: if(m==0) return(rep(classes[which.max(pc)],n)) for(t in 1:n) { # Compute the discriminant functions $g_i(x) (equation 5.30 of # Alpaydin (2004)): g <- rep(NA,K) for(i in 1:K) { g[i] <- sum(X[t,]*log(p[i,])+(1-X[t,])*log(1-p[i,]))+log(pc[i]) } # Choose a class with largest discriminant. C[t] <- classes[which.max(g)] } C } ######################################################################### # Predict a column of binary matrix X, given all other columns. # This way, try to "fix" binary matrix. fixNB <- function(X) { Y <- array(NA,dim(X),dimnames=dimnames(X)) for(j in colnames(X)) { # Predict the column j, given all other columns pars <- trainNB(X[,j],X[,colnames(X)!=j,drop=FALSE]) Y[,j] <- as.numeric(classifyNB(X[,colnames(X)!=j,drop=FALSE],pars)) } Y } ######################################################################### # Subset selection # See subsection 6.2 (pages 106-108) of Alpaydin (2004). # # In principle, instead of using training and validation sets, we # could add some penalty term for increasing number of parameters (as # in structural learning, Bayesian regularization etc.). # # Forward selection: # Input (as in trainNB): # C Class vector (training set) # X Matrix (training set) # Cva Class vector (validation set) # Xva Matrix (validation set) # Output: # f The order in which features have been added. # errs Classification errors. forwardNB <- function(C,X,Cva,Xva,alpha=1,maxf=NA) { unusedf <- colnames(X) f <- c() pars <- trainNB(C,X[,f,drop=FALSE],alpha) errs <- sum(as.character(Cva)!=classifyNB(Xva[,f,drop=FALSE],pars))/length(Cva) cat(sprintf("forwardNB (0): 0 features, error %.4f.\n",errs)) while(length(unusedf)>0) { besterr <- NA for(newf in unusedf) { # Try adding all unused features to current set of features: pars <- trainNB(C,X[,c(f,newf),drop=FALSE],alpha) err <- sum(as.character(Cva)!=classifyNB(Xva[,c(f,newf),drop=FALSE],pars))/length(Cva) if(is.na(besterr) || err=maxf) { cat("forwardNB: limit of iterations reached.\n") break } } list(f=f,errs=errs) } # The same as forwardNB, except backwardNB implements the backward # selection. backwardNB <- function(C,X,Cva,Xva,alpha=1) { f <- colnames(X) pars <- trainNB(C,X[,f,drop=FALSE],alpha) errs <- sum(as.character(Cva)!=classifyNB(Xva[,f,drop=FALSE],pars))/length(Cva) cat(sprintf("backwardNB (0): all %d features, error %.4f.\n",length(f),errs)) while(length(f)>0) { besterr <- NA for(oldf in f) { # Try removing features of the current set of features: pars <- trainNB(C,X[,f[f!=oldf],drop=FALSE],alpha) err <- sum(as.character(Cva)!=classifyNB(Xva[,f[f!=oldf],drop=FALSE],pars))/length(Cva) if(is.na(besterr) || err