# $Id: fossils.R,v 1.20 2007/11/06 06:27:33 kaip Exp $
#
# Copyright (c) 2007 Kai Puolamaki <Kai.Puolamaki@iki.fi>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# For problem session 7/2007

# Reference: Alpaydin, 2004. Introduction to Machine Learning. The MIT
# Press.

# We use function maColorBar to get a color bar. For that we
# need BioConductor which can be installed as follows:
# > source("http://bioconductor.org/biocLite.R",echo=TRUE)
# > biocLite()
# See http://bioconductor.org/docs/install-howto.html
library(marray)


# Read the fossil occurrence table from the course web site.
D <- read.table("http://www.cis.hut.fi/Opinnot/T-61.3050/2007/fossils.txt")

# Extract the taxa and sites:
# (columns 4..142 of D are taken as the taxa; rows of D are the sites)
taxa <- colnames(D)[4:142]
sites <- rownames(D)
# Make a matrix, for convenience:
X <- as.matrix(D[sites,taxa])

# Make a pretty image:
# The axis coordinates are derived from X itself, so they always agree with
# the dimensions of the plotted matrix (taxa on the x axis, sites on the y
# axis).
#pdf("fossils.pdf")
image(seq_len(ncol(X)),seq_len(nrow(X)),
      t(X), #take transpose
      col=gray(1:0), #ones in black
      xlab="taxa",ylab="fossil sites",
      main="Cenozoic Large Land Mammals")
#dev.off()

#########################################################################
# Naive Bayes binary classifier.
# See Alpaydin (2004), subsection 5.7 (pages 99-100), for reference.
# Training phase:
#   Input:
#     class vector 
#     0/1 covariate vectors, corresponding to classes
#   Output:
#     parameters of the classifier
# Prediction phase:
#   Input:
#     0/1 covariate vectors
#     parameters of the classifier
#   Output:
#     predicted classes

# Train the classifier:
# Input:
#   C     n-dimensional vector containing the class labels (K distinct
#         classes).
#   X     nXm binary matrix, rows containing the covariates.
#   alpha Prior count parameter (alpha=1 by default).
# Output:
#   pars  list containing the parameters of the classifier
#         pars$p  Kxm matrix, containing the Bernoulli parameters (smoothed
#                 variant of eq. 5.31 of Alpaydin (2004), see below). Row
#                 names are the class labels; row i belongs to the same
#                 class as pars$pc[i]. NA if X has no columns.
#         pars$pc K-dimensional table, containing the class probabilities,
#                 named by class label.
trainNB <- function(C,X,alpha=1) {
  m <- dim(X)[2]

  # Both pc and p take their class order from table(C). (Taking the rows of
  # p from unique(C) instead would order them by first appearance in C,
  # which in general differs from the sorted order of table(C) and would
  # pair the Bernoulli parameters with the wrong prior.)
  counts  <- table(C)
  classes <- names(counts)
  K <- length(classes)  # number of classes

  # pc or $\hat P(C)$ is essentially the frequency of classes.
  # We add a prior observation count $\alpha$ for each class (see
  # the discussion below).
  pc <- (alpha+counts)/(K*alpha+length(C))

  # If there are no covariates:
  if(m==0) return(list(p=NA,pc=pc))

  # p[i,j] or $\hat p_{ij}$ is the frequency of covariate j in class i.
  p <- matrix(NA,
              nrow=K,ncol=m,
              dimnames=list(classes,colnames(X)))
  for(i in seq_len(K)) {
    # To avoid zero probabilities we modify the equation 5.31 of Alpaydin
    # (2004) as follows:
    # \[
    #    \hat p_{ij}=(\alpha+\sum_t{x^t_j r^t_i})/(2\alpha+\sum_t{r^t_i}),
    # \]
    # where $\alpha$ is a positive constant (for example, $\alpha=1$).
    # This makes the equation work also for no data, in which case we get
    # $\hat p_{ij}=1/2$. The $\alpha$ can be interpreted as coming from
    # a MAP solution, when the prior favours a probability of one half.
    # $\alpha$ can be also interpreted as prior count.
    # (Here p[i,j] is $\hat p_{ij}$; the whole row i is computed at once.)
    inClass <- C==classes[i]
    p[i,] <- (alpha+colSums(X[inClass,,drop=FALSE]))/(2*alpha+sum(inClass))
  }

  list(p=p,pc=pc)
}

# Use the classifier:
# Input:
#   X    n'Xm binary matrix, rows containing the covariates
#   pars list containing the parameters of the classifier (see trainNB).
# Output:
#   C    predicted classes, as a character vector of class labels (the
#        names of pars$pc), one per row of X.
classifyNB <- function(X,pars) {
  n <- dim(X)[1]
  m <- dim(X)[2]

  p  <- pars$p
  pc <- pars$pc
  K <- length(pc)
  # Class labels are the names of the class-probability vector (for a
  # one-dimensional table names() gives its dimnames).
  classes <- names(pc)

  # If there are no covariates return the largest class:
  if(m==0) return(rep(classes[which.max(pc)],n))

  # Pair each class in pc with its row of p by class label, not by position:
  # the two need not be stored in the same order. Fall back to positional
  # pairing only if p carries no usable row names.
  prow <- seq_len(K)
  if(!is.null(classes) && !is.null(rownames(p)) &&
     all(classes %in% rownames(p))) {
    prow <- match(classes,rownames(p))
  }

  C <- rep(NA_character_,n)
  # seq_len keeps the loop empty when X has no rows.
  for(t in seq_len(n)) {
    # Compute the discriminant functions $g_i(x)$ (equation 5.30 of
    # Alpaydin (2004)):
    g <- rep(NA,K)
    for(i in seq_len(K)) {
      pi <- p[prow[i],]
      g[i] <- sum(X[t,]*log(pi)+(1-X[t,])*log(1-pi))+log(pc[i])
    }
    # Choose a class with largest discriminant.
    C[t] <- classes[which.max(g)]
  }

  C
}

#########################################################################
# Predict a column of binary matrix X, given all other columns.
# This way, try to "fix" binary matrix.
# Input:
#   X  binary matrix.
# Output:
#   Y  matrix of the same dimensions and dimnames as X; column j holds the
#      naive Bayes prediction of X[,j] from the remaining columns, with the
#      classifier trained on X itself.
fixNB <- function(X) {
  Y <- array(NA,dim(X),dimnames=dimnames(X))
  # Iterate by column position rather than by column name, so that matrices
  # without (or with duplicated) column names are handled as well.
  for(j in seq_len(ncol(X))) {
    # Predict the column j, given all other columns
    others <- X[,-j,drop=FALSE]
    pars <- trainNB(X[,j],others)
    # classifyNB returns class labels as character strings.
    Y[,j] <- as.numeric(classifyNB(others,pars))
  }

  Y
}

#########################################################################
# Subset selection
# See subsection 6.2 (pages 106-108) of Alpaydin (2004).
#
# In principle, instead of using training and validation sets, we
# could add some penalty term for increasing number of parameters (as
# in structural learning, Bayesian regularization etc.).
#

# Forward selection:
# Starting from the empty feature set, repeatedly add the single unused
# feature whose addition gives the smallest validation error.
# Input (as in trainNB):
#   C     Class vector (training set)
#   X     Matrix (training set); features are identified by column name
#   Cva   Class vector (validation set)
#   Xva   Matrix (validation set)
#   alpha Prior count parameter, passed on to trainNB
#   maxf  Stop after this many features have been added (NA: no limit)
# Output:
#   f    The order in which features have been added.
#   errs Classification errors; the first entry is the error with no
#        features, entry k+1 the error after adding the k-th feature.
forwardNB <- function(C,X,Cva,Xva,alpha=1,maxf=NA) {
  # Validation error rate of a classifier trained on the given features.
  valError <- function(feats) {
    fit  <- trainNB(C,X[,feats,drop=FALSE],alpha)
    pred <- classifyNB(Xva[,feats,drop=FALSE],fit)
    sum(as.character(Cva)!=pred)/length(Cva)
  }

  candidates <- colnames(X)
  f    <- c()
  errs <- valError(f)
  cat(sprintf("forwardNB (0): 0 features, error %.4f.\n",errs))

  while(length(candidates)>0) {
    # Try adding each unused feature to the current set of features and
    # remember the first one that attains the smallest error.
    besterr <- NA
    for(newf in candidates) {
      err <- valError(c(f,newf))
      if(is.na(besterr) || err<besterr) {
        besterr <- err
        bestf <- newf
      }
    }

    f    <- c(f,bestf)
    errs <- c(errs,besterr)
    candidates <- candidates[candidates!=bestf]
    cat(sprintf("forwardNB (%d): added %s, error %.4f.\n",
                length(f),bestf,besterr))

    if(!is.na(maxf) && length(f)>=maxf) {
      cat("forwardNB: limit of iterations reached.\n")
      break
    }
  }

  list(f=f,errs=errs)
}

# Backward selection, the counterpart of forwardNB: start from all features
# and repeatedly remove the single feature whose removal gives the smallest
# validation error, until no features are left.
# Input: as in forwardNB (without maxf).
# Output:
#   f       The features remaining when the loop ends (always empty, since
#           the loop runs until every feature has been removed).
#   errs    Classification errors; the first entry is the error with all
#           features, entry k+1 the error after the k-th removal.
#   removed The order in which features have been removed (the analogue of
#           forwardNB's f).
backwardNB <- function(C,X,Cva,Xva,alpha=1) {
  f    <- colnames(X)
  removed <- character(0)

  pars <- trainNB(C,X[,f,drop=FALSE],alpha)
  errs <- sum(as.character(Cva)!=classifyNB(Xva[,f,drop=FALSE],pars))/length(Cva)
  cat(sprintf("backwardNB (0): all %d features, error %.4f.\n",length(f),errs))
  
  while(length(f)>0) {
    besterr <- NA
    for(oldf in f) {
      # Try removing features of the current set of features:
      pars <- trainNB(C,X[,f[f!=oldf],drop=FALSE],alpha)
      err  <- sum(as.character(Cva)!=classifyNB(Xva[,f[f!=oldf],drop=FALSE],pars))/length(Cva)
      if(is.na(besterr) || err<besterr) {
        besterr <- err
        bestf <- oldf
      }
    }
    f    <- f[f!=bestf]
    # Record the removal; without this the order would be lost, because f
    # itself shrinks to nothing.
    removed <- c(removed,bestf)
    errs <- c(errs,besterr)
    cat(sprintf("backwardNB (%d): removed %s, error %.4f.\n",
                length(f),bestf,besterr))
  }

  list(f=f,errs=errs,removed=removed)
}


# Make a classifier that predicts the occurrence of "Tapirus" on
# various sites, given occurrences of all other taxa in the respective
# sites, and select the parameters using forward selection.
# Use training and validation sets: the sites are permuted at random, the
# first half is used for training and the rest for validation. The split is
# derived from the number of rows of X instead of being hard-coded.
nsites <- nrow(X)
perm <- sample(nsites)
ntr <- nsites %/% 2
ptr <- perm[seq_len(ntr)]
pva <- perm[(ntr+1):nsites]
res <- forwardNB(X[ptr,"Tapirus"],X[ptr,taxa!="Tapirus"],X[pva,"Tapirus"],X[pva,taxa!="Tapirus"])


#########################################################################
# PCA
# See subsection 6.3 (pages 108-115) of Alpaydin (2004).
#
# Make a nice plot, with color bar.
# Input:
#   coords  matrix with one row per point; columns 1 and 2 are plotted as
#           the first and second principal component.
#   main    plot title.
# The points are coloured by row order with terrain.colors, and a vertical
# colour bar (maColorBar from the marray package) for the row index is drawn
# next to the scatter plot. The panel layout set up here is reset to a
# single panel when the function exits, so later plots on the same device
# are not squeezed into the two panels used here.
plotPCA <- function(coords,main="") {
  n <- dim(coords)[1]
  cols <- terrain.colors(n)

  # 11 columns: the scatter plot takes 9 of them, the colour bar 2.
  layout(matrix(c(1,1,1,1,1,1,1,1,1,2,2),1,11,byrow=TRUE))
  on.exit(layout(1),add=TRUE)

  # Make a plot with equal ranges on X and Y axis:
  xx <- range(coords[,1])
  yy <- range(coords[,2])
  if(xx[2]-xx[1]<yy[2]-yy[1]) {
    # Y has a larger range.
    miny <- yy[1]
    maxy <- yy[2]
    # Make X and Y ranges equal: centre X on its midpoint and give it the
    # width of the Y range.
    minx <- (xx[1]+xx[2])/2-(yy[2]-yy[1])/2
    maxx <- (xx[1]+xx[2])/2+(yy[2]-yy[1])/2
  }
  else {
    # X has a larger range.
    minx <- xx[1]
    maxx <- xx[2]
    # Make X and Y ranges equal: centre Y on its midpoint and give it the
    # width of the X range.
    miny <- (yy[1]+yy[2])/2-(xx[2]-xx[1])/2
    maxy <- (yy[1]+yy[2])/2+(xx[2]-xx[1])/2
  }
  # Don't really plot anything, just make the coordinate frame by plotting
  # two points at (minx,miny) and (maxx,maxy), respectively. In principle
  # we could use eqscplot from the MASS library, but it seems to have
  # a problem with layout command...
  plot(c(minx,maxx),c(miny,maxy),type="n",
       xlab="first principal component",
       ylab="second principal component",
       main=main)
  points(coords,pch=19,col=cols)
  maColorBar(1:n,col=cols,horizontal=FALSE)
}

# Singular Value Decomposition (SVD) is a robust way to obtain eigenvectors
# and eigenvalues of a matrix.
# The principal components are eigenvectors of covariance or correlation
# matrix. Below you can find a simplified variant of R's prcomp function.
# See subsection 6.3 (pages 108-115) of Alpaydin (2004) for discussion.
# Input:
#   X      data matrix, observations on rows and variables on columns.
#   scale  if TRUE use the correlation matrix, otherwise the covariance
#          matrix.
# Output: list with
#   sdev      standard deviations along the principal components, in
#             decreasing order (min(dim(X)) of them).
#   rotation  matrix whose columns are the corresponding eigenvectors; it
#             is always a matrix, also when only one component is kept.
myprcomp <- function(X,scale=FALSE) {
  # There are two options: the principal components are eigenvectors
  # of either correlation or covariance matrix. The most significant
  # difference is that in correlation matrix the "self-correlation"
  # is scaled to a unit variance, that is, the diagonal terms are one.
  # Usually, it is preferred to use the correlation matrix (scale=TRUE).
  S <- if(scale) cor(X) else cov(X)

  # The principal components are the Singular Value Decomposition (SVD)
  # of the correlation or covariance matrix, or in other words,
  # eigenvectors of S.
  S.svd <- svd(S)

  # Make sure that the singular values are in decreasing order and
  # take only the first min(dim(X)) eigenvalues (the rest are guaranteed
  # to be zero, because the rank of correlation or covariance matrix
  # is at most min(dim(X))).
  perm <- order(S.svd$d,decreasing=TRUE)[seq_len(min(dim(X)))]

  # The eigenvalues give the variances. The standard deviation is square
  # root of variance:
  sdev <- sqrt(S.svd$d[perm])

  # The eigenvectors. drop=FALSE keeps the result a matrix even when a
  # single component is selected.
  rotation <- S.svd$u[,perm,drop=FALSE]

  list(sdev=sdev,rotation=rotation)
}
# You can check that the R function prcomp and the above myprcomp give
# equivalent results. However, according to rule 1 of numerical methods,
# we should use robust methods from numerical libraries, whenever
# possible, therefore in the example below we use the R's version.


# PCA of the data with sites as observations (D.pr1) and with taxa as
# observations (D.pr2). The argument of prcomp is called `scale.`; it is
# spelled out in full here instead of relying on partial matching of
# `scale`.
D.pr1 <- prcomp(X,   scale.=TRUE)
D.pr2 <- prcomp(t(X),scale.=TRUE)

# Two largest eigenvalues (should be c(1,2), this is not really needed):
largest1 <- order(D.pr1$sdev,decreasing=TRUE)[1:2]
largest2 <- order(D.pr2$sdev,decreasing=TRUE)[1:2]

# Standardise the columns of a matrix: every column of the result has zero
# mean and unit variance. R's own scale() should give the same values.
myscale <- function(X) {
  # Centre the columns. Subtracting a vector from t(X) recycles it down the
  # columns of t(X), so row k of t(X) (that is, column k of X) loses its
  # own mean.
  Xt <- t(X)
  centred <- t(Xt-rowMeans(Xt))
  # Scale the columns: build a matrix whose every row holds the reciprocal
  # column standard deviations and multiply elementwise.
  invsd <- 1/sqrt(apply(centred,2,var))
  ones  <- rep(1,dim(centred)[1])
  centred * outer(ones,invsd)
}

# We could then compute the PCA coordinates of the data points as follows:
#coords1 <- scale(X) %*% D.pr1$rotation[,largest1]
#coords2 <- scale(t(X)) %*% D.pr2$rotation[,largest2]

# However, we prefer to use the equivalent R's predict method (it is
# simpler to use and it applies to all other projections, not just PCA):
coords1 <- predict(D.pr1)[,largest1]
coords2 <- predict(D.pr2)[,largest2]


# pdf("PCAtaxa.pdf")
plotPCA(coords1, main="Sites")
# dev.off()

# pdf("PCAsites.pdf")
plotPCA(coords2,main="Taxa")
# dev.off()


# Reconstruct the data from the first few principal components.
# prcomp(t(X)) centres every column of t(X), i.e. every site, by its mean
# over the taxa, which is rowMeans(X). X-m recycles m down the columns of X,
# so row i of Y is row i of X minus m[i]; this is the same centering that
# prcomp uses. No scaling is applied (scale.=FALSE).
D.pr <- prcomp(t(X),scale.=FALSE)
W1 <- D.pr$rotation[,1:2]
W2 <- D.pr$rotation[,1:52]
m <- rowMeans(X)
Y <- X-m
# Project the centred data onto the span of the first 2 and the first 52
# principal directions, respectively.
Yhat1 <- W1 %*% t(W1) %*% Y
Yhat2 <- W2 %*% t(W2) %*% Y
cols <- rev(heat.colors(256))
# Common value range of the original and the two reconstructions. The data
# are mapped to colour indices 1..256 over this range, and zlim pins the
# colour scale of every image to those indices; without zlim each image
# would be rescaled to its own range and the three figures would not be
# comparable.
rng <- range(c(range(Y+m),range(Yhat1+m),range(Yhat2+m)))
pdf("fossilY.pdf")
image(seq_len(ncol(X)),seq_len(nrow(X)),
      1+floor(255*(t(Y+m)-rng[1])/(rng[2]-rng[1])),col=cols,
      zlim=c(1,length(cols)),
      xlab="taxa",ylab="fossil sites",
      main="X (original data)")
dev.off()
pdf("fossilY2.pdf")
image(seq_len(ncol(X)),seq_len(nrow(X)),
      1+floor(255*(t(Yhat1+m)-rng[1])/(rng[2]-rng[1])),col=cols,
      zlim=c(1,length(cols)),
      xlab="taxa",ylab="fossil sites",
      main="X (reconstructed data with k=2)")
dev.off()
pdf("fossilY52.pdf")
image(seq_len(ncol(X)),seq_len(nrow(X)),
      1+floor(255*(t(Yhat2+m)-rng[1])/(rng[2]-rng[1])),col=cols,
      zlim=c(1,length(cols)),
      xlab="taxa",ylab="fossil sites",
      main="X (reconstructed data with k=52)")
dev.off()


#########################################################################
# For lecture 8/2007
#
# Warning: badly documented hacks follow.

# a 2-dimensional example of clustering
# Two groups of 50 random points each (100 normal draws arranged in two
# columns): one group with mean 0, the other with mean 1, both with
# standard deviation 0.3.
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
#pdf("kmeans2.pdf")
# The surrounding parentheses make R print the clustering result.
(cl <- kmeans(x, 2))
# Colour and plotting symbol of each point are chosen by its cluster.
plot(x, col = cl$cluster,pch=c(19,21)[cl$cluster],main="k-means (k=2)")
# Mark the two cluster centres.
points(cl$centers, col = 1:2, pch = 8, cex=2)
#dev.off()


# Cluster the taxa (rows of t(X)) into three clusters.
X.km <- kmeans(t(X),centers=3)
#pdf("fossilskmeans.pdf")
# Multiplying t(X) by the cluster vector recycles it down the columns, so
# every occurrence of a taxon is replaced by the cluster number of that
# taxon and absences stay 0. zlim fixes the mapping 0 -> white,
# 1 -> red, 2 -> green, 3 -> blue regardless of which values are actually
# present, so the colours always agree with the legend.
image(seq_len(ncol(X)),seq_len(nrow(X)),
      t(X)*X.km$cluster, #take transpose
      col=c("white","red","green","blue"), 
      zlim=c(0,3),
      xlab="taxa",ylab="fossil sites",
      main="Cenozoic Large Land Mammals (k=3)")
legend("bottomright",
       legend=c("cluster 1","cluster 2", "cluster 3"),
       pch=20,
       col=c("red","green","blue"),
       bg="white")
#dev.off()

#pdf("fossilskmeans2.pdf")
# Replace every taxon by the centre (prototype) of its cluster.
image(seq_len(ncol(X)),seq_len(nrow(X)),
      X.km$centers[X.km$cluster,],
      col=rev(gray(0:255/255)),
      xlab="taxa",ylab="fossil sites",
      main="Cenozoic Large Land Mammals (cluster prototypes)")
#dev.off()


# Run k-means (k=6, Lloyd's algorithm) 1000 times on the taxa and keep the
# run with the smallest error.
# NOTE: this reuses the name `res`, replacing any value it held before.
res <- rep(NA,1000)
minres <- NA
for(i in 1:1000) {
  cl <- kmeans(t(X),centers=6,algorithm="Lloyd",iter.max=100)
  # Error of this run: sum of squared differences between every taxon and
  # the centre of the cluster it was assigned to.
  res[i] <- sum((cl$centers[cl$cluster,]-t(X))^2)
  # Remember the best clustering seen so far in X.km.
  if(is.na(minres) || res[i]<minres) {
    minres <- res[i]
    X.km <- cl
  }
}
#pdf("fossilskmeanshist.pdf")
# Distribution of the error over the runs.
hist(res,freq=TRUE,main="Error (1000 runs, k=6)",xlab="error")
rug(res)
#dev.off()

#pdf("fossilskmeans6.pdf")
# Show the clustering stored in X.km: every occurrence of a taxon is
# replaced by the cluster number of that taxon and absences stay 0. zlim
# fixes the mapping 0 -> white, k -> rainbow(6)[k] regardless of which
# values are actually present, so the colours always agree with the legend.
image(seq_len(ncol(X)),seq_len(nrow(X)),
      t(X)*X.km$cluster, #take transpose
      col=c("white",rainbow(6)),
      zlim=c(0,6),
      xlab="taxa",ylab="fossil sites",
      main="Cenozoic Large Land Mammals (k=6)")
legend("bottomright",
       legend=c("cluster 1","cluster 2", "cluster 3",
         "cluster 4","cluster 5", "cluster 6"),
       pch=20,
       col=rainbow(6),
       bg="white")
#dev.off()

#pdf("fossilskmeans62.pdf")
# Replace every taxon by the centre (prototype) of its cluster.
image(seq_len(ncol(X)),seq_len(nrow(X)),
      X.km$centers[X.km$cluster,],
      col=rev(gray(0:255/255)),
      xlab="taxa",ylab="fossil sites",
      main="Cenozoic Large Land Mammals (cluster prototypes)")
#dev.off()