# $Id: subsetselection.R,v 1.2 2007/10/23 10:11:27 kaip Exp $
#
#
# Copyright (c) 2007 Kai Puolamaki <Kai.Puolamaki@iki.fi>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# For lecture 7/2007

#######################################################################
# Create toy data with class "Class" and 10 attributes, only 2 of which
# are related to the class.

# T is just a matrix of Gaussian random variables with zero mean and
# unit variance. In the data sets built from it, the first 50 rows are
# in class 1 and the last 50 in class 0.
# Base noise: 100 observations of 10 standard normal attributes.
T <- matrix(rnorm(1000),nrow=100,ncol=10)

# Tva is an independent draw with the same structure as T1 below: columns
# 1 and 2 are shifted by +1 on rows 1-50 (class 1) and by -1 on rows
# 51-100 (class 0). The shift vector has length 100 and is recycled down
# each of the two columns.
Tva <- matrix(rnorm(1000),nrow=100,ncol=10)
Tva[,1:2] <- Tva[,1:2]+rep(c(1,-1),each=50)
TOY1va <- data.frame(Tva,Class=factor(rep(c(1,0),each=50)))


# T1 is the data where, for the classes 1 and 0, the variables X1 and X2
# have means +1 and -1, respectively.
T1 <- T
T1[,1:2] <- T1[,1:2]+rep(c(1,-1),each=50)
TOY1 <- data.frame(T1,Class=factor(rep(c(1,0),each=50)))
# Rows 31-50 and 81-100 (20 per class) are held out as the test set; the
# remaining 30 rows per class form the training set.
TOY1tr <- TOY1[-c(31:50,81:100),]
TOY1te <- TOY1[c(30+1:20,80+1:20),]
# The optimal discriminant is
#  g_A(i)= T1[i,1]+T1[i,2]
#  g_B(i)=-T1[i,1]-T1[i,2]
# That is, only the first two entries matter.


# ginv() comes from the MASS package, as do eqscplot() and lda() used
# further below; no visible part of this script loads it, so attach it
# here before its first use.
library(MASS)

# Rows of T2 are the rows of T1 multiplied by a random matrix.
W    <- matrix(rnorm(100),nrow=10,ncol=10)
# Generalized (Moore-Penrose) inverse of W; it coincides with the
# ordinary inverse whenever W is nonsingular.
invW <- ginv(W)
T2 <- T1 %*% W
TOY2 <- data.frame(T2,Class=factor(c(rep(1,50),rep(0,50)),ordered=FALSE))
# The optimal discriminant is, again,
#  g_A(i)= T1[i,1]+T1[i,2]
#  g_B(i)=-T1[i,1]-T1[i,2]
# where T1 == T2 %*% invW. In other words, the first two columns of
# invW are the most discriminative directions.

# Nearest mean classifier for the two classes "0" and "1".
#
# Arguments:
#   tr        training data: a data frame with a column "Class" and the
#             columns selected by features
#   va        validation data with the same columns as tr
#   features  names (or indices) of the columns used as inputs
#
# Every row of va is assigned to the class whose training mean is closer
# in squared Euclidean distance over the selected features; a tie goes
# to class "1". The value is the fraction of misclassified rows of va.
nearestmean <- function(tr,va,features) {
  # drop=FALSE keeps the matrix shape also for one or zero features, so
  # neither case needs special treatment (and an empty feature set no
  # longer triggers a warning from mean()).
  xtr <- as.matrix(tr[,features,drop=FALSE])
  xva <- as.matrix(va[,features,drop=FALSE])
  mtr0 <- colMeans(xtr[tr[,"Class"]=="0",,drop=FALSE])
  mtr1 <- colMeans(xtr[tr[,"Class"]=="1",,drop=FALSE])
  # Squared distances of all validation rows to both class means at once.
  # rep(...,each=nrow) lays a mean vector out in column-major order so
  # that it lines up with the columns of xva.
  d0 <- rowSums((xva-rep(mtr0,each=nrow(xva)))^2)
  d1 <- rowSums((xva-rep(mtr1,each=nrow(xva)))^2)
  nearest0 <- d0<d1
  cl <- va[,"Class"]
  # A row is an error if it is a "1" predicted as "0" or a "0" predicted
  # as "1".
  sum((nearest0 & cl=="1") | (!nearest0 & cl=="0"))/nrow(va)
}


# Forward selection: start with no features and, in every round, add the
# unused feature that gives the smallest nearest mean error (trained on
# TOY1tr, evaluated on TOY1te). Each round prints the feature added and
# the error reached with it.
features <- c()
unusedfeatures <- colnames(TOY1tr)[1:10]
repeat {
  if(length(unusedfeatures)==0) break
  # An error rate never exceeds 1, so the initial 2 is beaten by the
  # first candidate tried.
  bestcandidate <- NA
  besterr <- 2
  for(candidate in unusedfeatures) {
    err <- nearestmean(TOY1tr,TOY1te,c(features,candidate))
    # Strict comparison: among equally good candidates the first wins.
    if(!(err<besterr)) next
    bestcandidate <- candidate
    besterr <- err
  }
  unusedfeatures <- setdiff(unusedfeatures,bestcandidate)
  features <- c(features,bestcandidate)
  cat(sprintf("%s %.4f\n",bestcandidate,besterr))
}

# Backward selection: start with all ten features and, in every round,
# drop the feature whose removal leaves the smallest nearest mean error
# (trained on TOY1tr, evaluated on TOY1te). The first line printed is the
# error with all features; each later line shows the feature dropped,
# the features still in use and the resulting error.
features <- colnames(TOY1tr)[1:10]
unusedfeatures <- c()
err <- nearestmean(TOY1tr,TOY1te,features)
cat(sprintf("- %.4f\n",err))
repeat {
  if(length(features)==0) break
  # An error rate never exceeds 1, so the initial 2 is beaten by the
  # first candidate tried.
  bestcandidate <- NA
  besterr <- 2
  for(candidate in features) {
    err <- nearestmean(TOY1tr,TOY1te,setdiff(features,candidate))
    # Strict comparison: among equally good candidates the first wins.
    if(!(err<besterr)) next
    bestcandidate <- candidate
    besterr <- err
  }
  features <- setdiff(features,bestcandidate)
  unusedfeatures <- c(unusedfeatures,bestcandidate)
  cat(bestcandidate," ",features," ",sprintf("%.4f\n",besterr))
}

# PCA of the ten attributes of TOY1 (princomp defaults, i.e. based on the
# covariance matrix). The observations are drawn into toy1pc.pdf in the
# plane of the first two principal components, each one shown as its
# class label.
TOY1.pca <- princomp(TOY1[,1:10])
TOY1.pc  <- predict(TOY1.pca)
pdf("toy1pc.pdf")
# type="n" only sets up equally scaled axes; the labels are added by text().
eqscplot(TOY1.pc[,1],TOY1.pc[,2],
         xlab="first principal component",
         ylab="second principal component",
         type="n")
text(TOY1.pc[,1],TOY1.pc[,2],labels=as.character(TOY1[,"Class"]))
dev.off()

# The same PCA plot for TOY2 (the linearly mixed data), written to
# toy2pc.pdf.
TOY2.pca <- princomp(TOY2[,1:10])
TOY2.pc  <- predict(TOY2.pca)
pdf("toy2pc.pdf")
# type="n" only sets up equally scaled axes; the labels are added by text().
eqscplot(TOY2.pc[,1],TOY2.pc[,2],
         xlab="first principal component",
         ylab="second principal component",
         type="n")
text(TOY2.pc[,1],TOY2.pc[,2],labels=as.character(TOY2[,"Class"]))
dev.off()


# PCA of TOY1 based on the correlation matrix (cor=TRUE) instead of the
# covariance matrix, plotted into toy1cpc.pdf in the same way.
TOY1.cpca <- princomp(TOY1[,1:10],cor=TRUE)
TOY1.cpc  <- predict(TOY1.cpca)
pdf("toy1cpc.pdf")
# type="n" only sets up equally scaled axes; the labels are added by text().
eqscplot(TOY1.cpc[,1],TOY1.cpc[,2],
         xlab="first principal component",
         ylab="second principal component",
         type="n")
text(TOY1.cpc[,1],TOY1.cpc[,2],labels=as.character(TOY1[,"Class"]))
dev.off()

# Linear discriminant analysis of Class on all other columns of TOY1,
# followed by the predictions of the fitted model for the data it was
# fitted on.
TOY1.lda <- lda(Class ~ .,data=TOY1)
TOY1.ld  <- predict(TOY1.lda)


#######################################################################
# The optdigits data set.
# See ftp://ftp.ics.uci.edu/pub/machine-learning-databases/optdigits/
# for further information and references.

# Training set.
ODtr <- read.csv("ftp://ftp.ics.uci.edu/pub/machine-learning-databases/optdigits/optdigits.tra",
                 header=FALSE)
# Test set.
ODte <- read.csv("ftp://ftp.ics.uci.edu/pub/machine-learning-databases/optdigits/optdigits.tes",
                 header=FALSE)
# The last (65th) column is the class; change the column name.
colnames(ODtr)[65] <- "Class"
colnames(ODte)[65] <- "Class"
# The class is not really an ordinal number but a categorical variable
# without a priori order, so it is stored as an unordered factor.
ODtr[,"Class"] <- factor(ODtr[,"Class"],ordered=FALSE)
ODte[,"Class"] <- factor(ODte[,"Class"],ordered=FALSE)

##########################################################
# Some convenient functions:
# Extract the image matrix:
# Turn the first 64 entries of x into an 8 x 8 numeric matrix, filled
# column by column, and return it with the order of its columns reversed
# (presumably so that image() draws the digit the right way up).
imagematrix <- function(x) {
  pixels <- as.numeric(x[1:64])
  dim(pixels) <- c(8,8)
  pixels[,8:1]
}
#
# Actually show the matrix as an image:
# Draw the 8 x 8 image stored in the first 64 entries of x with image(),
# in 18 gray levels running from white (low values) to black (high
# values) and without axes or axis labels. If x has more than 64
# entries, its "Class" column is used as the plot title.
showimage <- function(x) {
  title <- if(length(x)>64) sprintf("%s",x[,"Class"]) else ""
  grays <- rev(gray(0:17/17))
  image(imagematrix(x),col=grays,main=title,
        xlab="",ylab="",xaxt="n",yaxt="n")
}
#
##########################################################