# $Id: subsetselection.R,v 1.2 2007/10/23 10:11:27 kaip Exp $
#
#
# Copyright (c) 2007 Kai Puolamaki
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# For lecture 7/2007

#######################################################################

# Create toy data with a class label "Class" and 10 attributes, of which
# only 2 are related to the class.

# T is just a 100 x 10 matrix of Gaussian random variables with zero mean
# and unit variance. In the data frames built below, the first 50 rows are
# labelled class 1 and the last 50 rows class 0.
# NOTE(review): the name `T` masks R's shorthand for TRUE from here on; it
# is kept because the rest of the script refers to it by this name.
T <- matrix(rnorm(1000), nrow = 100, ncol = 10)

# Tva is a second, independently drawn 100 x 10 Gaussian matrix whose first
# two columns are shifted by +1 (rows 1-50) and -1 (rows 51-100), i.e. in
# the same way as T1 below.
# NOTE(review): the `va` suffix presumably stands for "validation" - confirm.
Tva <- matrix(rnorm(1000), nrow = 100, ncol = 10)
Tva[1:50, 1:2] <- Tva[1:50, 1:2] + 1
Tva[51:100, 1:2] <- Tva[51:100, 1:2] - 1
TOY1va <- data.frame(
  Tva,
  Class = factor(c(rep(1, 50), rep(0, 50)), ordered = FALSE)
)

# T1 is the data in which the variables X1 and X2 have mean +1 for the
# class-1 rows and mean -1 for the class-0 rows. The remaining eight
# columns are left untouched and therefore carry no class information.
T1 <- T
T1[1:50, 1:2] <- T1[1:50, 1:2] + 1
T1[51:100, 1:2] <- T1[51:100, 1:2] - 1
TOY1 <- data.frame(
  T1,
  Class = factor(c(rep(1, 50), rep(0, 50)), ordered = FALSE)
)

# Split TOY1 by row index: 30 rows of each class go to TOY1tr and the
# remaining 20 rows of each class go to TOY1te.
TOY1tr <- TOY1[c(1:30, 51:80), ]
TOY1te <- TOY1[c(31:50, 81:100), ]

# The optimal discriminant is
#   g_A(i)= T1[i,1]+T1[i,2]
#   g_B(i)=-T1[i,1]-T1[i,2]
# That is, only the first two entries matter.

# Rows of T2 are the rows of T1 multiplied by a random matrix.
# W is a random 10 x 10 Gaussian mixing matrix.
W <- matrix(rnorm(100), nrow = 10, ncol = 10)

# ginv() is provided by the MASS package, which this file never attaches,
# so it is called through its namespace. It returns the generalized
# (Moore-Penrose) inverse, which is the ordinary inverse when W is
# non-singular.
invW <- MASS::ginv(W) # inverse of W

# Each row of T2 is the corresponding row of T1 multiplied by W.
T2 <- T1 %*% W
TOY2 <- data.frame(
  T2,
  Class = factor(c(rep(1, 50), rep(0, 50)), ordered = FALSE)
)

# The optimal discriminant is, again,
#   g_A(i)= T1[i,1]+T1[i,2]
#   g_B(i)=-T1[i,1]-T1[i,2]
# where T1 == T2 %*% invW. In other words, the first two columns of
# invW are the most discriminative directions.

# ---------------------------------------------------------------------
# NOTE(review): DAMAGED TEXT BELOW - restore from the upstream original.
#
# In this copy of the file the remaining definitions have lost their line
# breaks and, in three places, a run of text (apparently everything between
# a `<` comparison and the following `>`). The surviving text is kept
# verbatim below, commented out, with each gap marked. It was already inert
# in this copy because it followed a `#` on the same physical line.
#
# What the surviving text shows:
#   * nearestmean(tr, va, features) computes the per-class means of the
#     selected feature columns of `tr` (rows with Class "0" and Class "1"),
#     then loops over the rows of `va`, starting from the squared distance
#     to the class-0 mean. How `err` is updated and returned is lost.
#   * Two loops repeatedly call nearestmean(TOY1tr, TOY1te, ...): one tries
#     adding each `candidate` from `unusedfeatures` to `features`, the other
#     tries removing each `candidate` from `features`. They look like
#     greedy forward and backward feature selection - confirm against the
#     original. Their loop headers and update steps are lost.
#   * The tail of a plotting call (xlab/ylab/xaxt/yaxt arguments) survives;
#     the name and arguments of the enclosing function are lost.
# ---------------------------------------------------------------------
#
# nearestmean <- function(tr,va,features) {
#   if(length(features)>1) {
#     mtr0 <- colMeans(tr[tr[,"Class"]=="0",features])
#     mtr1 <- colMeans(tr[tr[,"Class"]=="1",features])
#   } else {
#     mtr0 <- mean(tr[tr[,"Class"]=="0",features])
#     mtr1 <- mean(tr[tr[,"Class"]=="1",features])
#   }
#   err <- 0
#   for(i in rownames(va)) {
#     if(sum((va[i,features]-mtr0)^2)
#
#     [... text lost (gap 1): rest of nearestmean and the set-up and loop
#      header of the first selection loop ...]
#
# 0) {
#   besterr <- 2
#   bestcandidate <- NA
#   for(candidate in unusedfeatures) {
#     err <- nearestmean(TOY1tr,TOY1te,c(features,candidate))
#     if(err
#
#     [... text lost (gap 2): rest of the first selection loop and the
#      set-up and loop header of the second selection loop ...]
#
# 0) {
#   besterr <- 2
#   bestcandidate <- NA
#   for(candidate in features) {
#     err <- nearestmean(TOY1tr,TOY1te,features[features != candidate])
#     if(err
#
#     [... text lost (gap 3): rest of the second selection loop and the
#      beginning of a plotting helper ...]
#
# 64) sprintf("%s",x[,"Class"]) else "",
#        xlab="",ylab="",xaxt="n",yaxt="n")
# }

#

##########################################################