# $Id: lindiscr.R,v 1.3 2007/11/22 04:17:18 kaip Exp $ # # Copyright (c) 2007 Kai Puolamaki # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # # # For T-61.3050 Machine Learning: Basic Principles # Create a toy data of two classes. # For visualization purposes the data is two-dimensional. However, only the first # dimension is relevant for the classification task. # Both classes are distributed uniformly between 0 and 1 in the second dimension, # that is, y ~ Unif(0,1). # The first class obeys normal distribution with zero mean and unit # variance in the first dimension, that is, x ~ N(0,1). # The second class obeys distribution F in the first dimension, that is, x ~ F. # A real number x is sampled from distribution F by first sampling z from # normal distribution with zero mean and unit variance. x is then obtained by # x <- 1+2|z|. D <- data.frame(x=c(rnorm(500),1+2*abs(rnorm(500))), y=runif(1000), r=factor(c(rep(1,500),rep(2,500)))) # Plot data to a plane pdf("toydiscr.pdf") plot(D[,"x"],D[,"y"], pch=c(21,19)[D[,"r"]], col=c("black","red")[D[,"r"]], main="Toy data", xlab="x", ylab="y") abline(v=1,lty="dotted") legend(x="topleft", legend=c("Class 1","Class 2"), pch=c(21,19), col=c("black","red"), bg="white") dev.off() # Compute mean and variance of classes. n1 <- sum(D[,"r"]==1) m1 <- mean(D[D[,"r"]==1,"x"]) s1 <- var( D[D[,"r"]==1,"x"]) n2 <- sum(D[,"r"]==2) m2 <- mean(D[D[,"r"]==2,"x"]) s2 <- var( D[D[,"r"]==2,"x"]) # Decision boundary using parametric classification, with the above # parameters for the Gaussian distributions. # See Alpaydin (2004), Ch 4, problem 4. a <- 1/(2*s2)-1/(2*s1) b <- m1/s1-m2/s2 c <- m2^2/(2*s2)-m1^2/(2*s1)+log(s2/s1) # Because variances are different there are actually two discriminant points: x1 <- (-b-sqrt(b^2-4*a*c))/(2*a) x2 <- (-b+sqrt(b^2-4*a*c))/(2*a) rng <- range(D[,"x"]) breaks <- seq(rng[1]-.2,rng[2]+.2,by=.2) x <- seq(from=rng[1],to=rng[2],length.out=200) p1 <- dnorm(x,mean=m1,sd=sqrt(s1))*500*.2 p2 <- dnorm(x,mean=m2,sd=sqrt(s2))*500*.2 # Plot distributions pdf("toydiscr2.pdf") layout(matrix(c(1,2),nrow=2,ncol=1)) hist(D[D[,"r"]==1,"x"], main="class 1", xlab="x", breaks=breaks) lines(x,p1) abline(v=x1,col="green",lwd=2) abline(v=x2,col="green",lwd=2) hist(D[D[,"r"]==2,"x"], main="class 2", xlab="x", col="red", breaks=breaks) lines(x,p2) abline(v=x1,col="green",lwd=2) abline(v=x2,col="green",lwd=2) dev.off() #################################################################### # Find the best discriminator w. accuracy <- x for(i in 1:length(x)) { accuracy[i] <- (sum(D[D[,"x"]=x[i],"r"]=="2"))/length(D[,"r"]) } pdf("toydiscr3.pdf") plot(c(rng[1],rng[2]),c(0,1),type="n",xlab="w",ylab="accuracy", main="accuracy of linear discriminator") lines(x,accuracy) abline(v=x1,col="green",lwd="2") legend(x="topleft",legend=c("Naive Bayes"),col=c("green"),lwd=2,bg="white") dev.off() #################################################################### #################################################################### # Logistic discrimination using glm # Define logit function: linkinv <- function(t) binomial(link="logit")$linkinv(t) # Alternative way: # linkinv <- function(t) 1/(1+exp(-t)) # Or using probit: # linkinv <- function(t) binomial(link="probit")$linkinv(t) D.glm <- glm(r ~ x+y,family=binomial(link="logit"),data=D) summary(D.glm) # We notice that the factor of y is not significantly non-zero. Drop y. D.glm <- glm(r ~ x,family=binomial(link="logit"),data=D) # We obtain estimator P(r|x)\approx logit(-2.2963+2.215*x) pl <- linkinv(predict(D.glm,data.frame(x=x))) xlogit <- -D.glm$coefficients[1]/D.glm$coefficients[2] pdf("toydiscr4.pdf") plot(c(rng[1],rng[2]),c(0,1),type="n",xlab="w",ylab="accuracy", main="accuracy of linear discriminator") lines(x,accuracy) abline(v=x1,col="green",lwd="2") abline(v=xlogit,col="red",lwd="2") legend(x="topleft",legend=c("Bayes","logit"),col=c("green","red"),lwd=2) dev.off()