# $Id: lindiscr.R,v 1.3 2007/11/22 04:17:18 kaip Exp $
#
# Copyright (c) 2007 Kai Puolamaki <Kai.Puolamaki@iki.fi>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# For T-61.3050 Machine Learning: Basic Principles

# Create a toy data of two classes.
# For visualization purposes the data is two-dimensional. However, only the first
# dimension is relevant for the classification task.
# Both classes are distributed uniformly between 0 and 1 in the second dimension,
# that is, y ~ Unif(0,1).
# The first class obeys normal distribution with zero mean and unit
# variance in the first dimension, that is, x ~ N(0,1).
# The second class obeys distribution F in the first dimension, that is, x ~ F.
# A real number x is sampled from distribution F by first sampling z from
# normal distribution with zero mean and unit variance. x is then obtained by
# x <- 1+2|z|.


D <- data.frame(x=c(rnorm(500),1+2*abs(rnorm(500))),
                y=runif(1000),
                r=factor(c(rep(1,500),rep(2,500))))


# Plot data to a plane
pdf("toydiscr.pdf")
plot(D[,"x"],D[,"y"],
     pch=c(21,19)[D[,"r"]],
     col=c("black","red")[D[,"r"]],
     main="Toy data",
     xlab="x",
     ylab="y")
abline(v=1,lty="dotted")
legend(x="topleft",
       legend=c("Class 1","Class 2"),
       pch=c(21,19),
       col=c("black","red"),
       bg="white")
dev.off()

# Compute mean and variance of classes.
n1 <- sum(D[,"r"]==1)
m1 <- mean(D[D[,"r"]==1,"x"])
s1 <- var( D[D[,"r"]==1,"x"])

n2 <- sum(D[,"r"]==2)
m2 <- mean(D[D[,"r"]==2,"x"])
s2 <- var( D[D[,"r"]==2,"x"])

# Decision boundary using parametric classification, with the above
# parameters for the Gaussian distributions.
# See Alpaydin (2004), Ch 4, problem 4.
a <- 1/(2*s2)-1/(2*s1)
b <- m1/s1-m2/s2
c <- m2^2/(2*s2)-m1^2/(2*s1)+log(s2/s1)
# Because variances are different there are actually two discriminant points:
x1 <- (-b-sqrt(b^2-4*a*c))/(2*a)
x2 <- (-b+sqrt(b^2-4*a*c))/(2*a)


rng <- range(D[,"x"])

breaks <- seq(rng[1]-.2,rng[2]+.2,by=.2)
x <- seq(from=rng[1],to=rng[2],length.out=200)
p1 <- dnorm(x,mean=m1,sd=sqrt(s1))*500*.2
p2 <- dnorm(x,mean=m2,sd=sqrt(s2))*500*.2


# Plot distributions
pdf("toydiscr2.pdf")
layout(matrix(c(1,2),nrow=2,ncol=1))
hist(D[D[,"r"]==1,"x"],
     main="class 1",
     xlab="x",
     breaks=breaks)
lines(x,p1)
abline(v=x1,col="green",lwd=2)
abline(v=x2,col="green",lwd=2)

hist(D[D[,"r"]==2,"x"],
     main="class 2",
     xlab="x",
     col="red",
     breaks=breaks)
lines(x,p2)
abline(v=x1,col="green",lwd=2)
abline(v=x2,col="green",lwd=2)
dev.off()

####################################################################
# Find the best discriminator w.
accuracy <- x
for(i in 1:length(x)) {
  accuracy[i] <- (sum(D[D[,"x"]<x[i],"r"]=="1")+sum(D[D[,"x"]>=x[i],"r"]=="2"))/length(D[,"r"])
}
pdf("toydiscr3.pdf")
plot(c(rng[1],rng[2]),c(0,1),type="n",xlab="w",ylab="accuracy",
     main="accuracy of linear discriminator")
lines(x,accuracy)
abline(v=x1,col="green",lwd="2")
legend(x="topleft",legend=c("Naive Bayes"),col=c("green"),lwd=2,bg="white")
dev.off()
####################################################################


####################################################################
# Logistic discrimination using glm

# Define logit function:
linkinv <- function(t) binomial(link="logit")$linkinv(t)
# Alternative way:
# linkinv <- function(t) 1/(1+exp(-t))
# Or using probit:
# linkinv <- function(t) binomial(link="probit")$linkinv(t)


D.glm <- glm(r ~ x+y,family=binomial(link="logit"),data=D)
summary(D.glm)
# We notice that the factor of y is not significantly non-zero. Drop y.
D.glm <- glm(r ~ x,family=binomial(link="logit"),data=D)
# We obtain estimator P(r|x)\approx logit(-2.2963+2.215*x)

pl <- linkinv(predict(D.glm,data.frame(x=x)))

xlogit <- -D.glm$coefficients[1]/D.glm$coefficients[2]

pdf("toydiscr4.pdf")
plot(c(rng[1],rng[2]),c(0,1),type="n",xlab="w",ylab="accuracy",
     main="accuracy of linear discriminator")
lines(x,accuracy)
abline(v=x1,col="green",lwd="2")
abline(v=xlogit,col="red",lwd="2")
legend(x="topleft",legend=c("Bayes","logit"),col=c("green","red"),lwd=2)
dev.off()