# $Id: regression.R,v 1.1 2007/09/18 04:17:54 kaip Exp $
#
#
# Copyright (c) 2007 Kai Puolamaki
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# Example for the lecture of 18 September 2007

# f is a stochastic function defined on the interval [-1,1].
# We use f to create the toy data set to be used in the lecture.
f <- function(x,noise=.2) {
  sin(pi*x)+noise*rnorm(length(x))
}

# Plot the fitted regressor against the data and print the training and
# test errors plus the fitted coefficients as a LaTeX table row.
makeplot <- function(data,form,leg="regressor",main="",datate=NA,phi="X^{%d}") {
  plot(c(-1,1),c(-1.5,1.5),type="n",xlab="X",ylab="Y",main=main)
  points(datate,col="red",pch=21)            # test points
  x <- -100:100/100; y <- f(x,noise=0)
  lines(x,y,col="red",lty="dashed")          # noiseless target function
  points(data,pch=19)                        # training points
  data.lm <- lm(form,data)
  y <- predict(data.lm,data.frame(list(X=x,Y=NA)))
  lines(x,y,col="black",lty="solid")         # fitted regressor
  legend(x="topleft",legend=c(expression(plain(sin)(pi*X)),leg),
         col=c("red","black"),lty=c("dashed","solid"))
  # Mean squared errors on the training and test sets.
  errtr <- sum((predict(data.lm,data)-data[,"Y"])^2)/length(data[,"Y"])
  errte <- sum((predict(data.lm,datate)-datate[,"Y"])^2)/length(datate[,"Y"])
  # Leave-one-out crossvalidation. Train on the data leaving each item
  # out in turn and compute the average error over the left-out points.
  #errcr <- 0
  #for(i in rownames(data)) {
  #  data.lmcr <- lm(form,data[rownames(data)!=i,])
  #  errcr <- errcr+(predict(data.lmcr,data[i,])-data[i,"Y"])^2
  #}
  #errcr <- errcr/length(rownames(data))
  # E_TRAIN, E_CRVALID, E_TEST:
  #cat(sprintf("$%.4f$&$%.4f$&$%.4f$&$",errtr,errcr,errte))
  # E_TRAIN, E_TEST:
  cat(sprintf("$%.4f$&$%.4f$&$",errtr,errte))
  # Print the fitted coefficients, one basis function (phi) per term.
  i <- 0
  for(a in data.lm$coefficients) {
    cat(sprintf(sprintf("%%+.2f%s",phi),a,i))
    i <- i+1
  }
  cat("$\n")
  data.lm                                    # return the fitted model
}

# Chebyshev Polynomials of the First Kind can be obtained by using a
# recurrence relation:
Tn <- function(n,x) {
  if(n==0) rep(1,length(x))                  # T_0(x)=1
  else if(n==1) x                            # T_1(x)=x
  else 2*x*Tn(n-1,x)-Tn(n-2,x)               # T_n(x)=2xT_{n-1}(x)-T_{n-2}(x)
}

# Plot Chebyshev Polynomials of the First Kind:
pdf("chebyshev1.pdf")
plot(c(-1,1),c(-1,1),type="n",
     main="Chebyshev Polynomials of the First Kind",
     xlab="X",ylab=expression(T[n](X)))
x <- -100:100/100
cols <- c("yellow","black","red","green","blue","black")
ltys <- c("solid","solid","solid","solid","solid","dashed")
for(n in 0:5) lines(x,Tn(n,x),col=cols[n+1],lty=ltys[n+1])
legend(x="bottomright",col=cols,lty=ltys,bg="white",
       legend=c(expression(T[0](X)),expression(T[1](X)),expression(T[2](X)),
                expression(T[3](X)),expression(T[4](X)),expression(T[5](X))))
dev.off()

# Plot polynomial terms:
pdf("polynomials.pdf")
plot(c(-1,1),c(-1,1),type="n",
     main="Polynomials",
     xlab="X",ylab=expression(X^n))
x <- -100:100/100
cols <- c("yellow","black","red","green","blue","black")
ltys <- c("solid","solid","solid","solid","solid","dashed")
for(n in 0:5) lines(x,x^n,col=cols[n+1],lty=ltys[n+1])
legend(x="bottomright",col=cols,lty=ltys,bg="white",
       legend=c(expression(X^0),expression(X^1),expression(X^2),
                expression(X^3),expression(X^4),expression(X^5)))
dev.off()

# Create data in the interval [-1,1].
N <- 100
DATA <- data.frame(list(X=runif(N,min=-1,max=1),Y=0))
DATA[,"Y"] <- f(DATA[,"X"])

# Divide the data at random into a training set of 7 and a test set of
# N-7 points. (The rows are i.i.d., so taking the first 7 is a random split.)
DATATR <- DATA[1:7,]
DATATE <- DATA[8:N,]

# Formulas for polynomials of various degrees:
polys <- c("Y ~ 1")
for(i in 1:20) polys <- c(polys,sprintf("%s+I(X^%d)",polys[i],i))

# Formulas for Chebyshev polynomials of the first kind of various degrees:
chebs <- c("Y ~ 1")
for(i in 1:20) chebs <- c(chebs,sprintf("%s+I(Tn(%d,X))",chebs[i],i))

# Effect of the training set size on accuracy, using a degree-5 polynomial.
k <- 5
p <- sample(N)
for(n in c(7,10,15,20,25,30,35,40,45,50)) {
  DATATR <- DATA[p[1:n],]
  DATATE <- DATA[p[(n+1):N],]
  DATA.lm <- lm(formula(polys[k+1]),DATATR)
  errtr <- sum((predict(DATA.lm,DATATR)-DATATR[,"Y"])^2)/n
  errte <- sum((predict(DATA.lm,DATATE)-DATATE[,"Y"])^2)/(N-n)
  cat(sprintf("$%d$&$%.4f$&$%.4f$\\\\\n",n,errtr,errte))
}

# Fit and plot polynomials of degree 0 to 6. Note that DATATR and DATATE
# now hold the last (n=50) split from the loop above.
for(k in c(0:6)) {
  cat(sprintf("$%d$&",k))
  pdf(sprintf("reg-poly-%d.pdf",k))
  makeplot(DATATR,formula(polys[k+1]),leg=sprintf("degree %d polynomial",k),datate=DATATE)
  dev.off()
}
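
# The Chebyshev formulas in chebs above are built but never exercised in
# this script. Below is a minimal sketch of how they could be used,
# mirroring the polynomial loop above. The file name pattern
# "reg-cheb-%d.pdf" and the legend text are illustrative assumptions,
# not part of the original lecture code.
for(k in c(0:6)) {
  cat(sprintf("$%d$&",k))
  pdf(sprintf("reg-cheb-%d.pdf",k))
  # phi="T_{%d}(X)" makes makeplot label each coefficient with its
  # Chebyshev basis function instead of the default power of X.
  makeplot(DATATR,formula(chebs[k+1]),
           leg=sprintf("degree %d Chebyshev fit",k),
           datate=DATATE,phi="T_{%d}(X)")
  dev.off()
}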