# $Id: t613050-2-2007.R,v 1.7 2007/09/19 05:22:02 kaip Exp $
#
#
# Copyright (c) 2007 Kai Puolamaki <Kai.Puolamaki@iki.fi>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# For problems 2/2007
#
# Data analysis using R or S. You can download and install R freely
# at http://www.r-project.org/
#
# You can get help using help, for example on read.table function:
# help(read.table)

# Read data file directly from the web:
D1 <- read.table("http://www.cis.hut.fi/Opinnot/T-61.3050/2007/t613050-2-2007.txt")

# Print out a summary of the data:
summary(D1)
##        X                 Y            
##  Min.   :-1.0010   Min.   :   0.2508  
##  1st Qu.:-0.5114   1st Qu.:   0.5289  
##  Median :-0.0320   Median :   0.6341  
##  Mean   :-0.0202   Mean   :   0.6460  
##  3rd Qu.: 0.4725   3rd Qu.:   0.7734  
##  Max.   : 0.9955   Max.   :   0.9455  
##                    NA's   :4566.0000  
dim(D1)
## [1] 4586    2
#
# The data has 4586 rows and two columns (X and Y). 20 of the Y are known,
# 4566 are unknown (NA in R).
#
# Make a data set without unknowns Y's, for convenience:
D2 <- D1[!is.na(D1[,"Y"]),]

# Plot data:
plot(D2)
# You should see 20 points.

# Make a random permutation of indices:
perm <- sample(20)
perm
## [1] 19 13  1 14  3 17  9 20  5  6  4 12 18 15  2 11  7  8 10 16
#
# Say that the first 10 are "training set" and remaining 10 are
# the "validation set".
ptrain <- perm[1:10]
pvalid <- perm[11:20]

# Train a second degree linear regressor, trying to predict Y given X
# using g(X)=w_0+w_1*X+w_2*X^2. In R, this can be done using the
# following formula, where X^2 has been enclosed to I() to get it
# evaluated. The intercept term w_0 is included automatically.
# See help(lm) for details.
D2.lm <- lm(Y ~ X+I(X^2),D2[ptrain,])
D2.lm
## Call:
## lm(formula = Y ~ X + I(X^2), data = D2[ptrain, ])
##
## Coefficients:
## (Intercept)            X       I(X^2)  
##      0.7922       0.2138      -0.8285  
##
#
# The above tells us that the regressor is
# g(x)=0.7922+0.2138*X-0.8285*X^2
#
# Command "summary" gives more information:
summary(D2.lm)
## Call:
## lm(formula = Y ~ X + I(X^2), data = D2[ptrain, ])
##
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.08393 -0.03365 -0.02010  0.02583  0.14040 
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.79217    0.02891  27.404 2.21e-08 ***
## X            0.21377    0.07769   2.751 0.028445 *  
## I(X^2)      -0.82848    0.11459  -7.230 0.000173 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
##
## Residual standard error: 0.0714 on 7 degrees of freedom
## Multiple R-Squared: 0.9079,	Adjusted R-squared: 0.8815 
## F-statistic: 34.48 on 2 and 7 DF,  p-value: 0.0002375 
#
# Make a vector at [-1,1] for plotting:
X <- -100:100/100
plot(c(-1,1),c(0,1),type="n",xlab="X",ylab="Y")
points(D2[pvalid,],pch=21,col="red")
points(D2[ptrain,],pch=19,col="black")
lines(X,predict(D2.lm,data.frame(list(X=X))))
legend(x="topleft",legend=c("validation","training"),
       col=c("red","black"),pch=c(21,19),bg="white")

# Error on training set:
errtr <- mean((predict(D2.lm,D2[ptrain,])-D2[ptrain,"Y"])^2)
errva <- mean((predict(D2.lm,D2[pvalid,])-D2[pvalid,"Y"])^2)
cat(sprintf("E_TRAIN = %.4f     E_VALID = %.4f\n",errtr,errva))
## E_TRAIN = 0.0036     E_VALID = 0.0673
# Error on training set is 0.0036 and on validation set 0.0673.


# Leave-one-out cross-validation, see Aplaydin (2004) Ch 14.2.1,
# pages 330-331:
errcr <- 0
for(i in rownames(D2)) {
  # Train using all data points except i:
  D2.lm <- lm(Y ~ X+I(X^2),D2[rownames(D2)!=i,])
  # Test using point i:
  errcr <- errcr+(predict(D2.lm,D2[i,])-D2[i,"Y"])^2
}
errcr <- errcr/length(rownames(D2))
# Leave-one-out cross-validation error is the average error of the
# left-out points:
cat(sprintf("Cross-validation error: %.4f\n",errcr))
## Cross-validation error: 0.0216
# Notice how cross-validation gave much smaller error than 0.0673 obtained
# by a split to training (10) and validation (10) sets!


############################################################################
############################################################################
# Using the test set:
#
# Train and validate you regressor using the training set of 20
# known values of Y in the training data alone.
# Assume you get a regressor D2.lm, as in the above examples.
# You can then calculate the generalization error
# or the error on the test set as follows:

# Load the test set:
Dtest <- read.table("http://www.cis.hut.fi/Opinnot/T-61.3050/2007/t613050-2-2007-testset.txt")
errte <- mean((predict(D2.lm,Dtest)-Dtest[,"Y"])^2)
cat(sprintf("Error on the test set: %.4f\n",errte))

############################################################################
############################################################################