# $Id: t613050-2-2007.R,v 1.7 2007/09/19 05:22:02 kaip Exp $ # # # Copyright (c) 2007 Kai Puolamaki # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # # # T-61.3050 Machine Learning: Basic Principles # For problems 2/2007 # # Data analysis using R or S. You can download and install R freely # at http://www.r-project.org/ # # You can get help using help, for example on read.table function: # help(read.table) # Read data file directly from the web: D1 <- read.table("http://www.cis.hut.fi/Opinnot/T-61.3050/2007/t613050-2-2007.txt") # Print out a summary of the data: summary(D1) ## X Y ## Min. :-1.0010 Min. : 0.2508 ## 1st Qu.:-0.5114 1st Qu.: 0.5289 ## Median :-0.0320 Median : 0.6341 ## Mean :-0.0202 Mean : 0.6460 ## 3rd Qu.: 0.4725 3rd Qu.: 0.7734 ## Max. : 0.9955 Max. : 0.9455 ## NA's :4566.0000 dim(D1) ## [1] 4586 2 # # The data has 4586 rows and two columns (X and Y). 20 of the Y are known, # 4566 are unknown (NA in R). # # Make a data set without unknowns Y's, for convenience: D2 <- D1[!is.na(D1[,"Y"]),] # Plot data: plot(D2) # You should see 20 points. # Make a random permutation of indices: perm <- sample(20) perm ## [1] 19 13 1 14 3 17 9 20 5 6 4 12 18 15 2 11 7 8 10 16 # # Say that the first 10 are "training set" and remaining 10 are # the "validation set". ptrain <- perm[1:10] pvalid <- perm[11:20] # Train a second degree linear regressor, trying to predict Y given X # using g(X)=w_0+w_1*X+w_2*X^2. In R, this can be done using the # following formula, where X^2 has been enclosed to I() to get it # evaluated. The intercept term w_0 is included automatically. # See help(lm) for details. D2.lm <- lm(Y ~ X+I(X^2),D2[ptrain,]) D2.lm ## Call: ## lm(formula = Y ~ X + I(X^2), data = D2[ptrain, ]) ## ## Coefficients: ## (Intercept) X I(X^2) ## 0.7922 0.2138 -0.8285 ## # # The above tells us that the regressor is # g(x)=0.7922+0.2138*X-0.8285*X^2 # # Command "summary" gives more information: summary(D2.lm) ## Call: ## lm(formula = Y ~ X + I(X^2), data = D2[ptrain, ]) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.08393 -0.03365 -0.02010 0.02583 0.14040 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.79217 0.02891 27.404 2.21e-08 *** ## X 0.21377 0.07769 2.751 0.028445 * ## I(X^2) -0.82848 0.11459 -7.230 0.000173 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.0714 on 7 degrees of freedom ## Multiple R-Squared: 0.9079, Adjusted R-squared: 0.8815 ## F-statistic: 34.48 on 2 and 7 DF, p-value: 0.0002375 # # Make a vector at [-1,1] for plotting: X <- -100:100/100 plot(c(-1,1),c(0,1),type="n",xlab="X",ylab="Y") points(D2[pvalid,],pch=21,col="red") points(D2[ptrain,],pch=19,col="black") lines(X,predict(D2.lm,data.frame(list(X=X)))) legend(x="topleft",legend=c("validation","training"), col=c("red","black"),pch=c(21,19),bg="white") # Error on training set: errtr <- mean((predict(D2.lm,D2[ptrain,])-D2[ptrain,"Y"])^2) errva <- mean((predict(D2.lm,D2[pvalid,])-D2[pvalid,"Y"])^2) cat(sprintf("E_TRAIN = %.4f E_VALID = %.4f\n",errtr,errva)) ## E_TRAIN = 0.0036 E_VALID = 0.0673 # Error on training set is 0.0036 and on validation set 0.0673. # Leave-one-out cross-validation, see Aplaydin (2004) Ch 14.2.1, # pages 330-331: errcr <- 0 for(i in rownames(D2)) { # Train using all data points except i: D2.lm <- lm(Y ~ X+I(X^2),D2[rownames(D2)!=i,]) # Test using point i: errcr <- errcr+(predict(D2.lm,D2[i,])-D2[i,"Y"])^2 } errcr <- errcr/length(rownames(D2)) # Leave-one-out cross-validation error is the average error of the # left-out points: cat(sprintf("Cross-validation error: %.4f\n",errcr)) ## Cross-validation error: 0.0216 # Notice how cross-validation gave much smaller error than 0.0673 obtained # by a split to training (10) and validation (10) sets! ############################################################################ ############################################################################ # Using the test set: # # Train and validate you regressor using the training set of 20 # known values of Y in the training data alone. # Assume you get a regressor D2.lm, as in the above examples. # You can then calculate the generalization error # or the error on the test set as follows: # Load the test set: Dtest <- read.table("http://www.cis.hut.fi/Opinnot/T-61.3050/2007/t613050-2-2007-testset.txt") errte <- mean((predict(D2.lm,Dtest)-Dtest[,"Y"])^2) cat(sprintf("Error on the test set: %.4f\n",errte)) ############################################################################ ############################################################################