# $Id: adult.R,v 1.1 2007/09/21 13:26:59 kaip Exp $
#
# Copyright (c) 2007 Kai Puolamaki
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# Example for the lecture of 25 September 2007
#
# Adults database. For details and references see:
# ftp://ftp.ics.uci.edu/pub/machine-learning-databases/adult/adult.names

# Read the raw data. "?" marks a missing value in the source file.
# stringsAsFactors=TRUE is given explicitly: since R 4.0.0 read.csv() keeps
# text columns as character vectors by default, whereas the rest of this
# script (summary() counts, numerize()) relies on them being factors.
# NOTE(review): the FTP location may no longer be reachable -- if so, use the
# local-file alternative below.
D <- read.csv("ftp://ftp.ics.uci.edu/pub/machine-learning-databases/adult/adult.data",
              header=FALSE,na.strings=c("?"),strip.white=TRUE,
              stringsAsFactors=TRUE)
# Or download the file adult.data first and issue:
# D <- read.csv("adult.data",header=FALSE,na.strings=c("?"),strip.white=TRUE,
#               stringsAsFactors=TRUE)

colnames(D) <- c("age","workclass","fnlwgt","education","educationnum",
                 "maritalstatus","occupation","relationship","race",
                 "sex","capitalgain","capitalloss","hoursperweek",
                 "nativecountry","label")

# Remove all rows with NA's:
D <- na.omit(D)

summary(D)
##       age                   workclass         fnlwgt              education     educationnum
##  Min.   :17.00   Private         :22286   Min.   :  13769   HS-grad     :9840   Min.   : 1.00
##  1st Qu.:28.00   Self-emp-not-inc: 2499   1st Qu.: 117627   Some-college:6678   1st Qu.: 9.00
##  Median :37.00   Local-gov       : 2067   Median : 178425   Bachelors   :5044   Median :10.00
##  Mean   :38.44   State-gov       : 1279   Mean   : 189794   Masters     :1627   Mean   :10.12
##  3rd Qu.:47.00   Self-emp-inc    : 1074   3rd Qu.: 237628   Assoc-voc   :1307   3rd Qu.:13.00
##  Max.   :90.00   Federal-gov     :  943   Max.   :1484705   11th        :1048   Max.   :16.00
##                  (Other)         :   14                     (Other)     :4618
##                maritalstatus             occupation           relationship
##  Divorced             : 4214   Prof-specialty :4038   Husband       :12463
##  Married-AF-spouse    :   21   Craft-repair   :4030   Not-in-family : 7726
##  Married-civ-spouse   :14065   Exec-managerial:3992   Other-relative:  889
##  Married-spouse-absent:  370   Adm-clerical   :3721   Own-child     : 4466
##  Never-married        : 9726   Sales          :3584   Unmarried     : 3212
##  Separated            :  939   Other-service  :3212   Wife          : 1406
##  Widowed              :  827   (Other)        :7585
##                  race           sex         capitalgain     capitalloss       hoursperweek
##  Amer-Indian-Eskimo:  286   Female: 9782   Min.   :    0   Min.   :   0.00   Min.   : 1.00
##  Asian-Pac-Islander:  895   Male  :20380   1st Qu.:    0   1st Qu.:   0.00   1st Qu.:40.00
##  Black             : 2817                  Median :    0   Median :   0.00   Median :40.00
##  Other             :  231                  Mean   : 1092   Mean   :  88.37   Mean   :40.93
##  White             :25933                  3rd Qu.:    0   3rd Qu.:   0.00   3rd Qu.:45.00
##                                            Max.   :99999   Max.   :4356.00   Max.   :99.00
##        nativecountry     label
##  United-States:27504   <=50K:22654
##  Mexico       :  610   >50K : 7508
##  Philippines  :  188
##  Germany      :  128
##  Puerto-Rico  :  109
##  Canada       :  107
##  (Other)      : 1516

# Construct a table with categorical variables only (fnlwgt and educationnum
# are dropped; the remaining numeric columns are discretized below):
D2 <- D[,c("age","workclass","education","maritalstatus","occupation",
           "relationship","race","sex","capitalgain","capitalloss",
           "hoursperweek","nativecountry","label")]

# Discretize age, capitalgain, capitalloss, hoursperweek into two bins each.
# The label is chosen explicitly per condition so that it matches its meaning
# (below the threshold -> "young" / "small" / "few"). The earlier indexing
# form c(a,b)[1+(x<t)] attached the labels the wrong way round.
D2[,"age"]          <- factor(ifelse(D2[,"age"] < 37, "young", "old"))
D2[,"capitalgain"]  <- factor(ifelse(D2[,"capitalgain"] < 1000, "small", "large"))
D2[,"capitalloss"]  <- factor(ifelse(D2[,"capitalloss"] < 100, "small", "large"))
D2[,"hoursperweek"] <- factor(ifelse(D2[,"hoursperweek"] <= 40, "few", "lots"))

summary(D2)
## (counts of the four discretized columns reflect the corrected labels)
##     age                  workclass            education                  maritalstatus
##  old  :15572   Private         :22286   HS-grad     :9840   Divorced             : 4214
##  young:14590   Self-emp-not-inc: 2499   Some-college:6678   Married-AF-spouse    :   21
##                Local-gov       : 2067   Bachelors   :5044   Married-civ-spouse   :14065
##                State-gov       : 1279   Masters     :1627   Married-spouse-absent:  370
##                Self-emp-inc    : 1074   Assoc-voc   :1307   Never-married        : 9726
##                Federal-gov     :  943   11th        :1048   Separated            :  939
##                (Other)         :   14   (Other)     :4618   Widowed              :  827
##            occupation           relationship                   race           sex
##  Prof-specialty :4038   Husband       :12463   Amer-Indian-Eskimo:  286   Female: 9782
##  Craft-repair   :4030   Not-in-family : 7726   Asian-Pac-Islander:  895   Male  :20380
##  Exec-managerial:3992   Other-relative:  889   Black             : 2817
##  Adm-clerical   :3721   Own-child     : 4466   Other             :  231
##  Sales          :3584   Unmarried     : 3212   White             :25933
##  Other-service  :3212   Wife          : 1406
##  (Other)        :7585
##  capitalgain   capitalloss   hoursperweek        nativecountry     label
##  large: 2492   large: 1427   few :20965   United-States:27504   <=50K:22654
##  small:27670   small:28735   lots: 9197   Mexico       :  610   >50K : 7508
##                                           Philippines  :  188
##                                           Germany      :  128
##                                           Puerto-Rico  :  109
##                                           Canada       :  107
##                                           (Other)      : 1516

##############################################################################
##############################################################################

# Let's construct a network using Bene at http://b-course.hiit.fi/bene
# Reference: Silander T, Myllymaki P (2006) A Simple Optimal Approach
# for Finding the Globally Optimal Bayesian Network Structure. In Proc
# 22nd Annual Conference on Uncertainty in Artificial Intelligence
# (UAI'06).
#
# We must transform each column such that its values are coded as integers
# starting from 0.

# Replace every column of D by zero-based integer codes.
#   D: a data frame (or matrix) whose columns are factors, character vectors
#      or numbers.
# Returns an object of the same shape as D. Factor columns become their level
# index minus one; character columns are first turned into factors (levels in
# sorted order) so they get codes instead of NA; other columns are passed
# through as.integer() and decremented by one.
numerize <- function(D) {
  res <- D
  for (col in colnames(D)) {
    values <- D[, col]
    # as.integer() on plain strings would give NA, so encode them as factors
    if (is.character(values)) values <- factor(values)
    res[, col] <- as.integer(values) - 1
  }
  res
}

# Pick random 1000 rows into D3 in numerical format (the sample is drawn by
# row name; call set.seed() beforehand if a reproducible sample is needed):
D3 <- numerize(D2)[sample(rownames(D2),size=1000),]

#for(row in rownames(D3)) {
#  for(col in colnames(D3)) cat(sprintf("%d ",D3[row,col]))
#  cat("\n")
#}

# Full categorical table, and the numeric sample without any header or row
# names (the plain matrix form to be pasted into Bene):
write.table(D2,file="adult.txt")
write.table(D3,file="adultN.txt",col.names=FALSE,row.names=FALSE)

# Print the zero-based column index next to each column name, one per line.
# seq_along() is used so that an empty column set prints nothing.
cat(sprintf("%d %s\n",seq_along(colnames(D3))-1L,colnames(D3)),sep="")

# Cut-and-paste the numerical output to http://b-course.hiit.fi/bene
# You can then analyze the resulting network.