# $Id: adult.R,v 1.1 2007/09/21 13:26:59 kaip Exp $

#
# Copyright (c) 2007 Kai Puolamaki <Kai.Puolamaki@iki.fi>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# T-61.3050 Machine Learning: Basic Principles
# Example for the lecture of 25 September 2007
#
# Adults database. For details and references see:
# ftp://ftp.ics.uci.edu/pub/machine-learning-databases/adult/adult.names

# Load the raw data. The file has no header row (column names are assigned
# afterwards), "?" marks a missing value, and fields carry padding whitespace.
# stringsAsFactors=TRUE is given explicitly: since R 4.0.0 read.csv() returns
# character columns by default, but the rest of this script expects factors.
D <- read.csv("ftp://ftp.ics.uci.edu/pub/machine-learning-databases/adult/adult.data",
              header=FALSE,na.strings=c("?"),strip.white=TRUE,
              stringsAsFactors=TRUE)

# Or download the file adult.data first and issue:
# D <- read.csv("adult.data",header=FALSE,na.strings=c("?"),strip.white=TRUE,
#               stringsAsFactors=TRUE)
# Give the 15 columns descriptive names, in file order.
names(D) <- c(
  "age", "workclass", "fnlwgt", "education", "educationnum",
  "maritalstatus", "occupation", "relationship", "race",
  "sex", "capitalgain", "capitalloss", "hoursperweek",
  "nativecountry", "label"
)
# Drop every row that has at least one missing value.
D <- na.omit(D)

summary(D)
##       age                   workclass         fnlwgt               education     educationnum  
##  Min.   :17.00   Private         :22286   Min.   :  13769   HS-grad     :9840   Min.   : 1.00  
##  1st Qu.:28.00   Self-emp-not-inc: 2499   1st Qu.: 117627   Some-college:6678   1st Qu.: 9.00  
##  Median :37.00   Local-gov       : 2067   Median : 178425   Bachelors   :5044   Median :10.00  
##  Mean   :38.44   State-gov       : 1279   Mean   : 189794   Masters     :1627   Mean   :10.12  
##  3rd Qu.:47.00   Self-emp-inc    : 1074   3rd Qu.: 237628   Assoc-voc   :1307   3rd Qu.:13.00  
##  Max.   :90.00   Federal-gov     :  943   Max.   :1484705   11th        :1048   Max.   :16.00  
##                  (Other)         :   14                     (Other)     :4618                  
##                maritalstatus             occupation           relationship  
##  Divorced             : 4214   Prof-specialty :4038   Husband       :12463  
##  Married-AF-spouse    :   21   Craft-repair   :4030   Not-in-family : 7726  
##  Married-civ-spouse   :14065   Exec-managerial:3992   Other-relative:  889  
##  Married-spouse-absent:  370   Adm-clerical   :3721   Own-child     : 4466  
##  Never-married        : 9726   Sales          :3584   Unmarried     : 3212  
##  Separated            :  939   Other-service  :3212   Wife          : 1406  
##  Widowed              :  827   (Other)        :7585                         
##                  race           sex         capitalgain      capitalloss       hoursperweek  
##  Amer-Indian-Eskimo:  286   Female: 9782   Min.   :     0   Min.   :   0.00   Min.   : 1.00  
##  Asian-Pac-Islander:  895   Male  :20380   1st Qu.:     0   1st Qu.:   0.00   1st Qu.:40.00  
##  Black             : 2817                  Median :     0   Median :   0.00   Median :40.00  
##  Other             :  231                  Mean   :  1092   Mean   :  88.37   Mean   :40.93  
##  White             :25933                  3rd Qu.:     0   3rd Qu.:   0.00   3rd Qu.:45.00  
##                                            Max.   : 99999   Max.   :4356.00   Max.   :99.00  
##        nativecountry     label      
##  United-States:27504   <=50K:22654  
##  Mexico       :  610   >50K : 7508  
##  Philippines  :  188                
##  Germany      :  128                
##  Puerto-Rico  :  109                
##  Canada       :  107                
##  (Other)      : 1516                

# Construct a table with categorical variables only (fnlwgt and educationnum
# are dropped; the remaining numeric columns are discretized below):
D2 <- D[,c("age","workclass","education","maritalstatus","occupation","relationship","race","sex",
           "capitalgain","capitalloss","hoursperweek","nativecountry","label")]
# Discretize age, capitalgain, capitalloss, hoursperweek into two classes each.
# The label for the TRUE branch is the one describing values below the
# threshold (age < 37 is "young", gain < 1000 is "small", and so on).
D2[,"age"]          <- factor(ifelse(D2[,"age"]<37,"young","old"))
D2[,"capitalgain"]  <- factor(ifelse(D2[,"capitalgain"]<1000,"small","large"))
D2[,"capitalloss"]  <- factor(ifelse(D2[,"capitalloss"]<100,"small","large"))
D2[,"hoursperweek"] <- factor(ifelse(D2[,"hoursperweek"]<=40,"few","lots"))
summary(D2)
##     age                   workclass            education                  maritalstatus  
##  old  :15572   Private         :22286   HS-grad     :9840   Divorced             : 4214  
##  young:14590   Self-emp-not-inc: 2499   Some-college:6678   Married-AF-spouse    :   21  
##                Local-gov       : 2067   Bachelors   :5044   Married-civ-spouse   :14065  
##                State-gov       : 1279   Masters     :1627   Married-spouse-absent:  370  
##                Self-emp-inc    : 1074   Assoc-voc   :1307   Never-married        : 9726  
##                Federal-gov     :  943   11th        :1048   Separated            :  939  
##                (Other)         :   14   (Other)     :4618   Widowed              :  827  
##            occupation           relationship                   race           sex       
##  Prof-specialty :4038   Husband       :12463   Amer-Indian-Eskimo:  286   Female: 9782  
##  Craft-repair   :4030   Not-in-family : 7726   Asian-Pac-Islander:  895   Male  :20380  
##  Exec-managerial:3992   Other-relative:  889   Black             : 2817                 
##  Adm-clerical   :3721   Own-child     : 4466   Other             :  231                 
##  Sales          :3584   Unmarried     : 3212   White             :25933                 
##  Other-service  :3212   Wife          : 1406                                            
##  (Other)        :7585                                                                   
##  capitalgain   capitalloss   hoursperweek       nativecountry     label      
##  large: 2492   large: 1427   few :20965   United-States:27504   <=50K:22654  
##  small:27670   small:28735   lots: 9197   Mexico       :  610   >50K : 7508  
##                                           Philippines  :  188                
##                                           Germany      :  128                
##                                           Puerto-Rico  :  109                
##                                           Canada       :  107                
##                                           (Other)      : 1516                


##############################################################################
##############################################################################
# Let's construct a network using Bene at http://b-course.hiit.fi/bene
# Reference: Silander T, Myllymaki P (2006) A Simple Optimal Approach
# for Finding the Globally Optimal Bayesian Network Structure. In Proc
# 22nd Annual Conference on Uncertainty in Artificial Intelligence
# (UAI'06).
#
# We must recode each column so that its n factor levels become the integers 0 to n-1:


# Recode every column of D as zero-based numbers.
#
# Args:
#   D: a data frame (or matrix with column names) whose columns are factors
#      or character vectors.
# Returns:
#   An object shaped like D in which each column holds the level code of the
#   original value minus one, so a column with n levels maps to 0 .. n-1.
#   Levels are numbered in the factor's level order; character columns are
#   converted with factor() first, which orders levels by sorting.
#   Numeric columns are not treated as categories: they are simply truncated
#   by as.integer() and decremented by one.
numerize <- function(D) {
  res <- D
  for (col in colnames(D)) {
    values <- D[, col]
    # as.integer() on raw strings would give NA (with a warning); going
    # through factor() yields proper level codes instead.
    if (is.character(values)) values <- factor(values)
    res[, col] <- as.integer(values) - 1
  }
  res
}

# Recode the whole table numerically, then keep 1000 rows drawn at random
# (without replacement) by row name:
D3 <- numerize(D2)
D3 <- D3[sample(rownames(D2), size = 1000), ]
#for(row in rownames(D3)) {
#  for(col in colnames(D3)) cat(sprintf("%d ",D3[row,col]))
#  cat("\n")
#}

# Save the full categorical table with write.table()'s default header and row
# names, and the numeric sample as bare values only (no header, no row names).
write.table(D2, file = "adult.txt")
write.table(D3, file = "adultN.txt", row.names = FALSE, col.names = FALSE)

# Print one "<zero-based index> <column name>" line per column of D3, i.e. the
# key for reading the header-less adultN.txt. sprintf() is vectorized, so no
# loop is needed, and seq_along() prints nothing when there are no columns
# (1:length(x) would count 1, 0 instead).
cat(sprintf("%d %s\n", seq_along(colnames(D3)) - 1L, colnames(D3)), sep = "")

# Cut-and-paste the numerical output to http://b-course.hiit.fi/bene
# You can then analyze the resulting network.