Overview

Predict whether income exceeds $50K/yr based on census data. Also known as “Adult” dataset. Extraction was done by Barry Becker from the 1994 Census database. Prediction task is to determine whether a person makes over 50K a year. See the data source and description for more information. These data are also used for demonstrating Tensorflow.

Exploratory data analysis

The biggest drivers for predicting income over $50k are: marital status (married is better), education (more is better), and sex (male is better). We will explore the continuous and categorical predictors before building statistical models. Data manipulation is carried out in dplyr and visualizations are done in ggplot2 and plotly.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)
library(plotly)

Download and read the data

The data can be downloaded from the web. The training and test data are 3.8 MB and 1.9 MB respectively. The missing values are converted from ? to NA.

download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
              "data/train_raw.csv")
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", 
              "data/test_raw.csv")

Create modeling data

Convert the target variable income_bracket into a numeric value. Create a new column age_buckets and remove records with missing values. Apply to both the test and training data. Create interactions if desired. Note: These interactions can be extremely time consuming to model, therefore they are examined here, but are not included in the predictive models.

format.rawdata <- function(data){
  data %>%
    mutate(label = ifelse(income_bracket == ">50K" | income_bracket == ">50K.", 1, 0)) %>%
    mutate(age_buckets = cut(age, c(16, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 90))) %>%
    select(label, gender, native_country, education, education_num, occupation, workclass, marital_status, 
           race, age_buckets) %>%
    na.omit
}
create.interactions <- function(data){
  data %>%
    mutate(education_x_occupation = paste(education, occupation, sep = ":")) %>%
    mutate(age_buckets_x_education_x_occupation = paste(age_buckets, education, occupation, sep = ":"))
}
train <- train_raw %>% format.rawdata #%>% create.interactions
test  <- test_raw  %>% format.rawdata #%>% create.interactions

Plot categorical columns

Most of the columns in the census data are categorical. We plot a few of the most important columns here. The complete list of categorical columns are:

plot.main.effects <- function(data, x, y){
  p <- data %>%
    mutate_(group = x, metric = y) %>%
    group_by(group) %>%
    summarize(percent = 100 * mean(metric)) %>%
    ggplot(aes(x = reorder(group, percent), percent)) +
    geom_bar(stat="identity", fill = "lightblue4") +
    coord_flip() +
    labs(y = "Percent", x = "") +
    ggtitle(paste("Percent surveyed with incomes over $50k by", x))
  ggplotly(p, tooltip = "percent")
}
plot.main.effects(train, "marital_status", "label")

plot.main.effects(train, "gender", "label")

plot.main.effects(train, "education", "label")

Plot continuous columns

We can compare the distribution of the categorical variables for those who earn more than $50k and those who earn less. The complete list of categorical variables are:

plot.continuous <- function(data, x, y, alpha = 0.2, ...){ 
  lab <- stringr::str_replace_all(y, "_", " ") %>% stringr::str_to_title(y)
  data %>%
    select_(groups = x, y = y) %>%
    na.omit %>%
    ggplot(aes(y, fill = groups)) + geom_density(alpha = alpha, ...) +
    labs(x = lab, y = "") +
    ggtitle(paste0("Income by ", lab))
}
# People who earn more also work more, are better educated, and are older
plot.continuous(train_raw, "income_bracket", "age")

plot.continuous(train_raw, "income_bracket", "education_num", adjust = 5)

plot.continuous(train_raw, "income_bracket", "hours_per_week", adjust = 5)

Plot interactions

We can examine some two-way and three-way intearcations with choropleth maps:

p <- train %>%
  select(education_num, age_buckets, label) %>%
  group_by(age_buckets, education_num) %>%
  summarize(percent = 100 * mean(label)) %>%
  ggplot(aes(education_num, age_buckets, fill = percent)) +
  geom_tile() +
  labs(x = "Education", y = "Age") +
  ggtitle("Percent surveyed with incomes over $50k by age, education")
ggplotly(p)

p <- train %>%
  select(age_buckets, education_num, occupation, label) %>%
  group_by(age_buckets, education_num, occupation) %>%
  summarize(percent = 100 * mean(label)) %>%
  ggplot(aes(education_num, age_buckets, fill = percent)) +
  geom_tile() +
  facet_wrap( ~ occupation) +
  labs(x = "Education", y = "Age") +
  ggtitle("Percent surveyed with incomes over $50k by age, education, and occupation")
ggplotly(p)

Save

Finally, we save the test and training data so we can use them in susequent analysis.

write_csv(train, "data/train.csv")
write_csv(test, "data/test.csv")
LS0tCnRpdGxlOiAiQ2Vuc3VzIERhdGEgRXhwbG9yYXRvcnkgQW5hbHlzaXMiCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0CiAgaHRtbF9kb2N1bWVudDogZGVmYXVsdAotLS0KCiMjIE92ZXJ2aWV3CgpQcmVkaWN0IHdoZXRoZXIgaW5jb21lIGV4Y2VlZHMgXCQ1MEsveXIgYmFzZWQgb24gY2Vuc3VzIGRhdGEuIEFsc28ga25vd24gYXMgIkFkdWx0IiBkYXRhc2V0LiBFeHRyYWN0aW9uIHdhcyBkb25lIGJ5IEJhcnJ5IEJlY2tlciBmcm9tIHRoZSAxOTk0IENlbnN1cyBkYXRhYmFzZS4gUHJlZGljdGlvbiB0YXNrIGlzIHRvIGRldGVybWluZSB3aGV0aGVyIGEgcGVyc29uIG1ha2VzIG92ZXIgNTBLIGEgeWVhci4gU2VlIHRoZSBbZGF0YSBzb3VyY2VdKGh0dHBzOi8vYXJjaGl2ZS5pY3MudWNpLmVkdS9tbC9kYXRhc2V0cy9DZW5zdXMrSW5jb21lKSBhbmQgW2Rlc2NyaXB0aW9uXShodHRwczovL2FyY2hpdmUuaWNzLnVjaS5lZHUvbWwvbWFjaGluZS1sZWFybmluZy1kYXRhYmFzZXMvYWR1bHQvYWR1bHQubmFtZXMpIGZvciBtb3JlIGluZm9ybWF0aW9uLiBUaGVzZSBkYXRhIGFyZSBhbHNvIHVzZWQgZm9yIGRlbW9uc3RyYXRpbmcgW1RlbnNvcmZsb3ddKGh0dHBzOi8vd3d3LnRlbnNvcmZsb3cub3JnL3R1dG9yaWFscy93aWRlKS4KCiMjIEV4cGxvcmF0b3J5IGRhdGEgYW5hbHlzaXMKClRoZSBiaWdnZXN0IGRyaXZlcnMgZm9yIHByZWRpY3RpbmcgaW5jb21lIG92ZXIgXCQ1MGsgYXJlOiBtYXJpdGFsIHN0YXR1cyAobWFycmllZCBpcyBiZXR0ZXIpLCBlZHVjYXRpb24gKG1vcmUgaXMgYmV0dGVyKSwgYW5kIHNleCAobWFsZSBpcyBiZXR0ZXIpLiBXZSB3aWxsIGV4cGxvcmUgdGhlIGNvbnRpbnVvdXMgYW5kIGNhdGVnb3JpY2FsIHByZWRpY3RvcnMgYmVmb3JlIGJ1aWxkaW5nIHN0YXRpc3RpY2FsIG1vZGVscy4gRGF0YSBtYW5pcHVsYXRpb24gaXMgY2FycmllZCBvdXQgaW4gYGRwbHlyYCBhbmQgdmlzdWFsaXphdGlvbnMgYXJlIGRvbmUgaW4gYGdncGxvdDJgIGFuZCBgcGxvdGx5YC4KCmBgYHtyIHNldHVwLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQprbml0cjo6b3B0c19jaHVuayRzZXQod2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0UpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHBsb3RseSkKYGBgCgojIyBEb3dubG9hZCBhbmQgcmVhZCB0aGUgZGF0YQoKVGhlIGRhdGEgY2FuIGJlIGRvd25sb2FkZWQgZnJvbSB0aGUgd2ViLiBUaGUgdHJhaW5pbmcgYW5kIHRlc3QgZGF0YSBhcmUgMy44IE1CIGFuZCAxLjkgTUIgcmVzcGVjdGl2ZWx5LiBUaGUgbWlzc2luZyB2YWx1ZXMgYXJlIGNvbnZlcnRlZCBmcm9tIGA/YCB0byBgTkFgLgoKYGBge3IsIGV2YWw9RkFMU0UsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmRvd25sb2FkLmZpbGUoImh0dHBzOi8vYXJjaGl2ZS5pY3MudWNpLmVkdS9tbC9tYWNoaW5lLWxlYXJuaW5nLWRhdGFiYXNlcy9hZHVsdC9hZHVsdC5kYXRhIiwKICAgICAgICAgICAgICAiZGF0YS90cmFpbl9yYXcuY3N2IikKZG93bmxvYWQuZmlsZSgiaHR0cHM6Ly9hcmNoaXZlLmljcy51Y2kuZWR1L21sL21hY2hpbmUtbGVhcm5pbmctZGF0YWJhc2VzL2FkdWx0L2FkdWx0LnRlc3QiLCAKICAgICAgICAgICAgICAiZGF0YS90ZXN0X3Jhdy5jc3YiKQpgYGAKCmBgYHtyLCBldmFsPUZBTFNFLCBpbmNsdWRlPUZBTFNFfQojIEFsdGVybmF0aXZlIGRvd25sb2FkIGxvY2F0aW9uCiNkb3dubG9hZC5maWxlKCJodHRwOi8vbWxyLmNzLnVtYXNzLmVkdS9tbC9tYWNoaW5lLWxlYXJuaW5nLWRhdGFiYXNlcy9hZHVsdC9hZHVsdC5kYXRhIiwgImRhdGEvdHJhaW5fcmF3LmNzdiIpCiNkb3dubG9hZC5maWxlKCJodHRwOi8vbWxyLmNzLnVtYXNzLmVkdS9tbC9tYWNoaW5lLWxlYXJuaW5nLWRhdGFiYXNlcy9hZHVsdC9hZHVsdC50ZXN0IiwgImRhdGEvdGVzdF9yYXcuY3N2IikKYGBgCgpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgZWNobz1GQUxTRSwgd2FybmluZz1GQUxTRX0KY29sX25hbWVzID0gYygKICAiYWdlIiwgIndvcmtjbGFzcyIsICJmbmx3Z3QiLCAiZWR1Y2F0aW9uIiwgImVkdWNhdGlvbl9udW0iLAogICJtYXJpdGFsX3N0YXR1cyIsICJvY2N1cGF0aW9uIiwgInJlbGF0aW9uc2hpcCIsICJyYWNlIiwgImdlbmRlciIsCiAgImNhcGl0YWxfZ2FpbiIsICJjYXBpdGFsX2xvc3MiLCAiaG91cnNfcGVyX3dlZWsiLCAibmF0aXZlX2NvdW50cnkiLAogICJpbmNvbWVfYnJhY2tldCIKKQoKdHJhaW5fcmF3IDwtIHJlYWRfY3N2KCJkYXRhL3RyYWluX3Jhdy5jc3YiLCBjb2xfbmFtZXMgPSBjb2xfbmFtZXMsIG5hID0gIj8iKQp0ZXN0X3JhdyAgPC0gcmVhZF9jc3YoImRhdGEvdGVzdF9yYXcuY3N2IiwgY29sX25hbWVzID0gY29sX25hbWVzLCBuYSA9ICI/Iiwgc2tpcCA9IDEpCmBgYAoKIyMgQ3JlYXRlIG1vZGVsaW5nIGRhdGEKCkNvbnZlcnQgdGhlIHRhcmdldCB2YXJpYWJsZSBgaW5jb21lX2JyYWNrZXRgIGludG8gYSBudW1lcmljIHZhbHVlLiBDcmVhdGUgYSBuZXcgY29sdW1uIGBhZ2VfYnVja2V0c2AgYW5kIHJlbW92ZSByZWNvcmRzIHdpdGggbWlzc2luZyB2YWx1ZXMuIEFwcGx5IHRvIGJvdGggdGhlIHRlc3QgYW5kIHRyYWluaW5nIGRhdGEuIENyZWF0ZSBpbnRlcmFjdGlvbnMgaWYgZGVzaXJlZC4gTm90ZTogVGhlc2UgaW50ZXJhY3Rpb25zIGNhbiBiZSBleHRyZW1lbHkgdGltZSBjb25zdW1pbmcgdG8gbW9kZWwsIHRoZXJlZm9yZSB0aGV5IGFyZSBleGFtaW5lZCBoZXJlLCBidXQgYXJlIG5vdCBpbmNsdWRlZCBpbiB0aGUgcHJlZGljdGl2ZSBtb2RlbHMuCgpgYGB7cn0KZm9ybWF0LnJhd2RhdGEgPC0gZnVuY3Rpb24oZGF0YSl7CiAgZGF0YSAlPiUKICAgIG11dGF0ZShsYWJlbCA9IGlmZWxzZShpbmNvbWVfYnJhY2tldCA9PSAiPjUwSyIgfCBpbmNvbWVfYnJhY2tldCA9PSAiPjUwSy4iLCAxLCAwKSkgJT4lCiAgICBtdXRhdGUoYWdlX2J1Y2tldHMgPSBjdXQoYWdlLCBjKDE2LCAxOCwgMjUsIDMwLCAzNSwgNDAsIDQ1LCA1MCwgNTUsIDYwLCA2NSwgOTApKSkgJT4lCiAgICBzZWxlY3QobGFiZWwsIGdlbmRlciwgbmF0aXZlX2NvdW50cnksIGVkdWNhdGlvbiwgZWR1Y2F0aW9uX251bSwgb2NjdXBhdGlvbiwgd29ya2NsYXNzLCBtYXJpdGFsX3N0YXR1cywgCiAgICAgICAgICAgcmFjZSwgYWdlX2J1Y2tldHMpICU+JQogICAgbmEub21pdAp9CgpjcmVhdGUuaW50ZXJhY3Rpb25zIDwtIGZ1bmN0aW9uKGRhdGEpewogIGRhdGEgJT4lCiAgICBtdXRhdGUoZWR1Y2F0aW9uX3hfb2NjdXBhdGlvbiA9IHBhc3RlKGVkdWNhdGlvbiwgb2NjdXBhdGlvbiwgc2VwID0gIjoiKSkgJT4lCiAgICBtdXRhdGUoYWdlX2J1Y2tldHNfeF9lZHVjYXRpb25feF9vY2N1cGF0aW9uID0gcGFzdGUoYWdlX2J1Y2tldHMsIGVkdWNhdGlvbiwgb2NjdXBhdGlvbiwgc2VwID0gIjoiKSkKfQoKdHJhaW4gPC0gdHJhaW5fcmF3ICU+JSBmb3JtYXQucmF3ZGF0YSAjJT4lIGNyZWF0ZS5pbnRlcmFjdGlvbnMKdGVzdCAgPC0gdGVzdF9yYXcgICU+JSBmb3JtYXQucmF3ZGF0YSAjJT4lIGNyZWF0ZS5pbnRlcmFjdGlvbnMKYGBgCgojIyBQbG90IGNhdGVnb3JpY2FsIGNvbHVtbnMKCk1vc3Qgb2YgdGhlIGNvbHVtbnMgaW4gdGhlIGNlbnN1cyBkYXRhIGFyZSBjYXRlZ29yaWNhbC4gV2UgcGxvdCBhIGZldyBvZiB0aGUgbW9zdCBpbXBvcnRhbnQgY29sdW1ucyBoZXJlLiBUaGUgY29tcGxldGUgbGlzdCBvZiBjYXRlZ29yaWNhbCBjb2x1bW5zIGFyZToKCiogd29ya2NsYXNzCiogZWR1Y2F0aW9uCiogbWFyaXRhbF9zdGF0dXMKKiBvY2N1cGF0aW9uCiogcmVsYXRpb25zaGlwCiogcmFjZQoqIGdlbmRlcgoqIG5hdGl2ZV9jb3VudHJ5CgpgYGB7cn0KcGxvdC5tYWluLmVmZmVjdHMgPC0gZnVuY3Rpb24oZGF0YSwgeCwgeSl7CiAgcCA8LSBkYXRhICU+JQogICAgbXV0YXRlXyhncm91cCA9IHgsIG1ldHJpYyA9IHkpICU+JQogICAgZ3JvdXBfYnkoZ3JvdXApICU+JQogICAgc3VtbWFyaXplKHBlcmNlbnQgPSAxMDAgKiBtZWFuKG1ldHJpYykpICU+JQogICAgZ2dwbG90KGFlcyh4ID0gcmVvcmRlcihncm91cCwgcGVyY2VudCksIHBlcmNlbnQpKSArCiAgICBnZW9tX2JhcihzdGF0PSJpZGVudGl0eSIsIGZpbGwgPSAibGlnaHRibHVlNCIpICsKICAgIGNvb3JkX2ZsaXAoKSArCiAgICBsYWJzKHkgPSAiUGVyY2VudCIsIHggPSAiIikgKwogICAgZ2d0aXRsZShwYXN0ZSgiUGVyY2VudCBzdXJ2ZXllZCB3aXRoIGluY29tZXMgb3ZlciAkNTBrIGJ5IiwgeCkpCiAgZ2dwbG90bHkocCwgdG9vbHRpcCA9ICJwZXJjZW50IikKfQoKcGxvdC5tYWluLmVmZmVjdHModHJhaW4sICJtYXJpdGFsX3N0YXR1cyIsICJsYWJlbCIpCnBsb3QubWFpbi5lZmZlY3RzKHRyYWluLCAiZ2VuZGVyIiwgImxhYmVsIikKcGxvdC5tYWluLmVmZmVjdHModHJhaW4sICJlZHVjYXRpb24iLCAibGFiZWwiKQpgYGAKCiMjIFBsb3QgY29udGludW91cyBjb2x1bW5zCgpXZSBjYW4gY29tcGFyZSB0aGUgZGlzdHJpYnV0aW9uIG9mIHRoZSBjYXRlZ29yaWNhbCB2YXJpYWJsZXMgZm9yIHRob3NlIHdobyBlYXJuIG1vcmUgdGhhbiBcJDUwayBhbmQgdGhvc2Ugd2hvIGVhcm4gbGVzcy4gVGhlIGNvbXBsZXRlIGxpc3Qgb2YgY2F0ZWdvcmljYWwgdmFyaWFibGVzIGFyZToKCiogYWdlCiogZWR1Y2F0aW9uX251bQoqIGNhcGl0YWxfZ2FpbgoqIGNhcGl0YWxfbG9zcwoqIGhvdXJzX3Blcl93ZWVrCgpgYGB7cn0KcGxvdC5jb250aW51b3VzIDwtIGZ1bmN0aW9uKGRhdGEsIHgsIHksIGFscGhhID0gMC4yLCAuLi4peyAKICBsYWIgPC0gc3RyaW5ncjo6c3RyX3JlcGxhY2VfYWxsKHksICJfIiwgIiAiKSAlPiUgc3RyaW5ncjo6c3RyX3RvX3RpdGxlKHkpCiAgZGF0YSAlPiUKICAgIHNlbGVjdF8oZ3JvdXBzID0geCwgeSA9IHkpICU+JQogICAgbmEub21pdCAlPiUKICAgIGdncGxvdChhZXMoeSwgZmlsbCA9IGdyb3VwcykpICsgZ2VvbV9kZW5zaXR5KGFscGhhID0gYWxwaGEsIC4uLikgKwogICAgbGFicyh4ID0gbGFiLCB5ID0gIiIpICsKICAgIGdndGl0bGUocGFzdGUwKCJJbmNvbWUgYnkgIiwgbGFiKSkKfQoKIyBQZW9wbGUgd2hvIGVhcm4gbW9yZSBhbHNvIHdvcmsgbW9yZSwgYXJlIGJldHRlciBlZHVjYXRlZCwgYW5kIGFyZSBvbGRlcgpwbG90LmNvbnRpbnVvdXModHJhaW5fcmF3LCAiaW5jb21lX2JyYWNrZXQiLCAiYWdlIikKcGxvdC5jb250aW51b3VzKHRyYWluX3JhdywgImluY29tZV9icmFja2V0IiwgImVkdWNhdGlvbl9udW0iLCBhZGp1c3QgPSA1KQpwbG90LmNvbnRpbnVvdXModHJhaW5fcmF3LCAiaW5jb21lX2JyYWNrZXQiLCAiaG91cnNfcGVyX3dlZWsiLCBhZGp1c3QgPSA1KQoKYGBgCgoKIyMgUGxvdCBpbnRlcmFjdGlvbnMKCldlIGNhbiBleGFtaW5lIHNvbWUgdHdvLXdheSBhbmQgdGhyZWUtd2F5IGludGVhcmNhdGlvbnMgd2l0aCBjaG9yb3BsZXRoIG1hcHM6CgpgYGB7cn0KcCA8LSB0cmFpbiAlPiUKICBzZWxlY3QoZWR1Y2F0aW9uX251bSwgYWdlX2J1Y2tldHMsIGxhYmVsKSAlPiUKICBncm91cF9ieShhZ2VfYnVja2V0cywgZWR1Y2F0aW9uX251bSkgJT4lCiAgc3VtbWFyaXplKHBlcmNlbnQgPSAxMDAgKiBtZWFuKGxhYmVsKSkgJT4lCiAgZ2dwbG90KGFlcyhlZHVjYXRpb25fbnVtLCBhZ2VfYnVja2V0cywgZmlsbCA9IHBlcmNlbnQpKSArCiAgZ2VvbV90aWxlKCkgKwogIGxhYnMoeCA9ICJFZHVjYXRpb24iLCB5ID0gIkFnZSIpICsKICBnZ3RpdGxlKCJQZXJjZW50IHN1cnZleWVkIHdpdGggaW5jb21lcyBvdmVyICQ1MGsgYnkgYWdlLCBlZHVjYXRpb24iKQpnZ3Bsb3RseShwKQoKcCA8LSB0cmFpbiAlPiUKICBzZWxlY3QoYWdlX2J1Y2tldHMsIGVkdWNhdGlvbl9udW0sIG9jY3VwYXRpb24sIGxhYmVsKSAlPiUKICBncm91cF9ieShhZ2VfYnVja2V0cywgZWR1Y2F0aW9uX251bSwgb2NjdXBhdGlvbikgJT4lCiAgc3VtbWFyaXplKHBlcmNlbnQgPSAxMDAgKiBtZWFuKGxhYmVsKSkgJT4lCiAgZ2dwbG90KGFlcyhlZHVjYXRpb25fbnVtLCBhZ2VfYnVja2V0cywgZmlsbCA9IHBlcmNlbnQpKSArCiAgZ2VvbV90aWxlKCkgKwogIGZhY2V0X3dyYXAoIH4gb2NjdXBhdGlvbikgKwogIGxhYnMoeCA9ICJFZHVjYXRpb24iLCB5ID0gIkFnZSIpICsKICBnZ3RpdGxlKCJQZXJjZW50IHN1cnZleWVkIHdpdGggaW5jb21lcyBvdmVyICQ1MGsgYnkgYWdlLCBlZHVjYXRpb24sIGFuZCBvY2N1cGF0aW9uIikKZ2dwbG90bHkocCkKCmBgYAoKIyMgU2F2ZQoKRmluYWxseSwgd2Ugc2F2ZSB0aGUgdGVzdCBhbmQgdHJhaW5pbmcgZGF0YSBzbyB3ZSBjYW4gdXNlIHRoZW0gaW4gc3VzZXF1ZW50IGFuYWx5c2lzLgoKYGBge3J9CndyaXRlX2Nzdih0cmFpbiwgImRhdGEvdHJhaW4uY3N2IikKd3JpdGVfY3N2KHRlc3QsICJkYXRhL3Rlc3QuY3N2IikKYGBgCgo=