ETL Step 2 - Tidy data

Published

March 28, 2024

This notebook tidies the raw raw_data_table from the Content DB database. The tidy data is written back to the Content DB database to the bike_model_data table.

Get data from database

Connect to Content DB to get the bike_raw_data and bike_station_info table.

con <- odbc::dbConnect(odbc::odbc(), "Content DB", timeout = 10)

bike_raw_data <- tbl(con, "bike_raw_data")
glimpse(bike_raw_data)
Rows: ??
Columns: 12
Database: postgres  [soleng@localhost:/rds]
$ station_id                <chr> "197e33c5-30cf-4007-b7a8-b9b1dfd064e3", "082…
$ num_bikes_available       <dbl> 6, 7, 9, 18, 9, 8, 3, 11, 2, 26, 9, 2, 3, 2,…
$ num_ebikes_available      <dbl> 0, 3, 3, 1, 6, 5, 1, 4, 0, 5, 0, 1, 1, 0, 1,…
$ num_bikes_disabled        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1,…
$ num_docks_available       <dbl> 5, 12, 10, 5, 6, 2, 9, 14, 17, 9, 8, 9, 6, 1…
$ num_docks_disabled        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ is_installed              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_renting                <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_returning              <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ last_reported             <dbl> 1703845015, 1703845015, 1703845012, 17038182…
$ eightd_has_available_keys <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ time                      <dttm> 2023-12-29 10:17:45, 2023-12-29 10:17:45, 2…
bike_station_info <- tbl(con, "bike_station_info")
glimpse(bike_station_info)
Rows: ??
Columns: 5
Database: postgres  [soleng@localhost:/rds]
$ station_id   <chr> "a2507ccc-aca1-48b2-a6a3-a3144ad92d21", "ee2cb739-9118-40…
$ name         <chr> "Connecticut Ave & R St NW", "37th & Ely Pl SE", "Dorr Av…
$ lat          <dbl> 38.91264, 38.88313, 38.87926, 39.08367, 38.81074, 38.9233…
$ lon          <dbl> -77.04564, -76.95001, -77.23358, -77.14916, -77.04463, -7…
$ last_updated <dttm> 2024-03-28 08:00:49, 2024-03-28 08:00:49, 2024-03-28 08:…

Tidy the data

Apply the data tidying steps. All of the tidying steps are performed in SQL. The results are written back to Content DB to the bike_model_data table.

if (odbc::dbExistsTable(con, "bike_model_data")) {
  odbc::dbRemoveTable(con, "bike_model_data")
}

# Build a SQL query to tidy the data.
query <- bike_raw_data %>% 
    group_by(
        id = station_id, 
        hour = hour(time), 
        date = date(time), 
        month = month(time), 
        dow = TRIM(to_char(time, "Day"))
    ) %>%
    summarize(
        n_bikes = mean(num_bikes_available, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    inner_join(
        select(bike_station_info, id = station_id, lat, lon)
    ) %>%
    dbplyr::sql_render() %>%
    stringr::str_replace("SELECT", "CREATE TABLE bike_model_data AS SELECT")

# Execute the SQL query.
odbc::dbSendQuery(con, query)
<OdbcResult>
  SQL  CREATE TABLE bike_model_data AS SELECT "LHS"."id" AS "id", "hour", "date", "month", "dow", "n_bikes", "lat", "lon"
FROM (SELECT "id", "hour", "date", "month", "dow", AVG("num_bikes_available") AS "n_bikes"
FROM (SELECT "station_id", "num_bikes_available", "num_ebikes_available", "num_bikes_disabled", "num_docks_available", "num_docks_disabled", "is_installed", "is_renting", "is_returning", "last_reported", "eightd_has_available_keys", "time", "station_id" AS "id", EXTRACT(hour FROM "time") AS "hour", date("time") AS "date", EXTRACT(MONTH FROM "time") AS "month", TRIM(to_char("time", 'Day')) AS "dow"
FROM "bike_raw_data") "q01"
GROUP BY "id", "hour", "date", "month", "dow") "LHS"
INNER JOIN (SELECT "station_id" AS "id", "lat", "lon"
FROM "bike_station_info") "RHS"
ON ("LHS"."id" = "RHS"."id")

  ROWS Fetched: 0 [complete]
       Changed: 2636038
# Preview the table.
tbl(con, "bike_model_data")
# Source:   table<bike_model_data> [?? x 8]
# Database: postgres [soleng@localhost:/rds]
   id                            hour date       month dow   n_bikes   lat   lon
   <chr>                        <dbl> <date>     <dbl> <chr>   <dbl> <dbl> <dbl>
 1 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-22     3 Wedn…       2  38.9 -76.9
 2 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-23     3 Thur…       2  38.9 -76.9
 3 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-24     3 Frid…       2  38.9 -76.9
 4 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-25     3 Satu…       2  38.9 -76.9
 5 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-26     3 Sund…       2  38.9 -76.9
 6 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-27     3 Mond…       2  38.9 -76.9
 7 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-28     3 Tues…       2  38.9 -76.9
 8 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-29     3 Wedn…       2  38.9 -76.9
 9 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-30     3 Thur…       2  38.9 -76.9
10 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-31     3 Frid…       2  38.9 -76.9
# … with more rows
odbc::dbDisconnect(con)
print("Complete 🎉")
[1] "Complete 🎉"