ETL Step 2 - Tidy data

Published

April 17, 2024

This notebook tidies the raw raw_data_table from the Content DB database. The tidy data is written back to the Content DB database to the bike_model_data table.

Get data from database

Connect to Content DB to get the bike_raw_data and bike_station_info table.

con <- odbc::dbConnect(odbc::odbc(), "Content DB", timeout = 10)

bike_raw_data <- tbl(con, "bike_raw_data")
glimpse(bike_raw_data)
Rows: ??
Columns: 12
Database: postgres  [soleng@localhost:/rds]
$ station_id                <chr> "0824f644-1f3f-11e7-bf6b-3863bb334450", "082…
$ num_bikes_available       <dbl> 5, 0, 1, 4, 3, 6, 1, 0, 4, 3, 6, 9, 2, 19, 8…
$ num_ebikes_available      <dbl> 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,…
$ num_bikes_disabled        <dbl> 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,…
$ num_docks_available       <dbl> 17, 11, 10, 13, 12, 17, 22, 15, 11, 15, 9, 5…
$ num_docks_disabled        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ is_installed              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_renting                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_returning              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ last_reported             <dbl> 1703348257, 1703348258, 1703348258, 17033482…
$ eightd_has_available_keys <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ time                      <dttm> 2023-12-23 16:18:14, 2023-12-23 16:18:14, 2…
bike_station_info <- tbl(con, "bike_station_info")
glimpse(bike_station_info)
Rows: ??
Columns: 4
Database: postgres  [soleng@localhost:/rds]
$ station_id <chr> "1", "2", "3", "4", "5", "6", "7", "8", "10", "11", "12", "…
$ name       <chr> "Eads St & 15th St S", "18th St & S Eads St", "Crystal Dr &…
$ lat        <dbl> 38.85897, 38.85725, 38.85643, 38.86017, 38.85787, 38.86230,…
$ lon        <dbl> -77.05323, -77.05332, -77.04923, -77.04959, -77.05949, -77.…

Tidy the data

Apply the data tidying steps. All of the tidying steps are performed in SQL. The results are written back to Content DB to the bike_model_data table.

if (odbc::dbExistsTable(con, "bike_model_data")) {
  odbc::dbRemoveTable(con, "bike_model_data")
}

# Build a SQL query to tidy the data.
query <- bike_raw_data %>% 
    group_by(
        id = station_id, 
        hour = hour(time), 
        date = date(time), 
        month = month(time), 
        dow = TRIM(to_char(time, "Day"))
    ) %>%
    summarize(
        n_bikes = mean(num_bikes_available, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    inner_join(
        select(bike_station_info, id = station_id, lat, lon)
    ) %>%
    dbplyr::sql_render() %>%
    stringr::str_replace("SELECT", "CREATE TABLE bike_model_data AS SELECT")

# Execute the SQL query.
odbc::dbSendQuery(con, query)
<OdbcResult>
  SQL  CREATE TABLE bike_model_data AS SELECT "LHS"."id" AS "id", "hour", "date", "month", "dow", "n_bikes", "lat", "lon"
FROM (SELECT "id", "hour", "date", "month", "dow", AVG("num_bikes_available") AS "n_bikes"
FROM (SELECT "station_id", "num_bikes_available", "num_ebikes_available", "num_bikes_disabled", "num_docks_available", "num_docks_disabled", "is_installed", "is_renting", "is_returning", "last_reported", "eightd_has_available_keys", "time", "station_id" AS "id", EXTRACT(hour FROM "time") AS "hour", date("time") AS "date", EXTRACT(MONTH FROM "time") AS "month", TRIM(to_char("time", 'Day')) AS "dow"
FROM "bike_raw_data") "q01"
GROUP BY "id", "hour", "date", "month", "dow") "LHS"
INNER JOIN (SELECT "station_id" AS "id", "lat", "lon"
FROM "bike_station_info") "RHS"
ON ("LHS"."id" = "RHS"."id")

  ROWS Fetched: 0 [complete]
       Changed: 15060921
# Preview the table.
tbl(con, "bike_model_data")
# Source:   table<bike_model_data> [?? x 8]
# Database: postgres [soleng@localhost:/rds]
   id     hour date       month dow       n_bikes   lat   lon
   <chr> <dbl> <date>     <dbl> <chr>       <dbl> <dbl> <dbl>
 1 1         0 2019-09-21     9 Saturday     4.33  38.9 -77.1
 2 1         0 2019-09-22     9 Sunday       3     38.9 -77.1
 3 1         0 2019-09-23     9 Monday      11.3   38.9 -77.1
 4 1         0 2019-09-24     9 Tuesday      9     38.9 -77.1
 5 1         0 2019-09-25     9 Wednesday    8     38.9 -77.1
 6 1         0 2019-09-26     9 Thursday     5     38.9 -77.1
 7 1         0 2019-09-27     9 Friday      10     38.9 -77.1
 8 1         0 2019-09-28     9 Saturday    14     38.9 -77.1
 9 1         0 2019-09-29     9 Sunday       0     38.9 -77.1
10 1         0 2019-09-30     9 Monday       9     38.9 -77.1
# … with more rows
odbc::dbDisconnect(con)
print("Complete 🎉")
[1] "Complete 🎉"