Load Libraries and connect to board

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.4
## ✔ tibble  3.0.1     ✔ dplyr   1.0.0
## ✔ tidyr   1.1.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.0 ──
## ✔ broom     0.5.6      ✔ rsample   0.0.7 
## ✔ dials     0.0.7      ✔ tune      0.1.0 
## ✔ infer     0.5.2      ✔ workflows 0.1.1 
## ✔ parsnip   0.1.1      ✔ yardstick 0.0.6 
## ✔ recipes   0.1.13
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(bikeHelpR)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice

Get data

con <- DBI::dbConnect(odbc::odbc(), "Content DB")
pins::board_register_rsconnect(server = Sys.getenv("CONNECT_SERVER"),
                               key = Sys.getenv("CONNECT_API_KEY"))

Perform Training Split

all_days <- tbl(con, "bike_model_data")
n_days_test <- 2
months_train <- 6

dates <- all_days %>% 
  count(date) %>%
  arrange(desc(date)) %>%
  head(n_days_test + 1) %>%
  pull(date) %>%
  as.Date()

split_date <- dates[n_days_test + 1]
start_train_date <- split_date - dmonths(months_train)

test_dates <- dates[1:n_days_test]
test_dates_str <- paste(test_dates, collapse = " and ")

print(glue::glue(
  "Using data on or before {min(test_dates)} as training, data from {test_dates_str} to test."
))
## Using data on or before 2023-08-03 as training, data from 2023-08-04 and 2023-08-03 to test.
train_dat <- all_days %>% 
  dplyr::filter(
    date <= split_date, 
    date >= start_train_date
  ) %>% 
  dplyr::collect()

recipe <- recipe(n_bikes ~ ., data = train_dat) %>%
  step_dummy(dow) %>%
  prep(train_dat, retain = FALSE)

Make recipe for model:

# downsample if working interactively
if (interactive()) {
  train_dat <- dplyr::sample_frac(train_dat, 0.5)
}

train_mat <- recipe %>%
  bake(train_dat)

Train and Save Model

mod <- parsnip::xgb_train(
  train_mat %>% select(-n_bikes, -id, -date), 
  train_mat %>% pull(n_bikes), 
  nrounds = ifelse(interactive(), 50, 500)
)
## [08:31:17] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.

Brief model evaluation

test_date_start <- min(test_dates)
test_dat <- all_days %>% 
  filter(date >= test_date_start) %>% 
  collect()

preds <- bake(recipe, test_dat) %>%
  select(-n_bikes, -id, -date) %>% 
  as.matrix() %>% 
  predict(mod, .)

results <- test_dat %>% 
  mutate(preds = preds)

oos_metrics(results$n_bikes, results$preds)
## # A tibble: 1 x 4
##    rmse   mae   ccc    r2
##   <dbl> <dbl> <dbl> <dbl>
## 1  3.61  2.66 0.688 0.486

Save model as pin with some metadata

model_details <- list(
     model = mod,
     train_date = today(),
     train_window_start = start_train_date,
     split_date = split_date, 
     recipe = recipe
)

pins::pin(model_details, 
          "bike_model_rxgb", 
          "Model of Capitol Citybikes Available per Station", 
          board = "rsconnect")
DBI::dbDisconnect(con)