Model Step 1 - Train and Deploy Model

Published

October 1, 2022

This notebook trains a model to predict the number of bikes at a given bike docking station. The model is trained using the bike_model_data table from Content DB. The trained model is then:

Get data

Connect to the database:

con <- DBI::dbConnect(odbc::odbc(), "Content DB")

Split the data into a train/test split:

all_days <- tbl(con, "bike_model_data")

# Get a vector that contains all of the dates.
dates <- all_days %>%
  distinct(date) %>%
  collect() %>%
  arrange(desc(date)) %>%
  pull(date) %>%
  as.Date()

# Split the data into test and train.
n_days_test <- 2
n_days_to_train <- 10

train_end_date <- dates[n_days_test + 1]
train_start_date <- train_end_date - n_days_to_train

# Training data split.
train_data <- all_days %>%
  filter(
    date >= train_start_date,
    date <= train_end_date
  ) %>%
  distinct() %>%
  collect()

print(glue::glue(
  "The model will be trained on data from {start} to {end} ",
  "({num_obs} observations). ",
  start = min(train_data$date),
  end = max(train_data$date),
  num_obs = scales::comma(nrow(train_data)),
))
## The model will be trained on data from 2022-09-19 to 2022-09-29 (91,296 observations).

# Test data split.
test_data <- all_days %>%
  filter(date > train_end_date) %>%
  distinct() %>%
  collect()

print(glue::glue(
  "The model will be tested on data from {start} to {end} ",
  "({num_obs} observations). ",
  start = min(test_data$date),
  end = max(test_data$date),
  num_obs = scales::comma(nrow(test_data)),
))
## The model will be tested on data from 2022-09-30 to 2022-10-01 (11,866 observations).

Train the model

Data preprocessing

Define a recipe to clean the data.

# Define a recipe to clean the data.
recipe_spec <- 
  recipe(n_bikes ~ ., data = train_data) %>% 
  step_dummy(dow) %>%
  step_integer(id, date)

# Preview the cleaned training data.
recipe_spec %>% 
  prep(train_data) %>% 
  bake(head(train_data)) %>%
  glimpse()
## Rows: 6
## Columns: 13
## $ id            <dbl> 1, 1, 1, 1, 1, 1
## $ hour          <dbl> 0, 0, 0, 0, 0, 0
## $ date          <dbl> 1, 2, 3, 4, 5, 6
## $ month         <dbl> 9, 9, 9, 9, 9, 9
## $ lat           <dbl> 38.85897, 38.85897, 38.85897, 38.85897, 38.85897, 38.858…
## $ lon           <dbl> -77.05323, -77.05323, -77.05323, -77.05323, -77.05323, -…
## $ n_bikes       <dbl> 8, 7, 4, 4, 2, 0
## $ dow_Monday    <dbl> 1, 0, 0, 0, 0, 0
## $ dow_Saturday  <dbl> 0, 0, 0, 0, 0, 1
## $ dow_Sunday    <dbl> 0, 0, 0, 0, 0, 0
## $ dow_Thursday  <dbl> 0, 0, 0, 1, 0, 0
## $ dow_Tuesday   <dbl> 0, 1, 0, 0, 0, 0
## $ dow_Wednesday <dbl> 0, 0, 1, 0, 0, 0

Fit model

Fit a random forest model:

model_spec <- 
  rand_forest() %>%
  set_mode("regression") %>%
  set_engine("ranger")

model_workflow <- 
  workflow() %>%
  add_recipe(recipe_spec) %>%
  add_model(model_spec)

model_fit <- fit(model_workflow, data = train_data)
model_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_dummy()
## • step_integer()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      91296 
## Number of independent variables:  12 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       10.70339 
## R squared (OOB):                  0.5698731

Model evaluation

predictions <- predict(model_fit, test_data)

results <- test_data %>%
  mutate(preds = predictions$.pred)

oos_metrics(results$n_bikes, results$preds)
## # A tibble: 1 × 4
##    rmse   mae   ccc    r2
##   <dbl> <dbl> <dbl> <dbl>
## 1  4.13  3.24 0.403 0.283

Model deployment

vetiver

Create a vetiver model object.

model_name <- "bike_predict_model_r"
pin_name <- glue("sam.edwardes/{model_name}")

# Get the train and test data ranges. This will be passed into the pin metadata
# so that other scripts can access this information.
date_metadata <- list(
  train_dates = c(
    as.character(min(train_data$date)), 
    as.character(max(train_data$date))
  ),
  test_dates = c(
    as.character(min(test_data$date)), 
    as.character(max(test_data$date))
  )
)

print(date_metadata)
## $train_dates
## [1] "2022-09-19" "2022-09-29"
## 
## $test_dates
## [1] "2022-09-30" "2022-10-01"

# Create the vetiver model.
v <- vetiver_model(
  model_fit, 
  model_name,
  versioned = TRUE,
  save_ptype = train_data %>%
    head(1) %>%
    select(-n_bikes),
  metadata = date_metadata
)

v
## 
## ── bike_predict_model_r ─ <butchered_workflow> model for deployment 
## A ranger regression modeling workflow using 7 features

pins

Save the model as a pin to RStudio Connect:

# Use RStudio Connect as a board.
board <- pins::board_rsconnect(
  server = Sys.getenv("CONNECT_SERVER"),
  key = Sys.getenv("CONNECT_API_KEY"),
  versioned = TRUE
)

# Write the model to the board.
board %>%
 vetiver_pin_write(vetiver_model = v)

plumber

Convert the model into a plumber API. The function vetiver_write_plumber will generate the plumber code for you and write it to plumber.R.

# Write the model to `api/plumber.R`.
dir.create("api")
vetiver_write_plumber(board, pin_name, file = "api/plumber.R")

# Write a manifest.json file for the api
rsconnect::writeManifest("api")

Here is what the plumber file looks like:

# Generated by the vetiver package; edit with care

library(pins)
library(plumber)
library(rapidoc)
library(vetiver)

# Packages needed to generate model predictions
if (FALSE) {
    library(parsnip)
    library(ranger)
    library(recipes)
    library(workflows)
}
b <- board_rsconnect("envvar", server = "https://colorado.rstudio.com/rsc")
v <- vetiver_pin_read(b, "sam.edwardes/bike_predict_model_r", version = "63032")

#* @plumber
function(pr) {
    pr %>% vetiver_api(v)
}

RStudio Connect

Then, deploy the plumber API to RStudio Connect.

app_name <- "bike-predict-r-api"
app_title <- "Bike Predict - Model - API"

# Establish a connection to RStudio connect.
client <- connectapi::connect(
  server = Sys.getenv("CONNECT_SERVER"),
  api_key = Sys.getenv("CONNECT_API_KEY")
)

# Deploy the content.
content <- 
  client %>%
  connectapi::deploy(
    connectapi::bundle_dir("api"),
    name = app_name,
    title = app_title
  )
DBI::dbDisconnect(con)