library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(bikeHelpR)
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Data source and pin board ----
# Open an ODBC connection through the "Content DB" DSN and take a reference
# to the bike model table; rows are only fetched when collected or pulled.
con <- DBI::dbConnect(drv = odbc::odbc(), dsn = "Content DB")
df <- dplyr::tbl(con, "bike_model_data")

# Register the RStudio Connect pin board. The API key is read from the
# environment so that it never has to be written into this script.
pins::board_register_rsconnect(
  key = Sys.getenv("RSTUDIOCONNECT_API_KEY"),
  server = "https://colorado.rstudio.com/rsc"
)
# Train/test split configuration ----
# n_days_test:  number of most recent distinct dates held out for testing.
# months_train: length, in months, of the training window ending at split_date.
n_days_test <- 2
months_train <- 6

# The n_days_test + 1 most recent distinct dates in the table, newest first.
# The extra (oldest) date becomes the last day of the training window.
dates <- df %>%
  count(date) %>%
  arrange(desc(date)) %>%
  head(n_days_test + 1) %>%
  pull(date) %>%
  as.Date()

# Fail fast: with too few distinct dates split_date would be NA and every
# downstream filter would silently produce an empty training set.
if (length(dates) < n_days_test + 1) {
  stop(
    "Need at least ", n_days_test + 1, " distinct dates to build the ",
    "train/test split, found ", length(dates), ".",
    call. = FALSE
  )
}

# Last training day, and first training day months_train months before it
# (%m-% is lubridate's month subtraction operator).
split_date <- dates[n_days_test + 1]
start_train_date <- split_date %m-% months(months_train)

# Everything newer than split_date is test data. head() avoids the 1:n
# indexing pitfall of dates[1:(length(dates) - 1)].
test_dates <- head(dates, n_days_test)
test_dates_str <- paste(test_dates, collapse = " and ")

# Training data ends at split_date (the same bound used when filtering the
# training set), not at the earliest test date.
print(glue::glue(
  "Using data on or before {split_date} as training, data from {test_dates_str} to test."
))
## Using data on or before 2020-11-26 as training, data from 2020-11-28 and 2020-11-27 to test.
print(glue::glue("Using data between {start_train_date} and {split_date} for training."))
## Using data between 2020-05-26 and 2020-11-26 for training.
# Training data ----
# Restrict the table to the closed window [start_train_date, split_date]
# and bring the matching rows into local memory.
train_dat <- dplyr::collect(
  dplyr::filter(
    df,
    date >= start_train_date,
    date <= split_date
  )
)

# Preprocessing recipe ----
# n_bikes is the outcome and every other column is a predictor; dow is
# expanded into dummy variables. The recipe is trained on train_dat with
# retain = FALSE.
recipe <- prep(
  step_dummy(recipe(n_bikes ~ ., data = train_dat), dow),
  training = train_dat,
  retain = FALSE
)
# Publish the split parameters together with the trained recipe as a single
# pin on the "rsconnect" board.
pins::pin(
  x = list(
    train_date = Sys.Date(),
    split_date = split_date,
    train_start = start_train_date,
    recipe = recipe
  ),
  name = "bike_model_params",
  description = "Parameters for Creating Training Dataset",
  board = "rsconnect"
)
## $train_date
## [1] "2020-11-28"
## 
## $split_date
## [1] "2020-11-26"
## 
## $train_start
## [1] "2020-05-26"
## 
## $recipe
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          7
## 
## Training data contained 2538075 data points and no missing data.
## 
## Operations:
## 
## Dummy variables from dow [trained]