import os
import warnings
from math import sqrt
import numpy as np
import pandas as pd
import pins
import pyodbc
import vetiver
from dotenv import load_dotenv
from pprint import pprint
from rsconnect.api import RSConnectServer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
# import custom functions
from bikeshare.data import clean_data
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Load settings through python-dotenv; override=True is passed so that loaded
# values take precedence over variables already set in the environment.
load_dotenv(override=True)

# Posit Connect address and API key are taken from the environment and wrapped
# in the server object that the deployment step uses.
rsc_server = os.environ.get("CONNECT_SERVER")
rsc_key = os.environ.get("CONNECT_API_KEY")
connect_server = RSConnectServer(url=rsc_server, api_key=rsc_key)
Read in the raw data from the database.
# read table built by R ETL process in the database
# The query keeps only the rows belonging to the 12 most recent distinct dates.
sql = "select * from bike_model_data where date in (select distinct date from bike_model_data order by date desc limit 12);"
connection = pyodbc.connect('DSN=Content DB')
try:
    all_days = pd.read_sql_query(sql, connection)
finally:
    # Close explicitly so the ODBC handle is released even when the query
    # raises; nothing after this point uses the connection.
    connection.close()

# Newest dates first, with a fresh 0..n-1 index after the reordering.
all_days = (
    all_days
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)
all_days
| | id | hour | date | month | dow | n_bikes | lat | lon |
---|---|---|---|---|---|---|---|---|
0 | 453 | 4.0 | 2023-06-09 | 6.0 | Friday | 16.0 | 38.919086 | -77.034502 |
1 | 299 | 0.0 | 2023-06-09 | 6.0 | Friday | 1.0 | 39.110314 | -77.182669 |
2 | 298 | 8.0 | 2023-06-09 | 6.0 | Friday | 13.0 | 39.114688 | -77.171487 |
3 | 298 | 10.0 | 2023-06-09 | 6.0 | Friday | 13.0 | 39.114688 | -77.171487 |
4 | 298 | 12.0 | 2023-06-09 | 6.0 | Friday | 13.0 | 39.114688 | -77.171487 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
39093 | 125 | 8.0 | 2023-05-29 | 5.0 | Monday | 6.0 | 38.897857 | -77.026975 |
39094 | 326 | 16.0 | 2023-05-29 | 5.0 | Monday | 8.0 | 38.964992 | -77.103381 |
39095 | 326 | 18.0 | 2023-05-29 | 5.0 | Monday | 8.0 | 38.964992 | -77.103381 |
39096 | 326 | 20.0 | 2023-05-29 | 5.0 | Monday | 8.0 | 38.964992 | -77.103381 |
39097 | 101 | 0.0 | 2023-05-29 | 5.0 | Monday | 4.0 | 38.894832 | -76.987633 |
39098 rows × 8 columns
# Separate predictors from the target. "id" and "date" are dropped together
# with the target column, leaving hour, month, dow, lat and lon as features.
X = all_days.drop(columns=["n_bikes", "id", "date"])
y = all_days[["n_bikes"]]
# Seed the shuffle so the train/test partition (and therefore the reported
# RMSE) is reproducible from run to run, in line with the seeded regressor.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train
| | hour | month | dow | lat | lon |
---|---|---|---|---|---|
4868 | 12.0 | 6.0 | Thursday | 38.999378 | -77.097882 |
22530 | 10.0 | 6.0 | Friday | 38.903732 | -76.987211 |
27299 | 2.0 | 6.0 | Thursday | 38.907908 | -76.997070 |
9776 | 16.0 | 6.0 | Tuesday | 38.990639 | -77.100239 |
22635 | 12.0 | 6.0 | Friday | 38.902406 | -77.016006 |
... | ... | ... | ... | ... | ... |
10577 | 8.0 | 6.0 | Tuesday | 38.894000 | -76.947974 |
17090 | 2.0 | 6.0 | Sunday | 38.799267 | -77.044700 |
16326 | 22.0 | 6.0 | Sunday | 38.899703 | -77.008911 |
29567 | 8.0 | 5.0 | Wednesday | 38.900427 | -76.988250 |
17236 | 18.0 | 6.0 | Sunday | 38.899032 | -77.033354 |
29323 rows × 5 columns
# display the training-split target (the n_bikes column)
y_train
| | n_bikes |
---|---|
4868 | 2.0 |
22530 | 11.0 |
27299 | 18.0 |
9776 | 1.0 |
22635 | 13.0 |
... | ... |
10577 | 6.0 |
17090 | 14.0 |
16326 | 11.0 |
29567 | 12.0 |
17236 | 12.0 |
29323 rows × 1 columns
# build a random forest model
# clean_data (imported from bikeshare.data) is wrapped as the first pipeline
# step, so it runs on the inputs both when fitting and when predicting.
model = Pipeline(
    steps=[
        ("clean-data", FunctionTransformer(clean_data)),
        ("regressor", RandomForestRegressor(
            n_estimators=100,
            random_state=0,
            n_jobs=-1,
        )),
    ]
)
# y_train is a single-column DataFrame; flatten it to the 1-D shape
# scikit-learn expects for a single-output regressor instead of relying on the
# silenced column-vector conversion warning. The fitted model is the same.
model.fit(X_train, y_train.to_numpy().ravel())
Pipeline(steps=[('clean-data', FunctionTransformer(func=<function clean_data at 0x7f81eafc0820>)), ('regressor', RandomForestRegressor(n_jobs=-1, random_state=0))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('clean-data', FunctionTransformer(func=<function clean_data at 0x7f81eafc0820>)), ('regressor', RandomForestRegressor(n_jobs=-1, random_state=0))])
FunctionTransformer(func=<function clean_data at 0x7f81eafc0820>)
RandomForestRegressor(n_jobs=-1, random_state=0)
# test the random forest model
# Predict on the held-out split, then report the root of the mean squared
# error between those predictions and the true targets.
y_pred = model.predict(X_test)

# compare predictions
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = sqrt(test_mse)
print(f'RMSE: {rmse:f}')
RMSE: 2.800244
Deploy the model with vetiver.
# The pin name is "<user name>/<model name>".
user_name = "sam.edwardes"
pin_name = f"{user_name}/bikeshare-rf-python"

# convert the random forest model into a vetiver model
# The first training row is supplied as the prototype of the model's input.
v = vetiver.VetiverModel(
    model=model,
    model_name=pin_name,
    prototype_data=X_train.iloc[:1],
    description="A model to predict the number of bikes that will be available.",
)

# create a board on Posit Connect
# NOTE(review): the server URL is hard-coded here, whereas the deployment
# target is built from the CONNECT_SERVER environment variable — confirm both
# are meant to point at the same server.
board = pins.board_connect(
    server_url="https://colorado.posit.co/rsc",
    allow_pickle_read=True,
)

# write the vetiver model as pin to Posit Connect
vetiver.pin_read_write.vetiver_pin_write(board=board, model=v)
Model Cards provide a framework for transparent, responsible reporting. Use the vetiver `.qmd` Quarto template as a place to start, with vetiver.model_card() Writing pin: Name: 'sam.edwardes/bikeshare-rf-python' Version: 20230609T235428Z-4429b
# list the versions stored on the board for this pin
board.pin_versions(pin_name)
| | version |
---|---|
0 | 75565 |
1 | 75569 |
2 | 75571 |
3 | 75685 |
4 | 75693 |
5 | 75696 |
6 | 75701 |
7 | 75704 |
8 | 75706 |
9 | 75711 |
10 | 75716 |
11 | 75719 |
# use Vetiver provided Posit Connect deployment function
# to deploy the model as a FastAPI
# The version deployed is the one in the last row of the pin's version table
# (presumably the newest; the listing appears in ascending order).
latest_version = board.pin_versions(pin_name)["version"].values[-1]

vetiver.deploy_rsconnect(
    connect_server=connect_server,
    board=board,
    pin_name=pin_name,
    version=latest_version,
    title="Random Forest model for Bikeshare Python",
    # fixed app_id passed through to the deployment call
    app_id="28923e33-dcb6-4774-b753-bf1d4c367579",
    extra_files=["requirements.txt"],
)
Validating server... [OK] Validating app mode... [OK] Making bundle ... [OK] Deploying bundle ... [OK] Saving deployed information... [OK] Building FastAPI application... Bundle created with Python version 3.10.11 is compatible with environment Kubernetes::ghcr.io/rstudio/content-pro:r4.1.3-py3.10.11-ubuntu2204 with Python version 3.10.11 from /opt/python/3.10.11/bin/python3 Bundle requested Python version 3.10.11; using /opt/python/3.10.11/bin/python3 from Kubernetes::ghcr.io/rstudio/content-pro:r4.1.3-py3.10.11-ubuntu2204 which has version 3.10.11 Determining session server location ... 2023/06/09 23:55:33.254125408 [rsc-session] Content GUID: 28923e33-dcb6-4774-b753-bf1d4c367579 2023/06/09 23:55:33.254459061 [rsc-session] Content ID: 16788 2023/06/09 23:55:33.254474290 [rsc-session] Bundle ID: 75720 2023/06/09 23:55:33.402594800 Running on host: python-environment-restore-bqh55-vwsbf Connecting to session server http://service-eeb2b360-8f12-4bf5-9f69-27ae8e5968fe.rstudio-connect:50734 ... 
2023/06/09 23:55:34.025031166 Linux distribution: Ubuntu 22.04.2 LTS (jammy) 2023/06/09 23:55:34.033730507 Running as user: uid=999 gid=999 groups=999 2023/06/09 23:55:34.033747076 Connect version: 2023.05.0 2023/06/09 23:55:34.033789616 LANG: en_US.UTF-8 2023/06/09 23:55:34.033791212 Working directory: /opt/rstudio-connect/mnt/app 2023/06/09 23:55:34.033799763 Building environment using Python 3.10.11 (main, Jun 4 2023, 22:34:21) [GCC 11.3.0] at /opt/python/3.10.11/bin/python3 2023/06/09 23:55:34.052713842 Using cached environment: MIR5r4bk5IQWKKeAX4NTAw Connected to session server http://service-eeb2b360-8f12-4bf5-9f69-27ae8e5968fe.rstudio-connect:50734 2023/06/09 23:55:36.061413457 Packages in the environment: aiofiles==23.1.0, anyio==3.7.0, appdirs==1.4.4, asttokens==2.2.1, backcall==0.2.0, bikeshare @ https://github.com/sol-eng/bike_predict_python/releases/download/v0.0.1/bikeshare-0.1.0-py3-none-any.whl#sha256=1f7a53fbd366ee9f27071d745f28fe24385549833d9b8eb24e59a9d6f727f9ba, build==0.10.0, certifi==2023.5.7, charset-normalizer==3.1.0, click==8.1.3, comm==0.1.3, debugpy==1.6.7, decorator==5.1.1, exceptiongroup==1.1.1, executing==1.2.0, fastapi==0.96.0, fsspec==2023.6.0, h11==0.14.0, httpcore==0.17.2, httpx==0.24.1, humanize==4.6.0, idna==3.4, importlib-metadata==6.6.0, importlib-resources==5.12.0, ipykernel==6.23.1, ipython==8.14.0, jedi==0.18.2, Jinja2==3.1.2, joblib==1.2.0, jupyter_client==8.2.0, jupyter_core==5.3.0, MarkupSafe==2.1.3, matplotlib-inline==0.1.6, nest-asyncio==1.5.6, numpy==1.24.3, packaging==23.1, pandas==2.0.2, parso==0.8.3, pexpect==4.8.0, pickleshare==0.7.5, pins==0.8.1, pip-tools==6.13.0, platformdirs==3.5.1, plotly==5.15.0, prompt-toolkit==3.0.38, psutil==5.9.5, ptyprocess==0.7.0, pure-eval==0.2.2, pydantic==1.10.9, Pygments==2.15.1, PyJWT==2.7.0, pyodbc==4.0.39, pyproject_hooks==1.0.0, python-dateutil==2.8.2, python-dotenv==1.0.0, pytz==2023.3, PyYAML==6.0, pyzmq==25.1.0, requests==2.31.0, rsconnect-python==1.17.1, scikit-learn==1.2.2, 
scipy==1.10.1, semver==2.13.0, six==1.16.0, sniffio==1.3.0, stack-data==0.6.2, starlette==0.27.0, tenacity==8.2.2, threadpoolctl==3.1.0, tomli==2.0.1, tornado==6.3.2, traitlets==5.9.0, typing_extensions==4.6.3, tzdata==2023.3, urllib3==2.0.3, uvicorn==0.22.0, vetiver==0.2.1, wcwidth==0.2.6, websockets==11.0.3, xxhash==3.2.0, zipp==3.15.0, 2023/06/09 23:55:36.062779103 Creating lockfile: python/requirements.txt.lock Stopped session pings to http://service-eeb2b360-8f12-4bf5-9f69-27ae8e5968fe.rstudio-connect:50734 Launching FastAPI application... Deployment completed successfully. Dashboard content URL: https://colorado.posit.co/rsc/connect/#/apps/28923e33-dcb6-4774-b753-bf1d4c367579 Direct content URL: https://colorado.posit.co/rsc/bike-predict-python-api/