pkgs <- c("keras", "lime", "rsample", "recipes", "yardstick", "corrr")
install.packages(pkgs)
Parsed with column specification:
cols(
.default = col_character(),
SeniorCitizen = [32mcol_double()[39m,
tenure = [32mcol_double()[39m,
MonthlyCharges = [32mcol_double()[39m,
TotalCharges = [32mcol_double()[39m
)
See spec(...) for full column specifications.
glimpse(datos_perdimiento)
https://tidymodels.github.io/rsample/
library(rsample)
set.seed(100)
separa_datos <- initial_split(
datos_perdimiento,
prop = 0.3)
tbl_entrenar <- training(separa_datos)
tbl_prueba <- testing(separa_datos)
https://tidymodels.github.io/recipes/
library(recipes)
receta <- tbl_entrenar %>%
recipe(Churn ~ .) %>%
step_rm(customerID) %>%
step_naomit(all_outcomes(), all_predictors()) %>%
step_discretize(tenure, options = list(cuts = 6)) %>%
step_log(TotalCharges) %>%
step_mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_center(all_predictors(), -all_outcomes()) %>%
step_scale(all_predictors(), -all_outcomes()) %>%
prep()
summary(receta)
save(receta, file = "../aplicacion/receta.RData")
x_tbl_entrenar <- receta %>%
juice(all_predictors(), composition = "matrix")
y_vec_entrenar <- receta %>%
juice(all_outcomes()) %>%
pull()
baked_test <- bake(receta, tbl_prueba)
x_tbl_prueba <- baked_test %>%
select(-Churn) %>%
as.matrix()
y_vec_prueba <- baked_test %>%
select(Churn) %>%
pull()
https://tensorflow.rstudio.com/tensorflow/articles/installation.html
https://tensorflow.rstudio.com/keras/#installation
library(tensorflow)
library(keras)
#install_tensorflow()
#install_keras()
model_keras <- keras_model_sequential() %>%
layer_dense(
units = 16,
kernel_initializer = "uniform",
activation = "relu",
input_shape = ncol(x_tbl_entrenar)) %>%
layer_dropout(rate = 0.1) %>%
layer_dense(
units = 16,
kernel_initializer = "uniform",
activation = "relu") %>%
layer_dropout(rate = 0.1) %>%
layer_dense(
units = 1,
kernel_initializer = "uniform",
activation = "sigmoid") %>%
compile(
optimizer = 'adam',
loss = 'binary_crossentropy',
metrics = c('accuracy')
)
model_keras
Model
___________________________________________________________________________________________________________
Layer (type) Output Shape Param #
===========================================================================================================
dense_1 (Dense) (None, 16) 576
___________________________________________________________________________________________________________
dropout_1 (Dropout) (None, 16) 0
___________________________________________________________________________________________________________
dense_2 (Dense) (None, 16) 272
___________________________________________________________________________________________________________
dropout_2 (Dropout) (None, 16) 0
___________________________________________________________________________________________________________
dense_3 (Dense) (None, 1) 17
===========================================================================================================
Total params: 865
Trainable params: 865
Non-trainable params: 0
___________________________________________________________________________________________________________
history <- fit(
object = model_keras,
x = x_tbl_entrenar,
y = y_vec_entrenar,
batch_size = 50,
epochs = 35,
validation_split = 0.30,
verbose = 0
)
2019-03-17 11:19:29.569656: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
print(history)
Trained on 1,478 samples, validated on 634 samples (batch_size=50, epochs=35)
Final epoch (plot to see history):
val_loss: 0.4467
val_acc: 0.7855
loss: 0.4049
acc: 0.8031
theme_set(theme_bw())
plot(history)
yhat_keras_class_vec <- model_keras %>%
predict_classes(x_tbl_prueba) %>%
as.factor() %>%
fct_recode(yes = "1", no = "0")
yhat_keras_prob_vec <- model_keras %>%
predict_proba(x_tbl_prueba) %>%
as.vector()
test_truth <- y_vec_prueba %>%
as.factor() %>%
fct_recode(yes = "1", no = "0")
estimates_keras_tbl <- tibble(
truth = test_truth,
estimate = yhat_keras_class_vec,
class_prob = yhat_keras_prob_vec
)
estimates_keras_tbl
https://tidymodels.github.io/yardstick/
yardstick
is a package to estimate how well models are working using tidy data principals.
library(yardstick)
options(yardstick.event_first = FALSE)
estimates_keras_tbl %>%
conf_mat(truth, estimate)
Truth
Prediction no yes
no 3229 569
yes 396 726
estimates_keras_tbl %>%
metrics(truth, estimate)
estimates_keras_tbl %>%
roc_auc(truth, class_prob)
estimates_keras_tbl %>%
precision(truth, estimate) %>%
bind_rows(
estimates_keras_tbl %>%
recall(truth, estimate)
)
estimates_keras_tbl %>%
f_meas(truth, estimate, beta = 1)
https://github.com/thomasp85/lime
library(lime)
model_type.keras.engine.sequential.Sequential <- function(x, ...) {
"classification"
}
predict_model.keras.engine.sequential.Sequential <- function(x, newdata, type, ...) {
pred <- predict_proba(object = x, x = as.matrix(newdata))
data.frame(Yes = pred, No = 1 - pred)
}
model_keras %>%
predict_model(x_tbl_prueba, "raw") %>%
as_tibble()
library(lime)
explainer <- x_tbl_entrenar %>%
as_tibble() %>%
lime(model_keras,
bin_continuous = FALSE)
explanation <- x_tbl_entrenar %>%
as.data.frame() %>%
head(40) %>%
lime::explain(
explainer = explainer,
n_labels = 1,
n_features = 4,
kernel_width = 0.5
)
plot_explanations(explanation) +
labs(
title = "Importancia de cada variable",
subtitle = "Usando 40 observaciones de prueba"
)
https://github.com/drsimonj/corrr
library(corrr)
corrr_analysis <- x_tbl_entrenar %>%
as_tibble() %>%
mutate(Churn = y_vec_entrenar) %>%
correlate() %>%
focus(Churn) %>%
rename(feature = rowname) %>%
arrange(abs(Churn)) %>%
mutate(feature = as_factor(feature))
Correlation method: 'pearson'
Missing treated using: 'pairwise.complete.obs'
corrr_analysis
over <- corrr_analysis %>%
filter(Churn > 0)
under <- corrr_analysis %>%
filter(Churn < 0)
corrr_analysis %>%
ggplot(aes(x = Churn, y = fct_reorder(feature, desc(Churn)))) +
geom_point() +
geom_segment(aes(xend = 0, yend = feature), data = under, color = "orange") +
geom_point(data = under, color = "orange") +
geom_segment(aes(xend = 0, yend = feature), data = over, color = "blue") +
geom_point(data = over, color = "blue") +
labs(title = "Corelaciones de perdida de clientes", y = "", x = "")
NA
datos_perdimiento %>%
group_by(Contract, Churn) %>%
tally() %>%
spread(Churn, n)
datos_perdimiento %>%
group_by(InternetService, Churn) %>%
tally() %>%
spread(Churn, n)
export_savedmodel(model_keras, "tfmodel")
library(rsconnect)
deployTFModel(
"tfmodel",
server = "colorado.rstudio.com",
account = rstudioapi::askForPassword("Enter Connect Username:")
)
library(httr)
baked_numeric <- x_tbl_prueba %>%
as_tibble() %>%
head(4) %>%
transpose() %>%
map(as.numeric)
body <- list(instances = list(baked_numeric))
r <- POST("https://colorado.rstudio.com/rsc/content/2230/serving_default/predict", body = body, encode = "json")
jsonlite::fromJSON(content(r))$predictions[, , 1]
[1] 0.5589350 0.2727856 0.5589350 0.5589350