Shap Calculation R

A mini example of calculating SHAP values in R. I used an open-source data set from Kaggle: the Wisconsin breast cancer data set. I use the iml package from R; I prefer it because you can calculate SHAP values for any model. So far I have tested it with models from the caret package, but the iml documentation also has examples using the mlr and h2o packages.

Packages

library(tidyverse)
library(data.table)
library(xgboost)
library(knitr)
library(broom)
library(caret)
library(ggthemes)
library(DT)
library(glmnet)

Head

cancer <- fread("data.csv")
cancer[, V33 := NULL]   # drop the unused V33 column

head(cancer) %>%
  datatable(options = list(scrollX = TRUE))

Scale predictors
nms <- names(cancer)
new_nms <- gsub("\\s", "_", nms) %>% tolower() %>% str_trim()
setnames(cancer, nms, new_nms)
pat_id <- cancer$id
cancer[, id := NULL]
predictors <- setdiff(names(cancer), "diagnosis")   # every column except the outcome
scale01 <- function(x){
y = (x - min(x))/(max(x) - min(x))
return(y)
}
cancer[, (predictors) := lapply(.SD, function(x) scale01(x)), .SDcols = predictors ]
cancer[, diagnosis := factor(diagnosis, levels = c("M", "B"))]
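A quick illustration of what scale01 does: the minimum of a vector maps to 0, the maximum to 1, and everything in between is linearly interpolated.

scale01(c(2, 4, 6, 10))
# [1] 0.00 0.25 0.50 1.00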
head(cancer) %>%
  datatable(options = list(scrollX = TRUE))

Boxplots
- I like this plot, especially when you want to see whether there is a difference between groups when the independent variable is numeric and the outcome is categorical.
cancerm <- melt(cancer, id.vars = "diagnosis")
ggplot(cancerm, aes(x = diagnosis, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free_y")

Split test, train
n_row <- nrow(cancer)
train_size <- (0.7 * n_row) %>% as.integer()
train_ids <- sample(1:n_row, train_size)
train_data <- cancer[train_ids, ]
test_data <- cancer[-train_ids, ]   # hold out the rows not sampled for training
test_pat_id <- pat_id[-train_ids]
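The split above is purely random, so the malignant/benign ratio can drift between train and test. If you want a stratified split instead, caret's createDataPartition preserves the class balance; a minimal sketch (strat_ids, train_strat, and test_strat are illustrative names, not used later):

set.seed(100)
strat_ids   <- createDataPartition(cancer$diagnosis, p = 0.7, list = FALSE)[, 1]
train_strat <- cancer[strat_ids, ]
test_strat  <- cancer[-strat_ids, ]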
Fit xgboost, random forest, svm

cv_fold <- createFolds(train_data$diagnosis, k = 10, returnTrain = TRUE)  # trainControl's index expects the training rows of each fold
train_ctrl <- trainControl(method = "cv",
                           number = 10,
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           allowParallel = TRUE,
                           index = cv_fold,
                           verboseIter = FALSE,
                           savePredictions = TRUE,
                           search = "grid")
xgb_grid <- expand.grid(nrounds = c(10, 50, 100),
eta = seq(0.06, .2, length.out = 3),
max_depth = c( 5, 10),
gamma = c(0,.01, 0.1),
colsample_bytree = c(0.6, 0.7,0.8),
min_child_weight = c(0.7, 0.95),
subsample = c(0.5, 0.7, 0.9))
trees_ranger <- seq(from = 5, to = (ncol(cancer) - 1),
                    length.out = 5) %>% as.integer()   # candidate mtry values (predictors sampled per split)
ranger_grid <- expand.grid(splitrule = c("extratrees", "gini"),
mtry = trees_ranger,
min.node.size = c(0.85, .95))
svm_grid <- expand.grid(C = c(0.5, 1, 10),
                        sigma = seq(0.001, 0.1, length.out = 10))
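Each grid row is one hyperparameter combination evaluated on every CV fold, so it is worth checking the grid sizes before launching anything; the xgboost grid in particular is large:

nrow(xgb_grid)     # 3 * 3 * 2 * 3 * 3 * 2 * 3 = 972 combinations
nrow(ranger_grid)  # 2 * 5 * 2 = 20
nrow(svm_grid)     # 3 * 10 = 30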
Fit Random forest

library(caretEnsemble)
library(foreach)
library(doParallel)   # used for parallel processing
library(tictoc)       # used for timing
cl <- makeCluster(3)
registerDoParallel(cl)
tic()
set.seed(100)
rf <- caret::train(
diagnosis~.,
data= train_data,
trControl=train_ctrl,
metric = "ROC",
method = "ranger",
tuneGrid= ranger_grid)
toc()
registerDoSEQ()
rf
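Printing rf shows the cross-validated tuning results; a quick check on the hold-out split is also cheap at this point (a sketch, separate from the tuning above):

test_pred <- predict(rf, newdata = test_data)
confusionMatrix(test_pred, test_data$diagnosis, positive = "M")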
Calculate Shap values

- One option is the shapper package; it wraps the Python shap package, so you need Python installed plus the Python shap library (see the setup sketch below).
- https://github.com/ModelOriented/shapper
- Install it from CRAN.
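If you go the shapper route, the setup is roughly as follows (install_shap() is shapper's helper, per its README, for pulling in the Python dependency):

# install.packages("shapper")   # from CRAN
# shapper::install_shap()       # installs the Python shap library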
library(shapper)
# X_pred <- train_data[, .SD, .SDcols = !c("diagnosis")] %>%
#   as.data.frame()   # converted to a data.frame because I keep the data as a data.table
#
# tic()
# p_function <- function(model, data) predict(model, newdata = data, type = "prob")
# shap_values <- individual_variable_effect(rf, data = X_pred, predict_function = p_function,
# new_observation = X_pred, nsamples = 150)
#
#
# toc()
Below I use the iml route instead: wrap the fitted model in a Predictor and loop Shapley over the rows.

library(iml)   # provides Predictor and Shapley

# iml wants a plain data.frame of predictors
X_pred <- as.data.frame(train_data[, .SD, .SDcols = !c("diagnosis")])

# wrap the fitted caret/ranger model so iml can query class probabilities
predictor <- Predictor$new(rf, data = X_pred, y = train_data$diagnosis, type = "prob")

tic()
shap_list <- foreach(i = 1:nrow(X_pred)) %do% {
  shap <- Shapley$new(predictor, x.interest = X_pred[i, ], sample.size = 150)
  shap_import <- shap$results %>% data.table()
  shap_import <- shap_import[class == "M"]        # keep contributions towards the malignant class
  shap_import[, id := pat_id[train_ids][i]]       # patient id of this training row
}
toc()
shap_values <- rbindlist(shap_list)
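A useful property to sanity-check: for one patient, the phi values approximately reconstruct that patient's predicted probability of "M" minus the average prediction. The approximation is Monte Carlo, so expect small deviations at sample.size = 150. A sketch:

one_id <- shap_values$id[1]
shap_values[id == one_id, .(feature, phi)]   # per-feature contributions for one patient
shap_values[id == one_id, sum(phi)]          # roughly prediction minus the average prediction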
write.csv(shap_values, file = "shap_values3.csv", row.names = FALSE)

Shap plots
library(ggforce)
shap_values <- fread("shap_values3.csv")
# shapper's output uses "_attribution_" / "_vname_"; iml's Shapley already returns phi / feature
if (all(c("_attribution_", "_vname_") %in% names(shap_values))) {
  setnames(shap_values, c("_attribution_", "_vname_"), c("phi", "feature"))
}
shap_values[, phi2 := abs( phi )]
shap_imp <- shap_values[, .(Med = median(phi2),
Mean = mean(phi2)), by = feature] %>%
setorder(-Med)
shap_imp <- shap_imp[1:30, ]
shap_values <- shap_values[feature %in%shap_imp$feature]
shap_values[, feature := factor(feature, levels = rev(shap_imp$feature) )]
ggplot(shap_values, aes(feature, phi, color = abs(phi)))+
geom_sina()+
geom_hline(yintercept = 0) +
scale_color_gradient(name = "",low="#2187E3", high="#F32858",
breaks=c(0,.2), labels=c("Low","High"),
limits = c(0,.2))+
theme_bw() +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(), # remove axis line
legend.position="bottom") +
  coord_flip()
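Since shap_imp already holds the mean and median |phi| per feature, a plain bar chart of global importance makes a simple companion to the sina plot above; a sketch:

ggplot(shap_imp, aes(x = reorder(feature, Mean), y = Mean)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Mean |SHAP value| (class M)") +
  theme_bw()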
Fit models using caret ensembles

## Ignore; just a simpler way to compare model performance
# cl <- makeCluster(3)
# registerDoParallel(cl)
# tic()
# set.seed(100)
# model_list <- caretList(
# diagnosis~.,
# data= train_data,
# trControl=train_ctrl,
# metric = "ROC",
# tuneList = list(caretModelSpec(method="xgbTree", tuneGrid= xgb_grid),
# caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
# caretModelSpec(method="ranger", tuneGrid= ranger_grid)
#
#
#
# )
# )
#
# toc()
#
# registerDoSEQ()
#
# model_list
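If you do run the block above, caret's resamples() gives a quick side-by-side of the cross-validated ROC for the three models (kept commented, like the rest of this section, since it assumes model_list exists):

# results <- resamples(model_list)
# summary(results)
# dotplot(results, metric = "ROC")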