library(tidyverse)
library(data.table)
library(xgboost)
library(knitr)
library(broom)
library(caret)
library(ggthemes)
library(DT)
library(glmnet)
<- fread("data.csv")
cancer
:= NULL] cancer[, V33
Shap Calculation R
Packages
Head
A mini example of calculating shap values in R. I used an open source data set from Kaggle. See more about Wiscosin breast cancer data set. I use IML package from R I prefer it since you can calculate from any model. At this time I have tested it using models from caret package but they have examples from mlr package and h20 packages.
head(cancer) %>%
datatable(options = list(scrollX = TRUE))
Scale predictors
<- names(cancer)
nms <- gsub("\\s", "_", nms) %>% tolower() %>% str_trim()
new_nms setnames(cancer, nms, new_nms)
<- cancer$id
pat_id := NULL]
cancer[, id <- names(cancer)[3:31]
predictors <- function(x){
scale01 = (x - min(x))/(max(x) - min(x))
y return(y)
}:= lapply(.SD, function(x) scale01(x)), .SDcols = predictors ]
cancer[, (predictors) := factor(diagnosis, levels = c("M", "B"))]
cancer[, diagnosis
head(cancer) %>%
datatable(options = list(scrollX = TRUE))
Boxplots
- I like this especially when you want to see if there is difference between groups when the indipedent var is numeric and outcome categorical.
<- melt(cancer, id.vars = "diagnosis")
cancerm
ggplot(cancerm, aes(x = diagnosis, y = value))+
geom_boxplot() + facet_wrap(~variable, scales = "free_y")
Split test, train
<- nrow(cancer)
n_row <- (0.7 * n_row) %>% as.integer()
train_size <- sample(1:n_row, train_size)
train_ids <- cancer[train_ids, ]
train_data <- cancer
test_data <- pat_id[-train_ids] test_pat_id
Fit xgboost, random forest,
<- createFolds(train_data$diagnosis, k = 10)
cv_fold <- trainControl(method = "cv",
train_ctrl number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE,
allowParallel=T,
index = cv_fold,
verboseIter = FALSE,
savePredictions = TRUE,
search = "grid")
<- expand.grid(nrounds = c(10, 50, 100),
xgb_grid eta = seq(0.06, .2, length.out = 3),
max_depth = c( 5, 10),
gamma = c(0,.01, 0.1),
colsample_bytree = c(0.6, 0.7,0.8),
min_child_weight = c(0.7, 0.95),
subsample = c(0.5, 0.7, 0.9))
<- seq(from = 5, to = (ncol(cancer) - 1),
trees_ranger length.out = 5 ) %>% as.integer()
<- expand.grid(splitrule = c("extratrees", "gini"),
ranger_grid mtry = trees_ranger,
min.node.size = c(0.85, .95))
<- expand.grid(C = c(0.5, 1, 10),
svm_grid sigma = seq(0.001, 0.1, length.out = 10))
Fit Random forest
library(caretEnsemble)
library(foreach)
library(doParallel) # ignore it's used for parallel
library(tictoc) # ignore it's used for timing
<- makeCluster(3)
cl registerDoParallel(cl)
tic()
set.seed(100)
<- caret::train(
rf ~.,
diagnosisdata= train_data,
trControl=train_ctrl,
metric = "ROC",
method = "ranger",
tuneGrid= ranger_grid)
toc()
registerDoSEQ()
rf
Calculate Shap values
- use the shapper package it uses python shap package so you must have python installed + python shap library
- https://github.com/ModelOriented/shapper
- install from cran
library(shapper)
# X_pred <- train_data[, .SD, .SDcols = !c("diagnosis")] %>%
# as.data.frame() # this because i use data.table
#
# tic()
# p_function <- function(model, data) predict(model, newdata = data, type = "prob")
# shap_values <- individual_variable_effect(rf, data = X_pred, predict_function = p_function,
# new_observation = X_pred, nsamples = 150)
#
#
# toc()
tic()
tic()
<- foreach(i = 1:nrow(X_pred)) %do%{
shap_list <- Shapley$new(predictor, x.interest = X_pred[i, ], sample.size = 150)
shap <-shap$results %>% data.table()
shap_import <- shap_import[class == "M"]
shap_import := pat_id[i]]
shap_import[,id
}toc()
toc()
<- rbindlist(shap_list)
shap_values write.csv(shap_values, file ="shap_values3.csv", row.names = F )
Shap plots
library(ggforce)
<- fread("shap_values3.csv")
shap_values
setnames(shap_values, c("_attribution_", "_vname_"), c("phi","feature" ))
:= abs( phi )]
shap_values[, phi2 <- shap_values[, .(Med = median(phi2),
shap_imp Mean = mean(phi2)), by = feature] %>%
setorder(-Med)
<- shap_imp[1:30, ]
shap_imp
<- shap_values[feature %in%shap_imp$feature]
shap_values
:= factor(feature, levels = rev(shap_imp$feature) )]
shap_values[, feature
ggplot(shap_values, aes(feature, phi, color = abs(phi)))+
geom_sina()+
geom_hline(yintercept = 0) +
scale_color_gradient(name = "",low="#2187E3", high="#F32858",
breaks=c(0,.2), labels=c("Low","High"),
limits = c(0,.2))+
theme_bw() +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(), # remove axis line
legend.position="bottom") +
coord_flip()
Fit models using caret ensembles
## Ignore; just a simpler if you want to compare models perfomance
# cl <- makeCluster(3)
# registerDoParallel(cl)
# tic()
# set.seed(100)
# model_list <- caretList(
# diagnosis~.,
# data= train_data,
# trControl=train_ctrl,
# metric = "ROC",
# tuneList = list(caretModelSpec(method="xgbTree", tuneGrid= xgb_grid),
# caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
# caretModelSpec(method="ranger", tuneGrid= ranger_grid)
#
#
#
# )
# )
#
# toc()
#
# registerDoSEQ()
#
# model_list