Shap Calculation R

Author

Mburu

Published

March 5, 2021

Packages

library(tidyverse)
library(data.table)
library(xgboost)
library(knitr)
library(broom)
library(caret)
library(ggthemes)
library(DT)
library(glmnet)

cancer <- fread("data.csv")

cancer[, V33 := NULL]

Head

A mini example of calculating shap values in R. I used an open source data set from Kaggle. See more about Wiscosin breast cancer data set. I use IML package from R I prefer it since you can calculate from any model. At this time I have tested it using models from caret package but they have examples from mlr package and h20 packages.

head(cancer)  %>%
  datatable(options = list(scrollX = TRUE))

Scale predictors

nms <- names(cancer)
new_nms <- gsub("\\s", "_", nms) %>% tolower() %>% str_trim()
setnames(cancer, nms, new_nms)
pat_id <- cancer$id
cancer[, id := NULL]
predictors <- names(cancer)[3:31]
scale01 <- function(x){
  y = (x - min(x))/(max(x) - min(x))
  return(y)
}
cancer[, (predictors) := lapply(.SD, function(x) scale01(x)), .SDcols = predictors ]
cancer[, diagnosis := factor(diagnosis, levels = c("M", "B"))]

head(cancer)  %>%
  datatable(options = list(scrollX = TRUE))

Boxplots

I like this especially when you want to see if there is difference between groups when the indipedent var is numeric and outcome categorical.

cancerm <- melt(cancer, id.vars = "diagnosis")

ggplot(cancerm, aes(x = diagnosis, y = value))+
    geom_boxplot() + facet_wrap(~variable, scales = "free_y")

Split test, train

n_row <- nrow(cancer)
train_size <- (0.7 * n_row) %>% as.integer()
train_ids <- sample(1:n_row, train_size)
train_data <- cancer[train_ids, ]
test_data <- cancer
test_pat_id <- pat_id[-train_ids]

Fit xgboost, random forest,

cv_fold <- createFolds(train_data$diagnosis, k = 10)
train_ctrl <- trainControl(method = "cv",
                        number = 10,
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE,
                        allowParallel=T,
                        index = cv_fold,
                        verboseIter = FALSE,
                        savePredictions = TRUE,
                        search = "grid")

xgb_grid <- expand.grid(nrounds = c(10, 50, 100),
                        eta = seq(0.06, .2, length.out = 3),
                        max_depth = c( 5, 10),
                        gamma = c(0,.01, 0.1),
                        colsample_bytree = c(0.6, 0.7,0.8),
                        min_child_weight = c(0.7, 0.95),
                        subsample =  c(0.5, 0.7, 0.9))

trees_ranger <-  seq(from = 5, to = (ncol(cancer) - 1),
                     length.out = 5 ) %>% as.integer()

ranger_grid <- expand.grid(splitrule = c("extratrees", "gini"),
                        mtry = trees_ranger,
                        min.node.size = c(0.85, .95))

svm_grid <- expand.grid(C = c(0.5, 1, 10),
                        sigma = seq(0.001, 0.1, length.out = 10))

Fit Random forest

library(caretEnsemble)
library(foreach)
library(doParallel) # ignore  it's used for parallel
library(tictoc) # ignore it's used for timing
cl <- makeCluster(3)
registerDoParallel(cl)
tic()
set.seed(100)


rf  <- caret::train(
  diagnosis~.,
  data= train_data,
  trControl=train_ctrl,
  metric = "ROC",
  method = "ranger",
  tuneGrid= ranger_grid)

toc()

registerDoSEQ()

rf

Calculate Shap values

use the shapper package it uses python shap package so you must have python installed + python shap library
https://github.com/ModelOriented/shapper
install from cran

library(shapper)
# X_pred <- train_data[, .SD, .SDcols = !c("diagnosis")] %>%
#   as.data.frame() # this because i use data.table
# 
# tic()
# p_function <- function(model, data) predict(model, newdata = data, type = "prob")
# shap_values <- individual_variable_effect(rf, data = X_pred, predict_function = p_function,
#                                      new_observation = X_pred, nsamples = 150)
# 
# 
# toc()
tic()

tic()

shap_list <- foreach(i = 1:nrow(X_pred)) %do%{
    shap <- Shapley$new(predictor,  x.interest = X_pred[i, ], sample.size = 150)
    shap_import <-shap$results %>% data.table()
    shap_import <- shap_import[class == "M"]
    shap_import[,id := pat_id[i]]
    
}
toc()


toc()

shap_values <- rbindlist(shap_list)
write.csv(shap_values, file ="shap_values3.csv", row.names = F )

Shap plots

library(ggforce)
shap_values <-  fread("shap_values3.csv")

setnames(shap_values, c("_attribution_", "_vname_"), c("phi","feature" ))
shap_values[, phi2 := abs( phi )]
shap_imp <- shap_values[, .(Med = median(phi2),
                            Mean = mean(phi2)), by = feature] %>%
    setorder(-Med)
shap_imp <- shap_imp[1:30, ]

shap_values <- shap_values[feature %in%shap_imp$feature]

shap_values[, feature := factor(feature, levels = rev(shap_imp$feature) )]

ggplot(shap_values, aes(feature, phi,  color = abs(phi)))+
  geom_sina()+
  geom_hline(yintercept = 0) +
  scale_color_gradient(name = "",low="#2187E3", high="#F32858", 
                       breaks=c(0,.2), labels=c("Low","High"),
                       limits = c(0,.2))+ 
  theme_bw() + 
    theme(axis.line.y = element_blank(), 
          axis.ticks.y = element_blank(), # remove axis line
          legend.position="bottom") +
  coord_flip()

Fit models using caret ensembles

## Ignore; just a simpler if you want to compare models perfomance

# cl <- makeCluster(3)
# registerDoParallel(cl)
# tic()
# set.seed(100)
# model_list <- caretList(
#    diagnosis~.,
#     data= train_data,
#     trControl=train_ctrl,
#     metric = "ROC",
#     tuneList = list(caretModelSpec(method="xgbTree",  tuneGrid= xgb_grid),
#                     caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
#                     caretModelSpec(method="ranger", tuneGrid= ranger_grid)
# 
#                    
#                     
#                     )
# )
# 
# toc()
# 
# registerDoSEQ()
# 
# model_list