library(tidyverse)
library(tidymodels)
library(data.table)
library(gtsummary)
library(mTools) # helper package used below for data_table() and DT_tidy_model()
Diabetes
Data
diabetes_df_all <- fread("data/diabetes_prediction_dataset.csv")
Data processing
diabetes_df_all[, diabetes_char := factor(diabetes,
                                          levels = c(0, 1),
                                          labels = c("Non diabetic", "Diabetic"))]
# Down-sample the negative class so that the positive class makes up
# pos_class_perc of the returned data (40% by default)
re_balance_class <- function(df, outcome_col = "diabetes_char",
                             pos_class = "Diabetic", pos_class_perc = 0.4) {
  pos_class_df <- df[get(outcome_col) == pos_class]
  neg_class_all <- df[get(outcome_col) != pos_class]
  # Total size implied by keeping every positive row at the target share
  N <- round(nrow(pos_class_df) / pos_class_perc)
  Nneg <- N - nrow(pos_class_df)
  neg_class_df <- neg_class_all[sample(.N, Nneg)]
  rbind(pos_class_df, neg_class_df)
}
diabetes_df = re_balance_class(df = diabetes_df_all)
Summary Stats
library(ggiraph)
db_perc <- diabetes_df[, .(freq = .N), by = diabetes_char][
  , perc := round(freq / sum(freq) * 100, 1)]
ggplot(db_perc, aes(diabetes_char, freq, fill = diabetes_char)) +
  geom_bar_interactive(width = 0.5, stat = "identity") +
  geom_text(aes(label = paste0(freq, " (", perc, "%)")),
            position = position_dodge(width = 0.5),
            vjust = 0.05) +
  scale_fill_brewer(name = "", type = "qual", palette = "Dark2") +
  theme_minimal() +
  theme(legend.position = "bottom")
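As written, the interactive geom has nothing mapped to hover over; ggiraph only adds interactivity once an interactive aesthetic such as tooltip is set and the plot is rendered through girafe(). A minimal sketch of that variant (p_db is just an illustrative name):

p_db <- ggplot(db_perc, aes(diabetes_char, freq, fill = diabetes_char)) +
  # tooltip is ggiraph's standard interactive aesthetic
  geom_bar_interactive(aes(tooltip = paste0(freq, " (", perc, "%)")),
                       width = 0.5, stat = "identity") +
  scale_fill_brewer(name = "", type = "qual", palette = "Dark2") +
  theme_minimal() +
  theme(legend.position = "bottom")

girafe(ggobj = p_db)  # renders the htmlwidget with hover tooltips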

tab2 <- diabetes_df %>%
  tbl_summary(
    by = diabetes_char,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c(
      "{mean} ({sd})",
      "{median} ({p25}, {p75})",
      "[{min}, {max}]"
    ),
    missing = "ifany"
  ) %>%
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 2))
tab_df <- as.data.frame(tab2)
# Strip the markdown bold markers gtsummary puts in the column headers
names(tab_df) <- gsub("\\*", "", names(tab_df))
data_table(tab_df)
Model Fitting
set.seed(100)
diabetes_df[, diabetes := as.factor(diabetes)]
# Drop the duplicate character label so it can't leak the outcome to the models
diabetes_df_split <- initial_split(diabetes_df[, .SD, .SDcols = !"diabetes_char"],
                                   strata = diabetes)
diabetes_df_train <- training(diabetes_df_split)
diabetes_df_test <- testing(diabetes_df_split)
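A quick sanity check that the stratified split kept the class mix roughly equal in both partitions (plain base R, not part of the original analysis):

# Both should show roughly 60% non-diabetic / 40% diabetic
prop.table(table(diabetes_df_train$diabetes))
prop.table(table(diabetes_df_test$diabetes))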
# Specify a logistic regression model
logistic_model <- logistic_reg() %>%
# Set the engine
set_engine('glm') %>%
# Set the mode
set_mode('classification')
# Fit to training data
logistic_fit <- logistic_model %>%
fit(diabetes ~ .,
data = diabetes_df_train)
# Print model fit object
logistic_fit %>%
DT_tidy_model()
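Before moving to boosting, the logistic fit can be scored on the held-out split. A minimal sketch using parsnip's augment() and yardstick; the metric choices here are illustrative rather than from the original analysis:

# augment() appends .pred_class and the class probabilities to the test rows
logistic_preds <- augment(logistic_fit, new_data = diabetes_df_test)

# Accuracy and kappa from the hard class predictions
logistic_preds %>% metrics(truth = diabetes, estimate = .pred_class)

# ROC AUC on the probability of the "1" (diabetic) level
logistic_preds %>% roc_auc(truth = diabetes, .pred_1, event_level = "second")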
xgb_spec <- boost_tree(
trees = 2000,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(), ## first three: model complexity
sample_size = tune(),
mtry = tune(), ## randomness
learn_rate = tune() ## step size
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_spec
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 2000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
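The tune() placeholders still need values before this spec can be fit. A sketch of one common way to resolve them, using a space-filling grid and cross-validation with the standard tune/dials workflow; the grid size, seed, and fold settings are arbitrary assumptions, not from the original analysis:

xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),          # xgboost takes sample_size as a proportion
  finalize(mtry(), diabetes_df_train),  # mtry's upper bound depends on the data
  learn_rate(),
  size = 30
)

xgb_wf <- workflow() %>%
  add_formula(diabetes ~ .) %>%
  add_model(xgb_spec)

set.seed(234)
diabetes_folds <- vfold_cv(diabetes_df_train, strata = diabetes)

xgb_res <- tune_grid(
  xgb_wf,
  resamples = diabetes_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)

show_best(xgb_res, metric = "roc_auc")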