Diabetes Prediction using Tidymodels

library(tidyverse)
library(tidymodels)
library(data.table)
library(gtsummary)
library(mTools)
diabetes_df_all <- fread("data/diabetes_prediction_dataset.csv")
Data processing
diabetes_df_all[, diabetes_char := factor(diabetes,
                                          levels = c(0, 1),
                                          labels = c("Non diabetic", "Diabetic"))]
# Down-sample the negative class so that the positive class makes up
# pos_class_perc of the returned rows
re_balance_class <- function(df, outcome_col = "diabetes_char",
                             pos_class = "Diabetic", pos_class_perc = .4){
  pos_class_df = df[get(outcome_col) == pos_class]
  neg_class    = df[get(outcome_col) != pos_class]
  pos_perc     = nrow(pos_class_df)/nrow(df)               # current positive share
  N            = round(nrow(pos_class_df)/pos_class_perc)  # target total row count
  Nneg         = N - nrow(pos_class_df)                    # negatives to keep
  neg_class_df = neg_class[sample(1:.N, Nneg)]             # random down-sample
  rbind(pos_class_df, neg_class_df)
}
diabetes_df = re_balance_class(df = diabetes_df_all)
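As a quick sanity check (a sketch added here, not part of the original analysis), the positive class should now account for roughly pos_class_perc (40%) of the rebalanced rows:

diabetes_df[, .N, by = diabetes_char][, prop := round(N/sum(N), 2)][]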
Summary Stats
library(ggiraph)
db_perc <- diabetes_df[, .(freq = .N), by = diabetes_char][
  , perc := round(freq/sum(freq) * 100, 1)]
ggplot(db_perc, aes(diabetes_char, freq, fill = diabetes_char))+
geom_bar_interactive(width = 0.5, stat = "identity")+
geom_text(aes(label = paste0(freq, "(", perc, "%)")),
position = position_dodge(width = 0.5),
vjust = 0.05)+
scale_fill_brewer(name = "", type = "qual", palette = "Dark2")+
theme_minimal()+
theme(
legend.position = "bottom"
)
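Note that geom_bar_interactive() only renders its interactive features once the plot is passed through ggiraph::girafe(); on its own it draws an ordinary static bar chart. A minimal sketch (last_plot() simply grabs the ggplot built above):

girafe(ggobj = last_plot())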
tab2 <- diabetes_df %>%
  tbl_summary(
    by = diabetes_char,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c(
      "{mean} ({sd})",
      "{median} ({p25}, {p75})",
      "[{min}, {max}]"
    ),
    missing = "ifany"
  ) %>%
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 2))
tab_df = as.data.frame(tab2)
nms <- names(tab_df)
nms <- gsub("\\*", "", nms)  # strip the markdown bold markers from the headers
names(tab_df) <- nms
data_table(tab_df)
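data_table() here (and DT_tidy_model() further down) look like helpers from the author's mTools package. If that package isn't available, DT::datatable() from the DT package is a close stand-in (a substitute sketch, not the original helper):

DT::datatable(tab_df)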
Model Fitting
set.seed(100)
diabetes_df[, diabetes := as.factor(diabetes)]

diabetes_df_split <- initial_split(diabetes_df[, .SD, .SDcols = !"diabetes_char"],
                                   strata = diabetes)

diabetes_df_train <- training(diabetes_df_split)
diabetes_df_test  <- testing(diabetes_df_split)
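Because the split is stratified on diabetes, the class proportions should come out nearly identical in the two sets. A quick check (a sketch, not in the original):

prop.table(table(diabetes_df_train$diabetes))
prop.table(table(diabetes_df_test$diabetes))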
# Specify a logistic regression model
logistic_model <- logistic_reg() %>%
  # Set the engine
  set_engine('glm') %>%
  # Set the mode
  set_mode('classification')
# Fit to training data
logistic_fit <- logistic_model %>%
  fit(diabetes ~ .,
      data = diabetes_df_train)

# Print model fit object
logistic_fit %>%
  DT_tidy_model()
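DT_tidy_model() appears to be another mTools display helper. The stock tidymodels route to the same coefficient table is broom's tidy(), shown here as a substitute sketch:

logistic_fit %>%
  tidy()  # term, estimate, std.error, statistic, p.value as a tibble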
xgb_spec <- boost_tree(
  trees = 2000,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(), ## first three: model complexity
sample_size = tune(),
mtry = tune(), ## randomness
learn_rate = tune() ## step size
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
xgb_spec
Boosted Tree Model Specification (classification)
Main Arguments:
mtry = tune()
trees = 2000
min_n = tune()
tree_depth = tune()
learn_rate = tune()
loss_reduction = tune()
sample_size = tune()
Computational engine: xgboost
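With every hyperparameter marked tune(), the natural next step is to pair the spec with resamples and a tuning grid. The sketch below shows one common approach, a latin hypercube grid from dials plus tune_grid(); the grid size, seed, and fold setup are illustrative assumptions, not values from this post:

# Space-filling grid over the tuned parameters; mtry must be finalized
# against the training data because its upper bound depends on column count
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), diabetes_df_train),
  learn_rate(),
  size = 30
)

# Bundle the formula and model spec into a workflow
xgb_wf <- workflow() %>%
  add_formula(diabetes ~ .) %>%
  add_model(xgb_spec)

# Stratified cross-validation folds for tuning
set.seed(234)
db_folds <- vfold_cv(diabetes_df_train, strata = diabetes)

xgb_res <- tune_grid(
  xgb_wf,
  resamples = db_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)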