# Reproducibility: fix the RNG seed before any resampling/tuning below.
set.seed(42)
# Data wrangling (dplyr/tidyr pipes, read/select/mutate/drop_na).
library(tidyverse)
# mlr3 ecosystem: tasks, learners, resampling (mlr3verse), plus tuning
# infrastructure and the predefined search spaces used via lts() below.
library(mlr3verse)
library(mlr3tuning)
library(mlr3tuningspaces)
Read in data
# Load the merged ALL dataset from disk.
dat <- read.csv(file = "./data/all.all.df.csv")
Make the outcome a binary factor (yes/no relapse)
# Encode relapse as a two-level factor, with "yes" as the first level.
rbin_levels <- c("yes", "no")
dat$rbin <- factor(dat$rbin, levels = rbin_levels)
Filter out any tests that are post-relapse
# Keep only tests taken before the date of relapse (bdate < dor);
# patients who never relapsed (dor is NA) keep all of their tests.
# which() deliberately drops rows where the comparison evaluates to NA.
pre_relapse <- which(dat$bdate < dat$dor | is.na(dat$dor))
dat <- dat[pre_relapse, ]
Filter out relapse >720 days
# Drop relapses occurring at or beyond 720 days; non-relapsers ("no")
# are always retained. which() drops rows where the condition is NA.
within_720 <- which(dat$rbin == "no" | dat$rtime < 720)
dat <- dat[within_720, ]
Filter out any missing tests
# Drop rows missing any of the eight chimerism measurements
# (bone-marrow and peripheral-blood columns).
chim_cols <- c("bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
               "pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34")
dat <- dat[complete.cases(dat[, chim_cols]), ]
Get $p(\text{relapse})$ for the baseline (no-information) model
# Baseline (no-information) probability of relapse: p(rbin == "yes").
# BUG FIX: with levels = c("yes", "no"), as.numeric() maps yes -> 1 and
# no -> 2, so sum(as.numeric(dat$rbin) - 1) / nrow(dat) counted the
# "no" rows — i.e. it computed p(no relapse), not p(relapse).
prbin <- mean(dat$rbin == "yes")
# dat2 <- dat %>%
# select(rbin, txage, hla, tbi, abd, ci, mtx, mmf, agvhd, cgvhd,
# bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34,
# pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34, ID)
# Restrict to the modelling variables: outcome, clinical covariates,
# the eight chimerism measurements, and the patient identifier.
model_cols <- c("rbin", "sex", "txage",
                "rstatprtx", "ghgp", "tbi",
                "bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
                "pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34", "ID")
dat2 <- dat %>%
  select(all_of(model_cols))
# Prepare the model frame: character columns -> factors, integer
# columns -> doubles, complete cases only, then drop unused factor
# levels. across(where(...)) replaces the superseded mutate_if() verbs.
dat2 <- dat2 %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(across(where(is.integer), as.numeric)) %>%
  # mutate(abd = tolower(abd)) %>%   # disabled upstream; kept for reference
  drop_na() %>%
  droplevels()
# Binary classification task: predict relapse (rbin) from dat2.
task_chim <- as_task_classif(dat2, target = "rbin", id = "all")
Define patients for use in cross validation
# task_chim$col_roles$group <- "ID"
# Keep the patient identifier out of the feature set so it is not used
# as a predictor.
# NOTE(review): the prose above and the commented-out grouping line
# suggest ID was meant to group cross-validation folds by patient
# (keeping a patient's repeated tests in one fold). As written, ID is
# only removed from the features and no "group" role is assigned, so CV
# folds may split a patient's tests — confirm whether grouped CV was
# intended (e.g. task_chim$set_col_roles("ID", roles = "group")).
task_chim$set_col_roles("ID", remove_from = 'feature')
# List the predefined tuning search spaces shipped with
# mlr3tuningspaces (output shown below); classif.ranger.default is the
# one pulled in by lts() for tuning.
as.data.table(mlr_tuning_spaces)
## key label
## 1: classif.glmnet.default Classification GLM with Default
## 2: classif.glmnet.rbv2 Classification GLM with RandomBot
## 3: classif.kknn.default Classification KKNN with Default
## 4: classif.kknn.rbv2 Classification KKNN with RandomBot
## 5: classif.ranger.default Classification Ranger with Default
## 6: classif.ranger.rbv2 Classification Ranger with RandomBot
## 7: classif.rpart.default Classification Rpart with Default
## 8: classif.rpart.rbv2 Classification Rpart with RandomBot
## 9: classif.svm.default Classification SVM with Default
## 10: classif.svm.rbv2 Classification SVM with RandomBot
## 11: classif.xgboost.default Classification XGBoost with Default
## 12: classif.xgboost.rbv2 Classification XGBoost with RandomBot
## 13: regr.glmnet.default Regression GLM with Default
## 14: regr.glmnet.rbv2 Regression GLM with RandomBot
## 15: regr.kknn.default Regression KKNN with Default
## 16: regr.kknn.rbv2 Regression KKNN with RandomBot
## 17: regr.ranger.default Regression Ranger with Default
## 18: regr.ranger.rbv2 Regression Ranger with RandomBot
## 19: regr.rpart.default Regression Rpart with Default
## 20: regr.rpart.rbv2 Regression Rpart with RandomBot
## 21: regr.svm.default Regression SVM with Default
## 22: regr.svm.rbv2 Regression SVM with RandomBot
## 23: regr.xgboost.default Regression XGBoost with Default
## 24: regr.xgboost.rbv2 Regression XGBoost with RandomBot
## key label
## learner n_values
## 1: classif.glmnet 2
## 2: classif.glmnet 2
## 3: classif.kknn 3
## 4: classif.kknn 1
## 5: classif.ranger 4
## 6: classif.ranger 8
## 7: classif.rpart 3
## 8: classif.rpart 4
## 9: classif.svm 4
## 10: classif.svm 5
## 11: classif.xgboost 8
## 12: classif.xgboost 13
## 13: regr.glmnet 2
## 14: regr.glmnet 2
## 15: regr.kknn 3
## 16: regr.kknn 1
## 17: regr.ranger 4
## 18: regr.ranger 7
## 19: regr.rpart 3
## 20: regr.rpart 4
## 21: regr.svm 4
## 22: regr.svm 5
## 23: regr.xgboost 8
## 24: regr.xgboost 13
## learner n_values
# Tune the random forest (ranger) over its predefined default search
# space by grid search: 5-fold CV, selecting on balanced accuracy,
# with a budget of 100 evaluations.
rf_search_space <- lts(lrn("classif.ranger"))
cv5 <- rsmp("cv", folds = 5)
instance_rf <- tune(
  method = "grid_search",
  task = task_chim,
  learner = rf_search_space,
  resampling = cv5,
  measure = msr("classif.bacc"),
  term_evals = 100
)
# Best hyperparameter configuration and its CV balanced accuracy
# (output shown below).
instance_rf$result
## mtry.ratio replace sample.fraction num.trees learner_param_vals x_domain
## 1: 0.8888889 FALSE 0.8 1556 <list[5]> <list[4]>
## classif.bacc
## 1: 0.8514847
Stanford Medicine, dcshyr@stanford.edu
University of Utah, simon.brewer@geog.utah.edu