Set random seed for reproducibility
set.seed(1234)
library(tidyverse)
library(lubridate)
library(ggpubr)
library(mlr3verse)
library(ranger)
library(vip)
library(pdp)
library(gridExtra)
Read in data
aml.df <- read.csv("./data/aml.all.df.csv")
Convert dates
aml.df$dot <- ymd(aml.df$dot)
aml.df$dor <- ymd(aml.df$dor)
aml.df$bdate <- ymd(aml.df$bdate)
aml.df$pdate <- ymd(aml.df$pdate)
Convert all character strings to factors
aml.df <- aml.df %>% mutate_if(is.character,as.factor)
Make outcome a binary variable (0/1 relapse)
aml.df$rbin <- factor(aml.df$rbin, levels = c("yes", "no"))
Filter out any tests that are post-relapse
aml.df <- aml.df[which(aml.df$bdate < aml.df$dor | is.na(aml.df$dor)), ]
Filter out relapse >720 days
aml.df <- aml.df[which(aml.df$rbin == "no" | aml.df$rtime < 720),]
Filter out any missing tests
aml.df <- aml.df[!is.na(aml.df$bmc_cdw) & !is.na(aml.df$bmc_cd3) &
!is.na(aml.df$bmc_cd15) & !is.na(aml.df$bmc_cd34) &
!is.na(aml.df$pbc_cdw) & !is.na(aml.df$pbc_cd3) &
!is.na(aml.df$pbc_cd15) & !is.na(aml.df$pbc_cd34),]
aml.df <<- aml.df
# dat2 <- dat %>%
# select(rbin, txage, hla, tbi, abd, ci, mtx, mmf, agvhd, cgvhd,
# bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34,
# pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34, ID)
aml.df <- aml.df %>%
select(rbin, sex, txage,
rstatprtx, ghgp, tbi,
bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34,
pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34, ID)
aml.df <- aml.df %>%
mutate_if(is.character, as.factor) %>%
mutate_if(is.integer, as.numeric) %>%
# mutate(abd = tolower(abd)) %>%
drop_na() %>%
droplevels()
task_chim <- TaskClassif$new(id = "all", backend = aml.df,
target = "rbin")
Define patients for use in cross validation
# task_chim$col_roles$group <- "ID"
task_chim$set_col_roles("ID", remove_from = 'feature')
lrn_rf <- lrn("classif.ranger",
mtry.ratio = 0.56,
sample.fraction = 0.9,
num.trees = 500,
replace = FALSE,
importance = 'permutation',
predict_type = "prob")
lrn_rf$train(task_chim)
How did it do?
lrn_rf$model
## Ranger result
##
## Call:
## ranger::ranger(dependent.variable.name = task$target_names, data = task$data(), probability = self$predict_type == "prob", case.weights = task$weights$weight, num.threads = 1L, sample.fraction = 0.9, num.trees = 500L, replace = FALSE, importance = "permutation", mtry = 8)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 102
## Number of independent variables: 13
## Mtry: 8
## Target node size: 10
## Variable importance mode: permutation
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1368587
vip(lrn_rf$model, num_features = 20, scale = TRUE)
myvars <- c("txage", "sex", "rstatprtx", "ghgp", "tbi",
"bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
"pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34")
for (i in myvars) {
p1 <- pdp::partial(lrn_rf$model,
train = task_chim$data(),
pred.var = i,
prob = TRUE,
plot = TRUE, rug = TRUE,
plot.engine = "ggplot2") + theme_light() +
ggtitle(paste("PDP:", i))
print(p1)
}
pw <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "pbc_cdw", prob = TRUE)
p3 <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "pbc_cd3", prob = TRUE)
p15 <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "pbc_cd15", prob = TRUE)
p34 <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "pbc_cd34", prob = TRUE)
plot.df <- data.frame(test = c(pw$pbc_cdw, p3$pbc_cd3, p15$pbc_cd15, p34$pbc_cd34),
yhat = c(pw$yhat, p3$yhat, p15$yhat, p34$yhat),
type = c(rep("cdw", nrow(pw)),
rep("cd3", nrow(p3)),
rep("cd15", nrow(p15)),
rep("cd34", nrow(p34))
))
ggline(plot.df, x = "test", y = "yhat", col = "type",
numeric.x.axis = TRUE,
plot_type = "l", size = 1.5, main = "Chimerism values (PB, prob)")
pw <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "bmc_cdw", prob = TRUE)
p3 <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "bmc_cd3", prob = TRUE)
p15 <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "bmc_cd15", prob = TRUE)
p34 <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = "bmc_cd34", prob = TRUE)
plot.df <- data.frame(test = c(pw$bmc_cdw, p3$bmc_cd3, p15$bmc_cd15, p34$bmc_cd34),
yhat = c(pw$yhat, p3$yhat, p15$yhat, p34$yhat),
type = c(rep("cdw", nrow(pw)),
rep("cd3", nrow(p3)),
rep("cd15", nrow(p15)),
rep("cd34", nrow(p34))
))
ggline(plot.df, x = "test", y = "yhat", col = "type",
numeric.x.axis = TRUE,
plot_type = "l", size = 1.5,
main = "Chimerism values (BM, prob)")
myvars <- c("sex", "rstatprtx", "ghgp")
for (i in myvars) {
# Compute partial dependence data for lstat and rm
pd <- pdp::partial(lrn_rf$model,
train = task_chim$data(),
pred.var = c("txage", i), chull = TRUE, prob = TRUE)
# Default PDP
# pdp1 <- plotPartial(pd, main = i)
# print(pdp1)
p1 <- ggplot(pd, aes_string(x = "txage", y = "yhat", col = i)) +
geom_line(size = 1.5) +
ggtitle(paste("PDP:", i)) +
theme_bw()
print(p1)
}
pd <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = c("bmc_cd3", "bmc_cd34"), chull = TRUE, prob = TRUE)
pdp1 <- plotPartial(pd, main = "Chimerism (BM, prob)")
pdp2 <- plotPartial(pd, levelplot = FALSE, zlab = "rbin", colorkey = TRUE,
screen = list(z = 160, x = -60))
grid.arrange(pdp1, pdp2, nrow = 1)
print(pdp1)
pd <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = c("pbc_cd3", "pbc_cd34"), chull = TRUE, prob = TRUE)
pdp1 <- plotPartial(pd, main = "Chimerism (PB, prob)")
pdp2 <- plotPartial(pd, levelplot = FALSE, zlab = "rbin", colorkey = TRUE,
screen = list(z = 160, x = -60))
grid.arrange(pdp1, pdp2, nrow = 1)
print(pdp1)
myvars <- c("bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
"pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34")
for (i in myvars) {
# Compute partial dependence data for lstat and rm
pd <- partial(lrn_rf$model,
train = task_chim$data(),
pred.var = c("txage", i), chull = TRUE, prob = TRUE)
# Default PDP
pdp1 <- plotPartial(pd, main = i)
pdp2 <- plotPartial(pd, levelplot = FALSE, zlab = "rbin", colorkey = TRUE,
screen = list(z = 160, x = -60))
grid.arrange(pdp1, pdp2, nrow = 1)
print(pdp1)
}
Stanford Medicine, dcshyr@stanford.edu↩︎
University of Utah, simon.brewer@geog.utah.edu↩︎