Libraries

Set random seed for reproducibility

set.seed(1234)
library(tidyverse)
library(lubridate)
library(ggpubr)
library(mlr3verse)
library(ranger)
library(vip)
library(pdp)
library(gridExtra)

Data

Read in data

aml.df <- read.csv("./data/aml.all.df.csv")

Convert dates

aml.df$dot <- ymd(aml.df$dot)
aml.df$dor <- ymd(aml.df$dor)
aml.df$bdate <- ymd(aml.df$bdate)
aml.df$pdate <- ymd(aml.df$pdate)

Convert all character strings to factors

aml.df <- aml.df %>% mutate_if(is.character,as.factor)

Make outcome a binary variable (0/1 relapse)

aml.df$rbin <- factor(aml.df$rbin, levels = c("yes", "no"))

Filter out any tests that are post-relapse

aml.df <- aml.df[which(aml.df$bdate < aml.df$dor | is.na(aml.df$dor)), ]

Filter out relapse >720 days

aml.df <- aml.df[which(aml.df$rbin == "no" | aml.df$rtime < 720),]

Filter out any missing tests

aml.df <- aml.df[!is.na(aml.df$bmc_cdw) & !is.na(aml.df$bmc_cd3) & 
                   !is.na(aml.df$bmc_cd15) & !is.na(aml.df$bmc_cd34) &
                   !is.na(aml.df$pbc_cdw) & !is.na(aml.df$pbc_cd3) & 
                   !is.na(aml.df$pbc_cd15) & !is.na(aml.df$pbc_cd34),]
aml.df <<- aml.df
# dat2 <- dat %>%
#   select(rbin, txage, hla, tbi, abd, ci, mtx, mmf, agvhd, cgvhd,
#          bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34, 
#          pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34, ID)
aml.df <- aml.df %>%
  select(rbin, sex, txage, 
         rstatprtx, ghgp, tbi, 
         bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34, 
         pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34, ID)

aml.df <- aml.df %>% 
  mutate_if(is.character, as.factor)  %>% 
  mutate_if(is.integer, as.numeric) %>%
  # mutate(abd = tolower(abd)) %>%
  drop_na() %>%
  droplevels()

Random forest

Set up task

task_chim <- TaskClassif$new(id = "all", backend = aml.df, 
                             target = "rbin")

Define patients for use in cross validation

# task_chim$col_roles$group <- "ID"
task_chim$set_col_roles("ID", remove_from = 'feature')

Learner

lrn_rf <- lrn("classif.ranger",
              mtry.ratio = 0.56,
              sample.fraction = 0.9,
              num.trees = 500, 
              replace = FALSE,
              importance = 'permutation',
              predict_type = "prob")

Training

lrn_rf$train(task_chim)

How did it do?

lrn_rf$model
## Ranger result
## 
## Call:
##  ranger::ranger(dependent.variable.name = task$target_names, data = task$data(),      probability = self$predict_type == "prob", case.weights = task$weights$weight,      num.threads = 1L, sample.fraction = 0.9, num.trees = 500L,      replace = FALSE, importance = "permutation", mtry = 8) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      102 
## Number of independent variables:  13 
## Mtry:                             8 
## Target node size:                 10 
## Variable importance mode:         permutation 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.1368587

Variable importance

vip(lrn_rf$model, num_features = 20, scale = TRUE)

Partial plots

myvars <- c("txage", "sex", "rstatprtx", "ghgp", "tbi",
            "bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
            "pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34")

for (i in myvars) {
  
  p1 <- pdp::partial(lrn_rf$model, 
                     train = task_chim$data(),
                     pred.var = i, 
                     prob = TRUE,
                     plot = TRUE, rug = TRUE, 
                     plot.engine = "ggplot2") + theme_light() + 
    ggtitle(paste("PDP:", i)) 
  print(p1)
}

All peripheral blood lineaages

pw <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "pbc_cdw", prob = TRUE) 
p3 <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "pbc_cd3", prob = TRUE) 
p15 <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "pbc_cd15", prob = TRUE) 
p34 <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "pbc_cd34", prob = TRUE) 

plot.df <- data.frame(test = c(pw$pbc_cdw, p3$pbc_cd3, p15$pbc_cd15, p34$pbc_cd34),
                      yhat = c(pw$yhat, p3$yhat, p15$yhat, p34$yhat),
                      type = c(rep("cdw", nrow(pw)),
                               rep("cd3", nrow(p3)),
                               rep("cd15", nrow(p15)),
                               rep("cd34", nrow(p34))
                      ))

ggline(plot.df, x = "test", y = "yhat", col = "type", 
       numeric.x.axis = TRUE, 
       plot_type = "l", size = 1.5, main = "Chimerism values (PB, prob)")

All bone marrow lineaages

pw <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "bmc_cdw", prob = TRUE) 
p3 <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "bmc_cd3", prob = TRUE) 
p15 <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "bmc_cd15", prob = TRUE) 
p34 <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = "bmc_cd34", prob = TRUE) 

plot.df <- data.frame(test = c(pw$bmc_cdw, p3$bmc_cd3, p15$bmc_cd15, p34$bmc_cd34),
                      yhat = c(pw$yhat, p3$yhat, p15$yhat, p34$yhat),
                      type = c(rep("cdw", nrow(pw)),
                               rep("cd3", nrow(p3)),
                               rep("cd15", nrow(p15)),
                               rep("cd34", nrow(p34))
                      ))

ggline(plot.df, x = "test", y = "yhat", col = "type", 
       numeric.x.axis = TRUE,
       plot_type = "l", size = 1.5, 
       main = "Chimerism values (BM, prob)")

2D plots

myvars <- c("sex", "rstatprtx", "ghgp")

for (i in myvars) {
  # Compute partial dependence data for lstat and rm
  pd <- pdp::partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = c("txage", i), chull = TRUE, prob = TRUE)
  
  # Default PDP
  # pdp1 <- plotPartial(pd, main = i)
  # print(pdp1)
  p1 <- ggplot(pd, aes_string(x = "txage", y = "yhat", col = i)) + 
    geom_line(size = 1.5) + 
    ggtitle(paste("PDP:", i)) +
    theme_bw()
  print(p1)
  
}

pd <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = c("bmc_cd3", "bmc_cd34"), chull = TRUE, prob = TRUE)
pdp1 <- plotPartial(pd, main = "Chimerism (BM, prob)")

pdp2 <- plotPartial(pd, levelplot = FALSE, zlab = "rbin", colorkey = TRUE,
                    screen = list(z = 160, x = -60))

grid.arrange(pdp1, pdp2, nrow = 1) 

print(pdp1)

pd <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = c("pbc_cd3", "pbc_cd34"), chull = TRUE, prob = TRUE)
pdp1 <- plotPartial(pd, main = "Chimerism (PB, prob)")

pdp2 <- plotPartial(pd, levelplot = FALSE, zlab = "rbin", colorkey = TRUE,
                    screen = list(z = 160, x = -60))

grid.arrange(pdp1, pdp2, nrow = 1) 

print(pdp1)

myvars <- c("bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
            "pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34")

for (i in myvars) {
  # Compute partial dependence data for lstat and rm
  pd <- partial(lrn_rf$model, 
              train = task_chim$data(), 
              pred.var = c("txage", i), chull = TRUE, prob = TRUE)
  
  # Default PDP
  pdp1 <- plotPartial(pd, main = i)
  
  pdp2 <- plotPartial(pd, levelplot = FALSE, zlab = "rbin", colorkey = TRUE,
                      screen = list(z = 160, x = -60))
  
  grid.arrange(pdp1, pdp2, nrow = 1) 
  
  print(pdp1)
  
}

  1. Stanford Medicine, ↩︎

  2. University of Utah, ↩︎