Libraries

Set random seed for reproducibility

# Fix R's RNG state before any modelling so that results can be reproduced
# from a fresh session.
set.seed(1234)
# tidyverse: %>% and the mutate_if()/select()/drop_na() verbs used below.
library(tidyverse)
# lubridate: ymd() date parsing.
library(lubridate)
# NOTE(review): ggpubr is not referenced in the code shown here -- confirm it
# is still needed.
library(ggpubr)
# ranger: random forest fit.
library(ranger)
# vivid: vivi(), viviHeatmap(), viviNetwork().
library(vivid)

Data

Read in data

# Load the input table; the path is relative to the working directory.
all.df <- read.csv(file = "./data/all.all.df.csv")

Convert dates

# Parse the four date columns (year-month-day order) in a single step.
all.df <- all.df %>%
  mutate(across(c(dot, dor, bdate, pdate), ymd))

Convert all character strings to factors

# Turn every character column into a factor. across() + where() is the
# current replacement for the superseded mutate_if() scoped verb.
all.df <- all.df %>%
  mutate(across(where(is.character), as.factor))

Make outcome a two-level factor (relapse "yes"/"no", with "yes" as the first level)

# Recode the outcome as a two-level factor with "yes" as the first level; any
# value other than "yes" or "no" becomes NA.
all.df <- all.df %>%
  mutate(rbin = factor(rbin, levels = c("yes", "no")))

Filter out any tests that are post-relapse

# Keep a row when dor is missing, or when bdate is strictly earlier than dor.
# Rows for which the condition evaluates to NA are dropped by which().
all.df <- all.df[with(all.df, which(is.na(dor) | bdate < dor)), ]

Filter out relapses at 720 days or later

# Keep every row with rbin == "no", plus any row whose rtime is below 720.
# Rows for which the condition evaluates to NA are dropped by which().
# NOTE(review): rows with rtime exactly 720 are dropped -- confirm the cutoff
# is meant to be exclusive.
all.df <- all.df[with(all.df, which(rtime < 720 | rbin == "no")), ]

Filter out rows with any missing test value

# Keep only rows in which all eight test measurements (cdw, cd3, cd15 and cd34
# for both the bmc_ and pbc_ prefixes) are present. Selecting the columns by
# name makes a missing column an error instead of silently yielding zero rows.
all.df <- all.df[complete.cases(all.df[, c(
  "bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
  "pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34"
)]), ]
# Restrict the table to the outcome, the modelling covariates and the ID
# column, in this order.
all.df <- select(
  all.df,
  rbin,
  sex, txage, rstatprtx, ghgp, tbi,
  bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34,
  pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34,
  ID
)

# Final type clean-up before modelling:
#   * character columns -> factors, then integer columns -> doubles
#     (across() + where() replaces the superseded mutate_if())
#   * drop rows with an NA in any remaining column (ID included)
#   * drop factor levels that no longer occur after the row filters
#   * remove ID last, so it is not passed on as a column
all.df <- all.df %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(across(where(is.integer), as.numeric)) %>%
  drop_na() %>%
  droplevels() %>%
  select(-ID)

Random forest

# Fit a ranger forest with rbin as the response and every other column of
# all.df as a predictor. importance = 'impurity' selects the impurity variable
# importance mode; probability = TRUE makes this a probability-estimation
# forest rather than one returning hard class labels.
all_rf <- ranger(rbin ~ ., all.df, 
                 importance = 'impurity', 
                 probability = TRUE)
# Print the fitted model summary.
all_rf
## Ranger result
## 
## Call:
##  ranger(rbin ~ ., all.df, importance = "impurity", probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      163 
## Number of independent variables:  13 
## Mtry:                             3 
## Target node size:                 10 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.1029066
# Build the vivid matrix for the fitted forest. With importanceType =
# "impurity" the importance values come from the fit's embedded impurity
# measure; interactions are then calculated by vivi().
all_vivi <- vivi(
  data = all.df,
  fit = all_rf,
  response = "rbin",
  importanceType = "impurity"
)
## Embedded impurity variable importance method used.
## Calculating interactions...

Heatmap

# Draw the heatmap for the vivid matrix computed above.
viviHeatmap(all_vivi)

Interaction network

# Draw the interaction network for the vivid matrix computed above.
viviNetwork(all_vivi)


  1. Stanford Medicine, ↩︎

  2. University of Utah, ↩︎