AML dataset - vivid plots

Libraries

Set random seed for reproducibility

# Seed R's random number generator so repeated runs draw the same random
# numbers.
set.seed(1234)

library(tidyverse)
library(lubridate)
library(ggpubr)
library(ranger)
library(vivid)

Data

Read in data

# Load the AML table from the project-relative data directory.
aml.df <- read.csv(file = "./data/aml.all.df.csv")

Convert dates

# Parse the four date columns (year-month-day text) in one pass; each column
# is replaced in place by the result of ymd().
aml.df[c("dot", "dor", "bdate", "pdate")] <-
  lapply(aml.df[c("dot", "dor", "bdate", "pdate")], ymd)

Convert all character strings to factors

# Turn every character column into a factor. across(where(...)) is the
# current dplyr idiom; the scoped verb mutate_if() is superseded.
aml.df <- aml.df %>%
  mutate(across(where(is.character), as.factor))

Make outcome a two-level factor (relapse: yes/no)

# Recode the outcome as a factor whose levels are ordered "yes", then "no".
# Any value other than these two becomes NA.
aml.df <- aml.df %>%
  mutate(rbin = factor(rbin, levels = c("yes", "no")))

Filter out any tests that are post-relapse

# Keep rows with no relapse date (dor) recorded, plus rows whose bdate falls
# strictly before dor. which() drops rows where the comparison is NA.
aml.df <- aml.df[with(aml.df, which(is.na(dor) | bdate < dor)), ]

Filter out relapses at 720 days or later

# Keep rows with rbin == "no", plus rows with rtime strictly below 720
# (rtime == 720 is excluded). which() drops rows where the test is NA.
aml.df <- aml.df[with(aml.df, which(rtime < 720 | rbin == "no")), ]

Filter out any missing tests

# Keep only rows where all eight bmc_* / pbc_* test columns are non-missing.
# complete.cases() on the selected columns is equivalent to chaining
# !is.na() over each of them. The former `aml.df <<- aml.df` is dropped: at
# top level it is a no-op in the global environment and would leak the data
# frame into it when the document is rendered in any other environment.
aml.df <- aml.df[
  complete.cases(
    aml.df[, c(
      "bmc_cdw", "bmc_cd3", "bmc_cd15", "bmc_cd34",
      "pbc_cdw", "pbc_cd3", "pbc_cd15", "pbc_cd34"
    )]
  ),
]

# Reduce the table to the outcome (rbin) and the model predictors, then tidy
# column types and drop incomplete rows.
# ID is deliberately carried through drop_na() and only removed at the end,
# so rows with a missing ID are discarded as well.
# NOTE(review): confirm that filtering on ID is intended.
aml.df <- aml.df %>%
  select(
    rbin, sex, txage,
    rstatprtx, ghgp, tbi,
    bmc_cdw, bmc_cd3, bmc_cd15, bmc_cd34,
    pbc_cdw, pbc_cd3, pbc_cd15, pbc_cd34, ID
  ) %>%
  # across(where(...)) replaces the superseded mutate_if().
  mutate(across(where(is.character), as.factor)) %>%
  mutate(across(where(is.integer), as.numeric)) %>%
  drop_na() %>%
  # Remove factor levels left empty by the row filtering.
  droplevels() %>%
  select(-ID)

Random forest

# Fit a random forest with rbin as the response and every remaining column of
# aml.df as a predictor. probability = TRUE requests a probability-estimation
# forest; importance = 'impurity' stores impurity-based variable importance.
# NOTE(review): no seed is passed here; presumably the fit relies on the
# set.seed() call at the top of the document for reproducibility - confirm.
aml_rf <- ranger(rbin ~ ., aml.df, 
                 importance = 'impurity', 
                 probability = TRUE)
# Auto-print the fitted model summary.
aml_rf

## Ranger result
## 
## Call:
##  ranger(rbin ~ ., aml.df, importance = "impurity", probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      102 
## Number of independent variables:  13 
## Mtry:                             3 
## Target node size:                 10 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.127328

# Build the vivid importance/interaction matrix from the fitted forest,
# using the forest's embedded impurity importance. The result feeds the
# heatmap and network plots.
aml_vivi <- vivi(
  fit = aml_rf,
  data = aml.df,
  response = "rbin",
  importanceType = "impurity"
)

## Embedded impurity variable importance method used.

## Calculating interactions...

Heatmap

# Draw the vivid heatmap from the matrix computed by vivi().
viviHeatmap(mat = aml_vivi)

Interaction network

# Draw the vivid network plot from the same vivi() matrix.
viviNetwork(mat = aml_vivi)

Stanford Medicine, dcshyr@stanford.edu ↩︎
University of Utah, simon.brewer@geog.utah.edu ↩︎

AML dataset - `vivid` plots

David C. Shyr¹

Simon Brewer²

06 July, 2024

Libraries

Data

Random forest

Heatmap

Interaction network

AML dataset - vivid plots

David C. Shyr¹

Simon Brewer²

06 July, 2024

Libraries

Data

Random forest

Heatmap

Interaction network

AML dataset - `vivid` plots

David C. Shyr¹

Simon Brewer²