Run the Variational Bayes patient phenotyping model

Usage

run_Model(
  biomarkers,
  gmm_X,
  logit_X,
  gmm_delta = 1e-06,
  logit_delta = 1e-16,
  gmm_maxiters = 200,
  logit_maxiters = 10000,
  gmm_init = "kmeans",
  gmm_initParams = NULL,
  gmm_prior = NULL,
  logit_prior = NULL,
  gmm_stopIfELBOReverse = FALSE,
  gmm_verbose = FALSE,
  logit_verbose = FALSE,
  gmm_progressbar = FALSE,
  logit_progressbar = FALSE
)

Arguments

biomarkers: The EHR variables that are biomarkers. This is a vector of data column names corresponding to the biomarker variables.
gmm_X: n x p data matrix (or data frame that will be converted to a matrix).
logit_X: The input design matrix. Note the intercept column vector is assumed included.
gmm_delta: Change in ELBO that triggers algorithm stopping.
logit_delta: Change in ELBO that triggers algorithm stopping.
gmm_maxiters: The maximum iterations for VB GMM.
logit_maxiters: The maximum iterations for VB logit.
gmm_init: Initialize the clusters c("random", "kmeans", "dbscan").
gmm_initParams: Parameters for an initialiser requiring its own parameters e.g. dbscan requires 'eps' and 'minPts'.
gmm_prior: An informative prior for the GMM.
logit_prior: An informative prior for the logit.
gmm_stopIfELBOReverse: Stop the VB iterations if the ELBO reverses direction (TRUE or FALSE).
gmm_verbose: Print out information per iteration to track progress in case of long-running experiments.
logit_verbose: Print out information per iteration to track progress in case of long-running experiments.
gmm_progressbar: Show a progressbar driven by the GMM variational iterations.
logit_progressbar: Show a progressbar driven by the logit variational iterations.

Value

A list containing:

prevalence - The mean probability of latent phenotype given the data and priors.
biomarker_shift - A data frame containing the biomarker shifts from normal for the phenotype.
gmm - The VB GMM results. For details see help(vb_gmm_cavi).
logit - The VB Logit results. For details see help(logit_CAVI).

Examples

# \donttest{
##Example 1: Use the internal Sickle Cell Disease data to find the rare
##           phenotype.  SCD is extremely rare so we use DBSCAN to initialise
##           the VB GMM. We also use an informative prior for the mixing
##           coefficient and stop iterations when the ELBO starts to reverse
##           so that we stop when the minor (SCD) component is reached.

library(data.table)

# Load the SCD example data supplied with the VBphenoR package
data(scd_cohort)

# We will use the SCD biomarkers to discover the SCD latent class.
# X1 is the data matrix for the VB GMM.
X1 <- scd_cohort[,.(CBC,RC)]

# We need to supply DBSCAN hyper-parameters as we will initialise VBphenoR
# with DBSCAN. See help(DBSCAN) for details of these parameters.
initParams <- c(0.15, 5)
names(initParams) <- c('eps','minPts')

# Set an informative prior for the VB GMM mixing coefficient alpha
# hyper-parameter
prior_gmm <- list(
  alpha = 0.001
)

# Set informative priors for the beta coefficients of the VB logit
prior_logit <- list(mu=c(1,
                   mean(scd_cohort$age),
                   mean(scd_cohort$highrisk),
                   mean(scd_cohort$CBC),
                   mean(scd_cohort$RC)),
              Sigma=diag(1,5))           # Simplest isotropic case

# X2 is the design matrix for the VB logit
X2 <- scd_cohort[,.(age,highrisk,CBC,RC)]
X2[,age:=as.numeric(age)]
#>          age highrisk      CBC        RC
#>        <num>   <lgcl>    <num>     <num>
#>     1:     3     TRUE  9.25100 2.5290000
#>     2:    84     TRUE  8.04900 2.5210000
#>     3:     7     TRUE  8.93400 2.3990000
#>     4:    33     TRUE  9.44500 2.3580000
#>     5:    43    FALSE  8.79200 2.4070000
#>    ---                                  
#> 63250:    18     TRUE 13.92457 0.5225167
#> 63251:    56     TRUE 13.38314 0.7163306
#> 63252:    87     TRUE 12.81786 0.6605872
#> 63253:    63     TRUE 12.41853 0.8727518
#> 63254:    11     TRUE 12.79134 1.0333188
X2[,highrisk:=as.numeric(highrisk)]
#>          age highrisk      CBC        RC
#>        <num>    <num>    <num>     <num>
#>     1:     3        1  9.25100 2.5290000
#>     2:    84        1  8.04900 2.5210000
#>     3:     7        1  8.93400 2.3990000
#>     4:    33        1  9.44500 2.3580000
#>     5:    43        0  8.79200 2.4070000
#>    ---                                  
#> 63250:    18        1 13.92457 0.5225167
#> 63251:    56        1 13.38314 0.7163306
#> 63252:    87        1 12.81786 0.6605872
#> 63253:    63        1 12.41853 0.8727518
#> 63254:    11        1 12.79134 1.0333188
X2[,Intercept:=1]
#>          age highrisk      CBC        RC Intercept
#>        <num>    <num>    <num>     <num>     <num>
#>     1:     3        1  9.25100 2.5290000         1
#>     2:    84        1  8.04900 2.5210000         1
#>     3:     7        1  8.93400 2.3990000         1
#>     4:    33        1  9.44500 2.3580000         1
#>     5:    43        0  8.79200 2.4070000         1
#>    ---                                            
#> 63250:    18        1 13.92457 0.5225167         1
#> 63251:    56        1 13.38314 0.7163306         1
#> 63252:    87        1 12.81786 0.6605872         1
#> 63253:    63        1 12.41853 0.8727518         1
#> 63254:    11        1 12.79134 1.0333188         1
setcolorder(X2, c("Intercept","age","highrisk","CBC","RC"))

# Run the patient phenotyping model

# Need to state what columns are the biomarkers
biomarkers <- c('CBC', 'RC')
set.seed(123)

pheno_result <- run_Model(biomarkers,
                        gmm_X=X1, gmm_init="dbscan",
                        gmm_initParams=initParams,
                        gmm_maxiters=20, gmm_prior=prior_gmm,
                        gmm_stopIfELBOReverse=TRUE,
                        logit_X=X2, logit_prior=prior_logit
)

# S3 print method
print(pheno_result)
#> VBphenoR phenotyping results.
#> -----------------------------
#> 
#> EHR data with 63,254 rows
#> 
#> Prevalence = 0.312% in these data
#> Biomarker shift for patients with the phenotype of interest:
#>  biomarker shift
#>        CBC  6.25
#>         RC  4.18
# }