Run the Variational Bayes patient phenotyping model
Usage
run_Model(
biomarkers,
gmm_X,
logit_X,
gmm_delta = 1e-06,
logit_delta = 1e-16,
gmm_maxiters = 200,
logit_maxiters = 10000,
gmm_init = "kmeans",
gmm_initParams = NULL,
gmm_prior = NULL,
logit_prior = NULL,
gmm_stopIfELBOReverse = FALSE,
gmm_verbose = FALSE,
logit_verbose = FALSE,
gmm_progressbar = FALSE,
logit_progressbar = FALSE
)
Arguments
- biomarkers
The EHR variables that are biomarkers. This is a vector of data column names corresponding to the biomarker variables.
- gmm_X
n x p data matrix (or data frame that will be converted to a matrix).
- logit_X
The input design matrix. Note the intercept column vector is assumed included.
- gmm_delta
Change in ELBO that triggers algorithm stopping.
- logit_delta
Change in ELBO that triggers algorithm stopping.
- gmm_maxiters
The maximum iterations for VB GMM.
- logit_maxiters
The maximum iterations for VB logit.
- gmm_init
Initialize the clusters c("random", "kmeans", "dbscan").
- gmm_initParams
Parameters for an initialiser requiring its own parameters e.g. dbscan requires 'eps' and 'minPts'.
- gmm_prior
An informative prior for the GMM.
- logit_prior
An informative prior for the logit.
- gmm_stopIfELBOReverse
Stop the VB iterations if the ELBO reverses direction (TRUE or FALSE).
- gmm_verbose
Print out information per iteration to track progress in case of long-running experiments.
- logit_verbose
Print out information per iteration to track progress in case of long-running experiments.
- gmm_progressbar
Show a progressbar driven by the GMM variational iterations.
- logit_progressbar
Show a progressbar driven by the logit variational iterations.
Value
A list containing:
prevalence - The mean probability of latent phenotype given the data and priors.
biomarker_shift - A data frame containing the biomarker shifts from normal for the phenotype.
gmm - The VB GMM results. For details see help(vb_gmm_cavi).
logit - The VB Logit results. For details see help(logit_CAVI).
Examples
# \donttest{
##Example 1: Use the internal Sickle Cell Disease data to find the rare
## phenotype. SCD is extremely rare so we use DBSCAN to initialise
## the VB GMM. We also use an informative prior for the mixing
## coefficient and stop iterations when the ELBO starts to reverse
## so that we stop when the minor (SCD) component is reached.
library(data.table)
# Load the SCD example data supplied with the VBphenoR package
data(scd_cohort)
# We will use the SCD biomarkers to discover the SCD latent class.
# X1 is the data matrix for the VB GMM.
X1 <- scd_cohort[,.(CBC,RC)]
# We need to supply DBSCAN hyper-parameters as we will initialise VBphenoR
# with DBSCAN. See help(DBSCAN) for details of these parameters.
initParams <- c(0.15, 5)
names(initParams) <- c('eps','minPts')
# Set an informative prior for the VB GMM mixing coefficient alpha
# hyper-parameter
prior_gmm <- list(
alpha = 0.001
)
# Set informative priors for the beta coefficients of the VB logit
prior_logit <- list(mu=c(1,
mean(scd_cohort$age),
mean(scd_cohort$highrisk),
mean(scd_cohort$CBC),
mean(scd_cohort$RC)),
Sigma=diag(1,5)) # Simplest isotropic case
# X2 is the design matrix for the VB logit
X2 <- scd_cohort[,.(age,highrisk,CBC,RC)]
X2[,age:=as.numeric(age)]
#> age highrisk CBC RC
#> <num> <lgcl> <num> <num>
#> 1: 3 TRUE 9.25100 2.5290000
#> 2: 84 TRUE 8.04900 2.5210000
#> 3: 7 TRUE 8.93400 2.3990000
#> 4: 33 TRUE 9.44500 2.3580000
#> 5: 43 FALSE 8.79200 2.4070000
#> ---
#> 63250: 18 TRUE 13.92457 0.5225167
#> 63251: 56 TRUE 13.38314 0.7163306
#> 63252: 87 TRUE 12.81786 0.6605872
#> 63253: 63 TRUE 12.41853 0.8727518
#> 63254: 11 TRUE 12.79134 1.0333188
X2[,highrisk:=as.numeric(highrisk)]
#> age highrisk CBC RC
#> <num> <num> <num> <num>
#> 1: 3 1 9.25100 2.5290000
#> 2: 84 1 8.04900 2.5210000
#> 3: 7 1 8.93400 2.3990000
#> 4: 33 1 9.44500 2.3580000
#> 5: 43 0 8.79200 2.4070000
#> ---
#> 63250: 18 1 13.92457 0.5225167
#> 63251: 56 1 13.38314 0.7163306
#> 63252: 87 1 12.81786 0.6605872
#> 63253: 63 1 12.41853 0.8727518
#> 63254: 11 1 12.79134 1.0333188
X2[,Intercept:=1]
#> age highrisk CBC RC Intercept
#> <num> <num> <num> <num> <num>
#> 1: 3 1 9.25100 2.5290000 1
#> 2: 84 1 8.04900 2.5210000 1
#> 3: 7 1 8.93400 2.3990000 1
#> 4: 33 1 9.44500 2.3580000 1
#> 5: 43 0 8.79200 2.4070000 1
#> ---
#> 63250: 18 1 13.92457 0.5225167 1
#> 63251: 56 1 13.38314 0.7163306 1
#> 63252: 87 1 12.81786 0.6605872 1
#> 63253: 63 1 12.41853 0.8727518 1
#> 63254: 11 1 12.79134 1.0333188 1
setcolorder(X2, c("Intercept","age","highrisk","CBC","RC"))
# Run the patient phenotyping model
# Need to state what columns are the biomarkers
biomarkers <- c('CBC', 'RC')
set.seed(123)
pheno_result <- run_Model(biomarkers,
gmm_X=X1, gmm_init="dbscan",
gmm_initParams=initParams,
gmm_maxiters=20, gmm_prior=prior_gmm,
gmm_stopIfELBOReverse=TRUE,
logit_X=X2, logit_prior=prior_logit
)
# S3 print method
print(pheno_result)
#> VBphenoR phenotyping results.
#> -----------------------------
#>
#> EHR data with 63,254 rows
#>
#> Prevalence = 0.312% in these data
#> Biomarker shift for patients with the phenotype of interest:
#> biomarker shift
#> CBC 6.25
#> RC 4.18
# }