# Ensemble Meta-Prediction Framework to Integrate Multiple Regression Models

## Overview

This `R` package is an ensemble meta-prediction framework to integrate multiple regression models into a current study.

## Installation

If the devtools package is not yet installed, install it first:

``install.packages('devtools')``
``````# install the package from Github:
devtools::install_github('umich-biostatistics/MetaIntegration') ``````

Once installed, load the package:

``library(MetaIntegration)``

## Example Usage

For this example, we simulate data and test each of the functions.

## Simulate data

``````############## Generating 1 internal dataset of size n ###########################################
# Full model: Y|X1, X2, X3, X4, B
# (X1, X2, X3, X4, B) follows normal distribution with mean zero, variance one and correlation 0.3
# Y|X1, X2, X3, X4, B follows Bernoulli[expit(-1-0.5X+0.5B)], where expit(x)=exp(x)/[1+exp(x)]
##################################################################################################
set.seed(2333)
n = 200
data.n = data.frame(matrix(ncol = 5, nrow = n))
colnames(data.n) = c('Y', 'X1', 'X2', 'X3', 'B')
data.n[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(n, rep(0,4), diag(0.7,4)+0.3)
############# Function 1 #########################################
data.n\$Y = rbinom(n, 1, expit(-1 - 0.5*(data.n\$X1 + data.n\$X2 + data.n\$X3) + 0.5*data.n\$B))

############# Generate k=3 external models #########################################
# Full model: Y|X1, X2, X3, B
# Reduced external model 1: Y|X1, X2 with sample size m1
# Reduced external model 2: Y|X1, X3 with sample size m2
# Reduced external model 3: Y|X1, X2, X3 with sample size m3
####################################################################################
# generate data from the full model first, then fit the reduced logistic regression
# on the data to obtain the beta estiamtes and the corresponsing estimated variance
####################################################################################
m1 = 500
m2 = 2000
m3 = 30000
data.m1 = data.frame(matrix(ncol = 5, nrow = m1))
data.m2 = data.frame(matrix(ncol = 5, nrow = m2))
data.m3 = data.frame(matrix(ncol = 5, nrow = m3))
names(data.m1) = names(data.m2) = names(data.m3) = c('Y', 'X1', 'X2', 'X3', 'B')

data.m1[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m1, rep(0,4), diag(0.7,4)+0.3)
data.m2[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m2, rep(0,4), diag(0.7,4)+0.3)
data.m3[,c('X1', 'X2', 'X3', 'B')] = MASS::mvrnorm(m3, rep(0,4), diag(0.7,4)+0.3)
data.m1\$Y = rbinom(m1, 1, expit(-1 - 0.5*(data.m1\$X1 + data.m1\$X2 + data.m1\$X3) + 0.5*data.m1\$B))
data.m2\$Y = rbinom(m2, 1, expit(-1 - 0.5*(data.m2\$X1 + data.m2\$X2 + data.m2\$X3) + 0.5*data.m2\$B))
data.m3\$Y = rbinom(m3, 1, expit(-1 - 0.5*(data.m3\$X1 + data.m3\$X2 + data.m3\$X3) + 0.5*data.m3\$B))

#fit Y|X using logistic regression to obtain the external beta estimates
fit.E1 = glm(Y ~ X1 + X2,      data = data.m1, family = binomial(link='logit'))
fit.E2 = glm(Y ~ X1 + X3,      data = data.m2, family = binomial(link='logit'))
fit.E3 = glm(Y ~ X1 + X2 + X3, data = data.m3, family = binomial(link='logit'))

#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 1
beta.E1 = coef(fit.E1)
names(beta.E1) = c('int', 'X1', 'X2')
V.E1 = vcov(fit.E1)

#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 2
beta.E2 = coef(fit.E2)
names(beta.E2) = c('int','X1','X3')
V.E2 = vcov(fit.E2)

#Save the beta estiamtes and the corresponsing estimated variance of the reduced external model 3
beta.E3 = coef(fit.E3)
names(beta.E3) = c('int','X1','X2','X3')
V.E3 = vcov(fit.E3)

#Save all the external model information into lists for later use
betaHatExt_list = list(Ext1 = beta.E1, Ext2 = beta.E2, Ext3 = beta.E3)
CovExt_list = list(Ext1 = V.E1, Ext2 = V.E2, Ext3 = V.E3)
rho = list(Ext1 = n/m1, Ext2 = n/m2, Ext3 = n/m3)``````

## fxnCC_LogReg()

Constraint maximum likelihood (CML) method for logistic regression (binary outcome Y).

``````fit.gamma.I = glm(Y ~ X1 + X2 + X3 + B, data = data.n, family = binomial(link='logit'))
gamma.I = coef(fit.gamma.I)
gamma.CML1 = fxnCC_LogReg(p=3,
q=5,
YInt=data.n\$Y,
XInt=cbind(data.n\$X1,data.n\$X2),
BInt=cbind(data.n\$X3,data.n\$B),
betaHatExt=beta.E1,
gammaHatInt=gamma.I,
n=n,
tol=1e-8,
maxIter=400,
factor=1)[["gammaHat"]]
gamma.CML2 = fxnCC_LogReg(p=3,
q=5,
YInt=data.n\$Y,
XInt=cbind(data.n\$X1,data.n\$X3),
BInt=cbind(data.n\$X2, data.n\$B),
betaHatExt=beta.E2,
gammaHatInt=c(gamma.I[1:2],gamma.I[4],gamma.I[3],gamma.I[5]),
n=n,
tol=1e-8,
maxIter=400,
factor=1)[["gammaHat"]]
gamma.CML2 = c(gamma.CML2[1:2], gamma.CML2[4], gamma.CML2[3], gamma.CML2[5])
gamma.CML3 = fxnCC_LogReg(p=4,
q=5,
YInt=data.n\$Y,
XInt=cbind(data.n\$X1,data.n\$X2,data.n\$X3),
BInt=cbind(data.n\$B),
betaHatExt=beta.E3,
gammaHatInt=gamma.I,
n=n,
tol=1e-8,
maxIter=400,
factor=1)[["gammaHat"]]``````

## asympVar_LogReg()

Asymptotic variance-covariance matrix for gamma_Int and gamma_CML for logistic regression (binary outcome Y).

``````asy.CML = asympVar_LogReg(k=3,
p=4,
q=5,
YInt=data.n\$Y,
XInt=data.n[,c('X1','X2','X3')], #covariates that appeared in at least one external model
BInt=data.n\$B,  #covariates that not used in any of the external models
gammaHatInt=gamma.I,
betaHatExt_list=betaHatExt_list,
CovExt_list=CovExt_list,
rho=rho,
ExUncertainty=TRUE)
asyV.I = asy.CML[['asyV.I']]                      #variance of gamma.I
asyV.CML1 = asy.CML[['asyV.CML']][[1]]            #variance of gamma.CML1
asyV.CML2 = asy.CML[['asyV.CML']][[2]]            #variance of gamma.CML2
asyCov.CML1.I = asy.CML[['asyCov.CML.I']][[1]]    #covariance of gamma.CML1 and gamma.I
asyCov.CML2.I = asy.CML[['asyCov.CML.I']][[2]]    #covariance of gamma.CML2 and gamma.I
asyCov.CML12 = asy.CML[['asyCov.CML']][['12']]    #covariance of gamma.CML1 and gamma.CML2``````

## get_gamma_EB()

Calculate the empirical Bayes (EB) estimates.

``````gamma.EB1 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML1, asyV.I=asyV.I)[['gamma.EB']]
gamma.EB2 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML2, asyV.I=asyV.I)[['gamma.EB']]
gamma.EB3 = get_gamma_EB(gamma_I=gamma.I, gamma_CML=gamma.CML3, asyV.I=asyV.I)[['gamma.EB']]``````

## get_var_EB()

Using simulation to obtain the asymptotic variance-covariance matrix of gamma_EB, package corpcor and MASS are required.

``````#Get the asymptotic variance of the EB estimates
V.EB = get_var_EB(k=3,
q=5,
gamma.CML=c(gamma.CML1, gamma.CML2, gamma.CML3),
gamma.I = gamma.I,
asy.CML=asy.CML,
seed=2333,
nsim=2000)
asyV.EB1 = V.EB[['asyV.EB']][[1]]             #variance of gamma.EB1
asyV.EB2 = V.EB[['asyV.EB']][[2]]             #variance of gamma.EB2
asyV.EB3 = V.EB[['asyV.EB']][[3]]             #variance of gamma.EB3
asyCov.EB1.I = V.EB[['asyCov.EB.I']][[1]]     #covariance of gamma.EB1 and gamma.I
asyCov.EB2.I = V.EB[['asyCov.EB.I']][[2]]     #covariance of gamma.EB2 and gamma.I
asyCov.EB3.I = V.EB[['asyCov.EB.I']][[3]]     #covariance of gamma.EB2 and gamma.I
asyCov.EB12 = V.EB[['asyCov.EB']][['12']]     #covariance of gamma.EB1 and gamma.EB2
asyCov.EB13 = V.EB[['asyCov.EB']][['13']]     #covariance of gamma.EB1 and gamma.EB3
asyCov.EB23 = V.EB[['asyCov.EB']][['23']]     #covariance of gamma.EB2 and gamma.EB3``````

# get_OCW()

Obtain the proposed Optimal covariate-Weighted (OCW) estimates, package Rsolnp is required.

``````get_OCW(k=3,
q=5,
data.XB=data.n[,c('X1','X2','X3','B')],
gamma.EB=c(gamma.EB1, gamma.EB2, gamma.EB3),
V.EB=V.EB)``````

# get_SCLearner()

Obtain the proposed Selective Coefficient-Learner (SC-Learner) estimates.

``````pred.matrix = matrix(c(1,1,1,0,0,
1,1,0,1,0,
1,1,1,1,0), 5, 3)
rownames(pred.matrix) = c('int','X1','X2','X3','B')
colnames(pred.matrix) = c('E1','E2','E3')

get_SCLearner(k=3,
q=5,
pred.matrix=pred.matrix,
gamma.EB=cbind(gamma.EB1, gamma.EB2, gamma.EB3),
V.EB)``````

### Current Suggested Citation

Gu, T., Taylor, J.M.G. and Mukherjee, B. (2020). An ensemble meta-prediction framework to integrate multiple regression models into a current study. Manuscript in preparation.