# The data

We simulate a small data set with a covariance structure to explore the actions of RFlocalfdr. This package implements an empirical Bayes method of determining a significance level for the Random Forest MDI (Mean Decrease in Impurity) variable importance measure. See Dunne et al. (2022) for details

We

• simulate the data set

• fit a Random Forest model

• use RFlocalfdr to estimate the significant variables

• estimate the significant variables using

• Boruta (Kursa and Rudnicki (2010))
• Recursive Fearure Elimination
• AIR (Nembrini et al. (2018))
• PIMP (Altmann et al. (2010))

as comparators.

## setup

We plot a small data set of $$300\times 508$$ to explore the data generation method. This dataset consist of bands, with blocks of $$\{1, 2, 4, 8, 16, 32, 64\}$$ of identical variables (see figure @ref(fig:simulation2). The variables are $$\in \{0,1,2\}$$, a common encoding for genomic data where the numbers represent the number of copies of the minor allele. Only band 1 is used to calculate the $$y$$ vector and $$y$$ is 1 if any of $$X[, c(1, 2, 4, 8, 16, 32, 64)]$$ is non-zero, so $$y$$ is unbalanced, containing more 1’s than 0’s.

library(ranger)
library(RFlocalfdr)
library(caret)
library(pROC)
set.seed(13)
num_samples <- 300
num_bands <- 4
band_rank <- 6
num_vars <- num_bands * (2 ** (band_rank+1) -1)
print(num_vars)

X <- matrix(NA, num_samples , num_vars)
set.seed(123)

temp<-matrix(0,508,3)
var_index <- 1
for(band in 1:num_bands) {
for (rank in 0:band_rank) {
for (i in 1:2**rank) {
temp[var_index,]<-c(band , rank, var_index)
var_index <- var_index +1
}
}
}

#png("./supp_figures/small_simulated_data_set.png")
plot(temp[,1],ylim=c(0,10),type="p")
lines(temp[,2],type="b",col="red")
legend(0,10,c("band","rank"),pch=1,col=c(1,2))
#dev.off()
abline(v=c( 1,  2 , 4 , 8 ,16 ,32, 64 ))

table(temp[temp[,1]==1,2])
## # 0  1  2  3  4  5  6
## # 1  2  4  8 16 32 64 

## data

We now generate the daa for the simulation. We have 50 bands and 300 observations so $$X$$ is $$300 \times 6350$$ with 127 non-null variables. We fit a standard RF to this dataset and record the resulting MDI importance score.

set.seed(13)
num_samples <- 300
num_bands <- 50
band_rank <- 6
num_vars <- num_bands * (2 ** (band_rank+1) -1)
print(num_vars)

X <- matrix(NA, num_samples , num_vars)
set.seed(123)

var_index <- 1
for(band in 1:num_bands) {
for (rank in 0:band_rank) {
#        cat("rank=",rank,"\n")
var <- sample(0:2, num_samples, replace=TRUE)
for (i in 1:2**rank) {
X[,var_index] <- var
var_index <- var_index +1
}
#        print(X[1:2,1:140])
#        system("sleep 3")
}
}

y <- as.numeric(X[,1] > 1 |  X[,2] > 1  |  X[,4] > 1 |  X[,8] > 1 |  X[,16] > 1 |
X[,32] > 1 | X[,64] > 1 )

# Random forest

data <- cbind(y,X)
colnames(data) <- c("y",make.names(1:num_vars))
rfModel <- ranger(data=data,dependent.variable.name="y", importance="impurity",
classification=TRUE,  mtry=100,num.trees = 10000, replace=TRUE, seed  = 17 )
imp <-log(ranger::importance(rfModel))
t2 <-count_variables(rfModel)
plot(density(imp))

roccurve <- roc(c(rep(1,127),rep(0,6350-127)),imp)
plot(roccurve)
auc(roccurve) # 0.9912

## Determine cutoff

cutoffs <- c(0,1,4,10)
#png("./supp_figures/simulated_data_determine_cutoff.png")
res.con<- determine_cutoff(imp,t2,cutoff=cutoffs,plot=c(0,1,4,10))
#dev.off()

We plot the kernel density estimate of the histogram of the data $$y$$, and the skew normal fit, $$t_1$$, using the data up to the quantile $$Q$$, shown in red.

plot(cutoffs,res.con[,3])
cutoffs[which.min(res.con[,3])]

by plotting $$\max(|y - t_1|)$$ versus the cutoff values, we determine the appropriate cutoff. In this case it is just $$t2>0$$

## Rflocalfdr

temp<-imp[t2 > 0]

qq <- plotQ(temp,debug.flag = 1)
ppp<-run.it.importances(qq,temp,debug=0)
aa<-significant.genes(ppp,temp,cutoff=0.05,do.plot=2)
length(aa$probabilities) # 95 tt1 <-as.numeric(gsub("X([0-9]*)","\\1",names(aa$probabilities)))
tt2 <- table(ifelse((tt1 < 127),1,2))
# 1  2
# 59 36
# The false discovery rate is 0.3789474
tt2[2]/(tt2[1]+tt2[2])
#59 36   36/(36+59) = 0.3789474

predicted_values<-rep(0, 6350);predicted_values[tt1]<-1
conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127)))
conf_matrix
sensitivity(conf_matrix) #0.994215 TP/(TP+FN)
specificity(conf_matrix) #0.4645669 TN/(FP+TN)

roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values)
plot(roccurve)
auc(roccurve) #0.7294

accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy # 0.983622

The FDR is 0.379. We can also calculate the sensitivity, sensitivity and accruacy.

In order to make the comparisons with other methods, the following packages amy need to be installed.

install.packages("Boruta")
install.packages("locfdr")
install.packages("vita")
install.packages("locfdr")
devtools::install_github("silkeszy/Pomona")

# Boruta

We try the Boruta algorithm (Kursa and Rudnicki (2010)).

library(Boruta)
set.seed(120)
boruta1 <- Boruta(X,as.factor(y),  num.threads = 7,getImp=getImpRfGini)
print(boruta1)
#Boruta performed 99 iterations in 19.54727 secs.
#4 attributes confirmed important: X4859, X58, X6132, X7;
# 6346 attributes confirmed unimportant: X1, X10, X100, X1000, X1001 and 6341 more;
plotImpHistory(boruta1)
aa <- which(boruta1$finalDecision=="Confirmed") bb <- which(boruta1$finalDecision=="Tentative")
predicted_values <-rep(0,6350);predicted_values[c(aa,bb)]<-1
conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127)))
conf_matrix

sensitivity(conf_matrix) #0.9996786 TP/(TP+FN)
specificity(conf_matrix) #0.01574803 TN/(FP+TN)

roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values)
plot(roccurve)
auc(roccurve) #0.5077

accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy #0.98
  D  !D
+ TP FP
- FN TN
• Sensitivity=[TP/(TP+FN)]×100
• Specificity=[TN/(FP+TN)]×100
• Positive predictive value(PPV)= [TP/(TP+FP)]×100
• Negative predictive value(NPV)=[TN/(FN+TN)]×100.

# Recursive Feature Elimination

This is provided by the package Pomona (Fouodo (2022))

library(Pomona)
colnames(X) <- make.names(1:dim(X)[2])
set.seed(111)
res <- var.sel.rfe(X, y, prop.rm = 0.2,  recalculate = TRUE, tol = 10,
ntree = 500, mtry.prop = 0.2, nodesize.prop = 0.1, no.threads = 7,
method = "ranger", type = "classification", importance = "impurity",
case.weights = NULL)
res$var #[1] "X1" "X106" "X11" "X12" "X127" "X13" "X15" "X16" "X2" #[10] "X23" "X24" "X3" "X4" "X44" "X46" "X4885" "X5" "X54" #[19] "X5474" "X6" "X7" "X72" "X9" "X91" tt <-as.numeric(gsub("X([0-9]*)","\\1", res$var))
table(ifelse((tt < 127),1,2))
# 1  2
#21  3 0.0833

res<-c(1,106, 11, 12, 127, 13, 15, 16,  2, 23, 24,  3,  4, 44, 46,  4885,  5, 54, 5474,  6,  7, 72,  9, 91)
predicted_values <-rep(0,6350);predicted_values[c(res)]<-1
conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127)))
conf_matrix
sensitivity(conf_matrix) # 0.9996786 TP/(TP+FN)
specificity(conf_matrix) #0 0.1732283 TN/(FP+TN)
roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values)
plot(roccurve)
auc(roccurve) #0.5865
accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy #  0.9831496

# AIR

See Nembrini et al. (2018). AIR is provided in the package ranger, using the option impurity_corrected.

rfModel2 <- ranger(data=data,dependent.variable.name="y", importance="impurity_corrected",
classification=TRUE,  mtry=100,num.trees = 10000, replace=TRUE, seed  = 17 )
ww<- importance_pvalues( rfModel2, method = "janitza")

p <- ww[,"pvalue"]
cc <- which(p< 0.05)
predicted_values <-rep(0,6350);predicted_values[cc]<-1
conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127)))
conf_matrix
#predicted_values    0    1
#               0 5950    0
#               1  273  127
#FDR is 273/(127+273) = 0.6825

sensitivity(conf_matrix) #0.9561305 TP/(TP+FN)
specificity(conf_matrix) #1          TN/(FP+TN)
roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values)
plot(roccurve)
auc(roccurve) # 0.9781
accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy #  0.9570079

As the null values, as modelled bt the AIR procedure, are symmetric around 0, the question arises as to whether we can apply the locfdr (Efron, Turnbull, and Narasimhan (2015)) procedure. In this case, it reduces the FDR from 0.682 to 0.601.

plot(density(ww[,"importance"]))
imp <- ww[,"importance"]
#imp <-imp/sqrt(var(imp))
#plot(density(imp))
library(locfdr)

aa<-locfdr(imp,df=13)
which( (aa$fdr< 0.05) & (imp>0)) cc2 <- which( (aa$fdr< 0.05) & (imp>0))
length(cc2) # [1] 309
length(intersect(cc,cc2)) #[1] 309

(length(cc2)  - length(which(cc2 <= 127)))/length(cc2) #[1] 0.6019417  fdr
predicted_values <-rep(0,6350);predicted_values[cc2]<-1
conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127)))
conf_matrix
sensitivity(conf_matrix) # 0.9701109 TP/(TP+FN)
specificity(conf_matrix) #0.9685039         TN/(FP+TN)
roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values)
plot(roccurve)
auc(roccurve) # 0.9693
accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy #   0.9700787

# PIMP

note that PIMP

• permutes the response vector $$y$$ $$S$$ times to give $$y^{*s}$$

• For each permutation, a new RF is grown and the permutation variable importance measure (VarImp$$^{*s}$$) for all predictor variables X is computed.

• The vector ‘perVarImp’ of $$S$$ VarImp measures for every predictor variable is used to approximate the null importance distributions for each variable (see PimpTest)

• we are doing $$p$$ tests, hence a multiple testing correction is in order

• we base our work on the PIMP implementation provided by Celik (2015) Variable Importance Testing Approaches

• pump as described by Altman is only applicable to permutation importance.

• however we see no impediment in extending the method to Gini importance

• we have done that in our code and also

• fixed a small error
• extended the code to use ranger as well as randomForest implementations of RF
library(vita) #vita: Variable Importance Testing Approaches
y<-factor(y)
X<-data.frame(X)
set.seed(117)
cl.ranger <- ranger(y~. , X,mtry = 3,num.trees = 1000, importance = 'impurity')
system.time(pimp.varImp.cl<-my_ranger_PIMP(X,y,cl.ranger,S=10, parallel=TRUE, ncores=3))
pimp.t.cl <- vita::PimpTest(pimp.varImp.cl,para = FALSE)
aa <- summary(pimp.t.cl,pless = 0.1)

tt<-as.numeric(gsub("X([0-9]*)","\\1",  names(which(aa$cmat2[,"p-value"]< 0.05)))) table(ifelse((tt < 127),1,2)) # 1 2 # 38 527 527 /(527+38) = 00.9327434 predicted_values <-rep(0,6350);predicted_values[which(aa$cmat2[,"p-value"]< 0.05)]<-1
conf_matrix<-table(predicted_values,c(rep(1,127),rep(0,6350-127)))
conf_matrix
sensitivity(conf_matrix) #0.9154749 TP/(TP+FN)
specificity(conf_matrix) #0.3070866          TN/(FP+TN)
roccurve <- roc(c(rep(1,127),rep(0,6350-127)),predicted_values)
plot(roccurve)
auc(roccurve) #  0.6113
accuracy<-(conf_matrix[1,1]+conf_matrix[2,2])/(sum(conf_matrix))
accuracy # 0.9033071

# Conclusion

Method     | true positives | false positives | Sensitivity        | AUC            |
RFlocalfdr | 59             | 36              | 0.99               | 0.73           |
Boruta     | 2              | 2               | 0.99               | 0.50           |
RFE        | 22             | 2               | 0.99               | 0.58           |
AIR        | 176            | 273             | 0.96               | 0.98           |
PIMP       | 39             | 556             | 0.91               | 0.61           |

Performance of variable selection for the simulated example by several methods. The best outcomes for each category are in bold face. RFE has performed better than Boruta but apart from that, ordering the methods wi ll depend on the cost of false positives versus the cost of false negatives.

# References

Method     | true positives | false positives | FDR  | Sensitivity|specificity | AUC   |
RFlocalfdr | 59             | 36              | 0.38  | 0.99       |  *         | 0.73  |
Boruta     | 2              | 2               | *    | 0.99       |  *         | 0.50  |
RFE        | 22             | 2               | 0.08  | 0.99       |  *         | 0.58  |
AIR        | 176            | 273             | 0.68  | 0.96       |  *         | 0.98  |
AIR+locfdr | 123             | 186             | 0.61  | 0.97       | 0.97        | 0.97  |
PIMP       | 39             | 556             | 0.93  | 0.91       |  *         | 0.61  |

Altmann, André, Laura Toloşi, Oliver Sander, and Thomas Lengauer. 2010. “Permutation Importance: A Corrected Feature Importance Measure.” Bioinformatics 26 (10): 1340. https://doi.org/10.1093/bioinformatics/btq134.
Celik, Ender. 2015. Vita: Variable Importance Testing Approaches. https://CRAN.R-project.org/package=vita.
Dunne, Robert, Roc Reguant, Priya Ramarao-Milne, Piotr Szul, Letitia Sng, Mischa Lundberg, Natalie A. Twine, and Denis C. Bauer. 2022. “Thresholding Gini Variable Importance with a Single Trained Random Forest: An Empirical Bayes Approach.” bioRxiv. https://doi.org/10.1101/2022.04.06.487300.
Efron, Bradley, Brit Turnbull, and Balasubramanian Narasimhan. 2015. Locfdr: Computes Local False Discovery Rates. https://CRAN.R-project.org/package=locfdr.
Fouodo, Cesaire. 2022. Pomona: Identification of Relevant Variables in Omics Data Sets Using Random Forests.
Kursa, Miron B., and Witold R. Rudnicki. 2010. “Feature Selection with the Boruta Package.” Journal of Statistical Software 36 (11): 1–13. https://www.jstatsoft.org/v36/i11/.
Nembrini, Stefano, Inke R. König, Marvin N. Wright, and Alfonso Valencia. 2018. “The Revival of the Gini Importance?” Bioinformatics. https://doi.org/10.1093/bioinformatics/bty373.