South African Red List
Purpose
Although a small number of threatened species have large ranges, Range appears to be a strong predictor of Group status: the smaller a species' range, the more likely it is to be classed as threatened or extinct.
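A quick way to inspect this relationship is to plot the distribution of Range within each Group. The snippet below is a minimal sketch rather than part of the modelling pipeline; it assumes the train data frame with the Range and Group columns used in the code that follows.

library(ggplot2)

# Boxplot of Range by Red List group; the threatened and extinct groups
# should show markedly smaller ranges if Range is a useful predictor.
ggplot(train, aes(x = Group, y = Range)) +
  geom_boxplot() +
  labs(x = "Group", y = "Range")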
library(dplyr)
library(forcats)

# Bin Range into 20 quantile groups so it can be treated as categorical,
# then convert every predictor to a factor.
corrDF <- train %>% mutate(Range = ntile(Range, n = 20))
corrDF <- corrDF %>%
  select(!GenSpec) %>%
  mutate(across(c("Group", "LF", "GF", "Range", "Biomes",
                  "Habitat_degradation", "Habitat_loss", "IAS", "Other",
                  "Over_exploitation", "Pollution", "Unknown"),
                as_factor))

# Cramer's V: pairwise association between categorical variables (lower triangle).
ggcorrplot::ggcorrplot(DescTools::PairApply(corrDF, DescTools::CramerV), type = 'lower')
  Raw       Factor
1 Parasitic 1
2 Tree      2
3 Suffrutex 3
# Bin Range into 20 quantile groups and convert the predictors to factors (as above).
corrDF <- train %>%
  mutate(Range = ntile(Range, n = 20))
corrDF <- corrDF %>%
  mutate(across(c("Group", "LF", "GF", "Range", "Biomes",
                  "Habitat_degradation", "Habitat_loss", "IAS", "Other",
                  "Over_exploitation", "Pollution", "Unknown"),
                as_factor))

# Compare the first three raw GF values with their factor-encoded equivalents.
printFactors <- matrix(c(train$GF[1], train$GF[2], train$GF[3],
                         corrDF$GF[1], corrDF$GF[2], corrDF$GF[3]),
                       nrow = 3)
colnames(printFactors) <- c('Raw', 'Factor')
rownames(printFactors) <- c('1', '2', '3')
print(printFactors, quote = FALSE)
GenSpec | Group | LF | GF |
---|---|---|---|
1020_4001 | NotThr | Perennial | Parasitic |
1023_1 | Thr | Perennial | Tree |
1028_9 | NotThr | Perennial | Suffrutex |
1029_1 | NotThr | Annual | Herb |
1031_125 | Thr | Perennial | Lithophyte |
GenSpec | LF | GF | Fam |
---|---|---|---|
1020.400 | 0 | 1 | 1 |
1023.100 | 0 | 2 | 1 |
1028.900 | 0 | 3 | 2 |
1029.100 | 1 | 4 | 2 |
1031.125 | 0 | 5 | 2 |
Group Counts Pre-Balancing: 490 148 23
Group Counts Post-Balancing: 490 490 490
library(ROSE)

# Oversample the two minority classes pairwise: class 2 against class 1,
# then class 3 against class 1, each pair resampled up to N = 980 so that
# every class ends up with 490 observations.
AB <- data_train
AB <- AB[AB$label != '3',]          # classes 1 and 2 only
AB_res <- ovun.sample(label ~ ., data = AB,
                      method = "over", N = 980,
                      seed = 1)$data
AC <- data_train
AC <- AC[AC$label != '2',]          # classes 1 and 3 only
AC_res <- ovun.sample(label ~ ., data = AC,
                      method = "over", N = 980,
                      seed = 1)$data

# Keep the original class 1 rows once and combine them with the
# oversampled class 2 and class 3 rows.
AB_2 <- AB_res[AB_res$label == '2',]
AC_3 <- AC_res[AC_res$label == '3',]
data_train_1 <- AB_res[AB_res$label == '1',]
data_train_combined <- rbind(data_train_1, AB_2, AC_3)

cat("Group Counts Pre-Balancing: ",
    table(data_train$label),
    "\nGroup Counts Post-Balancing: ",
    table(data_train_combined$label))
\(X_{new}=\frac{X_{old}-\min(X_{old})}{\max(X_{old})-\min(X_{old})}\)
library(randomForest)
library(caret)

# Min-max scaling: each feature column is rescaled to [0, 1] using its own
# minimum and maximum (computed separately on the train and test sets).
trainMM <- as.data.frame(lapply(features_train,
                                function(x) {(x - min(x)) / (max(x) - min(x))}))
testMM <- as.data.frame(lapply(features_test,
                               function(x) {(x - min(x)) / (max(x) - min(x))}))
train <- trainMM
train$label <- label
MMCounts <- table(train$label)

# Random forest on the min-max scaled features; the last column (label) is
# dropped from the predictor matrix.
MM <- randomForest(x = train[-ncol(data_train_combined)],
                   y = as.factor(train$label), ntree = 4)
importanceMM <- importance(MM)
PredMM <- predict(MM, testMM)

# Overall accuracy plus macro-averaged recall, precision and F1 across the
# three classes.
accuracy <- sum(label_test == PredMM) / length(label_test)
testfactorMM <- as.factor(label_test)
PredFactorMM <- as.factor(PredMM)
cm <- confusionMatrix(PredFactorMM, testfactorMM)
rownames(cm$byClass) <- c("NotThr", "Thr", "Ext")
recall <- mean(c(cm$byClass["NotThr", "Sensitivity"],
                 cm$byClass["Thr", "Sensitivity"],
                 cm$byClass["Ext", "Sensitivity"]))
precision <- mean(c(cm$byClass["NotThr", "Pos Pred Value"],
                    cm$byClass["Thr", "Pos Pred Value"],
                    cm$byClass["Ext", "Pos Pred Value"]))
F1 <- 2 * recall * precision / (recall + precision)
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}-\bar{X}_{old}}{\sigma_{X_{old}}}\)
# Z-score standardization: centre each feature on its mean and divide by its
# standard deviation (again computed separately for the train and test sets).
TrainZS <- as.data.frame(lapply(features_train,
                                function(x) {(x - mean(x)) / sd(x)}))
TestZS <- as.data.frame(lapply(features_test,
                               function(x) {(x - mean(x)) / sd(x)}))
train <- TrainZS
train$label <- label
ZSCounts <- table(train$label)
ZS <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 4)
ImportanceZS = importance(ZS)
PredZS <- predict(ZS, TestZS)
accuracy <- sum(label_test == PredZS) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorZS <- as.factor(PredZS)
cm <- confusionMatrix(PredFactorZS, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\max(|X_{old}|)}\)
# Maximum-absolute-value scaling: divide each feature by its largest absolute value.
TrainMAV <- as.data.frame(lapply(features_train,
                                 function(x) {x / max(abs(x))}))
TestMAV <- as.data.frame(lapply(features_test,
                                function(x) {x / max(abs(x))}))
train <- TrainMAV
train$label <- label
MAVCounts <- table(train$label)
MAV <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label),
ntree = 4)
ImportanceMAV = importance(MAV)
PredMAV <- predict(MAV, TestMAV)
accuracy <- sum(label_test == PredMAV) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorMAV <- as.factor(PredMAV)
cm <- confusionMatrix(PredFactorMAV, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\sum(|X_{old}|)}\)
# L1 normalization: divide each feature by the sum of its absolute values.
trainL1 <- as.data.frame(lapply(features_train,
                                function(x) {x / sum(abs(x))}))
TestL1 <- as.data.frame(lapply(features_test,
                               function(x) {x / sum(abs(x))}))
train <- trainL1
train$label <- label
L1Counts <- table(train$label)
L1 <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 4)
L1Importance = importance(L1)
PredL1 <- predict(L1, TestL1)
accuracy <- sum(label_test == PredL1) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorL1 <- as.factor(PredL1)
cm <- confusionMatrix(PredFactorL1, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),
ncol=1,byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\sqrt{\sum{X_{old}^2}}}\)
# L2 normalization: divide each feature by its Euclidean (L2) norm.
TrainL2 <- as.data.frame(lapply(features_train,
                                function(x) {x / sqrt(sum(x^2))}))
TestL2 <- as.data.frame(lapply(features_test,
                               function(x) {x / sqrt(sum(x^2))}))
train <- TrainL2
train$label <- label
L2Counts <- table(train$label)
L2 <- randomForest(x = train[-ncol(data_train_combined)],
y = as.factor(train$label), ntree = 4)
L2Importance = importance(L2)
PredL2 <- predict(L2, TestL2)
accuracy <- sum(label_test == PredL2) / length(label_test)
label_test_factor <- as.factor(label_test)
PredFactorL2 <- as.factor(PredL2)
cm <- confusionMatrix(PredFactorL2, label_test_factor)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],
cm$byClass["Class: 2", "Sensitivity"],
cm$byClass["Class: 3", "Sensitivity"]))
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"],
cm$byClass["Class: 2", "Pos Pred Value"],
cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision )
printTable=matrix(c(round(accuracy,2),round(recall,2),
round(precision,2),round(F1,2)),ncol=1,
byrow=TRUE)
colnames(printTable)=c('Score')
rownames(printTable)=c('Accuracy','Recall','Precision','F1')
print(printTable)
\(\text{Accuracy}=\frac{\sum{\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\)
\(\text{Recall}=\frac{\text{True Positives}}{\text{True Positives}+\text{False Negatives}}\)
\(\text{Precision}=\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\)
\(\text{F1}=\frac{2(\text{Precision}\times\text{Recall})}{\text{Precision}+\text{Recall}}\)
Score
Accuracy 0.92
Recall 0.82
Precision 0.88
F1 0.85
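As a check on these definitions, the reported F1 follows directly from the macro-averaged precision and recall in the table above: \(\text{F1}=\frac{2(0.88\times 0.82)}{0.88+0.82}\approx 0.85\).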
# Majority vote across the five normalization-specific forests: for each test
# observation, take the most frequent predicted class (ties are broken by
# which.max in favour of the first, i.e. lowest-numbered, class).
n <- length(PredMM)
final_pred <- rep(NA, n)
for (i in 1:n) {
  preds <- c(PredMM[i], PredZS[i], PredMAV[i],
             PredL1[i], PredL2[i])
  final_pred[i] <- as.numeric(names(which.max(table(preds))))
}
# Evaluate the voting ensemble: accuracy, macro-averaged recall, precision
# and F1, with the classes relabelled NotThr / Thr / Ext.
accuracy <- sum(label_test == final_pred) / length(label_test)
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)
rownames(cm_vote$byClass) <- c("NotThr", "Thr", "Ext")

sensitivity_class1 <- cm_vote$byClass["NotThr", "Sensitivity"]
sensitivity_class2 <- cm_vote$byClass["Thr", "Sensitivity"]
sensitivity_class3 <- cm_vote$byClass["Ext", "Sensitivity"]
recall <- (sensitivity_class1 + sensitivity_class2 + sensitivity_class3) / 3

precision_class1 <- cm_vote$byClass["NotThr", "Pos Pred Value"]
precision_class2 <- cm_vote$byClass["Thr", "Pos Pred Value"]
precision_class3 <- cm_vote$byClass["Ext", "Pos Pred Value"]
precision <- (precision_class1 + precision_class2 + precision_class3) / 3

F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2),
                       round(recall, 2),
                       round(precision, 2),
                       round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
Score
Accuracy 0.92
Accuracy p-value <.001
95% CI (0.88,0.94)
Kappa 0.77
NotThr Thr Ext
Sensitivity 0.99 0.68 0.80
Specificity 0.75 0.98 0.99
Pos Pred Value 0.92 0.91 0.80
Neg Pred Value 0.96 0.92 0.99
Precision 0.92 0.91 0.80
Recall 0.99 0.68 0.80
F1 0.95 0.78 0.80
Prevalence 0.74 0.22 0.04
Detection Rate 0.73 0.15 0.03
Detection Prevalence 0.80 0.17 0.04
Balanced Accuracy 0.87 0.83 0.90
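The macro-averaged scores reported earlier can likewise be recovered from these per-class values: recall \(=(0.99+0.68+0.80)/3\approx 0.82\) and precision \(=(0.92+0.91+0.80)/3\approx 0.88\).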
# The plot below reuses the voting-ensemble confusion matrix.
cm <- confusionMatrix(final_pred_factor, label_test_factor)
library(ggplot2)
library(gridExtra)
library(grid)

cm_d <- as.data.frame(cm$table)
cm_st <- data.frame(cm$overall)
cm_st$cm.overall <- round(cm_st$cm.overall, 2)

# Flag diagonal (correct) cells, set zeros and FALSE flags to NA, and reverse
# the Reference axis; off-diagonal counts get a negative fill value so they
# shade differently from the diagonal.
cm_d$diag <- cm_d$Prediction == cm_d$Reference
cm_d$ndiag <- cm_d$Prediction != cm_d$Reference
cm_d[cm_d == 0] <- NA
cm_d$Reference <- factor(cm_d$Reference, levels = rev(levels(cm_d$Reference)))
cm_d$ref_freq <- cm_d$Freq * ifelse(is.na(cm_d$diag), -1, 1)

plt1 <- ggplot(data = cm_d, aes(x = Prediction, y = Reference, fill = Freq)) +
  scale_x_discrete(position = "top") +
  geom_tile(data = cm_d, aes(fill = ref_freq)) +
  scale_fill_gradient2(guide = "none", low = "red", high = "mediumvioletred",
                       mid = "mistyrose",
                       midpoint = 0, na.value = 'white') +
  geom_text(aes(label = Freq), color = 'black', size = 3) +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position = "none",
        panel.border = element_blank(),
        plot.background = element_blank(),
        axis.line = element_blank())

# Display the heatmap alongside a table of the overall statistics.
plt2 <- tableGrob(cm_st)
grid.arrange(plt1, plt2, nrow = 1, ncol = 2,
             top = textGrob("Confusion Matrix", gp = gpar(fontsize = 25, font = 1)))
# Overall ensemble statistics: accuracy, its p-value (accuracy versus the
# no-information rate), 95% confidence interval and Cohen's kappa.
printTable <- matrix(c(round(cm$overall['Accuracy'], 2),
                       ifelse(cm$overall['AccuracyPValue'] < 0.001, "<.001",
                              round(cm$overall['AccuracyPValue'], 3)),
                       paste("(", round(cm$overall['AccuracyLower'], 2),
                             ",", round(cm$overall['AccuracyUpper'], 2),
                             ")", sep = ""),
                       round(cm$overall['Kappa'], 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Accuracy p-value', '95% CI', 'Kappa')
print(printTable, quote = FALSE)
# Per-class statistics, transposed so that classes are columns and statistics are rows.
cmBC <- cm$byClass
rownames(cmBC) <- c("NotThr", "Thr", "Ext")
print(t(round(cmBC, 2)), quote = FALSE)