knitr::opts_chunk$set(eval = TRUE,
echo = TRUE,
warning = FALSE)
References:
- solitude: https://github.com/talegari/solitude
- isofor: https://campus.datacamp.com/courses/anomaly-detection-in-r/isolation-forest
# install.packages("mvoutlier"))
# install.packages("remotes"))
# remotes::install_github("Zelazny7/isofor")
#import libraries
library(ggplot2)
library(ggpubr)
library(solitude) # hereafter, "sol"
library(isofor) # hereafter, "iso"
library(viridis)
packageVersion("solitude")
#create sample data
data("humus", package = "mvoutlier")
# 2-dimensional
columns_required <- c("Bi", "Cd")
humus2 <- humus[ , columns_required]
# multi-dimensional
columns_required_mul <- setdiff(colnames(humus)
, c("Cond", "ID", "XCOO", "YCOO", "LOI")
)
humus_mul <- humus[ , columns_required_mul]
str(humus2)
str(humus_mul)
#plot data
ggplot(humus2, aes(x = Bi, y = Cd)) +
geom_point(shape = 1, alpha = 0.5) +
labs(x = "Bi", y = "Cd") +
labs(alpha = "", colour="Legend") +
theme_bw()
# solitude library
# sample (50%)
set.seed(1)
index <- sample(ceiling(nrow(humus2) * 0.5))
index_mul <- sample(ceiling(nrow(humus_mul) * 0.5))
# initiate an isolation forest
if_sol <- isolationForest$new(sample_size = length(index))
if_sol_mul <- isolationForest$new(sample_size = length(index))
# fit an isolation forest
if_sol$fit(humus2[index, ])
if_sol_mul$fit(humus_mul[index, ])
# isofor library
if_iso <- iForest(humus2,
phi = nrow(humus2) * 0.5 # sample size
)
# Obtain anomaly scores
humus2$score_sol <- if_sol$predict(humus2)$anomaly_score
humus_mul$score <- if_sol_mul$predict(humus_mul)$anomaly_score
humus2$score_iso <- predict(if_iso,
newdata = humus2[, c(1, 2)])
# decision: outlier if score > 0.6 (threshold could vary by users)
humus2$is_outlier_sol <- ifelse(humus2$score_sol > 0.6, 1, 0)
humus2$is_outlier_iso <- ifelse(humus2$score_iso > 0.55, 1, 0)
# create grid values for contour plot
Bi_seq <- seq(min(humus2$Bi),
max(humus2$Bi),
length.out = 100)
Cd_seq <- seq(min(humus2$Cd),
max(humus2$Cd),
length.out = 100)
humus2_grid <- expand.grid(Bi = Bi_seq,
Cd = Cd_seq)
humus2_grid$score_sol <- if_sol$predict(humus2_grid)$anomaly_score
humus2_grid$score_iso <- predict(if_iso,
newdata = humus2_grid[, c(1, 2)])
# scatter plot
sca_sol <-
ggplot(humus2, aes(x = Bi, y = Cd, z = score_sol)) +
geom_point(aes(color = is_outlier_sol)) +
labs(x = "Bi", y = "Cd") +
scale_color_viridis(discrete = FALSE,
direction = -1) +
theme_bw()
sca_iso <-
ggplot(humus2, aes(x = Bi, y = Cd, z = score_iso)) +
geom_point(aes(color = is_outlier_iso)) +
labs(x = "Bi", y = "Cd") +
scale_color_viridis(discrete = FALSE,
direction = -1) +
theme_bw()
# contour plot
gmin <- min(humus2_grid$score_sol, humus2_grid$score_iso)
gmax <- max(humus2_grid$score_sol, humus2_grid$score_iso)
gbreaks <- round(seq(gmin, gmax, length.out = 9), 2)
cont_sol <-
ggplot(humus2_grid, aes(x = Bi, y = Cd, z = score_sol)) +
geom_contour_filled(breaks = gbreaks) +
labs(x = "Bi", y = "Cd") +
theme_bw()
cont_iso <-
ggplot(humus2_grid, aes(x = Bi, y = Cd, z = score_iso)) +
geom_contour_filled(breaks = gbreaks) +
labs(x = "Bi", y = "Cd") +
theme_bw()
# aggregate ggplots
theme_set(
theme_bw() +
theme(legend.position = "top")
)
if_plot <-
ggarrange(sca_sol, sca_iso,
cont_sol, cont_iso,
labels = rep(c("solitude", "iso"), 2),
ncol = 2, nrow = 2)
if_plot
- Two scatter plots are similar even if they use different threshold values for the decision.
- Two contour plots look very different. This is because they have different range of scores. If we compare the regions with the same levels and colors, they show similar result.