Examples.Rmd
# Install the package from GitHub
devtools::install_github("yhhc2/machinelearnr")
# Load package
library("machinelearnr")
All functions with example code are run in this section. The functions are listed below in alphabetical order with example code to illustrate how each function should be used. The example code should be very similar to the example code in the function reference.
To see detailed descriptions for each function, please visit the package’s website.
# Two-column toy data set (22 points) used to choose the number of k-means
# clusters.
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
)

# dev.new()
plot(example.data$x, example.data$y)

# Results should say that 3 clusters is optimal
output <- CalcOptimalNumClustersForKMeans(example.data, c("x", "y"))

# The two elements of the returned list are displayed one after the other.
elbow.plot <- output[[1]]
ch.and.asw.plot <- output[[2]]

# dev.new()
elbow.plot

# dev.new()
ch.and.asw.plot
# Toy data for the 2D labelled-cluster plots: x and y vary, z is a constant
# column of 1s.
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45),
  z = rep(1, 22)
)
#dev.new()
plot(example.data$x, example.data$y)

# kmeans() starts from randomly chosen centres, so seed the RNG to keep the
# cluster numbering (and everything derived from it) the same on every run.
set.seed(1)
km.res <- stats::kmeans(example.data[, c("x", "y", "z")], 3, nstart = 25, iter.max = 10)
grouped <- km.res$cluster

# `scale.` is the real argument name; `scale` only worked via partial matching.
pca.results <- prcomp(example.data[, c("x", "y", "z")], scale. = FALSE)

# Known subgroup of every row: the first 7 rows are "A", the other 15 are "B".
actual.group.label <- rep(c("A", "B"), times = c(7, 15))

results <- generate.2D.clustering.with.labeled.subgroup(pca.results, grouped, actual.group.label)
#PC1 vs PC2
print(results[[1]])
#PC1 vs PC3
print(results[[2]])
#Chi-square results
print(results[[3]])
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 22, df = 2, p-value = 1.67e-05
#Table
print(results[[4]])
## cluster.labels.input
## subgroup.labels.input 1 2 3
## A 0 7 0
## B 9 0 6
# Toy data for the 3D labelled-cluster plot: x, y and a three-valued z column.
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45),
  z = rep(c(1, 2, 3), times = c(7, 6, 9))
)
#dev.new()
plot(example.data$x, example.data$y)

# kmeans() starts from randomly chosen centres, so seed the RNG to keep the
# cluster numbering (and the point colours below) the same on every run.
set.seed(1)
km.res <- stats::kmeans(example.data[, c("x", "y", "z")], 3, nstart = 25, iter.max = 10)
grouped <- km.res$cluster

# `scale.` is the real argument name; `scale` only worked via partial matching.
pca.results <- prcomp(example.data[, c("x", "y", "z")], scale. = FALSE)

# Known subgroup of every row: the first 7 rows are "A", the other 15 are "B".
actual.group.label <- rep(c("A", "B"), times = c(7, 15))

results <- generate.3D.clustering.with.labeled.subgroup(pca.results, grouped, actual.group.label)

# Elements 1-3 are used as axis labels, elements 4-6 as the plotted coordinates.
xlab.values <- results[[1]]
ylab.values <- results[[2]]
zlab.values <- results[[3]]
xdata.values <- results[[4]]
ydata.values <- results[[5]]
zdata.values <- results[[6]]
data.to.plot <- data.frame(xdata.values, ydata.values, zdata.values)

# #Interactive 3d plot, but cannot display in html or md
# rgl::rgl.bg(color = "white")
# rgl::plot3d(x= xdata.values, y= ydata.values, z= zdata.values,
# xlab = xlab.values, ylab = ylab.values, zlab = zlab.values, col=(grouped+1), pch=20, cex=2)
# rgl::text3d(x= xdata.values, y= ydata.values, z= zdata.values, text= actual.group.label, cex=1)
# rgl::rglwidget()

#Non-interactive 3d plot
s3d <- scatterplot3d::scatterplot3d(data.to.plot[, 1:3], pch = 16, color = (grouped + 1),
                                    xlab = xlab.values, ylab = ylab.values,
                                    zlab = zlab.values)

# Legend colours use the same "cluster id + 1" palette indices as the points,
# passed as numbers instead of factor-level strings.
legend(s3d$xyz.convert(-30, 20, 1), legend = levels(as.factor(grouped)),
       col = sort(unique(grouped)) + 1, pch = 16)

# Write the known subgroup label next to every point.
text(s3d$xyz.convert(data.to.plot[, 1:3]), labels = actual.group.label,
     cex = 0.7, col = "steelblue")
# Two-column toy data set (22 points) clustered into three k-means groups.
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
)

# Left commented out like every other example in this file: opening a new
# device here sends the scatter plot to a separate window/device.
#dev.new()
plot(example.data$x, example.data$y)

# kmeans() starts from randomly chosen centres, so seed the RNG to keep the
# cluster numbering the same on every run.
set.seed(1)
km.res <- stats::kmeans(example.data[, c("x", "y")], 3, nstart = 25, iter.max = 10)
grouped <- km.res$cluster

generate.plots.comparing.clusters(example.data, grouped, c("x", "y"))
# Two-column toy data set (22 points) clustered into three k-means groups.
example.data <- data.frame(
  x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
        40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
  y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
        27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
)
plot(example.data$x, example.data$y)

# kmeans() starts from randomly chosen centres, so seed the RNG to keep the
# cluster numbering the same on every run.
set.seed(1)
km.res <- stats::kmeans(example.data[, c("x", "y")], 3, nstart = 25, iter.max = 10)
grouped <- km.res$cluster

# Append the cluster assignment as a column named "grouped", which is the
# column name handed to GenerateParcoordForClusters() below.
example.data <- cbind(example.data, grouped)
print(GenerateParcoordForClusters(example.data, "grouped", c("x", "y")))
## $rect
## $rect$w
## [1] 0.241386
##
## $rect$h
## [1] 0.4538482
##
## $rect$left
## [1] 1.798614
##
## $rect$top
## [1] 1.04
##
##
## $text
## $text$x
## [1] 1.884276 1.884276 1.884276 1.884276 1.884276 1.884276
##
## $text$y
## [1] 0.9751645 0.9103291 0.8454936 0.7806582 0.7158227 0.6509873
# Sample labels (passed below as label.column.name), coordinates, and an
# alternating 0/1 factor (passed below as grouping.column.name).
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g", "2a", "2b", "2c", "2d",
        "3h", "3i", "3a", "3b", "3c", "3d", "3e", "3f", "3g", "2g", "2h")
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30)
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
color <- as.factor(rep(c(0, 1), times = 11))
example.data <- data.frame(id, x, y, color)

#dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = id, cex = 0.9, font = 2)

results <- HierarchicalClustering(
  working.data = example.data,
  clustering.columns = c("x", "y"),
  label.column.name = "id",
  grouping.column.name = "color",
  number.of.clusters.to.use = 3,
  distance_method = "euclidean",
  correlation_method = NULL,
  linkage_method_type = "ward.D",
  Use.correlation.for.hclust = FALSE,
  terminal.branch.font.size = 1,
  title.to.use = "Clustering based on x and y data"
)

# Unpack the six elements of the returned list by position.
hclust.res <- results[[1]]
dend <- results[[2]]
kcboot.res <- results[[3]]
title.to.use <- results[[4]]
labeled.samples <- results[[5]]
stability.table <- results[[6]]

#Plot dendrogram
plot(dend, main = title.to.use, cex.main = 0.75)

#Add legend to dendrogram
# One legend entry per level of the colour factor; colour index is level + 1.
legend.labels.to.use <- levels(example.data[["color"]])
col.to.use <- as.integer(legend.labels.to.use) + 1
pch.to.use <- rep(20, length(legend.labels.to.use))
graphics::legend(
  "topright",
  legend = legend.labels.to.use,
  col = col.to.use,
  pch = pch.to.use,
  bty = "n",
  pt.cex = 1.5,
  cex = 0.8,
  text.col = "black",
  horiz = FALSE,
  inset = c(0, 0.1),
  title = "Color"
)

#Display sample assignment
labeled.samples
## 1a 1b 1c 1d 1g 1e 1f 2a 2b 2c 2g 2d 2h 3g 3a 3c 3f 3d 3e 3b 3h 3i
## 1 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3

#Display stability of clusters
# The throwaway plot(1, 1) starts a fresh page before the table is drawn.
plot(1, 1)
gridExtra::grid.table(stability.table)

#Display kcboot full output
kcboot.res
## * Cluster stability assessment *
## Cluster method: hclust
## Full clustering results are given as parameter result
## of the clusterboot object, which also provides further statistics
## of the resampling results.
## Number of resampling runs: 100
##
## Number of clusters found in data: 3
##
## Clusterwise Jaccard bootstrap (omitting multiple points) mean:
## [1] 1 1 1
## dissolved:
## [1] 0 0 0
## recovered:
## [1] 100 100 100
# 22 samples in three classes; x and y vary, a and b are constant columns of 1s.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i")
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54)
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24)
a <- rep(1, 22)
b <- rep(1, 22)
# Class sizes: 7 x "1", 6 x "2", 9 x "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))
example.data <- data.frame(id, x, y, a, b, actual)

set.seed(2)
rf.result <- randomForest::randomForest(
  x = example.data[, c("x", "y", "a", "b")],
  y = example.data[["actual"]],
  proximity = TRUE
)

# Compare the forest's `predicted` component with the true labels.
predicted <- rf.result$predicted
actual <- example.data[["actual"]]
eval.classification.results(as.character(actual), as.character(predicted), "Example")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## [[1]]
## [1] "Example"
##
## [[2]]
## predicted
## actual 1 2 3
## 1 7 0 0
## 2 0 5 1
## 3 0 0 9
##
## [[3]]
## [[3]]$accuracy
## [1] 0.9545455
##
## [[3]]$macro_prf
## # A tibble: 3 x 3
## precision recall f1
## <dbl> <dbl> <dbl>
## 1 1 1 1
## 2 1 0.833 0.909
## 3 0.9 1 0.947
##
## [[3]]$macro_avg
## # A tibble: 1 x 3
## avg_precision avg_recall avg_f1
## <dbl> <dbl> <dbl>
## 1 0.967 0.944 0.952
##
## [[3]]$ova
## [[3]]$ova$`1`
## classified
## actual 1 others
## 1 7 0
## others 0 15
##
## [[3]]$ova$`2`
## classified
## actual 2 others
## 2 5 1
## others 0 16
##
## [[3]]$ova$`3`
## classified
## actual 3 others
## 3 9 0
## others 1 12
##
##
## [[3]]$ova_sum
## classified
## actual relevant others
## relevant 21 1
## others 1 43
##
## [[3]]$kappa
## [1] 0.9301587
##
##
## [[4]]
## [1] 0.9331967
# 22 samples in three classes; x and y vary, a and b are constant columns of 1s.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i")
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54)
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24)
a <- rep(1, 22)
b <- rep(1, 22)
# Class sizes: 7 x "1", 6 x "2", 9 x "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))
example.data <- data.frame(id, x, y, a, b, actual)

set.seed(1)
rf.result <- randomForest::randomForest(
  x = example.data[, c("x", "y", "a", "b")],
  y = example.data[, "actual"],
  proximity = TRUE,
  ntree = 50
)

# Out-of-bag error after each tree. Addressed by name (err.rate / "OOB")
# instead of by position ([[4]][, 1]) so it does not depend on element order.
error.oob <- rf.result$err.rate[, "OOB"]

best.tree <- find.best.number.of.trees(error.oob)
# Show the value returned by the function this example demonstrates.
print(best.tree)

# seq_along() stays correct even for an empty error vector, unlike 1:length().
trees <- seq_along(error.oob)
plot(trees, error.oob, type = "l")

#dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)
# 22 samples in three classes; x and y vary, a and b are constant columns of 1s.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i")
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54)
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24)
a <- rep(1, 22)
b <- rep(1, 22)
# Class sizes: 7 x "1", 6 x "2", 9 x "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))
example.data <- data.frame(id, x, y, a, b, actual)

# All four predictors are offered for PCA, but PCA itself is switched off.
result <- LOOCVPredictionsRandomForestAutomaticMtryAndNtree(
  example.data,
  predictors.that.PCA.can.be.done.on = c("x", "y", "a", "b"),
  predictors.that.should.not.PCA = NULL,
  should.PCA.be.used = FALSE,
  target.column.name = "actual",
  seed = 2,
  percentile.threshold.to.keep = 0.5
)
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
# The first element of the LOOCV result is used as the predicted labels and
# scored against the true `actual` column.
predicted <- result[[1]]
actual <- example.data[["actual"]]
eval.classification.results(
  as.character(actual),
  as.character(predicted),
  "Example"
)
## [[1]]
## [1] "Example"
##
## [[2]]
## predicted
## actual 1 2 3
## 1 7 0 0
## 2 0 5 1
## 3 0 0 9
##
## [[3]]
## [[3]]$accuracy
## [1] 0.9545455
##
## [[3]]$macro_prf
## # A tibble: 3 x 3
## precision recall f1
## <dbl> <dbl> <dbl>
## 1 1 1 1
## 2 1 0.833 0.909
## 3 0.9 1 0.947
##
## [[3]]$macro_avg
## # A tibble: 1 x 3
## avg_precision avg_recall avg_f1
## <dbl> <dbl> <dbl>
## 1 0.967 0.944 0.952
##
## [[3]]$ova
## [[3]]$ova$`1`
## classified
## actual 1 others
## 1 7 0
## others 0 15
##
## [[3]]$ova$`2`
## classified
## actual 2 others
## 2 5 1
## others 0 16
##
## [[3]]$ova$`3`
## classified
## actual 3 others
## 3 9 0
## others 1 12
##
##
## [[3]]$ova_sum
## classified
## actual relevant others
## relevant 21 1
## others 1 43
##
## [[3]]$kappa
## [1] 0.9301587
##
##
## [[4]]
## [1] 0.9331967
# Second element of the LOOCV result (printed as a table of variables).
print(result[[2]])
## variables.with.sig.contributions
## y x
## 100 100
# Make example data where samples with 1, 2, and 3 in their ID names can be
# predicted using features x and y, while samples with 4 and 5 in their ID
# names can be predicted using features a and b.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i",
        "4a", "4b", "4c", "4d", "4e", "4f", "4g",
        "5a", "5b", "5c", "5d", "5e", "5f", "5g")
# x/y vary for the first 22 rows and are constant (1) for the last 14;
# a/b are the other way round.
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54,
       rep(1, 14))
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24,
       rep(1, 14))
a <- c(rep(1, 22),
       18, 21, 22, 24, 26, 26, 27, 40, 41, 42, 44, 46, 47, 48)
b <- c(rep(1, 22),
       10, 11, 22, 15, 12, 13, 14, 27, 29, 20, 28, 21, 30, 31)
# Subsetting factor: first 22 rows are "1/2/3", last 14 rows are "4/5".
sep.xy.ab <- rep(c("1/2/3", "4/5"), times = c(22, 14))
# Class sizes: 7, 6, 9, 7, 7.
actual <- factor(rep(c("1", "2", "3", "4", "5"), times = c(7, 6, 9, 7, 7)))
example.data <- data.frame(id, x, y, a, b, sep.xy.ab, actual)

#dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

#dev.new()
plot(example.data$a, example.data$b)
text(example.data$a, example.data$b, labels = example.data$id)

result <- LOOCVRandomForestClassificationMatrixForPheatmap(
  input.data = example.data,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2,
  should.mtry.and.ntree.be.optimized = FALSE,
  percentile.threshold.to.keep = 0.5
)
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
# The first element of the result is the matrix handed to pheatmap.
matrix.for.pheatmap <- result[[1]]
pheatmap_RF <- pheatmap::pheatmap(
  matrix.for.pheatmap,
  fontsize_col = 12,
  fontsize_row = 12
)

# The pheatmap shows that the points in groups 1, 2, and 3 can be predicted
# with features x and y. While points in group 4 and 5 can be predicted with
# features a and b.

#dev.new()
pheatmap_RF
# 22 samples in three classes; x and y vary, a and b are constant columns of 1s.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i")
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54)
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24)
a <- rep(1, 22)
b <- rep(1, 22)
# Class sizes: 7 x "1", 6 x "2", 9 x "3".
actual <- factor(rep(c("1", "2", "3"), times = c(7, 6, 9)))
example.data <- data.frame(id, x, y, a, b, actual)

rf.result <- RandomForestAutomaticMtryAndNtree(
  example.data,
  c("x", "y", "a", "b"),
  "actual",
  seed = 2
)

predicted <- rf.result$predicted
actual <- example.data[["actual"]]

# Result is not perfect because RF model does not over fit to the training data.
eval.classification.results(as.character(actual), as.character(predicted), "Example")
## [[1]]
## [1] "Example"
##
## [[2]]
## predicted
## actual 1 2 3
## 1 7 0 0
## 2 0 5 1
## 3 0 0 9
##
## [[3]]
## [[3]]$accuracy
## [1] 0.9545455
##
## [[3]]$macro_prf
## # A tibble: 3 x 3
## precision recall f1
## <dbl> <dbl> <dbl>
## 1 1 1 1
## 2 1 0.833 0.909
## 3 0.9 1 0.947
##
## [[3]]$macro_avg
## # A tibble: 1 x 3
## avg_precision avg_recall avg_f1
## <dbl> <dbl> <dbl>
## 1 0.967 0.944 0.952
##
## [[3]]$ova
## [[3]]$ova$`1`
## classified
## actual 1 others
## 1 7 0
## others 0 15
##
## [[3]]$ova$`2`
## classified
## actual 2 others
## 2 5 1
## others 0 16
##
## [[3]]$ova$`3`
## classified
## actual 3 others
## 3 9 0
## others 1 12
##
##
## [[3]]$ova_sum
## classified
## actual relevant others
## relevant 21 1
## others 1 43
##
## [[3]]$kappa
## [1] 0.9301587
##
##
## [[4]]
## [1] 0.9331967
# Make example data where samples with 1, 2, and 3 in their ID names can be
# predicted using features x and y, while samples with 4 and 5 in their ID
# names can be predicted using features a and b.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i",
        "4a", "4b", "4c", "4d", "4e", "4f", "4g",
        "5a", "5b", "5c", "5d", "5e", "5f", "5g")
# x/y vary for the first 22 rows and are constant (1) for the last 14;
# a/b are the other way round.
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54,
       rep(1, 14))
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24,
       rep(1, 14))
a <- c(rep(1, 22),
       18, 21, 22, 24, 26, 26, 27, 40, 41, 42, 44, 46, 47, 48)
b <- c(rep(1, 22),
       10, 11, 22, 15, 12, 13, 14, 27, 29, 20, 28, 21, 30, 31)
# Subsetting factor: first 22 rows are "1/2/3", last 14 rows are "4/5".
sep.xy.ab <- rep(c("1/2/3", "4/5"), times = c(22, 14))
# Class sizes: 7, 6, 9, 7, 7.
actual <- factor(rep(c("1", "2", "3", "4", "5"), times = c(7, 6, 9, 7, 7)))
example.data <- data.frame(id, x, y, a, b, sep.xy.ab, actual)

#dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

#dev.new()
plot(example.data$a, example.data$b)
text(example.data$a, example.data$b, labels = example.data$id)

results <- RandomForestClassificationGiniMatrixForPheatmap(
  input.data = example.data,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2
)
matrix.for.pheatmap <- results[[1]]

#Add MCC to column names
# The last row of the matrix is read as the MCC value of each column.
# seq_len(ncol()) is safe for a zero-column matrix, unlike 1:dim()[2].
for (i in seq_len(ncol(matrix.for.pheatmap))) {
  old.name <- colnames(matrix.for.pheatmap)[[i]]
  MCC.val <- matrix.for.pheatmap[nrow(matrix.for.pheatmap), i]
  # paste0() avoids the stray blanks that paste() inserted around ", MCC: ".
  new.name <- paste0(old.name, ", MCC: ", format(round(MCC.val, 2), nsmall = 2))
  colnames(matrix.for.pheatmap)[[i]] <- new.name
}

#Remove MCC row from matrix
# Negative indexing drops only the last row even when it is the sole row
# (1:(n - 1) would select it), and drop = FALSE keeps a one-column result
# two-dimensional.
matrix.for.pheatmap.MCC.row.removed <-
  matrix.for.pheatmap[-nrow(matrix.for.pheatmap), , drop = FALSE]

pheatmap_RF <- pheatmap::pheatmap(matrix.for.pheatmap.MCC.row.removed, fontsize_col = 12,
                                  fontsize_row = 12)

# The pheatmap shows that the points in groups 1, 2, and 3 can be predicted
# with features x and y. While points in group 4 and 5 can be predicted with
# features a and b.

#dev.new()
pheatmap_RF

# Note that the 1/2/3 column is exactly the same as the random forest
# model created in the example for eval.classification.results.
# Since this pheatmap is using the number of LOOCV rounds
# as the input matrix, the colors can be misleading. It looks
# like features x and y are more important for 1/2/3 than features a and b
# are for 4/5. However, this is not the case. The mean decrease
# in gini index cannot be used to compare features between models. It
# can only be used to compare features within a single model.
# NOTE(review): the sentence above mentions "LOOCV rounds", but the call in
# this example is the Gini variant - confirm the intended wording.
# Make example data where samples with 1, 2, and 3 in their ID names can be
# predicted using features x and y, while samples with 4 and 5 in their ID
# names can be predicted using features a and b.
id <- c("1a", "1b", "1c", "1d", "1e", "1f", "1g",
        "2a", "2b", "2c", "2d", "2e", "2f",
        "3a", "3b", "3c", "3d", "3e", "3f", "3g", "3h", "3i",
        "4a", "4b", "4c", "4d", "4e", "4f", "4g",
        "5a", "5b", "5c", "5d", "5e", "5f", "5g")
# x/y vary for the first 22 rows and are constant (1) for the last 14;
# a/b are the other way round.
x <- c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39,
       35, 30, 40, 41, 42, 44, 46, 47, 48, 49, 54,
       rep(1, 14))
y <- c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
       40, 45, 27, 29, 20, 28, 21, 30, 31, 23, 24,
       rep(1, 14))
a <- c(rep(1, 22),
       18, 21, 22, 24, 26, 26, 27, 40, 41, 42, 44, 46, 47, 48)
b <- c(rep(1, 22),
       10, 11, 22, 15, 12, 13, 14, 27, 29, 20, 28, 21, 30, 31)
# Subsetting factor: first 22 rows are "1/2/3", last 14 rows are "4/5".
sep.xy.ab <- rep(c("1/2/3", "4/5"), times = c(22, 14))
# Class sizes: 7, 6, 9, 7, 7.
actual <- factor(rep(c("1", "2", "3", "4", "5"), times = c(7, 6, 9, 7, 7)))
example.data <- data.frame(id, x, y, a, b, sep.xy.ab, actual)

#dev.new()
plot(example.data$x, example.data$y)
text(example.data$x, example.data$y, labels = example.data$id)

#dev.new()
plot(example.data$a, example.data$b)
text(example.data$a, example.data$b, labels = example.data$id)

results <- RandomForestClassificationPercentileMatrixForPheatmap(
  input.data = example.data,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2
)

# The first element of the result is the matrix handed to pheatmap.
matrix.for.pheatmap <- results[[1]]
pheatmap_RF <- pheatmap::pheatmap(
  matrix.for.pheatmap,
  fontsize_col = 12,
  fontsize_row = 12
)

# The pheatmap shows that the points in groups 1, 2, and 3 can be predicted
# with features x and y. While points in group 4 and 5 can be predicted with
# features a and b.

#dev.new()
pheatmap_RF
# Use the packaged example data and shuffle its rows with a fixed seed.
example.data <- GenerateExampleDataMachinelearnr()
set.seed(1)
example.data.shuffled <- example.data[sample(nrow(example.data)), ]

# One fold per row of the data.
result.CV <- CVPredictionsRandomForest(
  inputted.data = example.data.shuffled,
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2,
  percentile.threshold.to.keep = 0.5,
  number.of.folds = nrow(example.data)
)
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
# Predicted
print(result.CV[[1]])
## [1] "1" "1" "5" "4" "3" "3" "3" "4" "3" "5" "3" "4" "2" "3" "5" "4" "1" "2" "3"
## [20] "1" "4" "3" "5" "3" "2" "1" "5" "5" "5" "3" "4" "2" "1" "5" "2" "1"
# Feature importance
print(result.CV[[2]])
## variables.with.sig.contributions
## x y a
## 100.00 97.22 2.78
# Use the packaged example data and shuffle its rows with a fixed seed.
example.data <- GenerateExampleDataMachinelearnr()
set.seed(1)
example.data.shuffled <- example.data[sample(nrow(example.data)), ]

# Two-fold cross-validation, subset by the "sep.xy.ab" column.
result.two.fold.CV <- CVRandomForestClassificationMatrixForPheatmap(
  input.data = example.data.shuffled,
  factor.name.for.subsetting = "sep.xy.ab",
  name.of.predictors.to.use = c("x", "y", "a", "b"),
  target.column.name = "actual",
  seed = 2,
  percentile.threshold.to.keep = 0.5,
  number.of.folds = 2
)
## [1] 1
## [1] 2
## [1] 1
## [1] 2
# The first element of the two-fold CV result is the matrix handed to pheatmap.
matrix.for.pheatmap <- result.two.fold.CV[[1]]
pheatmap_RF <- pheatmap::pheatmap(
  matrix.for.pheatmap,
  fontsize_col = 12,
  fontsize_row = 12
)