# Chapter 1 ################################################################### ## install.packages("mclust") library("mclust") # Chapter 2 ################################################################### ## ---------------------------------------------------------------------------- data(Snapper, package = "FSAdata") x <- Snapper[,1] mod1 <- densityMclust(x, G = 1, plot = FALSE) mod2 <- densityMclust(x, G = 2, plot = FALSE) mod3 <- densityMclust(x, G = 3, plot = FALSE) mod4 <- densityMclust(x, G = 4, plot = FALSE) par(mfrow = c(2, 2), mar = c(2, 2, 1, 1), oma = c(2, 2, 0, 0)) x0 <- extendrange(x, f = 0.1) x0 <- seq(x0[1], x0[2], length = 1000) # cdens <- predict(mod1, newdata = x0, what = "cdens") cdens <- sweep(cdens, 2, mod1$parameters$pro, "*") plot(mod1, x, what = "density", lwd = 2, breaks = 20, ylim = c(0, 0.4)) matplot(x0, cdens, type = "l", lty = 2, col = 1, add = TRUE) # cdens <- predict(mod2, newdata = x0, what = "cdens") cdens <- sweep(cdens, 2, mod2$parameters$pro, "*") plot(mod2, x, what = "density", lwd = 2, breaks = 20, ylim = c(0, 0.4)) matplot(x0, cdens, type = "l", lty = 2, col = 1, add = TRUE) # cdens <- predict(mod3, newdata = x0, what = "cdens") cdens <- sweep(cdens, 2, mod3$parameters$pro, "*") plot(mod3, x, what = "density", lwd = 2, breaks = 20, ylim = c(0, 0.4)) matplot(x0, cdens, type = "l", lty = 2, col = 1, add = TRUE) # cdens <- predict(mod4, newdata = x0, what = "cdens") cdens <- sweep(cdens, 2, mod4$parameters$pro, "*") plot(mod4, x, what = "density", lwd = 2, breaks = 20, ylim = c(0, 0.4)) matplot(x0, cdens, type = "l", lty = 2, col = 1, add = TRUE) # mtext("Fish length (in)", side = 1, line = 1, outer = TRUE) mtext("Density", side = 2, line = 1, outer = TRUE) # Chapter 3 ################################################################### ## ---------------------------------------------------------------------------- data("diabetes", package = "rrcov") X <- diabetes[, 1:5] Class <- diabetes$group table(Class) ## ---------------------------------------------------------------------------- clp <- clPairs(X, Class, lower.panel = NULL) clPairsLegend(0.1, 0.3, class = clp$class, col = clp$col, pch = clp$pch) ## ---------------------------------------------------------------------------- mod <- Mclust(X, G = 3, modelNames = "VVV") ## ---------------------------------------------------------------------------- summary(mod) ## ---------------------------------------------------------------------------- summary(mod, parameters = TRUE) ## ---------------------------------------------------------------------------- table(Class, Cluster = mod$classification) adjustedRandIndex(Class, mod$classification) ## ---------------------------------------------------------------------------- plot(mod, what = "classification") ## ---------------------------------------------------------------------------- plot(mod, what = "classification", fillEllipses = TRUE) ## ---------------------------------------------------------------------------- plot(mod, what = "classification", dimens = c(3, 4), fillEllipses = TRUE) ## ---------------------------------------------------------------------------- plot(mod, dimens = c(3, 4), what = "uncertainty") ## ---------------------------------------------------------------------------- data("thyroid", package = "mclust") X <- data.matrix(thyroid[, 2:6]) Class <- thyroid$Diagnosis clp <- clPairs(X, Class, lower.panel = NULL, symbols = c(0, 1, 2), colors = c("gray50", "black", "red3")) clPairsLegend(0.1, 0.3, title = "Thyroid diagnosis:", class = clp$class, col = clp$col, pch = clp$pch) ## ---------------------------------------------------------------------------- mod <- Mclust(X) ## ---------------------------------------------------------------------------- mod$BIC ## ---------------------------------------------------------------------------- summary(mod$BIC, k = 5) ## ---------------------------------------------------------------------------- plot(mod, what = "BIC", legendArgs = list("bottomright", ncol = 5)) ## ---------------------------------------------------------------------------- summary(mod, parameters = TRUE) ## ---------------------------------------------------------------------------- plot(mod, what = "classification") ## ---------------------------------------------------------------------------- table(Class, Cluster = mod$classification) adjustedRandIndex(Class, mod$classification) ## ---------------------------------------------------------------------------- z <- mod$z # posterior conditional probabilities cl <- mod$classification # MAP clustering G <- mod$G # number of clusters sclass <- 10 # class separation sedge <- 3 # edge spacing L <- nrow(z) + G*(sclass+2*sedge) plot(1:L, runif(L), ylim = c(0, 1), type = "n", axes = FALSE, ylab = "Posterior conditional probabilities", xlab = "") axis(2) col <- mclust.options("classPlotColors") l <- sclass for (k in 1:G) { i <- which(cl == k) ord <- i[order(z[i, k], decreasing = TRUE)] for (j in 1:G) points((l+sedge)+1:length(i), z[ord, j], pch = as.character(j), col = col[j]) rect(l, 0, l+2*sedge+length(i), 1, border = col[k], col = col[k], lwd = 2, density = 0) l <- l + 2*sedge + length(i) + sclass } ## ---------------------------------------------------------------------------- data("wine", package = "gclus") Class <- factor(wine$Class, levels = 1:3, labels = c("Barolo", "Grignolino", "Barbera")) X <- data.matrix(wine[, -1]) ## ---------------------------------------------------------------------------- mod <- Mclust(X) summary(mod$BIC, k = 3) ## ---------------------------------------------------------------------------- plot(mod, what = "BIC", ylim = range(mod$BIC[, -(1:2)], na.rm = TRUE), legendArgs = list(x = "bottomleft")) ## ---------------------------------------------------------------------------- summary(mod) table(Class, mod$classification) adjustedRandIndex(Class, mod$classification) ## ---------------------------------------------------------------------------- norm01 <- function(x) (x - min(x))/(max(x) - min(x)) M <- apply(t(mod$parameters$mean), 2, norm01) heatmap(M, Rowv = NA, scale = "none", margins = c(8, 2), labRow = paste("Cluster", 1:mod$G), cexRow = 1.2) ## ---------------------------------------------------------------------------- plot(mod, what = "classification", dimens = c(1, 2, 7)) ## ---------------------------------------------------------------------------- data("faithful", package = "datasets") plot(faithful) ## ---------------------------------------------------------------------------- BIC <- mclustBIC(faithful) BIC plot(BIC) ## ---------------------------------------------------------------------------- mod1 <- Mclust(faithful, x = BIC) ## ---------------------------------------------------------------------------- ICL <- mclustICL(faithful) ICL ## ---------------------------------------------------------------------------- plot(ICL) ## ---------------------------------------------------------------------------- mod2 <- Mclust(faithful, G = 2, modelNames = "VVE") ## ---------------------------------------------------------------------------- plot(mod1, what = "classification", fillEllipses = TRUE) plot(mod2, what = "classification", fillEllipses = TRUE) ## ---------------------------------------------------------------------------- LRT <- mclustBootstrapLRT(faithful, modelName = "VVV") LRT ## ---------------------------------------------------------------------------- plot(LRT, G = 1) plot(LRT, G = 2) ## ---------------------------------------------------------------------------- data("hemophilia", package = "rrcov") X <- hemophilia[, 1:2] Class <- as.factor(hemophilia$gr) clp <- clPairs(X, Class, symbols = c(16, 0), colors = "black") clPairsLegend(0.8, 0.2, class = clp$class, col = clp$col, pch = clp$pch) ## ---------------------------------------------------------------------------- mod <- Mclust(X, G = 2, modelName = "VVV") summary(mod, parameters = TRUE) ## ---------------------------------------------------------------------------- plot(mod, what = "classification", fillEllipses = TRUE) ## ---------------------------------------------------------------------------- clp <- clPairs(X, Class, symbols = c(16, 0), colors = "black") clPairsLegend(0.8, 0.2, class = clp$class, col = clp$col, pch = clp$pch) plot(mod, what = "classification", fillEllipses = TRUE) ## ---------------------------------------------------------------------------- boot <- MclustBootstrap(mod, nboot = 999, type = "bs") ## ---------------------------------------------------------------------------- summary(boot, what = "se") ## ---------------------------------------------------------------------------- summary(boot, what = "ci", conf.level = 0.9) ## ---------------------------------------------------------------------------- par(mfrow = c(1, 2)) plot(boot, what = "pro") ## ---------------------------------------------------------------------------- par(mfcol = c(2, 2)) plot(boot, what = "mean") ## ---------------------------------------------------------------------------- wlboot <- MclustBootstrap(mod, nboot = 999, type = "wlbs") summary(wlboot, what = "se") ## ---------------------------------------------------------------------------- par(mfrow = c(1, 2), mar = c(4, 4, 1, 1)) boot.ci <- summary(boot, what = "ci") wlboot.ci <- summary(wlboot, what = "ci") for (j in 1:mod$d) { plot(1:mod$G, mod$parameters$mean[j, ], col = 1:mod$G, pch = 15, ylab = colnames(X)[j], xlab = "Mixture component", ylim = range(boot.ci$mean, wlboot.ci$mean), xlim = c(.5, mod$G+.5), xaxt = "n") points(1:mod$G+0.2, mod$parameters$mean[j, ], col = 1:mod$G, pch = 15) axis(side = 1, at = 1:mod$G) with(boot.ci, errorBars(1:G, mean[1, j, ], mean[2, j, ], col = 1:G)) with(wlboot.ci, errorBars(1:G+0.2, mean[1, j, ], mean[2, j, ], col = 1:G, lty = 2)) } ## ---------------------------------------------------------------------------- data("precip", package = "datasets") names(precip)[c(45,52)] = c("Bismarck", "Pittsburgh") dotchart(sort(precip), cex = 0.6, pch = 19, xlab = "Average annual rainfall (in inches)") ## ---------------------------------------------------------------------------- mod <- Mclust(precip) summary(mod, parameters = TRUE) plot(mod, what = "BIC", legendArgs = list(x = "bottomleft")) ## ---------------------------------------------------------------------------- par(mfrow=c(1, 2)) plot(mod, what = "classification") plot(mod, what = "uncertainty") ## ---------------------------------------------------------------------------- x <- data.frame(precip, clusters = mod$classification) rownames(x) <- make.unique(names(precip)) # correct duplicated names x <- x[order(x$precip), ] dotchart(x$precip, labels = rownames(x), groups = factor(x$clusters, levels = 2:1, labels = c("Cluster 2", "Cluster 1")), cex = 0.6, pch = 19, color = mclust.options("classPlotColors")[x$clusters], xlab = "Average annual rainfall (in inches)") ## ---------------------------------------------------------------------------- data("EuroUnemployment", package = "mclust") summary(EuroUnemployment) ## ---------------------------------------------------------------------------- HC_EII <- hc(EuroUnemployment, modelName = "EII") HC_VVV <- hc(EuroUnemployment, modelName = "VVV") ## ---------------------------------------------------------------------------- plot(HC_EII, what = "merge", labels = TRUE, hang = 0.02) plot(HC_VVV, what = "merge", labels = TRUE, hang = 0.02) ## ---------------------------------------------------------------------------- plot(HC_EII, what = "loglik") plot(HC_VVV, what = "loglik") ## ---------------------------------------------------------------------------- data("HRstars", package = "GDAdata") set.seed(0) initial <- kmeans(HRstars[, -1], centers = 100, nstart=10)$cluster HC_VVV <- hc(HRstars[, -1], modelName = "VVV", partition = initial, use = "VARS") HC_VVV ## ---------------------------------------------------------------------------- data("flea", package = "tourr") X <- data.matrix(flea[, 1:6]) Class <- factor(flea$species, labels = c("Concinna", "Heikertingeri", "Heptapotamica")) table(Class) ## ---------------------------------------------------------------------------- col <- mclust.options("classPlotColors") clp <- clPairs(X, Class, lower.panel = NULL, gap = 0, symbols = c(16, 15, 17), colors = adjustcolor(col, alpha.f = 0.5)) clPairsLegend(x = 0.1, y = 0.3, class = clp$class, col = col, pch = clp$pch, title = "Flea beatle species") ## ---------------------------------------------------------------------------- # set the default for the current session mclust.options("hcUse" = "VARS") mod1 <- Mclust(X) # or specify the initialization method only for this model # mod1 <- Mclust(X, initialization = list(hcPairs = hc(X, use = "VARS"))) summary(mod1) table(Class, mod1$classification) adjustedRandIndex(Class, mod1$classification) ## ---------------------------------------------------------------------------- mod2 <- Mclust(X[, 6:1]) summary(mod2) table(Class, mod2$classification) adjustedRandIndex(Class, mod2$classification) ## ---------------------------------------------------------------------------- mod3 <- Mclust(X, initialization = list(hcPairs = hcRandomPairs(X, seed = 1))) summary(mod3) table(Class, mod3$classification) adjustedRandIndex(Class, mod3$classification) ## ---------------------------------------------------------------------------- set.seed(20190603) BIC <- NULL for (i in 1:50) { # get BIC table from initial random start BIC0 <- mclustBIC(X, verbose = FALSE, initialization = list(hcPairs = hcRandomPairs(X))) # update BIC table by merging best BIC values for each # G and modelNames BIC <- mclustBICupdate(BIC, BIC0) } summary(BIC, k = 5) ## ---------------------------------------------------------------------------- mod4 <- Mclust(X, x = BIC) summary(mod4) table(Class, mod4$classification) adjustedRandIndex(Class, mod4$classification) ## ---------------------------------------------------------------------------- mclust.options("hcUse" = "SVD") # restore the default mod5 <- Mclust(X) # X is the unscaled flea data # or specify only for this model fit # mod5 <- Mclust(X, initialization = list(hcPairs = hc(X, use = "SVD"))) summary(mod5) table(Class, mod5$classification) adjustedRandIndex(Class, mod5$classification) ## ---------------------------------------------------------------------------- data("iris", package = "datasets") str(iris) ms <- mstep(iris[, 1:4], modelName = "VVV", z = unmap(iris$Species)) str(ms, 1) es <- estep(iris[, 1:4], modelName = "VVV", parameters = ms$parameters) str(es, 1) # Chapter 4 ################################################################### ## ---------------------------------------------------------------------------- data("wdbc", package = "mclust") X <- wdbc[, c("Texture_mean", "Area_extreme", "Smoothness_extreme")] Class <- wdbc[, "Diagnosis"] ## ---------------------------------------------------------------------------- set.seed(123) train <- sample(1:nrow(X), size = round(nrow(X)*2/3), replace = FALSE) X_train <- X[train, ] Class_train <- Class[train] tab <- table(Class_train) cbind(Counts = tab, "%" = prop.table(tab)*100) X_test <- X[-train, ] Class_test <- Class[-train] tab <- table(Class_test) cbind(Counts = tab, "%" = prop.table(tab)*100) ## ---------------------------------------------------------------------------- clp <- clPairs(X_train, Class_train, lower.panel = NULL) clPairsLegend(0.1, 0.3, col = clp$col, pch = clp$pch, class = ifelse(clp$class == "B", "Benign", "Malign"), title = "Breast cancer diagnosis:") ## ---------------------------------------------------------------------------- mod1 <- MclustDA(X_train, Class_train, modelType = "EDDA") summary(mod1) ## ---------------------------------------------------------------------------- summary(mod1, parameters = TRUE) ## ---------------------------------------------------------------------------- summary(mod1, newdata = X_test, newclass = Class_test) ## ---------------------------------------------------------------------------- plot(mod1, what = "scatterplot") ## ---------------------------------------------------------------------------- plot(mod1, what = "error") ## ---------------------------------------------------------------------------- mod2 <- MclustDA(X_train, Class_train) summary(mod2, newdata = X_test, newclass = Class_test) ## ---------------------------------------------------------------------------- plot(mod2, what = "scatterplot") ## ---------------------------------------------------------------------------- plot(mod2, what = "scatterplot", dimens = c(1, 2)) ## ---------------------------------------------------------------------------- mod3 <- MclustDA(X_train, Class_train, G = 2, modelNames = "EEE") summary(mod3, newdata = X_test, newclass = Class_test) ## ---------------------------------------------------------------------------- set.seed(20190520) cv1 <- cvMclustDA(mod1) str(cv1) cv2 <- cvMclustDA(mod2) str(cv2) ## ---------------------------------------------------------------------------- unlist(cv1[c("ce", "se.ce", "brier", "se.brier")]) unlist(cv2[c("ce", "se.ce", "brier", "se.brier")]) ## ---------------------------------------------------------------------------- set.seed(2) models <- mclust.options("emModelNames") tab_CE <- tab_Brier <- matrix(as.double(NA), nrow = length(models)+1, ncol = 5) rownames(tab_CE) <- rownames(tab_Brier) <- c(paste0("EDDA[", models, "]"), "MCLUSTDA") colnames(tab_CE) <- colnames(tab_Brier) <- c("Train", "10-fold CV", "se(CV)", "lower", "upper") for (i in seq(models)) { mod <- MclustDA(X, Class, modelType = "EDDA", modelNames = models[i], verbose = FALSE) pred <- predict(mod, X) cv <- cvMclustDA(mod, nfold = 10, verbose = FALSE) # tab_CE[i, 1] <- classError(pred$classification, Class)$errorRate tab_CE[i, 2] <- cv$ce tab_CE[i, 3] <- cv$se.ce tab_CE[i, 4] <- cv$ce - cv$se.ce tab_CE[i, 5] <- cv$ce + cv$se.ce # tab_Brier[i, 1] <- BrierScore(pred$z, Class) tab_Brier[i, 2] <- cv$brier tab_Brier[i, 3] <- cv$se.brier tab_Brier[i, 4] <- cv$brier - cv$se.brier tab_Brier[i, 5] <- cv$brier + cv$se.brier } i <- length(models)+1 mod <- MclustDA(X, Class, modelType = "MclustDA", verbose = FALSE) pred <- predict(mod, X) cv <- cvMclustDA(mod, nfold = 10, verbose = FALSE) # tab_CE[i, 1] <- classError(pred$classification, Class)$errorRate tab_CE[i, 2] <- cv$ce tab_CE[i, 3] <- cv$se.ce tab_CE[i, 4] <- cv$ce - cv$se.ce tab_CE[i, 5] <- cv$ce + cv$se.ce # tab_Brier[i, 1] <- BrierScore(pred$z, Class) tab_Brier[i, 2] <- cv$brier tab_Brier[i, 3] <- cv$se.brier tab_Brier[i, 4] <- cv$brier - cv$se.brier tab_Brier[i, 5] <- cv$brier + cv$se.brier ## ---------------------------------------------------------------------------- tab_CE ## ---------------------------------------------------------------------------- library("ggplot2") df <- data.frame(rownames(tab_CE), tab_CE) colnames(df) <- c("model", "train", "cv", "se", "lower", "upper") df$model <- factor(df$model, levels = rev(df$model)) ggplot(df, aes(x = model, y = cv, ymin = lower, ymax = upper)) + geom_point(aes(shape = "s1", color = "c1")) + geom_errorbar(width = 0.5, col = "dodgerblue3") + geom_point(aes(y = train, shape = "s2", color = "c2")) + scale_y_continuous(breaks = seq(0, 0.2, by = 0.01), lim = c(0, NA)) + scale_color_manual(name = "", breaks = c("c1", "c2"), values = c("dodgerblue3", "black"), labels = c("CV", "Train")) + scale_shape_manual(name = "", breaks = c("s1", "s2"), values = c(19, 0), labels = c("CV", "Train")) + ylab("Classification error") + xlab("") + coord_flip() + theme(legend.position = "top") ## ---------------------------------------------------------------------------- tab_Brier ## ---------------------------------------------------------------------------- df <- data.frame(rownames(tab_Brier), tab_Brier) colnames(df) <- c("model", "train", "cv", "se", "lower", "upper") df$model <- factor(df$model, levels = rev(df$model)) ggplot(df, aes(x = model, y = cv, ymin = lower, ymax = upper)) + geom_point(aes(shape = "s1", color = "c1")) + geom_errorbar(width = 0.5, col = "dodgerblue3") + geom_point(aes(y = train, shape = "s2", color = "c2")) + scale_y_continuous(breaks = seq(0, 0.2, by = 0.01), lim = c(0, NA)) + scale_color_manual(name = "", breaks = c("c1", "c2"), values = c("dodgerblue3", "black"), labels = c("CV", "Train")) + scale_shape_manual(name = "", breaks = c("s1", "s2"), values = c(19, 0), labels = c("CV", "Train")) + ylab("Brier score") + xlab("") + coord_flip() + theme(legend.position = "top") ## ---------------------------------------------------------------------------- # confusion matrix (tab <- table(Predict = cv1$classification, Class = Class_train)) tab[2, 2]/sum(tab[, 2]) # sensitivity tab[1, 1]/sum(tab[, 1]) # specificity ## ---------------------------------------------------------------------------- threshold <- seq(0, 1, by = 0.01) sensitivity <- specificity <- rep(NA, length(threshold)) for(i in 1:length(threshold)) { pred <- factor(ifelse(cv1$z[, "M"] > threshold[i], "M", "B"), levels = c("B", "M")) tab <- table(pred, Class_train) sensitivity[i] <- tab[2, 2]/sum(tab[, 2]) specificity[i] <- tab[1, 1]/sum(tab[, 1]) } ## ---------------------------------------------------------------------------- auc_approx <- function(tpr, fpr) { x <- 1 - fpr y <- tpr dx <- c(diff(x), 0) dy <- c(diff(y), 0) sum(y * dx) + sum(dy * dx)/2 } auc_approx(tpr = sensitivity, fpr = 1 - specificity) ## ---------------------------------------------------------------------------- threshold <- seq(0, 1, by = 0.01) sensitivity2 <- specificity2 <- rep(NA, length(threshold)) for(i in 1:length(threshold)) { pred <- factor(ifelse(cv2$z[, "M"] > threshold[i], "M", "B"), levels = c("B", "M")) tab <- table(pred, Class_train) sensitivity2[i] <- tab[2, 2]/sum(tab[, 2]) specificity2[i] <- tab[1, 1]/sum(tab[, 1]) } ## ---------------------------------------------------------------------------- plot(1-specificity, sensitivity, type = "l", lwd = 2) abline(h = c(0, 1), v = c(0, 1), lty = 3) abline(a = 0, b = 1, lty = 2) plot(1-specificity2, sensitivity2, type = "l", lwd = 2, xlab = "1-specificity", ylab = "sensitivity") abline(h = c(0, 1), v = c(0, 1), lty = 3) abline(a = 0, b = 1, lty = 2) ## ---------------------------------------------------------------------------- J <- sensitivity + specificity - 1 threshold[which.max(J)] # optimal threshold sensitivity[which.max(J)] # sensitivity at optimal threshold specificity[which.max(J)] # specificity at optimal threshold ## ---------------------------------------------------------------------------- data("bankruptcy", package = "MixGHD") X <- bankruptcy[, -1] Class <- factor(bankruptcy$Y, levels = c(1:0), labels = c("solvent", "bankrupt")) cl <- clPairs(X, Class) legend("bottomright", legend = cl$class, pch = cl$pch, col = cl $col, inset = 0.02) ## ---------------------------------------------------------------------------- mod <- MclustDA(X, Class, modelType = "EDDA") summary(mod) ## ---------------------------------------------------------------------------- plot(mod, what = "scatterplot") plot(mod, what = "error") ## ---------------------------------------------------------------------------- (C <- matrix(c(0, 1, 10, 0), nrow = 2, ncol = 2, byrow = TRUE)) ## ---------------------------------------------------------------------------- rowSums(C) ## ---------------------------------------------------------------------------- pred <- predict(mod) (tab <- table(Class, Predicted = pred$classification)) sum(tab * C) ## ---------------------------------------------------------------------------- pred <- predict(mod, prop = mod$prop*rowSums(C)) (tab <- table(Class, Predicted = pred$classification)) sum(tab * C) ## ---------------------------------------------------------------------------- set.seed(20190515) # generate training data from a balanced case-control sample n_train <- 1000 class_train <- factor(sample(0:1, size = n_train, prob = c(0.5, 0.5), replace = TRUE)) x_train <- ifelse(class_train == 1, rnorm(n_train, mean = 3, sd = 1), rnorm(n_train, mean = 0, sd = 1)) hist(x_train[class_train == 0], breaks = 11, xlim = range(x_train), main = "", xlab = "x", col = adjustcolor("dodgerblue2", alpha.f = 0.5), border = "white") hist(x_train[class_train == 1], breaks = 11, add = TRUE, col = adjustcolor("red3", alpha.f = 0.5), border = "white") box() # generate test data from mixture f(x) = 0.9 * N(0,1) + 0.1 * N(3,1) n <- 10000 mixpro <- c(0.9, 0.1) class_test <- factor(sample(0:1, size = n, prob = mixpro, replace = TRUE)) x_test <- ifelse(class_test == 1, rnorm(n, mean = 3, sd = 1), rnorm(n, mean = 0, sd = 1)) hist(x_test[class_test == 0], breaks = 15, xlim = range(x_test), main = "", xlab = "x", col = adjustcolor("dodgerblue2", alpha.f = 0.5), border = "white") hist(x_test[class_test == 1], breaks = 11, add = TRUE, col = adjustcolor("red3", alpha.f = 0.5), border = "white") box() ## ---------------------------------------------------------------------------- mod <- MclustDA(x_train, class_train) summary(mod, parameters = TRUE) ## ---------------------------------------------------------------------------- pred <- predict(mod, newdata = x_test) classError(pred$classification, class_test)$error BrierScore(pred$z, class_test) ## ---------------------------------------------------------------------------- priorProp <- seq(0.01, 0.99, by = 0.01) CE <- BS <- rep(as.double(NA), length(priorProp)) for (i in seq(priorProp)) { pred <- predict(mod, newdata = x_test, prop = c(1-priorProp[i], priorProp[i])) CE[i] <- classError(pred$classification, class = class_test)$error BS[i] <- BrierScore(pred$z, class_test) } ## ---------------------------------------------------------------------------- matplot(priorProp, cbind(CE, BS), type = "l", lty = 1, lwd = 2, xaxt = "n", xlab = "Class prior probability", ylab = "", ylim = c(0, max(CE, BS)), col = c("red3", "dodgerblue3"), panel.first = { abline(h = seq(0, 1, by = 0.05), col = "grey", lty = 3) abline(v = seq(0, 1, by = 0.05), col = "grey", lty = 3) }) axis(side = 1, at = seq(0, 1, by = 0.1)) abline(v = mod$prop[2], # training proportions lty = 2, lwd = 2) abline(v = mean(class_test == 1), # test proportions (usually unknown) lty = 3, lwd = 2) legend("topleft", legend = c("ClassError", "BrierScore"), col = c("red3", "dodgerblue3"), lty = 1, lwd = 2, inset = 0.02) ## ---------------------------------------------------------------------------- (priorProbs <- classPriorProbs(mod, x_test)) ## ---------------------------------------------------------------------------- pred <- predict(mod, newdata = x_test, prop = priorProbs) classError(pred$classification, class = class_test)$error BrierScore(pred$z, class_test) ## ---------------------------------------------------------------------------- (prior_test <- prop.table(table(class_test))) pred <- predict(mod, newdata = x_test, prop = prior_test) classError(pred$classification, class = class_test)$error BrierScore(pred$z, class_test) ## ---------------------------------------------------------------------------- data("wdbc", package = "mclust") x <- with(wdbc, 0.2322*Texture_mean + 0.01117*Area_extreme + 68.37*Smoothness_extreme) Class <- wdbc[, "Diagnosis"] mod <- MclustDA(x, Class, modelType = "MclustDA") summary(mod) ## ---------------------------------------------------------------------------- (prop <- mod$prop) col <- mclust.options("classPlotColors") x0 <- seq(0, max(x)*1.1, length = 1000) par1 <- mod$models[["B"]]$parameters f1 <- dens(par1$variance$modelName, data = x0, parameters = par1) par2 <- mod$models[["M"]]$parameters f2 <- dens(par2$variance$modelName, data = x0, parameters = par2) matplot(x0, cbind(prop[1]*f1, prop[2]*f2), type = "l", lty = 1, col = col, ylab = "Class density", xlab = "x") legend("topright", title = "Diagnosis:", legend = names(prop), col = col, lty = 1, inset = 0.02) ## ---------------------------------------------------------------------------- set.seed(20190520) cv <- cvMclustDA(mod) # by default: prop = mod$prop unlist(cv[c("ce", "se.ce")]) ## ---------------------------------------------------------------------------- set.seed(20190520) cv <- cvMclustDA(mod, prop = c(0.5, 0.5)) unlist(cv[c("ce", "se.ce")]) ## ---------------------------------------------------------------------------- x0 <- seq(min(x), max(x), length.out = 1000) pred <- predict(mod, newdata = x0) (threshold1 <- approx(pred$z[, 2], x0, xout = 0.5)$y) pred <- predict(mod, newdata = x0, prop = c(0.5, 0.5)) (threshold2 <- approx(pred$z[, 2], x0, xout = 0.5)$y) ## ---------------------------------------------------------------------------- par(mfrow=c(1, 2)) plot(mod, what = "scatterplot", main = TRUE) abline(v = threshold1, lty = 2) plot(mod, what = "classification", main = TRUE) abline(v = threshold1, lty = 2) ## ---------------------------------------------------------------------------- set.seed(1) threshold <- seq(0.1, 0.9, by = 0.05) ngrid <- length(threshold) cv <- data.frame(threshold, error = numeric(ngrid)) cverr <- cvMclustDA(mod, verbose = FALSE) for (i in seq(threshold)) { cv$error[i] <- classError(ifelse(cverr$z[, 2] > threshold[i], "M", "B"), Class)$errorRate } min(cv$error) threshold[which.min(cv$error)] ggplot(cv, aes(x = threshold, y = error)) + geom_point() + geom_line() + scale_x_continuous(breaks = seq(0, 1, by = 0.1)) + ylab("CV misclassification error") + xlab("Probability threshold of malignant (M) tumor class") ## ---------------------------------------------------------------------------- set.seed(1) priorProb <- seq(0.1, 0.9, by = 0.05) ngrid <- length(priorProb) cv_error2 <- data.frame(priorProb, cv = numeric(ngrid), lower = numeric(ngrid), upper = numeric(ngrid)) for (i in seq(priorProb)) { cv <- cvMclustDA(mod, prop = c(1-priorProb[i], priorProb[i]), verbose = FALSE) cv_error2$cv[i] <- cv$ce cv_error2$lower[i] <- cv$ce - cv$se.ce cv_error2$upper[i] <- cv$ce + cv$se.ce } min(cv_error2$cv) priorProb[which.min(cv_error2$cv)] ggplot(cv_error2, aes(x = priorProb, y = cv)) + geom_point() + geom_linerange(aes(ymin = lower, ymax = upper)) + scale_x_continuous(breaks = seq(0, 1, by = 0.1)) + ylab("CV misclassification error") + xlab("Malignant (M) tumor class prior probability") ## ---------------------------------------------------------------------------- n <- 200 pars <- list(pro = c(0.5, 0.5), mean = matrix(c(-1, 1), nrow = 2, ncol = 2, byrow = TRUE), variance = mclustVariance("EII", d = 2, G = 2)) pars$variance$sigmasq <- 1 data <- sim("EII", parameters = pars, n = n, seed = 12) class <- data[, 1] X <- data[, -1] clPairs(X, class, symbols = c(1, 2)) # Randomly remove labels cl <- class; cl[sample(1:n, size = 195)] <- NA # table(cl, useNA = "ifany") clPairs(X, ifelse(is.na(cl), 0, class), symbols = c(0, 16, 17), colors = c("grey", 4, 2)) ## ---------------------------------------------------------------------------- mod <- MclustDA(X, class, modelType = "EDDA") mod_EDDA <- MclustDA(X[!is.na(cl), ], cl[!is.na(cl)], modelType = "EDDA", modelNames = "EII") mod_SSC <- MclustSSC(X, cl) ngrid = 100 xgrid = seq(-4, 4, length.out = ngrid) ygrid = seq(-5, 5, length.out = ngrid) xygrid = expand.grid(xgrid, ygrid) pred_mod <- predict(mod, newdata = xygrid) pred_EDDA <- predict(mod_EDDA, newdata = xygrid) pred_SSC <- predict(mod_SSC, newdata = xygrid) col = mclust.options("classPlotColors")[class] pch = class pch[!is.na(cl)] = ifelse(cl[!is.na(cl)] == 1, 19, 17) plot(X, pch = pch, col = col) contour(xgrid, ygrid, matrix(pred_mod$z[, 1], ngrid, ngrid), add = TRUE, levels = 0.5, drawlabels = FALSE, lwd = 2, col = "cyan2") contour(xgrid, ygrid, matrix(pred_EDDA$z[, 1], ngrid, ngrid), add = TRUE, levels = 0.5, drawlabels = FALSE, lty = 2, lwd = 2) contour(xgrid, ygrid, matrix(pred_SSC$z[, 1], ngrid, ngrid), add = TRUE, levels = 0.5, drawlabels = FALSE, lty = 3, lwd = 2) ## ---------------------------------------------------------------------------- data("olive", package = "pgmm") X <- olive[, 3:10] class <- factor(olive$Region, levels = 1:3, labels = c("South", "Sardinia", "North")) table(class) ## ---------------------------------------------------------------------------- mod_EDDA_full <- MclustDA(X, class, modelType = "EDDA") pred_EDDA_full <- predict(mod_EDDA_full, newdata = X) classError(pred_EDDA_full$classification, class)$errorRate BrierScore(pred_EDDA_full$z, class) ## ---------------------------------------------------------------------------- set.seed(20200807) pct_labeled_data <- 10 n <- nrow(X) cl <- class is.na(cl) <- sample(1:n, round(n*(1-pct_labeled_data/100))) table(cl, useNA = "ifany") ## ---------------------------------------------------------------------------- mod_SSC <- MclustSSC(X, cl) plot(mod_SSC, what = "BIC") mod_SSC$BIC pickBIC(mod_SSC$BIC, 5) - max(mod_SSC$BIC) # BIC diff for the top-5 models ## ---------------------------------------------------------------------------- summary(mod_SSC) ## ---------------------------------------------------------------------------- pred_SSC <- predict(mod_SSC, newdata = X[is.na(cl), ]) table(Predicted = pred_SSC$classification, Class = class[is.na(cl)]) classError(pred_SSC$classification, class[is.na(cl)])$errorRate BrierScore(pred_SSC$z, class[is.na(cl)]) ## ---------------------------------------------------------------------------- set.seed(20201013) pct_labeled_data <- c(5, seq(10, 90, by = 10), 95) BS <- matrix(as.double(NA), nrow = length(pct_labeled_data), ncol = 2, dimnames = list(pct_labeled_data, c("EDDA", "SSC"))) for (i in seq(pct_labeled_data)) { cl <- class labeled <- sample(1:n, round(n*pct_labeled_data[i]/100)) cl[-labeled] <- NA # Classification on labeled data mod_EDDA <- MclustDA(X[labeled, ], cl[labeled], modelType = "EDDA") # prediction for the unlabeled data pred_EDDA <- predict(mod_EDDA, newdata = X[-labeled, ]) BS[i, 1] <- BrierScore(pred_EDDA$z, class[-labeled]) # Semi-supervised classification mod_SSC <- MclustSSC(X, cl) # prediction for the unlabeled data pred_SSC <- predict(mod_SSC, newdata = X[-labeled, ]) BS[i, 2] <- BrierScore(pred_SSC$z, class[-labeled]) } BS matplot(pct_labeled_data, BS, type = "b", lty = 1, pch = c(19, 15), col = c(2, 4), xaxt = "n", xlab = "Percentage of labeled data", ylab = "Brier score") axis(side = 1, at = pct_labeled_data) abline(h = BrierScore(pred_EDDA_full$z, class), lty = 2) legend("topright", pch = c(19, 15), col = c(2, 4), lty = 1, legend = c("EDDA", "SSC"), inset = 0.02) # Chapter 5 ################################################################### ## ---------------------------------------------------------------------------- data("stamps1", package = "multimode") str(stamps1) Thickness <- stamps1$thickness ## ---------------------------------------------------------------------------- dens <- densityMclust(Thickness) ## ---------------------------------------------------------------------------- summary(dens, parameters = TRUE) ## ---------------------------------------------------------------------------- br <- seq(min(Thickness), max(Thickness), length.out = 21) plot(dens, what = "density", data = Thickness, breaks = br) ## ---------------------------------------------------------------------------- with(dens$parameters, data.frame(mean = mean, sd = sqrt(variance$sigmasq), CoefVar = sqrt(variance$sigmasq)/mean*100)) ## ---------------------------------------------------------------------------- x <- c(0.07, 0.08, 0.1, 0.12) predict(dens, newdata = x, what = "dens") predict(dens, newdata = x, logarithm = TRUE) predict(dens, newdata = x, what = "cdens") ## ---------------------------------------------------------------------------- predict(dens, newdata = x, what = "z") ## ---------------------------------------------------------------------------- Year <- stamps1$year table(Year) h1 <- hist(Thickness[Year == "1872"], breaks = br, plot = FALSE) h1$density <- h1$density*prop.table(table(Year))[1] h2 <- hist(Thickness[Year == "1873-1874"], breaks = br, plot = FALSE) h2$density <- h2$density*prop.table(table(Year))[2] x <- seq(min(Thickness)-diff(range(Thickness))/10, max(Thickness)+diff(range(Thickness))/10, length = 200) cdens <- predict(dens, x, what = "cdens") cdens <- sweep(cdens, 2, dens$parameters$pro, "*") col <- adjustcolor(mclust.options("classPlotColors")[1:2], alpha = 0.3) ylim <- range(h1$density, h2$density, cdens) plot(h1, xlab = "Thickness", freq = FALSE, main = "", border = "white", col = col[1], xlim = range(x), ylim = ylim) plot(h2, add = TRUE, freq = FALSE, border = "white", col = col[2]) matplot(x, cdens, type = "l", lwd = 1, lty = 1, col = 1, add = TRUE) box() legend("topright", legend = levels(Year), col = col, pch = 15, inset = 0.02, title = "Overprinted years:", title.adj = 0.2) ## ---------------------------------------------------------------------------- data("acidity", package = "BNPdensity") ## ---------------------------------------------------------------------------- summary(mclustBIC(acidity), k = 5) ## ---------------------------------------------------------------------------- dens_E2 <- densityMclust(acidity, G = 2, modelNames = "E") summary(dens_E2, parameters = TRUE) ## ---------------------------------------------------------------------------- mclustBootstrapLRT(acidity, modelName = "V") ## ---------------------------------------------------------------------------- dens_V3 <- densityMclust(acidity, G = 3, modelNames = "V") summary(dens_V3, parameters = TRUE) ## ---------------------------------------------------------------------------- plot(dens_E2, what = "density", ylim = c(0, max(dens_E2$density, dens_V3$density))) rug(acidity) plot(dens_V3, what = "density", ylim = c(0, max(dens_E2$density, dens_V3$density))) rug(acidity) ## ---------------------------------------------------------------------------- plot(dens_E2, what = "diagnostic", type = "cdf") plot(dens_E2, what = "diagnostic", type = "qq") ## ---------------------------------------------------------------------------- plot(dens_V3, what = "diagnostic", type = "cdf") plot(dens_V3, what = "diagnostic", type = "qq") ## ---------------------------------------------------------------------------- data("faithful", package = "datasets") plot(faithful) dens <- densityMclust(faithful) summary(dens, parameters = TRUE) ## ---------------------------------------------------------------------------- plot(faithful) ## ---------------------------------------------------------------------------- plot(dens, what = "dens") ## ---------------------------------------------------------------------------- plot(dens, what = "density", type = "image") ## ---------------------------------------------------------------------------- plot(dens, what = "density", type = "persp") ## ---------------------------------------------------------------------------- plot(dens, what = "density") ## ---------------------------------------------------------------------------- plot(dens, what = "density", type = "image") plot(dens, what = "density", type = "persp") ## ---------------------------------------------------------------------------- plot(dens, what = "density", type = "hdr") ## ---------------------------------------------------------------------------- data("aircraft", package = "sm") X <- log(subset(aircraft, subset = (Period == 3), select = 3:8)) PCA <- prcomp(X, scale = TRUE) summary(PCA) PCA$rotation # loadings Z <- PCA$x[, 1:3] # PCA projection ## ---------------------------------------------------------------------------- BIC <- mclustBIC(Z) summary(BIC, k = 5) ## ---------------------------------------------------------------------------- densAircraft <- densityMclust(Z, G = 3, modelNames = "VVE", plot = FALSE) plot(densAircraft, what = "density", type = "hdr", data = Z, points.cex = 0.5) ## ---------------------------------------------------------------------------- library("mclustAddons") ## ---------------------------------------------------------------------------- data("suicide", package = "mclustAddons") dens <- densityMclust(suicide) rug(suicide) # add data points at the bottom of the graph abline(v = 0, lty = 3) # draw a vertical line at the natural boundary ## ---------------------------------------------------------------------------- bdens <- densityMclustBounded(suicide, lbound = 0) summary(bdens, parameters = TRUE) plot(bdens, what = "density") rug(suicide) # add data points at the bottom of the graph abline(v = 0, lty = 3) # draw a vertical line at the natural boundary ## ---------------------------------------------------------------------------- plot(dens, what = "density") rug(suicide) abline(v = 0, lty = 3) # plot(bdens, what = "density") rug(suicide) abline(v = 0, lty = 3) ## ---------------------------------------------------------------------------- data("racial", package = "mclustAddons") bdens <- densityMclustBounded(racial$PropWhite, lbound = 0, ubound = 1) plot(bdens, what = "density", lwd = 2, col = "dodgerblue2", data = racial$PropWhite, breaks = 15, xlab = "Proportion of white students enrolled in schools") rug(racial$PropWhite) # add data points at the bottom of the graph abline(v = c(0, 1), lty = 3) # draw a vertical line at the natural boundary ## ---------------------------------------------------------------------------- f <- function(x) 0.7*dnorm(x, mean = 0, sd = 1) + 0.3*dnorm(x, mean = 4, sd = 1) curve(f, from = -4, to = 8) ## ---------------------------------------------------------------------------- set.seed(20190525) par <- list(pro = c(0.7, 0.3), mean = c(0, 4), variance = mclustVariance("E", G = 2)) par$variance$sigmasq <- c(1, 1) x <- sim(modelName = "E", parameters = par, n = 1e4)[, -1] ## ---------------------------------------------------------------------------- par(mfrow = c(2, 2)) prob <- c(0.25, 0.5, 0.75, 0.95) (hdr <- hdrlevels(f(x), prob)) for (j in seq(prob)) { curve(f, from = -4, to = 8) mtext(side = 3, paste0(prob[j]*100, "% HDR"), adj = 0) abline(h = hdr[j], lty = 2) rug(x, col = "lightgrey") rug(x[f(x) >= hdr[j]]) } ## ---------------------------------------------------------------------------- dens <- densityMclust(x, plot = FALSE) hdrlevels(predict(dens, x), prob) # Chapter 6 ################################################################### ## ---------------------------------------------------------------------------- data("Snapper", package = "FSAdata") x <- Snapper[,1] mod <- Mclust(x, G = 4, modelNames = "V") summary(mod, parameters = TRUE) ## ---------------------------------------------------------------------------- mclust1Dplot(x, what = "classification", parameters = mod$parameters, z = mod$z, xlab = "Fish length") mclust1Dplot(x, what = "uncertainty", parameters = mod$parameters, z = mod$z, xlab = "Fish length") ## ---------------------------------------------------------------------------- mod <- Mclust(faithful) mclust2Dplot(data = faithful, what = "classification", parameters = mod$parameters, z = mod$z) mclust2Dplot(data = faithful, what = "uncertainty", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- surfacePlot(data = faithful, parameters = mod$parameters, what = "density", type = "contour", transformation = "log") ## ---------------------------------------------------------------------------- surfacePlot(data = faithful, parameters = mod$parameters, what = "density", type = "image") ## ---------------------------------------------------------------------------- surfacePlot(data = faithful, parameters = mod$parameters, what = "density", type = "persp") ## ---------------------------------------------------------------------------- surfacePlot(data = faithful, parameters = mod$parameters, what = "uncertainty", type = "image", transformation = "sqrt") ## ---------------------------------------------------------------------------- data("iris", package = "datasets") mod <- Mclust(iris[,1:4], G = 3) summary(mod) ## ---------------------------------------------------------------------------- coordProj(data = iris[,1:4], dimens = c(2,4), what = "classification", parameters = mod$parameters, z = mod$z) coordProj(data = iris[,1:4], dimens = c(2,4), what = "uncertainty", parameters = mod$parameters, z = mod$z) coordProj(data = iris[,1:4], dimens = c(2,4), what = "error", parameters = mod$parameters, z = mod$z, truth = iris$Species) ## ---------------------------------------------------------------------------- coordProj(data = iris[,1:4], dimens = c(2,4), what = "classification", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- coordProj(data = iris[,1:4], dimens = c(2,4), what = "uncertainty", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- coordProj(data = iris[,1:4], dimens = c(2,4), what = "error", parameters = mod$parameters, z = mod$z, truth = iris$Species) ## ---------------------------------------------------------------------------- set.seed(1) nrow(iris) Q <- randomOrthogonalMatrix(ncol(iris[,1:4]), 2) dim(Q) QTQ <- crossprod(Q) # equivalently t(Q) %*% Q zapsmall(QTQ) # 2 x 2 identity matrix # projection of iris data onto coordinates Q irisProj <- as.matrix(iris[,1:4]) %*% Q ## ---------------------------------------------------------------------------- randProj(data = iris[,1:4], seeds = c(1,13,79,201), what = "classification", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- randProj(data = iris[,1:4], seed = 1, what = "classification", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- randProj(data = iris[,1:4], seed = 13, what = "classification", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- randProj(data = iris[,1:4], seed = 79, what = "classification", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- randProj(data = iris[,1:4], seed = 201, what = "classification", parameters = mod$parameters, z = mod$z) ## ---------------------------------------------------------------------------- plot(crimcoords(iris[,1:4], mod$classification)) ## ---------------------------------------------------------------------------- data("thyroid", package = "mclust") CRIMCOORDS <- crimcoords(thyroid[,-1], thyroid$Diagnosis) summary(CRIMCOORDS) ## ---------------------------------------------------------------------------- plot(CRIMCOORDS) points(CRIMCOORDS$means %*% CRIMCOORDS$basis, pch = 3, cex = 1.5, lwd = 2) legend("topright", legend = levels(thyroid$Diagnosis), inset = 0.02, col = mclust.options("classPlotColors")[1:3], pch = mclust.options("classPlotSymbols")[1:3]) ## ---------------------------------------------------------------------------- data("wine", package = "gclus") Class <- factor(wine$Class, levels = 1:3, labels = c("Barolo", "Grignolino", "Barbera")) X <- data.matrix(wine[,-1]) mod <- Mclust(X, G = 3, modelNames = "VVE") table(Class, Cluster = mod$classification) ## ---------------------------------------------------------------------------- drmod <- MclustDR(mod, lambda = 1) summary(drmod) ## ---------------------------------------------------------------------------- plot(drmod, what = "contour") ## ---------------------------------------------------------------------------- plot(drmod, what = "boundaries") ## ---------------------------------------------------------------------------- mod <- Mclust(iris[, 1:4], G = 3) drmod <- MclustDR(mod, lambda = .5) summary(drmod) ## ---------------------------------------------------------------------------- plot(drmod, what = "evalues") ## ---------------------------------------------------------------------------- plot(drmod, what = "pairs") ## ---------------------------------------------------------------------------- sdrmod <- MclustDRsubsel(drmod, verbose = TRUE) summary(sdrmod) ## ---------------------------------------------------------------------------- zapsmall(cor(drmod$dir, sdrmod$dir)^2) ## ---------------------------------------------------------------------------- plot(sdrmod, what = "contour", nlevels = 7) ## ---------------------------------------------------------------------------- plot(sdrmod, what = "classification") ## ---------------------------------------------------------------------------- plot(sdrmod, what = "boundaries") ## ---------------------------------------------------------------------------- plot(sdrmod, what = "density") ## ---------------------------------------------------------------------------- data("banknote", package = "mclust") mod <- MclustDA(data = banknote[, -1], class = banknote$Status) summary(mod) ## ---------------------------------------------------------------------------- drmod <- MclustDR(mod, lambda = .5) summary(drmod) ## ---------------------------------------------------------------------------- plot(drmod, what = "evalues") ## ---------------------------------------------------------------------------- plot(drmod, what = "pairs", lower.panel = NULL) clPairsLegend(0.1, 0.4, class = levels(drmod$classification), col = mclust.options("classPlotColors")[1:2], pch = mclust.options("classPlotSymbols")[1:2], title = "Swiss banknote data") ## ---------------------------------------------------------------------------- sdrmod <- MclustDRsubsel(drmod, verbose = TRUE) summary(sdrmod) ## ---------------------------------------------------------------------------- zapsmall(cor(drmod$dir, sdrmod$dir)^2) ## ---------------------------------------------------------------------------- plot(sdrmod, what = "contour", nlevels = 15) plot(sdrmod, what = "classification") ## ---------------------------------------------------------------------------- mod <- Mclust(faithful) DF <- data.frame(mod$data, cluster = factor(mod$classification)) library("ggplot2") ggplot(DF, aes(x = eruptions, y = waiting, colour = cluster, shape = cluster)) + geom_point() ## ---------------------------------------------------------------------------- library("tidyr") DF <- data.frame(mod$BIC[], G = 1:nrow(mod$BIC)) DF <- pivot_longer(DF, cols = 1:14, names_to = "Model", values_to = "BIC") DF$Model <- factor(DF$Model, levels = mclust.options("emModelNames")) ggplot(DF, aes(x = G, y = BIC, colour = Model, shape = Model)) + geom_point() + geom_line() + scale_shape_manual(values = mclust.options("bicPlotSymbols")) + scale_color_manual(values = mclust.options("bicPlotColors")) + scale_x_continuous(breaks = unique(DF$G)) + xlab("Number of mixture components") + guides(shape = guide_legend(ncol=2)) ## ---------------------------------------------------------------------------- mod <- Mclust(iris[, 1:4], G = 3) ## ---------------------------------------------------------------------------- means <- data.frame(Profile = 1:mod$G, t(mod$parameters$mean)) means <- pivot_longer(means, cols = -1, names_to = "Variable", values_to = "Mean") means$Profile <- factor(means$Profile) means$Variable <- factor(means$Variable, levels = rownames(mod$parameters$mean)) means ggplot(means, aes(Variable, Mean, group = Profile, shape = Profile, color = Profile)) + geom_point(size = 2) + geom_line() + labs(x = NULL, y = "Latent profiles means") + scale_color_manual(values = mclust.options("classPlotColors")) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") ## ---------------------------------------------------------------------------- damod <- MclustDA(iris[, 1:4], iris$Species) drmod <- MclustDR(damod) DF1 <- data.frame(drmod$dir[, 1:2], class = damod$class) DF2 <- do.call("rbind", by(DF1, DF1[, 3], function(x) x[chull(x), ])) ggplot() + geom_point(data = DF1, aes(x = Dir1, y = Dir2, color = class, shape = class)) + geom_polygon(data = DF2, aes(x = Dir1, y = Dir2, fill = class), alpha = 0.3) + scale_color_manual(values = mclust.options("classPlotColors")) + scale_fill_manual(values = mclust.options("classPlotColors")) + scale_shape_manual(values = mclust.options("classPlotSymbols")) ## ---------------------------------------------------------------------------- mod <- densityMclust(faithful$waiting, plot = FALSE) x <- extendrange(faithful$waiting, f = 0.1) x <- seq(x[1], x[2], length.out = 101) pred <- data.frame(x, density = predict(mod, newdata = x)) ggplot(faithful, aes(waiting)) + geom_histogram(aes(y = stat(density)), bins = 15, fill = "slategray3", colour = "grey92") + geom_line(data = pred, aes(x, density)) ## ---------------------------------------------------------------------------- data("hemophilia", package = "rrcov") X <- hemophilia[, 1:2] mod <- Mclust(X, G = 2, modelName = "VVV") boot <- MclustBootstrap(mod, nboot = 999, type = "bs") DF <- data.frame(mixcomp = rep(1:boot$G, each = boot$nboot), pro = as.vector(boot$pro)) ggplot(DF, aes(x = pro)) + geom_histogram(aes(y = stat(density)), bins = 15, fill = "slategray3", colour = "grey92") + facet_grid(~ mixcomp) + xlab("Mixing proportions") + ylab("Density of bootstrap distribution") ## ---------------------------------------------------------------------------- DF0 <- data.frame(rbind(cbind(mixcomp = 1, boot$mean[, , 1]), cbind(mixcomp = 2, boot$mean[, , 2]))) DF0 <- tidyr::pivot_longer(DF0, cols = 2:3, names_to = "variable", values_to = "mean") DF <- rbind( data.frame("mixcomp" = 1, "variable" = rep(colnames(boot$mean[, , 1]), each = dim(boot$mean)[1]), "mean" = as.vector(boot$mean[, , 1])), data.frame("mixcomp" = 2, "variable" = rep(colnames(boot$mean[, , 2]), each = dim(boot$mean)[1]), "mean" = as.vector(boot$mean[, , 2]))) ggplot(DF, aes(x = mean)) + geom_histogram(aes(y = stat(density)), bins = 15, fill = "slategray3", colour = "grey92") + facet_grid(mixcomp ~ variable, scales = "free_x") + xlab("Means of mixture") + ylab("Density of bootstrap distribution") ## ---------------------------------------------------------------------------- mclust.options("bicPlotColors") mclust.options("classPlotColors") ## ---------------------------------------------------------------------------- palette.colors(palette = "Okabe-Ito") ## ---------------------------------------------------------------------------- # get and save default palettes bicPlotColors <- mclust.options("bicPlotColors") classPlotColors <- mclust.options("classPlotColors") # set Okabe-Ito palette for use in mclust bicPlotColors_Okabe_Ito <- palette.colors(palette = "Okabe-Ito")[c(9, 1, 2:8, 2:6, 9, 1)] names(bicPlotColors_Okabe_Ito) <- names(bicPlotColors) classPlotColorsWong <- palette.colors(palette = "Okabe-Ito")[-1] mclust.options("bicPlotColors" = bicPlotColors_Okabe_Ito) mclust.options("classPlotColors" = classPlotColorsWong) ## ---------------------------------------------------------------------------- mod <- Mclust(iris[,3:4]) plot(mod, what = "BIC") plot(mod, what = "classification") ## ---------------------------------------------------------------------------- mclust.options("bicPlotColors" = bicPlotColors) mclust.options("classPlotColors" = classPlotColors) # Chapter 7 ################################################################### ## ---------------------------------------------------------------------------- data("chevron", package = "mclust") summary(chevron) noise <- with(chevron, class == "noise") X <- chevron[,2:3] plot(X, cex = 0.5) plot(X, cex = 0.5, col = ifelse(noise, "grey", "black"), pch = ifelse(noise, 3, 1)) ## ---------------------------------------------------------------------------- library("prabclus") nnc <- NNclean(X, k = 5) table(nnc$z) clPairs(X, nnc$z, colors = c("darkgrey", "black"), symbols = c(3, 1)) ## ---------------------------------------------------------------------------- modNoise <- Mclust(X, initialization = list(noise = (nnc$z == 0))) summary(modNoise$BIC) ## ---------------------------------------------------------------------------- summary(modNoise, parameters = TRUE) addmargins(table(chevron$class, modNoise$classification), 2) plot(modNoise, what = "classification") ## ---------------------------------------------------------------------------- clPairs(X, nnc$z, colors = c("darkgrey", "black"), symbols = c(3, 1)) ## ---------------------------------------------------------------------------- plot(modNoise, what = "classification") ## ---------------------------------------------------------------------------- hypvol(X) ## ---------------------------------------------------------------------------- library("cluster") ehull <- ellipsoidhull(as.matrix(X)) volume(ehull) modNoise.ehull <- Mclust(X, Vinv = 1/volume(ehull), initialization = list(noise = (nnc$z == 0))) summary(modNoise.ehull) tab <- table(chevron$class, modNoise.ehull$classification) addmargins(tab, 2) ## ---------------------------------------------------------------------------- library("geometry") chull <- convhulln(X, options = "FA") chull$vol ## ---------------------------------------------------------------------------- modNoise.chull <- Mclust(X, Vinv = 1/chull$vol, initialization = list(noise = (nnc$z == 0))) summary(modNoise.chull) tab <- table(chevron$class, modNoise.chull$classification) addmargins(tab, 2) ## ---------------------------------------------------------------------------- library("covRobust") nnve <- cov.nnve(X, k = 5) table(nnve$classification) ## ---------------------------------------------------------------------------- modNoise.nnve <- Mclust(X, initialization = list(noise = (nnve$classification == 0))) summary(modNoise.nnve$BIC) summary(modNoise.nnve) addmargins(table(chevron$class, modNoise.nnve$classification), 2) ## ---------------------------------------------------------------------------- set.seed(123) (Sigma <- array(c(2, 0.5, 0.5, 0.5, 1, 0, 0, 0.1, 2, -0.5, -0.5, 0.5), dim = c(2, 2, 3))) var.decomp <- sigma2decomp(Sigma) str(var.decomp) par <- list(pro = c(1/3, 1/3, 1/3), mean = cbind(c(0, 3), c(3, 0), c(-3, 0)), variance = var.decomp) data <- sim(par$variance$modelName, parameters = par, n = 200) noise <- matrix(runif(100, -10, 10), nrow = 50, ncol = 2) X <- rbind(data[, 2:3], noise) cluster <- c(data[, 1], rep(0, 50)) clPairs(X, ifelse(cluster == 0, 0, 1), colors = "black", symbols = c(16, 1), cex = c(0.5, 1)) ## ---------------------------------------------------------------------------- nnc <- NNclean(X, k = 5) modNoise <- Mclust(X, initialization = list(noise = (nnc$z == 0))) summary(modNoise$BIC) summary(modNoise, parameters = TRUE) ## ---------------------------------------------------------------------------- clPairs(X, ifelse(cluster == 0, 0, 1), colors = "black", symbols = c(16, 1), cex = c(0.5, 1)) ## ---------------------------------------------------------------------------- plot(modNoise, what = "classification") ## ---------------------------------------------------------------------------- plot(modNoise, what = "classification") table(cluster, Classification = modNoise$classification) ## ---------------------------------------------------------------------------- data("galaxies", package = "MASS") # now fix a typographical error in the data # see help("galaxies", package = "MASS") galaxies[78] <- 26960 galaxies <- galaxies / 1000 ## ---------------------------------------------------------------------------- set.seed(20190410) BIC <- NULL for(i in 1:50) { BIC0 <- mclustBIC(galaxies, verbose = FALSE, initialization = list(hcPairs = hcRandomPairs(galaxies))) BIC <- mclustBICupdate(BIC, BIC0) } summary(BIC, k = 5) plot(BIC) mod <- densityMclust(galaxies, x = BIC) summary(mod, parameters = TRUE) plot(mod, what = "density", data = galaxies, breaks = 11) rug(galaxies) ## ---------------------------------------------------------------------------- plot(BIC) ## ---------------------------------------------------------------------------- plot(mod, what = "density", data = galaxies, breaks = 11) rug(galaxies) ## ---------------------------------------------------------------------------- set.seed(20190411) BICp <- NULL for(i in 1:50) { BIC0p <- mclustBIC(galaxies, verbose = FALSE, prior = priorControl(), initialization = list(hcPairs = hcRandomPairs(galaxies))) BICp <- mclustBICupdate(BICp, BIC0p) } summary(BICp, k = 5) plot(BICp) modp <- densityMclust(galaxies, x = BICp) summary(modp, parameters = TRUE) plot(modp, what = "density", data = galaxies, breaks = 11) rug(galaxies) ## ---------------------------------------------------------------------------- plot(BICp) ## ---------------------------------------------------------------------------- plot(modp, what = "density", data = galaxies, breaks = 11) rug(galaxies) ## ---------------------------------------------------------------------------- defaultPrior(galaxies, G = 3, modelName = "V") ## ---------------------------------------------------------------------------- data("olive", package = "pgmm") # recode of labels for Region and Area Regions <- c("South", "Sardinia", "North") Areas <- c("North Apulia", "Calabria", "South Apulia", "Sicily", "Inland Sardinia", "Coastal Sardinia", "East Liguria", "West Liguria", "Umbria") olive$Region <- factor(olive$Region, levels = 1:3, labels = Regions) olive$Area <- factor(olive$Area, levels = 1:9, labels = Areas) with(olive, table(Area, Region)) ## ---------------------------------------------------------------------------- X <- scale(olive[, 3:10]) BIC <- mclustBIC(X, G = 1:15) summary(BIC) plot(BIC, legendArgs = list(x = "bottomright", ncol = 5)) BICp <- mclustBIC(X, G = 1:15, prior = priorControl()) summary(BICp) plot(BICp, legendArgs = list(x = "bottomright", ncol = 5)) ## ---------------------------------------------------------------------------- plot(BIC, legendArgs = list(x = "bottomright", ncol = 5)) ## ---------------------------------------------------------------------------- plot(BICp, legendArgs = list(x = "bottomright", ncol = 5)) ## ---------------------------------------------------------------------------- mod <- Mclust(X, x = BIC) summary(mod) table(Region = olive$Region, cluster = mod$classification) adjustedRandIndex(olive$Region, mod$classification) table(Area = olive$Area, cluster = mod$classification) adjustedRandIndex(olive$Area, mod$classification) ## ---------------------------------------------------------------------------- modp <- Mclust(X, x = BICp) summary(modp) table(Region = olive$Region, cluster = modp$classification) adjustedRandIndex(olive$Region, modp$classification) table(Area = olive$Area, cluster = modp$classification) adjustedRandIndex(olive$Area, modp$classification) ## ---------------------------------------------------------------------------- data("Baudry_etal_2010_JCGS_examples", package = "mclust") plot(ex4.1) ## ---------------------------------------------------------------------------- mod_ex4.1 <- Mclust(ex4.1) summary(mod_ex4.1) ## ---------------------------------------------------------------------------- CLUSTCOMBI <- clustCombi(mod_ex4.1) summary(CLUSTCOMBI) ## ---------------------------------------------------------------------------- par(mfrow = c(3, 2), mar = c(4, 4, 3, 1)) plot(CLUSTCOMBI, ex4.1, what = "classification") ## ---------------------------------------------------------------------------- plot(CLUSTCOMBI, what = "tree") plot(CLUSTCOMBI, what = "tree", type = "rectangle", yaxis = "step") ## ---------------------------------------------------------------------------- optimClust <- clustCombiOptim(CLUSTCOMBI, reg = 2, plot = TRUE) str(optimClust) ## ---------------------------------------------------------------------------- optimClust <- clustCombiOptim(CLUSTCOMBI, reg = 2, plot = TRUE) ## ---------------------------------------------------------------------------- data("faithful", package = "datasets") mod <- Mclust(faithful) clPairs(faithful, mod$classification) plot(as.densityMclust(mod), what = "density", add = TRUE) ## ---------------------------------------------------------------------------- GMMHD <- gmmhd(mod) summary(GMMHD) ## ---------------------------------------------------------------------------- plot(GMMHD, what = "mode") plot(GMMHD, what = "cores") plot(GMMHD, what = "clusters") ## ---------------------------------------------------------------------------- clPairs(faithful, mod$classification) plot(as.densityMclust(mod), what = "density", add = TRUE) ## ---------------------------------------------------------------------------- plot(GMMHD, what = "mode") ## ---------------------------------------------------------------------------- plot(GMMHD, what = "cores") ## ---------------------------------------------------------------------------- plot(GMMHD, what = "clusters") ## ---------------------------------------------------------------------------- data("yeast", package = "MixSAL") dup <- duplicated(yeast) sum(dup) # count replicated observations mod <- Mclust(yeast[!dup, -1]) summary(mod) adjustedRandIndex(yeast$Site[!dup], mod$classification) ## ---------------------------------------------------------------------------- GMMHD <- gmmhd(mod) summary(GMMHD) ## ---------------------------------------------------------------------------- table(yeast$Site[!dup], cluster = GMMHD$cluster) adjustedRandIndex(yeast$Site[!dup], GMMHD$cluster) ## ---------------------------------------------------------------------------- plot(GMMHD, what = "cores", col = c("grey50", "red3", "dodgerblue2"), pch = c(0, 17, 16)) surfacePlot(GMMHD$x, parameters = GMMHD$MclustDR$parameters, what = "density", type = "contour", nlevels = 15, col = "black", drawlabels = FALSE, add = TRUE) plot(GMMHD, what = "clusters", col = c("red3", "dodgerblue2"), pch = c(17, 16)) ## ---------------------------------------------------------------------------- data("Baudry_etal_2010_JCGS_examples", package = "mclust") mod <- Mclust(ex4.1) GMMHD <- gmmhd(mod) summary(GMMHD) optimClust <- clustCombiOptim(clustCombi(mod), reg = 2) table(GMMHD = GMMHD$cluster, CLUSTCOMBI = optimClust$cluster) ## ---------------------------------------------------------------------------- mod <- Mclust(faithful) sim0 <- sim(modelName = mod$modelName, parameters = mod$parameters, n = nrow(faithful), seed = 0) sim1 <- sim(modelName = mod$modelName, parameters = mod$parameters, n = nrow(faithful), seed = 1) ## ---------------------------------------------------------------------------- xlim <- range(c(faithful[, 1], sim0[, 2], sim1[, 2])) ylim <- range(c(faithful[, 2], sim0[, 3], sim1[, 3])) mclust2Dplot(data = sim0[, -1], parameters = mod$parameters, classification = sim0[, 1], xlim = xlim, ylim = ylim) mclust2Dplot(data = sim1[, -1], parameters = mod$parameters, classification = sim1[, 1], xlim = xlim, ylim = ylim) ## ---------------------------------------------------------------------------- head(sim0) ## ---------------------------------------------------------------------------- par <- list( pro = c(0.5, 0.3, 0.2), mean = matrix(c(0, 0, 3, 3, -4, 1), nrow = 2, ncol = 3), variance = sigma2decomp(matrix(c(1, 0.6, 0.6, 1.5), nrow = 2, ncol = 2), G = 3)) str(par) ## ---------------------------------------------------------------------------- sim <- sim(modelName = "EEI", parameters = par, n = 10000, seed = 123) cluster <- sim[, 1] x <- sim[, 2:3] ## ---------------------------------------------------------------------------- data("stlouis", package = "mix") x <- data.frame(stlouis[, -(1:3)], row.names = NULL) table(complete.cases(x)) apply(x, 2, function(x) prop.table(table(complete.cases(x)))) ## ---------------------------------------------------------------------------- library("ggplot2") df <- data.frame(obs = rep(1:nrow(x), times = ncol(x)), var = rep(colnames(x), each = nrow(x)), missing = as.vector(is.na(x))) ggplot(data = df, aes(x = var, y = obs)) + geom_tile(aes(fill = missing)) + scale_fill_manual(values = c("lightgrey", "black")) + labs(x = "Variables", y = "Observations") + theme_minimal() + theme(axis.text.x = element_text(margin = margin(b = 10))) + theme(axis.ticks.y = element_blank()) + theme(axis.text.y = element_blank()) ## ---------------------------------------------------------------------------- ggplot(data = df, aes(x = var, y = obs)) + geom_tile(aes(fill = missing)) + scale_fill_manual(values = c("lightgrey", "black")) + labs(x = "Variables", y = "Observations") + theme_minimal() + theme(axis.text.x = element_text(margin = margin(b = 10))) + theme(axis.ticks.y = element_blank()) + theme(axis.text.y = element_blank()) ## ---------------------------------------------------------------------------- ximp <- imputeData(x, seed = 123) ## ---------------------------------------------------------------------------- imputePairs(x, ximp) ## ---------------------------------------------------------------------------- imputePairs(x, ximp) ## ---------------------------------------------------------------------------- x <- as.matrix(iris[, 1:4]) mod <- Mclust(x, G = 3, modelNames = "EEE") table(iris$Species, mod$classification) adjustedRandIndex(iris$Species, mod$classification) mod$parameters$mean # component means ## ---------------------------------------------------------------------------- set.seed(20171211) isMissing <- sample(c(TRUE, FALSE), size = prod(dim(x)), replace = TRUE, prob = c(0.1, 0.9)) x[isMissing] <- NA table(cmpObs <- complete.cases(x)) ## ---------------------------------------------------------------------------- set.seed(20171212) nImp <- 100 muImp <- array(NA, c(ncol(x), 3, nImp)) clImp <- array(NA, c(nrow(x), nImp)) for(i in 1:nImp) { xImp <- imputeData(x, verbose = FALSE) modImp <- Mclust(xImp, G = 3, modelNames = "EEE", verbose = FALSE) if (i == 1) clImp[, i] <- modImp$classification mcl <- matchCluster(clImp[, 1], modImp$classification) clImp[, i] <- mcl$cluster muImp[, , i] <- modImp$parameters$mean[, mcl$ord] } # majority rule cl <- apply(clImp, 1, function(x) majorityVote(x)$majority) table(iris$Species, cl) adjustedRandIndex(iris$Species, cl) # pooled estimate of cluster means apply(muImp, 1:2, mean)