Introduction

An R package implementing a Projection Pursuit algorithm based on finite Gaussian mixture models for density estimation, using Genetic Algorithms to maximise an approximated Negentropy index. The ppgmmga algorithm provides a method to visualise high-dimensional data in a lower-dimensional space.

library(ppgmmga)
##    ___  ___  ___ ___ _  __ _  ___ ____ _
##   / _ \/ _ \/ _ `/  ' \/  ' \/ _ `/ _ `/
##  / .__/ .__/\_, /_/_/_/_/_/_/\_, /\_,_/ 
## /_/  /_/   /___/            /___/       version 1.0

Banknote data

library(mclust)
## Package 'mclust' version 5.4.1
## Type 'citation("mclust")' for citing this R package in publications.
data("banknote")
X <- banknote[,-1]
Class <- banknote$Status
table(Class)
## Class
## counterfeit     genuine 
##         100         100
clPairs(X, classification = Class)

plot of chunk unnamed-chunk-2

ppgmmga

1-dimensional ppgmmga

pp1D <- ppgmmga(data = X, d = 1, approx = "UT", seed = 1)
pp1D
## Call:
## ppgmmga(data = X, d = 1, approx = "UT", seed = 1)
## 
## 'ppgmmga' object containing: 
## [1] "data"       "d"          "approx"     "GMM"        "GA"        
## [6] "Negentropy" "basis"      "Z"
summary(pp1D)
## ── ppgmmga ───────────────────────────── 
## 
## Data dimensions               = 200 x 6 
## Data transformation           = center & scale 
## Projection subspace dimension = 1 
## GMM density estimate          = (VEE,4)
## Negentropy approximation      = UT 
## GA optimal negentropy         = 0.6345935 
## GA encoded basis solution: 
##            x1        x2       x3       x4       x5
## [1,] 0.127318 0.7685482 2.090227 2.828462 2.609874
## 
## Estimated projection basis: 
##                  PP1
## Length   -0.01196616
## Left     -0.09347799
## Right     0.16021148
## Bottom    0.57406937
## Top       0.34503416
## Diagonal -0.71892054
plot(pp1D)

plot of chunk unnamed-chunk-3

plot(pp1D, class = Class)

plot of chunk unnamed-chunk-3

2-dimensional ppgmmga

pp2D <- ppgmmga(data = X, d = 2, approx = "UT", seed = 1)
summary(pp2D, check = TRUE)
## ── ppgmmga ───────────────────────────── 
## 
## Data dimensions               = 200 x 6 
## Data transformation           = center & scale 
## Projection subspace dimension = 2 
## GMM density estimate          = (VEE,4)
## Negentropy approximation      = UT 
## GA optimal negentropy         = 1.13624 
## GA encoded basis solution: 
##            x1      x2       x3       x4        x5       x6        x7
## [1,] 1.972386 2.97614 1.231153 1.590697 0.3159946 3.859221 0.9727494
##            x8       x9      x10
## [1,] 1.114576 0.167687 1.683149
## 
## Estimated projection basis: 
##                   PP1         PP2
## Length   -0.044409969 -0.06764246
## Left      0.018859484 -0.12241338
## Right    -0.147571751  0.07854225
## Bottom    0.003090167  0.86813545
## Top      -0.054869445  0.46843261
## Diagonal  0.986343980  0.03438478
## 
## Monte Carlo Negentropy approximation check: 
##                            UT
## Approx Negentropy 1.136240193
## MC Negentropy     1.134698923
## MC se             0.003533358
## Relative accuracy 1.001358307
summary(pp2D$GMM)
## ------------------------------------------------------- 
## Density estimation via Gaussian finite mixture modeling 
## ------------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 4
## components: 
## 
##  log.likelihood   n df       BIC       ICL
##       -1191.595 200 51 -2653.405 -2666.898
## 
## Clustering table:
##  1  2  3  4 
## 16 99 47 38
plot(pp2D$GA)

plot of chunk unnamed-chunk-4

plot(pp2D)

plot of chunk unnamed-chunk-4

plot(pp2D, class = Class, drawAxis = FALSE)

plot of chunk unnamed-chunk-4

3-dimensional ppgmmga

gmm <- densityMclust(data = scale(X, center = TRUE, scale = FALSE), G = 2)
pp3D <- ppgmmga(data = X, d = 3, 
                center = TRUE, scale = FALSE, gmm = gmm, 
                gatype = "gaisl", 
                options = ppgmmga.options(numIslands = 2),
                seed = 1)
summary(pp3D$GA)
## ── Islands Genetic Algorithm ─────────── 
## 
## GA settings: 
## Type                  =  real-valued 
## Number of islands     =  2 
## Islands pop. size     =  50 
## Migration rate        =  0.1 
## Migration interval    =  10 
## Elitism               =  1 
## Crossover probability =  0.8 
## Mutation probability  =  0.1 
## Search domain = 
##             x1       x2       x3       x4       x5       x6       x7
## lower 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## upper 6.283185 3.141593 3.141593 3.141593 3.141593 6.283185 3.141593
##             x8       x9      x10      x11      x12      x13      x14
## lower 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## upper 3.141593 3.141593 3.141593 6.283185 3.141593 3.141593 3.141593
##            x15
## lower 0.000000
## upper 3.141593
## 
## GA results: 
## Iterations              = 120 
## Epochs                  = 12 
## Fitness function values = 0.8572447 0.8572447 
## Solutions = 
##            x1       x2        x3        x4        x5       x6        x7
## [1,] 1.851015 1.178840 0.9667885 0.5621497 0.4874974 4.149389 0.9175687
## [2,] 1.851015 1.178839 0.9667894 0.5621510 0.4874959 4.149388 0.9175686
##            x8      x9     x10       x11       x12      x13      x14
## [1,] 1.951967 1.57328 2.37529 0.6113169 0.8308838 1.753269 1.942719
## [2,] 1.951967 1.57328 2.37529 0.6113156 0.8308834 1.753269 1.942719
##           x15
## [1,] 1.541154
## [2,] 1.541153
plot(pp3D$GA)

plot of chunk unnamed-chunk-5

plot(pp3D)

plot of chunk unnamed-chunk-5

plot(pp3D, class = Class)

plot of chunk unnamed-chunk-5

plot(pp3D, dim = c(1,2))

plot of chunk unnamed-chunk-5

plot(pp3D, dim = c(1,3), class = Class)

plot of chunk unnamed-chunk-5

References

Scrucca, L. and Serafini, A. (2018) Projection pursuit based on Gaussian mixtures and evolutionary algorithms. Under review.