See the Wikipedia page on k-Means Clustering.

Clustering Irises

A demonstration of k-Means Clustering using the Iris flower data set

Imports for Distance quanta

import cats.implicits._
import axle._
import axle.quanta.Distance
import axle.jung.directedGraphJung
import edu.uci.ics.jung.graph.DirectedSparseGraph
import axle.quanta.UnitOfMeasurement

// Implicit converter for Distance quantities with Double magnitudes, built on a
// JUNG DirectedSparseGraph. The two imports are scoped to this block; presumably
// they supply the implicit instances that converterGraphK2 requires — confirm.
implicit val distanceConverter = {
  import spire.implicits.DoubleAlgebra
  import axle.algebra.modules.doubleRationalModule
  Distance.converterGraphK2[Double, DirectedSparseGraph]
}

Import the Irises data set

import axle.data.Irises
import axle.data.Iris
// Instantiate the Irises data set; its records are accessed later as irisesData.irises.
val irisesData = new Irises
// irisesData: axle.data.Irises = axle.data.Irises@2c89cc22

Make a 2-D Euclidean space implicitly available for clustering

import axle.ml.distance.Euclidean
import org.jblas.DoubleMatrix
import axle.jblas.linearAlgebraDoubleMatrix

// Implicit Euclidean space over jblas DoubleMatrix values with Double distances.
// The spire algebra imports are scoped to this block only.
implicit val space = {
  import spire.implicits.IntAlgebra
  import spire.implicits.DoubleAlgebra
  // NOTE(review): the argument 2 is presumably the vector dimension (the "2-D" space);
  // this local implicit is presumably what the Euclidean constructor below picks up — confirm.
  implicit val inner = axle.jblas.rowVectorInnerProductSpace[Int, Int, Double](2)
  Euclidean[DoubleMatrix, Double]
}

Build a classifier of irises based on sepal length and width using the K-Means algorithm

import axle.ml.KMeans
import axle.ml.PCAFeatureNormalizer
import distanceConverter.cm
import spire.implicits.DoubleAlgebra
// Maps an Iris to its two features, in order: sepal length and sepal width,
// each converted to cm and reduced to its Double magnitude.
val irisFeaturizer = (flower: Iris) => {
  val sepalLengthCm = (flower.sepalLength in cm).magnitude.toDouble
  val sepalWidthCm = (flower.sepalWidth in cm).magnitude.toDouble
  List(sepalLengthCm, sepalWidthCm)
}
// irisFeaturizer: axle.data.Iris => List[Double] = <function1>

// Pre-applies 0.98 as the first argument of PCAFeatureNormalizer, leaving a
// function that still expects the DoubleMatrix.
val normalizer = {
  val curriedNormalizer = (PCAFeatureNormalizer[DoubleMatrix] _).curried
  curriedNormalizer(0.98)
}
// normalizer: org.jblas.DoubleMatrix => axle.ml.PCAFeatureNormalizer[org.jblas.DoubleMatrix] = <function1>

// Build the k-means classifier from the iris records, the featurizer and the normalizer.
// N = 2 matches the two values produced per iris by irisFeaturizer.
// NOTE(review): K = 3 is presumably the number of clusters — confirm against KMeans.
val classifier = KMeans[Iris, List[Iris], List[Seq[Double]], DoubleMatrix](
    irisesData.irises,
    N = 2,
    irisFeaturizer,
    normalizer,
    K = 3,
    iterations = 20)
// classifier: axle.ml.KMeans[axle.data.Iris,List[axle.data.Iris],List[Seq[Double]],org.jblas.DoubleMatrix] = <function1>

Produce a “confusion matrix”

import axle.ml.ConfusionMatrix
import spire.implicits.IntAlgebra
// Build a confusion matrix from the classifier, all irises (as a Vector), and each
// iris's species label (`_.species`).
// NOTE(review): `0 to 2` presumably enumerates the three cluster ids (K = 3) — confirm.
val confusion = ConfusionMatrix[Iris, Int, String, Vector[Iris], DoubleMatrix, Vector[(String, Int)], Vector[String]](
  classifier,
  irisesData.irises.toVector,
  _.species,
  0 to 2)
// confusion: axle.ml.ConfusionMatrix[axle.data.Iris,Int,String,Vector[axle.data.Iris],org.jblas.DoubleMatrix,Vector[(String, Int)],Vector[String]] = ConfusionMatrix(<function1>,Vector(Iris(UnittedQuantity(5.1,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(3.5,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(1.4,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(0.2,UnitOfMeasurement(centimeter,cm,None)),Iris-setosa), Iris(UnittedQuantity(4.9,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(3.0,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(1.4,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(0.2,UnitOfMeasurement(centimeter,cm,None)),Iris-setosa), Iris(UnittedQuantity(4.7,UnitOfMeasurement(centimeter,cm,None)),UnittedQuantity(3.2,UnitOfMeasurement(cen...

// Render the confusion matrix as text: one row per species with its row total after
// the colon, followed by a final line of column totals.
string(confusion)
// res2: String =
// "  1   0  49 :  50 Iris-setosa
//  34  16   0 :  50 Iris-versicolor
//  16  34   0 :  50 Iris-virginica
// 
//  51  50  49
// "

Visualize the final (two dimensional) centroid positions

import axle.web._
import axle.visualize.KMeansVisualization
import axle.visualize.Color._
// Three colors, passed to both the centroid visualization and the distance plot.
val colors = Vector(red, blue, green)
// colors: scala.collection.immutable.Vector[axle.visualize.Color] = Vector(Color(255,0,0), Color(0,0,255), Color(0,255,0))

// Visualization of the trained classifier, drawn with the colors defined above.
val vis = KMeansVisualization(classifier, colors)
// vis: axle.visualize.KMeansVisualization[axle.data.Iris,List[axle.data.Iris],List[Seq[Double]],org.jblas.DoubleMatrix] = KMeansVisualization(<function1>,Vector(Color(255,0,0), Color(0,0,255), Color(0,255,0)),600,600,50,10,Courier New,12)

// Render the visualization as SVG into kmeans.svg.
svg(vis, "kmeans.svg")

kmeans

Average distance to centroid, per cluster, vs. iteration:

import scala.collection.immutable.TreeMap
import axle.visualize._
// Plot of classifier.distanceLogSeries; the data is passed as a thunk.
// NOTE(review): xAxis takes a Double (0d) and yAxis an Int (0), consistent with the
// Plot[Int,Int,Double,...] type; presumably xAxis is the y-value at which the x axis
// is drawn and yAxis the x-value for the y axis — confirm against Plot.
val plot = Plot(
  () => classifier.distanceLogSeries,
  connect = true,
  drawKey = true,
  colorOf = colors,
  title = Some("KMeans Mean Centroid Distances"),
  xAxis = Some(0d),
  xAxisLabel = Some("step"),
  yAxis = Some(0),
  yAxisLabel = Some("average distance to centroid"))
// plot: axle.visualize.Plot[Int,Int,Double,scala.collection.immutable.TreeMap[Int,Double]] = Plot(<function0>,true,true,700,600,50,4,20,50,80,Courier New,12,false,Palatino,20,Vector(Color(255,0,0), Color(0,0,255), Color(0,255,0)),Some(KMeans Mean Centroid Distances),None,Some(0.0),Some(step),Some(0),Some(average distance to centroid))

import axle.web._
// import axle.web._

// Render the distance plot as SVG into kmeansvsiteration.svg.
svg(plot, "kmeansvsiteration.svg")

kmeans