Imports

import axle.data.FederalistPapers._

The Federalist articles:

articles.size
// res0: Int = 86

Construct a Corpus object to assist with content analysis:

import axle.nlp._
// import axle.nlp._

import axle.nlp.language.English
// import axle.nlp.language.English

// Corpus built from the raw text of every article, analysed with the English language rules.
val corpus = {
  val texts = articles.map(article => article.text)
  Corpus(texts, English)
}
// corpus: axle.nlp.Corpus =
// Corpus(List(
// 
// To the People of the State of New York:
// 
// AFTER an unequivocal experience of the inefficacy of the
// subsisting federal government, you are called upon to deliberate on
// a new Constitution for the United States of America. The subject
// speaks its own importance; comprehending in its consequences
// nothing less than the existence of the UNION, the safety and welfare
// of the parts of which it is composed, the fate of an empire in many
// respects the most interesting in the world. It has been frequently
// remarked that it seems to have been reserved to the people of this
// country, by their conduct and example, to decide the important
// question, whether societies of men are really capable or not of
// establishing good government from reflection and choice, or whether...

Define a feature extractor using top words and bigrams.

// Words that become the per-word features, selected by the corpus with a threshold of 100.
// NOTE(review): presumably the threshold is total occurrences across all articles — confirm in Corpus.
val frequentWords = corpus.wordsMoreFrequentThan(100)
// frequentWords: List[String] = List(the, of, to, and, in, a, be, that, it, is, which, by, as, this, would, have, will, for, or, not, their, with, from, are, on, they, an, states, government, may, been, state, all, but, its, other, people, power, has, no, more, at, if, than, any, them, one, those, there, can, we, constitution, these, must, who, such, so, most, upon, i, his, should, union, every, same, national, was, against, new, might, federal, under, our, were, into, only, great, authority, had, public, ought, some, executive, shall, general, powers, between, time, men, each, united, could, what, less, number, part, members, first, body, us, particular, two, he, different, without, legislative, laws, necessary, very, well, either, representatives, made, legislature, own, subject, senate...

// Word pairs that become the per-bigram features: the corpus's top 200 bigrams.
// NOTE(review): presumably ranked by frequency — confirm against Corpus.topKBigrams.
val topBigrams = corpus.topKBigrams(200)
// topBigrams: List[(String, String)] = List((of,the), (to,the), (in,the), (to,be), (that,the), (it,is), (by,the), (of,a), (the,people), (on,the), (would,be), (will,be), (for,the), (from,the), (the,state), (may,be), (have,been), (and,the), (the,same), (in,a), (with,the), (the,union), (has,been), (of,their), (the,states), (of,this), (the,constitution), (as,the), (the,federal), (the,government), (power,of), (the,national), (the,most), (the,other), (which,the), (all,the), (the,united), (to,a), (united,states), (the,executive), (it,will), (the,public), (is,to), (ought,to), (in,this), (the,power), (and,to), (must,be), (be,the), (it,may), (people,of), (against,the), (it,would), (is,the), (the,convention), (upon,the), (the,latter), (of,its), (to,have), (of,them), (is,a), (part,of), (that,it), (th...

// Length of every feature vector: one slot per frequent word plus one slot per top bigram.
val numDimensions = topBigrams.length + frequentWords.length
// numDimensions: Int = 403

/**
 * Converts an article into its numeric feature vector.
 *
 * The text is lower-cased and tokenized with the English tokenizer, then words and
 * bigrams are tallied. The result holds one entry per element of `frequentWords`
 * followed by one entry per element of `topBigrams`; each entry is the tallied
 * count plus 0.1.
 *
 * NOTE(review): looking up a word or bigram that does not occur in this article
 * relies on the tally map supplying a value for missing keys — confirm in axle's `tally`.
 *
 * @param fp the article to extract features from
 * @return the word features followed by the bigram features
 */
def featureExtractor(fp: Article): List[Double] = {
  import axle.enrichGenSeq
  import spire.implicits.LongAlgebra

  val lowered = fp.text.toLowerCase
  val tokens = English.tokenize(lowered)

  val wordTally = tokens.tally[Long]
  val bigramTally = bigrams(tokens).tally[Long]

  // Every count is offset by 0.1 (presumably so that no feature is exactly zero).
  val perWord = for (word <- frequentWords) yield wordTally(word) + 0.1
  val perBigram = for (pair <- topBigrams) yield bigramTally(pair) + 0.1

  perWord ::: perBigram
}
// featureExtractor: (fp: axle.data.FederalistPapers.Article)List[Double]

Place a MetricSpace implicitly in scope that defines the space in which to measure similarity of Articles.

import spire.implicits._
import spire.algebra._
import axle.ml.distance._
import axle.ml.distance.Euclidean
import org.jblas.DoubleMatrix
import axle.jblas.linearAlgebraDoubleMatrix

// Implicit metric space over jblas DoubleMatrix values with Double distances; it is
// placed in implicit scope so later calls that need a distance measure can find it.
implicit val space = {
  // Bring the Int and Double algebra instances into this block's scope explicitly.
  import spire.implicits.IntAlgebra
  import spire.implicits.DoubleAlgebra
  // Inner product space sized to numDimensions; declared implicit so that the
  // Euclidean construction below can pick it up.
  // NOTE(review): presumably vectors are 1 x numDimensions rows — confirm in axle.jblas.
  implicit val inner = axle.jblas.rowVectorInnerProductSpace[Int, Int, Double](numDimensions)
  Euclidean[DoubleMatrix, Double]
}

Create 4 clusters using k-Means:

import axle.ml.KMeans
import axle.ml.PCAFeatureNormalizer
import spire.implicits.DoubleAlgebra
// Fix PCAFeatureNormalizer's first argument at 0.98, leaving a function from a
// DoubleMatrix to a PCAFeatureNormalizer[DoubleMatrix].
// NOTE(review): 0.98 is presumably the PCA variance-retention cutoff — confirm in axle.ml.
val normalizer = (PCAFeatureNormalizer[DoubleMatrix] _).curried.apply(0.98)
// normalizer: org.jblas.DoubleMatrix => axle.ml.PCAFeatureNormalizer[org.jblas.DoubleMatrix] = <function1>

// Build the k-means classifier over all articles.
//   N          - feature-vector length; must equal what featureExtractor returns
//                (frequentWords.size + topBigrams.size entries)
//   K          - number of clusters requested (4)
//   iterations - passed as 100; presumably the number of refinement passes
//                (confirm against axle.ml.KMeans)
// featureExtractor and normalizer are handed over as functions.
// NOTE(review): how and when KMeans applies the normalizer is not visible here.
val classifier = KMeans[Article, List[Article], List[Seq[Double]], DoubleMatrix](
    articles,
    N = numDimensions,
    featureExtractor,
    normalizer,
    K = 4,
    iterations = 100)
// classifier: axle.ml.KMeans[axle.data.FederalistPapers.Article,List[axle.data.FederalistPapers.Article],List[Seq[Double]],org.jblas.DoubleMatrix] = <function1>

Show cluster vs author in a confusion matrix:

import cats.implicits._
import axle.ml.ConfusionMatrix
//import spire.implicits.IntAlgebra
import axle.string
// Cross-tabulate each article's author (`_.author`) against the cluster that the
// classifier assigns to it.
// `0 to 3` lists the cluster labels; presumably it must stay in sync with the
// K = 4 passed to KMeans (confirm against axle.ml.ConfusionMatrix).
val confusion = ConfusionMatrix[Article, Int, String, Vector[Article], DoubleMatrix, Vector[(String, Int)], Vector[String]](
  classifier,
  articles.toVector,
  _.author,
  0 to 3)
// confusion: axle.ml.ConfusionMatrix[axle.data.FederalistPapers.Article,Int,String,Vector[axle.data.FederalistPapers.Article],org.jblas.DoubleMatrix,Vector[(String, Int)],Vector[String]] =
// ConfusionMatrix(<function1>,Vector(Article(1,HAMILTON,
// 
// To the People of the State of New York:
// 
// AFTER an unequivocal experience of the inefficacy of the
// subsisting federal government, you are called upon to deliberate on
// a new Constitution for the United States of America. The subject
// speaks its own importance; comprehending in its consequences
// nothing less than the existence of the UNION, the safety and welfare
// of the parts of which it is composed, the fate of an empire in many
// respects the most interesting in the world. It has been frequently
// remarked that it seems to have been reserved to the people...

string(confusion)
// res3: String =
// "22  8 22  0 : 52 HAMILTON
//  0  0  3  0 :  3 HAMILTON AND MADISON
//  1  0  4  0 :  5 JAY
//  0  0 11  0 : 11 HAMILTON OR MADISON
//  0  8  6  1 : 15 MADISON
// 
// 23 16 46  1
// "