Data/Datamining/Clustering/Gsom.hs

------------------------------------------------------------------------------
-- |
-- Module       : Data.Datamining.Clustering.Gsom
-- Copyright    : (c) 2009 Stephan Günther
-- License      : BSD3
--
-- Maintainer   : gnn.github@gmail.com
-- Stability    : experimental
-- Portability  : non-portable (requires STM)
--
-- This module should contain everything you need to run the GSOM clustering
-- algorithm. It collects and re-exports all important and needed functions
-- from moduls lower in the hirarchy.
--
-- Ideally you should never need to look at those modules. If you do need
-- to do this, it is a design failure and I would appreciate it if you
-- would drop me a note.
------------------------------------------------------------------------------

module Data.Datamining.Clustering.Gsom (
-- * Algorithm Input
-- | The GSOM algorithm expects a list of input vectors as its input.
-- These input vectors are just lists of @'Double'@s.
  Input, Inputs

-- ** Input processing
, dimension, Bounds, bounds, normalize, unnormalize

-- ** Operations between input vectors
, distance, (*.), (.*), (<+>), (<->)

-- ** Auxiliary Types
, Coordinates

-- * The map created by GSOM
-- | The growing self organizing map builds, as its name suggests, a map
-- representing the input vectors. This map is of type @'Lattice'@ and is
-- a network of @'Node'@s.

-- ** The @'Node'@s of the map
, Node(..), Nodes, Neighbours, Neighbourhood

-- *** Node Creation
, node

-- *** Modification
, propagate, update, updateError

-- *** Querying
, isLeaf, isNode, neighbourhood, unwrappedNeighbours

-- *** Debugging
, putNode

-- ** The map type
, Lattice

-- *** Creating initial lattices
, newCentered, newRandom

-- *** Querying a lattice
, L.bmu, nodes

-- *** Debugging and Output
, putLattice, putWeights, dumpInputs, renderScript

-- * Running GSOM
-- | The GSOM algorithm builds the map by sequentially @'run'@ning a given
-- number of @'Phases'@ over the input.

-- ** The Phase type
, Phase(..), Phases, Kernel(..), LearningRate(..)

-- *** Predefined phases
, defaultFirst, defaultSecond, defaultThird, defaults

-- *** Running a phase/phases
, phase, run

-- * Constructing a clustering from the map
-- ** The Clustering type
, Cluster(..), Clustering

-- ** Construction and Query
, cluster, clustering, nearestCluster

) where

------------------------------------------------------------------------------
-- Standard modules
------------------------------------------------------------------------------

import Control.Concurrent.STM
import Control.Monad
import Data.Function
import Data.List
import Data.Map(Map(..))
import qualified Data.Map as Map
import Data.Ord

------------------------------------------------------------------------------
-- Private modules
------------------------------------------------------------------------------

import Data.Datamining.Clustering.Gsom.Coordinates
import Data.Datamining.Clustering.Gsom.Input
import Data.Datamining.Clustering.Gsom.Lattice hiding (bmu)
import qualified Data.Datamining.Clustering.Gsom.Lattice as L(bmu)
import Data.Datamining.Clustering.Gsom.Node
import Data.Datamining.Clustering.Gsom.Phase

------------------------------------------------------------------------------
-- Types
------------------------------------------------------------------------------

-- | The clusters generated by GSOM basically consist of three things:
data Cluster = Cluster {
  -- | the vector which best represents all the vectors belonging to this
  -- cluster.
  center :: Input
  -- | The indices of the input vectors belonging to this cluster.
  -- That means a cluster is always relative to a set of @'Inputs'@
, contents :: [Int]
  -- | the coordinates of this cluster
, coordinates :: Coordinates
} deriving (Read, Show)

-- | The final clustering which is the result of the GSOM algorithm
-- is a @'Data.Map'@ mapping @'Coordinates'@ to @'Cluster'@s.
type Clustering = Map Coordinates Cluster

------------------------------------------------------------------------------
-- Creation
------------------------------------------------------------------------------

-- | Computes a clustering induced by the given lattice.
--
-- @'clustering' lattice@ uses the @'weights'@ of the @'nodes'@ stored
-- in @lattice@ to generate 'cluster's and returns the @'Clustering'@
-- storing these 'cluster's. Each non leaf node @n@ in @lattice@
-- corresponds to one 'cluster' @c@ with @('coordinates' c = 'location'
-- n)@ and with @'center' c@ equal to the weight vector of @n@. Each
-- generated 'cluster'&#39;s 'contents' are empty. Use the 'cluster'
-- function with a set of inputs to obtain a clustering where each
-- 'Cluster'&#39;s 'contents' is a list of the indices of the input
-- points belonging to this 'cluster'.
clustering :: Lattice -> IO Clustering
clustering l = atomically $ do
  ns <- liftM (filter isNode) (nodes l)
  associations <- mapM (\n -> do
    ws <- readTVar $ weights n
    return $! (location n, ws)) ns
  return $! Map.fromList $ map (\(xy, w) ->
    (xy, Cluster {center = w, contents = [], coordinates = xy})) associations

-- | @'cluster' inputs clustering@ clusters the given @inputs@ according to
-- the centers of the clusters in @clustering@. That means for each input @i@
-- from @inputs@ the index of @i@ is added to the contents of the cluster
-- center to which @i@ has minimal distance.
-- TODO: Implement tiebreaker.
cluster :: Inputs -> Clustering -> Clustering
cluster is cs = foldl' f cs (zip [0..] is) where
  f cs (index, i) = let c = bmu i cs in
    Map.insert (coordinates c) (c{contents = index : contents c}) cs

-- | @'nearestCluster' input clustering@ returns the cluster which has
-- the center with the smallest distance to @input@.
nearestCluster :: Input -> Clustering -> Cluster
nearestCluster = bmu

-------------------------------------------------------------------------------
-- Output
-------------------------------------------------------------------------------

-- | @'renderScript' c path@ expects to be given a 'Clustering' @c@
-- having 2 dimensional 'center's and will call 'error' if that\'s not
-- the case. On success it will save a python script to @path@.py. If
-- this python script is run it will in turn save a PDF image to
-- @path@.pdf. The image will contain the graph induced by @c@ with each
-- node (cluster center) placed positioned according to the @c@\'s
-- center (weight vector). The python script will depend on the
-- @networkx@ and @mathplotlib@ python packages being installed.
-- I know that this is relatively clunky, but since I haven't found a
-- better way of creating an image of a graph with known node positions,
-- this is the way I chose to go.
renderScript :: Clustering -> String -> IO ()
renderScript c destination = writeFile
  (destination ++ ".py")
  (head ++ nodes ++ edges ++ positions ++ writePdf) where
  head = "#! /user/bin/env python\n" ++
    "import networkx as nx\n" ++
    "import matplotlib.pyplot as p\n\n" ++
    "class constant_dict(dict):\n" ++
    "  def __missing__(self, key):\n" ++
    "    return None\n\n" ++
    "class echo_dict(dict):\n" ++
    "  def __missing__(self, key):\n" ++
    "    return self.setdefault(key, key)\n\n\n" ++
    "G = nx.Graph()\n"
  nodes = "G.add_nodes_from(" ++ nodelist ++ ")\n"
  nodelist = show [(x ,y) | [x,y] <- map (center) $ Map.elems c]
  edges = "G.add_edges_from(" ++ edgelist ++ ")\n"
  edgelist = show $ concatMap (edgesFrom) (Map.keys c)
  edgesFrom coordinate = let [xc, yc] = center $ c Map.! coordinate in do
    n <- map (neighbour coordinate) directions
    [xn,yn] <- maybe [] (return.center) (Map.lookup n c)
    return ((xc, yc), (xn, yn))
  positions = "positions = echo_dict()\n"
  writePdf = "p.clf()\n" ++
    "nx.draw_networkx(G, positions, labels=constant_dict(), node_size=7)\n" ++
    "p.savefig('" ++ destination ++ ".pdf')\n\n"

-- | Dumps the given input vectors to a string which can be fed to
-- gnuplot. Just write the string to a file and write @plot \"file\"@ in
-- gnuplot.
dumpInputs :: Inputs -> String
dumpInputs = unlines . map (unwords . map show)

------------------------------------------------------------------------------
-- Internal Functions
------------------------------------------------------------------------------

-- | Again computes the best matching unit, only this time it is pure.
bmu :: Input -> Clustering -> Cluster
bmu i = snd . minimumBy (comparing $ distance i . center . snd) . Map.assocs