------------------------------------------------------------------------------ -- | -- Module : Data.Datamining.Clustering.Gsom -- Copyright : (c) 2009 Stephan Günther -- License : BSD3 -- -- Maintainer : gnn.github@gmail.com -- Stability : experimental -- Portability : non-portable (requires STM) -- -- This module should contain everything you need to run the GSOM clustering -- algorithm. It collects and re-exports all important and needed functions -- from moduls lower in the hirarchy. -- -- Ideally you should never need to look at those modules. If you do need -- to do this, it is a design failure and I would appreciate it if you -- would drop me a note. ------------------------------------------------------------------------------ module Data.Datamining.Clustering.Gsom ( -- * Algorithm Input -- | The GSOM algorithm expects a list of input vectors as its input. -- These input vectors are just lists of @'Double'@s. Input, Inputs -- ** Input processing , dimension, Bounds, bounds, normalize, unnormalize -- ** Operations between input vectors , distance, (*.), (.*), (<+>), (<->) -- ** Auxiliary Types , Coordinates -- * The map created by GSOM -- | The growing self organizing map builds, as its name suggests, a map -- representing the input vectors. This map is of type @'Lattice'@ and is -- a network of @'Node'@s. -- ** The @'Node'@s of the map , Node(..), Nodes, Neighbours, Neighbourhood -- *** Node Creation , node -- *** Modification , propagate, update, updateError -- *** Querying , isLeaf, isNode, neighbourhood, unwrappedNeighbours -- *** Debugging , putNode -- ** The map type , Lattice -- *** Creating initial lattices , newCentered, newRandom -- *** Querying a lattice , L.bmu, nodes -- *** Debugging and Output , putLattice, putWeights, dumpInputs, renderScript -- * Running GSOM -- | The GSOM algorithm builds the map by sequentially @'run'@ning a given -- number of @'Phases'@ over the input. -- ** The Phase type , Phase(..), Phases, Kernel(..), LearningRate(..) -- *** Predefined phases , defaultFirst, defaultSecond, defaultThird, defaults -- *** Running a phase/phases , phase, run -- * Constructing a clustering from the map -- ** The Clustering type , Cluster(..), Clustering -- ** Construction and Query , cluster, clustering, nearestCluster ) where ------------------------------------------------------------------------------ -- Standard modules ------------------------------------------------------------------------------ import Control.Concurrent.STM import Control.Monad import Data.Function import Data.List import Data.Map(Map(..)) import qualified Data.Map as Map import Data.Ord ------------------------------------------------------------------------------ -- Private modules ------------------------------------------------------------------------------ import Data.Datamining.Clustering.Gsom.Coordinates import Data.Datamining.Clustering.Gsom.Input import Data.Datamining.Clustering.Gsom.Lattice hiding (bmu) import qualified Data.Datamining.Clustering.Gsom.Lattice as L(bmu) import Data.Datamining.Clustering.Gsom.Node import Data.Datamining.Clustering.Gsom.Phase ------------------------------------------------------------------------------ -- Types ------------------------------------------------------------------------------ -- | The clusters generated by GSOM basically consist of three things: data Cluster = Cluster { -- | the vector which best represents all the vectors belonging to this -- cluster. center :: Input -- | The indices of the input vectors belonging to this cluster. -- That means a cluster is always relative to a set of @'Inputs'@ , contents :: [Int] -- | the coordinates of this cluster , coordinates :: Coordinates } deriving (Read, Show) -- | The final clustering which is the result of the GSOM algorithm -- is a @'Data.Map'@ mapping @'Coordinates'@ to @'Cluster'@s. type Clustering = Map Coordinates Cluster ------------------------------------------------------------------------------ -- Creation ------------------------------------------------------------------------------ -- | Computes a clustering induced by the given lattice. -- -- @'clustering' lattice@ uses the @'weights'@ of the @'nodes'@ stored -- in @lattice@ to generate 'cluster's and returns the @'Clustering'@ -- storing these 'cluster's. Each non leaf node @n@ in @lattice@ -- corresponds to one 'cluster' @c@ with @('coordinates' c = 'location' -- n)@ and with @'center' c@ equal to the weight vector of @n@. Each -- generated 'cluster''s 'contents' are empty. Use the 'cluster' -- function with a set of inputs to obtain a clustering where each -- 'Cluster''s 'contents' is a list of the indices of the input -- points belonging to this 'cluster'. clustering :: Lattice -> IO Clustering clustering l = atomically $ do ns <- liftM (filter isNode) (nodes l) associations <- mapM (\n -> do ws <- readTVar $ weights n return $! (location n, ws)) ns return $! Map.fromList $ map (\(xy, w) -> (xy, Cluster {center = w, contents = [], coordinates = xy})) associations -- | @'cluster' inputs clustering@ clusters the given @inputs@ according to -- the centers of the clusters in @clustering@. That means for each input @i@ -- from @inputs@ the index of @i@ is added to the contents of the cluster -- center to which @i@ has minimal distance. -- TODO: Implement tiebreaker. cluster :: Inputs -> Clustering -> Clustering cluster is cs = foldl' f cs (zip [0..] is) where f cs (index, i) = let c = bmu i cs in Map.insert (coordinates c) (c{contents = index : contents c}) cs -- | @'nearestCluster' input clustering@ returns the cluster which has -- the center with the smallest distance to @input@. nearestCluster :: Input -> Clustering -> Cluster nearestCluster = bmu ------------------------------------------------------------------------------- -- Output ------------------------------------------------------------------------------- -- | @'renderScript' c path@ expects to be given a 'Clustering' @c@ -- having 2 dimensional 'center's and will call 'error' if that\'s not -- the case. On success it will save a python script to @path@.py. If -- this python script is run it will in turn save a PDF image to -- @path@.pdf. The image will contain the graph induced by @c@ with each -- node (cluster center) placed positioned according to the @c@\'s -- center (weight vector). The python script will depend on the -- @networkx@ and @mathplotlib@ python packages being installed. -- I know that this is relatively clunky, but since I haven't found a -- better way of creating an image of a graph with known node positions, -- this is the way I chose to go. renderScript :: Clustering -> String -> IO () renderScript c destination = writeFile (destination ++ ".py") (head ++ nodes ++ edges ++ positions ++ writePdf) where head = "#! /user/bin/env python\n" ++ "import networkx as nx\n" ++ "import matplotlib.pyplot as p\n\n" ++ "class constant_dict(dict):\n" ++ " def __missing__(self, key):\n" ++ " return None\n\n" ++ "class echo_dict(dict):\n" ++ " def __missing__(self, key):\n" ++ " return self.setdefault(key, key)\n\n\n" ++ "G = nx.Graph()\n" nodes = "G.add_nodes_from(" ++ nodelist ++ ")\n" nodelist = show [(x ,y) | [x,y] <- map (center) $ Map.elems c] edges = "G.add_edges_from(" ++ edgelist ++ ")\n" edgelist = show $ concatMap (edgesFrom) (Map.keys c) edgesFrom coordinate = let [xc, yc] = center $ c Map.! coordinate in do n <- map (neighbour coordinate) directions [xn,yn] <- maybe [] (return.center) (Map.lookup n c) return ((xc, yc), (xn, yn)) positions = "positions = echo_dict()\n" writePdf = "p.clf()\n" ++ "nx.draw_networkx(G, positions, labels=constant_dict(), node_size=7)\n" ++ "p.savefig('" ++ destination ++ ".pdf')\n\n" -- | Dumps the given input vectors to a string which can be fed to -- gnuplot. Just write the string to a file and write @plot \"file\"@ in -- gnuplot. dumpInputs :: Inputs -> String dumpInputs = unlines . map (unwords . map show) ------------------------------------------------------------------------------ -- Internal Functions ------------------------------------------------------------------------------ -- | Again computes the best matching unit, only this time it is pure. bmu :: Input -> Clustering -> Cluster bmu i = snd . minimumBy (comparing $ distance i . center . snd) . Map.assocs