-- | This module provides a very simple implementation of a decisiontree. It is \"optimized\" for readability, not so much for performance. I doubt it can be used for real (=huge) datasets, but it should be ok for a couple of hundred (thousand?) items.
-- 
-- You are encouraged to have a look at the source
-- 
-- It is build (for now) using the ID3 algorithm (or at least something closely resembling that). That means the attributes you choose must have a finite set of possible values.
module Data.DecisionTree (
    build,
    decide,
    Datum(D, dName, attributes),
    PreLabeled,
    Attribute(A, aName, possibleValues),
    DecisionTree) where

import Data.Maybe (fromJust)
import Data.List hiding (partition)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Function (on)

type PreLabeled a b= (b, Datum a)

-- | The type for our DecisionTree
data DecisionTree a b= Leaf b -- ^ Leafs have labels
    | Node { 
        att ::Attribute a, -- ^ a node asks for this attribute
        child :: a -> (DecisionTree a b) -- ^ and has children which can be found with a value of the attribute
        }

-- | A Datum has Attributes
data Attribute a = A {
    aName :: String, -- ^ Attributes have a name
    possibleValues ::  [a] -- ^ and a set of possible values
    } 

-- | Things we want to find labels for
data Datum a= D {
    dName :: String, -- ^ They have names
    attributes :: [(Attribute a,a)] -- ^ and attributes
    } deriving Show

instance (Show a, Show b) => Show (DecisionTree a b) where
    show x = showTree x ""

showTree :: (Show a, Show b) => DecisionTree a b -> ShowS
showTree (Leaf x) = shows x
showTree (Node att child) = ('<':).shows att.("|\n"++).showList [child a | a <- possibleValues att].('>':)

instance Eq (Attribute a) where
    (==) = (==) `on` aName
    
instance Show (Attribute a) where
    show = aName
    
-- | Build a DecisionTree from the given Trainingset
build :: (Ord a, Ord b) => [Attribute a] -> [PreLabeled a b] -> DecisionTree a b
build atts dataset =  case inf of
    0 -> Leaf dominantLabel -- even the best Attribute doesn't gain any information. We're done
    _ -> Node { 
        att = bAtt,
        child = safeLookup
        } 
    where
            (inf,bAtt) = bestAttribute dataset atts -- get the best attribute