DDC/Core/Flow/Transform/Rates/Clusters/Linear.hs

{-# LANGUAGE DataKinds #-}
module DDC.Core.Flow.Transform.Rates.Clusters.Linear
    (solve_linear)
 where

import DDC.Base.Pretty
import DDC.Core.Flow.Transform.Rates.Graph
import DDC.Core.Flow.Transform.Rates.Clusters.Base

import qualified Data.Map  as Map

import Numeric.Limp.Program.ResultKind
import Numeric.Limp.Program
import Numeric.Limp.Rep
import Numeric.Limp.Solvers.Cbc.Solve

import qualified Numeric.Limp.Canon.Convert as Conv
import qualified Numeric.Limp.Canon.Simplify as CSimp


-- | Get parent transducers of two nodes, if exists.
-- | Integer-valued variable type for linear program
--
-- SameCluster i j  - {0,1} 0 if i and j are fused together
--
-- C n - 0 if node is fused with all its users. Minimising this is good for array contraction.
--
data ZVar n
 = SameCluster n n
 | C n
 deriving (Eq, Show, Ord)

instance Pretty n => Pretty (ZVar n) where
 ppr (SameCluster a b) = text "SC" <+> ppr a <+> ppr b
 ppr (C a)             = text "C"  <+> ppr a


-- | Variable type for linear program
-- Pi i             - {0..} used to show the resulting partition is acyclic:
--
--      A graph is acyclic iff there is a mapping
--          pi : Node -> Int
--      such that \forall (i,j) \in Edge
--          pi(j) > pi(i)
--
data RVar n
 = Pi n
 deriving (Eq, Show, Ord)

instance Pretty n => Pretty (RVar n) where
 ppr (Pi a) = text "O" <+> ppr a

-- | Canonical form of SameCluster variables.
-- Since we only want to generate one SameCluster variable for each pair, we generate the
-- variable with min variable then max.
mkSameCluster :: Ord n => n -> n -> ZVar n
mkSameCluster m n
 = SameCluster (min m n) (max m n)

-- | Minimise objective:
-- \Sigma_i,j Weight(i,j) * SameCluster(i,j)
gobjective :: Ord n => [n] -> [(Int,n,n)] -> Linear (ZVar n) (RVar n) IntDouble 'KZ
gobjective ns ws
 =  foldl (.+.) c0
 (  map (\(w,i,j) -> z (mkSameCluster i j) (Z w)) ws
 ++ map (\n -> z (C n) (Z $ length ns)) ns)


-- | Get variable bounds - Pi are unbounded, SameCluster and C are "bools" (0 or 1)
getBounds :: Ord n => [n] -> [(Int,n,n)] -> [Bounds (ZVar n) (RVar n) IntDouble]
getBounds ns ws
 =  map boundC  ns
 ++ map boundSC ws
 where
  boundC n
   = binary (C n)
  boundSC (_,i,j)
   = binary (mkSameCluster i j)


-- | Create constraints for edges and weights
getConstraints :: (Ord n, Eq t, Show n)
               => Int -> Graph n t
               -> [((n,n),Bool)]
               -> [(Int,n,n)]
               -> TransducerMap n
               -> Constraint (ZVar n) (RVar n) IntDouble
getConstraints bigN g arcs ws trans
 = mconcat $  map edgeConstraint arcs
           ++ map weightConstraint ws
 where
  piDiff u v = r1 (Pi v) .-. r1 (Pi u)
  sc     u v = z1 (mkSameCluster u v)

  -- Edge constraints:
  --
  -- For nonfusible edges (u,v), add a constraint
  --    pi(v) - pi(u) >= 1
  -- which is equivalent to
  --    pi(v) > pi(u)
  -- or "v must be scheduled after u"
  -- This will disallow u and v from being in the same cluster, as other
  -- constraints require x(u,v) = 0 can only be true if pi(u) = pi(v).
  --
  -- For fusible edges (u,v)
  --    if they are merged together,    x(u,v) = 0 and pi(v) = pi(u)
  --    otherwise,                      x(u,v) = 1 and pi(v) > pi(u)
  -- This is achieved with the constraint
  --    x(u,v) <= pi(v) - pi(u) <= n * x(u,v)
  --
  --
  -- Note that arcs are reversed in graph, so (v,u) below is actually an edge from u to v.
  edgeConstraint ((v,u), fusible)
   -- The edge must be fusible, the two nodes must have a similar size,
   -- and there can be no other paths between u and v that aren't fusible.
   | fusible && typeComparable g trans u v && noFusionPreventingPath arcs u v
   -- We may want to remove the 'typeComparable' restriction later, and just check
   -- that they have some iteration size, but not necessarily similar.
   -- This would allow fusing @a@ into @c@ in @a = map...; c = cross a b@.
   = let x = sc u v
     in  Between x (piDiff u v) (Z bigN *. x)
     :&& x :<= z1 (C u)

   -- Non-fusible edge, or nodes are different types
   | otherwise
        -- pi(v) - pi(u) >= 1
   =   piDiff u v :>= c1
   :&& z1 (C u)   :== c1


  -- Weights between other nodes:
  --
  -- For any two nodes that may be scheduled together,
  -- we must make sure that if they are together, their pis *must* be the same.
  -- If they are not together, their pis are unconstrained.
  --
  --    -n * x(u,v) <= pi(v) - pi(u) <= n * x(u,v)
  --
  -- That is, if u and v are in the same cluster (x(u,v)=0)
  -- then     pi(v) - pi(u) = 0, or pi(v) = pi(u)
  --
  -- Otherwise, pi(v) - pi(u) has a large enough range to be practically unbounded.
  -- 
  -- This constraint is not necessary if there is a fusible edge between the two,
  -- as a more restrictive constraint will be added by edgeConstraint.
  --
  weightConstraint (_,u,v)
   -- If there's an edge between the two, don't bother adding this constraint
   | not $ any (\((i,j),_) -> (u,v) == (i,j) || (v,u) == (i,j)) arcs
   = let x = sc u v
     in  Between (Z (-bigN) *. x) (piDiff u v) (Z bigN *. x) 
     :&& checkTypes u v

   | otherwise
   = CTrue


  -- If two nodes have different types, but parent transducers with same type,
  -- we may still fuse them together if their parent transducers are fused together
  checkTypes u v
   | Just uT <- nodeType g u
   , Just vT <- nodeType g v
   , uT /= vT
   , Just (u',v') <- trans u v
   =   filtConstraint v' v  u v
   :&& filtConstraint u' u  u v
   :&& filtConstraint u' v' u v

   | otherwise
   = CTrue

  -- c and d can only be fused if a and b are fused
  filtConstraint a b c d
   -- If a and b are the same node, they're already fused!
   | a == b
   = CTrue

   -- Check if it's even possible for a and b to be fused.
   -- There might be a fusion-preventing edge between them.
   | checkFusible a b
   -- If it's possible, constrain (SC a b) <= (SC c d).
   -- This means that if (SC a b) is 1 (unfused), it forces (SC c d) = 1 too.
   = sc a b :<= sc c d

   -- There's a fusion-preventing path between a and b, so they can't possibly be fused.
   -- So c and d won't be fused - let's just set it to 1.
   | otherwise
   = sc c d :== c1

  checkFusible a b
   = any (\(_, i,j) -> (i,j) == (a,b) || (i,j) == (b,a)) ws




-- | Get list of all nodes that might be clustered together,
-- and the weighted benefit of doing so.
clusterings :: (Ord n, Eq t) => [((n,n),Bool)] -> [n] -> Int -> Graph n t -> TransducerMap n -> [(Int, n,n)]
clusterings arcs ns bigN g trans
 = go ns
 where
   -- For some node, find all later nodes of same or similar type, and calculate benefit
   go (u:rest)
    =  [ (w,u,v)
       | v <- rest
       , noFusionPreventingPath arcs u v
       , cmp u v
       , let w = weight u v
       , w > 0]
    ++ go rest
   go []
    = []

   cmp = typeComparable g trans

   -- Simple trick:
   -- if there is an edge between the two,
   --   there will be some cache locality benefit from merging
   -- otherwise, 
   --   the only benefit is reducing loop overhead
   --
   -- Another heuristic would be to count nodes with shared parents as having a locality benefit
   weight u v
    -- An edge between them
    | (_:_) <- filter (\((i,j),_) -> (u,v) == (i,j) || (v,u) == (i,j)) arcs
    = bigN * bigN

    -- Share a parent
    | ius <- map (fst.fst) $ filter (\((_,j),_) -> j == u) arcs
    , ivs <- map (fst.fst) $ filter (\((_,j),_) -> j == v) arcs
    , _:_ <- filter (flip elem ius) ivs

    -- Assume that this is for an array.
    = bigN * bigN
    | otherwise
    = 1


-- | Create linear program for graph, and put all the pieces together.
lp :: (Ord n, Eq t, Show n) => Graph n t -> TransducerMap n -> Program (ZVar n) (RVar n) IntDouble
lp g trans
 = minimise (gobjective names weights)
            (getConstraints nNodes g arcs weights trans)
            (getBounds names weights)
 where
   g'    = listOfGraph g
   names = map fst $ fst g'
   arcs  =           snd g'

   weights = clusterings arcs names nNodes g trans

   nNodes
     = numNodes g


-- | Find a good clustering for some graph.
-- The output is:
--  (Pi, Type number) -> list of nodes
solve_linear :: (Ord n, Eq t, Show n, Pretty n) => Graph n t -> TransducerMap n -> [[n]]
solve_linear g trans
 = case solve lp's of
   Left  e   -> error (show e)
   Right ass -> Map.elems
              $ fixMap (sub `mappend` ass)
 where
  lp'  = lp g trans
  {- show_lp = CPr.ppr (show.ppr) (show.ppr) -}

  lp'c        = Conv.program lp'
  -- Simplify can return a (Left InfeasibleError) if the program can't be solved,
  -- but we luckily have a proof that the programs we generate will always be feasible.
  Right (sub, lp's) = CSimp.simplify lp'c


  fixMap ass@(Assignment mz _r)
   = reorder ass $ snd $ fillMap $ Map.foldWithKey go (0 :: Int, Map.empty) mz

  go k v (n, m)
   -- SameCluster i j = 0 --> i and j must be fused together
   | SameCluster i j <- k
   , v == 0
   = case (Map.lookup i m, Map.lookup j m) of
     (Just iC, Just jC)
      -> if   iC == jC
         then (n, m)
         else (n, Map.map (\x -> if x == iC then jC else x) m)
     (Just iC, Nothing)
      -> (n, Map.insert j iC m)
     (Nothing, Just jC)
      -> (n, Map.insert i jC m)
     (Nothing, Nothing)
      -> ( n + 1
         , Map.insert i n 
         $ Map.insert j n m)

   | otherwise
   = (n, m)

  fillMap (n, m)
   = foldr goFill (n, m) (fst $ listOfGraph g)

  goFill (k,_ty) (n, m)
   | Map.member k m
   = (n, m)
   | otherwise
   = ( n + 1
     , Map.insert k n m)


  reorder ass m
   = Map.fromList
   $ map (reorder' ass)
   $ Map.toList $ invertMap m

  reorder' ass (k,v:vs)
   = let k' = rOf ass (Pi v)
     in  ((truncate k' :: Int, k), v:vs)
  reorder' _ (_, [])
   = error "ddc-core-flow:DDC.Core.Flow.Transform.Rates.Linear: impossible, empty list in inverted map"