-- | Simple score and distance matrices. These are two-dimensional tables -- together with row and column vectors of names. module Data.PrimitiveArray.ScoreMatrix where import Control.Monad (when,unless) import Data.Text (Text) import Data.Vector.Unboxed (Unbox) import Numeric.Log import qualified Data.Text as T import qualified Data.Vector as V import System.Exit (exitFailure) import Data.PrimitiveArray hiding (map) import qualified Data.PrimitiveArray as PA -- | NxN sized score matrices -- -- TODO needs a vector with the column names! data ScoreMatrix t = ScoreMatrix { scoreMatrix :: !(Unboxed (Z:.Int:.Int) t) , scoreNodes :: !(Unboxed Int t) , rowNames :: !(V.Vector Text) , colNames :: !(V.Vector Text) } deriving (Show) -- | Get the distance between edges @(From,To)@. (.!.) :: Unbox t => ScoreMatrix t -> (Int,Int) -> t ScoreMatrix mat _ _ _ .!. (f,t) = mat ! (Z:.f:.t) {-# Inline (.!.) #-} -- | If the initial node has a "distance", it'll be here nodeDist :: Unbox t => ScoreMatrix t -> Int -> t nodeDist ScoreMatrix{..} k = scoreNodes ! k -- | Get the name of the node at an row index rowNameOf :: ScoreMatrix t -> Int -> Text rowNameOf ScoreMatrix{..} k = rowNames V.! k {-# Inline rowNameOf #-} -- | Get the name of the node at an column index colNameOf :: ScoreMatrix t -> Int -> Text colNameOf ScoreMatrix{..} k = colNames V.! k {-# Inline colNameOf #-} -- | Number of rows in a score matrix. numRows :: Unbox t => ScoreMatrix t -> Int numRows ScoreMatrix{..} = let (_:..LtInt n':.._) = upperBound scoreMatrix in n' + 1 {-# Inline numRows #-} -- | Number of columns in a score matrix. numCols :: Unbox t => ScoreMatrix t -> Int numCols ScoreMatrix{..} = let (_:.._:..LtInt n') = upperBound scoreMatrix in n' + 1 {-# Inline numCols #-} listOfRowNames :: ScoreMatrix t -> [Text] listOfRowNames ScoreMatrix{..} = V.toList rowNames listOfColNames :: ScoreMatrix t -> [Text] listOfColNames ScoreMatrix{..} = V.toList colNames -- | Turns a @ScoreMatrix@ for distances into one scaled by "temperature" for -- Inside/Outside calculations. Each value is scaled by -- @\k -> exp $ negate k / r * t@ where -- r = (n-1) * d -- d = mean of genetic distance -- -- Node scores are turned directly into probabilities. -- -- TODO Again, there is overlap and we should really have @newtype -- Distance@ and friends. -- -- TODO @newtype Temperature = Temperature Double@ -- -- TODO fix for rows /= cols!!! toPartMatrix :: Double -- ^ temperature -> ScoreMatrix Double -> ScoreMatrix (Log Double) toPartMatrix t scoreMat@(ScoreMatrix mat sn rns cns) = ScoreMatrix p psn rns cns where p = PA.map (\k -> Exp {- . log . exp -} $ negate k / (r * t)) mat psn = PA.map (\k -> Exp $ negate k) sn n = numRows scoreMat d = Prelude.sum [ mat ! (Z:.i:.j) | i <- [0..n-1], j <- [i+1..n-1] ] / fromIntegral (n*(n-1)) r = fromIntegral (n-1) * d -- | Import data. -- -- TODO Should be generalized because @Lib-BiobaseBlast@ does almost the -- same thing. fromFile :: FilePath -> IO (ScoreMatrix Double) fromFile fp = do ls <- lines <$> readFile fp when (null ls) $ do putStrLn $ fp ++ " is empty" exitFailure let mat' = map (map read . tail . words) $ tail ls let n = length mat' unless (all ((==n) . length) mat') $ do putStrLn $ fp ++ " is not a NxN matrix" print mat' exitFailure let scoreMatrix = PA.fromAssocs (ZZ:..LtInt (n-1):..LtInt (n-1)) 0 $ concatMap (\(r,es) -> [ ((Z:.r:.c),e) | (c,e) <- zip [0..] es ]) $ zip [0..] mat' -- rows let scoreNodes = PA.fromAssocs (LtInt $ n-1) 0 [] let rowNames = V.fromList . map T.pack . drop 1 . words $ head ls let colNames = V.fromList . map (T.pack . head . words) $ tail ls return $ ScoreMatrix{..} -- mat rowNames colNames (V.fromList $ replicate n 0)