-- | Import PAM/BLOSUM substituion matrices.

module Biobase.SubstMatrix.Import where

import           Control.Applicative
import           Control.Monad.Except
import           Control.Monad.IO.Class
import           Data.ByteString.Char8 (ByteString,unpack)
import           Data.Char (toLower)
import qualified Data.ByteString.Char8 as BS
import qualified Data.Map as M
import           System.Directory (doesFileExist)

import           Biobase.Primary.AA (charAA)
import           Biobase.Primary.Letter (getLetter,LimitType(..))
import           Data.PrimitiveArray hiding (map)
import           Numeric.Discretized
import qualified Biobase.Primary.AA as AA
import           Statistics.Odds

import           Biobase.SubstMatrix.Types
import           Biobase.SubstMatrix.Statistics



-- | Import substituion matrix from a bytestring.
--
-- TODO the parser is fragile, since it uses @read@. This should be fixed.

fromByteString
  :: ( MonadError String m
     , Real (Discretized k)
     )
  => ByteString
  -> m (AASubstMat t (DiscLogOdds k) a b)
fromByteString :: ByteString -> m (AASubstMat t (DiscLogOdds k) a b)
fromByteString ByteString
bstring = do
  let ([Char]
b:[[Char]]
bs) = ([Char] -> Bool) -> [[Char]] -> [[Char]]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (([Char]
"#"[Char] -> [Char] -> Bool
forall a. Eq a => a -> a -> Bool
==)([Char] -> Bool) -> ([Char] -> [Char]) -> [Char] -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Int -> [Char] -> [Char]
forall a. Int -> [a] -> [a]
take Int
1) ([[Char]] -> [[Char]])
-> ([Char] -> [[Char]]) -> [Char] -> [[Char]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
lines ([Char] -> [[Char]]) -> [Char] -> [[Char]]
forall a b. (a -> b) -> a -> b
$ ByteString -> [Char]
unpack ByteString
bstring
      cs :: [Char]
cs = ([Char] -> Char) -> [[Char]] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map [Char] -> Char
forall a. [a] -> a
head ([[Char]] -> [Char]) -> ([Char] -> [[Char]]) -> [Char] -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
words ([Char] -> [Char]) -> [Char] -> [Char]
forall a b. (a -> b) -> a -> b
$ [Char]
b -- should give us the characters encoding an amino acid
      ss :: [[DiscLogOdds k]]
ss = ([Char] -> [DiscLogOdds k]) -> [[Char]] -> [[DiscLogOdds k]]
forall a b. (a -> b) -> [a] -> [b]
map ((Int -> DiscLogOdds k) -> [Int] -> [DiscLogOdds k]
forall a b. (a -> b) -> [a] -> [b]
map (Discretized k -> DiscLogOdds k
forall k (t :: k). Discretized t -> DiscLogOdds t
DiscLogOdds (Discretized k -> DiscLogOdds k)
-> (Int -> Discretized k) -> Int -> DiscLogOdds k
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Discretized k
forall k (b :: k). Int -> Discretized b
Discretized) ([Int] -> [DiscLogOdds k])
-> ([Char] -> [Int]) -> [Char] -> [DiscLogOdds k]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Char] -> Int) -> [[Char]] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map [Char] -> Int
forall a. Read a => [Char] -> a
read ([[Char]] -> [Int]) -> ([Char] -> [[Char]]) -> [Char] -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [[Char]] -> [[Char]]
forall a. Int -> [a] -> [a]
drop Int
1 ([[Char]] -> [[Char]])
-> ([Char] -> [[Char]]) -> [Char] -> [[Char]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
words) ([[Char]] -> [[DiscLogOdds k]]) -> [[Char]] -> [[DiscLogOdds k]]
forall a b. (a -> b) -> a -> b
$ [[Char]]
bs
      xs :: [((Z :. Letter AA a) :. Letter AA b, DiscLogOdds k)]
xs = [ ((Z
ZZ -> Letter AA a -> Z :. Letter AA a
forall a b. a -> b -> a :. b
:.Char -> Letter AA a
forall k (n :: k). Char -> Letter AA n
charAA Char
k1(Z :. Letter AA a)
-> Letter AA b -> (Z :. Letter AA a) :. Letter AA b
forall a b. a -> b -> a :. b
:.Char -> Letter AA b
forall k (n :: k). Char -> Letter AA n
charAA Char
k2),DiscLogOdds k
z)
           | (Char
k1,[DiscLogOdds k]
s) <- [Char] -> [[DiscLogOdds k]] -> [(Char, [DiscLogOdds k])]
forall a b. [a] -> [b] -> [(a, b)]
zip [Char]
cs [[DiscLogOdds k]]
ss
           , (Char
k2,DiscLogOdds k
z) <- [Char] -> [DiscLogOdds k] -> [(Char, DiscLogOdds k)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Char]
cs [DiscLogOdds k]
s
           ]
      as :: Dense Vector ((Z :. Letter AA a) :. Letter AA b) (DiscLogOdds k)
as = LimitType ((Z :. Letter AA a) :. Letter AA b)
-> DiscLogOdds k
-> [((Z :. Letter AA a) :. Letter AA b, DiscLogOdds k)]
-> Dense Vector ((Z :. Letter AA a) :. Letter AA b) (DiscLogOdds k)
forall (arr :: * -> * -> *) sh elm.
PrimArrayOps arr sh elm =>
LimitType sh -> elm -> [(sh, elm)] -> arr sh elm
fromAssocs (LimitType Z
ZZLimitType Z
-> LimitType (Letter AA a) -> LimitType (Z :. Letter AA a)
forall zs z. LimitType zs -> LimitType z -> LimitType (zs :. z)
:..Letter AA a -> LimitType (Letter AA a)
forall k l (n :: k). Letter l n -> LimitType (Letter l n)
LtLetter Letter AA a
forall k (n :: k). Letter AA n
AA.ZLimitType (Z :. Letter AA a)
-> LimitType (Letter AA b)
-> LimitType ((Z :. Letter AA a) :. Letter AA b)
forall zs z. LimitType zs -> LimitType z -> LimitType (zs :. z)
:..Letter AA b -> LimitType (Letter AA b)
forall k l (n :: k). Letter l n -> LimitType (Letter l n)
LtLetter Letter AA b
forall k (n :: k). Letter AA n
AA.Z) (Discretized k -> DiscLogOdds k
forall k (t :: k). Discretized t -> DiscLogOdds t
DiscLogOdds (Discretized k -> DiscLogOdds k)
-> (Int -> Discretized k) -> Int -> DiscLogOdds k
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Discretized k
forall k (b :: k). Int -> Discretized b
Discretized (Int -> DiscLogOdds k) -> Int -> DiscLogOdds k
forall a b. (a -> b) -> a -> b
$ -Int
999) [((Z :. Letter AA a) :. Letter AA b, DiscLogOdds k)]
xs
      mat :: AASubstMat t (DiscLogOdds k) a b
mat = Dense Vector ((Z :. Letter AA a) :. Letter AA b) (DiscLogOdds k)
-> Double -> AASubstMat t (DiscLogOdds k) a b
forall k k k (t :: k) s (a :: k) (b :: k).
Unboxed ((Z :. Letter AA a) :. Letter AA b) s
-> Double -> AASubstMat t s a b
AASubstMat Dense Vector ((Z :. Letter AA a) :. Letter AA b) (DiscLogOdds k)
as (AASubstMat t (DiscLogOdds k) a b -> Double
forall k1 k2 k3 s (t :: k1) (a :: k2) (b :: k3).
(Unbox s, Num s, Real s) =>
AASubstMat t s a b -> Double
estimateLambda AASubstMat t (DiscLogOdds k) a b
mat)
  AASubstMat t (DiscLogOdds k) a b
-> m (AASubstMat t (DiscLogOdds k) a b)
forall (m :: * -> *) a. Monad m => a -> m a
return AASubstMat t (DiscLogOdds k) a b
mat

-- | Import substitution matrix from file.

fromFile
  :: ( MonadIO m, MonadError String m
     , Real (Discretized k)
     )
  => FilePath
  -> m (AASubstMat t (DiscLogOdds k) a b)
fromFile :: [Char] -> m (AASubstMat t (DiscLogOdds k) a b)
fromFile [Char]
fname = IO ByteString -> m ByteString
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO ([Char] -> IO ByteString
BS.readFile [Char]
fname) m ByteString
-> (ByteString -> m (AASubstMat t (DiscLogOdds k) a b))
-> m (AASubstMat t (DiscLogOdds k) a b)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ByteString -> m (AASubstMat t (DiscLogOdds k) a b)
forall k k k k (m :: * -> *) (k :: k) (t :: k) (a :: k) (b :: k).
(MonadError [Char] m, Real (Discretized k)) =>
ByteString -> m (AASubstMat t (DiscLogOdds k) a b)
fromByteString