module Bio.Sequence.Stockholm
(
Stockholm(..)
,StockholmSeq(..)
,Ann(..)
,FileAnnotation(..)
,SequenceAnnotation(..)
,ColumnAnnotation(..)
,InFile
,InSeq
,findAnn
,parseStockholm
,StockholmExc(..)
,prettyPrintStockholm
#ifdef TEST
,test_Stockholm
#endif
)
where
import Control.Applicative ((<$>))
import Control.Arrow (second)
import Control.DeepSeq (NFData(..))
import Control.Monad (mplus)
import Data.Char (isSpace)
import Data.List (foldl', find)
import Data.Maybe (fromMaybe)
import Data.Typeable (Typeable)
import Text.Show (showParen, showString)
import qualified Data.Map as M
import qualified Data.ByteString.Lazy.Char8 as B
import Data.ByteString.Lazy.Char8 (ByteString)
import Bio.Core.Sequence
import Control.Monad.Exception.Synchronous (Exceptional, throw)
#ifdef TEST
import Test.Hspec.Monadic
import Test.Hspec.HUnit ()
import Test.HUnit
#endif
data Stockholm = Stockholm [Ann FileAnnotation]
[Ann (ColumnAnnotation InFile)]
[StockholmSeq]
deriving (Show, Eq, Typeable)
instance NFData Stockholm where
rnf (Stockholm file clmn seqs) = rnf file `seq` rnf clmn `seq` rnf seqs
data StockholmSeq = StSeq !SeqLabel
!SeqData
[Ann SequenceAnnotation]
[Ann (ColumnAnnotation InSeq)]
deriving (Eq, Typeable)
instance Show StockholmSeq where
showsPrec prec (StSeq (SeqLabel l) (SeqData d) sa ca) =
showParen (prec > 10) $
showString "StSeq (SeqLabel " .
showsPrec 11 l .
showString ") (SeqData " .
showsPrec 11 d .
(')':) . (' ':) .
showsPrec 11 sa .
(' ':) .
showsPrec 11 ca
instance NFData StockholmSeq where
rnf (StSeq _ _ sa ca) = rnf sa `seq` rnf ca
instance BioSeq StockholmSeq where
seqlabel (StSeq sl _ _ _) = sl
seqdata (StSeq _ sd _ _) = sd
seqlength (StSeq _ sd _ _) = Offset $ B.length (unSD sd)
data Ann d = Ann { feature :: !d
, text :: !ByteString
}
deriving (Show, Eq, Ord, Typeable)
instance NFData (Ann d) where
data FileAnnotation =
AC
| ID
| DE
| AU
| SE
| GA
| TC
| NC
| TP
| SQ
| AM
| DC
| DR
| RC
| RN
| RM
| RT
| RA
| RL
| PI
| KW
| CC
| NE
| NL
| F_Other !ByteString
deriving (Show, Eq, Ord, Typeable)
data ColumnAnnotation a =
SS
| SA
| TM
| PP
| LI
| AS
| PAS
| SAS
| IN
| C_Other !ByteString
deriving (Show, Eq, Ord, Typeable)
data InFile
data InSeq
data SequenceAnnotation =
S_AC
| S_DE
| S_DR
| OS
| OC
| LO
| S_Other !ByteString
deriving (Show, Eq, Ord, Typeable)
class IsAnnotation a where
parseAnn :: ByteString -> a
showAnn :: a -> ByteString
mkParseAnn :: (ByteString -> a -> ByteString) -> [(ByteString, a)]
-> (ByteString -> a) -> ByteString -> a
mkParseAnn modify anns mkOther =
let annots = M.fromList anns
in \feat -> let featMod = modify feat (error "mkParseAnn: never here")
in fromMaybe (mkOther feat) $ M.lookup featMod annots
mkShowAnn :: Ord a => (ByteString -> a -> ByteString) -> [(ByteString, a)]
-> (a -> Maybe ByteString) -> a -> ByteString
mkShowAnn modify anns fromOther =
let annots = M.fromList [(a,b) | (b,a) <- anns]
in \ann -> fromMaybe (error "mkShowAnn: never here 2") $
fromOther ann `mplus` (mod' <$> M.lookup ann annots)
where mod' = flip modify (error "mkShowAnn: never here 1")
instance IsAnnotation SequenceAnnotation where
parseAnn = mkParseAnn const seqAnns S_Other
showAnn = mkShowAnn const seqAnns f
where f (S_Other o) = Just o
f _ = Nothing
seqAnns :: [(ByteString, SequenceAnnotation)]
seqAnns = [("LO",LO), ("OC",OC), ("OS",OS),
("AC",S_AC), ("DE",S_DE), ("DR",S_DR)]
instance IsAnnotation FileAnnotation where
parseAnn = mkParseAnn const fileAnns F_Other
showAnn = mkShowAnn const fileAnns f
where f (F_Other o) = Just o
f _ = Nothing
fileAnns :: [(ByteString, FileAnnotation)]
fileAnns = [("AC",AC), ("AM",AM), ("AU",AU), ("CC",CC),
("DC",DC), ("DE",DE), ("DR",DR), ("GA",GA),
("ID",ID), ("KW",KW), ("NC",NC), ("NE",NE),
("NL",NL), ("PI",PI), ("RA",RA), ("RC",RC),
("RL",RL), ("RM",RM), ("RN",RN), ("RT",RT),
("SE",SE), ("SQ",SQ), ("TC",TC), ("TP",TP)]
instance ClmnAnnLoc a => IsAnnotation (ColumnAnnotation a) where
parseAnn = mkParseAnn removeSuffix clmnAnns C_Other
where
removeSuffix feat phantom =
let suffix = clmnAnnSuffix phantom
(f, s) = B.splitAt (B.length feat B.length suffix) feat
in if suffix == s then f else ""
showAnn = mkShowAnn addSuffix clmnAnns f
where
f (C_Other o) = Just o
f _ = Nothing
addSuffix feat phantom = feat `B.append` clmnAnnSuffix phantom
clmnAnns :: [(ByteString, ColumnAnnotation a)]
clmnAnns = [("AS",AS), ("IN",IN), ("LI",LI), ("PAS",PAS), ("PP",PP),
("SA",SA), ("SAS",SAS), ("SS",SS), ("TM",TM)]
class ClmnAnnLoc a where
clmnAnnSuffix :: b a -> ByteString
instance ClmnAnnLoc InSeq where
clmnAnnSuffix _ = ""
instance ClmnAnnLoc InFile where
clmnAnnSuffix _ = "_cons"
parseAnn' :: IsAnnotation a => ByteString -> ByteString -> Ann a
parseAnn' = Ann . parseAnn
#ifdef TEST
test_parseAnnots :: Specs
test_parseAnnots =
describe "parse*" $ do
it "1" $ parseAnn "AC" @?= AC
it "2" $ parseAnn "SQ" @?= SQ
it "3a" $ parseAnn "SS" @?= (SS :: ColumnAnnotation InSeq)
it "3b" $ parseAnn "SS" @?= (C_Other "SS" :: ColumnAnnotation InFile)
it "4a" $ parseAnn "SS_cons" @?= (SS :: ColumnAnnotation InFile)
it "4b" $ parseAnn "SS_cons" @?= (C_Other "SS_cons" :: ColumnAnnotation InSeq)
it "5a" $ parseAnn "SS_CONS" @?= (C_Other "SS_CONS" :: ColumnAnnotation InSeq)
it "5b" $ parseAnn "SS_CONS" @?= (C_Other "SS_CONS" :: ColumnAnnotation InFile)
it "6" $ parseAnn "LO" @?= LO
#endif
findAnn :: Eq d => d -> [Ann d] -> Maybe ByteString
findAnn x = fmap text . find ((== x) . feature)
class StockholmExc e where
emptyFileExc :: e
headerExc :: e
malformedAnnExc :: ByteString -> e
unknownAnnTypeExc :: Char -> e
malformedSeqDataExc :: ByteString -> e
instance StockholmExc () where
emptyFileExc = ()
headerExc = ()
malformedAnnExc _ = ()
unknownAnnTypeExc _ = ()
malformedSeqDataExc _ = ()
instance StockholmExc ByteString where
emptyFileExc = "parseStockholm: empty file."
headerExc = "parseStockholm: header is missing."
malformedAnnExc line =
B.concat ["parseStockholm: malformed annotation '", line, "'."]
unknownAnnTypeExc typ =
B.concat ["parseStockholm: unknown annotation type '", B.pack [typ], "'."]
malformedSeqDataExc line =
B.concat ["parseStockholm: malformed sequence data line '", line, "'."]
type ParseAnnRet = ((Char, Maybe SeqLabel, ByteString), ByteString)
parseAnnotation :: (StockholmExc e) => ByteString -> Exceptional e ParseAnnRet
parseAnnotation line
| not (B.isPrefixOf "#=G" line) || B.length line < 5 =
throw (malformedAnnExc line)
parseAnnotation line =
let Just (typ, rest) = B.uncons $ B.drop 3 line
(word1, text1) = second dropSpace . B.break isSpace $ dropSpace rest
(word2, text2) = second dropSpace . B.break isSpace $ text1
global = ((typ, Nothing, word1), text1)
seqspe = ((typ, Just (SeqLabel word1), word2), text2)
in case typ of
'F' -> return global
'C' -> return global
'S' -> return seqspe
'R' -> return seqspe
_ -> throw (unknownAnnTypeExc typ)
dropSpace :: ByteString -> ByteString
dropSpace = B.dropWhile isSpace
parseSeqData :: (StockholmExc e) => ByteString
-> Exceptional e (SeqLabel, SeqData)
parseSeqData str = case B.words str of
[ident, sq] -> return (SeqLabel ident, SeqData sq)
_ -> throw (malformedSeqDataExc str)
parseStockholm :: (StockholmExc e) => ByteString
-> [Exceptional e Stockholm]
parseStockholm = map parseStockholm' . split .
filter (not . B.all isSpace) . B.lines
where
split [] = []
split xs = case break (B.isPrefixOf "//") xs of
(y, ys) -> y : split (tail ys)
parseStockholm' :: (StockholmExc e) => [ByteString]
-> Exceptional e Stockholm
parseStockholm' = header . filter (not . B.null)
where
header (h:hs)
| h == stockholm = do (annots,seqs) <- go initial hs
return (makeStockholm annots seqs)
| otherwise = throw headerExc
where stockholm = "# STOCKHOLM 1.0"
initial = (M.empty, M.empty)
header [] = throw emptyFileExc
go acc [] = return acc
go (annots,seqs) (line:ls) | B.take 2 line == "#=" = do
annot <- parseAnnotation line
go (insertDM annot annots, seqs) ls
go (annots,seqs) (l:ls) | B.head l == '#' =
go (annots,seqs) ls
go (annots,seqs) (line:ls) = do
seqData <- parseSeqData line
go (annots, insertDM seqData seqs) ls
type DiffMap a b = M.Map a ([b] -> [b])
insertDM :: Ord a => (a,b) -> DiffMap a b -> DiffMap a b
insertDM (key,val) = M.insertWith (flip (.)) key (val:)
finishDM :: (b -> ByteString) -> DiffMap a b -> M.Map a ByteString
finishDM f = fmap (B.concat . map f . ($ []))
makeStockholm :: DiffMap (Char, Maybe SeqLabel, ByteString) ByteString
-> DiffMap SeqLabel SeqData -> Stockholm
makeStockholm annotsDM seqsDM =
let annots = finishDM id annotsDM
seqs = finishDM unSD seqsDM
go ('F', Nothing, feat) txt (f,c,r) = (parseAnn' feat txt:f,c,r)
go ('C', Nothing, feat) txt (f,c,r) = (f,parseAnn' feat txt:c,r)
go (typ, Just sq, feat) txt (f,c,r) = (f,c,(typ,sq,feat,txt):r)
go _ _ _ = error "makeStockholm: not here, ever"
add ('S',sq,feat,txt) = flip M.adjust sq $ \(StSeq sl sd sa ca) ->
StSeq sl sd (parseAnn' feat txt : sa) ca
add ('R',sq,feat,txt) = flip M.adjust sq $ \(StSeq sl sd sa ca) ->
StSeq sl sd sa (parseAnn' feat txt : ca)
add _ = error "makeStockholm: not here either"
(file, clmn, rest) = M.foldWithKey go ([],[],[]) annots
plainseqs = M.mapWithKey (\k s -> StSeq k (SeqData s) [] []) seqs
in Stockholm file clmn (M.elems $ foldl' (flip add) plainseqs rest)
prettyPrintStockholm :: Stockholm -> B.ByteString
prettyPrintStockholm (Stockholm file clmn seqs) =
let showAnnF :: IsAnnotation a => Char -> Ann a -> (ByteString, ByteString)
showAnnF t ann = (B.concat [B.pack ("#=G" ++ t : " "),
showAnn (feature ann)], text ann)
showAnnS :: IsAnnotation a => ByteString -> Char -> Ann a -> (ByteString, ByteString)
showAnnS s t ann = (B.unwords [B.pack ("#=G" ++ [t]), s,
showAnn (feature ann)], text ann)
fileLines = map (showAnnF 'F') file
clmnLines = map (showAnnF 'C') clmn
sequences = do
StSeq (SeqLabel name) (SeqData seqd) sa ca <- seqs
(name, seqd) : map (showAnnS name 'R') ca
++ map (showAnnS name 'S') sa
allLines = fileLines ++ sequences ++ clmnLines
firstColLen = maximum $ map (B.length . fst) allLines
mkLine (col1, col2) = B.concat [col1, B.replicate n ' ', col2]
where n = 1 + firstColLen B.length col1
in B.unlines ("# STOCKHOLM 1.0" : map mkLine allLines ++ ["//"])
#ifdef TEST
stockFile :: B.ByteString
stockFile = B.unlines [
"# STOCKHOLM 1.0",
"#=GF AU Infernal 1.0",
"",
"#=GS Purine1 DE Number 1 :)",
"Purine1 AAAAUUGAAUAUCGUUUUACUUGUUUAUGUC-GUGAAU-UGGCAC-GACG",
"Purine2 AAAAUUUAAUAA-GAAGCACUCAUAUAAUCCCGAGAAUAUGGCUCGGGAG",
"Purine3 UGGCAGUAACUAGCGUCACUUCGUAUAACCCCAGUGAUAUGGAUUGGGGG",
"#=GC SS_cons :::::::::::::::::((((((((,,,<<<-<<<_______>>>->>>,",
"",
"# We may have comments =)",
"",
"Purine1 UUUCUACAAGGUG-CCGGAA--CACCUAACAAUAAGUAAGUCAGCAGUGA",
"Purine2 UCUCUACCGAACAACCGUAAAUUGUUCGACUAUGAGUGAAAGUGUACCUA",
"Purine3 UCUCUACCAGGAACCAAUAA--AUCCUGAUUACGAAGAGUUUAGUGCUUU",
"#=GC SS_cons ,,,,,,,<<<<<<_________>>>>>>,,))))))))::::::::::::",
"",
"Purine1 GAU",
"Purine2 GGG",
"Purine3 AGU",
"#=GC SS_cons :::",
"// "]
purine1, purine2, purine3 :: SeqData
ss_cons :: ByteString
purine1 = SeqData "AAAAUUGAAUAUCGUUUUACUUGUUUAUGUC-GUGAAU-UGGCAC-GACGUUUCUACAAGGUG-CCGGAA--CACCUAACAAUAAGUAAGUCAGCAGUGAGAU"
purine2 = SeqData "AAAAUUUAAUAA-GAAGCACUCAUAUAAUCCCGAGAAUAUGGCUCGGGAGUCUCUACCGAACAACCGUAAAUUGUUCGACUAUGAGUGAAAGUGUACCUAGGG"
purine3 = SeqData "UGGCAGUAACUAGCGUCACUUCGUAUAACCCCAGUGAUAUGGAUUGGGGGUCUCUACCAGGAACCAAUAA--AUCCUGAUUACGAAGAGUUUAGUGCUUUAGU"
ss_cons = ":::::::::::::::::((((((((,,,<<<-<<<_______>>>->>>,,,,,,,,<<<<<<_________>>>>>>,,)))))))):::::::::::::::"
result :: [Stockholm]
result = [Stockholm file clmn seqs]
where
file = [Ann AU "Infernal 1.0"]
clmn = [Ann SS ss_cons]
seqs = [mkStock "Purine1" purine1 [Ann S_DE "Number 1 :)"],
mkStock "Purine2" purine2 [],
mkStock "Purine3" purine3 []]
mkStock name data_ sa = StSeq name data_ sa []
stockFile2 :: B.ByteString
stockFile2 = B.unlines [stockFile, stockFile]
result2 :: [Stockholm]
result2 = result ++ result
returnExc :: a -> Exceptional B.ByteString a
returnExc = return
test_parseStockholm :: Specs
test_parseStockholm =
describe "parseStockholm" $ do
it "correctly parses test file 1" $ parseStockholm stockFile @?= returnExc result
it "correctly parses test file 2" $ parseStockholm stockFile2 @?= returnExc result2
test_prettyPrintStockholm :: Specs
test_prettyPrintStockholm =
describe "parseStockholm/prettyPrintStockholm" $ do
it "parses printed test file 1" $ parseStockholm (func result) @?= returnExc result
it "parses printed test file 2" $ parseStockholm (func result2) @?= returnExc result2
where func = B.unlines . map prettyPrintStockholm
#endif
#ifdef TEST
test_Stockholm :: Specs
test_Stockholm = describe "Bio.Sequence.Stockholm" $ do
test_parseAnnots
test_parseStockholm
test_prettyPrintStockholm
#endif