module Bio.Sequence.Stockholm.Document
(
Stockholm(..)
, StockholmSeq(..)
, Ann(..)
, FileAnnotation(..)
, SequenceAnnotation(..)
, ColumnAnnotation(..)
, InFile
, InSeq
, parseDoc
, renderDoc
)
where
import Control.Applicative ((<$>))
import Control.DeepSeq (NFData(..))
import Control.Monad (mplus)
import Data.Maybe (fromMaybe)
import Data.Typeable (Typeable)
import qualified Data.Map as M
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as L
import Bio.Core.Sequence
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
import Bio.Sequence.Stockholm.Stream
data Stockholm = Stockholm [Ann FileAnnotation]
[Ann (ColumnAnnotation InFile)]
[StockholmSeq]
deriving (Show, Eq, Ord, Typeable)
instance NFData Stockholm where
rnf (Stockholm file clmn seqs) = rnf file `seq` rnf clmn `seq` rnf seqs
data StockholmSeq = StSeq !SeqLabel
!SeqData
[Ann SequenceAnnotation]
[Ann (ColumnAnnotation InSeq)]
deriving (Eq, Ord, Typeable)
instance Show StockholmSeq where
showsPrec prec (StSeq (SeqLabel l) (SeqData d) sa ca) =
showParen (prec > 10) $
showString "StSeq (SeqLabel " .
showsPrec 11 l .
showString ") (SeqData " .
showsPrec 11 d .
(')':) . (' ':) .
showsPrec 11 sa .
(' ':) .
showsPrec 11 ca
instance NFData StockholmSeq where
rnf (StSeq _ _ sa ca) = rnf sa `seq` rnf ca
instance BioSeq StockholmSeq where
seqlabel (StSeq sl _ _ _) = sl
seqdata (StSeq _ sd _ _) = sd
seqlength (StSeq _ sd _ _) = Offset $ L.length (unSD sd)
data Ann d = Ann { feature :: !d
, text :: !L.ByteString
}
deriving (Show, Eq, Ord, Typeable)
instance NFData (Ann d) where
data FileAnnotation =
AC
| ID
| DE
| AU
| SE
| GA
| TC
| NC
| TP
| SQ
| AM
| DC
| DR
| RC
| RN
| RM
| RT
| RA
| RL
| PI
| KW
| CC
| NE
| NL
| F_Other !B.ByteString
deriving (Show, Eq, Ord, Typeable)
data ColumnAnnotation a =
SS
| SA
| TM
| PP
| LI
| AS
| PAS
| SAS
| IN
| C_Other !B.ByteString
deriving (Show, Eq, Ord, Typeable)
data InFile
data InSeq
data SequenceAnnotation =
S_AC
| S_DE
| S_DR
| OS
| OC
| LO
| S_Other !B.ByteString
deriving (Show, Eq, Ord, Typeable)
type ParseFeature a = B.ByteString -> a
type ShowFeature a = a -> B.ByteString
mkParseFeature :: (B.ByteString -> a -> B.ByteString)
-> [(B.ByteString, a)]
-> (B.ByteString -> a)
-> ParseFeature a
mkParseFeature modify anns mkOther =
let annots = M.fromList anns
in \feat -> let featMod = modify feat (error "mkParseFeature: never here")
in fromMaybe (mkOther feat) $ M.lookup featMod annots
mkShowFeature :: Ord a =>
(B.ByteString -> a -> B.ByteString)
-> [(B.ByteString, a)]
-> (a -> Maybe B.ByteString)
-> ShowFeature a
mkShowFeature modify anns fromOther =
let annots = M.fromList [(a,b) | (b,a) <- anns]
in \ann -> fromMaybe (error "mkShowFeature: never here 2") $
fromOther ann `mplus` (mod' <$> M.lookup ann annots)
where mod' = flip modify (error "mkShowFeature: never here 1")
parseSeqFeature :: ParseFeature SequenceAnnotation
showSeqFeature :: ShowFeature SequenceAnnotation
(parseSeqFeature, showSeqFeature) =
( mkParseFeature const seqFeatures S_Other
, mkShowFeature const seqFeatures f )
where
f (S_Other o) = Just o
f _ = Nothing
seqFeatures = [("LO",LO), ("OC",OC), ("OS",OS),
("AC",S_AC), ("DE",S_DE), ("DR",S_DR)]
parseFileFeature :: ParseFeature FileAnnotation
showFileFeature :: ShowFeature FileAnnotation
(parseFileFeature, showFileFeature) =
( mkParseFeature const fileFeatures F_Other
, mkShowFeature const fileFeatures f )
where
f (F_Other o) = Just o
f _ = Nothing
fileFeatures = [("AC",AC), ("AM",AM), ("AU",AU), ("CC",CC),
("DC",DC), ("DE",DE), ("DR",DR), ("GA",GA),
("ID",ID), ("KW",KW), ("NC",NC), ("NE",NE),
("NL",NL), ("PI",PI), ("RA",RA), ("RC",RC),
("RL",RL), ("RM",RM), ("RN",RN), ("RT",RT),
("SE",SE), ("SQ",SQ), ("TC",TC), ("TP",TP)]
parseClmnFeature :: ClmnFeatureLoc a => ParseFeature (ColumnAnnotation a)
parseClmnFeature = mkParseFeature removeSuffix clmnFeatures C_Other
where
removeSuffix feat phantom =
let suffix = clmnFeatureSuffix phantom
(f, s) = B.splitAt (B.length feat B.length suffix) feat
in if suffix == s then f else ""
showClmnFeature :: ClmnFeatureLoc a => ShowFeature (ColumnAnnotation a)
showClmnFeature = mkShowFeature addSuffix clmnFeatures f
where
f (C_Other o) = Just o
f _ = Nothing
addSuffix feat phantom = feat `B.append` clmnFeatureSuffix phantom
clmnFeatures :: [(B.ByteString, ColumnAnnotation a)]
clmnFeatures = [("AS",AS), ("IN",IN), ("LI",LI), ("PAS",PAS), ("PP",PP),
("SA",SA), ("SAS",SAS), ("SS",SS), ("TM",TM)]
class ClmnFeatureLoc a where
clmnFeatureSuffix :: b a -> B.ByteString
instance ClmnFeatureLoc InSeq where
clmnFeatureSuffix _ = ""
instance ClmnFeatureLoc InFile where
clmnFeatureSuffix _ = "_cons"
type DiffMap a b = M.Map a [b]
insertDM :: Ord a => (a, b) -> DiffMap a b -> DiffMap a b
insertDM (key, val) = M.insertWith' (\_ old -> val:old) key [val]
finishDM :: (b -> L.ByteString) -> DiffMap a b -> M.Map a L.ByteString
finishDM f = fmap (L.concat . map f . reverse)
type AnnMap d = DiffMap d L.ByteString
insertAnn :: Ord d => Ann d -> AnnMap d -> AnnMap d
insertAnn (Ann key val) = insertDM (key, val)
finishAnn :: AnnMap d -> [Ann d]
finishAnn m = [Ann a b | (a, b) <- M.toList (finishDM id m)]
type SeqAnnMap d = M.Map B.ByteString (AnnMap d)
insertSM :: Ord d => B.ByteString -> Ann d -> SeqAnnMap d -> SeqAnnMap d
insertSM sq ann = M.alter (just . insertAnn ann . fromMaybe M.empty) sq
where
just !x = Just x
finishSM :: SeqAnnMap d -> M.Map B.ByteString [Ann d]
finishSM = fmap finishAnn
data PartialAnns =
PartialAnns { paFileAnns :: !(AnnMap FileAnnotation)
, paFileColAnns :: !(AnnMap (ColumnAnnotation InFile))
, paSeqAnns :: !(SeqAnnMap SequenceAnnotation)
, paSeqColAnns :: !(SeqAnnMap (ColumnAnnotation InSeq))
}
emptyPA :: PartialAnns
emptyPA = PartialAnns M.empty M.empty M.empty M.empty
insertPA_GF :: Ann (FileAnnotation ) -> PartialAnns -> PartialAnns
insertPA_GC :: Ann (ColumnAnnotation InFile) -> PartialAnns -> PartialAnns
insertPA_GS :: B.ByteString -> Ann (SequenceAnnotation ) -> PartialAnns -> PartialAnns
insertPA_GR :: B.ByteString -> Ann (ColumnAnnotation InSeq ) -> PartialAnns -> PartialAnns
insertPA_GF ann pa = pa { paFileAnns = insertAnn ann (paFileAnns pa) }
insertPA_GC ann pa = pa { paFileColAnns = insertAnn ann (paFileColAnns pa) }
insertPA_GS sq ann pa = pa { paSeqAnns = insertSM sq ann (paSeqAnns pa) }
insertPA_GR sq ann pa = pa { paSeqColAnns = insertSM sq ann (paSeqColAnns pa) }
parseDoc :: C.Resource m => C.Conduit Event m Stockholm
parseDoc = C.conduitState LookingForHeader push close
where
close LookingForHeader = return []
close (InsideStockholm annots seqs) = return [makeStockholm annots seqs]
push state (EvComment _) =
return (C.StateProducing state [])
push LookingForHeader EvHeader =
continue (emptyPA, M.empty)
push LookingForHeader x =
fail $ "parseDoc: unexpected " ++ show x ++ " before header"
push (InsideStockholm _ _) EvHeader =
fail "parseDoc: unexpected header"
push (InsideStockholm annots seqs) EvEnd =
return (C.StateProducing LookingForHeader [makeStockholm annots seqs])
push (InsideStockholm annots seqs) (EvSeqData label data_) =
continue (annots, insertDM (label, data_) seqs)
push (InsideStockholm annots seqs) (EvGF feat data_) =
continue (insertPA_GF (Ann (parseFileFeature feat) data_) annots, seqs)
push (InsideStockholm annots seqs) (EvGC feat data_) =
continue (insertPA_GC (Ann (parseClmnFeature feat) data_) annots, seqs)
push (InsideStockholm annots seqs) (EvGS sq feat data_) =
continue (insertPA_GS sq (Ann (parseSeqFeature feat) data_) annots, seqs)
push (InsideStockholm annots seqs) (EvGR sq feat data_) =
continue (insertPA_GR sq (Ann (parseClmnFeature feat) data_) annots, seqs)
continue (annots, seqs) = return (C.StateProducing (InsideStockholm annots seqs) [])
data ParseDoc = LookingForHeader
| InsideStockholm
{ pdAnnots :: !PartialAnns
, pdSeqs :: !(DiffMap B.ByteString L.ByteString)
}
makeStockholm :: PartialAnns -> DiffMap B.ByteString L.ByteString -> Stockholm
makeStockholm annots seqsDM =
let fileAnns_ = finishAnn (paFileAnns annots)
fileColAnns = finishAnn (paFileColAnns annots)
seqAnns_ = finishSM (paSeqAnns annots)
seqColAnns = finishSM (paSeqColAnns annots)
stseqs = [StSeq (SeqLabel $ l sq) (SeqData dt) (f sq seqAnns_) (f sq seqColAnns)
| (sq, dt) <- M.toList (finishDM id seqsDM)]
where
f = M.findWithDefault []
l = L.fromChunks . return
in Stockholm fileAnns_ fileColAnns stseqs
renderDoc :: C.Resource m => C.Conduit Stockholm m Event
renderDoc = CL.concatMap toEvents
where
toEvents (Stockholm file clmn seqs) =
(EvHeader:) $
toEventsFileAnns file $
toEventsSeqs seqs $
toEventsFileClmn clmn $
[EvEnd]
toEventsFileAnns [] = id
toEventsFileAnns (a:as) =
(EvGF (showFileFeature $ feature a) (text a) :) .
toEventsFileAnns as
toEventsFileClmn [] = id
toEventsFileClmn (a:as) =
wrap (EvGC (showClmnFeature $ feature a)) (text a) .
toEventsFileClmn as
toEventsSeqs (StSeq (SeqLabel name) (SeqData seqd) sa ca : xs) =
wrap (EvSeqData name') seqd .
toEventsSeqAnns name' sa .
toEventsSeqClmn name' ca .
toEventsSeqs xs
where name' = B.concat $ L.toChunks name
toEventsSeqs [] = id
toEventsSeqAnns _ [] = id
toEventsSeqAnns n (a:as) =
(EvGS n (showSeqFeature $ feature a) (text a) :) .
toEventsSeqAnns n as
toEventsSeqClmn _ [] = id
toEventsSeqClmn n (a:as) =
wrap (EvGR n (showClmnFeature $ feature a)) (text a) .
toEventsSeqClmn n as
wrap :: (L.ByteString -> b) -> L.ByteString -> [b] -> [b]
wrap mk bs = case L.splitAt 70 bs of
(x, "") -> (mk x :)
(x, xs) -> (mk x :) . wrap mk xs