{-# LANGUAGE BangPatterns, CPP, EmptyDataDecls, DeriveDataTypeable, OverloadedStrings #-} -- | Take low-level 'Event'@s@ and turn them high-level data -- structures. module Bio.Sequence.Stockholm.Document ( -- * Data types Stockholm(..) , StockholmSeq(..) , Ann(..) , FileAnnotation(..) , SequenceAnnotation(..) , ColumnAnnotation(..) , InFile , InSeq -- * Conduits , parseDoc , renderDoc ) where -- from base import Control.Applicative ((<$>)) import Control.DeepSeq (NFData(..)) import Control.Monad (mplus) import Data.Maybe (fromMaybe) import Data.Typeable (Typeable) -- from containers import qualified Data.Map as M -- from bytestring import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as L -- from biocore import Bio.Core.Sequence -- from conduit import qualified Data.Conduit as C import qualified Data.Conduit.List as CL -- from this package import Bio.Sequence.Stockholm.Stream ---------------------------------------------------------------------- -- Types -- | An Stockholm 1.0 formatted file represented in memory. data Stockholm = Stockholm [Ann FileAnnotation] [Ann (ColumnAnnotation InFile)] [StockholmSeq] deriving (Show, Eq, Ord, Typeable) instance NFData Stockholm where rnf (Stockholm file clmn seqs) = rnf file `seq` rnf clmn `seq` rnf seqs -- | A sequence in Stockholm 1.0 format. data StockholmSeq = StSeq !SeqLabel !SeqData [Ann SequenceAnnotation] [Ann (ColumnAnnotation InSeq)] deriving (Eq, Ord, Typeable) -- We don't derive Show to be able support biocore-0.1, which -- doesn't have Show instances for SeqLabel and SeqData. instance Show StockholmSeq where showsPrec prec (StSeq (SeqLabel l) (SeqData d) sa ca) = showParen (prec > 10) $ showString "StSeq (SeqLabel " . showsPrec 11 l . showString ") (SeqData " . showsPrec 11 d . (')':) . (' ':) . showsPrec 11 sa . (' ':) . showsPrec 11 ca instance NFData StockholmSeq where rnf (StSeq _ _ sa ca) = rnf sa `seq` rnf ca instance BioSeq StockholmSeq where seqlabel (StSeq sl _ _ _) = sl seqdata (StSeq _ sd _ _) = sd seqlength (StSeq _ sd _ _) = Offset $ L.length (unSD sd) -- | A generic annotation. data Ann d = Ann { feature :: !d , text :: !L.ByteString } deriving (Show, Eq, Ord, Typeable) instance NFData (Ann d) where -- already strict, default instance -- | Possible file annotations. data FileAnnotation = AC -- ^ Accession number: Accession number in form PFxxxxx.version or PBxxxxxx. | ID -- ^ Identification: One word name for family. | DE -- ^ Definition: Short description of family. | AU -- ^ Author: Authors of the entry. | SE -- ^ Source of seed: The source suggesting the seed members belong to one family. | GA -- ^ Gathering method: Search threshold to build the full alignment. | TC -- ^ Trusted Cutoff: Lowest sequence score and domain score of match in the full alignment. | NC -- ^ Noise Cutoff: Highest sequence score and domain score of match not in full alignment. | TP -- ^ Type: Type of family (presently Family, Domain, Motif or Repeat). | SQ -- ^ Sequence: Number of sequences in alignment. | AM -- ^ Alignment Method: The order ls and fs hits are aligned to the model to build the full align. | DC -- ^ Database Comment: Comment about database reference. | DR -- ^ Database Reference: Reference to external database. | RC -- ^ Reference Comment: Comment about literature reference. | RN -- ^ Reference Number: Reference Number. | RM -- ^ Reference Medline: Eight digit medline UI number. | RT -- ^ Reference Title: Reference Title. | RA -- ^ Reference Author: Reference Author | RL -- ^ Reference Location: Journal location. | PI -- ^ Previous identifier: Record of all previous ID lines. | KW -- ^ Keywords: Keywords. | CC -- ^ Comment: Comments. | NE -- ^ Pfam accession: Indicates a nested domain. | NL -- ^ Location: Location of nested domains - sequence ID, start and end of insert. | F_Other !B.ByteString -- ^ Other file annotation. deriving (Show, Eq, Ord, Typeable) -- | Possible column annotations. Phantom type can be 'InFile' -- or 'InSeq'. data ColumnAnnotation a = SS -- ^ Secondary structure. | SA -- ^ Surface accessibility. | TM -- ^ TransMembrane. | PP -- ^ Posterior probability. | LI -- ^ LIgand binding. | AS -- ^ Active site. | PAS -- ^ AS - Pfam predicted. | SAS -- ^ AS - from SwissProt. | IN -- ^ INtron (in or after). | C_Other !B.ByteString -- ^ Other column annotation. deriving (Show, Eq, Ord, Typeable) -- | Phantom type for 'ColumnAnnotation's of the whole file. data InFile -- | Phantom type for 'ColumnAnnotation's of a single sequence. data InSeq -- | Possible sequence annotations. data SequenceAnnotation = S_AC -- ^ Accession number | S_DE -- ^ Description | S_DR -- ^ Database reference | OS -- ^ Organism (species) | OC -- ^ Organism classification (clade, etc.) | LO -- ^ Look (Color, etc.) | S_Other !B.ByteString -- ^ Other sequence annotation. deriving (Show, Eq, Ord, Typeable) ---------------------------------------------------------------------- -- Parsing and showing features -- | Parse a feature. type ParseFeature a = B.ByteString -> a -- | Show a feature. type ShowFeature a = a -> B.ByteString -- | Helper to create 'ParseFeature'@s@. mkParseFeature :: (B.ByteString -> a -> B.ByteString) -> [(B.ByteString, a)] -> (B.ByteString -> a) -> ParseFeature a mkParseFeature modify anns mkOther = let annots = M.fromList anns in \feat -> let featMod = modify feat (error "mkParseFeature: never here") in fromMaybe (mkOther feat) $ M.lookup featMod annots -- | Helper to create 'ShowFeature'@s@. mkShowFeature :: Ord a => (B.ByteString -> a -> B.ByteString) -> [(B.ByteString, a)] -> (a -> Maybe B.ByteString) -> ShowFeature a mkShowFeature modify anns fromOther = let annots = M.fromList [(a,b) | (b,a) <- anns] in \ann -> fromMaybe (error "mkShowFeature: never here 2") $ fromOther ann `mplus` (mod' <$> M.lookup ann annots) where mod' = flip modify (error "mkShowFeature: never here 1") -- | Parse and show sequence annotations. parseSeqFeature :: ParseFeature SequenceAnnotation showSeqFeature :: ShowFeature SequenceAnnotation (parseSeqFeature, showSeqFeature) = ( mkParseFeature const seqFeatures S_Other , mkShowFeature const seqFeatures f ) where f (S_Other o) = Just o f _ = Nothing seqFeatures = [("LO",LO), ("OC",OC), ("OS",OS), ("AC",S_AC), ("DE",S_DE), ("DR",S_DR)] -- | Parse and show file annotations. parseFileFeature :: ParseFeature FileAnnotation showFileFeature :: ShowFeature FileAnnotation (parseFileFeature, showFileFeature) = ( mkParseFeature const fileFeatures F_Other , mkShowFeature const fileFeatures f ) where f (F_Other o) = Just o f _ = Nothing fileFeatures = [("AC",AC), ("AM",AM), ("AU",AU), ("CC",CC), ("DC",DC), ("DE",DE), ("DR",DR), ("GA",GA), ("ID",ID), ("KW",KW), ("NC",NC), ("NE",NE), ("NL",NL), ("PI",PI), ("RA",RA), ("RC",RC), ("RL",RL), ("RM",RM), ("RN",RN), ("RT",RT), ("SE",SE), ("SQ",SQ), ("TC",TC), ("TP",TP)] -- | Parse and show column annotations. parseClmnFeature :: ClmnFeatureLoc a => ParseFeature (ColumnAnnotation a) parseClmnFeature = mkParseFeature removeSuffix clmnFeatures C_Other where removeSuffix feat phantom = let suffix = clmnFeatureSuffix phantom (f, s) = B.splitAt (B.length feat - B.length suffix) feat in if suffix == s then f else "" showClmnFeature :: ClmnFeatureLoc a => ShowFeature (ColumnAnnotation a) showClmnFeature = mkShowFeature addSuffix clmnFeatures f where f (C_Other o) = Just o f _ = Nothing addSuffix feat phantom = feat `B.append` clmnFeatureSuffix phantom clmnFeatures :: [(B.ByteString, ColumnAnnotation a)] clmnFeatures = [("AS",AS), ("IN",IN), ("LI",LI), ("PAS",PAS), ("PP",PP), ("SA",SA), ("SAS",SAS), ("SS",SS), ("TM",TM)] class ClmnFeatureLoc a where clmnFeatureSuffix :: b a -> B.ByteString instance ClmnFeatureLoc InSeq where clmnFeatureSuffix _ = "" instance ClmnFeatureLoc InFile where clmnFeatureSuffix _ = "_cons" ---------------------------------------------------------------------- -- Specilized Maps. type DiffMap a b = M.Map a [b] insertDM :: Ord a => (a, b) -> DiffMap a b -> DiffMap a b insertDM (key, val) = M.insertWith' (\_ old -> val:old) key [val] finishDM :: (b -> L.ByteString) -> DiffMap a b -> M.Map a L.ByteString finishDM f = fmap (L.concat . map f . reverse) type AnnMap d = DiffMap d L.ByteString insertAnn :: Ord d => Ann d -> AnnMap d -> AnnMap d insertAnn (Ann key val) = insertDM (key, val) finishAnn :: AnnMap d -> [Ann d] finishAnn m = [Ann a b | (a, b) <- M.toList (finishDM id m)] type SeqAnnMap d = M.Map B.ByteString (AnnMap d) insertSM :: Ord d => B.ByteString -> Ann d -> SeqAnnMap d -> SeqAnnMap d insertSM sq ann = M.alter (just . insertAnn ann . fromMaybe M.empty) sq where just !x = Just x finishSM :: SeqAnnMap d -> M.Map B.ByteString [Ann d] finishSM = fmap finishAnn data PartialAnns = PartialAnns { paFileAnns :: !(AnnMap FileAnnotation) , paFileColAnns :: !(AnnMap (ColumnAnnotation InFile)) , paSeqAnns :: !(SeqAnnMap SequenceAnnotation) , paSeqColAnns :: !(SeqAnnMap (ColumnAnnotation InSeq)) } emptyPA :: PartialAnns emptyPA = PartialAnns M.empty M.empty M.empty M.empty insertPA_GF :: Ann (FileAnnotation ) -> PartialAnns -> PartialAnns insertPA_GC :: Ann (ColumnAnnotation InFile) -> PartialAnns -> PartialAnns insertPA_GS :: B.ByteString -> Ann (SequenceAnnotation ) -> PartialAnns -> PartialAnns insertPA_GR :: B.ByteString -> Ann (ColumnAnnotation InSeq ) -> PartialAnns -> PartialAnns insertPA_GF ann pa = pa { paFileAnns = insertAnn ann (paFileAnns pa) } insertPA_GC ann pa = pa { paFileColAnns = insertAnn ann (paFileColAnns pa) } insertPA_GS sq ann pa = pa { paSeqAnns = insertSM sq ann (paSeqAnns pa) } insertPA_GR sq ann pa = pa { paSeqColAnns = insertSM sq ann (paSeqColAnns pa) } ---------------------------------------------------------------------- -- [Event <-> Document] conversion functions -- | Conduit that parses 'Event'@s@ into documents 'Stockholm'. parseDoc :: Monad m => C.Conduit Event m Stockholm parseDoc = C.conduitState LookingForHeader push close where -- FIXME: Nice exceptions close LookingForHeader = return [] close (InsideStockholm annots seqs) = return [makeStockholm annots seqs] push state (EvComment _) = return (C.StateProducing state []) push LookingForHeader EvHeader = continue (emptyPA, M.empty) push LookingForHeader x = fail $ "parseDoc: unexpected " ++ show x ++ " before header" push (InsideStockholm _ _) EvHeader = fail "parseDoc: unexpected header" push (InsideStockholm annots seqs) EvEnd = return (C.StateProducing LookingForHeader [makeStockholm annots seqs]) push (InsideStockholm annots seqs) (EvSeqData label data_) = continue (annots, insertDM (label, data_) seqs) push (InsideStockholm annots seqs) (EvGF feat data_) = continue (insertPA_GF (Ann (parseFileFeature feat) data_) annots, seqs) push (InsideStockholm annots seqs) (EvGC feat data_) = continue (insertPA_GC (Ann (parseClmnFeature feat) data_) annots, seqs) push (InsideStockholm annots seqs) (EvGS sq feat data_) = continue (insertPA_GS sq (Ann (parseSeqFeature feat) data_) annots, seqs) push (InsideStockholm annots seqs) (EvGR sq feat data_) = continue (insertPA_GR sq (Ann (parseClmnFeature feat) data_) annots, seqs) continue (annots, seqs) = return (C.StateProducing (InsideStockholm annots seqs) []) {-# INLINE continue #-} data ParseDoc = LookingForHeader | InsideStockholm { pdAnnots :: {-# UNPACK #-} !PartialAnns , pdSeqs :: !(DiffMap B.ByteString L.ByteString) } -- | Glue everything into place, as the Stockholm format lets -- everything be everywhere and split in any number of parts. makeStockholm :: PartialAnns -> DiffMap B.ByteString L.ByteString -> Stockholm makeStockholm annots seqsDM = let fileAnns_ = finishAnn (paFileAnns annots) fileColAnns = finishAnn (paFileColAnns annots) seqAnns_ = finishSM (paSeqAnns annots) seqColAnns = finishSM (paSeqColAnns annots) stseqs = [StSeq (SeqLabel $ l sq) (SeqData dt) (f sq seqAnns_) (f sq seqColAnns) | (sq, dt) <- M.toList (finishDM id seqsDM)] where f = M.findWithDefault [] l = L.fromChunks . return in Stockholm fileAnns_ fileColAnns stseqs -- | Conduit that renders 'Stockholm'@s@ into 'Event'@s@. renderDoc :: Monad m => C.Conduit Stockholm m Event renderDoc = CL.concatMap toEvents where toEvents (Stockholm file clmn seqs) = (EvHeader:) $ toEventsFileAnns file $ toEventsSeqs seqs $ toEventsFileClmn clmn $ [EvEnd] toEventsFileAnns [] = id toEventsFileAnns (a:as) = (EvGF (showFileFeature $ feature a) (text a) :) . toEventsFileAnns as toEventsFileClmn [] = id toEventsFileClmn (a:as) = wrap (EvGC (showClmnFeature $ feature a)) (text a) . toEventsFileClmn as toEventsSeqs (StSeq (SeqLabel name) (SeqData seqd) sa ca : xs) = wrap (EvSeqData name') seqd . toEventsSeqAnns name' sa . toEventsSeqClmn name' ca . toEventsSeqs xs where name' = B.concat $ L.toChunks name toEventsSeqs [] = id toEventsSeqAnns _ [] = id toEventsSeqAnns n (a:as) = (EvGS n (showSeqFeature $ feature a) (text a) :) . toEventsSeqAnns n as toEventsSeqClmn _ [] = id toEventsSeqClmn n (a:as) = wrap (EvGR n (showClmnFeature $ feature a)) (text a) . toEventsSeqClmn n as wrap :: (L.ByteString -> b) -> L.ByteString -> [b] -> [b] wrap mk bs = case L.splitAt 70 bs of (x, "") -> (mk x :) (x, xs) -> (mk x :) . wrap mk xs