{- Parse
Gregory W. Schwartz

Collects the functions pertaining to the parsing of the abundance output.
-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}

module Parse
    ( getAbundanceMap
    , getFrequencyMap
    , getAccSet
    , parseDuplications
    , parseBAM
    ) where

-- Standard
import Data.Char
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set

-- Cabal
import Control.Lens
import Data.Csv
import Data.Text.Read
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Text as T
import qualified Data.Vector as V
import Safe

-- Local
import Types
import Utility

-- | Get the abundance map from the contents of the abundance output.
--
-- The input is decoded as a tab-delimited table with a header row. Each row
-- contributes one entry keyed by its @target_id@ column, whose value is the
-- @est_counts@ column parsed as a 'Double'. When the same @target_id@ occurs
-- more than once, the first row wins ('Map.unions' is left-biased).
--
-- Calls 'error' if the table cannot be decoded or if an @est_counts@ field
-- does not start with a valid floating point number.
getAbundanceMap :: B.ByteString -> AbundanceMap
getAbundanceMap = AbundanceMap
                . Map.unions
                . fmap (\ !m -> Map.singleton
                                    (findWithError ("target_id" :: T.Text) m)
                                    (getDouble . findWithError ("est_counts" :: T.Text) $ m)
                       )
                . V.toList
                . either error snd
                . decodeByNameWith decodeOpts
  where
    -- The abundance output is tab separated rather than comma separated.
    decodeOpts = defaultDecodeOptions { decDelimiter = fromIntegral (ord '\t') }
    -- Only the parsed number is kept; any unparsed remainder is discarded.
    getDouble  = either error fst . double

-- | Convert an abundance map to a frequency map.
--
-- Every count is divided by the sum of all counts in the map. The sum is
-- accumulated with a strict left fold so no chain of thunks is built for
-- large maps.
getFrequencyMap :: AbundanceMap -> FrequencyMap
getFrequencyMap (AbundanceMap m) = FrequencyMap . Map.map (/ totalCount) $ m
  where
    totalCount = Map.foldl' (+) 0 m

-- | Get the list of accessions from a duplication row post-Trinity.
--
-- The accession is taken to be the second @|@-separated field (index 1) of
-- each row's @fHeader@ column. Calls 'error' with a descriptive note if a
-- header has fewer than two such fields.
getAccSet :: [DuplicationRow] -> AccSet
getAccSet = AccSet
          . Set.fromList
          . fmap ( flip (Safe.atNote "getAccSet: fHeader has no second '|'-separated field") 1
                 . T.splitOn "|"
                 . findWithError "fHeader"
                 . unDuplicationRow
                 )

-- | Get the duplications in an easy to read format.
--
-- Decodes the input as a table with a header row and returns that header
-- together with every record wrapped in 'DuplicationRow'. Calls 'error' if
-- decoding fails.
parseDuplications :: B.ByteString -> (Header, [DuplicationRow])
parseDuplications = over _2 (fmap DuplicationRow . V.toList)
                  . either error id
                  . decodeByName

-- | Parse bam rows.
--
-- Splits the text into lines and each line into its tab-separated fields.
parseBAM :: T.Text -> [BamRow]
parseBAM = fmap (BamRow . T.splitOn "\t") . T.lines