-- | Support for reading various data module Formats where import Text.Regex import Data.List (elemIndex) import Data.Map hiding (map) type LibTable = [(Regex,String)] -- | pattern and name type Cluster = (String,[String]) -- | name and list of reads -- -------------------------------------------------- -- | Classify a read according to the LibTable classify :: LibTable -> String -> String classify ps str = case concatMap (class1 str) ps of [] -> error ("no match for "++str++" in library table") [x] -> x s@(_:_) -> error ("multiple matches for "++str++": "++show s) where class1 st (r,s) = maybe [] (const [s]) (matchRegex r st) -- -------------------------------------------------- -- | Read a LibTable from a file of wsp-separated columns, first line is -- a header, and only columns "Pattern" and "Name" are used. readPatternTable :: FilePath -> IO LibTable readPatternTable f = do (h:ls) <- return . map words . lines =<< readFile f let (p,n) = case (elemIndex "Pattern" h, elemIndex "Name" h) of (Just x,Just y) -> (x,y) _ -> error ("Need both 'Pattern' and 'Name' headers in '"++f++"'") z l | length l < max p n = error ("Line in library table too short:\n"++show l) | otherwise = (mkRegex (l!!p),l!!n) return $ map z ls -- -------------------------------------------------- -- | Acquire total counts by library from a clustering totalsByLib :: LibTable -> FilePath -> IO (String,[Int]) totalsByLib lt f = do cs <- readClusters f let unify cs = [("total", concatMap snd cs)] case countClusters lt (unify cs) of [x] -> return x _ -> error "totalsByLib: this is impossible" -- count by classification countClusters :: LibTable -> [Cluster] -> [(String,[Int])] countClusters lt = map count1 where count1 (c,ss) = (c,map (\k -> findWithDefault 0 k (counts ss)) (map snd lt)) counts ss = fromListWith (+) $ zip (map (classify lt) ss) $ repeat 1 -- -------------------------------------------------- -- | Parse clusters readClusters :: FilePath -> IO [Cluster] readClusters f = readFile f >>= return . rc . lines where rc [] = [] rc [_] = error "odd number of cluster lines!" rc (c:ss:rest) = case head $ words c of ('>':name) -> (name,words ss) : rc rest _ -> error ("Cluster '"++c++"' didn't start with '>'")