-- count homopolymer distributions in a fasta file module HPLCount where import Bio.Sequence import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.Map as M import Text.Printf import Data.Char (toUpper) import Data.List (intersperse) import Data.Maybe (fromJust) import System.Environment type HPLprob = Char -> Int -> Double -- lookup and return probability of this hpl length for this letter mkHPLprobs :: M.Map (Char,Int) Int -> HPLprob mkHPLprobs ft = let totals = [(c, fromIntegral $ sum [M.findWithDefault 0 (c,x) ft | x <- [0..50]]) | c <- "ACGT" ] in \ch n -> fromIntegral (M.findWithDefault 0 (ch,n) ft) / fromJust (lookup ch totals) main = do [f] <- getArgs display "tacg" . freqtable . concatMap (hpls "tacg" . seqdata) =<< readFasta f display :: [Char] -> M.Map (Char,Int) Int -> IO () display fs m = let imax = maximum $ map snd $ M.keys m header = concat ("len\t " : intersperse "\t " (map (return . toUpper) fs)) in putStrLn $ unlines (header : [ printf "%3d" i ++ concat [ printf "\t%5d" (M.findWithDefault 0 (c,i) m) | c <- map toUpper fs ] | i <- [0..imax]]) freqtable :: [(Char,Int)] -> M.Map (Char,Int) Int freqtable = go M.empty where go m [] = m go m (i:is) = let v = M.findWithDefault 0 i m v' = v+1 m' = M.insert i v' m in v' `seq` m' `seq` go m' is hpls :: [Char] -> SeqData -> [(Char,Int)] hpls xs = go (cycle xs') where xs' = map toUpper xs go (f:fs) sd | B.null sd = [] | not (elem (toUpper $ B.head sd) xs') = go (f:fs) (B.tail sd) | otherwise = let (this,rest) = B.span ((==f).toUpper) sd in (f,fromIntegral (B.length this)) : go fs rest