-- clusterLibs -- calculate clusters by library, using the lib table -- read patterns from a table, first line is header module Main where import System.Environment (getArgs) import Data.List (intersperse) import Numeric (showFFloat) import Statistics import Formats (LibTable, Cluster , readPatternTable, totalsByLib, readClusters, countClusters, classify) main :: IO () main = do [ps,cs] <- do [xs,ys] <- getArgs return [xs,ys] `catch` error "Usage: clusterlibs " pat <- readPatternTable ps counts <- totalsByLib pat cs writeTable [("","sum":map snd pat++["significance"])] writeTable . map (decorate counts) . countClusters pat =<< readClusters cs -- | Using count by library, takes a cluster, calculates significance scores -- and formats it for output. decorate :: (String,[Int]) -> (String,[Int]) -> (String,[String]) decorate counts (x,ys) = let fractions = [ fromIntegral x / fromIntegral (sum $ snd counts) | x <- snd counts] clustersize = sum ys significance = unwords $ zipWith isSignificant ys fractions isSignificant obs frac = showFFloat (Just 3) (pvalue (1-frac) clustersize (clustersize-obs)) "" in (x,map show (clustersize:ys) ++ [significance]) -- -------------------------------------------------- -- | Calculate the p value of observing k sequences -- from a library, given an a priori background distribution. pvalue :: Double -> Int -> Int -> Double pvalue fraction tot obs = cumbin fraction tot obs -- -------------------------------------------------- -- will need to match against all, to check for multiple matches -- tag names with library classClusters :: LibTable -> FilePath -> IO [Cluster] classClusters ps f = return . map class1 =<< readClusters f where class1 (l,ss) = (l, map (\s -> classify ps s++":"++s) ss) -- | Output the clusters to stdout writeTable :: [(String,[String])] -> IO () writeTable = putStrLn . unlines . map show1 where show1 (name,stuff) = concat $ intersperse "\t" (name:stuff)