\section{MaskCount} Examine each nucleotide from two data sets, and count the Ns (or Xs?). Separate into A AB B 0. Also (optionally?) count masked region sizes to generate gnuplot'able histograms. TODO: - parameter to specify lower case/N/X as masked - justify columns in output - output percentages \begin{code} {-# LANGUAGE BangPatterns, FlexibleContexts #-} module Main where import Bio.Sequence import Lib.TextFormat import Control.Monad (when) import Data.Char (isLower) import Data.List (foldl',intersperse,partition) import System.Environment (getArgs) import System.Exit (exitWith,ExitCode(..)) import Text.Printf (printf, PrintfType) main :: IO () main = do (opts,[a,b]) <- parseArgs as <- readFasta a bs <- readFasta b let res = zipComp as bs when (elem 'l' opts) (do putStrLn ("# fraction of masked positions: "++a++" both "++b++"none") putStrLn $ unlines $ map printres res) when (elem 's' opts) (do putStrLn "# Summary:" putStrLn $ printSum a b $ foldl' add (CS 0 0 0 0) (map snd res)) parseArgs :: IO (String, [String]) parseArgs = do as <- getArgs let (opts,files) = partition ((=='-').head) as when (length files /= 2) usage let short = concat $ map tail opts return (short,files) usage :: IO a usage = do putStrLn "mc: compare two FASTA files for masked nucleotides" putStrLn "usage: mc [-s|-l] file_1 file_2" exitWith $ ExitFailure 1 data CompStruct = CS !Int !Int !Int !Int add :: CompStruct -> CompStruct -> CompStruct add (CS a ab b o) (CS a' ab' b' o') = CS (a+a') (ab+ab') (b+b') (o+o') -- construct the percentage similar nucleotides printres :: (String, CompStruct) -> String printres (s,CS a ab b o) = let tot = a+ab+b+o in concat $ intersperse " " $ s : map (percent tot) [a,ab,b,o] printSum :: String -> String -> CompStruct -> String printSum a b (CS ca cab cb co) = unlines (["Summary for "++a++" vs. "++b++":\n"] ++ columns " " [ map (" "++) ["unmasked in both:","masked in both:" ,"masked in "++a++":","masked in "++b++":"] , map show [co, cab, ca, cb] , map (paren . percent tot) [co,cab,ca,cb ]]) ++ "Pearson's phi: "++printf "%.3f" (phi :: Double) where tot = co+cab+ca+cb phi = let [a,b,ab,o] = map ((/(fromIntegral tot)) . fromIntegral) [ca,cb,cab,co] in (ab*o-a*b)/sqrt((ab+a)*(ab+b)*(a+o)*(b+o)) paren :: String -> String paren s = "("++s++"%)" percent :: (PrintfType (Double -> t), Integral a, Integral a1) => a1 -> a -> t percent tot x = printf "%.2f" (100*fromIntegral x/fromIntegral tot :: Double) zipComp :: [Sequence a] -> [Sequence a] -> [(String,CompStruct)] zipComp as bs = map (z1 (CS 0 0 0 0)) $ zipSeqs as bs z1 :: CompStruct -> (Sequence a, Sequence a) -> (String, CompStruct) z1 cs (a,b) = (toStr $ seqlabel a, foldl' collect cs $ zip (get a) (get b)) where get s = map isLower $ map (s!) [0..seqlength s-1] -- s/isLower/isN/ for N detect. zipSeqs :: [Sequence a] -> [Sequence a] -> [(Sequence a,Sequence a)] zipSeqs [] [] = [] zipSeqs (a:as) (b:bs) = if seqlabel a == seqlabel b then (a,b):zipSeqs as bs else error ("sequence sets differ (offending sequences: '" ++ toStr (seqlabel a) ++ "' and '" ++ toStr (seqlabel b) ++"'.") zipSeqs _ _ = error "extraneous sequences" collect :: CompStruct -> (Bool, Bool) -> CompStruct collect (CS a ab b o) (True,True) = (CS a (ab+1) b o) collect (CS a ab b o) (True,False) = (CS (a+1) ab b o) collect (CS a ab b o) (False,True) = (CS a ab (b+1) o) collect (CS a ab b o) (False,False) = (CS a ab b (o+1)) \end{code}