-- Tests for alignments module Bio.Alignment.Test where import Control.Exception (bracket) import Control.Monad import Data.Char (isSpace) import Data.Ord (comparing) import System.Cmd import System.Directory import System.IO import System.IO.Unsafe import qualified Bio.Location.ContigLocation as CLoc import Bio.Location.OnSeq import qualified Bio.Location.SeqLocation as SeqLoc import Bio.Location.Strand import Bio.Sequence import Bio.Util.TestBase import Bio.Alignment.AlignData (showalign) import Bio.Alignment.Matrices as M import Test.QuickCheck import Bio.Alignment.QAlign as Q import Bio.Alignment.AAlign as A import Bio.Alignment.AlignData (toStrings, extractGaps, insertGaps) import Bio.Alignment.Multiple import qualified Bio.Alignment.Soap as Soap import Data.List (intersperse, sort, sortBy, nub) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BC -- default blastn parameters mx = Q.qualMx amx = M.blastn_default g = (-5,-2) tests :: [Test] -- .........o.........o.........o tests = [ T "gapped assembly invariant" prop_asm_gaps , T "global reverse" (prop_reverse_g mx g) , T "global reverse w/qual" (prop_reverse_g_qual mx g) , T "local reverse" (prop_reverse_l mx g) , T "local reverse w/qual" (prop_reverse_l_qual mx g) , T "global recovery" (prop_recover_g mx g) , T "global score <= local" (prop_global_local mx g) , T "global score <= overlap" (prop_global_overlap mx g) , T "overlap score <= local" (prop_overlap_local mx g) , T "overlap score = align" (prop_overlap_score_align mx g) , T "s == s => equal scores" (prop_equal_scores mx g) -- should give same result without n's(?) , T "global aalign = qualign" prop_AAlign_QAlign_g , T "local aalign = qualign" prop_AAlign_QAlign_l -- are these really correct? , T "decr qual => decr score (l)" (prop_quality_dec_l mx (-100,-100)) , T "decr qual => decr score (g)" (prop_quality_dec_g mx (-100,-100)) , T "indirect recover (g)" prop_indirect_recover , T "soap parsing" prop_SoapAlign_invertparse ] -- pretty ugly, to ensure the gaps are correctly generated prop_asm_gaps :: (String,[Int]) -> Bool prop_asm_gaps (s,gs) = let (str,gaps) = (BC.pack $ filter (/='-') $ filter (/='*') s , nub $ takeWhile (<= BC.length str) $ dropWhile (<0) $ map fromIntegral $ sort gs) in (str,gaps) == (extractGaps . insertGaps '*' $ (str,gaps)) -- these aren't entirely equivalent, as the different column generators or selector -- sometimes make different, but equally scoring, choices. Should be fixed, but in -- the meantime, checking scores to within an epsilot should be okay. -- (May fail if two n's are aligned?) prop_AAlign_QAlign_g (E s1) (E s2) = let qmx = qualMx 22 22 a = (A.global_align qmx g s1 s2) q = (Q.global_align mx g s1 s2) in if abs (fst a - fst q) < 0.001 then True else error ("\n"++show (fst a)++"\n"++ showalign (snd a) ++"\n"++show (fst q) ++ "\n"++ showalign (snd q)) prop_AAlign_QAlign_l (E s1) (E s2) = let qmx = qualMx 22 22 a = (A.local_align qmx g s1 s2) q = (Q.local_align mx g s1 s2) in if abs (fst a - fst q) < 0.001 then True else error ("\n"++show (fst a)++"\n"++ showalign (snd a) ++"\n"++show (fst q) ++ "\n"++ showalign (snd q)) -- Check reverse without quality values prop_reverse_g mx g (E s1) (E s2) = abs (Q.global_score mx g s1 s2 - Q.global_score mx g (revcompl s1) (revcompl s2)) < 0.1 prop_reverse_l mx g (E s1) (E s2) = abs (Q.local_score mx g s1 s2 - Q.local_score mx g (revcompl s1) (revcompl s2)) < 0.1 -- Check reverse with quality prop_reverse_g_qual mx g (Eq s1) (Eq s2) = abs (Q.global_score mx g s1 s2 - Q.global_score mx g (revcompl s1) (revcompl s2)) < 0.1 prop_reverse_l_qual mx g (Eq s1) (Eq s2) = abs (Q.local_score mx g s1 s2 - Q.local_score mx g (revcompl s1) (revcompl s2)) < 0.1 -- Check that alignments are produced from the correct sequences prop_recover_g mx g (Eq s1) (Eq s2) = let (s1',s2') = toStrings $ snd $ Q.global_align mx g s1 s2 in filter (/='-') s1' == toStr (seqdata s1) && filter (/='-') s2' == toStr (seqdata s2) -- global <= overlap <= local -- Global score never exceeds optimal local score prop_global_local mx g (Eq s1) (Eq s2) = Q.global_score mx g s1 s2 <= Q.local_score mx g s1 s2 -- Global score never exceeds optimal overlap score prop_global_overlap mx g (Eq s1) (Eq s2) = Q.global_score mx g s1 s2 <= Q.overlap_score mx g s1 s2 -- Overlap score never exceeds optimal local score prop_overlap_local mx g (Eq s1) (Eq s2) = Q.overlap_score mx g s1 s2 <= Q.local_score mx g s1 s2 prop_overlap_score_align mx g (Eq s1) (Eq s2) = abs (Q.overlap_score mx g s1 s2 - fst (Q.overlap_align mx g s1 s2)) < 0.1 prop_equal_scores mx g (Eq s1) = let qg = Q.global_score mx g s1 s1 ql = Q.local_score mx g s1 s1 qo = Q.overlap_score mx g s1 s1 in qg == ql && ql == qo -- Sinking score with sinking quality - local (same as global) prop_quality_dec_l mx g (E (Seq h d _)) = let q10 = Just $ B.map (const 10) d q20 = Just $ B.map (const 20) d in Q.local_score mx g (Seq h d q10) (Seq h d q10) <= Q.local_score mx g (Seq h d q20) (Seq h d q20) -- Sinking score with sinking quality - global -- NB: score is always positive (aligning seq. with itself) prop_quality_dec_g mx g (E (Seq h d _)) = let q10 = Just $ B.map (const 10) d q20 = Just $ B.map (const 20) d in Q.global_score mx g (Seq h d q10) (Seq h d q10) <= Q.global_score mx g (Seq h d q20) (Seq h d q20) -- Sinking avg score with sinking quality -- This is not correct, low quality reduces the penalty for mismatch -- much harder than the reward for a match. -- (E.g. mmmxmmm may score higher with poor quality) prop_quality_dec2 mx g (E (Seq h d _)) (E (Seq h' d' _)) = let q10 = Just $ B.map (const 10) d q20 = Just $ B.map (const 20) d (s1,a1) = Q.local_align mx g (Seq h d q10) (Seq h' d' q10) (s2,a2) = Q.local_align mx g (Seq h d q20) (Seq h' d' q20) in if null a1 && null a2 then True else s1 / fromIntegral (length a1) <= s2 / fromIntegral (length a2) -- not expected to pass(?), but useful to debug indirect alignments prop_indirect (E s1) (E s2) (E s3) = let a1 = snd $ A.global_align amx g s1 s2 a2 = snd $ A.global_align amx g s2 s3 a3 = snd $ A.global_align amx g s1 s3 i1 = indirect a1 a2 in if a3 == i1 then True else error ("\n"++unlines [showalign a1, showalign a2,"",showalign a3,"",showalign i1]) prop_indirect_recover (E s1) (E s2) (E s3) = let a1 = snd $ A.global_align amx g s1 s2 a2 = snd $ A.global_align amx g s2 s3 i1 = indirect a1 a2 (s1',s3') = toStrings i1 f = filter (/= '-') in if f s1' == toStr (seqdata s1) && f s3' == toStr (seqdata s3) then True else error ("\n"++(unlines [showalign a1, showalign a2,"",showalign i1])) {- -- | Affine = Simple when go == ge prop_affine1 s1 s2 = S.global_align mx g s1 s2 == A.global_align mx (g,g) s1 s2 prop_affine2 s1 s2 = S.global_score mx g s1 s2 == A.global_score mx (g,g) s1 s2 -} {- -- | For a given scoring scheme, scoring must correspond to alignments prop_score1 mx g (Eq s1) (Eq s2) = sum . S.eval mx g . snd . S.global_align mx g s1 $ s2 == S.global_score mx g s1 s2 -- -> todo: ditto for A and Q, ditto for local -} {- -- | Any global alignment should return the exact same sequences prop_recover mx g s1 s2 = undefined -- | Qual = Affine when no quality data are supplied prop_qual s1 s2 = undefined prop_local s1 s2 = S.global_score mx g s1 s2 <= S.local_score mx g s1 s2 -- -> todo: ditto for A and Q -- | Verify that all columns have the same lenght (ie. it should be a square matrix) -- (Check vs sequence lengths) Both QAlign.columns and AlignData.columns prop_columns = undefined -} instance Arbitrary Soap.SoapAlign where arbitrary = elements [18..36] >>= genSoapAlign genSeqName :: Gen BC.ByteString genSeqName = liftM (BC.pack . filter (not . isSpace)) $ sized $ vector . (+ 1) genSoapAlign :: Offset -> Gen Soap.SoapAlign genSoapAlign len = do name <- genSeqName sequ <- liftM (fromStr . map fromN) $ vector $ fromIntegral len qual <- liftM (B.pack . map fromQ) $ vector $ fromIntegral len let nhit = 1 let pairend = 'a' strand <- elements [Fwd, RevCompl] refname <- genSeqName refstart <- genOffset nmismatch <- choose (0, 3) mismatches <- genMismatches nmismatch sequ qual return $ Soap.SA name sequ qual nhit pairend len strand refname refstart nmismatch mismatches genMismatches :: Int -> SeqData -> QualData -> Gen [Soap.SoapAlignMismatch] genMismatches nmismatch sequ qual = shuffle [0..(BC.length sequ - 1)] >>= mapM genMismatchAt . take nmismatch where shuffle :: [a] -> Gen [a] shuffle = liftM (map fst . sortBy (comparing snd)) . mapM prioritize prioritize :: a -> Gen (a, Double) prioritize x = liftM ((,) x) $ choose (0.0, 1.0) genMismatchAt :: Offset -> Gen Soap.SoapAlignMismatch genMismatchAt off = let readnt = BC.index sequ off qualnt = B.index qual off in do refnt <- elements $ filter (/= readnt) "acgt" return $ Soap.SAM readnt refnt off qualnt prop_SoapAlign_invertparse :: Soap.SoapAlign -> Bool prop_SoapAlign_invertparse sa = let saLine = Soap.unparse sa saErr :: Either String Soap.SoapAlign saErr = Soap.parse saLine in case saErr of (Right sa') -> let saLine' = Soap.unparse sa' in (sa == sa') && (saLine == saLine') (Left x) -> error x genNumberedEsts :: Gen EST_set genNumberedEsts = liftM (ESet . zipWith numberedSequence [1..]) $ replicateM numEsts genEstSeqData where numberedSequence z s = Seq (BC.pack $ "seq" ++ show z) s Nothing numEsts = 100 genEstSeqData = liftM BC.pack $ choose (200, 1000) >>= flip replicateM (elements "ACGT") genReadCSeqLoc :: Offset -> Sequence t -> Gen SeqLoc.ContigSeqLoc genReadCSeqLoc len (Seq seqName seqSeq _) = do strand <- elements [Fwd, RevCompl] offset5 <- liftM fromIntegral $ chooseInteger (0, fromIntegral $ BC.length seqSeq - len) return $ OnSeq seqName $ CLoc.ContigLoc offset5 len strand where chooseInteger :: (Integer, Integer) -> Gen Integer chooseInteger = choose genReadSequence :: Offset -> Sequence t -> Gen (Sequence t) genReadSequence len sequ@(Seq _ seqSeq _) = do rloc <- genReadCSeqLoc len sequ return $ Seq (readName rloc) (readSeq rloc) (readQual rloc) where readName (OnSeq targName (CLoc.ContigLoc targOffset5 targLen targStrand)) = BC.intercalate (BC.singleton '_') [ targName , BC.pack $ show targOffset5 , BC.pack $ show targLen , BC.pack $ show targStrand ] readSeq (OnSeq _ cloc) = either error id $ CLoc.seqData seqSeq cloc readQual rloc = Just $ B.replicate (BC.length $ readSeq rloc) (fromIntegral 40) genEstReadSequence :: Offset -> EST_set -> Gen (Sequence Nuc) genEstReadSequence len (ESet seqs) = elements seqs >>= genReadSequence len genEstReads :: Offset -> EST_set -> Gen [Sequence Nuc] genEstReads len eset = choose readNumRange >>= flip replicateM (genEstReadSequence len eset) where readNumRange = (5, 50) runSoap :: EST_set -> [Sequence Nuc] -> IO [Soap.SoapAlign] runSoap (ESet refSeqs) readSeqs = withTempFile "/tmp" "soap-refs-XXXXXX.txt" $ \(refFile, hRef) -> withTempFile "/tmp" "soap-reads-XXXXXX.txt" $ \(readFile, hRead) -> withTempFile "/tmp" "soap-aligns-XXXXXX.txt" $ \(alignFile, hAlign) -> do hWriteFasta hRef refSeqs >> hClose hRef hWriteFastQ hRead readSeqs >> hClose hRead hClose hAlign rawSystem soapExe [ "-d", refFile, "-a", readFile, "-r", "2", "-v", "3", "-o", alignFile ] liftM (map parseSoapLine . BC.lines) $ BC.readFile alignFile where parseSoapLine = either error id . Soap.parse soapExe = "/home/ingolia/Prog/extsrc/soap_1.10/soap.contig" withTempFile :: FilePath -> String -> ((FilePath,Handle) -> IO r) -> IO r withTempFile name tmp = bracket (openTempFile name tmp) (\(f,h) -> hClose h >> removeFile f) verifySoap :: [Soap.SoapAlign] -> Sequence t -> Bool verifySoap aligns (Seq readName _ _) = case BC.split '_' readName of [refName, offset5Str, lenStr, strandStr] -> let offset5 = read $ BC.unpack offset5Str len = read $ BC.unpack lenStr strand = read $ BC.unpack strandStr cLoc = CLoc.ContigLoc offset5 len strand readLoc = OnSeq refName cLoc in any ((== readLoc) . Soap.refCSeqLoc) aligns _ -> False prop_soap_in_vivo :: Property prop_soap_in_vivo = forAll genNumberedEsts $ \refSeqs -> forAll (genEstReads 36 refSeqs) $ \readSeqs -> let aligns = unsafePerformIO $ runSoap refSeqs readSeqs in collect (length aligns - length readSeqs) $ all (verifySoap aligns) readSeqs bench :: [Test] -- .........o.........o.........o bench = [ T "local score, short" bench_local_rev_s , T "overlap score, short" bench_overlap_rev_s , T "global score, short" bench_global_rev_s , T "local score, long" bench_local_rev_l , T "overlap score, long" bench_overlap_rev_l , T "global score, long" bench_global_rev_l ] bench_global_rev_s (ES x) (ES y) = abs (Q.global_score mx g x y - Q.global_score mx g (revcompl x) (revcompl y)) < 0.1 bench_local_rev_s (ES x) (ES y) = abs (Q.local_score mx g x y - Q.local_score mx g (revcompl x) (revcompl y)) < 0.1 bench_overlap_rev_s (ES x'@(Seq h d _)) (ES y'@(Seq h2 d2 _)) = let x = Seq h d Nothing y = Seq h2 d2 Nothing f = Q.overlap_align mx g x y r = Q.overlap_align mx g (revcompl x) (revcompl y) in if abs (fst f-fst r) < 0.1 then True else error ("\nFwd: "++show (fst f)++"\n"++ showalign (snd f) ++"\nRev: "++show (fst r)++"\n"++ showalign (snd r) ++"\n"++toStr (seqdata x)++"\n"++(toStr $ seqdata y)) bench_overlap_rev_l (EL x) (EL y) = abs (Q.overlap_score mx g x y - Q.overlap_score mx g (revcompl x) (revcompl y)) < 0.1 bench_global_rev_l (EL x) (EL y) = abs (Q.global_score mx g x y - Q.global_score mx g (revcompl x) (revcompl y)) < 0.1 bench_local_rev_l (EL x) (EL y) = abs (Q.local_score mx g x y - Q.local_score mx g (revcompl x) (revcompl y)) < 0.1