module Bio.Alignment.Bowtie (
Align(..), Mismatch(..), length, nmismatch, querySequ, queryQual
, refCLoc, refCSeqLoc, refSeqLoc, refSeqPos, mismatchSeqPos
, parse
, sameRead
)
where
import Prelude hiding (length)
import Control.Monad.Error
import Control.Monad.State
import qualified Data.ByteString.Lazy as LBSW
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.Char
import qualified Data.List as List (length)
import Data.Maybe
import qualified Bio.Location.ContigLocation as CLoc
import qualified Bio.Location.Location as Loc
import qualified Bio.Location.Position as Pos
import Bio.Location.OnSeq
import qualified Bio.Location.SeqLocation as SeqLoc
import Bio.Location.Strand
import Bio.Sequence.SeqData
data Align = Align { name :: !SeqName
, strand :: !Strand
, refname :: !SeqName
, leftoffset :: !Offset
, sequ :: !SeqData
, qual :: !QualData
, mismatches :: ![Mismatch]
} deriving (Read, Show, Eq, Ord)
length :: Align -> Offset
length = LBS.length . sequ
nmismatch :: Align -> Int
nmismatch = List.length . mismatches
parse :: LBS.ByteString -> Either String Align
parse bstr
= case LBS.split '\t' bstr of
[nameBStr,strandBStr,refnameBStr,leftoffBStr,sequBStr,qualBStr,_,mismatchesBStr]
-> do str <- parseStrand strandBStr
loff <- parseInt64 leftoffBStr
mms <- mapM parseMismatch . LBS.split (',') $ mismatchesBStr
return $ Align (LBS.copy nameBStr) str (LBS.copy refnameBStr) loff (LBS.copy sequBStr) (LBS.copy qualBStr) mms
_ -> throwError $ strMsg $ "Malformed Bowtie alignment " ++ (show . LBS.unpack) bstr
where parseStrand str | str == (LBS.singleton '+') = return Fwd
| str == (LBS.singleton '-') = return RevCompl
| otherwise = throwError $ "Unknown strand " ++ (show . LBS.unpack $ str)
querySequ :: Align -> SeqData
querySequ ba = stranded (strand ba) (sequ ba)
queryQual :: Align -> QualData
queryQual ba = case strand ba of
Fwd -> qual ba
RevCompl -> LBSW.reverse $ qual ba
refSeqPos :: Align -> SeqLoc.SeqPos
refSeqPos ba = OnSeq (refname ba) $ CLoc.startPos $ refCLoc ba
refCSeqLoc :: Align -> SeqLoc.ContigSeqLoc
refCSeqLoc ba = OnSeq (refname ba) (refCLoc ba)
refCLoc :: Align -> CLoc.ContigLoc
refCLoc ba = CLoc.ContigLoc (leftoffset ba) (length ba) (strand ba)
refSeqLoc :: Align -> SeqLoc.SeqLoc
refSeqLoc ba = OnSeq (refname ba) (Loc.Loc [ refCLoc ba ])
sameRead :: Align -> Align -> Bool
sameRead ba1 ba2 = (name ba1) == (name ba2)
data Mismatch = Mismatch { mmoffset :: !Offset
, refbase :: !Char
, readbase :: !Char
} deriving (Read, Show, Eq, Ord)
mismatchSeqPos :: Align -> Mismatch -> SeqLoc.SeqPos
mismatchSeqPos ba mm = OnSeq (refname ba) mmpos
where mmpos = fromMaybe badOffset $ (Pos.Pos (mmoffset mm) Fwd) `CLoc.posOutof` (refCLoc ba)
badOffset = error $ "Bad mismatch offset: " ++ show (ba, mm)
parseMismatch :: LBS.ByteString -> Either String Mismatch
parseMismatch = evalStateT parser
where parser = liftM3 Mismatch parseOffset parseRefNt parseReadNt
parseOffset = parseTo ':' >>= lift . parseInt64
parseRefNt = parseTo '>' >>= parseChar
parseReadNt = parseToEnd >>= parseChar
parseTo ch = StateT $ \str ->
case LBS.span (/= ch) str of
(out, rest) -> do (_, afterCh) <- maybe missingChError Right . LBS.uncons $ rest
return (out, afterCh)
where missingChError = Left ("Failed to find " ++ show ch)
parseToEnd = StateT $ \str -> return (str, LBS.empty)
parseChar str = lift $ case LBS.uncons str of
Just (ch, rest) | LBS.null rest -> return ch
_ -> Left ("Malformed character " ++ (show . LBS.unpack) str)
parseInt64 :: LBS.ByteString -> Either String Offset
parseInt64 zstr = case LBS.readInteger zstr of
Just (z, rest) | LBS.null rest -> return $ fromIntegral z
_ -> throwError $ "Malformed integer " ++ show zstr