{-| This module provides a data type to represent an alignment produced by the Bowtie short-read alignment tool (see ). The simple accessors recapitulate the details of the Bowtie alignment output. The position of the alignment is given by the \"0-based offset into the reference sequence where leftmost character of the alignment occurs\". Thus, for forward-strand alignments this is the 5\' end of the query sequence while for reverse-complement alignments this is the 3\' end of the query sequence. Similarly, the query sequence and query quality are shown in reference forward strand orientation, and thus may be reverse complemented. -} module Bio.Alignment.Bowtie ( -- * Data type and basic accessors Align(..), Mismatch(..), length, nmismatch, querySequ, queryQual -- * Sequence positions of alignments , refCLoc, refCSeqLoc, refSeqLoc, refSeqPos, mismatchSeqPos -- * Parsing Bowtie output , parse -- * Other utilities , sameRead ) where import Prelude hiding (length) import Control.Monad.Error import Control.Monad.State import qualified Data.ByteString.Lazy as LBSW import qualified Data.ByteString.Lazy.Char8 as LBS import Data.Char import qualified Data.List as List (length) import Data.Maybe import qualified Bio.Location.ContigLocation as CLoc import qualified Bio.Location.Location as Loc import qualified Bio.Location.Position as Pos import Bio.Location.OnSeq import qualified Bio.Location.SeqLocation as SeqLoc import Bio.Location.Strand import Bio.Sequence.SeqData data Align = Align { name :: !SeqName -- ^ Name of the query sequence , strand :: !Strand -- ^ Strand of the alignment on the reference sequence , refname :: !SeqName -- ^ Name of the reference sequence , leftoffset :: !Offset -- ^ Zero-based offset of the left-most aligned position in the reference , sequ :: !SeqData -- ^ Query sequence, in the reference forward strand orientation , qual :: !QualData -- ^ Query quality, in the reference forward strand orientation , mismatches :: ![Mismatch] -- ^ Mismatches } deriving (Read, Show, Eq, Ord) -- | Returns the length of the query sequence length :: Align -> Offset length = LBS.length . sequ -- | Returns the number of mismatches in the alignment nmismatch :: Align -> Int nmismatch = List.length . mismatches -- | Parses a line of Bowtie output to produce a 'Align' parse :: LBS.ByteString -> Either String Align parse bstr = case LBS.split '\t' bstr of [nameBStr,strandBStr,refnameBStr,leftoffBStr,sequBStr,qualBStr,_,mismatchesBStr] -> do str <- parseStrand strandBStr loff <- parseInt64 leftoffBStr mms <- mapM parseMismatch . LBS.split (',') $ mismatchesBStr return $ Align (LBS.copy nameBStr) str (LBS.copy refnameBStr) loff (LBS.copy sequBStr) (LBS.copy qualBStr) mms _ -> throwError $ strMsg $ "Malformed Bowtie alignment " ++ (show . LBS.unpack) bstr where parseStrand str | str == (LBS.singleton '+') = return Fwd | str == (LBS.singleton '-') = return RevCompl | otherwise = throwError $ "Unknown strand " ++ (show . LBS.unpack $ str) -- | Query sequence as given in the query file querySequ :: Align -> SeqData querySequ ba = stranded (strand ba) (sequ ba) -- | Query quality as given in the query file queryQual :: Align -> QualData queryQual ba = case strand ba of Fwd -> qual ba RevCompl -> LBSW.reverse $ qual ba -- | Returns the sequence position of the start of the query sequence -- alignment. This will include the strand of the alignment and will -- not be the same as the position computed from 'leftoffset' when the -- alignment is on the reverse complement strand. refSeqPos :: Align -> SeqLoc.SeqPos refSeqPos ba = OnSeq (refname ba) $ CLoc.startPos $ refCLoc ba -- | Returns the sequence location covered by the query in -- the alignment. This will be a sequence location on the reference -- sequence and may run on the forward or the reverse complement -- strand. refCSeqLoc :: Align -> SeqLoc.ContigSeqLoc refCSeqLoc ba = OnSeq (refname ba) (refCLoc ba) -- | As 'refCSeqLoc' but without the reference sequence name. refCLoc :: Align -> CLoc.ContigLoc refCLoc ba = CLoc.ContigLoc (leftoffset ba) (length ba) (strand ba) -- | Returns the sequence location covered by the query, as -- 'refCSeqLoc', as a 'SeqLoc.SeqLoc' location. refSeqLoc :: Align -> SeqLoc.SeqLoc refSeqLoc ba = OnSeq (refname ba) (Loc.Loc [ refCLoc ba ]) -- | Returns true when two alignments were derived from the same -- sequencing read. As Bowtie writes alignments of query sequences in -- their order in the query file, all alignments of a given read are -- grouped together and the lists of all alignments for each read can -- be gathered with -- -- > groupBy sameRead sameRead :: Align -> Align -> Bool sameRead ba1 ba2 = (name ba1) == (name ba2) -- | Representation of a single mismatch in a bowtie alignment data Mismatch = Mismatch { mmoffset :: !Offset -- ^ Offset of the mismatch site from the 5\' end of the query , refbase :: !Char -- ^ Reference nucleotide , readbase :: !Char -- ^ Query nucleotide } deriving (Read, Show, Eq, Ord) -- | Sequence position of a mismatch on the reference sequence. mismatchSeqPos :: Align -> Mismatch -> SeqLoc.SeqPos mismatchSeqPos ba mm = OnSeq (refname ba) mmpos where mmpos = fromMaybe badOffset $ (Pos.Pos (mmoffset mm) Fwd) `CLoc.posOutof` (refCLoc ba) badOffset = error $ "Bad mismatch offset: " ++ show (ba, mm) parseMismatch :: LBS.ByteString -> Either String Mismatch parseMismatch = evalStateT parser where parser = liftM3 Mismatch parseOffset parseRefNt parseReadNt parseOffset = parseTo ':' >>= lift . parseInt64 parseRefNt = parseTo '>' >>= parseChar parseReadNt = parseToEnd >>= parseChar parseTo ch = StateT $ \str -> case LBS.span (/= ch) str of (out, rest) -> do (_, afterCh) <- maybe missingChError Right . LBS.uncons $ rest return (out, afterCh) where missingChError = Left ("Failed to find " ++ show ch) parseToEnd = StateT $ \str -> return (str, LBS.empty) parseChar str = lift $ case LBS.uncons str of Just (ch, rest) | LBS.null rest -> return ch _ -> Left ("Malformed character " ++ (show . LBS.unpack) str) parseInt64 :: LBS.ByteString -> Either String Offset parseInt64 zstr = case LBS.readInteger zstr of Just (z, rest) | LBS.null rest -> return $ fromIntegral z _ -> throwError $ "Malformed integer " ++ show zstr