{-| This module provides a data type to represent an alignment
produced by the Bowtie short-read alignment tool (see
<http://bowtie-bio.sourceforge.net/index.shtml>).

The simple accessors recapitulate the details of the Bowtie alignment
output.  The position of the alignment is given by the \"0-based offset
into the reference sequence where leftmost character of the alignment
occurs\".  Thus, for forward-strand alignments this is the 5\' end of
the query sequence while for reverse-complement alignments this is the
3\' end of the query sequence.  Similarly, the query sequence and query
quality are shown in reference forward strand orientation, and thus
may be reverse complemented.

-}

module Bio.Alignment.Bowtie ( -- * Data type and basic accessors
                              Align(..), Mismatch(..), length, nmismatch, querySequ, queryQual

                            -- * Sequence positions of alignments
                            , refCLoc, refCSeqLoc, refSeqLoc, refSeqPos, mismatchSeqPos

                            -- * Parsing Bowtie output
                            , parse

                            -- * Other utilities
                            , sameRead
                            )
    where 

import Prelude hiding (length)
import Control.Monad.Error
import Control.Monad.State
import qualified Data.ByteString.Lazy as LBSW
import qualified Data.ByteString.Lazy.Char8 as LBS
import Data.Char
import qualified Data.List as List (length)
import Data.Maybe

import qualified Bio.Location.ContigLocation as CLoc
import qualified Bio.Location.Location as Loc
import qualified Bio.Location.Position as Pos
import Bio.Location.OnSeq
import qualified Bio.Location.SeqLocation as SeqLoc
import Bio.Location.Strand
import Bio.Sequence.SeqData

data Align = Align { name :: !SeqName -- ^ Name of the query sequence
                   , strand :: !Strand -- ^ Strand of the alignment on the reference sequence
                   , refname :: !SeqName -- ^ Name of the reference sequence
                   , leftoffset :: !Offset -- ^ Zero-based offset of the left-most aligned position in the reference
                   , sequ :: !SeqData -- ^ Query sequence, in the reference forward strand orientation
                   , qual :: !QualData -- ^ Query quality, in the reference forward strand orientation
                   , mismatches :: ![Mismatch] -- ^ Mismatches
                   } deriving (Read, Show, Eq, Ord)

-- | Returns the length of the query sequence
length :: Align -> Offset
length = LBS.length . sequ

-- | Returns the number of mismatches in the alignment
nmismatch :: Align -> Int
nmismatch = List.length . mismatches

-- | Parses a line of Bowtie output to produce a 'Align'
parse :: LBS.ByteString -> Either String Align
parse bstr
    = case LBS.split '\t' bstr of
        [nameBStr,strandBStr,refnameBStr,leftoffBStr,sequBStr,qualBStr,_,mismatchesBStr]
            -> do str <- parseStrand strandBStr
                  loff <- parseInt64 leftoffBStr
                  mms <- mapM parseMismatch . LBS.split (',') $ mismatchesBStr
                  return $ Align (LBS.copy nameBStr) str (LBS.copy refnameBStr) loff (LBS.copy sequBStr) (LBS.copy qualBStr) mms
        _ -> throwError $ strMsg $ "Malformed Bowtie alignment " ++ (show . LBS.unpack) bstr
    where parseStrand str | str == (LBS.singleton '+') = return Fwd
                          | str == (LBS.singleton '-') = return RevCompl
                          | otherwise = throwError $ "Unknown strand " ++ (show . LBS.unpack $ str)

-- | Query sequence as given in the query file
querySequ :: Align -> SeqData
querySequ ba = stranded (strand ba) (sequ ba)

-- | Query quality as given in the query file
queryQual :: Align -> QualData
queryQual ba = case strand ba of
                 Fwd -> qual ba
                 RevCompl -> LBSW.reverse $ qual ba

-- | Returns the sequence position of the start of the query sequence
-- alignment.  This will include the strand of the alignment and will
-- not be the same as the position computed from 'leftoffset' when the
-- alignment is on the reverse complement strand.
refSeqPos :: Align -> SeqLoc.SeqPos
refSeqPos ba = OnSeq (refname ba) $ CLoc.startPos $ refCLoc ba

-- | Returns the sequence location covered by the query in
-- the alignment.  This will be a sequence location on the reference
-- sequence and may run on the forward or the reverse complement
-- strand.
refCSeqLoc :: Align -> SeqLoc.ContigSeqLoc
refCSeqLoc ba = OnSeq (refname ba) (refCLoc ba)

-- | As 'refCSeqLoc' but without the reference sequence name.
refCLoc :: Align -> CLoc.ContigLoc
refCLoc ba = CLoc.ContigLoc (leftoffset ba) (length ba) (strand ba)

-- | Returns the sequence location covered by the query, as
-- 'refCSeqLoc', as a 'SeqLoc.SeqLoc' location.
refSeqLoc :: Align -> SeqLoc.SeqLoc 
refSeqLoc ba = OnSeq (refname ba) (Loc.Loc [ refCLoc ba ])

-- | Returns true when two alignments were derived from the same
-- sequencing read.  As Bowtie writes alignments of query sequences in
-- their order in the query file, all alignments of a given read are
-- grouped together and the lists of all alignments for each read can
-- be gathered with
-- 
-- > groupBy sameRead
sameRead :: Align -> Align -> Bool
sameRead ba1 ba2 = (name ba1) == (name ba2)

-- | Representation of a single mismatch in a bowtie alignment
data Mismatch = Mismatch { mmoffset :: !Offset -- ^ Offset of the mismatch site from the 5\' end of the query
                         , refbase :: !Char -- ^ Reference nucleotide
                         , readbase :: !Char -- ^ Query nucleotide
                         } deriving (Read, Show, Eq, Ord)

-- | Sequence position of a mismatch on the reference sequence.
mismatchSeqPos :: Align -> Mismatch -> SeqLoc.SeqPos
mismatchSeqPos ba mm = OnSeq (refname ba) mmpos
    where mmpos = fromMaybe badOffset $ (Pos.Pos (mmoffset mm) Fwd) `CLoc.posOutof` (refCLoc ba)
          badOffset = error $ "Bad mismatch offset: " ++ show (ba, mm)

parseMismatch :: LBS.ByteString -> Either String Mismatch
parseMismatch = evalStateT parser
    where parser = liftM3 Mismatch parseOffset parseRefNt parseReadNt
          parseOffset = parseTo ':' >>= lift . parseInt64
          parseRefNt = parseTo '>' >>= parseChar
          parseReadNt = parseToEnd >>= parseChar
          parseTo ch = StateT $ \str ->
                       case LBS.span (/= ch) str of
                         (out, rest) -> do (_, afterCh) <- maybe missingChError Right . LBS.uncons $ rest
                                           return (out, afterCh)
              where missingChError = Left ("Failed to find " ++ show ch)
          parseToEnd = StateT $ \str -> return (str, LBS.empty)
          parseChar str = lift $ case LBS.uncons str of
                                   Just (ch, rest) | LBS.null rest -> return ch
                                   _ -> Left ("Malformed character " ++ (show . LBS.unpack) str) 

parseInt64 :: LBS.ByteString -> Either String Offset
parseInt64 zstr = case LBS.readInteger zstr of
                    Just (z, rest) | LBS.null rest -> return $ fromIntegral z
                    _ -> throwError $ "Malformed integer " ++ show zstr