Bio/Sequence/HashWord.lhs

| This module contains a collection of functions for calculate hash values from nucleotide words. (Actually, it mostly packs nucleotide words from into integers, so it's a one-to-one relationship.) It is intended to be used for indexing large sequence collections. \begin{code}

module Bio.Sequence.HashWord where

import Bio.Sequence.SeqData
import Data.List (sort)
import Data.Char (toUpper)
import Data.Int

import qualified Data.ByteString.Lazy.Char8 as B

-- | This is a struct for containing a set of hashing functions
data HashF k = HF { hash   :: SeqData -> Offset -> Maybe k -- ^ calculates the hash at a given offset in the sequence
                 , hashes  :: SeqData -> [(k,Offset)]      -- ^ calculate all hashes from a sequence, and their indices
                 , ksort :: [k] -> [k]                     -- ^ for sorting hashes
                 }

-- | Adds a default "hashes" function to a @HashF@, when "hash" is defined.
genkeys :: HashF k -> HashF k
genkeys kf = kf { hashes = gkeys }
    where gkeys s = mkkeys (hash kf $ s)  [0..B.length s-1]
          mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps
                                        Just k -> (k,p) : mkkeys f ps
          mkkeys _ [] = []

\end{code} \subsection{Generating hashes} Contigous hashes. \begin{code}


-- | Contigous constructs an int\/eger from a contigous k-word.
contigous :: Integral k => Int -> HashF k
contigous k' = let 
   k = fromIntegral k'

   c_key s i | k+i > B.length s = Nothing 
             | otherwise = let s' = B.take k $ B.drop i s 
                           in if B.find isN s' == Nothing then Just (n2k k' s')
                              else Nothing
   c_keys s = c_keys' 0 s
   c_keys' c s | k > B.length s = []
               | otherwise = case B.findIndex isN w0 of 
                               Nothing -> let cur = n2k (fromIntegral k) s
                                          in (cur,c) : c_keys'' cur c (B.drop k s)
                               Just i  -> c_keys' (c+i+1) (B.drop (i+1) s)
            where w0 = B.take k s

   c_keys'' cur p s | B.null s = []
                    | isN (B.head s) = c_keys' (p+k+1) (B.tail s)
                    | otherwise = let new = (cur `mod` 4^(k-1))*4+val (B.head s)
                                  in (new,p+1) : c_keys'' new (p+1) (B.tail s)

              in HF { hash = c_key
                    , hashes  = c_keys
                    , ksort = Data.List.sort
                    }

\end{code} \begin{code}


-- | Like 'contigous', but returns the same hash for a word and its reverse complement.
{-# SPECIALIZE rcontig :: Int -> HashF Int #-}
rcontig :: Integral k => Int -> HashF k
rcontig k' = let
   k = fromIntegral k'

   c_key s i | k+i > B.length s = Nothing 
             | B.find isN s' /= Nothing = Nothing
             | otherwise = Just $ min (n2k k' s') (n2k k' rs')
             where s' = B.take k $ B.drop i s
                   rs' = B.reverse $ B.map complement s'

   c_keys s = c_keys' 0 s
   c_keys' c s | k > B.length s = []
               | otherwise = case B.findIndex isN w0 of 
                               Nothing -> let c1 = n2k (fromIntegral k) w0
                                              c2 = n2k (fromIntegral k) w0r
                                          in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s)
                               Just i  -> c_keys' (c+i+1) (B.drop (i+1) s)
            where w0  = B.take k s
                  w0r = B.reverse $ B.map complement w0

   c_keys'' (c1,c2) p s | B.null s = []
                        | isN (B.head s) = c_keys' (p+k+1) (B.tail s)
                        | otherwise = let n1 = (c1 `mod` 4^(k-1))*4+val (B.head s)
                                          n2 = c2 `div` 4+4^(k-1)*(val . complement) (B.head s)
                                      in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s)

              in HF { hash = c_key
                    , hashes  = c_keys
                    , ksort = Data.List.sort
                    }

\end{code} \section{454-hashes} 454 sequencing results in relatively few substitution errors, but has problems with mulitplicity of mononucleotides. One option is to ignore multiplicity, and simply hash nucleotide changes. Note that genkeys won't work on this one. \begin{code}


-- what about Ns?
compact :: SeqData -> [SeqData]
compact bs | B.null bs = []
           | otherwise = let (h,t) = B.break (/=B.head bs) bs in h : compact t

-- | Like @rcontig@, but ignoring monomers (i.e. arbitrarily long runs of a single nucelotide
--   are treated the same a single nucleotide.
rcpacked :: Integral k => Int -> HashF k
rcpacked k' = let
    k = fromIntegral k'

    c_key s i | k > B.length s' = Nothing
              | B.any isN s'    = Nothing
              | otherwise = Just $ min (n2k k' s') (n2k k' rs')
              where s1 = take k' $ compact $ B.drop i s
                    s' = fromStr $ map B.head s1
                    rs' = B.reverse $ B.map complement s'

    c_keys = c_keys' 0 . compact -- :: Integral k => SeqData -> [(k,Offset)]
--    c_keys' :: Offset -> [SeqData] -> [(k,Offset)]
    c_keys' i ss | k' > length ss = []
                 | any isN $ map B.head $ take k' ss = c_keys' (i+(B.length . head) ss) (tail ss)
                 | otherwise = let s' = fromStr $ map B.head $ take k' ss
                                   key1 = n2k k' s'
                                   key2 = n2k k' (B.reverse $ B.map complement s')
                               in  (min key1 key2,i) : c_keys'' (key1,key2) (i+(B.length . head) ss) (tail ss)

    c_keys'' (c1,c2) i [] = []
    c_keys'' (c1,c2) i (s:ss) 
        | isN $ B.head s = c_keys' (i + (sum . map B.length) (s:take k' ss)) ss
        | otherwise = let s1 = B.head s
                          n1 = (c1 `mod` 4^(k-1))*4+val s1
                          n2 = c2 `div` 4+4^(k-1)*(val . complement) s1
                      in (min n1 n2,i) : c_keys'' (n1,n2) (i+B.length s) ss

             in HF { hash = c_key 
                   , hashes = c_keys
                   , ksort = Data.List.sort
                   }

\end{code} \section{Gapped words} Shape uses a gapped shape specified via a string Note that this will be challenging to get symmetric, since an 'N' character may appear in the mask in only one direction. \begin{code}


type Shape = String
gapped :: Integral k => Shape -> HashF k
gapped = error "gapped"

\end{code} \subsection{Key arithmetic} A hash is an integer representing packed k-words. This is currently limited to the nucleotide alphabet, but could easily(?) be extended to arbitrary alphabets. Ideally, it should also handle wildcards (i.e. inserting a position with multiple hashes) \begin{code}


isN :: Char -> Bool
isN = not . flip elem "ACGT" . toUpper

-- conversion between integers (hashes) and strings
-- n2k ignores contents beyond k chars
n2k :: Integral k => Int -> SeqData -> k
n2k k = n2i' 0 . B.take (fromIntegral k)

n2i' :: (Num a) => a -> SeqData -> a
n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs)

-- inefficient, but not used much in critical code anyway
k2n :: Integral k => Int -> k -> SeqData
k2n k = fromStr . k2n' k

k2n' k i = if k==1 then [unval i]
           else let (q,r) = i `divMod` (4^(k-1)) in unval q : k2n' (k-1) r

{-
shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k
shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str
shiftLeft = error "Keys.shiftLeft not implemented"
-}

val :: (Num t) => Char -> t
val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; 
                           _ -> error ("val: illegal character" ++ show g)}

unval :: (Num a) => a -> Char
unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; 
                          _ -> error ("unval: illegal value "++show i)}

complement :: Char -> Char
complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A';
                          'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; 
                          _ -> error ("Can't complement "++show g)}

\end{code}