| This module contains a collection of functions for calculate hash values from nucleotide words. (Actually, it mostly packs nucleotide words from into integers, so it's a one-to-one relationship.) It is intended to be used for indexing large sequence collections. \begin{code} module Bio.Sequence.HashWord where import Bio.Sequence.SeqData import Data.List (sort) import Data.Char (toUpper) import Data.Int import qualified Data.ByteString.Lazy.Char8 as B -- | This is a struct for containing a set of hashing functions data HashF k = HF { hash :: SeqData -> Offset -> Maybe k -- ^ calculates the hash at a given offset in the sequence , hashes :: SeqData -> [(k,Offset)] -- ^ calculate all hashes from a sequence, and their indices , ksort :: [k] -> [k] -- ^ for sorting hashes } -- | Adds a default "hashes" function to a @HashF@, when "hash" is defined. genkeys :: HashF k -> HashF k genkeys kf = kf { hashes = gkeys } where gkeys s = mkkeys (hash kf $ s) [0..B.length s-1] mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps Just k -> (k,p) : mkkeys f ps mkkeys _ [] = [] \end{code} \subsection{Generating hashes} Contigous hashes. \begin{code} -- | Contigous constructs an int\/eger from a contigous k-word. contigous :: Integral k => Int -> HashF k contigous k' = let k = fromIntegral k' c_key s i | k+i > B.length s = Nothing | otherwise = let s' = B.take k $ B.drop i s in if B.find isN s' == Nothing then Just (n2k k' s') else Nothing c_keys s = c_keys' 0 s c_keys' c s | k > B.length s = [] | otherwise = case B.findIndex isN w0 of Nothing -> let cur = n2k (fromIntegral k) s in (cur,c) : c_keys'' cur c (B.drop k s) Just i -> c_keys' (c+i+1) (B.drop (i+1) s) where w0 = B.take k s c_keys'' cur p s | B.null s = [] | isN (B.head s) = c_keys' (p+k+1) (B.tail s) | otherwise = let new = (cur `mod` 4^(k-1))*4+val (B.head s) in (new,p+1) : c_keys'' new (p+1) (B.tail s) in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort } \end{code} \begin{code} -- | Like 'contigous', but returns the same hash for a word and its reverse complement. {-# SPECIALIZE rcontig :: Int -> HashF Int #-} rcontig :: Integral k => Int -> HashF k rcontig k' = let k = fromIntegral k' c_key s i | k+i > B.length s = Nothing | B.find isN s' /= Nothing = Nothing | otherwise = Just $ min (n2k k' s') (n2k k' rs') where s' = B.take k $ B.drop i s rs' = B.reverse $ B.map complement s' c_keys s = c_keys' 0 s c_keys' c s | k > B.length s = [] | otherwise = case B.findIndex isN w0 of Nothing -> let c1 = n2k (fromIntegral k) w0 c2 = n2k (fromIntegral k) w0r in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s) Just i -> c_keys' (c+i+1) (B.drop (i+1) s) where w0 = B.take k s w0r = B.reverse $ B.map complement w0 c_keys'' (c1,c2) p s | B.null s = [] | isN (B.head s) = c_keys' (p+k+1) (B.tail s) | otherwise = let n1 = (c1 `mod` 4^(k-1))*4+val (B.head s) n2 = c2 `div` 4+4^(k-1)*(val . complement) (B.head s) in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s) in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort } \end{code} \section{454-hashes} 454 sequencing results in relatively few substitution errors, but has problems with mulitplicity of mononucleotides. One option is to ignore multiplicity, and simply hash nucleotide changes. Note that genkeys won't work on this one. \begin{code} -- what about Ns? compact :: SeqData -> [SeqData] compact bs | B.null bs = [] | otherwise = let (h,t) = B.break (/=B.head bs) bs in h : compact t -- | Like @rcontig@, but ignoring monomers (i.e. arbitrarily long runs of a single nucelotide -- are treated the same a single nucleotide. rcpacked :: Integral k => Int -> HashF k rcpacked k' = let k = fromIntegral k' c_key s i | k > B.length s' = Nothing | B.any isN s' = Nothing | otherwise = Just $ min (n2k k' s') (n2k k' rs') where s1 = take k' $ compact $ B.drop i s s' = fromStr $ map B.head s1 rs' = B.reverse $ B.map complement s' c_keys = c_keys' 0 . compact -- :: Integral k => SeqData -> [(k,Offset)] -- c_keys' :: Offset -> [SeqData] -> [(k,Offset)] c_keys' i ss | k' > length ss = [] | any isN $ map B.head $ take k' ss = c_keys' (i+(B.length . head) ss) (tail ss) | otherwise = let s' = fromStr $ map B.head $ take k' ss key1 = n2k k' s' key2 = n2k k' (B.reverse $ B.map complement s') in (min key1 key2,i) : c_keys'' (key1,key2) (i+(B.length . head) ss) (tail ss) c_keys'' (c1,c2) i [] = [] c_keys'' (c1,c2) i (s:ss) | isN $ B.head s = c_keys' (i + (sum . map B.length) (s:take k' ss)) ss | otherwise = let s1 = B.head s n1 = (c1 `mod` 4^(k-1))*4+val s1 n2 = c2 `div` 4+4^(k-1)*(val . complement) s1 in (min n1 n2,i) : c_keys'' (n1,n2) (i+B.length s) ss in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort } \end{code} \section{Gapped words} Shape uses a gapped shape specified via a string Note that this will be challenging to get symmetric, since an 'N' character may appear in the mask in only one direction. \begin{code} type Shape = String gapped :: Integral k => Shape -> HashF k gapped = error "gapped" \end{code} \subsection{Key arithmetic} A hash is an integer representing packed k-words. This is currently limited to the nucleotide alphabet, but could easily(?) be extended to arbitrary alphabets. Ideally, it should also handle wildcards (i.e. inserting a position with multiple hashes) \begin{code} isN :: Char -> Bool isN = not . flip elem "ACGT" . toUpper -- conversion between integers (hashes) and strings -- n2k ignores contents beyond k chars n2k :: Integral k => Int -> SeqData -> k n2k k = n2i' 0 . B.take (fromIntegral k) n2i' :: (Num a) => a -> SeqData -> a n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs) -- inefficient, but not used much in critical code anyway k2n :: Integral k => Int -> k -> SeqData k2n k = fromStr . k2n' k k2n' k i = if k==1 then [unval i] else let (q,r) = i `divMod` (4^(k-1)) in unval q : k2n' (k-1) r {- shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str shiftLeft = error "Keys.shiftLeft not implemented" -} val :: (Num t) => Char -> t val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; _ -> error ("val: illegal character" ++ show g)} unval :: (Num a) => a -> Char unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; _ -> error ("unval: illegal value "++show i)} complement :: Char -> Char complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A'; 'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; _ -> error ("Can't complement "++show g)} \end{code}