|
This module contains a collection of functions for calculate hash values from nucleotide words.
(Actually, it mostly packs nucleotide words from into integers, so it's a one-to-one relationship.)
It is intended to be used for indexing large sequence collections.
\begin{code}
module Bio.Sequence.HashWord where
import Bio.Sequence.SeqData
import Data.List (sort)
import Data.Char (toUpper)
import Data.Int
import qualified Data.ByteString.Lazy.Char8 as B
data HashF k = HF { hash :: SeqData -> Offset -> Maybe k
, hashes :: SeqData -> [(k,Offset)]
, ksort :: [k] -> [k]
}
genkeys :: HashF k -> HashF k
genkeys kf = kf { hashes = gkeys }
where gkeys s = mkkeys (hash kf $ s) [0..B.length s1]
mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps
Just k -> (k,p) : mkkeys f ps
mkkeys _ [] = []
\end{code}
\subsection{Generating hashes}
Contigous hashes.
\begin{code}
contigous :: Integral k => Int -> HashF k
contigous k' = let
k = fromIntegral k'
c_key s i | k+i > B.length s = Nothing
| otherwise = let s' = B.take k $ B.drop i s
in if B.find isN s' == Nothing then Just (n2k k' s')
else Nothing
c_keys s = c_keys' 0 s
c_keys' c s | k > B.length s = []
| otherwise = case B.findIndex isN w0 of
Nothing -> let cur = n2k (fromIntegral k) s
in (cur,c) : c_keys'' cur c (B.drop k s)
Just i -> c_keys' (c+i+1) (B.drop (i+1) s)
where w0 = B.take k s
c_keys'' cur p s | B.null s = []
| isN (B.head s) = c_keys' (p+k+1) (B.tail s)
| otherwise = let new = (cur `mod` 4^(k1))*4+val (B.head s)
in (new,p+1) : c_keys'' new (p+1) (B.tail s)
in HF { hash = c_key
, hashes = c_keys
, ksort = Data.List.sort
}
\end{code}
\begin{code}
rcontig :: Integral k => Int -> HashF k
rcontig k' = let
k = fromIntegral k'
c_key s i | k+i > B.length s = Nothing
| B.find isN s' /= Nothing = Nothing
| otherwise = Just $ min (n2k k' s') (n2k k' rs')
where s' = B.take k $ B.drop i s
rs' = B.reverse $ B.map complement s'
c_keys s = c_keys' 0 s
c_keys' c s | k > B.length s = []
| otherwise = case B.findIndex isN w0 of
Nothing -> let c1 = n2k (fromIntegral k) w0
c2 = n2k (fromIntegral k) w0r
in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s)
Just i -> c_keys' (c+i+1) (B.drop (i+1) s)
where w0 = B.take k s
w0r = B.reverse $ B.map complement w0
c_keys'' (c1,c2) p s | B.null s = []
| isN (B.head s) = c_keys' (p+k+1) (B.tail s)
| otherwise = let n1 = (c1 `mod` 4^(k1))*4+val (B.head s)
n2 = c2 `div` 4+4^(k1)*(val . complement) (B.head s)
in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s)
in HF { hash = c_key
, hashes = c_keys
, ksort = Data.List.sort
}
\end{code}
\section{454-hashes}
454 sequencing results in relatively few substitution errors, but has problems
with mulitplicity of mononucleotides. One option is to ignore multiplicity, and
simply hash nucleotide changes.
Note that genkeys won't work on this one.
\begin{code}
compact :: SeqData -> [SeqData]
compact bs | B.null bs = []
| otherwise = let (h,t) = B.break (/=B.head bs) bs in h : compact t
rcpacked :: Integral k => Int -> HashF k
rcpacked k' = let
k = fromIntegral k'
c_key s i | k > B.length s' = Nothing
| B.any isN s' = Nothing
| otherwise = Just $ min (n2k k' s') (n2k k' rs')
where s1 = take k' $ compact $ B.drop i s
s' = fromStr $ map B.head s1
rs' = B.reverse $ B.map complement s'
c_keys = c_keys' 0 . compact
c_keys' i ss | k' > length ss = []
| any isN $ map B.head $ take k' ss = c_keys' (i+(B.length . head) ss) (tail ss)
| otherwise = let s' = fromStr $ map B.head $ take k' ss
key1 = n2k k' s'
key2 = n2k k' (B.reverse $ B.map complement s')
in (min key1 key2,i) : c_keys'' (key1,key2) (i+(B.length . head) ss) (tail ss)
c_keys'' (c1,c2) i [] = []
c_keys'' (c1,c2) i (s:ss)
| isN $ B.head s = c_keys' (i + (sum . map B.length) (s:take k' ss)) ss
| otherwise = let s1 = B.head s
n1 = (c1 `mod` 4^(k1))*4+val s1
n2 = c2 `div` 4+4^(k1)*(val . complement) s1
in (min n1 n2,i) : c_keys'' (n1,n2) (i+B.length s) ss
in HF { hash = c_key
, hashes = c_keys
, ksort = Data.List.sort
}
\end{code}
\section{Gapped words}
Shape uses a gapped shape specified via a string
Note that this will be challenging to get symmetric, since
an 'N' character may appear in the mask in only one direction.
\begin{code}
type Shape = String
gapped :: Integral k => Shape -> HashF k
gapped = error "gapped"
\end{code}
\subsection{Key arithmetic}
A hash is an integer representing packed k-words.
This is currently limited to the nucleotide alphabet, but could easily(?)
be extended to arbitrary alphabets. Ideally, it should also handle
wildcards (i.e. inserting a position with multiple hashes)
\begin{code}
isN :: Char -> Bool
isN = not . flip elem "ACGT" . toUpper
n2k :: Integral k => Int -> SeqData -> k
n2k k = n2i' 0 . B.take (fromIntegral k)
n2i' :: (Num a) => a -> SeqData -> a
n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs)
k2n :: Integral k => Int -> k -> SeqData
k2n k = fromStr . k2n' k
k2n' k i = if k==1 then [unval i]
else let (q,r) = i `divMod` (4^(k1)) in unval q : k2n' (k1) r
val :: (Num t) => Char -> t
val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3;
_ -> error ("val: illegal character" ++ show g)}
unval :: (Num a) => a -> Char
unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T';
_ -> error ("unval: illegal value "++show i)}
complement :: Char -> Char
complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A';
'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c';
_ -> error ("Can't complement "++show g)}
\end{code}