module Bio.Sequence.HashWord where import Bio.Sequence.SeqData import Data.List (sort) import Data.Char (toUpper) import qualified Data.ByteString.Lazy.Char8 as B -- | This is a struct for containing a set of hashing functions data HashF k = HF { hash :: SeqData -> Offset -> Maybe k -- ^ calculates the hash at a given offset in the sequence , hashes :: SeqData -> [(k,Offset)] -- ^ calculate all hashes from a sequence, and their indices , ksort :: [k] -> [k] -- ^ for sorting hashes } -- | Adds a default "hashes" function to a @HashF@, when "hash" is defined. genkeys :: HashF k -> HashF k genkeys kf = kf { hashes = gkeys } where gkeys s = mkkeys (hash kf $ s) [0..B.length s-1] mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps Just k -> (k,p) : mkkeys f ps mkkeys _ [] = []\end{code} \subsection{Generating hashes} Contigous hashes. \begin{code}

-- | Contigous constructs an int\/eger from a contigous k-word. contigous :: (Integral k, Show k, Eq k) => Int -> HashF k contigous k' = let k = fromIntegral k' klim = let v = 4^(k-1) in if v==0 then error ("HashWord: word size of k="++show k++" too large for this data type.") else v c_key s i | k+i > B.length s = Nothing | otherwise = let s' = B.take k $ B.drop i s in if B.find isN s' == Nothing then Just (n2k k' s') else Nothing c_keys s = c_keys' 0 s c_keys' c s | k > B.length s = [] | otherwise = case B.findIndex isN w0 of Nothing -> let cur = n2k (fromIntegral k) s in (cur,c) : c_keys'' cur c (B.drop k s) Just i -> c_keys' (c+i+1) (B.drop (i+1) s) where w0 = B.take k s c_keys'' cur p s | B.null s = [] | isN (B.head s) = c_keys' (p+k+1) (B.tail s) | otherwise = let new = (cur `mod` klim)*4+val (B.head s) in (new,p+1) : c_keys'' new (p+1) (B.tail s) in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort }\end{code} \begin{code}

-- | Like 'contigous', but returns the same hash for a word and its reverse complement. {-# SPECIALIZE rcontig :: Int -> HashF Int #-} rcontig :: (Integral k, Show k, Eq k) => Int -> HashF k rcontig k' = let k = fromIntegral k' klim = let v = 4^(k-1) in if v==0 then error ("HashWord: word size of k="++show k++" too large for this data type.") else v c_key s i | k+i > B.length s = Nothing | B.find isN s' /= Nothing = Nothing | otherwise = Just $ min (n2k k' s') (n2k k' rs') where s' = B.take k $ B.drop i s rs' = B.reverse $ B.map complement s' c_keys s = c_keys' 0 s c_keys' c s | k > B.length s = [] | otherwise = case B.findIndex isN w0 of Nothing -> let c1 = n2k (fromIntegral k) w0 c2 = n2k (fromIntegral k) w0r in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s) Just i -> c_keys' (c+i+1) (B.drop (i+1) s) where w0 = B.take k s w0r = B.reverse $ B.map complement w0 c_keys'' (c1,c2) p s | B.null s = [] | isN (B.head s) = c_keys' (p+k+1) (B.tail s) | otherwise = let n1 = (c1 `mod` klim)*4+val (B.head s) n2 = c2 `div` 4+klim*(val . complement) (B.head s) in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s) in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort }\end{code} \section{454-hashes} 454 sequencing results in relatively few substitution errors, but has problems with mulitplicity of mononucleotides. One option is to ignore multiplicity, and simply hash nucleotide changes. Note that genkeys won't work on this one. \begin{code}

-- what about Ns? compact :: SeqData -> [SeqData] compact bs | B.null bs = [] | otherwise = let (h,t) = B.break (/=B.head bs) bs in h : compact t -- | Like @rcontig@, but ignoring monomers (i.e. arbitrarily long runs of a single nucelotide -- are treated the same a single nucleotide. rcpacked :: (Integral k, Show k, Eq k) => Int -> HashF k rcpacked k' = let k = fromIntegral k' klim = let v = 4^(k-1) in if v==0 then error ("HashWord: word size of k="++show k++" too large for this data type.") else v c_key s i | k > B.length s' = Nothing | B.any isN s' = Nothing | otherwise = Just $ min (n2k k' s') (n2k k' rs') where s1 = take k' $ compact $ B.drop i s s' = fromStr $ map B.head s1 rs' = B.reverse $ B.map complement s' c_keys = c_keys' 0 . compact -- :: Integral k => SeqData -> [(k,Offset)] -- c_keys' :: Offset -> [SeqData] -> [(k,Offset)] c_keys' i ss | k' > length ss = [] | any isN $ map B.head $ take k' ss = c_keys' (i+(B.length . head) ss) (tail ss) | otherwise = let s' = fromStr $ map B.head $ take k' ss key1 = n2k k' s' key2 = n2k k' (B.reverse $ B.map complement s') in (min key1 key2,i) : c_keys'' (key1,key2) (i+(B.length . head) ss) (tail ss) c_keys'' (_c1,_c2) _i [] = [] c_keys'' (c1,c2) i (s:ss) | isN $ B.head s = c_keys' (i + (sum . map B.length) (s:take k' ss)) ss | otherwise = let s1 = B.head s n1 = (c1 `mod` klim)*4+val s1 n2 = c2 `div` 4+klim*(val . complement) s1 in (min n1 n2,i) : c_keys'' (n1,n2) (i+B.length s) ss in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort }\end{code} \section{Gapped words} Shape uses a gapped shape specified via a string Note that this will be challenging to get symmetric, since an 'N' character may appear in the mask in only one direction. \begin{code}

type Shape = String gapped :: Integral k => Shape -> HashF k gapped = error "gapped"\end{code} \subsection{Key arithmetic} A hash is an integer representing packed k-words. This is currently limited to the nucleotide alphabet, but could easily(?) be extended to arbitrary alphabets. Ideally, it should also handle wildcards (i.e. inserting a position with multiple hashes) \begin{code}

isN :: Char -> Bool isN = not . flip elem "ACGT" . toUpper -- conversion between integers (hashes) and strings -- n2k ignores contents beyond k chars n2k :: (Integral k, Show k, Eq k) => Int -> SeqData -> k n2k k = n2i' 0 . B.take (fromIntegral k) n2i' :: (Num a, Show a, Eq a) => a -> SeqData -> a n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs) -- inefficient, but not used much in critical code anyway k2n :: (Integral k, Show k, Eq k) => Int -> k -> SeqData k2n k = fromStr . k2n' k k2n' :: (Integral a, Integral a1, Show a, Show a1) => a -> a1 -> String k2n' k i = if k==1 then [unval i] else let (q,r) = i `divMod` klim in unval q : k2n' (k-1) r where klim = let v = 4^(k-1) in if v==0 then error ("HashWord: word size of k="++show k++" too large for this data type.") else v {- shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str shiftLeft = error "Keys.shiftLeft not implemented" -} val :: (Num t, Show t, Eq t) => Char -> t val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; _ -> error ("val: illegal character" ++ show g)} unval :: (Num a, Show a, Eq a) => a -> Char unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; _ -> error ("unval: illegal value "++show i)} complement :: Char -> Char complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A'; 'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; _ -> error ("Can't complement "++show g)}\end{code}