{-# LANGUAGE DeriveDataTypeable #-} module Hoogle.DataBase.SubstrSearch (SubstrSearch, createSubstrSearch ,searchSubstrSearch ,completionsSubstrSearch ) where import Hoogle.Store.All import qualified Data.Set as Set import qualified Data.ByteString as BS import qualified Data.ByteString.Unsafe as BS import qualified Data.ByteString.Char8 as BSC import General.Base import Data.Array import Hoogle.Type.All import Hoogle.Score.All {- Format 2: -- build a Huffman table huffman :: Eq a => [a] -> Huffman a -- encode a value using the table -- return the first 32 bits of the encoding, and a mask (will be all 1's if more than 32 bits) encode :: Huffman a -> [a] -> (Word32, Word32) -- We have 4 buckets, one per priority level - Prelude first, then base, then platform, then anything data Substr a = Substr [Bucket a] -- Each bucket contains the encoding of each entry (a pointer to it) along -- with the Word32 prefix of each string -- the 31'st bit is 1 if the string comes from the start of a string -- and the 32'nd bit is 1 if the string contains upper case letters -- within each entry, the tree is used to find shifts -- items are sorted by prefixes data Bucket a = Bucket {answers :: [a], prefixes :: [Word32], tree :: Tree} -- at each tree point the range is the start/end index where you may find things with that prefix -- if the Maybe is Just then all the points in that range are shifted by one bit data Tree = Tree {range :: (Int, Int), rest :: Maybe (Tree, Tree)} -} -- idea for speed improvement -- store as one long bytestring with \0 between the words, then do findSubstrings to find the indexes -- store the lengths in a separate bytestring then use index to step through them, retrieving the data as Word8 via foldl -- store the links in another bytestring with the lengths, but only unpack them when they are needed -- can even make length==0 code for it's the same string as before, to compress it and reduce searching -- was previously ~ 0.047 seconds {- Description: Data is stored flattened. For default we expect ~200Kb of disk usage. -} -- keys are sorted after being made lower case data SubstrSearch a = SubstrSearch {text :: BString -- all the bytestrings, in preference order ,lens :: BString -- a list of lengths ,inds :: Array Int a -- the results } deriving Typeable -- | Create a substring search index. Values are returned in order where possible. createSubstrSearch :: [(String,a)] -> SubstrSearch a createSubstrSearch xs = SubstrSearch (fromString $ concat ts2) (BS.pack $ map fromIntegral ls2) (listArray (0,length is-1) is) where (ts,is) = unzip $ map (first $ map toLower) xs (ts2,ls2) = f "" ts f x (y:ys) = first (y:) $ second (length y:) $ f y ys f x [] = ([],[]) data S a = S {sCount :: !Int -- which one are we on ,sFocus :: !BS.ByteString -- where we are in the string ,sPrefix :: ![(a,EntryView,Score)] -- the prefixes ,sInfix :: ![(a,EntryView,Score)] -- the infixes } searchSubstrSearch :: SubstrSearch a -> String -> [(a, EntryView, Score)] searchSubstrSearch x y = reverse (sPrefix sN) ++ reverse (sInfix sN) where view = FocusOn y match = bsMatch (BSC.pack $ map toLower y) sN = BS.foldl f s0 $ lens x s0 = S 0 (text x) [] [] f s ii = addCount $ moveFocus i $ maybe id addMatch t s where t = match i $ BS.unsafeTake i $ sFocus s i = fromIntegral ii addCount s = s{sCount=sCount s+1} moveFocus i s = s{sFocus=BS.unsafeDrop i $ sFocus s} addMatch MatchSubstr s = s{sInfix =(inds x ! sCount s,view,textScore MatchSubstr):sInfix s} addMatch t s = s{sPrefix=(inds x ! sCount s,view,textScore t):sPrefix s} data S2 = S2 {_s2Focus :: !BS.ByteString -- where we are in the string ,s2Result :: Set.Set BS.ByteString } completionsSubstrSearch :: SubstrSearch a -> String -> [String] completionsSubstrSearch x y = map (\x -> y ++ drop ny (BSC.unpack x)) $ take 10 $ Set.toAscList $ s2Result $ BS.foldl f (S2 (text x) Set.empty) $ lens x where ny = length y ly = fromString $ map toLower y f (S2 foc res) ii = S2 (BS.unsafeDrop i foc) (if ly `BS.isPrefixOf` x then Set.insert x res else res) where x = BS.unsafeTake i foc i = fromIntegral ii instance Show a => Show (SubstrSearch a) where show x = "SubstrSearch" instance (Typeable a, Store a) => Store (SubstrSearch a) where put (SubstrSearch a b c) = putDefer $ put3 a b c get = getDefer $ get3 SubstrSearch -- if first word is empty, always return Exact/Prefix -- if first word is a single letter, do elemIndex -- if first word is multiple, do isPrefixOf's but only up until n from the end -- partially apply on the first word bsMatch :: BS.ByteString -> Int -> BS.ByteString -> Maybe TextMatch bsMatch x | nx == 0 = \ny _ -> Just $ if ny == 0 then MatchExact else MatchPrefix | nx == 1 = let c = BS.head x in \ny y -> case BS.elemIndex c y of Nothing -> Nothing Just 0 -> Just $ if ny == 1 then MatchExact else MatchPrefix Just _ -> Just MatchSubstr | otherwise = \ny y -> if BS.isPrefixOf x y then Just (if nx == nx then MatchExact else MatchPrefix) else if BS.isInfixOf x y then Just MatchSubstr else Nothing where nx = BS.length x