-- |
--
-- == Implementation
--
-- See "Data.Suffix" for implementation details.
module Data.Suffix.ByteString
  (
    -- * Suffix array
    buildSuffixArray
  , Suf.SuffixArray(..)
  , search

    -- * Longest common prefix array
  , buildLCPArray
  , Suf.LCPArray(..)

    -- * LLCP and RLCP arrays
  , Suf.buildLRLCPArray
  , Suf.LRLCPArrays(..)
  , searchLRLCP

    -- * Suffix tree
  , Suf.foldSuffixTree

    -- * Intn
  , Intn

    -- * Examples
    -- $examples
  ) where

import qualified Data.ByteString as B
import Data.Int
import Data.Suffix (Intn, SuffixArray, LCPArray, LRLCPArrays)
import qualified Data.Suffix as Suf

-- | \(O(n)\). Build a suffix array from a @ByteString@.
--
-- On 64-bit systems, a @SuffixArray Int32@ requires half the memory as a
-- @SuffixArray Int@.
buildSuffixArray
  :: Intn i
  => B.ByteString   -- ^ Input @ByteString@ of length \(n\).
  -> SuffixArray i  -- ^ Output type @i@ can be @Int@ or @Int32@. If @i@ is
                    --   @Int32@, \(n\) must be @<= (maxBound :: Int32)@.
buildSuffixArray :: forall i. Intn i => ByteString -> SuffixArray i
buildSuffixArray ByteString
b =
  Int -> Pull Int -> SuffixArray i
forall i. Intn i => Int -> Pull Int -> SuffixArray i
Suf.buildSuffixArray Int
256 (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> Pull Word8 -> Pull Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> Pull Word8
Suf.pullFromByteString ByteString
b)
{-# SPECIALIZE buildSuffixArray :: B.ByteString -> SuffixArray Int #-}
{-# SPECIALIZE buildSuffixArray :: B.ByteString -> SuffixArray Int32 #-}

-- | \(O(m \log n)\). Search for a pattern in a @ByteString@ using its suffix
-- array.
--
-- Note: For typical inputs, the worst case is unlikely and the running time is
-- close to \(O(m + \log n)\). To get guaranteed \(O(m + \log n)\) running time
-- consider using 'searchLRLCP' instead.
search
  :: Intn i
  => B.ByteString   -- ^ @ByteString@ of length \(n\)
  -> SuffixArray i  -- ^ Suffix array for the above @ByteString@
  -> B.ByteString   -- ^ Pattern @ByteString@ of length \(m\)
  -> (Int, Int)     -- ^ An @(offset, length)@ pair, denoting a slice of the
                    --   suffix array. Beginning at @offset@, @length@ suffixes
                    --   start with the pattern. @length@ is 0 if the pattern
                    --   does not occur in the sequence.
search :: forall i.
Intn i =>
ByteString -> SuffixArray i -> ByteString -> (Int, Int)
search ByteString
b SuffixArray i
sa ByteString
b2 =
  Pull Word8 -> SuffixArray i -> Pull Word8 -> (Int, Int)
forall a i.
(Ord a, Intn i) =>
Pull a -> SuffixArray i -> Pull a -> (Int, Int)
Suf.search (ByteString -> Pull Word8
Suf.pullFromByteString ByteString
b) SuffixArray i
sa (ByteString -> Pull Word8
Suf.pullFromByteString ByteString
b2)
{-# SPECIALIZE search :: B.ByteString -> SuffixArray Int -> B.ByteString -> (Int, Int) #-}
{-# SPECIALIZE search :: B.ByteString -> SuffixArray Int32 -> B.ByteString -> (Int, Int) #-}

-- | \(O(n)\). Build a longest common prefix array from a @ByteString@ and its
-- suffix array.
--
-- The LCP array has the same length as the sequence, \(n\). The \(0\)-th
-- element of the LCP array is \(0\). The \(i\)-th element of the LCP array for
-- \(0 < i < n\) is the longest common prefix of the \(i\)-th and \((i-1)\)-th
-- suffix in the suffix array.
buildLCPArray :: Intn i => B.ByteString -> SuffixArray i -> LCPArray i
buildLCPArray :: forall i. Intn i => ByteString -> SuffixArray i -> LCPArray i
buildLCPArray ByteString
b SuffixArray i
sa = Pull Word8 -> SuffixArray i -> LCPArray i
forall a i. (Eq a, Intn i) => Pull a -> SuffixArray i -> LCPArray i
Suf.buildLCPArray (ByteString -> Pull Word8
Suf.pullFromByteString ByteString
b) SuffixArray i
sa
{-# SPECIALIZE buildLCPArray :: B.ByteString -> SuffixArray Int -> LCPArray Int #-}
{-# SPECIALIZE buildLCPArray :: B.ByteString -> SuffixArray Int32 -> LCPArray Int32 #-}

-- | \(O(m + \log n)\). Search for a pattern in a sequence using its suffix
-- array and LLCP and RLCP arrays.
searchLRLCP
  :: Intn i
  => B.ByteString   -- ^ @ByteString@ of length \(n\)
  -> SuffixArray i  -- ^ Suffix array for the above @ByteString@
  -> LRLCPArrays i  -- ^ LLCP and RLCP arrays for the above @ByteString@
  -> B.ByteString   -- ^ Pattern @ByteString@ of length \(m\)
  -> (Int, Int)     -- ^ An @(offset, length)@ pair, denoting a slice of the
                    --   suffix array. Beginning at @offset@, @length@ suffixes
                    --   start with the pattern. @length@ is 0 if the pattern
                    --   does not occur in the sequence.
searchLRLCP :: forall i.
Intn i =>
ByteString
-> SuffixArray i -> LRLCPArrays i -> ByteString -> (Int, Int)
searchLRLCP ByteString
b SuffixArray i
sa LRLCPArrays i
lrlcp ByteString
b2 =
  Pull Word8
-> SuffixArray i -> LRLCPArrays i -> Pull Word8 -> (Int, Int)
forall a i.
(Ord a, Intn i) =>
Pull a -> SuffixArray i -> LRLCPArrays i -> Pull a -> (Int, Int)
Suf.searchLRLCP
    (ByteString -> Pull Word8
Suf.pullFromByteString ByteString
b)
    SuffixArray i
sa
    LRLCPArrays i
lrlcp
    (ByteString -> Pull Word8
Suf.pullFromByteString ByteString
b2)
{-# SPECIALIZE searchLRLCP :: B.ByteString -> SuffixArray Int -> LRLCPArrays Int -> B.ByteString -> (Int, Int) #-}
{-# SPECIALIZE searchLRLCP :: B.ByteString -> SuffixArray Int32 -> LRLCPArrays Int32 -> B.ByteString -> (Int, Int) #-}

-- $examples
--
-- === Build a suffix array and LCP array
--
-- @
-- import Data.ByteString (ByteString)
-- import qualified Data.ByteString.Char8 as BC
--
-- import Data.Suffix.ByteString
--
-- banana :: ByteString
-- banana = BC.pack \"BANANA\"
-- @
--
-- >>> let sa = buildSuffixArray banana :: SuffixArray Int
-- >>> sa
-- SuffixArray [5,3,1,0,4,2]
-- >>> let lcpa = buildLCPArray banana sa
-- >>> lcpa
-- LCPArray [0,1,3,0,0,2]
--
-- === Tabulate a suffix array
--
-- @
-- import Data.ByteString (ByteString)
-- import qualified Data.ByteString.Char8 as BC
-- import Data.Primitive.PrimArray (indexPrimArray)
-- import Text.Printf (printf)
--
-- import Data.Suffix.ByteString
--
-- tabulatePrint :: ByteString -> IO ()
-- tabulatePrint bs = putStrLn $ unlines $ header : map row [0 .. n-1]
--   where
--     header = "SA  LCP   Suffix"
--     n = BC.length bs
--     sa@('SuffixArray' sa_) = 'buildSuffixArray' bs :: SuffixArray Int
--     'LCPArray' lcpa_ = 'buildLCPArray' bs sa
--     row i = printf "%2d   %2d   %s" sufIdx lcp suffix
--       where
--         suffixId = indexPrimArray sa_ i
--         lcp = indexPrimArray lcpa_ i
--         suffix = BC.unpack (BC.drop suffixId bs)
-- @
--
-- >>> tabulatePrint (BC.pack "mississippi")
-- SA  LCP   Suffix
-- 10    0   i
--  7    1   ippi
--  4    1   issippi
--  1    4   ississippi
--  0    0   mississippi
--  9    0   pi
--  8    1   ppi
--  6    0   sippi
--  3    2   sissippi
--  5    1   ssippi
--  2    3   ssissippi
--
-- === Search
--
-- @
-- import Data.ByteString (ByteString)
-- import qualified Data.ByteString.Char8 as BC
-- import Data.Foldable (for_)
-- import Data.Primitive.PrimArray (indexPrimArray)
--
-- import Data.Suffix.ByteString
--
-- searchAndPrint :: ByteString -> 'SuffixArray' Int -> ByteString -> IO ()
-- searchAndPrint bs sa@('SuffixArray' sa_) pat = case 'search' bs sa pat of
--   (_, 0) -> putStrLn "not found"
--   (off, len) -> for_ [off .. off+len-1] $ \\i -> do
--     let suffixId = indexPrimArray sa_ i
--     putStrLn $ BC.unpack bs
--     putStrLn $ replicate suffixId ' ' ++ replicate (BC.length pat) \'^\'
-- @
--
-- >>> let str = BC.pack "shikanokonokonokokoshitantan"
-- >>> let sa = buildSuffixArray str
-- >>> search str sa (BC.pack "nokonoko")
-- (14,2)
-- >>> searchAndPrint str sa (BC.pack "nokonoko")
-- shikanokonokonokokoshitantan
--          ^^^^^^^^
-- shikanokonokonokokoshitantan
--      ^^^^^^^^
-- >>> search str sa (BC.pack "shika senbei")
-- (24,0)
-- >>> searchAndPrint str sa (BC.pack "shika senbei")
-- not found
--
-- === Visualize a suffix tree
--
-- @
-- import Control.Monad.Trans.Class
-- import Control.Monad.Trans.Writer.Lazy
-- import Control.Monad.Trans.State.Strict
-- import Data.ByteString (ByteString)
-- import qualified Data.ByteString.Char8 as BC
-- import Data.Maybe (fromJust)
--
-- import Data.Suffix.ByteString
--
-- -- Output a tree in Graphviz DOT format
-- suffixTreeDot :: ByteString -> String
-- suffixTreeDot bs =
--   execWriter $ flip evalStateT (0 :: Int) $ do
--     writeln "digraph suffixtree {"
--     _ <- 'Suf.foldSuffixTree' l1 b1 b2 b3 sa lcpa
--     writeln "}"
--   where
--     n = BC.length bs
--     sa = 'buildSuffixArray' bs :: 'SuffixArray' Int
--     lcpa = 'buildLCPArray' bs sa
--
--     fresh = state $ \\i -> (i, i+1)
--     writeln = lift . tell . (++ "\\n")
--
--     l1 suffixId = do
--       nodeId <- fresh
--       writeln $ concat [show nodeId, " [label=\\"", show suffixId, "\\", shape=square];"]
--       let depth = n - suffixId
--       pure (depth, suffixId, nodeId)
--     b1 _ = do
--       nodeId <- fresh
--       writeln $ concat [show nodeId, " [label=\\"\\", shape=circle];"]
--       pure (Nothing, nodeId)
--     b2 depth (_, nodeId) (chDepth, chSuffixId, chNodeId) = do
--       let label = BC.unpack $ BC.drop depth $ BC.take chDepth $ BC.drop chSuffixId bs
--           label' = if null label then "ϵ" else ' ':label
--       writeln $ concat [show nodeId, " -> ", show chNodeId, " [label=\\"", label', "\\"];"]
--       pure (Just chSuffixId, nodeId)
--     b3 depth (mSuffixId, nodeId) =
--       pure (depth, fromJust mSuffixId, nodeId)
-- @
--
-- >>> let bs = BC.pack "mississippi"
-- >>> writeFile "suffixtree.dot" (suffixTreeDot bs)
-- >>> :! dot -Tsvg -o suffixtree.svg suffixtree.dot
--
-- Generates the image:
--
-- ![suffix tree](images/suffixtree.svg)
--