-- | The suffix array data structure. Supports (de-) serialization via -- aeson,cereal,binary. -- -- Reading and writing to and from specialized "bio" formats is currently -- open. -- -- TODO compression during serialization? -- TODO versioning? -- TODO read sam/bam format? -- TODO what about mmap for really large indices? module Data.SuffixStructure.ESA where import Data.Aeson import Data.Binary import Data.Default.Class import Data.Int (Int8) import Data.IntMap.Strict (IntMap) import Data.Serialize import Data.Vector.Binary import Data.Vector.Cereal import Data.Vector.Unboxed (Vector) import GHC.Generics (Generic) import qualified Data.IntMap.Strict as IM import qualified Data.Vector.Unboxed as VU -- | The Suffix Array data type, together with the longest common prefix -- table. -- -- TODO skip table? -- TODO inverse suffix array? -- -- TODO maybe parametrize on the Int type (Int,Int64,Int32,Word's) This -- will require better specialization of operations in @NaiveArray@ and -- elsewhere. Otherwise performance drops quite noticable by @x5@ to @x10@. data SA = SA { sa :: !(Vector Int) -- ^ the actual suffix array using 8byte Ints , lcp :: !(Vector Int8) -- ^ 1byte longest common prefix vector, negative number indicates to look at lcpLong , lcpLong :: !(IntMap Int) -- ^ lcp's that are unusual long, but this is sparse } deriving (Eq,Ord,Show,Generic) instance Binary SA instance FromJSON SA instance Serialize SA instance ToJSON SA instance Default SA where def = SA VU.empty VU.empty IM.empty -- | Automatically check 'lcp' and 'lcpLong' to return the real prefix -- length in 'Int' (as opposed to 'Int8' storage of 'lcp'). lcpAt :: SA -> Int -> Int lcpAt SA{..} k | p >= 0 = fromIntegral p | Just p' <- IM.lookup k lcpLong = p' -- by construction! where p = VU.unsafeIndex lcp k {-# INLINE lcpAt #-}