{-# LANGUAGE MultiWayIf       #-}
{-# LANGUAGE ViewPatterns     #-}
{-# LANGUAGE Strict           #-}
{-# LANGUAGE DeriveGeneric    #-}
{-# LANGUAGE TypeApplications #-}


-- |
-- Module      :  Data.BWT.Internal
-- Copyright   :  (c) Matthew Mosior 2022
-- License     :  BSD-style
-- Maintainer  :  mattm.github@gmail.com
-- Portability :  portable
--
-- = WARNING
--
-- This module is considered __internal__.
--
-- The Package Versioning Policy __does not apply__.
--
-- The contents of this module may change __in any way whatsoever__
-- and __without any warning__ between minor versions of this package.
--
-- Authors importing this library are expected to track development
-- closely.
--
-- All credit goes to the author(s)/maintainer(s) of the
-- [containers](https://hackage.haskell.org/package/containers) library
-- for the above warning text.
--
-- = Description
--
-- Various data structures and custom data types to describe the Burrows-Wheeler Transform (BWT)
-- and the Inverse BWT.
--
-- The implementation of the BWT relies upon sequence provided
-- by the [containers](https://hackage.haskell.org/package/containers).
--
-- The internal 'BWTMatrix' data type relies upon the [massiv](https://hackage.haskell.org/package/massiv) package.


module Data.BWT.Internal where

import Control.Monad as CM
import Control.Monad.ST as CMST
import Control.Monad.State.Strict()
import Data.Foldable as DFold
import Data.List as DL
import Data.Sequence as DS
import Data.Massiv.Array as DMA
import Data.Massiv.Core()
import Data.STRef as DSTR
import GHC.Generics
import Prelude as P


{-Base level types.-}

-- | Basic suffix data type.  Used to describe
-- the core data inside of the 'SuffixArray' data type.
data Suffix = Suffix { Suffix -> Int
suffixindex    :: Int
                     , Suffix -> Int
suffixstartpos :: Int
                     , Suffix -> Seq Char
suffix         :: Seq Char
                     }
  deriving (Int -> Suffix -> ShowS
[Suffix] -> ShowS
Suffix -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Suffix] -> ShowS
$cshowList :: [Suffix] -> ShowS
show :: Suffix -> String
$cshow :: Suffix -> String
showsPrec :: Int -> Suffix -> ShowS
$cshowsPrec :: Int -> Suffix -> ShowS
Show,ReadPrec [Suffix]
ReadPrec Suffix
Int -> ReadS Suffix
ReadS [Suffix]
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [Suffix]
$creadListPrec :: ReadPrec [Suffix]
readPrec :: ReadPrec Suffix
$creadPrec :: ReadPrec Suffix
readList :: ReadS [Suffix]
$creadList :: ReadS [Suffix]
readsPrec :: Int -> ReadS Suffix
$creadsPrec :: Int -> ReadS Suffix
Read,Suffix -> Suffix -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Suffix -> Suffix -> Bool
$c/= :: Suffix -> Suffix -> Bool
== :: Suffix -> Suffix -> Bool
$c== :: Suffix -> Suffix -> Bool
Eq,Eq Suffix
Suffix -> Suffix -> Bool
Suffix -> Suffix -> Ordering
Suffix -> Suffix -> Suffix
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Suffix -> Suffix -> Suffix
$cmin :: Suffix -> Suffix -> Suffix
max :: Suffix -> Suffix -> Suffix
$cmax :: Suffix -> Suffix -> Suffix
>= :: Suffix -> Suffix -> Bool
$c>= :: Suffix -> Suffix -> Bool
> :: Suffix -> Suffix -> Bool
$c> :: Suffix -> Suffix -> Bool
<= :: Suffix -> Suffix -> Bool
$c<= :: Suffix -> Suffix -> Bool
< :: Suffix -> Suffix -> Bool
$c< :: Suffix -> Suffix -> Bool
compare :: Suffix -> Suffix -> Ordering
$ccompare :: Suffix -> Suffix -> Ordering
Ord,forall x. Rep Suffix x -> Suffix
forall x. Suffix -> Rep Suffix x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Suffix x -> Suffix
$cfrom :: forall x. Suffix -> Rep Suffix x
Generic)

-- | The SuffixArray data type.
-- Uses sequence internally.
type SuffixArray = Seq Suffix

-- | The BWT data type.
-- Uses sequence internally.
type BWT         = Seq Char

-- | The BWTMatrix data type.
-- Uses a massiv array internally.
type BWTMatrix   = DMA.Array BN Ix1 String

{-------------------}


{-toBWT functions.-}

-- | Computes the Burrows-Wheeler Transform (BWT) using the suffix array
-- and the original string (represented as a sequence for performance).
saToBWT :: SuffixArray -> Seq Char -> BWT
saToBWT :: SuffixArray -> Seq Char -> Seq Char
saToBWT SuffixArray
DS.Empty Seq Char
_ = forall a. Seq a
DS.Empty
saToBWT (Suffix
y DS.:<| SuffixArray
ys) Seq Char
t =
  if | Suffix -> Int
suffixstartpos Suffix
y forall a. Eq a => a -> a -> Bool
/= Int
1
     -> forall a. Seq a -> Int -> a
DS.index Seq Char
t (Suffix -> Int
suffixstartpos Suffix
y forall a. Num a => a -> a -> a
- Int
1 forall a. Num a => a -> a -> a
- Int
1)
        forall a. a -> Seq a -> Seq a
DS.<| (SuffixArray -> Seq Char -> Seq Char
saToBWT SuffixArray
ys Seq Char
t)
     | Bool
otherwise
     -> forall a. Seq a -> Int -> a
DS.index Seq Char
t (forall a. Seq a -> Int
DS.length Seq Char
t forall a. Num a => a -> a -> a
- Int
1)
        forall a. a -> Seq a -> Seq a
DS.<| (SuffixArray -> Seq Char -> Seq Char
saToBWT SuffixArray
ys Seq Char
t)

-- | Computes the corresponding 'SuffixArray' of a given string. Please see [suffix array](https://en.wikipedia.org/wiki/Suffix_array)
-- for more information. 
createSuffixArray :: Seq Char -> SuffixArray
createSuffixArray :: Seq Char -> SuffixArray
createSuffixArray Seq Char
xs =
  forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\(Int, Int, Seq Char)
x -> Suffix { suffixindex :: Int
suffixindex    = ((\(Int
a,Int
_,Seq Char
_) -> Int
a) (Int, Int, Seq Char)
x)
                     , suffixstartpos :: Int
suffixstartpos = ((\(Int
_,Int
b,Seq Char
_) -> Int
b) (Int, Int, Seq Char)
x)
                     , suffix :: Seq Char
suffix         = ((\(Int
_,Int
_,Seq Char
c) -> Seq Char
c) (Int, Int, Seq Char)
x)
                     }
       ) Seq (Int, Int, Seq Char)
xsssuffixesfff
    where
      xsssuffixes :: Seq (Seq Char)
xsssuffixes         = forall a. Seq a -> Seq (Seq a)
DS.tails Seq Char
xs
      xsssuffixesf :: Seq (Int, Seq Char)
xsssuffixesf        = forall a b. Seq a -> Seq b -> Seq (a, b)
DS.zip (forall a. [a] -> Seq a
DS.fromList [Int
1..(forall a. Seq a -> Int
DS.length Seq (Seq Char)
xsssuffixes)])
                                   Seq (Seq Char)
xsssuffixes
      xsssuffixesff :: Seq (Int, Seq Char)
xsssuffixesff       = forall a. (a -> Bool) -> Seq a -> Seq a
DS.filter (\(Int
_,Seq Char
b) -> Bool -> Bool
not forall a b. (a -> b) -> a -> b
$ forall a. Seq a -> Bool
DS.null Seq Char
b)
                                      Seq (Int, Seq Char)
xsssuffixesf
      xsssuffixesffsorted :: Seq (Int, Seq Char)
xsssuffixesffsorted = forall b a. Ord b => (a -> b) -> Seq a -> Seq a
DS.sortOn forall a b. (a, b) -> b
snd Seq (Int, Seq Char)
xsssuffixesff
      xsssuffixesfff :: Seq (Int, Int, Seq Char)
xsssuffixesfff      = (\(Int
a,(Int
b,Seq Char
c)) -> (Int
a,Int
b,Seq Char
c))
                            forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
                            forall a b. Seq a -> Seq b -> Seq (a, b)
DS.zip (forall a. [a] -> Seq a
DS.fromList [Int
1..(forall a. Seq a -> Int
DS.length Seq (Int, Seq Char)
xsssuffixesffsorted)])
                                   Seq (Int, Seq Char)
xsssuffixesffsorted

{------------------}


{-fromBWT functions.-}

-- | Hierarchical sorting scheme that compares fst first then snd.
-- Necessary for the setting up the BWT in order to correctly
-- invert it using the [Magic](https://www.youtube.com/watch?v=QwSsppKrCj4) algorithm.
sortTB :: (Ord a1, Ord a2) => (a1, a2) -> (a1, a2) -> Ordering
sortTB :: forall a1 a2. (Ord a1, Ord a2) => (a1, a2) -> (a1, a2) -> Ordering
sortTB (a1
c1,a2
i1) (a1
c2,a2
i2) = forall a. Ord a => a -> a -> Ordering
compare a1
c1 a1
c2 forall a. Semigroup a => a -> a -> a
<>
                         forall a. Ord a => a -> a -> Ordering
compare a2
i1 a2
i2

-- | Abstract BWTSeq type utilizing a sequence.
type BWTSeq a = Seq Char

-- | Abstract data type representing a BWTSeq in the (strict) ST monad.
type STBWTSeq s a = STRef s (BWTSeq Char)

-- | State function to push BWTString data into stack.
pushSTBWTSeq :: STBWTSeq s Char -> Char -> ST s ()
pushSTBWTSeq :: forall s. STBWTSeq s Char -> Char -> ST s ()
pushSTBWTSeq STBWTSeq s Char
s Char
e = do
  Seq Char
s2 <- forall s a. STRef s a -> ST s a
readSTRef STBWTSeq s Char
s
  forall s a. STRef s a -> a -> ST s ()
writeSTRef STBWTSeq s Char
s (Seq Char
s2 forall a. Seq a -> a -> Seq a
DS.|> Char
e)

-- | State function to create empty STBWTString type.
emptySTBWTSeq :: ST s (STBWTSeq s Char)
emptySTBWTSeq :: forall s. ST s (STBWTSeq s Char)
emptySTBWTSeq = forall a s. a -> ST s (STRef s a)
newSTRef forall a. Seq a
DS.empty

-- | Abstract BWTCounter and associated state type.
type STBWTCounter s a = STRef s Int

-- | State function to update BWTCounter.
updateSTBWTCounter :: STBWTCounter s Int -> Int -> ST s ()
updateSTBWTCounter :: forall s. STBWTCounter s Int -> Int -> ST s ()
updateSTBWTCounter STBWTCounter s Int
s Int
e = forall s a. STRef s a -> a -> ST s ()
writeSTRef STBWTCounter s Int
s Int
e

-- | State function to create empty STBWTCounter type.
emptySTBWTCounter :: ST s (STBWTCounter s Int)
emptySTBWTCounter :: forall s. ST s (STBWTCounter s Int)
emptySTBWTCounter = forall a s. a -> ST s (STRef s a)
newSTRef (-Int
1)

-- | "Magic" Inverse BWT function.
magicInverseBWT :: Seq (Char,Int) -> ST s (BWTSeq Char)
magicInverseBWT :: forall s. Seq (Char, Int) -> ST s (Seq Char)
magicInverseBWT Seq (Char, Int)
DS.Empty = do
  STBWTSeq s Char
bwtseqstackempty <- forall s. ST s (STBWTSeq s Char)
emptySTBWTSeq
  Seq Char
bwtseqstackemptyr <- forall s a. STRef s a -> ST s a
readSTRef STBWTSeq s Char
bwtseqstackempty
  forall (m :: * -> *) a. Monad m => a -> m a
return Seq Char
bwtseqstackemptyr
magicInverseBWT Seq (Char, Int)
xs       = do
  STBWTSeq s Char
bwtseqstack <- forall s. ST s (STBWTSeq s Char)
emptySTBWTSeq
  STBWTCounter s Int
bwtcounterstack <- forall s. ST s (STBWTCounter s Int)
emptySTBWTCounter
  case (forall a. (a -> Bool) -> Seq a -> Maybe Int
DS.findIndexL ((forall a. Eq a => a -> a -> Bool
== Char
'$') forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst) Seq (Char, Int)
xs) of
    Maybe Int
Nothing              -> do Seq Char
bwtseqstackr <- forall s a. STRef s a -> ST s a
readSTRef STBWTSeq s Char
bwtseqstack
                               forall (m :: * -> *) a. Monad m => a -> m a
return Seq Char
bwtseqstackr
    Just Int
dollarsignindex -> do let dollarsignfirst :: (Char, Int)
dollarsignfirst = forall a. Seq a -> Int -> a
DS.index Seq (Char, Int)
xs
                                                              Int
dollarsignindex
                               forall s. STBWTCounter s Int -> Int -> ST s ()
updateSTBWTCounter STBWTCounter s Int
bwtcounterstack
                                                  (forall a b. (a, b) -> b
snd (Char, Int)
dollarsignfirst)
                               forall {s}.
Seq (Char, Int) -> STRef s (Seq Char) -> STRef s Int -> ST s ()
iBWT Seq (Char, Int)
xs
                                    STBWTSeq s Char
bwtseqstack
                                    STBWTCounter s Int
bwtcounterstack
                               Seq Char
bwtseqstackr <- forall s a. STRef s a -> ST s a
readSTRef STBWTSeq s Char
bwtseqstack
                               forall (m :: * -> *) a. Monad m => a -> m a
return Seq Char
bwtseqstackr
      where
        iBWT :: Seq (Char, Int) -> STRef s (Seq Char) -> STRef s Int -> ST s ()
iBWT Seq (Char, Int)
ys STRef s (Seq Char)
bwtss STRef s Int
bwtcs = do
          Int
cbwtcs <- forall s a. STRef s a -> ST s a
readSTRef STRef s Int
bwtcs
          Seq Char
cbwtss <- forall s a. STRef s a -> ST s a
readSTRef STRef s (Seq Char)
bwtss
          forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
CM.when (forall a. Seq a -> Int
DS.length Seq Char
cbwtss forall a. Ord a => a -> a -> Bool
< forall a. Seq a -> Int
DS.length Seq (Char, Int)
ys) forall a b. (a -> b) -> a -> b
$ do
            let next :: (Char, Int)
next = forall a. Seq a -> Int -> a
DS.index Seq (Char, Int)
ys Int
cbwtcs
            forall s. STBWTSeq s Char -> Char -> ST s ()
pushSTBWTSeq STRef s (Seq Char)
bwtss
                         (forall a b. (a, b) -> a
fst (Char, Int)
next)
            forall s. STBWTCounter s Int -> Int -> ST s ()
updateSTBWTCounter STRef s Int
bwtcs
                               (forall a b. (a, b) -> b
snd (Char, Int)
next)
            Seq (Char, Int) -> STRef s (Seq Char) -> STRef s Int -> ST s ()
iBWT Seq (Char, Int)
ys STRef s (Seq Char)
bwtss STRef s Int
bwtcs

-- | Easy way to grab the first two elements of a sequence.
grabHeadChunks :: Seq (Seq Char) -> (Seq Char,Seq Char)
grabHeadChunks :: Seq (Seq Char) -> (Seq Char, Seq Char)
grabHeadChunks Seq (Seq Char)
DS.Empty       = (forall a. Seq a
DS.Empty,forall a. Seq a
DS.Empty)
grabHeadChunks (Seq Char
x1 DS.:<| Seq (Seq Char)
xs) = (Seq Char
x1,Seq (Seq Char) -> Seq Char
grabHeadChunksInternal Seq (Seq Char)
xs)
    where
      grabHeadChunksInternal :: Seq (Seq Char) -> Seq Char
      grabHeadChunksInternal :: Seq (Seq Char) -> Seq Char
grabHeadChunksInternal Seq (Seq Char)
DS.Empty       = forall a. Seq a
DS.Empty
      grabHeadChunksInternal (Seq Char
y1 DS.:<| Seq (Seq Char)
_) = Seq Char
y1

-- | Simple yet efficient implementation of converting a given string
-- into a BWT Matrix (the BWTMatrix type is a massiv array).
createBWTMatrix :: String -> BWTMatrix
createBWTMatrix :: String -> BWTMatrix
createBWTMatrix String
t =
  forall r e. Manifest r e => Comp -> [e] -> Vector r e
DMA.fromList (Word16 -> Comp
ParN Word16
0) [String]
zippedfff :: Array BN Ix1 String   
    where
      zippedfff :: [String]
zippedfff = forall a b. (a -> b) -> [a] -> [b]
DL.map forall (t :: * -> *) a. Foldable t => t a -> [a]
DFold.toList          forall a b. (a -> b) -> a -> b
$
                  forall a b. (a -> b) -> [a] -> [b]
DL.map (\(Seq Char
a,Seq Char
b) -> Seq Char
a forall a. Seq a -> Seq a -> Seq a
DS.>< Seq Char
b) forall a b. (a -> b) -> a -> b
$
                  forall (t :: * -> *) a. Foldable t => t a -> [a]
DFold.toList Seq (Seq Char, Seq Char)
zippedff
      zippedff :: Seq (Seq Char, Seq Char)
zippedff  = forall a. (a -> a -> Ordering) -> Seq a -> Seq a
DS.sortBy (\(Seq Char
a,Seq Char
_) (Seq Char
c,Seq Char
_) -> forall a. Ord a => a -> a -> Ordering
compare Seq Char
a Seq Char
c)
                  Seq (Seq Char, Seq Char)
zippedf
      zippedf :: Seq (Seq Char, Seq Char)
zippedf   = Seq (Seq Char, Seq Char)
zippedh
                  forall a. Seq a -> Seq a -> Seq a
DS.><
                  Seq (Seq Char, Seq Char)
zippedp
      zippedh :: Seq (Seq Char, Seq Char)
zippedh   = forall a. a -> Seq a
DS.singleton   forall a b. (a -> b) -> a -> b
$
                  Seq (Seq Char) -> (Seq Char, Seq Char)
grabHeadChunks forall a b. (a -> b) -> a -> b
$
                  forall a. Int -> Seq a -> Seq (Seq a)
DS.chunksOf ((forall a. Seq a -> Int
DS.length Seq Char
tseq) forall a. Num a => a -> a -> a
- Int
1)
                              Seq Char
tseq
      zippedp :: Seq (Seq Char, Seq Char)
zippedp   = forall a b. Seq a -> Seq b -> Seq (a, b)
DS.zip Seq (Seq Char)
suffixesf Seq (Seq Char)
prefixesf
      prefixesf :: Seq (Seq Char)
prefixesf = forall a. Int -> Seq a -> Seq a
DS.take ((forall a. Seq a -> Int
DS.length Seq (Seq Char)
prefixes) forall a. Num a => a -> a -> a
- Int
1)
                          Seq (Seq Char)
prefixes
      suffixesf :: Seq (Seq Char)
suffixesf = forall a. Int -> Seq a -> Seq a
DS.drop Int
1
                          Seq (Seq Char)
suffixes
      suffixes :: Seq (Seq Char)
suffixes = forall a. (a -> Bool) -> Seq a -> Seq a
DS.filter (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Seq a -> Bool
DS.null)
                 (forall a. Seq a -> Seq (Seq a)
DS.tails Seq Char
tseq)
      prefixes :: Seq (Seq Char)
prefixes = forall a. (a -> Bool) -> Seq a -> Seq a
DS.filter (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Seq a -> Bool
DS.null)
                 (forall a. Seq a -> Seq (Seq a)
DS.inits Seq Char
tseq)
      tseq :: Seq Char
tseq = (forall a. [a] -> Seq a
DS.fromList String
t) forall a. Seq a -> a -> Seq a
DS.|> Char
'$'

{--------------------}