{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

-- |
-- If you're accustomed to working with text in almost any other programming
-- language, you'd be aware that a \"string\" typically refers to an in-memory
-- /array/ of characters. Traditionally this was a single ASCII byte per
-- character; more recently UTF-8 variable byte encodings which dramatically
-- complicates finding offsets but which gives efficient support for the
-- entire Unicode character space. In Haskell, the original text type,
-- 'String', is implemented as a list of 'Char' which, because a Haskell list
-- is implemented as a /linked-list of boxed values/, is wildly inefficient at
-- any kind of scale.
--
-- In modern Haskell there are two primary ways to represent text.
--
-- First is via the [rather poorly named] @ByteString@ from the __bytestring__
-- package (which is an array of bytes in pinned memory). The
-- "Data.ByteString.Char8" submodule gives you ways to manipulate those arrays
-- as if they were ASCII characters. Confusingly there are both strict
-- (@Data.ByteString@) and lazy (@Data.ByteString.Lazy@) variants which are
-- often hard to tell the difference between when reading function signatures
-- or haddock documentation. The performance problem an immutable array backed
-- data type runs into is that appending a character (that is, ASCII byte) or
-- concatonating a string (that is, another array of ASCII bytes) is very
-- expensive and requires allocating a new larger array and copying the whole
-- thing into it. This led to the development of \"builders\" which amortize
-- this reallocation cost over time, but it can be cumbersome to switch
-- between @Builder@, the lazy @ByteString@ that results, and then having to
-- inevitably convert to a strict @ByteString@ because that's what the next
-- function in your sequence requires.
--
-- The second way is through the opaque @Text@ type of "Data.Text" from the
-- __text__ package, which is well tuned and high-performing but suffers from
-- the same design; it is likewise backed by arrays. Rather surprisingly, the
-- storage backing Text objects are encoded in UTF-16, meaning every time you
-- want to work with unicode characters that came in from /anywhere/ else and
-- which inevitably are UTF-8 encoded you have to convert to UTF-16 and copy
-- into a new array, wasting time and memory.
--
-- In this package we introduce 'Rope', a text type backed by the 2-3
-- 'Data.FingerTree.FingerTree' data structure from the __fingertree__
-- package. This is not an uncommon solution in many languages as finger trees
-- support exceptionally efficient appending to either end and good
-- performance inserting anywhere else (you often find them as the backing
-- data type underneath text editors for this reason). Rather than 'Char' the
-- pieces of the rope are 'Data.Text.Short.ShortText' from the __text-short__
-- package, which are UTF-8 encoded and in normal memory managed by the
-- Haskell runtime. Conversion from other Haskell text types is not /O(1)/
-- (UTF-8 validity must be checked, or UTF-16 decoded, or...), but in our
-- benchmarking the performance has been comparable to the established types
-- and you may find the resultant interface for combining chunks is comparable
-- to using a Builder, without being forced to use a Builder.
--
-- 'Rope' is used as the text type throughout this library. If you use the
-- functions within this package (rather than converting to other text types)
-- operations are quite efficient. When you do need to convert to another type
-- you can use 'fromRope' or 'intoRope' from the 'Textual' typeclass.
--
-- Note that we haven't tried to cover the entire gamut of operations or
-- customary convenience functions you would find in the other libraries; so
-- far 'Rope' is concentrated on aiding interoperation, being good at
-- appending (lots of) small pieces, and then efficiently taking the resultant
-- text object out to a file handle, be that the terminal console, a file, or
-- a network socket.
module Core.Text.Rope
  ( -- * Rope type
    Rope,
    emptyRope,
    singletonRope,
    replicateRope,
    replicateChar,
    widthRope,
    splitRope,
    insertRope,
    containsCharacter,
    findIndexRope,

    -- * Interoperation and Output
    Textual (fromRope, intoRope, appendRope),
    hWrite,

    -- * Internals
    unRope,
    nullRope,
    unsafeIntoRope,
    Width (..),
  )
where

import Control.DeepSeq (NFData (..))
import Core.Text.Bytes
import qualified Data.ByteString as B (ByteString)
import qualified Data.ByteString.Builder as B
  ( hPutBuilder,
    toLazyByteString,
  )
import qualified Data.ByteString.Lazy as L
  ( ByteString,
    foldrChunks,
    toStrict,
  )
import qualified Data.FingerTree as F
  ( FingerTree,
    Measured (..),
    SearchResult (..),
    ViewL (..),
    empty,
    null,
    search,
    singleton,
    viewl,
    (<|),
    (><),
    (|>),
  )
import Data.Foldable (foldl', foldr', toList)
import Data.Hashable (Hashable, hashWithSalt)
import Data.String (IsString (..))
import qualified Data.Text as T (Text)
import qualified Data.Text.Lazy as U
  ( Text,
    foldrChunks,
    fromChunks,
    toStrict,
  )
import qualified Data.Text.Lazy.Builder as U
  ( Builder,
    fromText,
    toLazyText,
  )
import Data.Text.Prettyprint.Doc (Pretty (..), emptyDoc)
import qualified Data.Text.Short as S
  ( ShortText,
    any,
    append,
    empty,
    findIndex,
    fromByteString,
    fromText,
    length,
    null,
    pack,
    replicate,
    singleton,
    splitAt,
    toBuilder,
    toText,
    unpack,
  )
import qualified Data.Text.Short.Unsafe as S (fromByteStringUnsafe)
import GHC.Generics (Generic)
import System.IO (Handle)

-- |
-- A type for textual data. A rope is text backed by a tree data structure,
-- rather than a single large continguous array, as is the case for strings.
--
-- There are three use cases:
--
-- /Referencing externally sourced data/
--
-- Often we interpret large blocks of data sourced from external systems as
-- text. Ideally we would hold onto this without copying the memory, but (as
-- in the case of @ByteString@ which is the most common source of data) before
-- we can treat it as text we have to validate the UTF-8 content. Safety
-- first. We also copy it out of pinned memory, allowing the Haskell runtime
-- to manage the storage.
--
-- /Interoperating with other libraries/
--
-- The only constant of the Haskell universe is that you won't have the right
-- combination of {strict, lazy} × {@Text@, @ByteString@, @String@, @[Word8]@,
-- etc} you need for the next function call. The 'Textual' typeclass provides
-- for moving between different text representations. To convert between
-- @Rope@ and something else use 'fromRope'; to construct a @Rope@ from
-- textual content in another type use 'intoRope'.
--
-- You can get at the underlying finger tree with the 'unRope' function.
--
-- /Assembling text to go out/
--
-- This involves considerable appending of data, very very occaisionally
-- inserting it. Often the pieces are tiny. To add text to a @Rope@ use the
-- 'appendRope' method as below or the ('Data.Semigroup.<>') operator from
-- "Data.Monoid" (like you would have with a @Builder@).
--
-- Output to a @Handle@ can be done efficiently with 'hWrite'.
newtype Rope
  = Rope (F.FingerTree Width S.ShortText)
  deriving (Generic)

instance NFData Rope where
  rnf (Rope x) = foldMap (\piece -> rnf piece) x

instance Show Rope where
  show text = "\"" ++ fromRope text ++ "\""

instance Eq Rope where
  (==) (Rope x1) (Rope x2) = (==) (stream x1) (stream x2)
    where
      stream x = foldMap S.unpack x

instance Ord Rope where
  compare (Rope x1) (Rope x2) = compare x1 x2

instance Pretty Rope where
  pretty (Rope x) = foldr ((<>) . pretty . S.toText) emptyDoc x

-- |
-- Access the finger tree underlying the @Rope@. You'll want the following
-- imports:
--
-- @
-- import qualified "Data.FingerTree" as F  -- from the __fingertree__ package
-- import qualified "Data.Text.Short" as S  -- from the __text-short__ package
-- @
unRope :: Rope -> F.FingerTree Width S.ShortText
unRope (Rope x) = x
{-# INLINE unRope #-}

-- |
-- The length of the @Rope@, in characters. This is the monoid used to
-- structure the finger tree underlying the @Rope@.
newtype Width = Width Int
  deriving (Eq, Ord, Show, Num, Generic)

instance F.Measured Width S.ShortText where
  measure :: S.ShortText -> Width
  measure piece = Width (S.length piece)

instance Semigroup Width where
  (<>) (Width w1) (Width w2) = Width (w1 + w2)

instance Monoid Width where
  mempty = Width 0
  mappend = (<>)

-- here Maybe we just need type Strand = ShortText and then Rope is
-- FingerTree Strand or Builder (Strand)

instance IsString Rope where
  fromString "" = emptyRope
  fromString xs = Rope . F.singleton . S.pack $ xs

instance Semigroup Rope where
  (<>) text1@(Rope x1) text2@(Rope x2) =
    if F.null x2
      then text1
      else
        if F.null x1
          then text2
          else Rope ((F.><) x1 x2) -- god I hate these operators

instance Monoid Rope where
  mempty = emptyRope
  mappend = (<>)

-- |
-- An zero-length 'Rope'. You can also use @\"\"@ presuming the
-- __@OverloadedStrings@__ language extension is turned on in your source
-- file.
emptyRope :: Rope
emptyRope = Rope F.empty
{-# INLINEABLE emptyRope #-}

-- |
-- A 'Rope' with but a single character.
singletonRope :: Char -> Rope
singletonRope = Rope . F.singleton . S.singleton

-- |
-- Repeat the input 'Rope' @n@ times. The follows the same semantics as other
-- @replicate@ functions; if you ask for zero copies you'll get an empty text
-- and if you ask for lots of @""@ you'll get ... an empty text.
--
-- /Implementation note/
--
-- Rather than copying the input /n/ times, this will simply add structure to hold /n/
-- references to the provided input text.
replicateRope :: Int -> Rope -> Rope
replicateRope count (Rope x) =
  let x' = foldr (\_ acc -> (F.><) x acc) F.empty [1 .. count]
   in Rope x'

-- |
-- Repeat the input 'Char' @n@ times. This is a special case of
-- 'replicateRope' above.
--
-- /Implementation note/
--
-- Rather than making a huge FingerTree full of single characters, this
-- function will allocate a single ShortText comprised of the repeated input
-- character.
replicateChar :: Int -> Char -> Rope
replicateChar count = Rope . F.singleton . S.replicate count . S.singleton

-- |
-- Get the length of this text, in characters.
widthRope :: Rope -> Int
widthRope = foldr' f 0 . unRope
  where
    f piece count = S.length piece + count

nullRope :: Rope -> Bool
nullRope (Rope x) = case F.viewl x of
  F.EmptyL -> True
  (F.:<) piece _ -> S.null piece

-- |
-- Break the text into two pieces at the specified offset.
--
-- Examples:
--
-- @
-- λ> __splitRope 0 \"abcdef\"__
-- (\"\", \"abcdef\")
-- λ> __splitRope 3 \"abcdef\"__
-- (\"abc\", \"def\")
-- λ> __splitRope 6 \"abcdef\"__
-- (\"abcdef\",\"\")
-- @
--
-- Going off either end behaves sensibly:
--
-- @
-- λ> __splitRope 7 \"abcdef\"__
-- (\"abcdef\",\"\")
-- λ> __splitRope (-1) \"abcdef\"__
-- (\"\", \"abcdef\")
-- @
splitRope :: Int -> Rope -> (Rope, Rope)
splitRope i text@(Rope x) =
  let pos = Width i
      result = F.search (\w1 _ -> w1 >= pos) x
   in case result of
        F.Position before piece after ->
          let (Width w) = F.measure before
              (one, two) = S.splitAt (i - w) piece
           in (Rope ((F.|>) before one), Rope ((F.<|) two after))
        F.OnLeft -> (Rope F.empty, text)
        F.OnRight -> (text, Rope F.empty)
        F.Nowhere -> error "Position not found in split. Probable cause: predicate function given not monotonic. This is supposed to be unreachable"

-- |
-- Insert a new piece of text into an existing @Rope@ at the specified offset.
--
-- Examples:
--
-- @
-- λ> __insertRope 3 \"Con\" \"Def 1\"__
-- "DefCon 1"
-- λ> __insertRope 0 \"United \" \"Nations\"__
-- "United Nations"
-- @
insertRope :: Int -> Rope -> Rope -> Rope
insertRope 0 (Rope new) (Rope x) = Rope ((F.><) new x)
insertRope i (Rope new) text =
  let (Rope before, Rope after) = splitRope i text
   in Rope (mconcat [before, new, after])

findIndexRope :: (Char -> Bool) -> Rope -> Maybe Int
findIndexRope predicate = fst . foldl f (Nothing, 0) . unRope
  where
    -- convert this to Maybe monad, maybe
    f :: (Maybe Int, Int) -> S.ShortText -> (Maybe Int, Int)
    f acc piece = case acc of
      (Just j, _) -> (Just j, 0)
      (Nothing, !i) -> case S.findIndex predicate piece of
        Nothing -> (Nothing, i + S.length piece)
        Just !j -> (Just (i + j), 0)

--
-- Manual instance to get around the fact that FingerTree doesn't have a
-- Hashable instance. If this were ever to become a hotspot we could
-- potentially use the Hashed caching type in the finger tree as
--
-- FingerTree Width (Hashed S.ShortText)
--
-- at the cost of endless unwrapping. Another alternative would be to cache
-- hash values in the monoid, changing Width from being a wrapper of Int to
-- a record type with width, hash, and perhaps newlines within the
-- corresponding tree.
--
instance Hashable Rope where
  hashWithSalt salt (Rope x) = foldl' f salt x
    where
      f :: Int -> S.ShortText -> Int
      f num piece = hashWithSalt num piece

-- |
-- Machinery to interpret a type as containing valid Unicode that can be
-- represented as a @Rope@ object.
--
-- /Implementation notes/
--
-- Given that @Rope@ is backed by a finger tree, 'append' is relatively
-- inexpensive, plus whatever the cost of conversion is. There is a subtle
-- trap, however: if adding small fragments of that were obtained by slicing
-- (for example) a large ByteString we would end up holding on to a reference
-- to the entire underlying block of memory. This module is optimized to
-- reduce heap fragmentation by letting the Haskell runtime and garbage
-- collector manage the memory, so instances are expected to /copy/ these
-- substrings out of pinned memory.
--
-- The @ByteString@ instance requires that its content be valid UTF-8. If not
-- an empty @Rope@ will be returned.
--
-- Several of the 'fromRope' implementations are expensive and involve a lot
-- of intermediate allocation and copying. If you're ultimately writing to a
-- handle prefer 'hWrite' which will write directly to the output buffer.
class Textual α where
  -- |
  -- Convert a @Rope@ into another text-like type.
  fromRope :: Rope -> α

  -- |
  -- Take another text-like type and convert it to a @Rope@.
  intoRope :: α -> Rope

  -- |
  -- Append some text to this @Rope@. The default implementation is basically a
  -- convenience wrapper around calling 'intoRope' and 'mappend'ing it to your
  -- text (which will work just fine, but for some types more efficient
  -- implementations are possible).
  appendRope :: α -> Rope -> Rope
  appendRope thing text = text <> intoRope thing

instance Textual (F.FingerTree Width S.ShortText) where
  fromRope = unRope
  intoRope = Rope

instance Textual Rope where
  fromRope = id
  intoRope = id
  appendRope (Rope x2) (Rope x1) = Rope ((F.><) x1 x2)

-- | from "Data.Text.Short"
instance Textual S.ShortText where
  fromRope = foldr S.append S.empty . unRope
  intoRope = Rope . F.singleton
  appendRope piece (Rope x) = Rope ((F.|>) x piece)

-- | from "Data.Text" Strict
instance Textual T.Text where
  fromRope = U.toStrict . U.toLazyText . foldr f mempty . unRope
    where
      f :: S.ShortText -> U.Builder -> U.Builder
      f piece built = (<>) (U.fromText (S.toText piece)) built

  intoRope t = Rope (F.singleton (S.fromText t))
  appendRope chunk (Rope x) = Rope ((F.|>) x (S.fromText chunk))

-- | from "Data.Text.Lazy"
instance Textual U.Text where
  fromRope (Rope x) = U.fromChunks . fmap S.toText . toList $ x
  intoRope t = Rope (U.foldrChunks ((F.<|) . S.fromText) F.empty t)

-- | from "Data.ByteString" Strict
instance Textual B.ByteString where
  fromRope = L.toStrict . B.toLazyByteString . foldr g mempty . unRope
    where
      g piece built = (<>) (S.toBuilder piece) built

  -- If the input ByteString does not contain valid UTF-8 then an empty
  -- Rope will be returned. That's not ideal.
  intoRope b' = case S.fromByteString b' of
    Just piece -> Rope (F.singleton piece)
    Nothing -> Rope F.empty -- bad

  -- ditto
  appendRope b' (Rope x) = case S.fromByteString b' of
    Just piece -> Rope ((F.|>) x piece)
    Nothing -> (Rope x) -- bad

-- | from "Data.ByteString.Lazy"
instance Textual L.ByteString where
  fromRope = B.toLazyByteString . foldr g mempty . unRope
    where
      g piece built = (<>) (S.toBuilder piece) built

  intoRope b' = Rope (L.foldrChunks ((F.<|) . check) F.empty b')
    where
      check chunk = case S.fromByteString chunk of
        Just piece -> piece
        Nothing -> S.empty -- very bad

instance Textual Bytes where
  fromRope = intoBytes . (fromRope :: Rope -> B.ByteString)
  intoRope = intoRope . unBytes

instance Binary Rope where
  fromBytes = intoRope . unBytes
  intoBytes = intoBytes . (fromRope :: Rope -> B.ByteString)

-- |
-- If you /know/ the input bytes are valid UTF-8 encoded characters, then
-- you can use this function to convert to a piece of @Rope@.
unsafeIntoRope :: B.ByteString -> Rope
unsafeIntoRope = Rope . F.singleton . S.fromByteStringUnsafe

-- | from "Data.String"
instance Textual [Char] where
  fromRope (Rope x) = foldr h [] x
    where
      h piece string = (S.unpack piece) ++ string -- ugh
  intoRope = Rope . F.singleton . S.pack

-- |
-- Write the 'Rope' to the given 'Handle'.
--
-- @
-- import "Core.Text"
-- import "Core.System" -- re-exports stdout
--
-- main :: IO ()
-- main =
--   let
--     text :: 'Rope'
--     text = "Hello World"
--   in
--     'hWrite' 'System.IO.stdout' text
-- @
-- because it's tradition.
--
-- Uses 'Data.ByteString.Builder.hPutBuilder' internally which saves all kinds
-- of intermediate allocation and copying because we can go from the
-- 'Data.Text.Short.ShortText's in the finger tree to
-- 'Data.ByteString.Short.ShortByteString' to
-- 'Data.ByteString.Builder.Builder' to the 'System.IO.Handle''s output buffer
-- in one go.
--
-- If you're working in the
-- <https://hackage.haskell.org/package/core-program/docs/Core-Program-Execute.html#t:Program Program>
-- monad, then
-- <https://hackage.haskell.org/package/core-program/docs/Core-Program-Logging.html#v:write write>
-- provides an efficient way to write a @Rope@ to @stdout@.
hWrite :: Handle -> Rope -> IO ()
hWrite handle (Rope x) = B.hPutBuilder handle (foldr j mempty x)
  where
    j piece built = (<>) (S.toBuilder piece) built

-- |
-- Does the text contain this character?
--
-- We've used it to ask whether there are newlines present in a @Rope@, for
-- example:
--
-- @
--     if 'containsCharacter' \'\\n\' text
--         then handleComplexCase
--         else keepItSimple
-- @
containsCharacter :: Char -> Rope -> Bool
containsCharacter q (Rope x) = any j x
  where
    j piece = S.any (\c -> c == q) piece