-- | -- Copyright: (c) 2021-2022 Andrew Lelechenko -- Licence: BSD3 -- Maintainer: Andrew Lelechenko {-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TupleSections #-} #ifdef DEBUG #define DEFRAGMENTATION_THRESHOLD 4 #else #define DEFRAGMENTATION_THRESHOLD 4096 #endif module Data.Text.Utf16.Rope.Mixed ( Rope , fromText , fromTextLines , toText , toTextLines , null -- * Lines , lines , lengthInLines , splitAtLine -- * Code points , charLength , charSplitAt , charLengthAsPosition , charSplitAtPosition -- * UTF-16 code units , utf16Length , utf16SplitAt , utf16LengthAsPosition , utf16SplitAtPosition ) where import Prelude ((-), (+), seq) import Control.DeepSeq (NFData, rnf) import Data.Bool (Bool(..), otherwise) import Data.Char (Char) import Data.Eq (Eq, (==)) import Data.Function ((.), ($), on) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Ord (Ord, compare, (<), (<=), Ordering(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TextLazy import qualified Data.Text.Lazy.Builder as Builder import Data.Text.Lines.Internal (TextLines) import qualified Data.Text.Lines.Internal as TL (null, fromText, toText, lines, splitAtLine) import qualified Data.Text.Lines as Char import qualified Data.Text.Utf16.Lines as Utf16 import Data.Word (Word) import Text.Show (Show) #ifdef DEBUG import Prelude (error) import GHC.Stack (HasCallStack) #else #define HasCallStack () import Text.Show (show) #endif -- | Rope of 'Text' chunks with logarithmic concatenation. -- This rope offers two interfaces: one based on code points -- and another one based on UTF-16 code units. This comes with a price -- of double bookkeeping and is less performant than "Data.Text.Rope" -- or "Data.Text.Utf16.Rope". data Rope = Empty | Node { _ropeLeft :: !Rope , _ropeMiddle :: !TextLines , _ropeRight :: !Rope , _ropeCharLen :: !Word , _ropeCharLenAsPos :: !Char.Position , _ropeUtf16Len :: !Word , _ropeUtf16LenAsPos :: !Utf16.Position } instance NFData Rope where rnf Empty = () -- No need to deepseq strict fields, for which WHNF = NF rnf (Node l _ r _ _ _ _) = rnf l `seq` rnf r instance Eq Rope where (==) = (==) `on` toLazyText instance Ord Rope where compare = compare `on` toLazyText #ifdef DEBUG deriving instance Show Rope #else instance Show Rope where show = show . toLazyText #endif instance IsString Rope where fromString = fromTextLines . fromString -- | Check whether a rope is empty, O(1). null :: Rope -> Bool null = \case Empty -> True Node{} -> False -- | Length in code points, similar to @Data.Text.@'Data.Text.length', O(1). -- -- >>> :set -XOverloadedStrings -- >>> charLength "fяš€€" -- 3 -- charLength :: Rope -> Word charLength = \case Empty -> 0 Node _ _ _ w _ _ _ -> w -- | Length in UTF-16 code units, O(1). -- -- >>> :set -XOverloadedStrings -- >>> utf16Length "fяš€€" -- 4 -- utf16Length :: Rope -> Word utf16Length = \case Empty -> 0 Node _ _ _ _ _ w _ -> w -- | Measure text length as an amount of lines and columns, O(1). -- -- >>> :set -XOverloadedStrings -- >>> charLengthAsPosition "fš€€" -- Position {posLine = 0, posColumn = 2} -- >>> charLengthAsPosition "f\nš€€" -- Position {posLine = 1, posColumn = 1} -- >>> charLengthAsPosition "f\nš€€\n" -- Position {posLine = 2, posColumn = 0} -- charLengthAsPosition :: Rope -> Char.Position charLengthAsPosition = \case Empty -> mempty Node _ _ _ _ p _ _ -> p -- | Measure text length as an amount of lines and columns, O(1). -- -- >>> :set -XOverloadedStrings -- >>> utf16LengthAsPosition "fš€€" -- Position {posLine = 0, posColumn = 3} -- >>> utf16LengthAsPosition "f\nš€€" -- Position {posLine = 1, posColumn = 2} -- >>> utf16LengthAsPosition "f\nš€€\n" -- Position {posLine = 2, posColumn = 0} -- utf16LengthAsPosition :: Rope -> Utf16.Position utf16LengthAsPosition = \case Empty -> mempty Node _ _ _ _ _ _ p -> p instance Semigroup Rope where Empty <> t = t t <> Empty = t Node l1 c1 r1 u1 p1 u1' p1' <> Node l2 c2 r2 u2 p2 u2' p2' = defragment l1 c1 (Node (r1 <> l2) c2 r2 (charLength r1 + u2) (charLengthAsPosition r1 <> p2) (utf16Length r1 + u2') (utf16LengthAsPosition r1 <> p2')) (u1 + u2) (p1 <> p2) (u1' + u2') (p1' <> p2') instance Monoid Rope where mempty = Empty mappend = (<>) defragment :: HasCallStack => Rope -> TextLines -> Rope -> Word -> Char.Position -> Word -> Utf16.Position -> Rope defragment !l !c !r !u !p !u' !p' #ifdef DEBUG | TL.null c = error "Data.Text.Lines: violated internal invariant" #endif | u < DEFRAGMENTATION_THRESHOLD = Node Empty (toTextLines rp) Empty u p u' p' | otherwise = rp where rp = Node l c r u p u' p' -- | Create from 'TextLines', linear time. fromTextLines :: TextLines -> Rope fromTextLines tl | TL.null tl = Empty | otherwise = Node Empty tl Empty (Char.length tl) (Char.lengthAsPosition tl) (Utf16.length tl) (Utf16.lengthAsPosition tl) node :: HasCallStack => Rope -> TextLines -> Rope -> Rope node l c r = defragment l c r totalLength totalLengthAsPosition totalLength' totalLengthAsPosition' where totalLength = charLength l + Char.length c + charLength r totalLengthAsPosition = charLengthAsPosition l <> Char.lengthAsPosition c <> charLengthAsPosition r totalLength' = utf16Length l + Utf16.length c + utf16Length r totalLengthAsPosition' = utf16LengthAsPosition l <> Utf16.lengthAsPosition c <> utf16LengthAsPosition r (|>) :: Rope -> TextLines -> Rope tr |> tl | TL.null tl = tr | otherwise = node tr tl Empty (<|) :: TextLines -> Rope -> Rope tl <| tr | TL.null tl = tr | otherwise = node Empty tl tr -- | Create from 'Text', linear time. fromText :: Text -> Rope fromText = fromTextLines . TL.fromText foldMapRope :: Monoid a => (TextLines -> a) -> Rope -> a foldMapRope f = go where go = \case Empty -> mempty Node l c r _ _ _ _ -> go l `mappend` f c `mappend` go r data Lines = Lines ![Text] !Bool instance Semigroup Lines where Lines [] _ <> ls = ls ls <> Lines [] _ = ls Lines xs x <> Lines ys y = Lines (if x then xs <> ys else go xs ys) y where go [] vs = vs go [u] (v : vs) = (u <> v) : vs go (u : us) vs = u : go us vs instance Monoid Lines where mempty = Lines [] False mappend = (<>) -- | Split into lines by @\\n@, similar to @Data.Text.@'Data.Text.lines'. -- Each line is produced in O(1). -- -- >>> :set -XOverloadedStrings -- >>> lines "" -- [] -- >>> lines "foo" -- ["foo"] -- >>> lines "foo\n" -- ["foo"] -- >>> lines "foo\n\n" -- ["foo",""] -- >>> lines "foo\nbar" -- ["foo","bar"] -- lines :: Rope -> [Text] lines = (\(Lines ls _) -> ls) . foldMapRope -- This assumes that there are no empty chunks: (\tl -> Lines (TL.lines tl) (T.last (TL.toText tl) == '\n')) lastChar :: Rope -> Maybe Char lastChar = \case Empty -> Nothing -- This assumes that there are no empty chunks: Node _ c Empty _ _ _ _ -> Just $ T.last $ TL.toText c Node _ _ r _ _ _ _ -> lastChar r -- | Equivalent to 'Data.List.length' . 'lines', but in logarithmic time. -- -- >>> :set -XOverloadedStrings -- >>> lengthInLines "" -- 0 -- >>> lengthInLines "foo" -- 1 -- >>> lengthInLines "foo\n" -- 1 -- >>> lengthInLines "foo\n\n" -- 2 -- >>> lengthInLines "foo\nbar" -- 2 -- -- If you do not care about ignoring the last newline character, -- you can use 'Char.posLine' . 'charLengthAsPosition' instead, which works in O(1). -- lengthInLines :: Rope -> Word lengthInLines rp = case lastChar rp of Nothing -> 0 Just ch -> Char.posLine (charLengthAsPosition rp) + (if ch == '\n' then 0 else 1) -- | Glue chunks into 'TextLines', linear time. toTextLines :: Rope -> TextLines toTextLines = mconcat . foldMapRope (:[]) toLazyText :: Rope -> TextLazy.Text toLazyText = foldMapRope (TextLazy.fromStrict . TL.toText) -- | Glue chunks into 'Text', linear time. toText :: Rope -> Text toText = TextLazy.toStrict . Builder.toLazyText . foldMapRope (Builder.fromText . TL.toText) -- | Split at given code point, similar to @Data.Text.@'Data.Text.splitAt'. -- Takes linear time. -- -- >>> :set -XOverloadedStrings -- >>> map (\c -> charSplitAt c "fяš€€") [0..4] -- [("","fяš€€"),("f","яš€€"),("fя","š€€"),("fяš€€",""),("fяš€€","")] -- charSplitAt :: HasCallStack => Word -> Rope -> (Rope, Rope) charSplitAt !len = \case Empty -> (Empty, Empty) Node l c r _ _ _ _ | len <= ll -> case charSplitAt len l of (before, after) -> (before, node after c r) | len <= llc -> case Char.splitAt (len - ll) c of (before, after) -> (l |> before, after <| r) | otherwise -> case charSplitAt (len - llc) r of (before, after) -> (node l c before, after) where ll = charLength l llc = ll + Char.length c -- | Split at given UTF-16 code unit. -- If requested number of code units splits a code point in half, return 'Nothing'. -- Takes linear time. -- -- >>> :set -XOverloadedStrings -- >>> map (\c -> utf16SplitAt c "fяš€€") [0..4] -- [Just ("","fяš€€"),Just ("f","яš€€"),Just ("fя","š€€"),Nothing,Just ("fяš€€","")] -- utf16SplitAt :: HasCallStack => Word -> Rope -> Maybe (Rope, Rope) utf16SplitAt !len = \case Empty -> Just (Empty, Empty) Node l c r _ _ _ _ | len <= ll -> case utf16SplitAt len l of Nothing -> Nothing Just (before, after) -> Just (before, node after c r) | len <= llc -> case Utf16.splitAt (len - ll) c of Nothing -> Nothing Just (before, after) -> Just (l |> before, after <| r) | otherwise -> case utf16SplitAt (len - llc) r of Nothing -> Nothing Just (before, after) -> Just (node l c before, after) where ll = utf16Length l llc = ll + Utf16.length c -- | Split at given line, logarithmic time. -- -- >>> :set -XOverloadedStrings -- >>> map (\l -> splitAtLine l "foo\nbar") [0..3] -- [("","foo\nbar"),("foo\n","bar"),("foo\nbar",""),("foo\nbar","")] -- splitAtLine :: HasCallStack => Word -> Rope -> (Rope, Rope) splitAtLine !len = \case Empty -> (Empty, Empty) Node l c r _ _ _ _ | len <= ll -> case splitAtLine len l of (before, after) -> (before, node after c r) | len <= llc -> case TL.splitAtLine (len - ll) c of (before, after) -> (l |> before, after <| r) | otherwise -> case splitAtLine (len - llc) r of (before, after) -> (node l c before, after) where -- posLine is the same both in Char.lengthAsPosition and Utf16.lengthAsPosition ll = Char.posLine (charLengthAsPosition l) llc = ll + Char.posLine (Char.lengthAsPosition c) charSubOnRope :: Rope -> Char.Position -> Char.Position -> Char.Position charSubOnRope rp (Char.Position xl xc) (Char.Position yl yc) = case xl `compare` yl of GT -> Char.Position (xl - yl) xc EQ -> Char.Position 0 (xc - yc) LT -> Char.Position 0 (xc - charLength rp') where (_, rp') = splitAtLine xl rp utf16SubOnRope :: Rope -> Utf16.Position -> Utf16.Position -> Utf16.Position utf16SubOnRope rp (Utf16.Position xl xc) (Utf16.Position yl yc) = case xl `compare` yl of GT -> Utf16.Position (xl - yl) xc EQ -> Utf16.Position 0 (xc - yc) LT -> Utf16.Position 0 (xc - utf16Length rp') where (_, rp') = splitAtLine xl rp charSubOnLines :: Char.TextLines -> Char.Position -> Char.Position -> Char.Position charSubOnLines tl (Char.Position xl xc) (Char.Position yl yc) = case xl `compare` yl of GT -> Char.Position (xl - yl) xc EQ -> Char.Position 0 (xc - yc) LT -> Char.Position 0 (xc - Char.length tl') where (_, tl') = Char.splitAtLine xl tl utf16SubOnLines :: Utf16.TextLines -> Utf16.Position -> Utf16.Position -> Utf16.Position utf16SubOnLines tl (Utf16.Position xl xc) (Utf16.Position yl yc) = case xl `compare` yl of GT -> Utf16.Position (xl - yl) xc EQ -> Utf16.Position 0 (xc - yc) LT -> Utf16.Position 0 (xc - Utf16.length tl') where (_, tl') = Utf16.splitAtLine xl tl -- | Combination of 'splitAtLine' and subsequent 'charSplitAt'. -- Time is linear in 'Char.posColumn' and logarithmic in 'Char.posLine'. -- -- >>> :set -XOverloadedStrings -- >>> charSplitAtPosition (Position 1 0) "f\nš€€Ń" -- ("f\n","š€€Ń") -- >>> charSplitAtPosition (Position 1 1) "f\nš€€Ń" -- ("f\nš€€","я") -- >>> charSplitAtPosition (Position 1 2) "f\nš€€Ń" -- ("f\nš€€Ń","") -- >>> charSplitAtPosition (Position 0 2) "f\nš€€Ń" -- ("f\n","š€€Ń") -- >>> charSplitAtPosition (Position 0 3) "f\nš€€Ń" -- ("f\nš€€","я") -- >>> charSplitAtPosition (Position 0 4) "f\nš€€Ń" -- ("f\nš€€Ń","") -- charSplitAtPosition :: HasCallStack => Char.Position -> Rope -> (Rope, Rope) charSplitAtPosition (Char.Position 0 0) = (mempty,) charSplitAtPosition !len = \case Empty -> (Empty, Empty) Node l c r _ _ _ _ | len <= ll -> case charSplitAtPosition len l of (before, after) | null after -> case charSplitAtPosition len' (c <| r) of (r', r'') -> (l <> r', r'') | otherwise -> (before, node after c r) | len <= llc -> case Char.splitAtPosition len' c of (before, after) | TL.null after -> case charSplitAtPosition len'' r of (r', r'') -> ((l |> c) <> r', r'') | otherwise -> (l |> before, after <| r) | otherwise -> case charSplitAtPosition len'' r of (before, after) -> (node l c before, after) where ll = charLengthAsPosition l lc = Char.lengthAsPosition c llc = ll <> lc len' = charSubOnRope l len ll len'' = charSubOnLines c len' lc -- | Combination of 'splitAtLine' and subsequent 'utf16SplitAt'. -- Time is linear in 'Utf16.posColumn' and logarithmic in 'Utf16.posLine'. -- -- >>> :set -XOverloadedStrings -- >>> utf16SplitAtPosition (Position 1 0) "f\nš€€Ń" -- Just ("f\n","š€€Ń") -- >>> utf16SplitAtPosition (Position 1 1) "f\nš€€Ń" -- Nothing -- >>> utf16SplitAtPosition (Position 1 2) "f\nš€€Ń" -- Just ("f\nš€€","я") -- >>> utf16SplitAtPosition (Position 0 2) "f\nš€€Ń" -- Just ("f\n","š€€Ń") -- >>> utf16SplitAtPosition (Position 0 3) "f\nš€€Ń" -- Nothing -- >>> utf16SplitAtPosition (Position 0 4) "f\nš€€Ń" -- Just ("f\nš€€","я") -- utf16SplitAtPosition :: HasCallStack => Utf16.Position -> Rope -> Maybe (Rope, Rope) utf16SplitAtPosition (Utf16.Position 0 0) = Just . (mempty,) utf16SplitAtPosition !len = \case Empty -> Just (Empty, Empty) Node l c r _ _ _ _ | len <= ll -> case utf16SplitAtPosition len l of Nothing -> Nothing Just (before, after) | null after -> case utf16SplitAtPosition len' (c <| r) of Nothing -> Nothing Just (r', r'') -> Just (l <> r', r'') | otherwise -> Just (before, node after c r) | len <= llc -> case Utf16.splitAtPosition len' c of Nothing -> Nothing Just (before, after) | Utf16.null after -> case utf16SplitAtPosition len'' r of Nothing -> Nothing Just (r', r'') -> Just ((l |> c) <> r', r'') | otherwise -> Just (l |> before, after <| r) | otherwise -> case utf16SplitAtPosition len'' r of Nothing -> Nothing Just (before, after) -> Just (node l c before, after) where ll = utf16LengthAsPosition l lc = Utf16.lengthAsPosition c llc = ll <> lc len' = utf16SubOnRope l len ll len'' = utf16SubOnLines c len' lc