-- |
-- Copyright:   (c) 2021-2022 Andrew Lelechenko
-- Licence:     BSD3
-- Maintainer:  Andrew Lelechenko <andrew.lelechenko@gmail.com>

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE UnliftedFFITypes #-}

module Data.Text.Utf16.Lines
  ( TextLines
  , fromText
  , toText
  , null
  -- * Lines
  , lines
  , lengthInLines
  , splitAtLine
  -- * UTF-16 code units
  , length
  , splitAt
  , Position(..)
  , lengthAsPosition
  , splitAtPosition
  ) where

import Prelude ((+), (-), seq)
import Control.DeepSeq (NFData, rnf)
import Data.Bool (Bool, otherwise)
import Data.Coerce (coerce)
import Data.Eq (Eq, (==))
import Data.Function ((.), ($))
import Data.Maybe (Maybe(..))
import Data.Monoid (Monoid(..))
import Data.Ord (Ord, (<=), (>), (>=))
import Data.Semigroup (Semigroup(..))
import Data.String (IsString)
import qualified Data.Text.Array as TA
import Data.Text.Internal (Text(..))
import qualified Data.Text.Lines.Internal as I
import qualified Data.Vector.Unboxed as U
import Data.Word (Word)
import Text.Show (Show, show)

#if MIN_VERSION_text(2,0,0)
import Prelude (fromIntegral)
import Foreign.C.Types (CSize(..))
import GHC.Exts (ByteArray#)
import System.IO (IO)
import System.IO.Unsafe (unsafeDupablePerformIO)
import System.Posix.Types (CSsize(..))
#else
import Data.Bool ((&&))
import Data.Ord ((<))
#endif

#ifdef DEBUG
import GHC.Stack (HasCallStack)
#else
#define HasCallStack ()
#endif

-- | A wrapper around 'Text' for fast line/column navigation.
-- Concatenation takes linear time.
--
-- This is a building block for 'Data.Text.Utf16.Rope.Rope',
-- which provides logarithmic concatenation.
newtype TextLines = TextLines I.TextLines
  deriving (TextLines -> TextLines -> Bool
(TextLines -> TextLines -> Bool)
-> (TextLines -> TextLines -> Bool) -> Eq TextLines
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TextLines -> TextLines -> Bool
$c/= :: TextLines -> TextLines -> Bool
== :: TextLines -> TextLines -> Bool
$c== :: TextLines -> TextLines -> Bool
Eq, Eq TextLines
Eq TextLines
-> (TextLines -> TextLines -> Ordering)
-> (TextLines -> TextLines -> Bool)
-> (TextLines -> TextLines -> Bool)
-> (TextLines -> TextLines -> Bool)
-> (TextLines -> TextLines -> Bool)
-> (TextLines -> TextLines -> TextLines)
-> (TextLines -> TextLines -> TextLines)
-> Ord TextLines
TextLines -> TextLines -> Bool
TextLines -> TextLines -> Ordering
TextLines -> TextLines -> TextLines
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: TextLines -> TextLines -> TextLines
$cmin :: TextLines -> TextLines -> TextLines
max :: TextLines -> TextLines -> TextLines
$cmax :: TextLines -> TextLines -> TextLines
>= :: TextLines -> TextLines -> Bool
$c>= :: TextLines -> TextLines -> Bool
> :: TextLines -> TextLines -> Bool
$c> :: TextLines -> TextLines -> Bool
<= :: TextLines -> TextLines -> Bool
$c<= :: TextLines -> TextLines -> Bool
< :: TextLines -> TextLines -> Bool
$c< :: TextLines -> TextLines -> Bool
compare :: TextLines -> TextLines -> Ordering
$ccompare :: TextLines -> TextLines -> Ordering
$cp1Ord :: Eq TextLines
Ord, String -> TextLines
(String -> TextLines) -> IsString TextLines
forall a. (String -> a) -> IsString a
fromString :: String -> TextLines
$cfromString :: String -> TextLines
IsString, b -> TextLines -> TextLines
NonEmpty TextLines -> TextLines
TextLines -> TextLines -> TextLines
(TextLines -> TextLines -> TextLines)
-> (NonEmpty TextLines -> TextLines)
-> (forall b. Integral b => b -> TextLines -> TextLines)
-> Semigroup TextLines
forall b. Integral b => b -> TextLines -> TextLines
forall a.
(a -> a -> a)
-> (NonEmpty a -> a)
-> (forall b. Integral b => b -> a -> a)
-> Semigroup a
stimes :: b -> TextLines -> TextLines
$cstimes :: forall b. Integral b => b -> TextLines -> TextLines
sconcat :: NonEmpty TextLines -> TextLines
$csconcat :: NonEmpty TextLines -> TextLines
<> :: TextLines -> TextLines -> TextLines
$c<> :: TextLines -> TextLines -> TextLines
Semigroup, Semigroup TextLines
TextLines
Semigroup TextLines
-> TextLines
-> (TextLines -> TextLines -> TextLines)
-> ([TextLines] -> TextLines)
-> Monoid TextLines
[TextLines] -> TextLines
TextLines -> TextLines -> TextLines
forall a.
Semigroup a -> a -> (a -> a -> a) -> ([a] -> a) -> Monoid a
mconcat :: [TextLines] -> TextLines
$cmconcat :: [TextLines] -> TextLines
mappend :: TextLines -> TextLines -> TextLines
$cmappend :: TextLines -> TextLines -> TextLines
mempty :: TextLines
$cmempty :: TextLines
$cp1Monoid :: Semigroup TextLines
Monoid, TextLines -> ()
(TextLines -> ()) -> NFData TextLines
forall a. (a -> ()) -> NFData a
rnf :: TextLines -> ()
$crnf :: TextLines -> ()
NFData)

instance Show TextLines where
  show :: TextLines -> String
show (TextLines TextLines
t) = TextLines -> String
forall a. Show a => a -> String
show TextLines
t

-- | Create from 'Text', linear time.
fromText :: HasCallStack => Text -> TextLines
fromText :: Text -> TextLines
fromText = (Text -> TextLines) -> Text -> TextLines
coerce Text -> TextLines
I.fromText
{-# INLINE fromText #-}

-- | Extract 'Text', O(1).
toText :: TextLines -> Text
toText :: TextLines -> Text
toText = (TextLines -> Text) -> TextLines -> Text
coerce TextLines -> Text
I.toText
{-# INLINE toText #-}

-- | Check whether a text is empty, O(1).
null :: TextLines -> Bool
null :: TextLines -> Bool
null = (TextLines -> Bool) -> TextLines -> Bool
coerce TextLines -> Bool
I.null
{-# INLINE null #-}

-- | Split into lines by @\\n@, similar to @Data.Text.@'Data.Text.lines'.
-- Each line is produced in O(1).
--
-- >>> :set -XOverloadedStrings
-- >>> lines ""
-- []
-- >>> lines "foo"
-- ["foo"]
-- >>> lines "foo\n"
-- ["foo"]
-- >>> lines "foo\n\n"
-- ["foo",""]
-- >>> lines "foo\nbar"
-- ["foo","bar"]
--
lines :: TextLines -> [Text]
lines :: TextLines -> [Text]
lines = (TextLines -> [Text]) -> TextLines -> [Text]
coerce TextLines -> [Text]
I.lines
{-# INLINE lines #-}

-- | Equivalent to 'Data.List.length' . 'lines', but in O(1).
--
-- >>> :set -XOverloadedStrings
-- >>> lengthInLines ""
-- 0
-- >>> lengthInLines "foo"
-- 1
-- >>> lengthInLines "foo\n"
-- 1
-- >>> lengthInLines "foo\n\n"
-- 2
-- >>> lengthInLines "foo\nbar"
-- 2
--
lengthInLines :: TextLines -> Word
lengthInLines :: TextLines -> Word
lengthInLines = (TextLines -> Word) -> TextLines -> Word
coerce TextLines -> Word
I.lengthInLines
{-# INLINE lengthInLines #-}

-- | Split at given line, O(1).
--
-- >>> :set -XOverloadedStrings
-- >>> map (\l -> splitAtLine l "foo\nbar") [0..3]
-- [("","foo\nbar"),("foo\n","bar"),("foo\nbar",""),("foo\nbar","")]
--
splitAtLine :: HasCallStack => Word -> TextLines -> (TextLines, TextLines)
splitAtLine :: Word -> TextLines -> (TextLines, TextLines)
splitAtLine = (Word -> TextLines -> (TextLines, TextLines))
-> Word -> TextLines -> (TextLines, TextLines)
coerce Word -> TextLines -> (TextLines, TextLines)
I.splitAtLine
{-# INLINE splitAtLine #-}

lengthTextUtf16 :: Text -> Word
#if MIN_VERSION_text(2,0,0)
lengthTextUtf16 (Text (TA.ByteArray arr) off len) = fromIntegral $ unsafeDupablePerformIO $
  lengthUtf8AsUtf16 arr (fromIntegral off) (fromIntegral len)

foreign import ccall unsafe "_hs_text_lines_length_utf8_as_utf16" lengthUtf8AsUtf16
  :: ByteArray# -> CSize -> CSize -> IO CSsize
#else
lengthTextUtf16 :: Text -> Word
lengthTextUtf16 (Text Array
_ Int
_ Int
len) = Int -> Word
I.intToWord Int
len
#endif
{-# INLINABLE lengthTextUtf16 #-}

-- | Length in UTF-16 code units.
-- Takes linear time.
--
-- >>> :set -XOverloadedStrings
-- >>> length "fя𐀀"
-- 4
-- >>> Data.Text.Lines.length "fя𐀀"
-- 3
--
length :: TextLines -> Word
length :: TextLines -> Word
length = Text -> Word
lengthTextUtf16 (Text -> Word) -> (TextLines -> Text) -> TextLines -> Word
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TextLines -> Text
toText
{-# INLINE length #-}

-- | Represent a position in a text.
data Position = Position
  { Position -> Word
posLine   :: !Word -- ^ Line.
  , Position -> Word
posColumn :: !Word -- ^ Column in UTF-16 code units.
  } deriving (Position -> Position -> Bool
(Position -> Position -> Bool)
-> (Position -> Position -> Bool) -> Eq Position
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Position -> Position -> Bool
$c/= :: Position -> Position -> Bool
== :: Position -> Position -> Bool
$c== :: Position -> Position -> Bool
Eq, Eq Position
Eq Position
-> (Position -> Position -> Ordering)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Position)
-> (Position -> Position -> Position)
-> Ord Position
Position -> Position -> Bool
Position -> Position -> Ordering
Position -> Position -> Position
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Position -> Position -> Position
$cmin :: Position -> Position -> Position
max :: Position -> Position -> Position
$cmax :: Position -> Position -> Position
>= :: Position -> Position -> Bool
$c>= :: Position -> Position -> Bool
> :: Position -> Position -> Bool
$c> :: Position -> Position -> Bool
<= :: Position -> Position -> Bool
$c<= :: Position -> Position -> Bool
< :: Position -> Position -> Bool
$c< :: Position -> Position -> Bool
compare :: Position -> Position -> Ordering
$ccompare :: Position -> Position -> Ordering
$cp1Ord :: Eq Position
Ord, Int -> Position -> ShowS
[Position] -> ShowS
Position -> String
(Int -> Position -> ShowS)
-> (Position -> String) -> ([Position] -> ShowS) -> Show Position
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Position] -> ShowS
$cshowList :: [Position] -> ShowS
show :: Position -> String
$cshow :: Position -> String
showsPrec :: Int -> Position -> ShowS
$cshowsPrec :: Int -> Position -> ShowS
Show)

instance NFData Position where
  rnf :: Position -> ()
rnf (Position Word
a Word
b) = Word -> ()
forall a. NFData a => a -> ()
rnf Word
a () -> () -> ()
`seq` Word -> ()
forall a. NFData a => a -> ()
rnf Word
b

-- | Associativity does not hold when 'posLine' overflows.
instance Semigroup Position where
  Position Word
l1 Word
c1 <> :: Position -> Position -> Position
<> Position Word
l2 Word
c2 =
    Word -> Word -> Position
Position (Word
l1 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word
l2) (if Word
l2 Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
0 then Word
c1 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word
c2 else Word
c2)

instance Monoid Position where
  mempty :: Position
mempty = Word -> Word -> Position
Position Word
0 Word
0
  mappend :: Position -> Position -> Position
mappend = Position -> Position -> Position
forall a. Semigroup a => a -> a -> a
(<>)

-- | Measure text length as an amount of lines and columns.
-- Time is proportional to the length of the last line.
--
-- >>> :set -XOverloadedStrings
-- >>> lengthAsPosition "f𐀀"
-- Position {posLine = 0, posColumn = 3}
-- >>> lengthAsPosition "f\n𐀀"
-- Position {posLine = 1, posColumn = 2}
-- >>> lengthAsPosition "f\n𐀀\n"
-- Position {posLine = 2, posColumn = 0}
--
lengthAsPosition
  :: TextLines
  -> Position
lengthAsPosition :: TextLines -> Position
lengthAsPosition (TextLines (I.TextLines (Text Array
arr Int
off Int
len) Vector Int
nls)) = Position :: Word -> Word -> Position
Position
  { posLine :: Word
posLine = Int -> Word
I.intToWord (Int -> Word) -> Int -> Word
forall a b. (a -> b) -> a -> b
$ Vector Int -> Int
forall a. Unbox a => Vector a -> Int
U.length Vector Int
nls
  , posColumn :: Word
posColumn = Text -> Word
lengthTextUtf16 (Text -> Word) -> Text -> Word
forall a b. (a -> b) -> a -> b
$ Array -> Int -> Int -> Text
Text Array
arr Int
nl (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nl)
  }
  where
    nl :: Int
nl = if Vector Int -> Bool
forall a. Unbox a => Vector a -> Bool
U.null Vector Int
nls then Int
off else Vector Int -> Int
forall a. Unbox a => Vector a -> a
U.last Vector Int
nls Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
{-# INLINABLE lengthAsPosition #-}

splitTextAtUtf16Index :: Word -> Text -> Maybe (Text, Text)
splitTextAtUtf16Index :: Word -> Text -> Maybe (Text, Text)
splitTextAtUtf16Index Word
k t :: Text
t@(Text Array
arr Int
off Int
len)
  | Word
k Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0 = (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (Array -> Int -> Int -> Text
Text Array
arr Int
off Int
0, Text
t)
  | Word
k Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
>= Int -> Word
I.intToWord Int
len = (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (Text
t, Text
forall a. Monoid a => a
mempty)
#if MIN_VERSION_text(2,0,0)
  | o >= 0 = Just (Text arr off o, Text arr (off + o) (len - o))
  | otherwise = Nothing
    where
      !(TA.ByteArray arr#) = arr
      o = fromIntegral $ unsafeDupablePerformIO $
        takeUtf8AsUtf16 arr# (fromIntegral off) (fromIntegral len) (fromIntegral k)

foreign import ccall unsafe "_hs_text_lines_take_utf8_as_utf16" takeUtf8AsUtf16
  :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
#else
  -- Something wrong is going here:
  | Bool
otherwise = if Word16
c Word16 -> Word16 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word16
0xdc00 Bool -> Bool -> Bool
&& Word16
c Word16 -> Word16 -> Bool
forall a. Ord a => a -> a -> Bool
< Word16
0xe000 then Maybe (Text, Text)
forall a. Maybe a
Nothing else (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just
    (Array -> Int -> Int -> Text
Text Array
arr Int
off Int
k', Array -> Int -> Int -> Text
Text Array
arr (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
k') (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
k'))
    where
      k' :: Int
k' = Word -> Int
I.wordToInt Word
k
      c :: Word16
c = Array -> Int -> Word16
TA.unsafeIndex Array
arr (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
k')
#endif
{-# INLINABLE splitTextAtUtf16Index #-}

-- | Combination of 'splitAtLine' and subsequent 'splitAt'.
-- If requested number of code units splits a code point in half, return 'Nothing'.
-- Time is linear in 'posColumn', but does not depend on 'posLine'.
--
-- >>> :set -XOverloadedStrings
-- >>> splitAtPosition (Position 1 0) "f\n𐀀я"
-- Just ("f\n","𐀀я")
-- >>> splitAtPosition (Position 1 1) "f\n𐀀я"
-- Nothing
-- >>> splitAtPosition (Position 1 2) "f\n𐀀я"
-- Just ("f\n𐀀","я")
-- >>> splitAtPosition (Position 0 2) "f\n𐀀я"
-- Just ("f\n","𐀀я")
-- >>> splitAtPosition (Position 0 3) "f\n𐀀я"
-- Nothing
-- >>> splitAtPosition (Position 0 4) "f\n𐀀я"
-- Just ("f\n𐀀","я")
--
splitAtPosition
  :: HasCallStack
  => Position
  -> TextLines
  -> Maybe (TextLines, TextLines)
splitAtPosition :: Position -> TextLines -> Maybe (TextLines, TextLines)
splitAtPosition (Position Word
line Word
column) (TextLines (I.TextLines (Text Array
arr Int
off Int
len) Vector Int
nls)) =
  case Word -> Text -> Maybe (Text, Text)
splitTextAtUtf16Index Word
column Text
tx of
    Maybe (Text, Text)
Nothing -> Maybe (TextLines, TextLines)
forall a. Maybe a
Nothing
    Just (Text Array
_ Int
off' Int
len', Text
tz) -> let n :: Int
n = Vector Int -> Int -> Int
forall a. (Ord a, Unbox a) => Vector a -> a -> Int
I.binarySearch Vector Int
nls (Int
off' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len') in (TextLines, TextLines) -> Maybe (TextLines, TextLines)
forall a. a -> Maybe a
Just
      ( TextLines -> TextLines
TextLines (TextLines -> TextLines) -> TextLines -> TextLines
forall a b. (a -> b) -> a -> b
$ Text -> Vector Int -> TextLines
I.textLines (Array -> Int -> Int -> Text
Text Array
arr Int
off (Int
off' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
off)) (Int -> Vector Int -> Vector Int
forall a. Unbox a => Int -> Vector a -> Vector a
U.take Int
n Vector Int
nls)
      , TextLines -> TextLines
TextLines (TextLines -> TextLines) -> TextLines -> TextLines
forall a b. (a -> b) -> a -> b
$ Text -> Vector Int -> TextLines
I.textLines Text
tz (Int -> Vector Int -> Vector Int
forall a. Unbox a => Int -> Vector a -> Vector a
U.drop Int
n Vector Int
nls))
  where
    arrLen :: Int
arrLen = Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len
    nl :: Int
nl
      | Word
line Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0 = Int
off
      | Word
line Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
> Int -> Word
I.intToWord (Vector Int -> Int
forall a. Unbox a => Vector a -> Int
U.length Vector Int
nls) = Int
arrLen
      | Bool
otherwise = Vector Int
nls Vector Int -> Int -> Int
forall a. Unbox a => Vector a -> Int -> a
U.! (Word -> Int
I.wordToInt Word
line Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
    tx :: Text
tx = Array -> Int -> Int -> Text
Text Array
arr Int
nl (Int
arrLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nl)
{-# INLINABLE splitAtPosition #-}

-- | Split at given UTF-16 code unit.
-- If requested number of code units splits a code point in half, return 'Nothing'.
-- Takes linear time.
--
-- >>> :set -XOverloadedStrings
-- >>> map (\c -> splitAt c "fя𐀀") [0..4]
-- [Just ("","fя𐀀"),Just ("f","я𐀀"),Just ("fя","𐀀"),Nothing,Just ("fя𐀀","")]
--
splitAt :: HasCallStack => Word -> TextLines -> Maybe (TextLines, TextLines)
splitAt :: Word -> TextLines -> Maybe (TextLines, TextLines)
splitAt = Position -> TextLines -> Maybe (TextLines, TextLines)
splitAtPosition (Position -> TextLines -> Maybe (TextLines, TextLines))
-> (Word -> Position)
-> Word
-> TextLines
-> Maybe (TextLines, TextLines)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word -> Word -> Position
Position Word
0
{-# INLINE splitAt #-}