-- Copyright 2017 Google Inc. -- -- Licensed under the Apache License, Version 2.0 (the "License"); -- you may not use this file except in compliance with the License. -- You may obtain a copy of the License at -- -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and -- limitations under the License. {-# LANGUAGE BangPatterns, OverloadedStrings, RecordWildCards #-} -- | Helpers for calculating char/byte offset from line and column number. -- All line and column offsets in this module are zero-based. -- -- Note: currently only line/column to byte offset calculation is exposed, but -- the 'OffsetTable' structure supports more, which can be added on demand. module Data.Text.Offset ( OffsetTable -- * Creation , createOffsetTable -- * Querying , lineColToByteOffsetDetail , lineColToByteOffset , OffsetError(..) , OverLineKind(..) ) where import Control.Monad (unless) import Data.Foldable (foldl') import Data.Char (ord) import qualified Data.Text.Lazy as TL import qualified Data.Vector as V import qualified Data.Vector.Unboxed as UV -- | Unicode code point count (aka number of Haskell Chars). type CharOffset = Int -- | Raw byte offset, according to UTF-8 encoding. type ByteOffset = Int -- | Offsets at the beginning of a line. type LineOffsets = (CharOffset, ByteOffset) -- | Number of bytes a Char takes when UTF-8 encoded. type CharBytes = Int -- | Offset of Char in the line (zero-based) and its encoded byte count (>1). type MultibyteChar = (Int, CharBytes) -- | Content char count, excluding the end-of-line separator. type ContentCharCount = Int type LineCharInfo = (ContentCharCount, UV.Vector MultibyteChar) -- | Stores info for mapping line/column number (Char-based) to raw offset -- (either Char or UTF-8 encoded bytes-based). data OffsetTable = OffsetTable { lineOffsets :: !(UV.Vector LineOffsets) , lineCharInfo :: !(V.Vector LineCharInfo) } deriving (Eq, Show) -- | Calculates the 'OffsetTable' for the given text. The table calculation is -- not optimized (non-strict tuples are lying around), but lookups on the result -- are performant (since the tuples are stored as unboxed vectors). createOffsetTable :: TL.Text -> OffsetTable createOffsetTable = adjust . foldl' addLineOffsets [((0, 0), undefined)] . TL.splitOn "\n" where addLineOffsets :: [(LineOffsets, LineCharInfo)] -> TL.Text -> [(LineOffsets, LineCharInfo)] -- ^ 'LineOffsets' of next line (if any), multibytes of current line. -- Accumulated in reversed line order. addLineOffsets ofs@(((cs,bs), _):_) s = let !charCount = fromIntegral (TL.length s) multis = multibytes s !byteCount = bytesUntilCharPosition charCount multis !newOffs = (cs + charCount + 1, bs + byteCount + 1) -- +1 for newlines in (newOffs, (charCount, multis)):ofs addLineOffsets _ _ = error "addLineOffsets call with empty accumulator" -- | Drops last offset, and first (bogus) charinfo. adjust :: [(LineOffsets, LineCharInfo)] -> OffsetTable adjust xs = let (ofs, info) = unzip xs in OffsetTable { lineOffsets = UV.fromList . reverse . tail $ ofs , lineCharInfo = V.fromList . tail . reverse $ info } data OffsetError = NoSuchLine | EmptyLine | OverLineEnd !ByteOffset !OverLineKind -- ^ Clients might want to recover from this case, see 'JustAtLineEnd'. deriving (Eq, Ord, Show) data OverLineKind = JustAtLineEnd -- ^ Referenced position is just after the line content. Position can be -- a valid non-inclusive ending span offset. | AfterLineEnd deriving (Eq, Ord, Show) -- | Line and col are zero-based, col is in characters. lineColToByteOffsetDetail :: OffsetTable -> Int -> Int -> Either OffsetError Int lineColToByteOffsetDetail OffsetTable{..} line col = do (_, lineByteOffs) <- note NoSuchLine (lineOffsets UV.!? line) (charCount, multis) <- note NoSuchLine (lineCharInfo V.!? line) unless (charCount > 0) (Left EmptyLine) let fileOffset = bytesUntilCharPosition col multis + lineByteOffs unless (col < charCount) $ let kind = if col == charCount then JustAtLineEnd else AfterLineEnd in Left $! OverLineEnd fileOffset kind return $! fileOffset where note :: e -> Maybe a -> Either e a note e = maybe (Left e) Right -- | Like 'lineColToByteOffsetDetail', but without failure details. lineColToByteOffset :: OffsetTable -> Int -> Int -> Maybe Int lineColToByteOffset t l c = case lineColToByteOffsetDetail t l c of Left _ -> Nothing Right a -> Just a -- | Byte count up to, but excluding the given char position (zero-based). bytesUntilCharPosition :: Int -> UV.Vector MultibyteChar -> Int bytesUntilCharPosition n ms | UV.null ms = n -- Most of the cases should hit this shortcut. | otherwise = let multis = UV.map snd . UV.takeWhile ((< n) . fst) $ ms !res = n - UV.length multis + UV.sum multis in res -- | Positions and bytecounts of multibyte characters of a text. multibytes :: TL.Text -> UV.Vector MultibyteChar multibytes = UV.fromList . reverse . snd . TL.foldl' addMulti (0, []) where addMulti :: (CharOffset, [MultibyteChar]) -> Char -> (CharOffset, [MultibyteChar]) addMulti (coffs, acc) c = let !n = charBytes c !acc1 | n == 1 = acc | otherwise = (coffs, n):acc !coffs1 = coffs + 1 in (coffs1, acc1) -- | Based on 'Data.Text.Internal.Encoding.Utf8'. charBytes :: Char -> Int charBytes = ordBytes . ord where ordBytes c | c < 0x80 = 1 | c < 0x0800 = 2 | c < 0x10000 = 3 | otherwise = 4