{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-|
'Source' models source code, represented as a thin wrapper around a
'B.ByteString' with conveniences for splitting by line, slicing, etc.

This module is intended to be imported qualified to avoid name clashes with
'Prelude':

> import qualified Source.Source as Source
-}
module Source.Source
  ( Source
  , bytes
  , fromUTF8
    -- * Measurement
  , Source.Source.length
  , Source.Source.null
  , totalRange
  , totalSpan
    -- * En/decoding
  , fromText
  , toText
    -- * Slicing
  , slice
  , drop
  , take
    -- * Splitting
  , Source.Source.lines
  , lineRanges
  , lineRangesWithin
  , newlineIndices
  ) where

import Prelude hiding (drop, take)

import           Control.Arrow ((&&&))
import           Control.DeepSeq (NFData)
import           Data.Aeson (FromJSON (..), withText)
import qualified Data.ByteString as B
import           Data.Char (ord)
import           Data.Maybe (fromMaybe)
import           Data.Monoid (Last (..))
import           Data.Semilattice.Lower
import           Data.String (IsString (..))
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import           Data.Text.Encoding.Error (lenientDecode)
import           GHC.Generics (Generic)
import           Source.Range
import           Source.Span (Pos (..), Span (Span))


-- | The contents of a source file. This is represented as a UTF-8
-- 'B.ByteString' under the hood. Construct these with 'fromUTF8'; obviously,
-- passing 'fromUTF8' non-UTF-8 bytes will cause crashes.
newtype Source = Source { bytes :: B.ByteString }
  deriving (Eq, Semigroup, Monoid, IsString, Show, Generic, NFData)

-- | Wrap a UTF-8-encoded 'B.ByteString' as a 'Source'.
fromUTF8 :: B.ByteString -> Source
fromUTF8 = Source

instance FromJSON Source where
  parseJSON = withText "Source" (pure . fromText)


-- Measurement

-- | The length of the source in bytes.
length :: Source -> Int
length = B.length . bytes

-- | Whether the source is empty.
null :: Source -> Bool
null = B.null . bytes

-- | Return a 'Range' that covers the entire text.
totalRange :: Source -> Range
totalRange = Range 0 . B.length . bytes

-- | Return a 'Span' that covers the entire text.
totalSpan :: Source -> Span
totalSpan source = Span (Pos 1 1) (Pos (Prelude.length ranges) (succ (end lastRange - start lastRange)))
  where
    ranges    = lineRanges source
    lastRange = fromMaybe lowerBound (getLast (foldMap (Last . Just) ranges))


-- En/decoding

-- | Return a 'Source' from a 'T.Text'.
fromText :: T.Text -> Source
fromText = Source . T.encodeUtf8

-- | Return the 'T.Text' contained in the 'Source'.
toText :: Source -> T.Text
toText = T.decodeUtf8With lenientDecode . bytes


-- Slicing

-- | Return a 'Source' that contains a slice of the given 'Source'.
slice :: Source -> Range -> Source
slice source range = taking $ dropping source
  where
    dropping = drop (start range)
    taking   = take (rangeLength range)

-- | Drop the first @i@ bytes of the 'Source'.
drop :: Int -> Source -> Source
drop i = Source . B.drop i . bytes

-- | Take the first @i@ bytes of the 'Source'.
take :: Int -> Source -> Source
take i = Source . B.take i . bytes


-- Splitting

-- | Split the contents of the source after newlines.
lines :: Source -> [Source]
lines source = slice source <$> lineRanges source

-- | Compute the 'Range's of each line in a 'Source'.
lineRanges :: Source -> [Range]
lineRanges source = lineRangesWithin source (totalRange source)

-- | Compute the 'Range's of each line in a 'Range' of a 'Source'.
lineRangesWithin :: Source -> Range -> [Range]
lineRangesWithin source range
  = uncurry (zipWith Range)
  . ((start range :) &&& (<> [ end range ]))
  . fmap (+ succ (start range))
  . newlineIndices
  . bytes
  $ slice source range

-- | Return all indices of newlines ('\n', '\r', and '\r\n') in the 'B.ByteString'.
newlineIndices :: B.ByteString -> [Int]
newlineIndices = go 0
  where
    go n bs
      | B.null bs = []
      | otherwise = case (searchCR bs, searchLF bs) of
          (Nothing, Nothing) -> []
          (Just i, Nothing)  -> recur n i bs
          (Nothing, Just i)  -> recur n i bs
          (Just crI, Just lfI)
            -- a '\r\n' pair counts as a single newline, indexed at the '\n'
            | succ crI == lfI -> recur n lfI bs
            | otherwise       -> recur n (min crI lfI) bs
    recur n i bs = let j = n + i in j : go (succ j) (B.drop (succ i) bs)
    searchLF = B.elemIndex (toEnum (ord '\n'))
    searchCR = B.elemIndex (toEnum (ord '\r'))
{-# INLINE newlineIndices #-}
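
-- An illustrative GHCi sketch of the API above; it assumes the qualified
-- import suggested in the module header and @:set -XOverloadedStrings@ for
-- the string literals. The expected results follow from the definitions
-- above: a '\r\n' pair yields a single index (that of the '\n'), and 'lines'
-- keeps each newline with the line that precedes it.
--
-- >>> import qualified Source.Source as Source
-- >>> Source.newlineIndices "one\r\ntwo\nthree"
-- [4,8]
-- >>> map Source.toText (Source.lines (Source.fromText "one\ntwo"))
-- ["one\n","two"]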