{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
{-# OPTIONS_GHC -fno-warn-missing-fields #-}

-- |
-- Module      :  GHC.SyntaxHighlighter
-- Copyright   :  © 2018–present Mark Karpov
-- License     :  BSD 3 clause
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- The module allows you to decompose a 'Text' stream containing Haskell
-- source code into a stream of 'Text' chunks tagged with 'Token'.
--
-- This library uses the GHC's lexer, so the result is guaranteed to be 100%
-- correct, as if it were parsed by GHC itself.
module GHC.SyntaxHighlighter
  ( Token (..),
    Loc (..),
    tokenizeHaskell,
    tokenizeHaskellLoc,
  )
where

import Control.Monad
import Data.List (unfoldr)
import Data.Maybe (isJust)
import Data.Text (Text)
import Data.Text qualified as T
import GHC.Data.EnumSet qualified as ES
import GHC.Data.FastString (mkFastString)
import GHC.Data.StringBuffer
import GHC.LanguageExtensions
import GHC.Parser.Lexer qualified as L
import GHC.Types.SrcLoc
import GHC.Unit.Module.Warnings (emptyWarningCategorySet)
import GHC.Utils.Error (DiagOpts (..))
import GHC.Utils.Outputable (defaultSDocContext)

----------------------------------------------------------------------------
-- Data types

-- | Token types that are used as tags to mark spans of source code.
data Token
  = -- | Keyword
    KeywordTok
  | -- | Pragmas
    PragmaTok
  | -- | Symbols (punctuation that is not an operator)
    SymbolTok
  | -- | Variable name (term level)
    VariableTok
  | -- | Data\/type constructor
    ConstructorTok
  | -- | Operator
    OperatorTok
  | -- | Character
    CharTok
  | -- | String
    StringTok
  | -- | Integer
    IntegerTok
  | -- | Rational number
    RationalTok
  | -- | Comment (including Haddocks)
    CommentTok
  | -- | Space filling
    SpaceTok
  | -- | Something else?
    OtherTok
  deriving (Token -> Token -> Bool
(Token -> Token -> Bool) -> (Token -> Token -> Bool) -> Eq Token
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Token -> Token -> Bool
== :: Token -> Token -> Bool
$c/= :: Token -> Token -> Bool
/= :: Token -> Token -> Bool
Eq, Eq Token
Eq Token =>
(Token -> Token -> Ordering)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Bool)
-> (Token -> Token -> Token)
-> (Token -> Token -> Token)
-> Ord Token
Token -> Token -> Bool
Token -> Token -> Ordering
Token -> Token -> Token
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Token -> Token -> Ordering
compare :: Token -> Token -> Ordering
$c< :: Token -> Token -> Bool
< :: Token -> Token -> Bool
$c<= :: Token -> Token -> Bool
<= :: Token -> Token -> Bool
$c> :: Token -> Token -> Bool
> :: Token -> Token -> Bool
$c>= :: Token -> Token -> Bool
>= :: Token -> Token -> Bool
$cmax :: Token -> Token -> Token
max :: Token -> Token -> Token
$cmin :: Token -> Token -> Token
min :: Token -> Token -> Token
Ord, Int -> Token
Token -> Int
Token -> [Token]
Token -> Token
Token -> Token -> [Token]
Token -> Token -> Token -> [Token]
(Token -> Token)
-> (Token -> Token)
-> (Int -> Token)
-> (Token -> Int)
-> (Token -> [Token])
-> (Token -> Token -> [Token])
-> (Token -> Token -> [Token])
-> (Token -> Token -> Token -> [Token])
-> Enum Token
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
$csucc :: Token -> Token
succ :: Token -> Token
$cpred :: Token -> Token
pred :: Token -> Token
$ctoEnum :: Int -> Token
toEnum :: Int -> Token
$cfromEnum :: Token -> Int
fromEnum :: Token -> Int
$cenumFrom :: Token -> [Token]
enumFrom :: Token -> [Token]
$cenumFromThen :: Token -> Token -> [Token]
enumFromThen :: Token -> Token -> [Token]
$cenumFromTo :: Token -> Token -> [Token]
enumFromTo :: Token -> Token -> [Token]
$cenumFromThenTo :: Token -> Token -> Token -> [Token]
enumFromThenTo :: Token -> Token -> Token -> [Token]
Enum, Token
Token -> Token -> Bounded Token
forall a. a -> a -> Bounded a
$cminBound :: Token
minBound :: Token
$cmaxBound :: Token
maxBound :: Token
Bounded, Int -> Token -> ShowS
[Token] -> ShowS
Token -> [Char]
(Int -> Token -> ShowS)
-> (Token -> [Char]) -> ([Token] -> ShowS) -> Show Token
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Token -> ShowS
showsPrec :: Int -> Token -> ShowS
$cshow :: Token -> [Char]
show :: Token -> [Char]
$cshowList :: [Token] -> ShowS
showList :: [Token] -> ShowS
Show)

-- | The start and end positions of a span. The arguments of the data
-- constructor contain in order:
--
--     * Line number of start position of a span
--     * Column number of start position of a span
--     * Line number of end position of a span
--     * Column number of end position of a span
--
-- @since 0.0.2.0
data Loc = Loc !Int !Int !Int !Int
  deriving (Loc -> Loc -> Bool
(Loc -> Loc -> Bool) -> (Loc -> Loc -> Bool) -> Eq Loc
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Loc -> Loc -> Bool
== :: Loc -> Loc -> Bool
$c/= :: Loc -> Loc -> Bool
/= :: Loc -> Loc -> Bool
Eq, Eq Loc
Eq Loc =>
(Loc -> Loc -> Ordering)
-> (Loc -> Loc -> Bool)
-> (Loc -> Loc -> Bool)
-> (Loc -> Loc -> Bool)
-> (Loc -> Loc -> Bool)
-> (Loc -> Loc -> Loc)
-> (Loc -> Loc -> Loc)
-> Ord Loc
Loc -> Loc -> Bool
Loc -> Loc -> Ordering
Loc -> Loc -> Loc
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Loc -> Loc -> Ordering
compare :: Loc -> Loc -> Ordering
$c< :: Loc -> Loc -> Bool
< :: Loc -> Loc -> Bool
$c<= :: Loc -> Loc -> Bool
<= :: Loc -> Loc -> Bool
$c> :: Loc -> Loc -> Bool
> :: Loc -> Loc -> Bool
$c>= :: Loc -> Loc -> Bool
>= :: Loc -> Loc -> Bool
$cmax :: Loc -> Loc -> Loc
max :: Loc -> Loc -> Loc
$cmin :: Loc -> Loc -> Loc
min :: Loc -> Loc -> Loc
Ord, Int -> Loc -> ShowS
[Loc] -> ShowS
Loc -> [Char]
(Int -> Loc -> ShowS)
-> (Loc -> [Char]) -> ([Loc] -> ShowS) -> Show Loc
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Loc -> ShowS
showsPrec :: Int -> Loc -> ShowS
$cshow :: Loc -> [Char]
show :: Loc -> [Char]
$cshowList :: [Loc] -> ShowS
showList :: [Loc] -> ShowS
Show)

----------------------------------------------------------------------------
-- High-level API

-- | Tokenize Haskell source code. If the code cannot be parsed, return
-- 'Nothing'. Otherwise return the original input tagged by 'Token's.
-- 'Nothing' is rarely returned, if ever, because it looks like the lexer is
-- capable of interpreting almost any text as a stream of GHC tokens.
--
-- The parser does not require the input source code to form a valid Haskell
-- program, so as long as the lexer can decompose your input (most of the
-- time), it'll return something in 'Just'.
tokenizeHaskell :: Text -> Maybe [(Token, Text)]
tokenizeHaskell :: Text -> Maybe [(Token, Text)]
tokenizeHaskell Text
input = Text -> [(Token, Loc)] -> [(Token, Text)]
sliceInputStream Text
input ([(Token, Loc)] -> [(Token, Text)])
-> Maybe [(Token, Loc)] -> Maybe [(Token, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> Maybe [(Token, Loc)]
tokenizeHaskellLoc Text
input

-- | Replace 'Loc' locations with actual chunks of input 'Text'.
sliceInputStream :: Text -> [(Token, Loc)] -> [(Token, Text)]
sliceInputStream :: Text -> [(Token, Loc)] -> [(Token, Text)]
sliceInputStream Text
input [(Token, Loc)]
toks = ((Text', [(Token, Loc)])
 -> Maybe ((Token, Text), (Text', [(Token, Loc)])))
-> (Text', [(Token, Loc)]) -> [(Token, Text)]
forall b a. (b -> Maybe (a, b)) -> b -> [a]
unfoldr (Text', [(Token, Loc)])
-> Maybe ((Token, Text), (Text', [(Token, Loc)]))
sliceOnce (Text -> Text'
initText' Text
input, [(Token, Loc)]
toks)
  where
    sliceOnce :: (Text', [(Token, Loc)])
-> Maybe ((Token, Text), (Text', [(Token, Loc)]))
sliceOnce (Text'
txt, []) = do
      (Text'
txt', Text
chunk) <- Text' -> Maybe (Text', Text)
tryFetchRest Text'
txt
      ((Token, Text), (Text', [(Token, Loc)]))
-> Maybe ((Token, Text), (Text', [(Token, Loc)]))
forall a. a -> Maybe a
forall (m :: * -> *) a. Monad m => a -> m a
return ((Token
SpaceTok, Text
chunk), (Text'
txt', []))
    sliceOnce (Text'
txt, tss :: [(Token, Loc)]
tss@((Token
t, Loc
l) : [(Token, Loc)]
ts)) =
      case Text' -> Loc -> Maybe (Text', Text)
tryFetchSpace Text'
txt Loc
l of
        Maybe (Text', Text)
Nothing ->
          let (Text'
txt', Text
chunk) = Text' -> Loc -> (Text', Text)
fetchSpan Text'
txt Loc
l
              t' :: Token
t' = case Token
t of
                Token
CommentTok ->
                  if Text -> Bool
isHeaderPragma Text
chunk
                    then Token
PragmaTok
                    else Token
CommentTok
                Token
tok -> Token
tok
           in ((Token, Text), (Text', [(Token, Loc)]))
-> Maybe ((Token, Text), (Text', [(Token, Loc)]))
forall a. a -> Maybe a
Just ((Token
t', Text
chunk), (Text'
txt', [(Token, Loc)]
ts))
        Just (Text'
txt', Text
chunk) ->
          ((Token, Text), (Text', [(Token, Loc)]))
-> Maybe ((Token, Text), (Text', [(Token, Loc)]))
forall a. a -> Maybe a
Just ((Token
SpaceTok, Text
chunk), (Text'
txt', [(Token, Loc)]
tss))

-- | Similar to 'tokenizeHaskell', but instead of 'Text' chunks provides
-- locations of corresponding spans in the given input stream.
--
-- @since 0.0.2.0
tokenizeHaskellLoc :: Text -> Maybe [(Token, Loc)]
tokenizeHaskellLoc :: Text -> Maybe [(Token, Loc)]
tokenizeHaskellLoc Text
input =
  case P [(Token, Loc)] -> PState -> ParseResult [(Token, Loc)]
forall a. P a -> PState -> ParseResult a
L.unP P [(Token, Loc)]
pLexer PState
parseState of
    L.PFailed {} -> Maybe [(Token, Loc)]
forall a. Maybe a
Nothing
    L.POk PState
_ [(Token, Loc)]
x -> [(Token, Loc)] -> Maybe [(Token, Loc)]
forall a. a -> Maybe a
Just [(Token, Loc)]
x
  where
    location :: RealSrcLoc
location = FastString -> Int -> Int -> RealSrcLoc
mkRealSrcLoc ([Char] -> FastString
mkFastString [Char]
"") Int
1 Int
1
    buffer :: StringBuffer
buffer = [Char] -> StringBuffer
stringToStringBuffer (Text -> [Char]
T.unpack Text
input)
    parseState :: PState
parseState = ParserOpts -> StringBuffer -> RealSrcLoc -> PState
L.initParserState ParserOpts
parserOpts StringBuffer
buffer RealSrcLoc
location
    parserOpts :: ParserOpts
parserOpts =
      EnumSet Extension
-> DiagOpts
-> [[Char]]
-> Bool
-> Bool
-> Bool
-> Bool
-> ParserOpts
L.mkParserOpts
        ([Extension] -> EnumSet Extension
forall a. Enum a => [a] -> EnumSet a
ES.fromList [Extension]
enabledExts)
        DiagOpts
diagOpts
        []
        Bool
True -- safe imports
        Bool
True -- keep Haddock tokens
        Bool
True -- keep comment tokens
        Bool
False -- lex LINE and COLUMN pragmas
    diagOpts :: DiagOpts
diagOpts =
      DiagOpts
        { diag_warning_flags :: EnumSet WarningFlag
diag_warning_flags = EnumSet WarningFlag
forall {k} (a :: k). EnumSet a
ES.empty,
          diag_fatal_warning_flags :: EnumSet WarningFlag
diag_fatal_warning_flags = EnumSet WarningFlag
forall {k} (a :: k). EnumSet a
ES.empty,
          diag_custom_warning_categories :: WarningCategorySet
diag_custom_warning_categories = WarningCategorySet
emptyWarningCategorySet,
          diag_fatal_custom_warning_categories :: WarningCategorySet
diag_fatal_custom_warning_categories = WarningCategorySet
emptyWarningCategorySet,
          diag_warn_is_error :: Bool
diag_warn_is_error = Bool
False,
          diag_reverse_errors :: Bool
diag_reverse_errors = Bool
False,
          diag_max_errors :: Maybe Int
diag_max_errors = Maybe Int
forall a. Maybe a
Nothing,
          diag_ppr_ctx :: SDocContext
diag_ppr_ctx = SDocContext
defaultSDocContext
        }

-- | The Haskell lexer.
pLexer :: L.P [(Token, Loc)]
pLexer :: P [(Token, Loc)]
pLexer = P [(Token, Loc)]
go
  where
    go :: P [(Token, Loc)]
go = do
      Located Token
r <- Bool -> (Located Token -> P (Located Token)) -> P (Located Token)
forall a. Bool -> (Located Token -> P a) -> P a
L.lexer Bool
False Located Token -> P (Located Token)
forall a. a -> P a
forall (m :: * -> *) a. Monad m => a -> m a
return
      case Located Token
r of
        L SrcSpan
_ Token
L.ITeof -> [(Token, Loc)] -> P [(Token, Loc)]
forall a. a -> P a
forall (m :: * -> *) a. Monad m => a -> m a
return []
        Located Token
_ ->
          case Located Token -> Maybe (Token, Loc)
fixupToken Located Token
r of
            Maybe (Token, Loc)
Nothing -> P [(Token, Loc)]
go
            Just (Token, Loc)
x -> ((Token, Loc)
x :) ([(Token, Loc)] -> [(Token, Loc)])
-> P [(Token, Loc)] -> P [(Token, Loc)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> P [(Token, Loc)]
go

-- | Convert @'Located' 'L.Token'@ representation into a more convenient for
-- us form.
fixupToken :: Located L.Token -> Maybe (Token, Loc)
fixupToken :: Located Token -> Maybe (Token, Loc)
fixupToken (L SrcSpan
srcSpan Token
tok) = (Token -> Token
classifyToken Token
tok,) (Loc -> (Token, Loc)) -> Maybe Loc -> Maybe (Token, Loc)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> SrcSpan -> Maybe Loc
srcSpanToLoc SrcSpan
srcSpan

-- | Convert 'SrcSpan' into 'Loc'.
srcSpanToLoc :: SrcSpan -> Maybe Loc
srcSpanToLoc :: SrcSpan -> Maybe Loc
srcSpanToLoc (RealSrcSpan RealSrcSpan
s Maybe BufSpan
_) =
  let srcSpanSLine :: Int
srcSpanSLine = RealSrcSpan -> Int
srcSpanStartLine RealSrcSpan
s
      srcSpanSCol :: Int
srcSpanSCol = RealSrcSpan -> Int
srcSpanStartCol RealSrcSpan
s
      srcSpanELine :: Int
srcSpanELine = RealSrcSpan -> Int
srcSpanEndLine RealSrcSpan
s
      srcSpanECol :: Int
srcSpanECol = RealSrcSpan -> Int
srcSpanEndCol RealSrcSpan
s
      start :: (Int, Int)
start = (Int
srcSpanSLine, Int
srcSpanSCol)
      end :: (Int, Int)
end = (Int
srcSpanELine, Int
srcSpanECol)
   in if (Int, Int)
start (Int, Int) -> (Int, Int) -> Bool
forall a. Eq a => a -> a -> Bool
== (Int, Int)
end
        then Maybe Loc
forall a. Maybe a
Nothing -- NOTE Some magic auto-generated tokens that do not
        -- actually appear in the input stream. Drop them.
        else
          Loc -> Maybe Loc
forall a. a -> Maybe a
Just (Loc -> Maybe Loc) -> Loc -> Maybe Loc
forall a b. (a -> b) -> a -> b
$
            Int -> Int -> Int -> Int -> Loc
Loc
              Int
srcSpanSLine
              Int
srcSpanSCol
              Int
srcSpanELine
              Int
srcSpanECol
srcSpanToLoc SrcSpan
_ = Maybe Loc
forall a. Maybe a
Nothing

-- | Classify a 'L.Token' in terms of 'Token'.
classifyToken :: L.Token -> Token
classifyToken :: Token -> Token
classifyToken = \case
  -- Keywords
  Token
L.ITas -> Token
KeywordTok
  Token
L.ITcase -> Token
KeywordTok
  Token
L.ITlcases -> Token
KeywordTok
  Token
L.ITclass -> Token
KeywordTok
  Token
L.ITdata -> Token
KeywordTok
  Token
L.ITdefault -> Token
KeywordTok
  Token
L.ITderiving -> Token
KeywordTok
  L.ITdo Maybe FastString
_ -> Token
KeywordTok
  Token
L.ITelse -> Token
KeywordTok
  Token
L.IThiding -> Token
KeywordTok
  Token
L.ITforeign -> Token
KeywordTok
  Token
L.ITif -> Token
KeywordTok
  Token
L.ITimport -> Token
KeywordTok
  Token
L.ITin -> Token
KeywordTok
  Token
L.ITinfix -> Token
KeywordTok
  Token
L.ITinfixl -> Token
KeywordTok
  Token
L.ITinfixr -> Token
KeywordTok
  Token
L.ITinstance -> Token
KeywordTok
  Token
L.ITlet -> Token
KeywordTok
  Token
L.ITmodule -> Token
KeywordTok
  Token
L.ITnewtype -> Token
KeywordTok
  Token
L.ITof -> Token
KeywordTok
  Token
L.ITqualified -> Token
KeywordTok
  Token
L.ITthen -> Token
KeywordTok
  Token
L.ITtype -> Token
KeywordTok
  Token
L.ITwhere -> Token
KeywordTok
  L.ITforall IsUnicodeSyntax
_ -> Token
KeywordTok
  Token
L.ITexport -> Token
KeywordTok
  Token
L.ITlabel -> Token
KeywordTok
  Token
L.ITdynamic -> Token
KeywordTok
  Token
L.ITsafe -> Token
KeywordTok
  Token
L.ITinterruptible -> Token
KeywordTok
  Token
L.ITunsafe -> Token
KeywordTok
  Token
L.ITstdcallconv -> Token
KeywordTok
  Token
L.ITccallconv -> Token
KeywordTok
  Token
L.ITcapiconv -> Token
KeywordTok
  Token
L.ITprimcallconv -> Token
KeywordTok
  Token
L.ITjavascriptcallconv -> Token
KeywordTok
  L.ITmdo Maybe FastString
_ -> Token
KeywordTok
  Token
L.ITfamily -> Token
KeywordTok
  Token
L.ITrole -> Token
KeywordTok
  Token
L.ITgroup -> Token
KeywordTok
  Token
L.ITby -> Token
KeywordTok
  Token
L.ITusing -> Token
KeywordTok
  Token
L.ITpattern -> Token
KeywordTok
  Token
L.ITstatic -> Token
KeywordTok
  Token
L.ITstock -> Token
KeywordTok
  Token
L.ITanyclass -> Token
KeywordTok
  Token
L.ITvia -> Token
KeywordTok
  Token
L.ITunit -> Token
KeywordTok
  Token
L.ITsignature -> Token
KeywordTok
  Token
L.ITdependency -> Token
KeywordTok
  Token
L.ITrequires -> Token
KeywordTok
  -- Pragmas
  L.ITinline_prag {} -> Token
PragmaTok
  L.ITspec_prag SourceText
_ -> Token
PragmaTok
  L.ITspec_inline_prag {} -> Token
PragmaTok
  L.ITsource_prag SourceText
_ -> Token
PragmaTok
  L.ITrules_prag SourceText
_ -> Token
PragmaTok
  L.ITwarning_prag SourceText
_ -> Token
PragmaTok
  L.ITdeprecated_prag SourceText
_ -> Token
PragmaTok
  L.ITline_prag SourceText
_ -> Token
PragmaTok
  L.ITcolumn_prag SourceText
_ -> Token
PragmaTok
  L.ITscc_prag SourceText
_ -> Token
PragmaTok
  L.ITunpack_prag SourceText
_ -> Token
PragmaTok
  L.ITnounpack_prag SourceText
_ -> Token
PragmaTok
  L.ITann_prag SourceText
_ -> Token
PragmaTok
  L.ITcomplete_prag SourceText
_ -> Token
PragmaTok
  Token
L.ITclose_prag -> Token
PragmaTok
  L.IToptions_prag [Char]
_ -> Token
PragmaTok
  L.ITinclude_prag [Char]
_ -> Token
PragmaTok
  Token
L.ITlanguage_prag -> Token
PragmaTok
  L.ITminimal_prag SourceText
_ -> Token
PragmaTok
  L.IToverlappable_prag SourceText
_ -> Token
PragmaTok
  L.IToverlapping_prag SourceText
_ -> Token
PragmaTok
  L.IToverlaps_prag SourceText
_ -> Token
PragmaTok
  L.ITincoherent_prag SourceText
_ -> Token
PragmaTok
  L.ITctype SourceText
_ -> Token
PragmaTok
  Token
L.ITcomment_line_prag -> Token
PragmaTok
  L.ITopaque_prag SourceText
_ -> Token
PragmaTok
  -- Reserved symbols
  Token
L.ITdotdot -> Token
SymbolTok
  Token
L.ITcolon -> Token
SymbolTok
  L.ITdcolon IsUnicodeSyntax
_ -> Token
SymbolTok
  Token
L.ITequal -> Token
SymbolTok
  Token
L.ITlam -> Token
SymbolTok
  Token
L.ITlcase -> Token
SymbolTok
  Token
L.ITvbar -> Token
SymbolTok
  L.ITlarrow IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITrarrow IsUnicodeSyntax
_ -> Token
SymbolTok
  Token
L.ITlolly -> Token
SymbolTok
  Token
L.ITat -> Token
SymbolTok
  Token
L.ITtilde -> Token
SymbolTok
  L.ITdarrow IsUnicodeSyntax
_ -> Token
SymbolTok
  Token
L.ITbang -> Token
SymbolTok
  L.ITstar IsUnicodeSyntax
_ -> Token
SymbolTok
  Token
L.ITbiglam -> Token
SymbolTok
  Token
L.ITocurly -> Token
SymbolTok
  Token
L.ITccurly -> Token
SymbolTok
  Token
L.ITvocurly -> Token
SymbolTok
  Token
L.ITvccurly -> Token
SymbolTok
  Token
L.ITobrack -> Token
SymbolTok
  Token
L.ITopabrack -> Token
SymbolTok
  Token
L.ITcpabrack -> Token
SymbolTok
  Token
L.ITcbrack -> Token
SymbolTok
  Token
L.IToparen -> Token
SymbolTok
  Token
L.ITcparen -> Token
SymbolTok
  Token
L.IToubxparen -> Token
SymbolTok
  Token
L.ITcubxparen -> Token
SymbolTok
  Token
L.ITsemi -> Token
SymbolTok
  Token
L.ITcomma -> Token
SymbolTok
  Token
L.ITunderscore -> Token
SymbolTok
  Token
L.ITbackquote -> Token
SymbolTok
  Token
L.ITsimpleQuote -> Token
SymbolTok
  Token
L.ITpercent -> Token
SymbolTok
  L.ITproj Bool
_ -> Token
SymbolTok
  -- NOTE GHC thinks these are reserved symbols, but I classify them as
  -- operators.
  Token
L.ITminus -> Token
OperatorTok
  Token
L.ITprefixminus -> Token
OperatorTok
  Token
L.ITdot -> Token
OperatorTok
  -- Identifiers
  L.ITvarid FastString
_ -> Token
VariableTok
  L.ITconid FastString
_ -> Token
ConstructorTok
  L.ITvarsym FastString
_ -> Token
OperatorTok
  L.ITconsym FastString
_ -> Token
OperatorTok
  L.ITqvarid (FastString, FastString)
_ -> Token
VariableTok
  L.ITqconid (FastString, FastString)
_ -> Token
ConstructorTok
  L.ITqvarsym (FastString, FastString)
_ -> Token
OperatorTok
  L.ITqconsym (FastString, FastString)
_ -> Token
OperatorTok
  L.ITdupipvarid FastString
_ -> Token
VariableTok
  L.ITlabelvarid {} -> Token
VariableTok
  -- Basic types
  L.ITchar SourceText
_ Char
_ -> Token
CharTok
  L.ITstring SourceText
_ FastString
_ -> Token
StringTok
  L.ITinteger IntegralLit
_ -> Token
IntegerTok
  L.ITrational FractionalLit
_ -> Token
RationalTok
  L.ITprimchar SourceText
_ Char
_ -> Token
CharTok
  L.ITprimstring SourceText
_ ByteString
_ -> Token
StringTok
  L.ITprimint SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimint8 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimint16 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimint32 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimint64 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimword SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimword8 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimword16 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimword32 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimword64 SourceText
_ Integer
_ -> Token
IntegerTok
  L.ITprimfloat FractionalLit
_ -> Token
RationalTok
  L.ITprimdouble FractionalLit
_ -> Token
RationalTok
  -- Template Haskell extension tokens
  L.ITopenExpQuote HasE
_ IsUnicodeSyntax
_ -> Token
SymbolTok
  Token
L.ITopenPatQuote -> Token
SymbolTok
  Token
L.ITopenDecQuote -> Token
SymbolTok
  Token
L.ITopenTypQuote -> Token
SymbolTok
  L.ITcloseQuote IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITopenTExpQuote HasE
_ -> Token
SymbolTok
  Token
L.ITcloseTExpQuote -> Token
SymbolTok
  Token
L.ITtyQuote -> Token
SymbolTok
  L.ITquasiQuote (FastString, FastString, PsSpan)
_ -> Token
SymbolTok
  L.ITqQuasiQuote (FastString, FastString, FastString, PsSpan)
_ -> Token
SymbolTok
  Token
L.ITdollar -> Token
SymbolTok
  Token
L.ITdollardollar -> Token
SymbolTok
  -- Arrow notation
  Token
L.ITproc -> Token
KeywordTok
  Token
L.ITrec -> Token
KeywordTok
  L.IToparenbar IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITcparenbar IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITlarrowtail IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITrarrowtail IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITLarrowtail IsUnicodeSyntax
_ -> Token
SymbolTok
  L.ITRarrowtail IsUnicodeSyntax
_ -> Token
SymbolTok
  -- Type application
  Token
L.ITtypeApp -> Token
SymbolTok
  -- Special
  L.ITunknown [Char]
_ -> Token
OtherTok
  Token
L.ITeof -> Token
OtherTok -- normally is not included in results
  -- Documentation annotations
  L.ITdocComment {} -> Token
CommentTok
  L.ITdocOptions {} -> Token
CommentTok
  L.ITlineComment {} -> Token
CommentTok
  L.ITblockComment {} -> Token
CommentTok

----------------------------------------------------------------------------
-- Text traversing

-- | A type for 'Text' with line\/column location attached.
data Text'
  = Text'
      {-# UNPACK #-} !Int
      {-# UNPACK #-} !Int
      {-# UNPACK #-} !Text
  deriving (Int -> Text' -> ShowS
[Text'] -> ShowS
Text' -> [Char]
(Int -> Text' -> ShowS)
-> (Text' -> [Char]) -> ([Text'] -> ShowS) -> Show Text'
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Text' -> ShowS
showsPrec :: Int -> Text' -> ShowS
$cshow :: Text' -> [Char]
show :: Text' -> [Char]
$cshowList :: [Text'] -> ShowS
showList :: [Text'] -> ShowS
Show)

-- | Create 'Text'' from 'Text'.
initText' :: Text -> Text'
initText' :: Text -> Text'
initText' = Int -> Int -> Text -> Text'
Text' Int
1 Int
1

-- | Try to fetch white space before start of span at 'Loc'.
tryFetchSpace :: Text' -> Loc -> Maybe (Text', Text)
tryFetchSpace :: Text' -> Loc -> Maybe (Text', Text)
tryFetchSpace Text'
txt (Loc Int
sl Int
sc Int
_ Int
_) =
  let (Text'
txt', Text
r) = Text' -> Int -> Int -> (Text', Text)
reachLoc Text'
txt Int
sl Int
sc
   in if Text -> Bool
T.null Text
r
        then Maybe (Text', Text)
forall a. Maybe a
Nothing
        else (Text', Text) -> Maybe (Text', Text)
forall a. a -> Maybe a
Just (Text'
txt', Text
r)

-- | Try to fetch the rest of 'Text'' stream.
tryFetchRest :: Text' -> Maybe (Text', Text)
tryFetchRest :: Text' -> Maybe (Text', Text)
tryFetchRest (Text' Int
l Int
c Text
txt) =
  if Text -> Bool
T.null Text
txt
    then Maybe (Text', Text)
forall a. Maybe a
Nothing
    else (Text', Text) -> Maybe (Text', Text)
forall a. a -> Maybe a
Just (Int -> Int -> Text -> Text'
Text' Int
l Int
c Text
"", Text
txt)

-- | Fetch a span at 'Loc'.
fetchSpan :: Text' -> Loc -> (Text', Text)
fetchSpan :: Text' -> Loc -> (Text', Text)
fetchSpan Text'
txt (Loc Int
_ Int
_ Int
el Int
ec) = Text' -> Int -> Int -> (Text', Text)
reachLoc Text'
txt Int
el Int
ec

-- | Reach given line\/column location and return 'Text' that has been
-- traversed.
reachLoc ::
  Text' ->
  -- | Line number to reach
  Int ->
  -- | Column number to reach
  Int ->
  (Text', Text)
reachLoc :: Text' -> Int -> Int -> (Text', Text)
reachLoc txt :: Text'
txt@(Text' Int
_ Int
_ Text
original) Int
l Int
c =
  let chunk :: Text
chunk = (Text' -> Maybe (Char, Text')) -> Text' -> Text
forall a. (a -> Maybe (Char, a)) -> a -> Text
T.unfoldr Text' -> Maybe (Char, Text')
f Text'
txt
      f :: Text' -> Maybe (Char, Text')
f (Text' Int
l' Int
c' Text
s) = do
        Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Int
l' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
l Bool -> Bool -> Bool
|| Int
c' Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
c)
        (Char
ch, Text
s') <- Text -> Maybe (Char, Text)
T.uncons Text
s
        let (Int
l'', Int
c'') = case Char
ch of
              Char
'\n' -> (Int
l' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1, Int
1)
              Char
'\t' -> (Int
l', Int
c' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
- ((Int
c' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`rem` Int
8))
              Char
_ -> (Int
l', Int
c' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
        (Char, Text') -> Maybe (Char, Text')
forall a. a -> Maybe a
forall (m :: * -> *) a. Monad m => a -> m a
return (Char
ch, Int -> Int -> Text -> Text'
Text' Int
l'' Int
c'' Text
s')
   in (Int -> Int -> Text -> Text'
Text' Int
l Int
c (Int -> Text -> Text
T.drop (Text -> Int
T.length Text
chunk) Text
original), Text
chunk)

----------------------------------------------------------------------------
-- Pragmas detection

-- | Detect file header pragma.
isHeaderPragma :: Text -> Bool
isHeaderPragma :: Text -> Bool
isHeaderPragma Text
txt0 = Maybe () -> Bool
forall a. Maybe a -> Bool
isJust (Maybe () -> Bool) -> Maybe () -> Bool
forall a b. (a -> b) -> a -> b
$ do
  Text
txt1 <- Text -> Text
T.stripStart (Text -> Text) -> Maybe Text -> Maybe Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> Text -> Maybe Text
T.stripPrefix Text
"{-#" Text
txt0
  Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Text -> Text -> Bool
T.isPrefixOf Text
"LANGUAGE" Text
txt1 Bool -> Bool -> Bool
|| Text -> Text -> Bool
T.isPrefixOf Text
"OPTIONS_GHC" Text
txt1)

----------------------------------------------------------------------------
-- Language extensions

-- | Language extensions we enable by default.
enabledExts :: [Extension]
enabledExts :: [Extension]
enabledExts =
  [ Extension
ForeignFunctionInterface,
    Extension
InterruptibleFFI,
    Extension
CApiFFI,
    Extension
Arrows,
    Extension
TemplateHaskell,
    Extension
TemplateHaskellQuotes,
    Extension
ImplicitParams,
    Extension
OverloadedLabels,
    Extension
ExplicitForAll,
    Extension
BangPatterns,
    Extension
PatternSynonyms,
    Extension
MagicHash,
    Extension
RecursiveDo,
    Extension
UnicodeSyntax,
    Extension
UnboxedTuples,
    Extension
UnboxedSums,
    Extension
DatatypeContexts,
    Extension
TransformListComp,
    Extension
QuasiQuotes,
    Extension
LambdaCase,
    Extension
BinaryLiterals,
    Extension
NegativeLiterals,
    Extension
HexFloatLiterals,
    Extension
TypeApplications,
    Extension
StaticPointers,
    Extension
NumericUnderscores,
    Extension
StarIsType
  ]