{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
{-# OPTIONS_GHC -fno-warn-missing-fields #-}
module GHC.SyntaxHighlighter
( Token (..)
, Loc (..)
, tokenizeHaskell
, tokenizeHaskellLoc )
where
import Control.Monad
import Data.List (foldl', unfoldr)
import Data.Maybe (isJust)
import Data.Text (Text)
import DynFlags
import FastString (mkFastString)
import GHC.LanguageExtensions
import SrcLoc
import StringBuffer
import qualified Data.Text as T
import qualified EnumSet as ES
import qualified Lexer as L
data Token
= KeywordTok
| PragmaTok
| SymbolTok
| VariableTok
| ConstructorTok
| OperatorTok
| CharTok
| StringTok
| IntegerTok
| RationalTok
| CommentTok
| SpaceTok
| OtherTok
deriving (Eq, Ord, Enum, Bounded, Show)
data Loc = Loc !Int !Int !Int !Int
deriving (Eq, Ord, Show)
tokenizeHaskell :: Text -> Maybe [(Token, Text)]
tokenizeHaskell input = sliceInputStream input <$> tokenizeHaskellLoc input
sliceInputStream :: Text -> [(Token, Loc)] -> [(Token, Text)]
sliceInputStream input toks = unfoldr sliceOnce (initText' input, toks)
where
sliceOnce (txt, []) = do
(txt', chunk) <- tryFetchRest txt
return ((SpaceTok, chunk), (txt', []))
sliceOnce (txt, tss@((t, l):ts)) =
case tryFetchSpace txt l of
Nothing ->
let (txt', chunk) = fetchSpan txt l
t' = case t of
CommentTok -> if isHeaderPragma chunk
then PragmaTok
else CommentTok
tok -> tok
in Just ((t', chunk), (txt', ts))
Just (txt', chunk) ->
Just ((SpaceTok, chunk), (txt', tss))
tokenizeHaskellLoc :: Text -> Maybe [(Token, Loc)]
tokenizeHaskellLoc input =
case L.unP pLexer parseState of
L.PFailed {} -> Nothing
L.POk _ x -> Just x
where
location = mkRealSrcLoc (mkFastString "") 1 1
buffer = stringToStringBuffer (T.unpack input)
parseState = L.mkPStatePure parserFlags buffer location
parserFlags = L.mkParserFlags (foldl' xopt_set initialDynFlags enabledExts)
initialDynFlags = DynFlags
{ warningFlags = ES.empty
, generalFlags = ES.fromList
[ Opt_Haddock
, Opt_KeepRawTokenStream
]
, extensions = []
, extensionFlags = ES.empty
, safeHaskell = Sf_Safe
, language = Just Haskell2010
}
pLexer :: L.P [(Token, Loc)]
pLexer = go
where
go = do
r <- L.lexer False return
case r of
L _ L.ITeof -> return []
_ ->
case fixupToken r of
Nothing -> go
Just x -> (x:) <$> go
fixupToken :: Located L.Token -> Maybe (Token, Loc)
fixupToken (L srcSpan tok) = (classifyToken tok,) <$> srcSpanToLoc srcSpan
srcSpanToLoc :: SrcSpan -> Maybe Loc
srcSpanToLoc (RealSrcSpan rss) =
let start = realSrcSpanStart rss
end = realSrcSpanEnd rss
in if start == end
then Nothing
else Just $ Loc (srcLocLine start)
(srcLocCol start)
(srcLocLine end)
(srcLocCol end)
srcSpanToLoc _ = Nothing
classifyToken :: L.Token -> Token
classifyToken = \case
L.ITas -> KeywordTok
L.ITcase -> KeywordTok
L.ITclass -> KeywordTok
L.ITdata -> KeywordTok
L.ITdefault -> KeywordTok
L.ITderiving -> KeywordTok
L.ITdo -> KeywordTok
L.ITelse -> KeywordTok
L.IThiding -> KeywordTok
L.ITforeign -> KeywordTok
L.ITif -> KeywordTok
L.ITimport -> KeywordTok
L.ITin -> KeywordTok
L.ITinfix -> KeywordTok
L.ITinfixl -> KeywordTok
L.ITinfixr -> KeywordTok
L.ITinstance -> KeywordTok
L.ITlet -> KeywordTok
L.ITmodule -> KeywordTok
L.ITnewtype -> KeywordTok
L.ITof -> KeywordTok
L.ITqualified -> KeywordTok
L.ITthen -> KeywordTok
L.ITtype -> KeywordTok
L.ITwhere -> KeywordTok
L.ITforall _ -> KeywordTok
L.ITexport -> KeywordTok
L.ITlabel -> KeywordTok
L.ITdynamic -> KeywordTok
L.ITsafe -> KeywordTok
L.ITinterruptible -> KeywordTok
L.ITunsafe -> KeywordTok
L.ITstdcallconv -> KeywordTok
L.ITccallconv -> KeywordTok
L.ITcapiconv -> KeywordTok
L.ITprimcallconv -> KeywordTok
L.ITjavascriptcallconv -> KeywordTok
L.ITmdo -> KeywordTok
L.ITfamily -> KeywordTok
L.ITrole -> KeywordTok
L.ITgroup -> KeywordTok
L.ITby -> KeywordTok
L.ITusing -> KeywordTok
L.ITpattern -> KeywordTok
L.ITstatic -> KeywordTok
L.ITstock -> KeywordTok
L.ITanyclass -> KeywordTok
L.ITvia -> KeywordTok
L.ITunit -> KeywordTok
L.ITsignature -> KeywordTok
L.ITdependency -> KeywordTok
L.ITrequires -> KeywordTok
L.ITinline_prag {} -> PragmaTok
L.ITspec_prag _ -> PragmaTok
L.ITspec_inline_prag {} -> PragmaTok
L.ITsource_prag _ -> PragmaTok
L.ITrules_prag _ -> PragmaTok
L.ITwarning_prag _ -> PragmaTok
L.ITdeprecated_prag _ -> PragmaTok
L.ITline_prag _ -> PragmaTok
L.ITcolumn_prag _ -> PragmaTok
L.ITscc_prag _ -> PragmaTok
L.ITgenerated_prag _ -> PragmaTok
L.ITcore_prag _ -> PragmaTok
L.ITunpack_prag _ -> PragmaTok
L.ITnounpack_prag _ -> PragmaTok
L.ITann_prag _ -> PragmaTok
L.ITcomplete_prag _ -> PragmaTok
L.ITclose_prag -> PragmaTok
L.IToptions_prag _ -> PragmaTok
L.ITinclude_prag _ -> PragmaTok
L.ITlanguage_prag -> PragmaTok
L.ITminimal_prag _ -> PragmaTok
L.IToverlappable_prag _ -> PragmaTok
L.IToverlapping_prag _ -> PragmaTok
L.IToverlaps_prag _ -> PragmaTok
L.ITincoherent_prag _ -> PragmaTok
L.ITctype _ -> PragmaTok
L.ITcomment_line_prag -> PragmaTok
L.ITdotdot -> SymbolTok
L.ITcolon -> SymbolTok
L.ITdcolon _ -> SymbolTok
L.ITequal -> SymbolTok
L.ITlam -> SymbolTok
L.ITlcase -> SymbolTok
L.ITvbar -> SymbolTok
L.ITlarrow _ -> SymbolTok
L.ITrarrow _ -> SymbolTok
L.ITat -> SymbolTok
L.ITtilde -> SymbolTok
L.ITdarrow _ -> SymbolTok
L.ITbang -> SymbolTok
L.ITstar _ -> SymbolTok
L.ITbiglam -> SymbolTok
L.ITocurly -> SymbolTok
L.ITccurly -> SymbolTok
L.ITvocurly -> SymbolTok
L.ITvccurly -> SymbolTok
L.ITobrack -> SymbolTok
L.ITopabrack -> SymbolTok
L.ITcpabrack -> SymbolTok
L.ITcbrack -> SymbolTok
L.IToparen -> SymbolTok
L.ITcparen -> SymbolTok
L.IToubxparen -> SymbolTok
L.ITcubxparen -> SymbolTok
L.ITsemi -> SymbolTok
L.ITcomma -> SymbolTok
L.ITunderscore -> SymbolTok
L.ITbackquote -> SymbolTok
L.ITsimpleQuote -> SymbolTok
L.ITminus -> OperatorTok
L.ITdot -> OperatorTok
L.ITvarid _ -> VariableTok
L.ITconid _ -> ConstructorTok
L.ITvarsym _ -> OperatorTok
L.ITconsym _ -> OperatorTok
L.ITqvarid _ -> VariableTok
L.ITqconid _ -> ConstructorTok
L.ITqvarsym _ -> OperatorTok
L.ITqconsym _ -> OperatorTok
L.ITdupipvarid _ -> VariableTok
L.ITlabelvarid _ -> VariableTok
L.ITchar _ _ -> CharTok
L.ITstring _ _ -> StringTok
L.ITinteger _ -> IntegerTok
L.ITrational _ -> RationalTok
L.ITprimchar _ _ -> CharTok
L.ITprimstring _ _ -> StringTok
L.ITprimint _ _ -> IntegerTok
L.ITprimword _ _ -> IntegerTok
L.ITprimfloat _ -> RationalTok
L.ITprimdouble _ -> RationalTok
L.ITopenExpQuote _ _ -> SymbolTok
L.ITopenPatQuote -> SymbolTok
L.ITopenDecQuote -> SymbolTok
L.ITopenTypQuote -> SymbolTok
L.ITcloseQuote _ -> SymbolTok
L.ITopenTExpQuote _ -> SymbolTok
L.ITcloseTExpQuote -> SymbolTok
L.ITidEscape _ -> SymbolTok
L.ITparenEscape -> SymbolTok
L.ITidTyEscape _ -> SymbolTok
L.ITparenTyEscape -> SymbolTok
L.ITtyQuote -> SymbolTok
L.ITquasiQuote _ -> SymbolTok
L.ITqQuasiQuote _ -> SymbolTok
L.ITproc -> KeywordTok
L.ITrec -> KeywordTok
L.IToparenbar _ -> SymbolTok
L.ITcparenbar _ -> SymbolTok
L.ITlarrowtail _ -> SymbolTok
L.ITrarrowtail _ -> SymbolTok
L.ITLarrowtail _ -> SymbolTok
L.ITRarrowtail _ -> SymbolTok
L.ITtypeApp -> SymbolTok
L.ITunknown _ -> OtherTok
L.ITeof -> OtherTok
L.ITdocCommentNext _ -> CommentTok
L.ITdocCommentPrev _ -> CommentTok
L.ITdocCommentNamed _ -> CommentTok
L.ITdocSection _ _ -> CommentTok
L.ITdocOptions _ -> CommentTok
L.ITlineComment _ -> CommentTok
L.ITblockComment _ -> CommentTok
data Text' = Text'
{-# UNPACK #-} !Int
{-# UNPACK #-} !Int
{-# UNPACK #-} !Text
deriving (Show)
initText' :: Text -> Text'
initText' = Text' 1 1
tryFetchSpace :: Text' -> Loc -> Maybe (Text', Text)
tryFetchSpace txt (Loc sl sc _ _) =
let (txt', r) = reachLoc txt sl sc
in if T.null r
then Nothing
else Just (txt', r)
tryFetchRest :: Text' -> Maybe (Text', Text)
tryFetchRest (Text' l c txt) =
if T.null txt
then Nothing
else Just (Text' l c "", txt)
fetchSpan :: Text' -> Loc -> (Text', Text)
fetchSpan txt (Loc _ _ el ec) = reachLoc txt el ec
reachLoc
:: Text'
-> Int
-> Int
-> (Text', Text)
reachLoc txt@(Text' _ _ original) l c =
let chunk = T.unfoldr f txt
f (Text' l' c' s) = do
guard (l' < l || c' < c)
(ch, s') <- T.uncons s
let (l'', c'') = case ch of
'\n' -> (l' + 1, 1)
'\t' -> (l', c' + 8 - ((c' - 1) `rem` 8))
_ -> (l', c' + 1)
return (ch, Text' l'' c'' s')
in (Text' l c (T.drop (T.length chunk) original), chunk)
isHeaderPragma :: Text -> Bool
isHeaderPragma txt0 = isJust $ do
txt1 <- T.stripStart <$> T.stripPrefix "{-#" txt0
guard (T.isPrefixOf "LANGUAGE" txt1 || T.isPrefixOf "OPTIONS_GHC" txt1)
enabledExts :: [Extension]
enabledExts =
[ ForeignFunctionInterface
, InterruptibleFFI
, CApiFFI
, Arrows
, TemplateHaskell
, TemplateHaskellQuotes
, ImplicitParams
, OverloadedLabels
, ExplicitForAll
, BangPatterns
, PatternSynonyms
, MagicHash
, RecursiveDo
, UnicodeSyntax
, UnboxedTuples
, UnboxedSums
, DatatypeContexts
, TransformListComp
, QuasiQuotes
, LambdaCase
, BinaryLiterals
, NegativeLiterals
, HexFloatLiterals
, TypeApplications
, StaticPointers
, NumericUnderscores
, StarIsType
]