-- | -- Module : GHC.SyntaxHighlighter -- Copyright : © 2018–2019 Mark Karpov -- License : BSD 3 clause -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- The module allows you to decompose a 'Text' stream containing Haskell -- source code into a stream of 'Text' chunks tagged with 'Token'. -- -- This library uses GHC's lexer, so the result is guaranteed to be 100% -- correct, as if it was parsed by GHC itself. {-# LANGUAGE CPP #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# OPTIONS_GHC -fno-warn-unused-top-binds #-} module GHC.SyntaxHighlighter ( Token (..) , Loc (..) , tokenizeHaskell , tokenizeHaskellLoc ) where import Control.Monad import Data.Bits import Data.List (unfoldr, foldl') import Data.Text (Text) import Data.Word (Word64) import FastString (mkFastString) import Module (newSimpleUnitId, ComponentId (..)) import SrcLoc import StringBuffer import qualified Data.Text as T import qualified EnumSet as ES import qualified Lexer as L ---------------------------------------------------------------------------- -- Data types -- | Token types that are used as tags to mark spans of source code. data Token = KeywordTok -- ^ Keyword | PragmaTok -- ^ Pragmas | SymbolTok -- ^ Symbols (punctuation that is not an operator) | VariableTok -- ^ Variable name (term level) | ConstructorTok -- ^ Data\/type constructor | OperatorTok -- ^ Operator | CharTok -- ^ Character | StringTok -- ^ String | IntegerTok -- ^ Integer | RationalTok -- ^ Rational number | CommentTok -- ^ Comment (including Haddocks) | SpaceTok -- ^ Space filling | OtherTok -- ^ Something else? deriving (Eq, Ord, Enum, Bounded, Show) -- | Start and end positions of a span. The arguments of the data -- constructor contain in order: -- -- * Line number of start position of a span -- * Column number of start position of a span -- * Line number of end position of a span -- * Column number of end position of a span -- -- @since 0.0.2.0 data Loc = Loc !Int !Int !Int !Int deriving (Eq, Ord, Show) ---------------------------------------------------------------------------- -- High-level API -- | Tokenize Haskell source code. If the code cannot be parsed, return -- 'Nothing'. Otherwise return the original input tagged by 'Token's. -- -- The parser does not require the input source code to form a valid Haskell -- program, so as long as the lexer can decompose your input (most of the -- time), it'll return something in 'Just'. tokenizeHaskell :: Text -> Maybe [(Token, Text)] tokenizeHaskell input = sliceInputStream input <$> tokenizeHaskellLoc input -- | Replace 'Loc' locations with actual chunks of input 'Text'. sliceInputStream :: Text -> [(Token, Loc)] -> [(Token, Text)] sliceInputStream input toks = unfoldr sliceOnce (initText' input, toks) where sliceOnce (txt, []) = do (txt', chunk) <- tryFetchRest txt return ((SpaceTok, chunk), (txt', [])) sliceOnce (txt, tss@((t, l):ts)) = case tryFetchSpace txt l of Nothing -> let (txt', chunk) = fetchSpan txt l in Just ((t, chunk), (txt', ts)) Just (txt', chunk) -> Just ((SpaceTok, chunk), (txt', tss)) -- | Similar to 'tokenizeHaskell', but instead of 'Text' chunks provides -- locations of corresponding spans in the given input stream. -- -- @since 0.0.2.0 tokenizeHaskellLoc :: Text -> Maybe [(Token, Loc)] tokenizeHaskellLoc input = case L.unP pLexer parseState of L.PFailed {} -> Nothing L.POk _ x -> Just x where location = mkRealSrcLoc (mkFastString "") 1 1 buffer = stringToStringBuffer (T.unpack input) parseState = L.mkPStatePure parserFlags buffer location parserFlags = L.ParserFlags { L.pWarningFlags = ES.empty , L.pExtensionFlags = ES.empty , L.pThisPackage = newSimpleUnitId (ComponentId (mkFastString "")) , L.pExtsBitmap = extsBitmap } -- | Haskell lexer. pLexer :: L.P [(Token, Loc)] pLexer = go where go = do r <- L.lexer False return case r of L _ L.ITeof -> return [] _ -> case fixupToken r of Nothing -> go Just x -> (x:) <$> go -- | Convert @'Located' 'L.Token'@ representation to a more convenient for -- us form. fixupToken :: Located L.Token -> Maybe (Token, Loc) fixupToken (L srcSpan tok) = (classifyToken tok,) <$> srcSpanToLoc srcSpan -- | Convert 'SrcSpan' to 'Loc'. srcSpanToLoc :: SrcSpan -> Maybe Loc srcSpanToLoc (RealSrcSpan rss) = let start = realSrcSpanStart rss end = realSrcSpanEnd rss in if start == end then Nothing -- NOTE Some magic auto-generated tokens that do not -- actually appear in the input stream. Drop them. else Just $ Loc (srcLocLine start) (srcLocCol start) (srcLocLine end) (srcLocCol end) srcSpanToLoc _ = Nothing -- | Classify a 'L.Token' in terms of 'Token'. classifyToken :: L.Token -> Token classifyToken = \case -- Keywords L.ITas -> KeywordTok L.ITcase -> KeywordTok L.ITclass -> KeywordTok L.ITdata -> KeywordTok L.ITdefault -> KeywordTok L.ITderiving -> KeywordTok L.ITdo -> KeywordTok L.ITelse -> KeywordTok L.IThiding -> KeywordTok L.ITforeign -> KeywordTok L.ITif -> KeywordTok L.ITimport -> KeywordTok L.ITin -> KeywordTok L.ITinfix -> KeywordTok L.ITinfixl -> KeywordTok L.ITinfixr -> KeywordTok L.ITinstance -> KeywordTok L.ITlet -> KeywordTok L.ITmodule -> KeywordTok L.ITnewtype -> KeywordTok L.ITof -> KeywordTok L.ITqualified -> KeywordTok L.ITthen -> KeywordTok L.ITtype -> KeywordTok L.ITwhere -> KeywordTok L.ITforall _ -> KeywordTok L.ITexport -> KeywordTok L.ITlabel -> KeywordTok L.ITdynamic -> KeywordTok L.ITsafe -> KeywordTok L.ITinterruptible -> KeywordTok L.ITunsafe -> KeywordTok L.ITstdcallconv -> KeywordTok L.ITccallconv -> KeywordTok L.ITcapiconv -> KeywordTok L.ITprimcallconv -> KeywordTok L.ITjavascriptcallconv -> KeywordTok L.ITmdo -> KeywordTok L.ITfamily -> KeywordTok L.ITrole -> KeywordTok L.ITgroup -> KeywordTok L.ITby -> KeywordTok L.ITusing -> KeywordTok L.ITpattern -> KeywordTok L.ITstatic -> KeywordTok L.ITstock -> KeywordTok L.ITanyclass -> KeywordTok #if MIN_VERSION_ghc(8,6,1) L.ITvia -> KeywordTok #endif L.ITunit -> KeywordTok L.ITsignature -> KeywordTok L.ITdependency -> KeywordTok L.ITrequires -> KeywordTok -- Pragmas L.ITinline_prag {} -> PragmaTok L.ITspec_prag _ -> PragmaTok L.ITspec_inline_prag {} -> PragmaTok L.ITsource_prag _ -> PragmaTok L.ITrules_prag _ -> PragmaTok L.ITwarning_prag _ -> PragmaTok L.ITdeprecated_prag _ -> PragmaTok L.ITline_prag _ -> PragmaTok L.ITcolumn_prag _ -> PragmaTok L.ITscc_prag _ -> PragmaTok L.ITgenerated_prag _ -> PragmaTok L.ITcore_prag _ -> PragmaTok L.ITunpack_prag _ -> PragmaTok L.ITnounpack_prag _ -> PragmaTok L.ITann_prag _ -> PragmaTok L.ITcomplete_prag _ -> PragmaTok L.ITclose_prag -> PragmaTok L.IToptions_prag _ -> PragmaTok L.ITinclude_prag _ -> PragmaTok L.ITlanguage_prag -> PragmaTok #if !MIN_VERSION_ghc(8,6,1) L.ITvect_prag _ -> PragmaTok L.ITvect_scalar_prag _ -> PragmaTok L.ITnovect_prag _ -> PragmaTok #endif L.ITminimal_prag _ -> PragmaTok L.IToverlappable_prag _ -> PragmaTok L.IToverlapping_prag _ -> PragmaTok L.IToverlaps_prag _ -> PragmaTok L.ITincoherent_prag _ -> PragmaTok L.ITctype _ -> PragmaTok -- Reserved symbols L.ITdotdot -> SymbolTok L.ITcolon -> SymbolTok L.ITdcolon _ -> SymbolTok L.ITequal -> SymbolTok L.ITlam -> SymbolTok L.ITlcase -> SymbolTok L.ITvbar -> SymbolTok L.ITlarrow _ -> SymbolTok L.ITrarrow _ -> SymbolTok L.ITat -> SymbolTok L.ITtilde -> SymbolTok #if !MIN_VERSION_ghc(8,6,1) L.ITtildehsh -> SymbolTok #endif L.ITdarrow _ -> SymbolTok L.ITbang -> SymbolTok #if MIN_VERSION_ghc(8,6,1) L.ITstar _ -> SymbolTok #endif L.ITbiglam -> SymbolTok L.ITocurly -> SymbolTok L.ITccurly -> SymbolTok L.ITvocurly -> SymbolTok L.ITvccurly -> SymbolTok L.ITobrack -> SymbolTok L.ITopabrack -> SymbolTok L.ITcpabrack -> SymbolTok L.ITcbrack -> SymbolTok L.IToparen -> SymbolTok L.ITcparen -> SymbolTok L.IToubxparen -> SymbolTok L.ITcubxparen -> SymbolTok L.ITsemi -> SymbolTok L.ITcomma -> SymbolTok L.ITunderscore -> SymbolTok L.ITbackquote -> SymbolTok L.ITsimpleQuote -> SymbolTok -- NOTE GHC thinks these are reserved symbols, but I classify them as -- operators. L.ITminus -> OperatorTok L.ITdot -> OperatorTok -- Identifiers L.ITvarid _ -> VariableTok L.ITconid _ -> ConstructorTok L.ITvarsym _ -> OperatorTok L.ITconsym _ -> OperatorTok L.ITqvarid _ -> VariableTok L.ITqconid _ -> ConstructorTok L.ITqvarsym _ -> OperatorTok L.ITqconsym _ -> OperatorTok L.ITdupipvarid _ -> VariableTok L.ITlabelvarid _ -> VariableTok -- Basic types L.ITchar _ _ -> CharTok L.ITstring _ _ -> StringTok L.ITinteger _ -> IntegerTok L.ITrational _ -> RationalTok L.ITprimchar _ _ -> CharTok L.ITprimstring _ _ -> StringTok L.ITprimint _ _ -> IntegerTok L.ITprimword _ _ -> IntegerTok L.ITprimfloat _ -> RationalTok L.ITprimdouble _ -> RationalTok -- Template Haskell extension tokens L.ITopenExpQuote _ _ -> SymbolTok L.ITopenPatQuote -> SymbolTok L.ITopenDecQuote -> SymbolTok L.ITopenTypQuote -> SymbolTok L.ITcloseQuote _ -> SymbolTok L.ITopenTExpQuote _ -> SymbolTok L.ITcloseTExpQuote -> SymbolTok L.ITidEscape _ -> SymbolTok L.ITparenEscape -> SymbolTok L.ITidTyEscape _ -> SymbolTok L.ITparenTyEscape -> SymbolTok L.ITtyQuote -> SymbolTok L.ITquasiQuote _ -> SymbolTok L.ITqQuasiQuote _ -> SymbolTok -- Arrow notation L.ITproc -> KeywordTok L.ITrec -> KeywordTok L.IToparenbar _ -> SymbolTok L.ITcparenbar _ -> SymbolTok L.ITlarrowtail _ -> SymbolTok L.ITrarrowtail _ -> SymbolTok L.ITLarrowtail _ -> SymbolTok L.ITRarrowtail _ -> SymbolTok -- Type application L.ITtypeApp -> SymbolTok -- Special L.ITunknown _ -> OtherTok L.ITeof -> OtherTok -- normally is not included in results -- Documentation annotations L.ITdocCommentNext _ -> CommentTok L.ITdocCommentPrev _ -> CommentTok L.ITdocCommentNamed _ -> CommentTok L.ITdocSection _ _ -> CommentTok L.ITdocOptions _ -> CommentTok L.ITlineComment _ -> CommentTok L.ITblockComment _ -> CommentTok ---------------------------------------------------------------------------- -- Text traversing -- | A type for 'Text' with line\/column location attached. data Text' = Text' {-# UNPACK #-} !Int {-# UNPACK #-} !Int {-# UNPACK #-} !Text deriving (Show) -- | Create 'Text'' from 'Text'. initText' :: Text -> Text' initText' = Text' 1 1 -- | Try to fetch white space before start of span at 'Loc'. tryFetchSpace :: Text' -> Loc -> Maybe (Text', Text) tryFetchSpace txt (Loc sl sc _ _) = let (txt', r) = reachLoc txt sl sc in if T.null r then Nothing else Just (txt', r) -- | Try to fetch the rest of 'Text'' stream. tryFetchRest :: Text' -> Maybe (Text', Text) tryFetchRest (Text' l c txt) = if T.null txt then Nothing else Just (Text' l c "", txt) -- | Fetch a span at 'Loc'. fetchSpan :: Text' -> Loc -> (Text', Text) fetchSpan txt (Loc _ _ el ec) = reachLoc txt el ec -- | Reach given line\/column location and return 'Text' that has been -- traversed. reachLoc :: Text' -> Int -- ^ Line number to reach -> Int -- ^ Column number to reach -> (Text', Text) reachLoc txt@(Text' _ _ original) l c = let chunk = T.unfoldr f txt f (Text' l' c' s) = do guard (l' < l || c' < c) (ch, s') <- T.uncons s let (l'', c'') = case ch of '\n' -> (l' + 1, 1) '\t' -> (l', c' + 8 - ((c' - 1) `rem` 8)) _ -> (l', c' + 1) return (ch, Text' l'' c'' s') in (Text' l c (T.drop (T.length chunk) original), chunk) ---------------------------------------------------------------------------- -- Exts bitmap hack -- | Extension bitmap we use in this library. extsBitmap :: Word64 extsBitmap = mkExtsBitmap enabledExts {-# NOINLINE extsBitmap #-} -- | Create extension bitmap similarly to how it's done in GHC. mkExtsBitmap :: [ExtBits] -> Word64 mkExtsBitmap = foldl' f 0 where f w x = bit (fromEnum x) .|. w -- | Copied from GHC sources that are not yet available in the @ghc@ -- package. data ExtBits = FfiBit | InterruptibleFfiBit | CApiFfiBit #if !MIN_VERSION_ghc(8,6,0) | ParrBit #endif | ArrowsBit | ThBit | ThQuotesBit | IpBit | OverloadedLabelsBit -- #x overloaded labels | ExplicitForallBit -- the 'forall' keyword and '.' symbol | BangPatBit -- Tells the parser to understand bang-patterns -- (doesn't affect the lexer) | PatternSynonymsBit -- pattern synonyms | HaddockBit-- Lex and parse Haddock comments | MagicHashBit -- "#" in both functions and operators | RecursiveDoBit -- mdo | UnicodeSyntaxBit -- the forall symbol, arrow symbols, etc | UnboxedTuplesBit -- (# and #) | UnboxedSumsBit -- (# and #) | DatatypeContextsBit | TransformComprehensionsBit | QqBit -- enable quasiquoting | InRulePragBit | RawTokenStreamBit -- producing a token stream with all comments included | SccProfilingOnBit | HpcBit | AlternativeLayoutRuleBit | RelaxedLayoutBit | NondecreasingIndentationBit | SafeHaskellBit | TraditionalRecordSyntaxBit | ExplicitNamespacesBit | LambdaCaseBit | BinaryLiteralsBit | NegativeLiteralsBit | HexFloatLiteralsBit | TypeApplicationsBit | StaticPointersBit | NumericUnderscoresBit #if MIN_VERSION_ghc(8,6,0) | StarIsTypeBit #endif deriving Enum -- | Extension we enable for the best user experience. enabledExts :: [ExtBits] enabledExts = [ FfiBit , InterruptibleFfiBit , CApiFfiBit #if !MIN_VERSION_ghc(8,6,0) , ParrBit #endif , ArrowsBit , ThBit , ThQuotesBit , IpBit , OverloadedLabelsBit , ExplicitForallBit , BangPatBit , PatternSynonymsBit , HaddockBit , MagicHashBit , RecursiveDoBit , UnicodeSyntaxBit , UnboxedTuplesBit , UnboxedSumsBit , DatatypeContextsBit , TransformComprehensionsBit , QqBit , InRulePragBit , RawTokenStreamBit , SafeHaskellBit , LambdaCaseBit , BinaryLiteralsBit , NegativeLiteralsBit , HexFloatLiteralsBit , TypeApplicationsBit , StaticPointersBit , NumericUnderscoresBit #if MIN_VERSION_ghc(8,6,0) , StarIsTypeBit #endif ]