-- |
-- Module      :  GHC.SyntaxHighlighter
-- Copyright   :  © 2018–present Mark Karpov
-- License     :  BSD 3 clause
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- The module allows you to decompose a 'Text' stream containing Haskell
-- source code into a stream of 'Text' chunks tagged with 'Token'.
--
-- This library uses GHC's lexer, so the result is guaranteed to be 100%
-- correct, as if it was parsed by GHC itself.

{-# LANGUAGE LambdaCase                    #-}
{-# LANGUAGE OverloadedStrings             #-}
{-# LANGUAGE TupleSections                 #-}
{-# OPTIONS_GHC -fno-warn-missing-fields #-}

module GHC.SyntaxHighlighter
  ( Token (..)
  , Loc (..)
  , tokenizeHaskell
  , tokenizeHaskellLoc )
where

import Control.Monad
import Data.List (foldl', unfoldr)
import Data.Maybe (isJust)
import Data.Text (Text)
import DynFlags
import FastString (mkFastString)
import Fingerprint (fingerprint0)
import GHC.LanguageExtensions
import GHC.Version (cProjectVersion)
import SrcLoc
import StringBuffer
import ToolSettings
import qualified Data.Text as T
import qualified EnumSet   as ES
import qualified Lexer     as L

----------------------------------------------------------------------------
-- Data types

-- | Token types that are used as tags to mark spans of source code.

data Token
  = KeywordTok         -- ^ Keyword
  | PragmaTok          -- ^ Pragmas
  | SymbolTok          -- ^ Symbols (punctuation that is not an operator)
  | VariableTok        -- ^ Variable name (term level)
  | ConstructorTok     -- ^ Data\/type constructor
  | OperatorTok        -- ^ Operator
  | CharTok            -- ^ Character
  | StringTok          -- ^ String
  | IntegerTok         -- ^ Integer
  | RationalTok        -- ^ Rational number
  | CommentTok         -- ^ Comment (including Haddocks)
  | SpaceTok           -- ^ Space filling
  | OtherTok           -- ^ Something else?
  deriving (Eq, Ord, Enum, Bounded, Show)

-- | Start and end positions of a span. The arguments of the data
-- constructor contain in order:
--
--     * Line number of start position of a span
--     * Column number of start position of a span
--     * Line number of end position of a span
--     * Column number of end position of a span
--
-- @since 0.0.2.0

data Loc = Loc !Int !Int !Int !Int
  deriving (Eq, Ord, Show)

----------------------------------------------------------------------------
-- High-level API

-- | Tokenize Haskell source code. If the code cannot be parsed, return
-- 'Nothing'. Otherwise return the original input tagged by 'Token's.
--
-- The parser does not require the input source code to form a valid Haskell
-- program, so as long as the lexer can decompose your input (most of the
-- time), it'll return something in 'Just'.

tokenizeHaskell :: Text -> Maybe [(Token, Text)]
tokenizeHaskell input = sliceInputStream input <$> tokenizeHaskellLoc input

-- | Replace 'Loc' locations with actual chunks of input 'Text'.

sliceInputStream :: Text -> [(Token, Loc)] -> [(Token, Text)]
sliceInputStream input toks = unfoldr sliceOnce (initText' input, toks)
  where
    sliceOnce (txt, []) = do
      (txt', chunk) <- tryFetchRest txt
      return ((SpaceTok, chunk), (txt', []))
    sliceOnce (txt, tss@((t, l):ts)) =
      case tryFetchSpace txt l of
        Nothing ->
          let (txt', chunk) = fetchSpan txt l
              t' = case t of
                CommentTok -> if isHeaderPragma chunk
                  then PragmaTok
                  else CommentTok
                tok -> tok
          in Just ((t', chunk), (txt', ts))
        Just (txt', chunk) ->
          Just ((SpaceTok, chunk), (txt', tss))

-- | Similar to 'tokenizeHaskell', but instead of 'Text' chunks provides
-- locations of corresponding spans in the given input stream.
--
-- @since 0.0.2.0

tokenizeHaskellLoc :: Text -> Maybe [(Token, Loc)]
tokenizeHaskellLoc input =
  case L.unP pLexer parseState of
    L.PFailed {} -> Nothing
    L.POk    _ x -> Just x
  where
    location = mkRealSrcLoc (mkFastString "") 1 1
    buffer = stringToStringBuffer (T.unpack input)
    parseState = L.mkPStatePure parserFlags buffer location
    parserFlags = L.mkParserFlags (foldl' xopt_set initialDynFlags enabledExts)
    initialDynFlags = DynFlags
      { warningFlags = ES.empty
      , generalFlags = ES.fromList
        [ Opt_Haddock
        , Opt_KeepRawTokenStream
        ]
      , extensions = []
      , extensionFlags = ES.empty
      , safeHaskell = Sf_Safe
      , language = Just Haskell2010
      , ghcNameVersion = GhcNameVersion
        { ghcNameVersion_programName = "ghc"
        , ghcNameVersion_projectVersion = cProjectVersion
        }
      , fileSettings = FileSettings {}
      , toolSettings = ToolSettings
          { toolSettings_opt_P_fingerprint = fingerprint0
          , toolSettings_pgm_F = ""
          }
      , platformMisc = PlatformMisc {}
      }

-- | Haskell lexer.

pLexer :: L.P [(Token, Loc)]
pLexer = go
  where
    go = do
      r <- L.lexer False return
      case r of
        L _ L.ITeof -> return []
        _           ->
          case fixupToken r of
            Nothing -> go
            Just  x -> (x:) <$> go

-- | Convert @'Located' 'L.Token'@ representation to a more convenient for
-- us form.

fixupToken :: Located L.Token -> Maybe (Token, Loc)
fixupToken (L srcSpan tok) = (classifyToken tok,) <$> srcSpanToLoc srcSpan

-- | Convert 'SrcSpan' to 'Loc'.

srcSpanToLoc :: SrcSpan -> Maybe Loc
srcSpanToLoc (RealSrcSpan rss) =
  let start = realSrcSpanStart rss
      end   = realSrcSpanEnd   rss
  in if start == end
       then Nothing -- NOTE Some magic auto-generated tokens that do not
            -- actually appear in the input stream. Drop them.
       else Just $ Loc (srcLocLine start)
                       (srcLocCol start)
                       (srcLocLine end)
                       (srcLocCol end)
srcSpanToLoc _ = Nothing

-- | Classify a 'L.Token' in terms of 'Token'.

classifyToken :: L.Token -> Token
classifyToken = \case
  -- Keywords
  L.ITas        -> KeywordTok
  L.ITcase      -> KeywordTok
  L.ITclass     -> KeywordTok
  L.ITdata      -> KeywordTok
  L.ITdefault   -> KeywordTok
  L.ITderiving  -> KeywordTok
  L.ITdo        -> KeywordTok
  L.ITelse      -> KeywordTok
  L.IThiding    -> KeywordTok
  L.ITforeign   -> KeywordTok
  L.ITif        -> KeywordTok
  L.ITimport    -> KeywordTok
  L.ITin        -> KeywordTok
  L.ITinfix     -> KeywordTok
  L.ITinfixl    -> KeywordTok
  L.ITinfixr    -> KeywordTok
  L.ITinstance  -> KeywordTok
  L.ITlet       -> KeywordTok
  L.ITmodule    -> KeywordTok
  L.ITnewtype   -> KeywordTok
  L.ITof        -> KeywordTok
  L.ITqualified -> KeywordTok
  L.ITthen      -> KeywordTok
  L.ITtype      -> KeywordTok
  L.ITwhere     -> KeywordTok
  L.ITforall _  -> KeywordTok
  L.ITexport    -> KeywordTok
  L.ITlabel     -> KeywordTok
  L.ITdynamic   -> KeywordTok
  L.ITsafe      -> KeywordTok
  L.ITinterruptible -> KeywordTok
  L.ITunsafe    -> KeywordTok
  L.ITstdcallconv -> KeywordTok
  L.ITccallconv -> KeywordTok
  L.ITcapiconv  -> KeywordTok
  L.ITprimcallconv -> KeywordTok
  L.ITjavascriptcallconv -> KeywordTok
  L.ITmdo       -> KeywordTok
  L.ITfamily    -> KeywordTok
  L.ITrole      -> KeywordTok
  L.ITgroup     -> KeywordTok
  L.ITby        -> KeywordTok
  L.ITusing     -> KeywordTok
  L.ITpattern   -> KeywordTok
  L.ITstatic    -> KeywordTok
  L.ITstock     -> KeywordTok
  L.ITanyclass  -> KeywordTok
  L.ITvia       -> KeywordTok
  L.ITunit      -> KeywordTok
  L.ITsignature -> KeywordTok
  L.ITdependency -> KeywordTok
  L.ITrequires  -> KeywordTok
  -- Pragmas
  L.ITinline_prag {} -> PragmaTok
  L.ITspec_prag _ -> PragmaTok
  L.ITspec_inline_prag {} -> PragmaTok
  L.ITsource_prag _ -> PragmaTok
  L.ITrules_prag _ -> PragmaTok
  L.ITwarning_prag _ -> PragmaTok
  L.ITdeprecated_prag _ -> PragmaTok
  L.ITline_prag _ -> PragmaTok
  L.ITcolumn_prag _ -> PragmaTok
  L.ITscc_prag _ -> PragmaTok
  L.ITgenerated_prag _ -> PragmaTok
  L.ITcore_prag _ -> PragmaTok
  L.ITunpack_prag _ -> PragmaTok
  L.ITnounpack_prag _ -> PragmaTok
  L.ITann_prag _ -> PragmaTok
  L.ITcomplete_prag _ -> PragmaTok
  L.ITclose_prag -> PragmaTok
  L.IToptions_prag _ -> PragmaTok
  L.ITinclude_prag _ -> PragmaTok
  L.ITlanguage_prag -> PragmaTok
  L.ITminimal_prag _ -> PragmaTok
  L.IToverlappable_prag _ -> PragmaTok
  L.IToverlapping_prag _ -> PragmaTok
  L.IToverlaps_prag _ -> PragmaTok
  L.ITincoherent_prag _ -> PragmaTok
  L.ITctype _ -> PragmaTok
  L.ITcomment_line_prag -> PragmaTok
  -- Reserved symbols
  L.ITdotdot -> SymbolTok
  L.ITcolon -> SymbolTok
  L.ITdcolon _ -> SymbolTok
  L.ITequal -> SymbolTok
  L.ITlam -> SymbolTok
  L.ITlcase -> SymbolTok
  L.ITvbar -> SymbolTok
  L.ITlarrow _ -> SymbolTok
  L.ITrarrow _ -> SymbolTok
  L.ITat -> SymbolTok
  L.ITtilde -> SymbolTok
  L.ITdarrow _ -> SymbolTok
  L.ITbang -> SymbolTok
  L.ITstar _ -> SymbolTok
  L.ITbiglam -> SymbolTok
  L.ITocurly -> SymbolTok
  L.ITccurly -> SymbolTok
  L.ITvocurly -> SymbolTok
  L.ITvccurly -> SymbolTok
  L.ITobrack -> SymbolTok
  L.ITopabrack -> SymbolTok
  L.ITcpabrack -> SymbolTok
  L.ITcbrack -> SymbolTok
  L.IToparen -> SymbolTok
  L.ITcparen -> SymbolTok
  L.IToubxparen -> SymbolTok
  L.ITcubxparen -> SymbolTok
  L.ITsemi -> SymbolTok
  L.ITcomma -> SymbolTok
  L.ITunderscore -> SymbolTok
  L.ITbackquote -> SymbolTok
  L.ITsimpleQuote -> SymbolTok
  -- NOTE GHC thinks these are reserved symbols, but I classify them as
  -- operators.
  L.ITminus -> OperatorTok
  L.ITdot -> OperatorTok
  -- Identifiers
  L.ITvarid _ -> VariableTok
  L.ITconid _ -> ConstructorTok
  L.ITvarsym _ -> OperatorTok
  L.ITconsym _ -> OperatorTok
  L.ITqvarid _ -> VariableTok
  L.ITqconid _ -> ConstructorTok
  L.ITqvarsym _ -> OperatorTok
  L.ITqconsym _ -> OperatorTok
  L.ITdupipvarid _ -> VariableTok
  L.ITlabelvarid _ -> VariableTok
  -- Basic types
  L.ITchar _ _ -> CharTok
  L.ITstring _ _ -> StringTok
  L.ITinteger _ -> IntegerTok
  L.ITrational _ -> RationalTok
  L.ITprimchar _ _ -> CharTok
  L.ITprimstring _ _ -> StringTok
  L.ITprimint _ _ -> IntegerTok
  L.ITprimword _ _ -> IntegerTok
  L.ITprimfloat _ -> RationalTok
  L.ITprimdouble _ -> RationalTok
  -- Template Haskell extension tokens
  L.ITopenExpQuote _ _ -> SymbolTok
  L.ITopenPatQuote -> SymbolTok
  L.ITopenDecQuote -> SymbolTok
  L.ITopenTypQuote -> SymbolTok
  L.ITcloseQuote _ -> SymbolTok
  L.ITopenTExpQuote _ -> SymbolTok
  L.ITcloseTExpQuote -> SymbolTok
  L.ITidEscape _ -> SymbolTok
  L.ITparenEscape -> SymbolTok
  L.ITidTyEscape _ -> SymbolTok
  L.ITparenTyEscape -> SymbolTok
  L.ITtyQuote -> SymbolTok
  L.ITquasiQuote _ -> SymbolTok
  L.ITqQuasiQuote _ -> SymbolTok
  -- Arrow notation
  L.ITproc -> KeywordTok
  L.ITrec -> KeywordTok
  L.IToparenbar _ -> SymbolTok
  L.ITcparenbar _ -> SymbolTok
  L.ITlarrowtail _ -> SymbolTok
  L.ITrarrowtail _ -> SymbolTok
  L.ITLarrowtail _ -> SymbolTok
  L.ITRarrowtail _ -> SymbolTok
  -- Type application
  L.ITtypeApp -> SymbolTok
  -- Special
  L.ITunknown _ -> OtherTok
  L.ITeof -> OtherTok -- normally is not included in results
  -- Documentation annotations
  L.ITdocCommentNext _ -> CommentTok
  L.ITdocCommentPrev _ -> CommentTok
  L.ITdocCommentNamed _ -> CommentTok
  L.ITdocSection _ _ -> CommentTok
  L.ITdocOptions _ -> CommentTok
  L.ITlineComment _ -> CommentTok
  L.ITblockComment _ -> CommentTok

----------------------------------------------------------------------------
-- Text traversing

-- | A type for 'Text' with line\/column location attached.

data Text' = Text'
  {-# UNPACK #-} !Int
  {-# UNPACK #-} !Int
  {-# UNPACK #-} !Text
  deriving (Show)

-- | Create 'Text'' from 'Text'.

initText' :: Text -> Text'
initText' = Text' 1 1

-- | Try to fetch white space before start of span at 'Loc'.

tryFetchSpace :: Text' -> Loc -> Maybe (Text', Text)
tryFetchSpace txt (Loc sl sc _ _) =
  let (txt', r) = reachLoc txt sl sc
  in if T.null r
       then Nothing
       else Just (txt', r)

-- | Try to fetch the rest of 'Text'' stream.

tryFetchRest :: Text' -> Maybe (Text', Text)
tryFetchRest (Text' l c txt) =
  if T.null txt
    then Nothing
    else Just (Text' l c "", txt)

-- | Fetch a span at 'Loc'.

fetchSpan :: Text' -> Loc -> (Text', Text)
fetchSpan txt (Loc _ _ el ec) = reachLoc txt el ec

-- | Reach given line\/column location and return 'Text' that has been
-- traversed.

reachLoc
  :: Text'
  -> Int               -- ^ Line number to reach
  -> Int               -- ^ Column number to reach
  -> (Text', Text)
reachLoc txt@(Text' _ _ original) l c =
  let chunk = T.unfoldr f txt
      f (Text' l' c' s) = do
        guard (l' < l || c' < c)
        (ch, s') <- T.uncons s
        let (l'', c'') = case ch of
              '\n' -> (l' + 1, 1)
              '\t' -> (l', c' + 8 - ((c' - 1) `rem` 8))
              _    -> (l', c' + 1)
        return (ch, Text' l'' c'' s')
  in (Text' l c (T.drop (T.length chunk) original), chunk)

----------------------------------------------------------------------------
-- Pragmas detection

-- | Detect file header pragma.

isHeaderPragma :: Text -> Bool
isHeaderPragma txt0 = isJust $ do
  txt1 <- T.stripStart <$> T.stripPrefix "{-#" txt0
  guard (T.isPrefixOf "LANGUAGE" txt1 || T.isPrefixOf "OPTIONS_GHC" txt1)

----------------------------------------------------------------------------
-- Language extensions

-- | Language extensions we enable by default.

enabledExts :: [Extension]
enabledExts =
  [ ForeignFunctionInterface
  , InterruptibleFFI
  , CApiFFI
  , Arrows
  , TemplateHaskell
  , TemplateHaskellQuotes
  , ImplicitParams
  , OverloadedLabels
  , ExplicitForAll
  , BangPatterns
  , PatternSynonyms
  , MagicHash
  , RecursiveDo
  , UnicodeSyntax
  , UnboxedTuples
  , UnboxedSums
  , DatatypeContexts
  , TransformListComp
  , QuasiQuotes
  , LambdaCase
  , BinaryLiterals
  , NegativeLiterals
  , HexFloatLiterals
  , TypeApplications
  , StaticPointers
  , NumericUnderscores
  , StarIsType
  ]