{-# LANGUAGE CPP #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}

-----------------------------------------------------------------------------

-- |
-- Module      :  Distribution.Fields.Parser
-- License     :  BSD3
--
-- Maintainer  :  cabal-devel@haskell.org
-- Portability :  portable
{- FOURMOLU_DISABLE -}
module Distribution.Fields.Parser
  ( -- * Types
    Field (..)
  , Name (..)
  , FieldLine (..)
  , SectionArg (..)

    -- * Grammar and parsing
    -- $grammar
  , readFields
  , readFields'
#ifdef CABAL_PARSEC_DEBUG

    -- * Internal
  , parseFile
  , parseStr
  , parseBS
#endif
  ) where
{- FOURMOLU_ENABLE -}

import qualified Data.ByteString.Char8 as B8
import Data.Functor.Identity
import Distribution.Compat.Prelude
import Distribution.Fields.Field
import Distribution.Fields.Lexer
import Distribution.Fields.LexerMonad
  ( LexResult (..)
  , LexState (..)
  , LexWarning (..)
  , LexWarningType (..)
  , unLex
  )
import Distribution.Parsec.Position (Position (..), positionCol)
import Text.Parsec.Combinator hiding (eof, notFollowedBy)
import Text.Parsec.Error
import Text.Parsec.Pos
import Text.Parsec.Prim hiding (many, (<|>))
import Prelude ()

#ifdef CABAL_PARSEC_DEBUG
import qualified Data.Text                as T
import qualified Data.Text.Encoding       as T
import qualified Data.Text.Encoding.Error as T
#endif

-- $setup
-- >>> import Data.Either (isLeft)

-- | The 'LexState'' (with a prime) is an instance of parsec's 'Stream'
-- wrapped around lexer's 'LexState' (without a prime)
data LexState' = LexState' !LexState (LToken, LexState')

mkLexState' :: LexState -> LexState'
mkLexState' st =
  LexState'
    st
    (case unLex lexToken st of LexResult st' tok -> (tok, mkLexState' st'))

type Parser a = ParsecT LexState' () Identity a

instance Stream LexState' Identity LToken where
  uncons (LexState' _ (tok, st')) =
    case tok of
      L _ EOF -> return Nothing
      _ -> return (Just (tok, st'))

-- | Get lexer warnings accumulated so far
getLexerWarnings :: Parser [LexWarning]
getLexerWarnings = do
  LexState' (LexState{warnings = ws}) _ <- getInput
  return ws

addLexerWarning :: LexWarning -> Parser ()
addLexerWarning w = do
  LexState' ls@LexState{warnings = ws} _ <- getInput
  setInput $! mkLexState' ls{warnings = w : ws}

-- | Set Alex code i.e. the mode "state" lexer is in.
setLexerMode :: Int -> Parser ()
setLexerMode code = do
  LexState' ls _ <- getInput
  setInput $! mkLexState' ls{curCode = code}

getToken :: (Token -> Maybe a) -> Parser a
getToken getTok = getTokenWithPos (\(L _ t) -> getTok t)

getTokenWithPos :: (LToken -> Maybe a) -> Parser a
getTokenWithPos getTok = tokenPrim (\(L _ t) -> describeToken t) updatePos getTok
  where
    updatePos :: SourcePos -> LToken -> LexState' -> SourcePos
    updatePos pos (L (Position col line) _) _ = newPos (sourceName pos) col line

describeToken :: Token -> String
describeToken t = case t of
  TokSym s -> "symbol " ++ show s
  TokStr s -> "string " ++ show s
  TokOther s -> "operator " ++ show s
  Indent _ -> "new line"
  TokFieldLine _ -> "field content"
  Colon -> "\":\""
  OpenBrace -> "\"{\""
  CloseBrace -> "\"}\""
  --  SemiColon       -> "\";\""
  EOF -> "end of file"
  LexicalError is -> "character in input " ++ show (B8.head is)

tokSym :: Parser (Name Position)
tokSym', tokStr, tokOther :: Parser (SectionArg Position)
tokIndent :: Parser Int
tokColon, tokCloseBrace :: Parser ()
tokOpenBrace :: Parser Position
tokFieldLine :: Parser (FieldLine Position)
tokSym = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (mkName pos x); _ -> Nothing
tokSym' = getTokenWithPos $ \t -> case t of L pos (TokSym x) -> Just (SecArgName pos x); _ -> Nothing
tokStr = getTokenWithPos $ \t -> case t of L pos (TokStr x) -> Just (SecArgStr pos x); _ -> Nothing
tokOther = getTokenWithPos $ \t -> case t of L pos (TokOther x) -> Just (SecArgOther pos x); _ -> Nothing
tokIndent = getToken $ \t -> case t of Indent x -> Just x; _ -> Nothing
tokColon = getToken $ \t -> case t of Colon -> Just (); _ -> Nothing
tokOpenBrace = getTokenWithPos $ \t -> case t of L pos OpenBrace -> Just pos; _ -> Nothing
tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing
tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing

colon, openBrace, closeBrace :: Parser ()
sectionArg :: Parser (SectionArg Position)
sectionArg = tokSym' <|> tokStr <|> tokOther <?> "section parameter"

fieldSecName :: Parser (Name Position)
fieldSecName = tokSym <?> "field or section name"

colon = tokColon <?> "\":\""
openBrace = do
  pos <- tokOpenBrace <?> "\"{\""
  addLexerWarning (LexWarning LexBraces pos)
closeBrace = tokCloseBrace <?> "\"}\""

fieldContent :: Parser (FieldLine Position)
fieldContent = tokFieldLine <?> "field contents"

newtype IndentLevel = IndentLevel Int

zeroIndentLevel :: IndentLevel
zeroIndentLevel = IndentLevel 0

incIndentLevel :: IndentLevel -> IndentLevel
incIndentLevel (IndentLevel i) = IndentLevel (succ i)

indentOfAtLeast :: IndentLevel -> Parser IndentLevel
indentOfAtLeast (IndentLevel i) = try $ do
  j <- tokIndent
  guard (j >= i) <?> "indentation of at least " ++ show i
  return (IndentLevel j)

newtype LexerMode = LexerMode Int

inLexerMode :: LexerMode -> Parser p -> Parser p
inLexerMode (LexerMode mode) p =
  do setLexerMode mode; x <- p; setLexerMode in_section; return x

-----------------------
-- Cabal file grammar
--

-- $grammar
--
-- @
-- CabalStyleFile ::= SecElems
--
-- SecElems       ::= SecElem* '\\n'?
-- SecElem        ::= '\\n' SecElemLayout | SecElemBraces
-- SecElemLayout  ::= FieldLayout | FieldBraces | SectionLayout | SectionBraces
-- SecElemBraces  ::= FieldInline | FieldBraces |                 SectionBraces
-- FieldLayout    ::= name ':' line? ('\\n' line)*
-- FieldBraces    ::= name ':' '\\n'? '{' content '}'
-- FieldInline    ::= name ':' content
-- SectionLayout  ::= name arg* SecElems
-- SectionBraces  ::= name arg* '\\n'? '{' SecElems '}'
-- @
--
-- and the same thing but left factored...
--
-- @
-- SecElems              ::= SecElem*
-- SecElem               ::= '\\n' name SecElemLayout
--                         |      name SecElemBraces
-- SecElemLayout         ::= ':'   FieldLayoutOrBraces
--                         | arg*  SectionLayoutOrBraces
-- FieldLayoutOrBraces   ::= '\\n'? '{' content '}'
--                         | line? ('\\n' line)*
-- SectionLayoutOrBraces ::= '\\n'? '{' SecElems '\\n'? '}'
--                         | SecElems
-- SecElemBraces         ::= ':' FieldInlineOrBraces
--                         | arg* '\\n'? '{' SecElems '\\n'? '}'
-- FieldInlineOrBraces   ::= '\\n'? '{' content '}'
--                         | content
-- @
--
-- Note how we have several productions with the sequence:
--
-- > '\\n'? '{'
--
-- That is, an optional newline (and indent) followed by a @{@ token.
-- In the @SectionLayoutOrBraces@ case you can see that this makes it
-- not fully left factored (because @SecElems@ can start with a @\\n@).
-- Fully left factoring here would be ugly, and though we could use a
-- lookahead of two tokens to resolve the alternatives, we can't
-- conveniently use Parsec's 'try' here to get a lookahead of only two.
-- So instead we deal with this case in the lexer by making a line
-- where the first non-space is @{@ lex as just the @{@ token, without
-- the usual indent token. Then in the parser we can resolve everything
-- with just one token of lookahead and so without using 'try'.

-- Top level of a file using cabal syntax
--
cabalStyleFile :: Parser [Field Position]
cabalStyleFile = do
  es <- elements zeroIndentLevel
  eof
  return es

-- Elements that live at the top level or inside a section, i.e. fields
-- and sections content
--
-- elements ::= element*
elements :: IndentLevel -> Parser [Field Position]
elements ilevel = many (element ilevel)

-- An individual element, ie a field or a section. These can either use
-- layout style or braces style. For layout style then it must start on
-- a line on its own (so that we know its indentation level).
--
-- element ::= '\\n' name elementInLayoutContext
--           |      name elementInNonLayoutContext
element :: IndentLevel -> Parser (Field Position)
element ilevel =
  ( do
      ilevel' <- indentOfAtLeast ilevel
      name <- fieldSecName
      elementInLayoutContext (incIndentLevel ilevel') name
  )
    <|> ( do
            name <- fieldSecName
            elementInNonLayoutContext name
        )

-- An element (field or section) that is valid in a layout context.
-- In a layout context we can have fields and sections that themselves
-- either use layout style or that use braces style.
--
-- elementInLayoutContext ::= ':'  fieldLayoutOrBraces
--                          | arg* sectionLayoutOrBraces
elementInLayoutContext :: IndentLevel -> Name Position -> Parser (Field Position)
elementInLayoutContext ilevel name =
  (do colon; fieldLayoutOrBraces ilevel name)
    <|> ( do
            args <- many sectionArg
            elems <- sectionLayoutOrBraces ilevel
            return (Section name args elems)
        )

-- An element (field or section) that is valid in a non-layout context.
-- In a non-layout context we can have only have fields and sections that
-- themselves use braces style, or inline style fields.
--
-- elementInNonLayoutContext ::= ':' FieldInlineOrBraces
--                             | arg* '\\n'? '{' elements '\\n'? '}'
elementInNonLayoutContext :: Name Position -> Parser (Field Position)
elementInNonLayoutContext name =
  (do colon; fieldInlineOrBraces name)
    <|> ( do
            args <- many sectionArg
            openBrace
            elems <- elements zeroIndentLevel
            optional tokIndent
            closeBrace
            return (Section name args elems)
        )

-- The body of a field, using either layout style or braces style.
--
-- fieldLayoutOrBraces   ::= '\\n'? '{' content '}'
--                         | line? ('\\n' line)*
fieldLayoutOrBraces :: IndentLevel -> Name Position -> Parser (Field Position)
fieldLayoutOrBraces ilevel name = braces <|> fieldLayout
  where
    braces = do
      openBrace
      ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
      closeBrace
      return (Field name ls)
    fieldLayout = inLexerMode (LexerMode in_field_layout) $ do
      l <- optionMaybe fieldContent
      ls <- many (do _ <- indentOfAtLeast ilevel; fieldContent)
      return $ case l of
        Nothing -> Field name ls
        Just l' -> Field name (l' : ls)

-- The body of a section, using either layout style or braces style.
--
-- sectionLayoutOrBraces ::= '\\n'? '{' elements \\n? '}'
--                         | elements
sectionLayoutOrBraces :: IndentLevel -> Parser [Field Position]
sectionLayoutOrBraces ilevel =
  ( do
      openBrace
      elems <- elements zeroIndentLevel
      optional tokIndent
      closeBrace
      return elems
  )
    <|> (elements ilevel)

-- The body of a field, using either inline style or braces.
--
-- fieldInlineOrBraces   ::= '\\n'? '{' content '}'
--                         | content
fieldInlineOrBraces :: Name Position -> Parser (Field Position)
fieldInlineOrBraces name =
  ( do
      openBrace
      ls <- inLexerMode (LexerMode in_field_braces) (many fieldContent)
      closeBrace
      return (Field name ls)
  )
    <|> ( do
            ls <- inLexerMode (LexerMode in_field_braces) (option [] (fmap (\l -> [l]) fieldContent))
            return (Field name ls)
        )

-- | Parse cabal style 'B8.ByteString' into list of 'Field's, i.e. the cabal AST.
--
-- 'readFields' assumes that input 'B8.ByteString' is valid UTF8, specifically it doesn't validate that file is valid UTF8.
-- Therefore bytestrings inside returned 'Field' will be invalid as UTF8 if the input were.
--
-- >>> readFields "foo: \223"
-- Right [Field (Name (Position 1 1) "foo") [FieldLine (Position 1 6) "\223"]]
--
-- 'readFields' won't (necessarily) fail on invalid UTF8 data, but the reported positions may be off.
--
-- __You may get weird errors on non-UTF8 input__, for example 'readFields' will fail on latin1 encoded non-breaking space:
--
-- >>> isLeft (readFields "\xa0 foo: bar")
-- True
--
-- That is rejected because parser thinks @\\xa0@ is a section name,
-- and section arguments may not contain colon.
-- If there are just latin1 non-breaking spaces, they become part of the name:
--
-- >>> readFields "\xa0\&foo: bar"
-- Right [Field (Name (Position 1 1) "\160foo") [FieldLine (Position 1 7) "bar"]]
--
-- The UTF8 non-breaking space is accepted as an indentation character (but warned about by 'readFields'').
--
-- >>> readFields' "\xc2\xa0 foo: bar"
-- Right ([Field (Name (Position 1 3) "foo") [FieldLine (Position 1 8) "bar"]],[LexWarning LexWarningNBSP (Position 1 1)])
readFields :: B8.ByteString -> Either ParseError [Field Position]
readFields s = fmap fst (readFields' s)

-- | Like 'readFields' but also return lexer warnings.
readFields' :: B8.ByteString -> Either ParseError ([Field Position], [LexWarning])
readFields' s = do
  parse parser "the input" lexSt
  where
    parser = do
      fields <- cabalStyleFile
      ws <- getLexerWarnings -- lexer accumulates warnings in reverse (consing them to the list)
      pure (fields, reverse ws ++ checkIndentation fields [])

    lexSt = mkLexState' (mkLexState s)

-- | Check (recursively) that all fields inside a block are indented the same.
--
-- We have to do this as a post-processing check.
-- As the parser uses indentOfAtLeast approach, we don't know what is the "correct"
-- indentation for following fields.
--
-- To catch during parsing we would need to parse first field/section of a section
-- and then parse the following ones (softly) requiring the exactly the same indentation.
checkIndentation :: [Field Position] -> [LexWarning] -> [LexWarning]
checkIndentation [] = id
checkIndentation (Field name _ : fs') = checkIndentation' (nameAnn name) fs'
checkIndentation (Section name _ fs : fs') = checkIndentation fs . checkIndentation' (nameAnn name) fs'

-- | We compare adjacent fields to reduce the amount of reported indentation warnings.
checkIndentation' :: Position -> [Field Position] -> [LexWarning] -> [LexWarning]
checkIndentation' _ [] = id
checkIndentation' pos (Field name _ : fs') = checkIndentation'' pos (nameAnn name) . checkIndentation' (nameAnn name) fs'
checkIndentation' pos (Section name _ fs : fs') = checkIndentation'' pos (nameAnn name) . checkIndentation fs . checkIndentation' (nameAnn name) fs'

-- | Check that positions' columns are the same.
checkIndentation'' :: Position -> Position -> [LexWarning] -> [LexWarning]
checkIndentation'' a b
  | positionCol a == positionCol b = id
  | otherwise = (LexWarning LexInconsistentIndentation b :)

#ifdef CABAL_PARSEC_DEBUG
parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO ()
parseTest' p fname s =
    case parse p fname (lexSt s) of
      Left err -> putStrLn (formatError s err)

      Right x  -> print x
  where
    lexSt = mkLexState' . mkLexState

parseFile :: Show a => Parser a -> FilePath -> IO ()
parseFile p f = B8.readFile f >>= \s -> parseTest' p f s

parseStr  :: Show a => Parser a -> String -> IO ()
parseStr p = parseBS p . B8.pack

parseBS  :: Show a => Parser a -> B8.ByteString -> IO ()
parseBS p = parseTest' p "<input string>"

formatError :: B8.ByteString -> ParseError -> String
formatError input perr =
    unlines
      [ "Parse error "++ show (errorPos perr) ++ ":"
      , errLine
      , indicator ++ errmsg ]
  where
    pos       = errorPos perr
    ls        = lines' (T.decodeUtf8With T.lenientDecode input)
    errLine   = T.unpack (ls !! (sourceLine pos - 1))
    indicator = replicate (sourceColumn pos) ' ' ++ "^"
    errmsg    = showErrorMessages "or" "unknown parse error"
                                  "expecting" "unexpected" "end of file"
                                  (errorMessages perr)

-- | Handles windows/osx/unix line breaks uniformly
lines' :: T.Text -> [T.Text]
lines' s1
  | T.null s1 = []
  | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
                  (l, s2) | Just (c,s3) <- T.uncons s2
                         -> case T.uncons s3 of
                              Just ('\n', s4) | c == '\r' -> l : lines' s4
                              _               -> l : lines' s3
                          | otherwise -> [l]
#endif

eof :: Parser ()
eof = notFollowedBy anyToken <?> "end of file"
  where
    notFollowedBy :: Parser LToken -> Parser ()
    notFollowedBy p =
      try
        ( (do L _ t <- try p; unexpected (describeToken t))
            <|> return ()
        )