-----------------------------------------------------------------------------
-- Copyright 2019, Advise-Me project team. This file is distributed under 
-- the terms of the Apache License 2.0. For more information, see the files
-- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution.
-----------------------------------------------------------------------------
-- |
-- Maintainer  :  bastiaan.heeren@ou.nl
-- Stability   :  provisional
-- Portability :  portable (depends on ghc)
--
-- This module defines the lexing (1st) phase of the assessment pipeline.
-- The only function you should need is `extract`.
-- Note that we don't do true lexing: we don't convert symbols into tokens.
-- Instead, we mostly tend to remove text/symbols and do some conversion.
--
-----------------------------------------------------------------------------

module Recognize.Parsing.MathLexer (extract, useColumns, parseColumns, parseSimple, allowedWords, trim) where

import Control.Applicative                      (empty)
import Control.Arrow
import Control.Monad
import Data.Char
import Data.Either
import Data.List
import Data.Maybe
import Ideas.Utils.Parsing
import Recognize.Data.StringLexerOptions
import Text.Parsec (Parsec)

type MathLexer = Parsec String StringLexerOptions

-- | This is the lexer function.
--
-- It takes some options for string lexing, the string to lex and return a list of lines and a boolean denoting columnization.
extract :: StringLexerOptions -> String -> ([String], Bool)
extract opts txt = 
--  (fixParens $ case parseSimple' opts pMathList $ (unlines . removeEnumerations . lines) phase2 of
  (case parseSimple' opts pMathList $ (unlines . removeEnumerations . lines) phase2 of
      Left msg -> error $  "Recognize.MathMathLexer.extract: " ++ txt ++ "\n" ++ msg
      Right xs -> mapMaybe shorten xs, isJust columnized) -- what is the purpose of shorten here
  where
   phase1 = maybe txt (concatMap unlines) columnized
   phase2 = case parseSimple' opts (preParser bRemDashes (replaceXByMultiplication opts)) phase1 of
          (Left _ ) -> phase1
          (Right r) -> r
   bRemDashes = (>3) . length . filter (isPrefixOf "-" . dropWhile (==' ')) . lines $ phase1
   columnized = useColumns opts txt

-- | Attempts to remove or convert certain symbols
preParser :: Bool -> Bool -> MathLexer String
preParser remDashes repAllX = let f = (++) <$> preParse <*> f
                                      <|> [] <$ eof
                              in (++) <$> remFirstDash <*> f
 where
   preParse :: MathLexer String
   preParse =
      try repX
      <|> try remStatement
      <|> try remGraphicalSymbols
      <|> repPercentage
      <|> (if remDashes then remDashess else empty)
      <|> (:[]) <$> anyChar

   -- | Converts x% to x/100
   repPercentage :: MathLexer String
   repPercentage = "/100" <$ char '%'

   repX :: MathLexer String
   repX | repAllX = "*" <$ oneOf "xX"
        | otherwise = (\a b -> a : "*" ++ [b]) <$> (alphaNum <|> char ')') <* tabsOrSpaces <* oneOf "xX" <* tabsOrSpaces <*> (alphaNum <|> char '(')


   -- | Removes '->','-- >', '--'
   --
   -- Each graphical symbol may have more '-' characters
   remGraphicalSymbols :: MathLexer String
   remGraphicalSymbols = [' '] <$ (
      try (many1 (char '-') <* char '>')
      <|> try (many1 (char '-') <* char ' ' <* char '>')
      <|> (char '-' *> char '-' *> many1 (char '-'))
      )

   -- potentially hazardous for input "a:" or "x:"   when actually divided by something.
   remStatement :: MathLexer String
   remStatement = [] <$
                  ( string "pour"
                <|> string "est"
                <|> try (string "avec")
                <|> string "choici"
                <|> (:[]) <$> oneOf "aAxX"
                  ) <* tabsOrSpaces <* many digit <* tabsOrSpaces <* char ':'


   tabsOrSpaces :: MathLexer String
   tabsOrSpaces = many (char ' ' <|> char '\t')

   remFirstDash :: MathLexer String
   remFirstDash | remDashes = "" <$ char '-'
                | otherwise = return ""

   remDashess :: MathLexer String
   remDashess = "\n" <$ char '\n' <* tabsOrSpaces <* char '-'

  {-              removeLeadingDash :: String -> String
                  removeLeadingDash r = if length (splitRegex (mkRegex "\n[' ']*-[' ']*") r) > 4
                                        then subRegex (mkRegex "\n[' ']*-[' ']*") r "\n"
                                        else r
  -}

-- | Determines for a given string whether it should be parsed using columns
-- Columns can occur in a horizontal or vertical fashion.
-- Horizontal columns means that several expressions on a single line form one solution
-- Vertical columns means that several expressions directly below another form one solution
useColumns :: StringLexerOptions -> String -> Maybe [[String]]
useColumns opts s |  nHzColumns > 0 -- There must be horizontal columns
             && mostLinesWithColumns -- Sometimes a single line contains columns but the rest do not.
             && maxXDimension >= maxYDimension = Just allHzColumns -- Determine whether horizontal or vertical columns are used
             | not (null allVtColumns) -- There must be horizontal columns
             && mostLinesWithColumns
             && maxYDimension /= 0 = Just allVtColumns
             | otherwise = Nothing
  where allHzColumns = rights $ map (parseSimple' opts parseColumns) ls
        nHzColumns = length allHzColumns
        allVtColumns = transpose allHzColumns
        maxXDimension = maximum $ map length allHzColumns
        maxYDimension = maximum $ map length allVtColumns
        -- removes spaces at the beginning and end of all lines
        ls = map trim $ lines s
        -- remove empty lines, lines containing too much NL
        ls' = filter (\a -> not $ containsSpaces a || a == "") ls
        mostLinesWithColumns = length ls' - nHzColumns < nHzColumns

complete' :: MathLexer a -> MathLexer a
complete' p = spaces *> (p <* eof)

parseSimple' :: StringLexerOptions -> MathLexer a -> String -> Either String a
parseSimple' opts p = left show . runParser (complete' p) opts ""

-- | Determines whether the string contains at least one space
containsSpaces :: String -> Bool
containsSpaces s = isRight $ parseSimple (many1 space) s

-- | drop spaces before and after
trim :: String -> String
trim = dropWhileEnd isSpace . dropWhile isSpace

-- | Parses several expressions on a single line
parseColumns :: MathLexer [String]
parseColumns = parseColumnLine
  where
    -- We separate columns on ; ; and a space. These may contain additional spaces on the left and right
    seps :: MathLexer Char
    seps = oneOf ";:"
    spaceAsSep :: MathLexer Char
    spaceAsSep = space <* many1 space
    sepsWithSpaces :: MathLexer Char
    sepsWithSpaces = spaces *> seps <* spaces
    -- Parses two or more expressions separated by one of the above
    parseColumnLine = (:) <$> allowedWords <*> many1 ((try spaceAsSep <|> sepsWithSpaces) *> allowedWords)

-- | Matches on expressions that may contain spaces. It currently does parse "60 60" as a single expression
allowedWords :: MathLexer String
allowedWords = (:) <$> choice [alphaNum, oneOf syms] <*> many (try $ optional realSpace *> choice [alphaNum, oneOf syms])
    where syms ="+-–*/=,.:()[]#"

-- | Lex a space
realSpace :: MathLexer Char
realSpace = char ' '

-- | remove unwanted characters from the end
shorten :: String -> Maybe String
shorten = fmap reverse . rec . reverse
 where
   rec []  = Nothing
   rec (x:xs)
      | isSpace x = rec xs
      | x `elem` "+*<=>-–/:([{^.," = rec xs
   rec s = Just s

-- | Removes enumerations:
--
-- * 1:) ..
-- * 2:) ..
-- * etc
removeEnumerations :: [String] -> [String]
removeEnumerations = rec 1
 where
   rec :: Integer -> [String] -> [String]
   rec _ [] = []
   rec n (x:xs) =
      case (hasNumber n x, hasNumber 1 x) of
         (Just ys, _) -> ys ++ rec (n+1) xs
         (_, Just ys) -> ys ++ rec 2 xs
         _            -> x : rec n xs

   hasNumber :: Integer -> String -> Maybe [String]
   hasNumber n s
      | show n `isPrefixOf` s =
           case dropWhile isSpace (drop (length (show n)) s) of
              sep:rest | sep `elem` ":)" ->
                 Just (maybeToList (shorten (dropWhile isSpace rest)))
              _ -> Nothing
      | otherwise = Nothing

-- addParens takes a string and adds parentheses (and other brackets) to the front and end to fix mismatches
-- If the brackets already match, the same string is returned
-- The function assumes that fixing is possible. So in "(])", it assumes that the first two characters are matching brackets and it returns "((])"
-- detParens is a helper function that determines which strings are added to the front and end
fixParens :: [String] -> [String]
fixParens = map addParens

addParens :: String -> String
addParens xs = before ++ xs ++ after
              where (before, after) = detParens "" "" xs

detParens :: String -> String -> String -> (String, String)
detParens before after [] = (before, after)
detParens before after (x:xs) | x=='(' = detParens before (')' : after) xs
                              | x=='{' = detParens before ('}' : after) xs
                              | x=='[' = detParens before (']' : after) xs
detParens before [] (x:xs) | x==')'    = detParens ('(' : before) [] xs
                           | x=='}'    = detParens ('{' : before) [] xs
                           | x==']'    = detParens ('[' : before) [] xs
                           | otherwise = detParens before [] xs
detParens before (y:ys) (x:xs) | elem x ")}]" = detParens before ys xs
                               | otherwise = detParens before (y:ys) xs

      
-- | Lexes math symbols
pMathList :: MathLexer [String]
pMathList = catMaybes <$>
  many (Just <$> pMath <|> Nothing <$ pRest)

-- | Used to parse anything other than math symbols
pRest :: MathLexer String
pRest = concat <$> many1 (pWord <|> otherSym)

-- | Anything but a math symbol
otherSym :: MathLexer String
otherSym = do
   notFollowedBy mathFirstSym
   return <$> anyChar

-- | Natural language
pWord :: MathLexer String
pWord = try $ do
   notFollowedBy pVar
   notFollowedBy reservedWords
   many1 (letter <|> oneOf "'") -- removed '-' (also math symbol)

-- | Match a given string to input case insensitive
pCaseInsensitive :: String -> MathLexer String
pCaseInsensitive = foldr op (return [])
 where
   op :: Char -> MathLexer String -> MathLexer String
   op x m = (toLower x :) <$ (char (toUpper x) <|> char (toLower x)) <*> m

-- | Lex variables.
--
-- Lexing is done case insensitive and we take into account variable white listing
pVar :: MathLexer String
pVar = getState >>= \opts ->
           choice [ try (pCaseInsensitive s) | s <- variableWhitelist opts]
           <|>
             try (do x <- letter
                     notFollowedBy (
                      (if x `elem` "ab"
                      then (do z <- letter
                               guard (z `notElem` "ab")
                               return [z])
                      else return <$> letter)
                      <|> string "'"
                      )
                     return [x])

-- | Lex some math
pMath :: MathLexer String
pMath = (\x xs -> x ++ concat xs) <$> mathFirstSym <*> many mathSym

reservedWords :: MathLexer String
reservedWords = try (pCaseInsensitive "sqrt" <|> pCaseInsensitive "root")

mathFirstSym :: MathLexer String
mathFirstSym = number
           <|> reservedWords
           <|> pVar
           <|> return <$> satisfy (`elem` "+-–([{")

mathSym :: MathLexer String
mathSym = number
      <|> reservedWords
      <|> pVar
      <|> (return <$> satisfy (`elem` "+*<=>-–/:()[]{}^% ")) -- hier x aan toevoegen?
      <|> (return ' ' <$ try (do _ <- satisfy (`elem` "\r\n")
                                 notFollowedBy (char '-'))
          )

number :: MathLexer String
number = digitalNumber
     <|> (:[]) <$> specialNumber
     <|> try (do _ <- char '.'
                 ds <- many1 digit
                 notFollowedBy (char '.')
                 return ("0."++ds))

digitalNumber :: MathLexer String
digitalNumber = do
   xs <- many1 digit
   ys <- option "" (try ((:) <$> (char '.' <|> char ',') <*> many1 digit))
   return (xs ++ ys)

specialNumber :: MathLexer Char
specialNumber = oneOf "½¼¾"