----------------------------------------------------------------------------- -- Copyright 2019, Advise-Me project team. This file is distributed under -- the terms of the Apache License 2.0. For more information, see the files -- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution. ----------------------------------------------------------------------------- -- | -- Maintainer : bastiaan.heeren@ou.nl -- Stability : provisional -- Portability : portable (depends on ghc) -- -- This module defines the lexing (1st) phase of the assessment pipeline. -- The only function you should need is `extract`. -- Note that we don't do true lexing: we don't convert symbols into tokens. -- Instead, we mostly tend to remove text/symbols and do some conversion. -- ----------------------------------------------------------------------------- module Recognize.Parsing.MathLexer (extract, useColumns, parseColumns, parseSimple, allowedWords, trim) where import Control.Applicative (empty) import Control.Arrow import Control.Monad import Data.Char import Data.Either import Data.List import Data.Maybe import Ideas.Utils.Parsing import Recognize.Data.StringLexerOptions import Text.Parsec (Parsec) type MathLexer = Parsec String StringLexerOptions -- | This is the lexer function. -- -- It takes some options for string lexing, the string to lex and return a list of lines and a boolean denoting columnization. extract :: StringLexerOptions -> String -> ([String], Bool) extract opts txt = -- (fixParens $ case parseSimple' opts pMathList $ (unlines . removeEnumerations . lines) phase2 of (case parseSimple' opts pMathList $ (unlines . removeEnumerations . lines) phase2 of Left msg -> error $ "Recognize.MathMathLexer.extract: " ++ txt ++ "\n" ++ msg Right xs -> mapMaybe shorten xs, isJust columnized) -- what is the purpose of shorten here where phase1 = maybe txt (concatMap unlines) columnized phase2 = case parseSimple' opts (preParser bRemDashes (replaceXByMultiplication opts)) phase1 of (Left _ ) -> phase1 (Right r) -> r bRemDashes = (>3) . length . filter (isPrefixOf "-" . dropWhile (==' ')) . lines $ phase1 columnized = useColumns opts txt -- | Attempts to remove or convert certain symbols preParser :: Bool -> Bool -> MathLexer String preParser remDashes repAllX = let f = (++) <$> preParse <*> f <|> [] <$ eof in (++) <$> remFirstDash <*> f where preParse :: MathLexer String preParse = try repX <|> try remStatement <|> try remGraphicalSymbols <|> repPercentage <|> (if remDashes then remDashess else empty) <|> (:[]) <$> anyChar -- | Converts x% to x/100 repPercentage :: MathLexer String repPercentage = "/100" <$ char '%' repX :: MathLexer String repX | repAllX = "*" <$ oneOf "xX" | otherwise = (\a b -> a : "*" ++ [b]) <$> (alphaNum <|> char ')') <* tabsOrSpaces <* oneOf "xX" <* tabsOrSpaces <*> (alphaNum <|> char '(') -- | Removes '->','-- >', '--' -- -- Each graphical symbol may have more '-' characters remGraphicalSymbols :: MathLexer String remGraphicalSymbols = [' '] <$ ( try (many1 (char '-') <* char '>') <|> try (many1 (char '-') <* char ' ' <* char '>') <|> (char '-' *> char '-' *> many1 (char '-')) ) -- potentially hazardous for input "a:" or "x:" when actually divided by something. remStatement :: MathLexer String remStatement = [] <$ ( string "pour" <|> string "est" <|> try (string "avec") <|> string "choici" <|> (:[]) <$> oneOf "aAxX" ) <* tabsOrSpaces <* many digit <* tabsOrSpaces <* char ':' tabsOrSpaces :: MathLexer String tabsOrSpaces = many (char ' ' <|> char '\t') remFirstDash :: MathLexer String remFirstDash | remDashes = "" <$ char '-' | otherwise = return "" remDashess :: MathLexer String remDashess = "\n" <$ char '\n' <* tabsOrSpaces <* char '-' {- removeLeadingDash :: String -> String removeLeadingDash r = if length (splitRegex (mkRegex "\n[' ']*-[' ']*") r) > 4 then subRegex (mkRegex "\n[' ']*-[' ']*") r "\n" else r -} -- | Determines for a given string whether it should be parsed using columns -- Columns can occur in a horizontal or vertical fashion. -- Horizontal columns means that several expressions on a single line form one solution -- Vertical columns means that several expressions directly below another form one solution useColumns :: StringLexerOptions -> String -> Maybe [[String]] useColumns opts s | nHzColumns > 0 -- There must be horizontal columns && mostLinesWithColumns -- Sometimes a single line contains columns but the rest do not. && maxXDimension >= maxYDimension = Just allHzColumns -- Determine whether horizontal or vertical columns are used | not (null allVtColumns) -- There must be horizontal columns && mostLinesWithColumns && maxYDimension /= 0 = Just allVtColumns | otherwise = Nothing where allHzColumns = rights $ map (parseSimple' opts parseColumns) ls nHzColumns = length allHzColumns allVtColumns = transpose allHzColumns maxXDimension = maximum $ map length allHzColumns maxYDimension = maximum $ map length allVtColumns -- removes spaces at the beginning and end of all lines ls = map trim $ lines s -- remove empty lines, lines containing too much NL ls' = filter (\a -> not $ containsSpaces a || a == "") ls mostLinesWithColumns = length ls' - nHzColumns < nHzColumns complete' :: MathLexer a -> MathLexer a complete' p = spaces *> (p <* eof) parseSimple' :: StringLexerOptions -> MathLexer a -> String -> Either String a parseSimple' opts p = left show . runParser (complete' p) opts "" -- | Determines whether the string contains at least one space containsSpaces :: String -> Bool containsSpaces s = isRight $ parseSimple (many1 space) s -- | drop spaces before and after trim :: String -> String trim = dropWhileEnd isSpace . dropWhile isSpace -- | Parses several expressions on a single line parseColumns :: MathLexer [String] parseColumns = parseColumnLine where -- We separate columns on ; ; and a space. These may contain additional spaces on the left and right seps :: MathLexer Char seps = oneOf ";:" spaceAsSep :: MathLexer Char spaceAsSep = space <* many1 space sepsWithSpaces :: MathLexer Char sepsWithSpaces = spaces *> seps <* spaces -- Parses two or more expressions separated by one of the above parseColumnLine = (:) <$> allowedWords <*> many1 ((try spaceAsSep <|> sepsWithSpaces) *> allowedWords) -- | Matches on expressions that may contain spaces. It currently does parse "60 60" as a single expression allowedWords :: MathLexer String allowedWords = (:) <$> choice [alphaNum, oneOf syms] <*> many (try $ optional realSpace *> choice [alphaNum, oneOf syms]) where syms ="+-–*/=,.:()[]#" -- | Lex a space realSpace :: MathLexer Char realSpace = char ' ' -- | remove unwanted characters from the end shorten :: String -> Maybe String shorten = fmap reverse . rec . reverse where rec [] = Nothing rec (x:xs) | isSpace x = rec xs | x `elem` "+*<=>-–/:([{^.," = rec xs rec s = Just s -- | Removes enumerations: -- -- * 1:) .. -- * 2:) .. -- * etc removeEnumerations :: [String] -> [String] removeEnumerations = rec 1 where rec :: Integer -> [String] -> [String] rec _ [] = [] rec n (x:xs) = case (hasNumber n x, hasNumber 1 x) of (Just ys, _) -> ys ++ rec (n+1) xs (_, Just ys) -> ys ++ rec 2 xs _ -> x : rec n xs hasNumber :: Integer -> String -> Maybe [String] hasNumber n s | show n `isPrefixOf` s = case dropWhile isSpace (drop (length (show n)) s) of sep:rest | sep `elem` ":)" -> Just (maybeToList (shorten (dropWhile isSpace rest))) _ -> Nothing | otherwise = Nothing -- addParens takes a string and adds parentheses (and other brackets) to the front and end to fix mismatches -- If the brackets already match, the same string is returned -- The function assumes that fixing is possible. So in "(])", it assumes that the first two characters are matching brackets and it returns "((])" -- detParens is a helper function that determines which strings are added to the front and end fixParens :: [String] -> [String] fixParens = map addParens addParens :: String -> String addParens xs = before ++ xs ++ after where (before, after) = detParens "" "" xs detParens :: String -> String -> String -> (String, String) detParens before after [] = (before, after) detParens before after (x:xs) | x=='(' = detParens before (')' : after) xs | x=='{' = detParens before ('}' : after) xs | x=='[' = detParens before (']' : after) xs detParens before [] (x:xs) | x==')' = detParens ('(' : before) [] xs | x=='}' = detParens ('{' : before) [] xs | x==']' = detParens ('[' : before) [] xs | otherwise = detParens before [] xs detParens before (y:ys) (x:xs) | elem x ")}]" = detParens before ys xs | otherwise = detParens before (y:ys) xs -- | Lexes math symbols pMathList :: MathLexer [String] pMathList = catMaybes <$> many (Just <$> pMath <|> Nothing <$ pRest) -- | Used to parse anything other than math symbols pRest :: MathLexer String pRest = concat <$> many1 (pWord <|> otherSym) -- | Anything but a math symbol otherSym :: MathLexer String otherSym = do notFollowedBy mathFirstSym return <$> anyChar -- | Natural language pWord :: MathLexer String pWord = try $ do notFollowedBy pVar notFollowedBy reservedWords many1 (letter <|> oneOf "'") -- removed '-' (also math symbol) -- | Match a given string to input case insensitive pCaseInsensitive :: String -> MathLexer String pCaseInsensitive = foldr op (return []) where op :: Char -> MathLexer String -> MathLexer String op x m = (toLower x :) <$ (char (toUpper x) <|> char (toLower x)) <*> m -- | Lex variables. -- -- Lexing is done case insensitive and we take into account variable white listing pVar :: MathLexer String pVar = getState >>= \opts -> choice [ try (pCaseInsensitive s) | s <- variableWhitelist opts] <|> try (do x <- letter notFollowedBy ( (if x `elem` "ab" then (do z <- letter guard (z `notElem` "ab") return [z]) else return <$> letter) <|> string "'" ) return [x]) -- | Lex some math pMath :: MathLexer String pMath = (\x xs -> x ++ concat xs) <$> mathFirstSym <*> many mathSym reservedWords :: MathLexer String reservedWords = try (pCaseInsensitive "sqrt" <|> pCaseInsensitive "root") mathFirstSym :: MathLexer String mathFirstSym = number <|> reservedWords <|> pVar <|> return <$> satisfy (`elem` "+-–([{") mathSym :: MathLexer String mathSym = number <|> reservedWords <|> pVar <|> (return <$> satisfy (`elem` "+*<=>-–/:()[]{}^% ")) -- hier x aan toevoegen? <|> (return ' ' <$ try (do _ <- satisfy (`elem` "\r\n") notFollowedBy (char '-')) ) number :: MathLexer String number = digitalNumber <|> (:[]) <$> specialNumber <|> try (do _ <- char '.' ds <- many1 digit notFollowedBy (char '.') return ("0."++ds)) digitalNumber :: MathLexer String digitalNumber = do xs <- many1 digit ys <- option "" (try ((:) <$> (char '.' <|> char ',') <*> many1 digit)) return (xs ++ ys) specialNumber :: MathLexer Char specialNumber = oneOf "½¼¾"