module Recognize.Parsing.MathLexer (extract, useColumns, parseColumns, parseSimple, allowedWords, trim) where
import Control.Applicative (empty)
import Control.Arrow
import Control.Monad
import Data.Char
import Data.Either
import Data.List
import Data.Maybe
import Ideas.Utils.Parsing
import Recognize.Data.StringLexerOptions
import Text.Parsec (Parsec)
type MathLexer = Parsec String StringLexerOptions
extract :: StringLexerOptions -> String -> ([String], Bool)
extract opts txt =
(case parseSimple' opts pMathList $ (unlines . removeEnumerations . lines) phase2 of
Left msg -> error $ "Recognize.MathMathLexer.extract: " ++ txt ++ "\n" ++ msg
Right xs -> mapMaybe shorten xs, isJust columnized)
where
phase1 = maybe txt (concatMap unlines) columnized
phase2 = case parseSimple' opts (preParser bRemDashes (replaceXByMultiplication opts)) phase1 of
(Left _ ) -> phase1
(Right r) -> r
bRemDashes = (>3) . length . filter (isPrefixOf "-" . dropWhile (==' ')) . lines $ phase1
columnized = useColumns opts txt
preParser :: Bool -> Bool -> MathLexer String
preParser remDashes repAllX = let f = (++) <$> preParse <*> f
<|> [] <$ eof
in (++) <$> remFirstDash <*> f
where
preParse :: MathLexer String
preParse =
try repX
<|> try remStatement
<|> try remGraphicalSymbols
<|> repPercentage
<|> (if remDashes then remDashess else empty)
<|> (:[]) <$> anyChar
repPercentage :: MathLexer String
repPercentage = "/100" <$ char '%'
repX :: MathLexer String
repX | repAllX = "*" <$ oneOf "xX"
| otherwise = (\a b -> a : "*" ++ [b]) <$> (alphaNum <|> char ')') <* tabsOrSpaces <* oneOf "xX" <* tabsOrSpaces <*> (alphaNum <|> char '(')
remGraphicalSymbols :: MathLexer String
remGraphicalSymbols = [' '] <$ (
try (many1 (char '-') <* char '>')
<|> try (many1 (char '-') <* char ' ' <* char '>')
<|> (char '-' *> char '-' *> many1 (char '-'))
)
remStatement :: MathLexer String
remStatement = [] <$
( string "pour"
<|> string "est"
<|> try (string "avec")
<|> string "choici"
<|> (:[]) <$> oneOf "aAxX"
) <* tabsOrSpaces <* many digit <* tabsOrSpaces <* char ':'
tabsOrSpaces :: MathLexer String
tabsOrSpaces = many (char ' ' <|> char '\t')
remFirstDash :: MathLexer String
remFirstDash | remDashes = "" <$ char '-'
| otherwise = return ""
remDashess :: MathLexer String
remDashess = "\n" <$ char '\n' <* tabsOrSpaces <* char '-'
useColumns :: StringLexerOptions -> String -> Maybe [[String]]
useColumns opts s | nHzColumns > 0
&& mostLinesWithColumns
&& maxXDimension >= maxYDimension = Just allHzColumns
| not (null allVtColumns)
&& mostLinesWithColumns
&& maxYDimension /= 0 = Just allVtColumns
| otherwise = Nothing
where allHzColumns = rights $ map (parseSimple' opts parseColumns) ls
nHzColumns = length allHzColumns
allVtColumns = transpose allHzColumns
maxXDimension = maximum $ map length allHzColumns
maxYDimension = maximum $ map length allVtColumns
ls = map trim $ lines s
ls' = filter (\a -> not $ containsSpaces a || a == "") ls
mostLinesWithColumns = length ls' - nHzColumns < nHzColumns
complete' :: MathLexer a -> MathLexer a
complete' p = spaces *> (p <* eof)
parseSimple' :: StringLexerOptions -> MathLexer a -> String -> Either String a
parseSimple' opts p = left show . runParser (complete' p) opts ""
containsSpaces :: String -> Bool
containsSpaces s = isRight $ parseSimple (many1 space) s
trim :: String -> String
trim = dropWhileEnd isSpace . dropWhile isSpace
parseColumns :: MathLexer [String]
parseColumns = parseColumnLine
where
seps :: MathLexer Char
seps = oneOf ";:"
spaceAsSep :: MathLexer Char
spaceAsSep = space <* many1 space
sepsWithSpaces :: MathLexer Char
sepsWithSpaces = spaces *> seps <* spaces
parseColumnLine = (:) <$> allowedWords <*> many1 ((try spaceAsSep <|> sepsWithSpaces) *> allowedWords)
allowedWords :: MathLexer String
allowedWords = (:) <$> choice [alphaNum, oneOf syms] <*> many (try $ optional realSpace *> choice [alphaNum, oneOf syms])
where syms ="+-–*/=,.:()[]#"
realSpace :: MathLexer Char
realSpace = char ' '
shorten :: String -> Maybe String
shorten = fmap reverse . rec . reverse
where
rec [] = Nothing
rec (x:xs)
| isSpace x = rec xs
| x `elem` "+*<=>-–/:([{^.," = rec xs
rec s = Just s
removeEnumerations :: [String] -> [String]
removeEnumerations = rec 1
where
rec :: Integer -> [String] -> [String]
rec _ [] = []
rec n (x:xs) =
case (hasNumber n x, hasNumber 1 x) of
(Just ys, _) -> ys ++ rec (n+1) xs
(_, Just ys) -> ys ++ rec 2 xs
_ -> x : rec n xs
hasNumber :: Integer -> String -> Maybe [String]
hasNumber n s
| show n `isPrefixOf` s =
case dropWhile isSpace (drop (length (show n)) s) of
sep:rest | sep `elem` ":)" ->
Just (maybeToList (shorten (dropWhile isSpace rest)))
_ -> Nothing
| otherwise = Nothing
fixParens :: [String] -> [String]
fixParens = map addParens
addParens :: String -> String
addParens xs = before ++ xs ++ after
where (before, after) = detParens "" "" xs
detParens :: String -> String -> String -> (String, String)
detParens before after [] = (before, after)
detParens before after (x:xs) | x=='(' = detParens before (')' : after) xs
| x=='{' = detParens before ('}' : after) xs
| x=='[' = detParens before (']' : after) xs
detParens before [] (x:xs) | x==')' = detParens ('(' : before) [] xs
| x=='}' = detParens ('{' : before) [] xs
| x==']' = detParens ('[' : before) [] xs
| otherwise = detParens before [] xs
detParens before (y:ys) (x:xs) | elem x ")}]" = detParens before ys xs
| otherwise = detParens before (y:ys) xs
pMathList :: MathLexer [String]
pMathList = catMaybes <$>
many (Just <$> pMath <|> Nothing <$ pRest)
pRest :: MathLexer String
pRest = concat <$> many1 (pWord <|> otherSym)
otherSym :: MathLexer String
otherSym = do
notFollowedBy mathFirstSym
return <$> anyChar
pWord :: MathLexer String
pWord = try $ do
notFollowedBy pVar
notFollowedBy reservedWords
many1 (letter <|> oneOf "'")
pCaseInsensitive :: String -> MathLexer String
pCaseInsensitive = foldr op (return [])
where
op :: Char -> MathLexer String -> MathLexer String
op x m = (toLower x :) <$ (char (toUpper x) <|> char (toLower x)) <*> m
pVar :: MathLexer String
pVar = getState >>= \opts ->
choice [ try (pCaseInsensitive s) | s <- variableWhitelist opts]
<|>
try (do x <- letter
notFollowedBy (
(if x `elem` "ab"
then (do z <- letter
guard (z `notElem` "ab")
return [z])
else return <$> letter)
<|> string "'"
)
return [x])
pMath :: MathLexer String
pMath = (\x xs -> x ++ concat xs) <$> mathFirstSym <*> many mathSym
reservedWords :: MathLexer String
reservedWords = try (pCaseInsensitive "sqrt" <|> pCaseInsensitive "root")
mathFirstSym :: MathLexer String
mathFirstSym = number
<|> reservedWords
<|> pVar
<|> return <$> satisfy (`elem` "+-–([{")
mathSym :: MathLexer String
mathSym = number
<|> reservedWords
<|> pVar
<|> (return <$> satisfy (`elem` "+*<=>-–/:()[]{}^% "))
<|> (return ' ' <$ try (do _ <- satisfy (`elem` "\r\n")
notFollowedBy (char '-'))
)
number :: MathLexer String
number = digitalNumber
<|> (:[]) <$> specialNumber
<|> try (do _ <- char '.'
ds <- many1 digit
notFollowedBy (char '.')
return ("0."++ds))
digitalNumber :: MathLexer String
digitalNumber = do
xs <- many1 digit
ys <- option "" (try ((:) <$> (char '.' <|> char ',') <*> many1 digit))
return (xs ++ ys)
specialNumber :: MathLexer Char
specialNumber = oneOf "½¼¾"