-- This file is part of tersmu -- Copyright (C) 2014 Martin Bays -- -- This program is free software: you can redistribute it and/or modify -- it under the terms of version 3 of the GNU General Public License as -- published by the Free Software Foundation. -- -- You should have received a copy of the GNU General Public License -- along with this program. If not, see http://www.gnu.org/licenses/. module ParseText (parseText) where import Lojban import Pappy.Pos import Pappy.Parse import JboSyntax import Control.Applicative -- import Debug.Trace -- traceIt x = traceShow x x parseText :: String -> Either Int Text parseText str = nudgeFrees (lojbanterminatedText . lojbanParse "terminatedText") $ str ++ " %%%END%%%" afterPos :: Pos -> String -> String afterPos p s = drop (posCol p - 1) s -- |nudgeFrees: move indicators and frees backwards until they're in one of -- the prescribed positions where our main grammar can parse them (basically: -- after a term, or at the start of a sentence or similar construct). -- Works as follows: try to parse; if fail, try to parse a free at fail point, -- recursively nudging frees to do so; if find such a misplaced free, move it -- back a word and try parsing again. -- Rather a hack, obviously, but it seems to work! -- -- FIXME: MAI isn't really handled, since the parse failure can come at the -- MAI rather than the start of the free, so e.g. "za'u re'u re mai broda" -- fails. Similarly {le zarci e re mai le zdani} -- -- FIXME: {li ci xi pa su'i vo} - nudging the xi clause back has it pick up -- the {ci}, resulting in a parse error. Can be fixed by bracketing our free -- before nudging it. -- -- FIXME: more seriously, this appraoch doesn't actually work - the parse -- error caused by a misplaced free often isn't at the start of the free, for -- various reasons. Probably we should consider the current algorithm "good -- enough for present purposes", but plan to move to piggy-backing on a full -- camxes implementation. For example, we could literally depend on the -- standard camxes java implementation, parse its output to find frees and -- move them into canonical positions, then put them through this parser - -- with the hacky preparsing implemented in this file as a fallback when we -- don't have access to camxes. -- -- Note: probably makes more sense to have our canonical free positions be -- after QAtom and TanruUnit and at (sub)sentence start, rather than riding on -- Term as at present. -- -- XXX: possible source of parse bugs: misplaced free can interrupt crucial -- greed. e.g. "tag sumti / tag KU?" acts buggily on "vi ue ta", -- and has to be "tag sumti / tag !free KU". I think I've got all those now, -- but may be wrong. -- -- TODO: optimise; currently we're doing string manipulation with String, -- which is a bad idea, and more to the point we're reparsing the whole text -- repeatedly. Chunking the text into paragraphs would be a start. -- nudgeFrees :: (String -> Result LojbanDerivs a) -> String -> Either Int a nudgeFrees parse s = fmap fst $ nudgeFrees' False parse s where nudgeFrees' :: Bool -> (String -> Result LojbanDerivs a) -> String -> Either Int (a,Int) nudgeFrees' inFree parse str = case parse str of Parsed a d _ -> Right (a, parsedPoint d) NoParse e -> let pos = errorPoint e (head,tail) = splitAt pos str in if inFree && pos == 0 then Left 0 else case nudgeFrees' True parseFree tail of Left n -> Left $ pos + n Right (_,flen) -> nudgeFrees' inFree parse $ nudgeFree head tail flen errorPoint e = posCol (errorPos e) - 1 parsedPoint d = posCol (dvPos d) - 1 parseFree = lojbanfree . lojbanParse "free" nudgeFree head tail flen = let ws = words head (headws',headws'') = splitAt (length ws - 1) ws (free,tail') = splitAt flen tail in unwords headws' ++ (if null headws' then "" else " ") ++ free ++ unwords headws'' ++ " " ++ tail' {- -- Direct parsing without any preprocessing - currently unused parseAText :: String -> Either Int [Statement] parseAText str = case lojbanatext1 (lojbanParse "atext1" (str ++ " ")) of Parsed p _ _ -> Right p NoParse e -> Left (posCol (errorPos e)) -- |parseTextSplitting: split text into statements, strip indicators&free from each, -- and parse the stripped statements. Unused. -- Doesn't handle fragments, nor the 'text' production in full detail. parseTextSplitting :: String -> [ Either (String,Int) (Statement,[Free]) ] parseTextSplitting = parseText' . stripTextHead . (++" ") where parseText' str = case parseStatement str of Left n -> [Left (str,n)] Right (parsed,tail) -> if null tail then [Right parsed] else Right parsed:parseText' tail parseStatement :: String -> Either Int ((Statement,[Free]),String) parseStatement = stripFrees $ lojbanwholeStatement . lojbanParse "wholeStatement" stripTextHead :: String -> String stripTextHead str = let Parsed _ d _ = lojbantextHead . lojbanParse "textHead" $ str in afterPos (dvPos d) str -}