{-# LANGUAGE OverloadedStrings #-} -- | The module provides parsing utilities for the LMF dictionary. module NLP.Polh.LMF.Parse ( readPolh , parsePolh , parseLexEntry ) where import Control.Monad (join) import Data.Maybe (mapMaybe, listToMaybe) import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.IO as L import qualified Text.XML.PolySoup as Soup import Text.XML.PolySoup hiding (XmlParser, Parser, join) import NLP.Polh.Types import Debug.Trace (trace) type Parser a = Soup.XmlParser L.Text a lmfP :: Parser [LexEntry] lmfP = true //> lexEntryP lexEntryP :: Parser LexEntry lexEntryP = tag "LexicalEntry" *> getAttr "id" >^> \lexId' -> collTags >>= \tags -> return $ let with p = tagsParseXml (findAll p) tags in LexEntry { lexId = L.toStrict lexId' , lineRef = listToMaybe $ with lineRefP , status = listToMaybe $ with statusP , pos = with posP , lemma = first "lemmaP" (with lemmaP) , forms = with formP , components = join (with compoP) , syntactic = with synP , senses = with senseP , related = with relP } first :: Show a => String -> [a] -> a first _ [x] = x first src [] = error $ src ++ ": null xs" first src xs = error $ src ++ ": xs == " ++ show xs posP :: Parser T.Text posP = featP "partOfSpeech" lineRefP :: Parser T.Text lineRefP = featP "lineRef" statusP :: Parser T.Text statusP = featP "status" lemmaP :: Parser Lemma lemmaP = Lemma <$> (tag "Lemma" /> reprP) formP :: Parser WordForm formP = WordForm <$> (tag "WordForm" /> reprP) compoP :: Parser [T.Text] compoP = map L.toStrict <$> (tag "ListOfComponents" /> cut (getAttr "entry")) relP :: Parser RelForm relP = tag "RelatedForm" *> getAttr "targets" >^> \relTo' -> do rs <- many reprP return $ RelForm { relRepr = rs , relTo = L.toStrict relTo' } otherP :: Parser () otherP = tagOpenName >^> \name -> warning ("tag " ++ L.unpack name ++ " ignored") ignore warning :: String -> Parser a -> Parser a warning msg x = trace ("WARNING: " ++ msg) x grave :: String -> Parser a -> Parser a grave msg x = trace ("ERROR: " ++ msg) x grave' :: String -> a -> Parser a grave' msg x = grave msg (return x) synP :: Parser SynBehaviour synP = tag "SyntacticBehaviour" *> getAttr "senses" >^> \senses' -> do repr' <- reprBodyP let senseIds = T.words (L.toStrict senses') return (SynBehaviour [repr'] senseIds) data SenseContent = SenseDef Definition | SenseStyle T.Text | SenseCxt Context | SenseOther () senseStyle :: SenseContent -> Maybe T.Text senseStyle (SenseStyle x) = Just x senseStyle _ = Nothing senseDef :: SenseContent -> Maybe Definition senseDef (SenseDef def) = Just def senseDef _ = Nothing senseCxt :: SenseContent -> Maybe Context senseCxt (SenseCxt cxt) = Just cxt senseCxt _ = Nothing senseP :: Parser Sense senseP = tag "Sense" *> maybeAttr "id" >^> \senseId' -> do xs <- many $ oneOf [ SenseDef <$> defP , SenseStyle <$> styleP , SenseCxt <$> cxtP , SenseOther <$> otherP ] let styl' = mapMaybe senseStyle xs let defs' = mapMaybe senseDef xs let cxts' = mapMaybe senseCxt xs return $ Sense { senseId = L.toStrict <$> senseId' , style = styl' , defs = defs' , cxts = cxts' } defP :: Parser Definition defP = Definition <$> (tag "Definition" /> reprP) cxtP :: Parser Context cxtP = Context <$> (tag "Context" /> reprP) styleP :: Parser T.Text styleP = featP "style" reprP :: Parser Repr reprP = tag "FormRepresentation" <|> tag "TextRepresentation" ^> reprBodyP reprBodyP :: Parser Repr reprBodyP = Repr <$> featP "writtenForm" <*> (featP "language" <|> grave' "language not specified" "polh") <*> (optional $ featP "sourceID") featP :: L.Text -> Parser T.Text featP att = L.toStrict <$> cut (tag "feat" *> hasAttr "att" att *> getAttr "val") -- | Read the dictionary from the LMF file. readPolh :: FilePath -> IO Polh readPolh = fmap parsePolh . L.readFile -- | Parse the entire dictionary in the LMF format. parsePolh :: L.Text -> Polh parsePolh = parseXml lmfP -- | Parse the lexical entry LMF representation parseLexEntry :: L.Text -> LexEntry parseLexEntry = parseXml lexEntryP