{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}

-- | Parsing the Gazetteer for Polish Named Entities (used formerly within
-- the SProUT platform) in the LMF format.

module NLP.Nerf.Dict.PNEG
( parsePNEG
, readPNEG
) where

import Text.XML.PolySoup
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L

import NLP.Nerf.Dict.Base

-- | Top-level parser: collects the entries produced by 'lexEntryP'.
-- NOTE(review): @true ##>@ presumably applies 'lexEntryP' throughout the
-- whole XML tree and concatenates the results -- confirm against the
-- PolySoup documentation.
lmfP :: XmlParser L.Text [Entry]
lmfP = true ##> lexEntryP

-- | Parser for a single @LexicalEntry@ element.
--
-- Inside the element it first skips @feat@ elements, then reads the word
-- forms with 'wordP', and finally reads the sense with 'senseP'.  One
-- 'Entry' is produced per word form; all of them share the same sense.
lexEntryP :: XmlParser L.Text [Entry]
lexEntryP = tag "LexicalEntry" `joinR` do
    many_ $ cut $ tag "feat"
    _words <- many wordP
    sense <- senseP
    return [Entry x sense | x <- _words]

-- | Parser for a @Lemma@ or @WordForm@ element; yields the value of the
-- first @writtenForm@ feature found for it.
--
-- Raises a descriptive error (instead of the anonymous @Prelude.head@
-- failure) when the element carries no such feature.
wordP :: XmlParser L.Text Form
wordP = firstOrError "no \"writtenForm\" feat found in Lemma/WordForm"
    <$> (tag "Lemma" <|> tag "WordForm" /> featP "writtenForm")

-- | Parser for a @Sense@ element; yields the value of the first
-- @externalReference@ or @label@ feature found for it.
--
-- Raises a descriptive error (instead of the anonymous @Prelude.head@
-- failure) when no such feature is found.
senseP :: XmlParser L.Text NeType
senseP = firstOrError "no \"externalReference\" or \"label\" feat found in Sense"
    <$> (tag "Sense" //> featP "externalReference" <|> featP "label")

-- | Parser for a @feat@ element whose @att@ attribute equals the given
-- name; yields the element's @val@ attribute converted to strict text.
featP :: L.Text -> XmlParser L.Text T.Text
featP x = L.toStrict <$> cut (tag "feat" *> hasAttr "att" x *> getAttr "val")

-- | First element of a list.  On an empty list, abort with an error that
-- names this module and the construct that was expected, so that a
-- malformed dictionary is easy to diagnose.  The function stays pure, so
-- mapping it over a parser keeps the laziness of the plain 'head' version.
firstOrError :: String -> [a] -> a
firstOrError _    (x : _) = x
firstOrError what []      = error ("NLP.Nerf.Dict.PNEG: " ++ what)

-- | Parse the dictionary to the list of entries.
parsePNEG :: L.Text -> [Entry]
parsePNEG = parseXml lmfP

-- | Read the dictionary from the file.
readPNEG :: FilePath -> IO [Entry]
readPNEG = fmap parsePNEG . L.readFile