{-# LANGUAGE OverloadedStrings #-}
-- | Helpers for reading part-of-speech tagged corpora out of 'Text'.
module NLP.Corpora.Parsing where

import qualified Data.Text as T
import Data.Text (Text)

import NLP.Types (Tag(..), parseTag, tagUNK, TaggedSentence(..)
                 , POS(..), Token(..))

-- | Read a POS-tagged corpus out of a Text string of the form:
-- "token\/tag token\/tag..."
--
-- Tags are interpreted with 'parseTag'; see 'readPOSWith' for the exact
-- splitting rules.
--
-- The result below is shown schematically as (token, tag) pairs; the value
-- actually returned is a 'TaggedSentence', so the rendered output depends on
-- its 'Show' instance (NOTE(review): confirm before running as a doctest).
--
-- >>> readPOS "Dear/jj Sirs/nns :/: Let/vb"
-- [("Dear",JJ),("Sirs",NNS),(":",Other ":"),("Let",VB)]
--
readPOS :: Tag t => Text -> TaggedSentence t
readPOS = readPOSWith parseTag

-- | Like 'readPOS', but with a caller-supplied function for turning the
-- textual tag into a tag value.
--
-- The input is split into words on whitespace ('T.words').  For each word:
--
-- * If it contains at least one @\/@, it is split at the /last/ @\/@: the
--   text before it becomes the 'Token' and the text after it is handed to
--   the parser.  Consequently @"a\/b\/c"@ yields the token @"a\/b"@ and the
--   tag text @"c"@; a word ending in @\/@ hands the parser the empty text;
--   and a word whose only @\/@ is its first character yields an empty token.
--
-- * Otherwise the whole word becomes the 'Token' and is tagged 'tagUNK'.
readPOSWith :: Tag t => (Text -> t) -> Text -> TaggedSentence t
readPOSWith parser str = TaggedSent $ map toTagged $ T.words str
  where
    toTagged txt =
      -- 'T.breakOnEnd' keeps the separator at the end of the first
      -- component and returns an empty first component when there is no
      -- separator at all.  So stripping the trailing "/" succeeds exactly
      -- when the word contained a "/", which lets one scan both detect the
      -- separator and split on it.  Words produced by 'T.words' carry no
      -- surrounding whitespace, so no further stripping is needed.
      let (prefix, tagStr) = T.breakOnEnd "/" txt
      in case T.stripSuffix "/" prefix of
           Just tok -> POS (parser tagStr) (Token tok)
           Nothing  -> POS tagUNK (Token txt)

-- | Returns all but the last character of a text, unless the text
-- is empty, in which case it returns that text unchanged.
--
-- This is a total variant of 'T.init', which fails on empty input.
safeInit :: Text -> Text
safeInit str
  | T.null str = str  -- constant-time emptiness check (no full length scan)
  | otherwise  = T.init str