module NLP.Extraction.Parsec
where
import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec.String ()
import Text.Parsec.Prim (lookAhead, token, Parsec, try, Stream(..))
import qualified Text.Parsec.Combinator as PC
import Text.Parsec.Pos (newPos)
import NLP.Types (TaggedSentence(..), Tag(..), CaseSensitive(..),
POS(..), Token(..), ChunkedSentence(..), ChunkOr(..), ChunkTag)
instance (Monad m, Tag t) => Stream (TaggedSentence t) m (POS t) where
uncons (TaggedSent ts) = do
mRes <- uncons ts
case mRes of
Nothing -> return $ Nothing
Just (mTok, rest) -> return $ Just (mTok, TaggedSent rest)
instance (Monad m, ChunkTag c, Tag t) => Stream (ChunkedSentence c t) m (ChunkOr c t) where
uncons (ChunkedSent ts) = do
mRes <- uncons ts
case mRes of
Nothing -> return $ Nothing
Just (mTok, rest) -> return $ Just (mTok, ChunkedSent rest)
type Extractor t = Parsec (TaggedSentence t) ()
posTok :: Tag t => t -> Extractor t (POS t)
posTok tag = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS t _) = if tag == t then Just tok else Nothing
posPrefix :: Tag t => Text -> Extractor t (POS t)
posPrefix str = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS t _) = if str `T.isPrefixOf` (tagTerm t) then Just tok else Nothing
matches :: CaseSensitive -> Token -> Token -> Bool
matches Sensitive x y = x == y
matches Insensitive (Token x) (Token y) = (T.toLower x) == (T.toLower y)
txtTok :: Tag t => CaseSensitive -> Token -> Extractor t (POS t)
txtTok sensitive txt = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS _ t) | matches sensitive txt t = Just tok
| otherwise = Nothing
anyToken :: Tag t => Extractor t (POS t)
anyToken = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS _ txt) | txt == "" = Nothing
| otherwise = Just tok
oneOf :: Tag t => CaseSensitive -> [Token] -> Extractor t (POS t)
oneOf sensitive terms = PC.choice (map (\t -> try (txtTok sensitive t)) terms)
followedBy :: Tag t => Extractor t b -> Extractor t a -> Extractor t a
followedBy fill end = do
_ <- PC.manyTill fill (lookAhead end)
end