module NLP.Extraction.Parsec
where
import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec.String ()
import Text.Parsec.Prim (lookAhead, token, Parsec, try)
import qualified Text.Parsec.Combinator as PC
import Text.Parsec.Pos (newPos)
import NLP.Types (TaggedSentence, Tag(..), CaseSensitive(..))
-- | A Parsec parser that reads a 'TaggedSentence' as its input stream and
-- carries no user state (the state type is @()@).
type Extractor = Parsec TaggedSentence ()
-- | Accept a single token whose tag equals the given tag.
--
-- On success the matched @(text, tag)@ pair is returned unchanged; a token
-- carrying any other tag makes the parser fail.
posTok :: Tag -> Extractor (Text, Tag)
posTok tag = token render position accept
  where
    -- Tokens are rendered by their tag when Parsec needs to display them.
    render (_, found) = show found
    -- Every token reports the same fixed placeholder source position.
    position (_, _) = newPos "unknown" 0 0
    accept pair@(_, found)
      | tag == found = Just pair
      | otherwise = Nothing
-- | Accept a single token whose tag text starts with the given prefix.
--
-- On success the matched @(text, tag)@ pair is returned unchanged; otherwise
-- the parser fails.
posPrefix :: Text -> Extractor (Text, Tag)
posPrefix str = token render position accept
  where
    -- Tokens are rendered by their tag when Parsec needs to display them.
    render (_, found) = show found
    -- Every token reports the same fixed placeholder source position.
    position (_, _) = newPos "unknown" 0 0
    accept pair@(_, Tag label)
      | str `T.isPrefixOf` label = Just pair
      | otherwise = Nothing
-- | Compare two texts for equality under the requested case sensitivity.
--
-- * 'Sensitive': plain equality.
-- * 'Insensitive': equality after Unicode case folding. Case folding (rather
--   than lower-casing) is used because it is the conversion intended for
--   caseless comparison; it also equates strings such as @"ß"@ and @"SS"@
--   that lower-casing alone keeps distinct.
matches :: CaseSensitive -> Text -> Text -> Bool
matches Sensitive x y = x == y
matches Insensitive x y = T.toCaseFold x == T.toCaseFold y
-- | Accept a single token whose text matches the given text, using 'matches'
-- with the supplied case sensitivity.
--
-- On success the matched @(text, tag)@ pair is returned unchanged; otherwise
-- the parser fails.
txtTok :: CaseSensitive -> Text -> Extractor (Text, Tag)
txtTok sensitive txt = token render position accept
  where
    -- Tokens are rendered by their text when Parsec needs to display them.
    render (word, _) = show word
    -- Every token reports the same fixed placeholder source position.
    position (_, _) = newPos "unknown" 0 0
    accept pair@(word, _) =
      if matches sensitive txt word
        then Just pair
        else Nothing
-- | Accept any single token whose text is non-empty.
--
-- On success the @(text, tag)@ pair is returned unchanged; a token with empty
-- text makes the parser fail.
anyToken :: Extractor (Text, Tag)
anyToken = token showTok posFromTok testTok
  where
    -- Tokens are rendered by their text when Parsec needs to display them.
    showTok (txt, _) = show txt
    -- Every token reports the same fixed placeholder source position.
    posFromTok (_, _) = newPos "unknown" 0 0
    -- 'T.null' is used instead of comparing against a string literal so the
    -- check does not depend on the OverloadedStrings extension.
    testTok tok@(txt, _)
      | T.null txt = Nothing
      | otherwise = Just tok
-- | Accept a single token whose text matches any one of the given terms.
--
-- One 'txtTok' parser is built per term (each wrapped in 'try') and the
-- alternatives are handed to 'PC.choice' in the order the terms are listed.
oneOf :: CaseSensitive -> [Text] -> Extractor (Text, Tag)
oneOf sensitive terms = PC.choice alternatives
  where
    alternatives = [try (txtTok sensitive term) | term <- terms]
-- | Skip over input with @fill@ until @end@ matches, then parse @end@ and
-- return its result. The results of @fill@ are discarded.
--
-- @end@ is first probed with 'lookAhead' so that it can be parsed for real
-- afterwards. The probe is wrapped in 'try' because 'lookAhead' still
-- consumes input when its argument fails partway through a match; without
-- 'try', such a partial match of @end@ would stop 'PC.manyTill' from falling
-- back to @fill@ and the whole parser would fail instead of skipping ahead.
followedBy :: Extractor b -> Extractor a -> Extractor a
followedBy fill end = do
  _ <- PC.manyTill fill (try (lookAhead end))
  end