src/NLP/Extraction/Parsec.hs

{-# LANGUAGE OverloadedStrings, RankNTypes, FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

-- | This is a very simple wrapper around Parsec for writing
-- Information Extraction patterns.
--
-- Because the particular tags/tokens to parse depends on the training
-- corpus (for POS tagging) and the domain, this module only provides
-- basic extractors.  You can, for example, create an extractor to
-- find noun phrases by combining the components provided here:
--
-- @
--   nounPhrase :: Extractor (Text, Tag)
--   nounPhrase = do
--     nlist <- many1 (try (posTok $ Tag \"NN\")
--                 \<|\> try (posTok $ Tag \"DT\")
--                     \<|\> (posTok $ Tag \"JJ\"))
--     let term = T.intercalate " " (map fst nlist)
--     return (term, Tag "n-phr")
-- @
module NLP.Extraction.Parsec

where

-- See this SO q/a for some possibly useful combinators:
--  http://stackoverflow.com/questions/2473615/parsec-3-1-0-with-custom-token-datatype

import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec.String () -- required for the `Stream [t] Identity t` instance.
import Text.Parsec.Prim (lookAhead, token, Parsec, try, Stream(..))
import qualified Text.Parsec.Combinator as PC
import Text.Parsec.Pos  (newPos)

import NLP.Types (TaggedSentence(..), Tag(..), CaseSensitive(..),
                  POS(..), Token(..), ChunkedSentence(..), ChunkOr(..), ChunkTag)

instance (Monad m, Tag t) => Stream (TaggedSentence t) m (POS t) where
  uncons (TaggedSent ts) = do
    mRes <- uncons ts
    case mRes of
      Nothing           -> return $ Nothing
      Just (mTok, rest) -> return $ Just (mTok, TaggedSent rest)
  {-# INLINE uncons #-}

instance (Monad m, ChunkTag c, Tag t) => Stream (ChunkedSentence c t) m (ChunkOr c t) where
  uncons (ChunkedSent ts) = do
    mRes <- uncons ts
    case mRes of
      Nothing           -> return $ Nothing
      Just (mTok, rest) -> return $ Just (mTok, ChunkedSent rest)
  {-# INLINE uncons #-}

-- | A Parsec parser.
--
-- Example usage:
--
-- @
-- > set -XOverloadedStrings
-- > import Text.Parsec.Prim
-- > parse myExtractor "interactive repl" someTaggedSentence
-- @
type Extractor t = Parsec (TaggedSentence t) ()

-- | Consume a token with the given POS Tag
posTok :: Tag t => t -> Extractor t (POS t)
posTok tag = token showTok posFromTok testTok
  where
    showTok      = show
    posFromTok _ = newPos "unknown" 0 0
    testTok tok@(POS t _) = if tag == t then Just tok else Nothing

-- | Consume a token with the specified POS prefix.
--
-- @
-- > parse (posPrefix "n") "ghci" [("Bob", Tag "np")]
-- Right [("Bob", Tag "np")]
-- @
posPrefix :: Tag t => Text -> Extractor t (POS t)
posPrefix str = token showTok posFromTok testTok
  where
    showTok = show
    posFromTok _  = newPos "unknown" 0 0
    testTok tok@(POS t _) = if str `T.isPrefixOf` (tagTerm t) then Just tok else Nothing

-- | Text equality matching with optional case sensitivity.
matches :: CaseSensitive -> Token -> Token -> Bool
matches Sensitive   x y = x == y
matches Insensitive (Token x) (Token y) = (T.toLower x) == (T.toLower y)

-- | Consume a token with the given lexical representation.
txtTok :: Tag t => CaseSensitive -> Token -> Extractor t (POS t)
txtTok sensitive txt = token showTok posFromTok testTok
  where
    showTok = show
    posFromTok _  = newPos "unknown" 0 0
    testTok tok@(POS _ t) | matches sensitive txt t = Just tok
                          | otherwise               = Nothing

-- | Consume any one non-empty token.
anyToken :: Tag t => Extractor t (POS t)
anyToken = token showTok posFromTok testTok
  where
    showTok = show
    posFromTok _ = newPos "unknown" 0 0
    testTok tok@(POS _ txt) | txt == "" = Nothing
                            | otherwise = Just tok

oneOf :: Tag t => CaseSensitive -> [Token] -> Extractor t (POS t)
oneOf sensitive terms = PC.choice (map (\t -> try (txtTok sensitive t)) terms)

-- | Skips any number of fill tokens, ending with the end parser, and
-- returning the last parsed result.
--
-- This is useful when you know what you're looking for and (for
-- instance) don't care what comes first.
followedBy :: Tag t => Extractor t b -> Extractor t a -> Extractor t a
followedBy fill end = do
  _ <- PC.manyTill fill (lookAhead end)
  end