module NLP.Types.Tree where
import Prelude hiding (print)
import Control.Applicative ((<$>), (<*>))
import Data.String (IsString(..))
import Data.Text (Text)
import qualified Data.Text as T
import Test.QuickCheck (Arbitrary(..), listOf, elements, NonEmptyList(..))
import Test.QuickCheck.Instances ()
import NLP.Types.Tags
data Sentence = Sent [Token]
deriving (Read, Show, Eq)
instance Arbitrary Sentence where
arbitrary = Sent <$> arbitrary
tokens :: Sentence -> [Token]
tokens (Sent ts) = ts
applyTags :: Tag t => Sentence -> [t] -> TaggedSentence t
applyTags (Sent ts) tags = TaggedSent $ zipWith POS tags ts
data ChunkedSentence chunk tag = ChunkedSent [ChunkOr chunk tag]
deriving (Read, Show, Eq)
data ChunkOr chunk tag = Chunk_CN (Chunk chunk tag)
| POS_CN (POS tag)
deriving (Read, Show, Eq)
data Chunk chunk tag = Chunk chunk [ChunkOr chunk tag]
deriving (Read, Show, Eq)
showChunkedSent :: (ChunkTag c, Tag t) => ChunkedSentence c t -> Text
showChunkedSent (ChunkedSent cs) = T.intercalate " " (map showChunkOr cs)
where
showChunkOr (POS_CN pos) = printPOS pos
showChunkOr (Chunk_CN (Chunk chunk cors)) =
let front = T.concat ["[", fromChunk chunk]
back = "]"
bits = map showChunkOr cors
in T.append (T.intercalate " " (front:bits)) back
instance (ChunkTag c, Arbitrary c, Arbitrary t, Tag t) =>
Arbitrary (ChunkedSentence c t) where
arbitrary = ChunkedSent <$> arbitrary
data TaggedSentence tag = TaggedSent [POS tag]
deriving (Read, Show, Eq)
instance (Arbitrary t, Tag t) => Arbitrary (TaggedSentence t) where
arbitrary = TaggedSent <$> arbitrary
printTS :: Tag t => TaggedSentence t -> Text
printTS (TaggedSent ts) = T.intercalate " " $ map printPOS ts
stripTags :: Tag t => TaggedSentence t -> Sentence
stripTags ts = fst $ unzipTags ts
unzipTags :: Tag t => TaggedSentence t -> (Sentence, [t])
unzipTags (TaggedSent ts) =
let (tags, toks) = unzip $ map topair ts
topair (POS tag tok) = (tag, tok)
in (Sent toks, tags)
unzipChunks :: (ChunkTag c, Tag t) => ChunkedSentence c t -> (TaggedSentence t, [c])
unzipChunks (ChunkedSent cs) = (TaggedSent poss, chunks)
where
(poss, chunks) = unzip (concatMap f cs)
f (POS_CN postag) = [(postag, notChunk)]
f (Chunk_CN (Chunk chTag subTree)) = map (updateChunk chTag) (concatMap f subTree)
updateChunk chunk (ptag, oldChunk) | oldChunk == notChunk = (ptag, chunk)
| otherwise = (ptag, oldChunk)
combine :: Tag t => [TaggedSentence t] -> [TaggedSentence t] -> [TaggedSentence t]
combine xs ys = zipWith combineSentences xs ys
combineSentences :: Tag t => TaggedSentence t -> TaggedSentence t -> TaggedSentence t
combineSentences (TaggedSent xs) (TaggedSent ys) = TaggedSent $ zipWith pickTag xs ys
pickTag :: Tag t => POS t -> POS t -> POS t
pickTag a@(POS t1 txt1) b@(POS t2 txt2)
| txt1 /= txt2 = error ("Text does not match: "++ show a ++ " " ++ show b)
| t1 /= tagUNK = POS t1 txt1
| otherwise = POS t2 txt1
instance (ChunkTag c, Arbitrary c, Arbitrary t, Tag t) => Arbitrary (ChunkOr c t) where
arbitrary = elements =<< do
chunk <- mkChunk <$> arbitrary <*> listOf arbitrary
chink <- mkChink <$> arbitrary <*> arbitrary
return [chunk, chink]
mkChunk :: (ChunkTag chunk, Tag tag) => chunk -> [ChunkOr chunk tag] -> ChunkOr chunk tag
mkChunk chunk children = Chunk_CN (Chunk chunk children)
mkChink :: (ChunkTag chunk, Tag tag) => tag -> Token -> ChunkOr chunk tag
mkChink tag token = POS_CN (POS tag token)
instance (ChunkTag c, Arbitrary c, Arbitrary t, Tag t) => Arbitrary (Chunk c t) where
arbitrary = Chunk <$> arbitrary <*> arbitrary
data POS tag = POS { posTag :: tag
, posToken :: Token
} deriving (Read, Show, Eq)
instance (Arbitrary t, Tag t) => Arbitrary (POS t) where
arbitrary = POS <$> arbitrary <*> arbitrary
showPOStok :: Tag tag => POS tag -> Text
showPOStok (POS _ (Token txt)) = txt
showPOStag :: Tag tag => POS tag -> Text
showPOStag = tagTerm . posTag
printPOS :: Tag tag => POS tag -> Text
printPOS (POS tag (Token txt)) = T.intercalate "" [txt, "/", tagTerm tag]
data Token = Token Text
deriving (Read, Show, Eq)
instance Arbitrary Token where
arbitrary = do NonEmpty txt <- arbitrary
return $ Token (T.pack txt)
instance IsString Token where
fromString = Token . T.pack
showTok :: Token -> Text
showTok (Token txt) = txt
suffix :: Token -> Text
suffix (Token str) | T.length str <= 3 = str
| otherwise = T.drop (T.length str 3) str
unTS :: Tag t => TaggedSentence t -> [POS t]
unTS (TaggedSent ts) = ts
tsLength :: Tag t => TaggedSentence t -> Int
tsLength (TaggedSent ts) = length ts
tsConcat :: Tag t => [TaggedSentence t] -> TaggedSentence t
tsConcat tss = TaggedSent (concatMap unTS tss)
contains :: Tag t => TaggedSentence t -> Text -> Bool
contains (TaggedSent ts) tok = any (posTokMatches tok) ts
containsTag :: Tag t => TaggedSentence t -> t -> Bool
containsTag (TaggedSent ts) tag = any (posTagMatches tag) ts
posTagMatches :: Tag t => t -> POS t -> Bool
posTagMatches t1 (POS t2 _) = t1 == t2
posTokMatches :: Tag t => Text -> POS t -> Bool
posTokMatches txt (POS _ tok) = tokenMatches txt tok
tokenMatches :: Text -> Token -> Bool
tokenMatches txt (Token tok) = txt == tok