module NLP.Types.Tags
where
import GHC.Generics

import Data.Serialize (Serialize, get, put)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8, decodeUtf8', encodeUtf8)
import Test.QuickCheck (Arbitrary(..), NonEmptyList(..))
import Test.QuickCheck.Instances ()

import NLP.Types.General (Error)
-- | Interface shared by chunk tag types.
--
-- Every instance must also be comparable, readable, showable, 'Generic'
-- and 'Serialize'-able, as required by the superclass context.
class ( Eq a
      , Ord a
      , Read a
      , Show a
      , Generic a
      , Serialize a
      ) => ChunkTag a where
  -- | Convert a chunk tag to its 'Text' form.
  fromChunk :: a -> Text

  -- | Read a chunk tag from 'Text', returning an 'Error' in 'Left' when the
  -- instance rejects the input.
  parseChunk :: Text -> Either Error a

  -- | A distinguished chunk value. Going by the name, this is presumably
  -- the tag for tokens that are not part of any chunk (the 'RawChunk'
  -- instance uses @\"O\"@) -- TODO confirm against callers.
  notChunk :: a
-- | Interface shared by token tag types.
--
-- Every instance must also be comparable, readable, showable, 'Generic'
-- and 'Serialize'-able, as required by the superclass context.
class ( Eq a
      , Ord a
      , Read a
      , Show a
      , Generic a
      , Serialize a
      ) => Tag a where
  -- | Convert a tag to its 'Text' form.
  fromTag :: a -> Text

  -- | Build a tag from 'Text'. The type leaves no room for a parse
  -- failure, so instances have to map every input to some tag.
  parseTag :: Text -> a

  -- | A distinguished tag value; by its name, presumably the tag used for
  -- unknown input -- TODO confirm against callers.
  tagUNK :: a

  -- | A 'Text' rendering of the tag. How it is meant to differ from
  -- 'fromTag' is not visible here (the 'RawTag' instance returns the same
  -- text for both).
  tagTerm :: a -> Text

  -- | A distinguished tag value; by its name, presumably a
  -- start-of-sequence marker -- TODO confirm against callers.
  startTag :: a

  -- | A distinguished tag value; by its name, presumably an
  -- end-of-sequence marker -- TODO confirm against callers.
  endTag :: a

  -- | Predicate on tags; the 'RawTag' instance answers 'True' exactly for
  -- the tag whose text is @\"DT\"@.
  isDt :: a -> Bool
-- | A chunk tag stored as plain, uninterpreted 'Text'.
newtype RawChunk = RawChunk Text
  deriving (Eq, Ord, Show, Read, Generic)

-- | No methods are given, so the class's default implementations are used
-- (presumably the 'Generic'-based ones, given the derived 'Generic'
-- instance above).
instance Serialize RawChunk
-- | 'RawChunk' accepts any text as a chunk tag, so 'parseChunk' always
-- succeeds with 'Right'.
instance ChunkTag RawChunk where
  -- Unwrap the underlying text.
  fromChunk (RawChunk label) = label

  -- Wrap the input unchanged; there is no failure case.
  parseChunk = Right . RawChunk

  notChunk = RawChunk "O"
-- | A tag stored as plain, uninterpreted 'Text'.
newtype RawTag = RawTag Text
  deriving (Eq, Ord, Show, Read, Generic)

-- | No methods are given, so the class's default implementations are used
-- (presumably the 'Generic'-based ones, given the derived 'Generic'
-- instance above).
instance Serialize RawTag
-- | 'RawTag' treats any text as a tag: parsing wraps the input unchanged
-- and both 'fromTag' and 'tagTerm' return the wrapped text.
instance Tag RawTag where
  -- Unwrap the underlying text.
  fromTag (RawTag label) = label

  -- Wrap the input unchanged.
  parseTag = RawTag

  tagUNK = RawTag "Unk"

  -- Same text as 'fromTag' for this type.
  tagTerm = fromTag

  startTag = RawTag "-START-"

  endTag = RawTag "-END-"

  -- The derived 'Eq' compares the wrapped text, so this holds exactly when
  -- the tag's text is "DT".
  isDt = (== RawTag "DT")
-- | Generates tags whose text is never empty: a 'NonEmptyList' of
-- characters is drawn, packed into 'Text' and wrapped in 'RawTag'.
instance Arbitrary RawTag where
  arbitrary = fmap (RawTag . T.pack . getNonEmpty) arbitrary
-- | 'Serialize' instance for strict 'Text', stored as its UTF-8 bytes.
--
-- 'put' UTF-8 encodes the text and writes the resulting 'ByteString' with
-- that type's own 'put'. 'get' reads a 'ByteString' back and decodes it.
--
-- Malformed UTF-8 is reported as an ordinary 'Get' failure (via 'fail'),
-- so callers decoding untrusted bytes receive an error result rather than
-- an impure 'UnicodeException' thrown from pure code.
instance Serialize Text where
  put = put . encodeUtf8

  get = do
    bytes <- get
    -- 'decodeUtf8'' returns the decoding error in 'Left' instead of
    -- throwing, which lets us surface it through the parser.
    either (fail . show) return (decodeUtf8' bytes)