{-# LANGUAGE OverloadedStrings #-}

-- | Reading of column-formatted text: one token per line, fields separated
-- by whitespace, sentences separated by lines that contain no fields.
module NLP.Sequor.CoNLL
  ( Token
  , Field
  , Label
  , Sentence
  , parse
  , toLabeled
  ) where

import qualified Data.Text.Lazy as Text
import Data.List.Split

-- | @Field@ is a part of a word token, such as word form, lemma or POS tag.
type Field = Text.Text

-- | @Token@ is a representation of a word, which consists of a number of
-- fields.
type Token = [Field]

-- | @Sentence@ is a sequence of tokens.
type Sentence = [Token]

-- | @Label@ is a label associated to a token.
type Label = Text.Text

-- | @parse text@ returns a lazy list of sentences.
--
-- Every line of @text@ is split on whitespace into the fields of one token.
-- A line that yields no fields (empty or whitespace-only) acts as a sentence
-- separator and is not part of any sentence, so every token inside a
-- returned sentence has at least one field.
--
-- Separators are not merged: consecutive separator lines, or a separator
-- line at the very start or end of the input, produce empty sentences, and
-- an input without any lines produces a single empty sentence.
parse :: Text.Text -> [Sentence]
parse = splitWhen null . map Text.words . Text.lines

-- | @toLabeled s@ converts the last field of each token in @s@ to a
-- label and returns a pair whose first element is the sentence (with that
-- last field removed from every token) and the second the corresponding
-- sequence of labels.
--
-- Every token must have at least one field; a token with exactly one field
-- becomes an empty token paired with that field as its label. An empty
-- token has no label to extract, so evaluating its result calls 'error'
-- with a message naming this function.
toLabeled :: Sentence -> (Sentence, [Label])
toLabeled = unzip . map splitLabel
  where
    -- Separate the trailing field of a token from the fields before it.
    splitLabel :: Token -> (Token, Label)
    splitLabel []       = error "NLP.Sequor.CoNLL.toLabeled: empty token"
    splitLabel (f : fs) = go f fs

    -- @go x rest@ walks the token once; @x@ is the field currently held
    -- back, which turns out to be the label if nothing follows it.
    go :: Field -> [Field] -> (Token, Label)
    go lbl []       = ([], lbl)
    go x   (y : ys) =
      let (front, lbl) = go y ys
      in  (x : front, lbl)