{-# LANGUAGE OverloadedStrings #-}
module NLP.Sequor.CoNLL
    ( Token
    , Field
    , Label
    , Sentence
    , parse
    , toLabeled 
    )
where

import qualified Data.Text.Lazy as Text  
import Data.List.Split 

-- | @Field@ is a single component of a token, for example the word
-- form, the lemma or the POS tag.
type Field = Text.Text

-- | @Token@ represents one word as the list of its fields.
type Token = [Field]

-- | @Sentence@ is the list of tokens that make up a sentence.
type Sentence = [Token]

-- | @Label@ is the label attached to a token.
type Label = Text.Text

  
-- | @parse text@ returns a lazy list of sentences.
--
-- The input is cut into lines and every line is split on whitespace
-- into the fields of one token. A line that yields no fields (empty
-- or whitespace only) separates sentences. Runs of such lines, as
-- well as leading and trailing ones, never produce an empty
-- sentence; in particular, empty input gives @[]@.
--
-- >>> parse "a X\nb Y\n\n\nc Z\n"
-- [[["a","X"],["b","Y"]],[["c","Z"]]]
parse :: Text.Text -> [Sentence]
parse =
      -- 'wordsBy' rather than 'splitWhen': it drops the empty groups
      -- that adjacent, leading or trailing separators would create.
      wordsBy null
    . map Text.words
    . Text.lines

-- | @toLabeled s@ converts the last field of each token in @s@ to a
-- label and returns a pair whose first element is the sentence
-- (each token without its last field) and the second the
-- corresponding sequence of labels.
--
-- A token with a single field becomes an empty token whose label is
-- that field. A token with no fields at all carries no label; forcing
-- its entry in the result raises an error naming this function.
-- Sentences produced by 'parse' never contain such a token, since
-- lines without fields act as sentence separators there.
toLabeled :: Sentence -> (Sentence, [Label])
toLabeled = unzip . map splitLabel
  where
    -- Separate the final field from the ones before it in a single
    -- traversal of the token.
    splitLabel []       = error "NLP.Sequor.CoNLL.toLabeled: token has no fields"
    splitLabel [l]      = ([], l)
    splitLabel (f : fs) = let (fields, l) = splitLabel fs
                          in  (f : fields, l)