{-# LANGUAGE OverloadedStrings #-}

-- | Turns whitespace-tokenised corpus text into lists of 'ListZipper's
-- over 'Token's, with or without a trailing label field on each line.
module CorpusReader
  ( Token
  , corpus
  , corpusLabeled
  , fromWords
  ) where

import Helper.ListZipper
import Helper.Utils (splitWith)
import qualified Helper.Text as Text
import Helper.Text (Txt)
import Data.Maybe (isJust)

-- | The fields of a single corpus line.
type Token = [Txt]

-- | Read an unlabeled corpus.
--
-- The input is cut into lines with 'Text.lines', each line into fields with
-- 'Text.words', and the resulting rows are grouped by @'splitWith' null@
-- (presumably: rows without any field separate the groups -- confirm against
-- "Helper.Utils"). Only the first @len@ fields of every row are kept, and
-- each group of rows is handed to 'toZippers'.
corpus :: Int -> Txt -> [[ListZipper Token]]
corpus len = map groupZippers . splitWith null . map Text.words . Text.lines
  where
    -- Truncate every row to @len@ fields, then build the zippers for the group.
    groupZippers rows = toZippers (map (parseFields . take len) rows)

-- | Read a labeled corpus.
--
-- Rows are obtained and grouped exactly as in 'corpus', but without any
-- truncation. The last field of every row is taken as its label; the
-- remaining fields form the 'Token'. Each group yields the zippers built
-- from its tokens paired with the list of its labels, in row order.
corpusLabeled :: Txt -> [([ListZipper Token], [Txt])]
corpusLabeled = map labelGroup . splitWith null . map Text.words . Text.lines
  where
    labelGroup rows = (toZippers tokens, labels)
      where
        (tokens, labels) = unzip (map parseFieldsLabeled rows)

-- | Wrap every word in a one-field 'Token' and pass the list to 'toZippers'.
fromWords :: [Txt] -> [ListZipper Token]
fromWords = toZippers . map (: [])

-- | Split a row into its leading fields and its final field.
--
-- Partial: 'init' and 'last' fail on an empty row, so callers must only pass
-- rows that contain at least one field.
parseFieldsLabeled :: [Txt] -> (Token, Txt)
parseFieldsLabeled fields = (leading, final)
  where
    leading = init fields
    final = last fields

-- | A row of fields already is a 'Token'; returned unchanged.
parseFields :: [Txt] -> Token
parseFields = id