{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} -- | -- -- This module builds on module "Control.Foldl.Text", adding stateful -- transducers and grouping operations. module Control.Foldl.Transduce.Text ( -- * Decoding transducers decoder , utf8 , utf8lenient , utf8strict , decoderE , utf8E -- * Other transducers , newline , stripStart , stripEnd -- * Splitters , words , lines , paragraphs , sections -- * Textual -- $textual , textualSplit , textualBreak ) where import Prelude hiding (lines,words) import Data.Char import Data.Bool import Data.Maybe import Data.List (unfoldr) import Data.Monoid (mempty,(<>)) import Data.Foldable (foldMap,foldl') import qualified Data.ByteString as B import qualified Data.Text import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.Encoding.Error as T import qualified Data.Monoid.Textual as MT import qualified Data.Monoid.Null as MN import Control.Applicative import Control.Monad.Trans.Except import Control.Monad.IO.Class import Control.Exception.Base import qualified Control.Foldl.Transduce as L import qualified Data.List import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.List.NonEmpty as NonEmpty {- $setup >>> :set -XFlexibleContexts >>> import Data.String hiding (lines,words) >>> import Data.Text (Text) >>> import Control.Applicative >>> import Control.Monad.Trans.Except >>> import qualified Control.Foldl as L >>> import Control.Foldl.Transduce -} data Pair a b = Pair !a !b {-| Builds a decoding 'Transducer' out of a stream-oriented decoding function from "Data.Text.Encoding" and an error handler from "Data.Text.Encoding.Error". -} decoder :: (B.ByteString -> T.Decoding) -> T.OnDecodeError -> L.Transducer B.ByteString T.Text () decoder _step onLeftovers = L.Transducer step (Pair mempty _step) done where step (Pair _ next) i = let T.Some txt leftovers next' = next i in (Pair leftovers next',[txt],[]) done (Pair leftovers _) = if B.null leftovers then ((), [], []) else ((), foldMap (pure . T.singleton) onLeftovers',[]) onLeftovers' = onLeftovers "leftovers" Nothing {-| Builds a UTF8-decoding 'Transducer'. Takes an error handler from "Data.Text.Encoding.Error". -} utf8 :: T.OnDecodeError -> L.Transducer B.ByteString T.Text () utf8 onDecodeError = decoder (T.streamDecodeUtf8With onDecodeError) onDecodeError {-| UTF8-decoding 'Transducer' that replaces invalid input bytes with the Unicode replacement character U+FFFD. >>> L.fold (transduce utf8lenient L.list) (map fromString ["decode","this"]) ["decode","this"] >>> L.fold (transduce utf8lenient L.list) (map fromString ["across \xe2","\x98\x83 boundaries"]) ["across ","\9731 boundaries"] >>> L.fold (transduce utf8lenient L.list) (map fromString ["invalid \xc3\x28 sequence"]) ["invalid \65533 sequence"] >>> L.fold (transduce utf8lenient L.list) (map fromString ["incomplete \xe2"]) ["incomplete ","\65533"] -} utf8lenient :: L.Transducer B.ByteString T.Text () utf8lenient = utf8 T.lenientDecode {-| __/BEWARE!/__ This 'Transducer' may throw 'UnicodeException'. __/BEWARE!/__ >>> L.fold (transduce utf8strict L.list) (map fromString ["invalid \xc3\x28 sequence"]) *** Exception: Cannot decode byte '\x28': Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream >>> L.fold (transduce utf8strict L.list) (map fromString ["incomplete \xe2"]) *** Exception: Cannot decode input: leftovers -} utf8strict :: L.Transducer B.ByteString T.Text () utf8strict = utf8 T.strictDecode {-| Similar to 'decoder', but catches 'UnicodeException' in 'IO' and uses 'Control.Monad.Trans.Except' to communicate the error. -} decoderE :: MonadIO m => (T.OnDecodeError -> B.ByteString -> T.Decoding) -> L.TransducerM (ExceptT T.UnicodeException m) B.ByteString T.Text () decoderE next = L.TransducerM step (return (Pair mempty next')) done where step (Pair _ next1) i = do emc <- liftIO . try . evaluate $ next1 i case emc of Left ue -> do throwE ue Right (T.Some txt leftovers next2) -> do return (Pair leftovers next2,[txt],[]) done (Pair leftovers _) = do if B.null leftovers then return ((), [], []) else do emc <- liftIO . try . evaluate $ onLeftovers' case emc of Left ue -> do throwE ue Right mc -> do return ((), foldMap (return . T.singleton) mc,[]) next' = next T.strictDecode onLeftovers' = T.strictDecode "leftovers" Nothing {-| Like 'utf8strict', but catches 'UnicodeException' in 'IO' and uses 'Control.Monad.Trans.Except' to communicate the error. >>> runExceptT $ L.foldM (transduceM utf8E (L.generalize L.list)) (map fromString ["invalid \xc3\x28 sequence"]) Left Cannot decode byte '\x28': Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream >>> runExceptT $ L.foldM (transduceM utf8E (L.generalize L.list)) (map fromString ["incomplete \xe2"]) Left Cannot decode input: leftovers -} utf8E :: MonadIO m => L.TransducerM (ExceptT T.UnicodeException m) B.ByteString T.Text () utf8E = decoderE T.streamDecodeUtf8With {-| Appends a newline at the end of the stream. >>> L.fold (transduce newline L.list) (map T.pack ["without","newline"]) ["without","newline","\n"] -} newline :: L.Transducer T.Text T.Text () newline = L.surround [] ["\n"] blank :: T.Text -> Bool blank = Data.Text.all isSpace {-| Remove leading white space from a stream of 'Text'. >>> L.fold (transduce stripStart L.list) (map T.pack [" ","", " text "]) ["text "] -} stripStart :: L.Transducer T.Text T.Text () stripStart = L.Transducer step False done where step True i = (True, [i],[]) step False i = if blank i then (False,[],[]) else (True, [T.stripStart i],[]) done _ = ((),[],[]) {-| Remove trailing white space from a stream of 'Text'. __/BEWARE!/__ This function naively accumulates in memory any arriving "blank blocks" of text until a non-blank block or end-of-stream arrives, and therefore it is potentially dangerous. Do not use with untrusted inputs. >>> L.fold (transduce stripEnd L.list) (map T.pack [" ", " \n text ", " ", "" , " "]) [" "," \n text"] -} stripEnd :: L.Transducer T.Text T.Text () stripEnd = L.Transducer step [] done where step txts i = if blank i -- dangerous! then (i:txts, [], []) else ([i], reverse txts, []) done txts = case reverse txts of txt : _ -> ((), [T.stripEnd txt], []) _ -> ((), [], []) {-| Splits a stream of text into lines, removing the newlines. >>> L.fold (L.groups lines (surround [T.pack "x"] []) L.list) (map T.pack ["line 1\n line 2\n"]) ["x","line 1","x"," line 2"] >>> L.fold (L.groups lines newline L.list) (map T.pack ["line 1\n line 2\n"]) ["line 1","\n"," line 2","\n"] Used with 'L.transduce', it simply removes newlines: >>> L.fold (L.transduce lines L.list) (map T.pack ["line 1\n line 2\n"]) ["line 1"," line 2"] -} lines :: L.Transducer T.Text T.Text () lines = L.Transducer step False done where step previousnl txt = if Data.Text.null txt then (previousnl,[],[]) else let lastc = Data.Text.last txt == '\n' txts = T.lines txt in case (previousnl,txts) of (_,[]) -> error "never happens" (True,_) -> (lastc, [], map pure txts) (False,t:ts) -> (lastc, [t], map pure ts) done _ = ((),[],[]) data WordsState = NoLastChar | LastCharSpace | LastCharNotSpace {-| Splits a stream of text into words, removing whitespace. >>> L.fold (folds words L.list L.list) (map T.pack [" a","aa ", "bb c","cc dd ","ee f","f"]) [["a","aa"],["bb"],["c","cc"],["dd"],["ee"],["f","f"]] Used with 'L.transduce', it simply removes all whitespace: >>> L.fold (L.transduce words L.list) (map T.pack [" a","aa ", "bb c","cc dd ","ee f","f"]) ["a","aa","bb","c","cc","dd","ee","f","f"] -} words :: L.Transducer T.Text T.Text () words = L.Transducer step NoLastChar done where step tstate txt | Data.Text.null txt = (tstate,[],[]) | blank txt = case tstate of NoLastChar -> (NoLastChar,[],[]) _ -> (LastCharSpace,[],[]) | otherwise = let nextstate = if isSpace (T.last txt) then LastCharSpace else LastCharNotSpace (oldgroup,newgroups) = case (tstate, T.words txt) of (NoLastChar,w:ws) -> ([w],map pure ws) (LastCharSpace,ws) -> ([],map pure ws) (LastCharNotSpace,w:ws) -> if isSpace (T.head txt) then ([],map pure (w:ws)) else ([w],map pure ws) (_,[]) -> error "never happens, txt not blank" in (nextstate,oldgroup,newgroups) done _ = ((),[],[]) data ParagraphsState = SkippingAfterStreamStart | SkippingAfterNewline | SkippingAfterBlankLine | ContinuingNonemptyLine {-| Splits a stream of text into paragraphs, removing empty lines and trimming newspace from the start of each line. >>> map mconcat (L.fold (folds paragraphs L.list L.list) (map T.pack [" \n aaa","\naa ", " \n\nbb\n"])) ["aaa\naa \n","bb\n"] Used with 'L.transduce', it removes empty lines and trims newspace from the start of each line. -} paragraphs :: L.Transducer T.Text T.Text () paragraphs = L.Transducer step SkippingAfterStreamStart done where step tstate txt | Data.Text.null txt = (tstate,[],[]) | otherwise = let (initlines,lastline) = splittedLines txt (tstate', outputsreversed) = advanceLast (foldl' advance (tstate,pure []) initlines) lastline (xs :| xss) = fmap reverse (NonEmpty.reverse outputsreversed) in (tstate',xs,xss) done _ = ((),[],[]) splittedLines :: T.Text -> ([T.Text],T.Text) splittedLines nonEmptyChunk = let splitted = Data.Text.lines nonEmptyChunk ++ if T.last nonEmptyChunk == '\n' then [mempty] else mempty in (init splitted, last splitted) -- unsafe with empty lists!!! advance :: (ParagraphsState, NonEmpty [T.Text]) -> T.Text -> (ParagraphsState, NonEmpty [T.Text]) advance (s,outputs) i = case (s, blank i) of (SkippingAfterStreamStart, True) -> (,) SkippingAfterStreamStart outputs (SkippingAfterStreamStart, False) -> (,) SkippingAfterNewline (continue ["\n",T.stripStart i] outputs) (SkippingAfterNewline, True) -> (,) SkippingAfterBlankLine outputs (SkippingAfterNewline, False) -> (,) SkippingAfterNewline (continue ["\n",T.stripStart i] outputs) (SkippingAfterBlankLine, True) -> (,) SkippingAfterBlankLine outputs (SkippingAfterBlankLine, False) -> (,) SkippingAfterNewline (continue ["\n",T.stripStart i] (NonEmpty.cons [] outputs)) (ContinuingNonemptyLine, _) -> (,) SkippingAfterNewline (continue ["\n",i] outputs) advanceLast :: (ParagraphsState, NonEmpty [T.Text]) -> T.Text -> (ParagraphsState, NonEmpty [T.Text]) advanceLast (s,outputs) i = case (s, blank i) of (SkippingAfterStreamStart, True) -> (,) SkippingAfterStreamStart outputs (SkippingAfterStreamStart, False) -> (,) ContinuingNonemptyLine (continue [T.stripStart i] outputs) (SkippingAfterNewline, True) -> (,) SkippingAfterNewline outputs (SkippingAfterNewline, False) -> (,) ContinuingNonemptyLine (continue [T.stripStart i] outputs) (SkippingAfterBlankLine, True) -> (,) SkippingAfterBlankLine outputs (SkippingAfterBlankLine, False) -> (,) ContinuingNonemptyLine (continue [T.stripStart i] (NonEmpty.cons [] outputs)) (ContinuingNonemptyLine, _) -> (,) ContinuingNonemptyLine (continue [i] outputs) {-| Given a (possibly infinite) list of section headings, split the stream into sections and remove the headings. >>> map mconcat (L.fold (folds (sections (map T.pack ["#1\n","#2\n"])) L.list L.list) (map T.pack [" #1\naa\n#","2\nbb"])) [" ","aa\n","bb"] >>> map mconcat (L.fold (folds (sections (map T.pack ["1234"])) L.list L.list) (map T.pack [" 1","2","x","1","2","3","4","5"])) [" 12x","5"] Used with 'L.transduce', it simply removes all headings. -} sections :: [T.Text] -> L.Transducer T.Text T.Text () sections seps = L.Transducer step (initialstate seps) done where step tstate txt = let (emitted,fmap snd -> states) = Data.List.unzip (unfoldWithState splitTextStep (txt,tstate)) finalState = NonEmpty.last (tstate :| states) continuing :| following = NonEmpty.reverse (fmap Data.List.reverse (foldl' advance ([]:|[]) emitted)) in (finalState, continuing, following) advance :: NonEmpty [x] -> ([x],Bool) -> NonEmpty [x] advance l (e,b) = bool id (separate []) b (continue e l) done Done = ((),[],[]) done (Pending acc _ _) = ((),[acc],[]) initialstate [] = Done initialstate (x:xs) = Pending T.empty x xs continue :: [a] -> NonEmpty [a] -> NonEmpty [a] continue as (as':| rest) = (as ++ as') :| rest separate :: [x] -> NonEmpty [x] -> NonEmpty [x] separate = NonEmpty.cons data SectionsState = Done | Pending T.Text T.Text [T.Text] -- first is the accumulator deriving (Show) {-| >>> splitTextStep (T.pack "x",Done) Just ((["x"],False),("",Done)) >>> splitTextStep (T.pack "aabbcc",Pending T.empty (T.pack "bb") []) Just ((["aa"],True),("cc",Done)) >>> splitTextStep (T.pack "cc",Pending (T.pack "bb") (T.pack "bbcc") [T.pack "nextsep"]) Just (([""],True),("",Pending "" "nextsep" [])) >>> splitTextStep (T.pack "xx",Pending (T.pack "bb") (T.pack "bbcc") []) Just ((["bbxx"],False),("",Pending "" "bbcc" [])) >>> splitTextStep (T.pack "xbb",Pending (T.pack "bbc") (T.pack "bbcccc") []) Just ((["bbcx"],False),("",Pending "bb" "bbcccc" [])) -} splitTextStep :: (T.Text, SectionsState) -> Maybe (([T.Text],Bool), (T.Text, SectionsState)) splitTextStep (txt, _) | T.null txt = Nothing splitTextStep (txt, Done) = Just (([txt],False),(T.empty,Done)) splitTextStep (txt, Pending acc sep nextseps) = Just $ let (before,after) = T.breakOn sep (acc <> txt) in if T.null after then -- not present let (m0,m) = maxintersect before sep in (([m0],False),(T.empty, Pending m sep nextseps)) else -- present let unprefixed = T.drop (T.length sep) after nextstate = case nextseps of [] -> Done z:zs -> Pending T.empty z zs in (([before],True),(unprefixed,nextstate)) maxintersect :: T.Text -> T.Text -> (T.Text,T.Text) maxintersect txt sep = let prefixes = (tail . reverse . tail . T.inits) sep partialmatches = filter (flip T.isSuffixOf txt) prefixes m = maybe T.empty id (listToMaybe partialmatches) in (T.take (T.length txt - T.length m) txt,m) unfoldWithState :: (b -> Maybe (a, b)) -> b -> [(a, b)] unfoldWithState f = unfoldr (fmap (\t@(_, b) -> (t, b)) . f) ------------------------------------------------------------------------------ {- $textual Transducers that work on 'Text' and other text-like types. -} {-| >>> L.fold (folds (textualSplit (=='.')) L.list L.list) [".","bb.bb","c.c."] [[""],["","bb"],["bb","c"],["c"],[""]] -} textualSplit :: MT.TextualMonoid m => (Char -> Bool) -> L.Transducer m m () textualSplit predicate = L.Transducer step () done where step _ txt = case MT.split predicate txt of x:xs -> ((),[x],map (:[]) xs) _ -> error "never happens" done _ = mempty data SplitWhenWhenState = SplitWhenConditionEncountered | SplitWhenConditionPending {-| >>> L.fold (bisect (textualBreak (=='.')) (reify id) ignore L.list) ["aa","bb.bb","cc"] ["aa","bb"] -} textualBreak :: MT.TextualMonoid m => (Char -> Bool) -> L.Transducer m m () textualBreak predicate = L.Transducer step SplitWhenConditionPending done where step SplitWhenConditionPending (MT.break (const False) predicate -> (i0,i1)) = if MN.null i1 then (SplitWhenConditionPending,[i0],[]) else (SplitWhenConditionEncountered,[i0],[[i1]]) step SplitWhenConditionEncountered i = (SplitWhenConditionEncountered,[i],[]) done = mempty