{-# LANGUAGE CPP #-} -- |This module provides functionality for tokenizing text streams to -- differentiate between printed characters and structural elements -- such as newlines. Once tokenized, such text streams can be -- manipulated with the functions in this module. module Text.Trans.Tokenize ( TextStream(..) , TextStreamEntity(..) , Token(..) -- * To and from strings , tokenize , serialize -- * Inspection , tokenLen , entityToken , streamEntities -- * Manipulation , truncateLine , wrapStream , findLines #ifdef TESTING , isWhitespace , partitions #endif ) where import Data.List ( inits ) -- |The type of text tokens. These should consist of printable -- characters and NOT presentation characters (e.g., newlines). Each -- type of token should have as its contents a string of characters -- all of the same type. Tokens are generalized over an attribute -- type which can be used to annotate each token. data Token a = S { tokenStr :: String -- ^The token's string. , tokenAttr :: a -- ^The token's attribute. } -- ^Non-whitespace tokens. | WS { tokenStr :: String -- ^The token's string. , tokenAttr :: a -- ^The token's attribute. } -- ^Whitespace tokens. -- |A text stream entity is either a token or a structural element. data TextStreamEntity a = T (Token a) -- ^Constructor for ordinary tokens. | NL -- ^Newline. -- |A text stream is a list of text stream entities. A text stream -- |combines structural elements of the text (e.g., newlines) with the -- |text itself (words, whitespace, etc.). data TextStream a = TS [TextStreamEntity a] instance (Show a) => Show (TextStream a) where show (TS ts) = "TS " ++ show ts instance (Show a) => Show (TextStreamEntity a) where show (T t) = "T " ++ show t show NL = "NL" instance (Show a) => Show (Token a) where show (S s a) = "S " ++ show s ++ " " ++ show a show (WS s a) = "WS " ++ show s ++ " " ++ show a instance (Eq a) => Eq (Token a) where a == b = (tokenStr a) == (tokenStr b) && (tokenAttr a) == (tokenAttr b) instance (Eq a) => Eq (TextStreamEntity a) where NL == NL = True T a == T b = a == b _ == _ = False instance (Eq a) => Eq (TextStream a) where (TS as) == (TS bs) = as == bs -- |Get the entities in a stream. streamEntities :: TextStream a -> [TextStreamEntity a] streamEntities (TS es) = es -- |Get the length of a token's string. tokenLen :: Token a -> Int tokenLen (S s _) = length s tokenLen (WS s _) = length s wsChars :: [Char] wsChars = [' ', '\t'] isWs :: Char -> Bool isWs = (`elem` wsChars) isNL :: TextStreamEntity a -> Bool isNL NL = True isNL _ = False -- |Gets a 'Token' from an entity or raises an exception if the entity -- does not contain a token. Used primarily for convenience -- transformations in which the parameter is known to be a token -- entity. entityToken :: TextStreamEntity a -> Token a entityToken (T t) = t entityToken _ = error "Cannot get token from non-token entity" isWhitespace :: Token a -> Bool isWhitespace (WS _ _) = True isWhitespace _ = False isWsEnt :: TextStreamEntity a -> Bool isWsEnt (T (WS _ _)) = True isWsEnt _ = False -- |Given a text stream, serialize the stream to its original textual -- representation. This discards token attribute metadata. serialize :: TextStream a -> String serialize (TS es) = concat $ map serializeEntity es where serializeEntity NL = "\n" serializeEntity (T (WS s _)) = s serializeEntity (T (S s _)) = s -- |Tokenize a string and apply a default attribute to every token in -- the resulting text stream. tokenize :: String -> a -> TextStream a tokenize s def = TS $ findEntities s where findEntities [] = [] findEntities str@(c:_) = nextEntity : findEntities (drop nextLen str) where (nextEntity, nextLen) = if isWs c then (T (WS nextWs def), length nextWs) else if c == '\n' then (NL, 1) else (T (S nextStr def), length nextStr) nextWs = takeWhile isWs str nextStr = takeWhile (\ch -> not $ ch `elem` ('\n':wsChars)) str -- |Given a list of tokens, truncate the list so that its underlying -- string representation does not exceed the specified column width. truncateLine :: Int -> [Token a] -> [Token a] truncateLine l _ | l < 0 = error $ "truncateLine cannot truncate at length = " ++ show l truncateLine _ [] = [] truncateLine width ts = -- If we are returning all tokens, we didn't have to do any -- truncation. But if we *did* have to truncate, return exactly -- 'width' characters' worth of tokens by constructing a new final -- token with the same attribute data. -- -- If there are no passing cases (i.e., remaining is null), just -- return 'width' characters of the first token. if null remaining then [first_tok { tokenStr = take width $ tokenStr first_tok }] else if length tokens == length ts then tokens else if null $ tokenStr lastToken then tokens else tokens ++ [lastToken] where lengths = map (length . tokenStr) ts cases = reverse $ inits lengths remaining = dropWhile ((> width) . sum) cases tokens = take (length $ head remaining) ts truncLength = sum $ head remaining first_tok = ts !! 0 last_tok = ts !! (length tokens) lastToken = last_tok { tokenStr = take (width - truncLength) $ tokenStr last_tok } -- |Given a text stream and a wrapping width, return a new -- 'TextStream' with newlines inserted in appropriate places to wrap -- the text at the specified column. This function results in text -- wrapped without leading or trailing whitespace on wrapped lines, -- although it preserves leading whitespace in the text which was not -- the cause of the wrapping transformation. wrapStream :: (Eq a) => Int -> TextStream a -> TextStream a wrapStream width (TS stream) = TS $ reverse $ dropWhile (== NL) $ reverse $ wrapAll' 0 stream where wrapAll' :: Int -> [TextStreamEntity a] -> [TextStreamEntity a] wrapAll' _ [] = [] wrapAll' _ (NL:rest) = NL : wrapAll' 0 rest wrapAll' accum (T t:rest) = if (length $ tokenStr t) + accum > width then if isWhitespace t then [NL] ++ wrapAll' 0 (dropWhile isWsEnt rest) else if accum == 0 && ((length $ tokenStr t) >= width) then [T t, NL] ++ wrapAll' 0 (dropWhile isWsEnt rest) else [NL, T t] ++ wrapAll' (length $ tokenStr t) rest else T t : wrapAll' (accum + (length $ tokenStr t)) rest partitions :: (a -> Bool) -> [a] -> [[a]] partitions _ [] = [] partitions f as = p : partitions f (drop (length p + 1) as) where p = takeWhile f as -- |Given a list of text stream entities, split up the list wherever -- newlines occur. Returns a list of lines of entities, such that all -- entities wrap tokens and none are newlines. (Safe for use with -- 'entityToken'.) findLines :: [TextStreamEntity a] -> [[TextStreamEntity a]] findLines = partitions (not . isNL)