module Text.Trans.Tokenize
( tokenize
, serialize
, withAnnotation
, truncLine
, isWhitespace
, wrapLine
#ifdef TESTING
, Token(..)
#else
, Token
, tokenString
, tokenAnnotation
#endif
)
where
import Data.List
( inits
, intercalate
)
data Token a = Whitespace { tokenString :: String
, tokenAnnotation :: a
}
| Token { tokenString :: String
, tokenAnnotation :: a
}
deriving (Eq)
instance (Show a) => Show (Token a) where
show (Whitespace s _) = "{" ++ s ++ "}"
show (Token s _) = "<" ++ s ++ ">"
splitWith :: (Eq a) => [a] -> (a -> Bool) -> [[a]]
splitWith [] _ = []
splitWith es f = if null rest
then [first]
else if length rest == 1 && (f $ head rest)
then first : [[]]
else first : splitWith (tail rest) f
where
(first, rest) = break f es
wsChars :: [Char]
wsChars = [' ', '\t']
isWs :: Char -> Bool
isWs = (`elem` wsChars)
tokenize :: String -> a -> [[Token a]]
tokenize [] _ = [[]]
tokenize s def = map (tokenize' def) $ splitWith s (== '\n')
tokenize' :: a -> String -> [Token a]
tokenize' _ [] = []
tokenize' a s@(c:_) | isWs c = Whitespace ws a : tokenize' a rest
where
(ws, rest) = break (not . isWs) s
tokenize' a s = Token t a : tokenize' a rest
where
(t, rest) = break (\c -> isWs c || c == '\n') s
serialize :: [[Token a]] -> String
serialize ls = intercalate "\n" $ map (concatMap tokenString) ls
withAnnotation :: Token a -> a -> Token a
withAnnotation (Whitespace s _) b = Whitespace s b
withAnnotation (Token s _) b = Token s b
isWhitespace :: Token a -> Bool
isWhitespace (Whitespace _ _) = True
isWhitespace _ = False
truncLine :: Int -> [Token a] -> [Token a]
truncLine width ts =
if length tokens == length ts
then tokens
else tokens ++ [lastToken]
where
lengths = map (length . tokenString) ts
cases = reverse $ inits lengths
remaining = dropWhile ((> width) . sum) cases
tokens = take (length $ head remaining) ts
truncLength = sum $ head remaining
lastTokenBasis = ts !! (length tokens)
lastToken = lastTokenBasis {
tokenString = take (width truncLength) (tokenString lastTokenBasis)
}
wrapLine :: Int -> [Token a] -> [[Token a]]
wrapLine _ [] = []
wrapLine width ts =
if null passing
then [ts]
else if null these
then if length those' == 1
then [those']
else [head those'] : (wrapLine width $ tail those')
else these : wrapLine width those
where
lengths = map (length . tokenString) ts
cases = reverse $ inits lengths
passing = dropWhile (\c -> sum c > width) cases
numTokens = length $ head passing
(these, those') = splitAt numTokens ts
those = dropWhile isWhitespace those'