--------------------------------------------------------- -- -- Module : Text.Format.Para -- Copyright : Kevin Quick -- License : BSD3 -- -- Maintainer : Kevin Quick <quick@sparq.org> -- Stability : Unstable -- Portability : portable -- -- Formatting text into paragraphs of specified width. --------------------------------------------------------- -- | A paragraph formatting utility. Provided with input text that is -- arbitrarily split amongst several strings, this utility will -- reformat the text into paragraphs which do not exceed the -- specified width. Paragraphs are delimited by blank lines in the -- input. -- -- This function is roughly equivalent to the Unix `fmt` utility. -- -- Features: -- -- * An indentation/prefix text may be specified. This prefix is -- used on the first paragraph line and determines the standard -- indentation for all subsequent lines. If no indentation is -- specified, the blank indentation of the first line of the first -- paragraph becomes the default indentation for all paragraphs. -- -- * Subsequent paragraphs may increase their indentation over the -- default as determined by the indentation level of their first -- line. Indentation values less than that of the primary -- paragraph are ignored. -- -- * Paragraph text is reformatted to fit the paragraph layout. -- -- * Extra whitespace is removed. -- -- * \"French spacing\" is used: if the current word is capitalized -- and the previous word ended in a punctuation character, then -- two spaces are used between the words instead of a single space -- which is the default elsewhere. -- -- * Avoids orphan words. The last line of a paragraph will usually -- be formatted to contain at least 2 words, pulling from the line -- above it. -- -- * Recognizes lists of items, where each item starts with * or - -- or alphanumeric characters followed by a ) or . character. -- Uses list-oriented per-item indentation independent of -- paragraph indentation. module Text.Format.Para ( formatParas ) where import Data.Maybe import Data.List import Data.Char -- if listmarker, fst is indent of marker, snd is indent of sublines list_marker :: String -> Maybe (Int, Int) list_marker s = let w = words s fi = length $ takeWhile isSpace s isLmark x = let endfirst = if null x' then 'X' else head x' x' = dropWhile isAlphaNum x pm = 1 == length x && head x `elem` "*-" nm = 1 == length x' && endfirst `elem` ")." in pm || nm in if null w || not (isLmark $ head w) then Nothing else Just (fi, fi + (length $ head w) + 1) -- | The 'formatParas' function accepts an arbitrarily-divided list of -- Strings along with a width and optional indentation/prefix and -- returns an array of strings representing paragraphs formatted to -- fit the specified width and indentation. formatParas :: Int -- ^ Width -> Maybe String -- ^ Prefix (defines indent), Nothing means -- indent is taken from first input line -> [String] -- ^ Text to format in arbitrarily-divided -- strings. Blank lines separate paragraphs. -- Paragraphs are indented the same as the -- first line if second argument is Nothing. -> [String] -- ^ Formatted text formatParas w i s = let minTextWidth = 10 -- i' is the actual indent string, either from input or taken from first line i' = if isJust i then fromJust i else if null s then "" else takeWhile isSpace $ head s -- w' is the actual paragraph text width to format paragraphs to w' = max minTextWidth $ w - length i' -- Each element of input s could have newlines in it already. Split those out, -- then look for paragraph separators, then mash each para to do len-based line -- splitting. Use the first non-blank line of the first paragraph as the basis -- for all other paragraph indentations. all_ls = concatMap lines s trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace parablks = filter (\pb -> length (concatMap words pb) > 0) $ groupBy (\_ -> not . is_parasep) all_ls is_parasep x = 0 == (length $ trim x) || (isJust $ list_marker x) paras = map (\p-> (pindent p, (words $ intercalate " " p))) parablks pindent' [] = 0 pindent' ("":pbs) = pindent' pbs pindent' (p:ps) = let ls = takeWhile isSpace p in if ls == p then pindent' ps else length ls p1indent [] = 0 p1indent p = let p1i = length $ takeWhile isSpace $ head p in if p1i < length i' then p1i else p1i - length i' firstindent = p1indent $ head parablks pindent p = let l = if null p then Nothing else list_marker $ head p f' = maybe (pindent' p) fst l f = max (f' - firstindent) 0 s' = maybe f snd l in (f, s') -- Now reconstruct paragraphs from the above paras, which is an array of paragraph -- entries, each has size of indent in input form and array of words in the -- paragraph. Use "french spacing" (two spaces after a period before a -- capitalized word) and don't orphan words. paralines' :: [[String]] paralines' = map fmtpara paras fmtpara (ind,wrds) = let ls = lbreak (w' - fst ind) wrds lm indent n = (replicate (n + length i') ' ' ++) lm = isJust $ list_marker $ head wrds in indent (fst ind) (head ls) : map (indent $ snd ind) (tail ls) lbreak :: Int -> [String] -> Bool -> [String] lbreak _ [] _ = [] lbreak width wrds lm = let lens = scanl (+) 0 wlen dots = False : map ((==) '.' . last) wrds frsp = [ if d && (isUpper $ head e) && not m then 2 else 1 | (e,d,m) <- zip3 wrds dots $ lm : lm : repeat False ] unwordsFrsp fs ws = concat [ sp ++ wrd | (wrd,n) <- zip ws fs, let sp = replicate n ' '] wlen = [ length e + sp | (e,sp) <- zip wrds frsp ] thisline' = takeWhile ((>=) width . snd) (zip wrds lens) thisline | 1 == length thisline' = thisline' | width >= last lens = thisline' | (length thisline' == length wrds) && (length thisline' > 2) = init $ init thisline' | otherwise = init thisline' in unwordsFrsp frsp (map fst thisline) : lbreak width (drop (length thisline) wrds) False paralines = let l1 = i' ++ (drop (length i') $ head $ head paralines') in case paralines' of [] -> [] []:r -> [i'] : r f:r -> (l1 : tail f) : r in intercalate [""] paralines