---------------------------------------------------------
--
-- Module        : Text.Format.Para
-- Copyright     : Kevin Quick
-- License       : BSD3
--
-- Maintainer    : Kevin Quick <quick@sparq.org>
-- Stability     : Unstable
-- Portability   : portable
--
-- Formatting text into paragraphs of specified width.
---------------------------------------------------------

-- | A paragraph formatting utility.  Provided with input text that is
--   arbitrarily split amongst several strings, this utility will
--   reformat the text into paragraphs which do not exceed the
--   specified width.  Paragraphs are delimited by blank lines in the
--   input.
--
--   This function is roughly equivalent to the Unix `fmt` utility.
--
--   Features:
--
--   * An indentation/prefix text may be specified.  This prefix is
--     used on the first paragraph line and determines the standard
--     indentation for all subsequent lines.  If no indentation is
--     specified, the blank indentation of the first line of the first
--     paragraph becomes the default indentation for all paragraphs.
--
--   * Subsequent paragraphs may increase their indentation over the
--     default as determined by the indentation level of their first
--     line.  Indentation values less than that of the primary
--     paragraph are ignored.
--
--   * Paragraph text is reformatted to fit the paragraph layout.
--
--   * Extra whitespace is removed.
--
--   * \"French spacing\" is used: if the current word is capitalized
--     and the previous word ended in a punctuation character, then
--     two spaces are used between the words instead of a single space
--     which is the default elsewhere.
--
--   * Avoids orphan words.  The last line of a paragraph will usually
--     be formatted to contain at least 2 words, pulling from the line
--     above it.
--
--   * Recognizes lists of items, where each item starts with * or -
--     or alphanumeric characters followed by a ) or . character.
--     Uses list-oriented per-item indentation independent of
--     paragraph indentation.

module Text.Format.Para ( formatParas ) where

import Data.Maybe
import Data.List
import Data.Char

-- if listmarker, fst is indent of marker, snd is indent of sublines
list_marker :: String -> Maybe (Int, Int)
list_marker s = let w = words s
                    fi = length $ takeWhile isSpace s
                    isLmark x = let endfirst = if null x' then 'X' else head x'
                                    x' = dropWhile isAlphaNum x
                                    pm = 1 == length x && head x `elem` "*-"
                                    nm = 1 == length x' && endfirst `elem` ")."
                                in pm || nm
                in if null w || not (isLmark $ head w) then Nothing else
                          Just (fi, fi + (length $ head w) + 1)

-- | The 'formatParas' function accepts an arbitrarily-divided list of
--   Strings along with a width and optional indentation/prefix and
--   returns an array of strings representing paragraphs formatted to
--   fit the specified width and indentation.
formatParas :: Int -- ^ Width
           -> Maybe String -- ^ Prefix (defines indent), Nothing means
                           --   indent is taken from first input line
           -> [String] -- ^ Text to format in arbitrarily-divided
                       -- strings.  Blank lines separate paragraphs.
                       -- Paragraphs are indented the same as the
                       -- first line if second argument is Nothing.
           -> [String] -- ^ Formatted text
formatParas w i s =
    let minTextWidth = 10
        -- i' is the actual indent string, either from input or taken from first line
        i' = if isJust i then fromJust i else
                 if null s then "" else takeWhile isSpace $ head s
        -- w' is the actual paragraph text width to format paragraphs to
        w' = max minTextWidth $ w - length i'

        -- Each element of input s could have newlines in it already.  Split those out,
        -- then look for paragraph separators, then mash each para to do len-based line
        -- splitting.  Use the first non-blank line of the first paragraph as the basis
        -- for all other paragraph indentations.

        all_ls = concatMap lines s
        trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace
        parablks = filter (\pb -> length (concatMap words pb) > 0) $
                   groupBy (\_ -> not . is_parasep) all_ls
        is_parasep x = 0 == (length $ trim x) || (isJust $ list_marker x)
        paras = map (\p-> (pindent p, (words $ intercalate " " p))) parablks

        pindent' [] = 0
        pindent' ("":pbs) = pindent' pbs
        pindent' (p:ps) = let ls = takeWhile isSpace p
                        in if ls == p then pindent' ps else length ls
        p1indent [] = 0
        p1indent p = let p1i = length $ takeWhile isSpace $ head p
                     in if p1i < length i' then p1i else p1i - length i'
        firstindent = p1indent $ head parablks
        pindent p = let l = if null p then Nothing else list_marker $ head p
                        f' = maybe (pindent' p) fst l
                        f = max (f' - firstindent) 0
                        s' = maybe f snd l
                    in (f, s')

        -- Now reconstruct paragraphs from the above paras, which is an array of paragraph
        -- entries, each has size of indent in input form and array of words in the
        -- paragraph.  Use "french spacing" (two spaces after a period before a
        -- capitalized word) and don't orphan words.

        paralines' :: [[String]]
        paralines' = map fmtpara paras
        fmtpara (ind,wrds) = let ls = lbreak (w' - fst ind) wrds lm
                                 indent n = (replicate (n + length i') ' ' ++)
                                 lm = isJust $ list_marker $ head wrds
                             in indent (fst ind) (head ls) : map (indent $ snd ind) (tail ls)

        lbreak :: Int -> [String] -> Bool -> [String]
        lbreak _ [] _ = []
        lbreak width wrds lm = let lens = scanl (+) 0 wlen
                                   dots = False : map ((==) '.' . last) wrds
                                   frsp = [ if d && (isUpper $ head e) && not m then 2 else 1
                                            | (e,d,m) <- zip3 wrds dots $ lm : lm : repeat False ]
                                   unwordsFrsp fs ws = concat [ sp ++ wrd
                                                                | (wrd,n) <- zip ws fs,
                                                                let sp = replicate n ' ']
                                   wlen = [ length e + sp | (e,sp) <- zip wrds frsp ]
                                   thisline' = takeWhile ((>=) width . snd) (zip wrds lens)
                                   thisline | 1 == length thisline' = thisline'
                                            | width >= last lens    = thisline'
                                            | (length thisline' == length wrds) &&
                                              (length thisline' > 2) = init $ init thisline'
                                            | otherwise = init thisline'
                               in unwordsFrsp frsp (map fst thisline) :
                                  lbreak width (drop (length thisline) wrds) False
        paralines = let l1 = i' ++ (drop (length i') $ head $ head paralines')
                    in case paralines' of
                      [] -> []
                      []:r -> [i'] : r
                      f:r -> (l1 : tail f) : r

    in intercalate [""] paralines