{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.Pandoc.Citeproc.Locator
  ( parseLocator )
where
import Citeproc.Types
import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec
import Text.Pandoc.Definition
import Text.Pandoc.Parsing (romanNumeral)
import Text.Pandoc.Shared (stringify)
import Control.Monad (mzero)
import qualified Data.Map as M
import Data.Char (isSpace, isPunctuation, isDigit)

parseLocator :: Locale -> [Inline] -> (Maybe (Text, Text), [Inline])
parseLocator locale inp =
  case parse (pLocatorWords (toLocatorMap locale)) "suffix" $ splitInp inp of
       Right r -> r
       Left _  -> (Nothing, inp)

splitInp :: [Inline] -> [Inline]
splitInp = splitStrWhen (\c -> isSpace c || (isPunctuation c && c /= ':'))

--
-- Locator parsing
--

type LocatorParser = Parsec [Inline] ()

pLocatorWords :: LocatorMap
              -> LocatorParser (Maybe (Text, Text), [Inline])
pLocatorWords locMap = do
  optional $ pMatchChar "," (== ',')
  optional pSpace
  (la, lo) <- pLocatorDelimited locMap <|> pLocatorIntegrated locMap
  s <- getInput -- rest is suffix
  -- need to trim, otherwise "p. 9" and "9" will have 'different' locators later on
  -- i.e. the first one will be " 9"
  return $
    if T.null la && T.null lo
       then (Nothing, s)
       else (Just (la, T.strip lo), s)

pLocatorDelimited :: LocatorMap -> LocatorParser (Text, Text)
pLocatorDelimited locMap = try $ do
  _ <- pMatchChar "{" (== '{')
  skipMany pSpace -- gobble pre-spaces so label doesn't try to include them
  (la, _) <- pLocatorLabelDelimited locMap
  -- we only care about balancing {} and [] (because of the outer [] scope);
  -- the rest can be anything
  let inner = do { t <- anyToken; return (True, stringify t) }
  gs <- many (pBalancedBraces [('{','}'), ('[',']')] inner)
  _ <- pMatchChar "}" (== '}')
  let lo = T.concat $ map snd gs
  return (la, lo)

pLocatorLabelDelimited :: LocatorMap -> LocatorParser (Text, Bool)
pLocatorLabelDelimited locMap
  = pLocatorLabel' locMap lim <|> return ("page", True)
    where
        lim = stringify <$> anyToken

pLocatorIntegrated :: LocatorMap -> LocatorParser (Text, Text)
pLocatorIntegrated locMap = try $ do
  (la, wasImplicit) <- pLocatorLabelIntegrated locMap
  -- if we got the label implicitly, we have presupposed the first one is
  -- going to have a digit, so guarantee that. You _can_ have p. (a)
  -- because you specified it.
  let modifier = if wasImplicit
                    then requireDigits
                    else requireRomansOrDigits
  g <- try $ pLocatorWordIntegrated (not wasImplicit) >>= modifier
  gs <- many (try $ pLocatorWordIntegrated False >>= modifier)
  let lo = T.concat (g:gs)
  return (la, lo)

pLocatorLabelIntegrated :: LocatorMap -> LocatorParser (Text, Bool)
pLocatorLabelIntegrated locMap
  = pLocatorLabel' locMap lim <|> (lookAhead digital >> return ("page", True))
    where
      lim = try $ pLocatorWordIntegrated True >>= requireRomansOrDigits
      digital = try $ pLocatorWordIntegrated True >>= requireDigits

pLocatorLabel' :: LocatorMap -> LocatorParser Text
               -> LocatorParser (Text, Bool)
pLocatorLabel' locMap lim = go ""
    where
      -- grow the match string until we hit the end
      -- trying to find the largest match for a label
      go acc = try $ do
          -- advance at least one token each time
          -- the pathological case is "p.3"
          t <- anyToken
          ts <- manyTill anyToken (try $ lookAhead lim)
          let s = acc <> stringify (t:ts)
          case M.lookup (T.strip s) locMap of
            -- try to find a longer one, or return this one
            Just l -> go s <|> return (l, False)
            Nothing -> go s

-- hard requirement for a locator to have some real digits in it
requireDigits :: (Bool, Text) -> LocatorParser Text
requireDigits (_, s) = if not (T.any isDigit s)
                          then Prelude.fail "requireDigits"
                          else return s

-- soft requirement for a sequence with some roman or arabic parts
-- (a)(iv) -- because iv is roman
-- 1(a)  -- because 1 is an actual digit
-- NOT: a, (a)-(b), hello, (some text in brackets)
requireRomansOrDigits :: (Bool, Text) -> LocatorParser Text
requireRomansOrDigits (d, s) = if not d
                                  then Prelude.fail "requireRomansOrDigits"
                                  else return s

pLocatorWordIntegrated :: Bool -> LocatorParser (Bool, Text)
pLocatorWordIntegrated isFirst = try $ do
  punct <- if isFirst
              then return ""
              else (stringify <$> pLocatorSep) <|> return ""
  sp <- option "" (pSpace >> return " ")
  (dig, s) <- pBalancedBraces [('(',')'), ('[',']'), ('{','}')] pPageSeq
  return (dig, punct <> sp <> s)

-- we want to capture:  123, 123A, C22, XVII, 33-44, 22-33; 22-11
--                      34(1), 34A(A), 34(1)(i)(i), (1)(a)
--                      [17], [17]-[18], '591 [84]'
--                      (because CSL cannot pull out individual pages/sections
--                      to wrap in braces on a per-style basis)
pBalancedBraces :: [(Char, Char)]
                -> LocatorParser (Bool, Text)
                -> LocatorParser (Bool, Text)
pBalancedBraces braces p = try $ do
  ss <- many1 surround
  return $ anyWereDigitLike ss
  where
      except = notFollowedBy pBraces >> p
      -- outer and inner
      surround = foldl (\a (open, close) -> sur open close except <|> a)
                       except
                       braces

      isc c = stringify <$> pMatchChar [c] (== c)

      sur c c' m = try $ do
          (d, mid) <- between (isc c) (isc c') (option (False, "") m)
          return (d, T.cons c . flip T.snoc c' $  mid)

      flattened = concatMap (\(o, c) -> [o, c]) braces
      pBraces = pMatchChar "braces" (`elem` flattened)


-- YES 1, 1.2, 1.2.3
-- NO  1., 1.2. a.6
-- can't use sepBy because we want to leave trailing .s
pPageSeq :: LocatorParser (Bool, Text)
pPageSeq = oneDotTwo <|> withPeriod
  where
      oneDotTwo = do
          u <- pPageUnit
          us <- many withPeriod
          return $ anyWereDigitLike (u:us)
      withPeriod = try $ do
          -- .2
          p <- pMatchChar "." (== '.')
          u <- try pPageUnit
          return (fst u, stringify p <> snd u)

anyWereDigitLike :: [(Bool, Text)] -> (Bool, Text)
anyWereDigitLike as = (any fst as, T.concat $ map snd as)

pPageUnit :: LocatorParser (Bool, Text)
pPageUnit = roman <|> plainUnit
  where
      -- roman is a 'digit'
      roman = (True,) <$> pRoman
      plainUnit = do
          ts <- many1 (notFollowedBy pSpace >>
                       notFollowedBy pLocatorPunct >>
                       anyToken)
          let s = stringify ts
          -- otherwise look for actual digits or -s
          return (T.any isDigit s, s)

pRoman :: LocatorParser Text
pRoman = try $ do
  tok <- anyToken
  case tok of
       Str t -> case parse (romanNumeral True *> eof)
                   "roman numeral" (T.toUpper t) of
                      Left _    -> mzero
                      Right ()  -> return t
       _      -> mzero

pLocatorPunct :: LocatorParser Inline
pLocatorPunct = pMatchChar "punctuation" isLocatorPunct

pLocatorSep :: LocatorParser Inline
pLocatorSep = pMatchChar "locator separator" isLocatorSep

pMatchChar :: String -> (Char -> Bool) -> LocatorParser Inline
pMatchChar msg f = satisfyTok f' <?> msg
    where
        f' (Str (T.unpack -> [c])) = f c
        f' _                       = False

pSpace :: LocatorParser Inline
pSpace = satisfyTok (\t -> isSpacey t || t == Str "\160") <?> "space"

satisfyTok :: (Inline -> Bool) -> LocatorParser Inline
satisfyTok f = tokenPrim show (\sp _ _ -> sp) (\tok -> if f tok
                                                          then Just tok
                                                          else Nothing)

isSpacey :: Inline -> Bool
isSpacey Space     = True
isSpacey SoftBreak = True
isSpacey _         = False

isLocatorPunct :: Char -> Bool
isLocatorPunct '-' = False -- page range
isLocatorPunct '–' = False -- page range, en dash
isLocatorPunct ':' = False -- vol:page-range hack
isLocatorPunct c   = isPunctuation c -- includes [{()}]

isLocatorSep :: Char -> Bool
isLocatorSep ',' = True
isLocatorSep ';' = True
isLocatorSep _   = False

splitStrWhen :: (Char -> Bool) -> [Inline] -> [Inline]
splitStrWhen _ [] = []
splitStrWhen p (Str xs : ys) = go (T.unpack xs) ++ splitStrWhen p ys
  where
   go [] = []
   go s = case break p s of
             ([],[])     -> []
             (zs,[])     -> [Str $ T.pack zs]
             ([],w:ws) -> Str (T.singleton w) : go ws
             (zs,w:ws) -> Str (T.pack zs) : Str (T.singleton w) : go ws
splitStrWhen p (x : ys) = x : splitStrWhen p ys

--
-- Locator Map
--

type LocatorMap = M.Map Text Text

toLocatorMap :: Locale -> LocatorMap
toLocatorMap locale =
  foldr go mempty locatorTerms
 where
  go tname locmap =
    case M.lookup tname (localeTerms locale) of
      Nothing -> locmap
      Just ts -> foldr (\x -> M.insert (snd x) tname) locmap ts

locatorTerms :: [Text]
locatorTerms =
  [ "book"
  , "chapter"
  , "column"
  , "figure"
  , "folio"
  , "issue"
  , "line"
  , "note"
  , "opus"
  , "page"
  , "number-of-pages"
  , "paragraph"
  , "part"
  , "section"
  , "sub verbo"
  , "verse"
  , "volume" ]