{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}

{-# OPTIONS_GHC -funbox-strict-fields #-}

module FastTags.Tag (
    -- * types
    TagVal(..)
    , Type(..)
    , Tag(..)
    , Pos(..)
    , SrcPos(..)
    , UnstrippedTokens(..)
    -- * process
    , processFile
    , qualify
    , process
    , tokenizeInput
    , processTokens
    -- * util
    , isHsFile
    , isLiterateFile

    -- TODO for testing
    , unstrippedTokensOf
    , stripCpp
    , stripNewlines
    , breakBlocks
    , whereBlock
    )
where
import Control.Arrow ((***))
import Control.Monad
import Control.DeepSeq (NFData, rnf)
import qualified Data.Char as Char
import Data.Functor ((<$>))
import qualified Data.IntSet as IntSet
import qualified Data.List as List
import qualified Data.Map as Map
import Data.Maybe (maybeToList)
import Data.Monoid (Monoid, (<>))
import qualified Data.Text as T
import Data.Text (Text)

import qualified Language.Preprocessor.Unlit as Unlit
import qualified System.FilePath as FilePath

import qualified FastTags.Lexer as Lexer
import qualified FastTags.Token as Token
import FastTags.Token (Pos(..), Token, SrcPos(..), TokenVal(..))
import qualified FastTags.Util as Util

-- * types

data TagVal = TagVal {
    tvName     :: !Text
    , tvType   :: !Type
    , tvParent :: !(Maybe Text)
      -- ^ parent of this tag; parent can only be of type
      -- Class, Data or Family
    } deriving (Show, Eq, Ord)

tagName :: Pos TagVal -> Text
tagName = tvName . valOf

tagLine :: Pos TagVal -> Token.Line
tagLine = posLine . posOf

instance NFData TagVal where
    rnf (TagVal x y z) = rnf x `seq` rnf y `seq` rnf z

-- | The Ord instance is used to sort tags with the same name.  Given multiple
-- matches, vim will visit them in order, so this should be in the order of
-- interest.
--
-- We rely that Type < Constructor.  TODO how and where?  For sorting tags?
data Type =
    Function
    | Type
    | Constructor
    | Class
    | Module
    | Operator
    | Pattern
    | Family
    deriving (Eq, Ord, Show)

instance NFData Type where
    rnf t = t `seq` ()

data Tag =
    Tag !(Pos TagVal)
    -- | Just like Tag, except these should be deduplicated by their TagVal,
    -- where the one with the lowest line number will be preferred.
    -- The idea seems to be that functions will emit a tag for both the
    -- signature and definition.  TODO seems like a hack, why not just
    -- deduplicate all tags?  And I think I do that now with dropAdjacent.
    | RepeatableTag !(Pos TagVal)
    | Warning !(Pos String)
    deriving (Show, Eq, Ord)

onTagVal :: (Pos TagVal -> Pos TagVal) -> Tag -> Tag
onTagVal f (Tag t)           = Tag $ f t
onTagVal f (RepeatableTag t) = RepeatableTag $ f t
onTagVal _ w@(Warning _)     = w

-- | Partition Tag, RepeatableTag, and Warning.
partitionTags :: [Tag] -> ([Pos TagVal], [Pos TagVal], [Pos String])
partitionTags = go [] [] []
    where
    go tags repeats warns [] = (tags, repeats, reverse warns)
    go tags repeats warns (t:ts) = case t of
        Tag a           -> go (a:tags) repeats warns ts
        RepeatableTag a -> go tags (a:repeats) warns ts
        Warning a       -> go tags repeats (a:warns) ts

extractName :: Tag -> Maybe Text
extractName (Tag t)           = Just $ tagName t
extractName (RepeatableTag t) = Just $ tagName t
extractName (Warning _)       = Nothing

-- | Newlines have to remain in the tokens because 'breakBlocks' relies on
-- them.  But they make pattern matching on the tokens unreliable because
-- newlines might be anywhere.  A newtype makes sure that the tokens only get
-- stripped once and that I don't do any pattern matching on unstripped tokens.
newtype UnstrippedTokens = UnstrippedTokens [Token]
#if MIN_VERSION_base(4,11,0)
    deriving (Show, Semigroup, Monoid)
#else
    deriving (Show, Monoid)
#endif

mapTokens :: ([Token] -> [Token]) -> UnstrippedTokens -> UnstrippedTokens
mapTokens f (UnstrippedTokens tokens) = UnstrippedTokens (f tokens)

unstrippedTokensOf :: UnstrippedTokens -> [Token]
unstrippedTokensOf (UnstrippedTokens tokens) = tokens

-- | Drop @n@ non-newline tokens.
dropTokens :: Int -> UnstrippedTokens -> UnstrippedTokens
dropTokens k = mapTokens (f k)
    where
    f :: Int -> [Token] -> [Token]
    f 0 xs                       = xs
    f _ []                       = []
    f n (Pos _ (Newline _) : xs) = f n xs
    f n (Pos _ _           : xs) = f (n - 1) xs

-- * processFile

-- | Read tags from one file.
processFile :: FilePath -> Bool -> IO ([Pos TagVal], [String])
processFile fn trackPrefixes =
    process fn trackPrefixes <$> Util.readFileLenient fn

-- * qualify

-- | Each tag is split into a one qualified with its module name and one
-- without.
--
-- TODO I could mark it static, to put in a file: mark, which would make vim
-- prioritize it for same-file tags, but I think it already does that, so maybe
-- this isn't necessary?
qualify :: Bool -> Text -> Pos TagVal -> Pos TagVal
qualify fullyQualify srcPrefix (Token.Pos pos (TagVal name typ _)) =
    Token.Pos pos TagVal
        { tvName   = qualified
        , tvType   = typ
        , tvParent = Nothing
        }
    where
    qualified = case typ of
        Module -> module_
        _ -> module_ <> "." <> name
    module_
        | fullyQualify = T.replace "/" "." $ T.dropWhile (=='/') $
            dropPrefix srcPrefix $ T.pack file
        | otherwise = T.pack $ FilePath.takeFileName file
    file = FilePath.dropExtension $ Token.posFile pos

dropPrefix :: Text -> Text -> Text
dropPrefix prefix txt = maybe txt id $ T.stripPrefix prefix txt

-- | Process one file's worth of tags.
process :: FilePath -> Bool -> Text -> ([Pos TagVal], [String])
process fn trackPrefixes input =
    case tokenizeInput fn trackPrefixes input  of
        Left msg   -> ([], [msg])
        Right toks -> processTokens toks

tokenizeInput :: FilePath -> Bool -> Text -> Either String [Token]
tokenizeInput fn trackPrefixes =
    Lexer.tokenize fn trackPrefixes . stripCpp . unlit
    where
    unlit :: Text -> Text
    unlit src
        | isLiterateFile fn =
            T.pack $ Unlit.unlit fn $ T.unpack $ stripLiterate src
        | otherwise = src

stripLiterate :: Text -> Text
stripLiterate src
    | "\\begin{code}" `T.isInfixOf` src
            && "\\end{code}" `T.isInfixOf` src =
        T.unlines $ filter (not . birdLiterateLine) $ T.lines src
    | otherwise = src
    where
    birdLiterateLine xs
        | T.null xs = False
        | otherwise = case Util.headt $ T.dropWhile Char.isSpace xs of
            Just '>' -> True
            _ -> False

processTokens :: [Token] -> ([Pos TagVal], [String])
processTokens =
    splitAndRemoveRepeats .
    concatMap blockTags .
    breakBlocks .
    UnstrippedTokens
    where
    splitAndRemoveRepeats :: [Tag] -> ([Pos TagVal], [String])
    splitAndRemoveRepeats tags =
        ( earliestRepeats ++ newTags
        , map valOf warnings
        )
        where
        (newTags, repeatableTags, warnings) = partitionTags tags
        -- For RepeatableTag s with duplicate keys, pick the one with the lowest
        -- posLine.
        earliestRepeats :: [Pos TagVal]
        earliestRepeats = Map.elems $ Map.fromListWith minLine $
            Util.keyOn valOf repeatableTags
        minLine x y
            | tagLine x < tagLine y = x
            | otherwise             = y

-- | Strip cpp lines starting with #. Also strips out hsc detritus.
stripCpp :: Text -> Text
stripCpp =
    T.intercalate "\n" . snd . List.mapAccumL replaceCppLine False . T.lines
    where
    replaceCppLine :: Bool -> Text -> (Bool, Text)
    replaceCppLine insideMacro line
        | "#" `T.isPrefixOf` line = (insideMacro', T.empty)
        | insideMacro             = (insideMacro', T.empty)
        | otherwise               = (False, line)
        where
        insideMacro' = "\\" `T.isSuffixOf` line

startIdentChar :: Char -> Bool
startIdentChar c = Char.isAlpha c || c == '_'

identChar :: Bool -> Char -> Bool
identChar considerDot c = Char.isAlphaNum c || c == '\'' || c == '_'
    || c == '#' || considerDot && c == '.'

isHaskellOp :: Text -> Bool
isHaskellOp str = case Util.headt str of
    Nothing  -> False
    Just ':' -> False
    Just _   -> T.all haskellOpChar str

isHaskellConstructorOp :: Text -> Bool
isHaskellConstructorOp str = case T.uncons str of
    Nothing        -> False
    Just (':', xs) -> T.all haskellOpChar xs
    Just _         -> False

haskellOpChar :: Char -> Bool
haskellOpChar '_' = False
haskellOpChar c   =
    IntSet.member (Char.ord c) opChars
        || Util.isSymbolCharacterCategory (Char.generalCategory c)
    where
    opChars :: IntSet.IntSet
    opChars = IntSet.fromList $ map Char.ord "-!#$%&*+./<=>?@^|~:\\"

isTypeVarStart :: Text -> Bool
isTypeVarStart x = case Util.headt x of
    Just c -> Char.isLower c || c == '_'
    _ -> False

-- | Break the input up into blocks based on indentation.
breakBlocks :: UnstrippedTokens -> [UnstrippedTokens]
breakBlocks =
    map UnstrippedTokens . filter (not . null)
        . go . stripSemicolonsNotInBraces . filterBlank . unstrippedTokensOf
    where
    go :: [Token] -> [[Token]]
    go []     = []
    go tokens = pre : go post
        where (pre, post) = breakBlock tokens
    -- Blank lines mess up the indentation.
    filterBlank :: [Token] -> [Token]
    filterBlank [] = []
    filterBlank (Pos _ (Newline _) : xs@(Pos _ (Newline _) : _)) =
        filterBlank xs
    filterBlank (x:xs) = x : filterBlank xs

-- | Take until a newline, then take lines until the indent established after
-- that newline decreases. Or, alternatively, if "{" is encountered then count
-- it as a block until closing "}" is found taking nesting into account.
breakBlock :: [Token] -> ([Token], [Token])
breakBlock = go []
    where
    go :: [Token] -> [Token] -> ([Token], [Token])
    go acc [] = (reverse acc, [])
    go acc (t@(Pos _ tok) : ts) = case tok of
        Newline indent -> collectIndented acc indent ts
        LBrace         -> collectBracedBlock (t : acc) go ts 1
        _              -> go (t : acc) ts

    collectIndented :: [Token] -> Int -> [Token] -> ([Token], [Token])
    collectIndented acc indent = goIndented acc
        where
        goIndented acc' = \case
            tsFull@(t : ts) -> case t of
                Pos _ (Newline n) | n <= indent -> (reverse acc', tsFull)
                Pos _ LBrace ->
                    collectBracedBlock (t : acc') goIndented ts 1
                _            ->
                    goIndented (t : acc') ts
            [] -> (reverse acc', [])

    collectBracedBlock
        :: Show b
        => [Token]
        -> ([Token] -> [Token] -> ([Token], [b]))
        -> [Token]
        -> Int
        -> ([Token], [b])
    collectBracedBlock acc cont = goBraced acc
        where
        goBraced acc' []       _ = (reverse acc', [])
        goBraced acc' ts       0 = cont acc' ts
        goBraced acc' (t : ts) n = goBraced (t : acc') ts $! case t of
            Pos _ LBrace -> n + 1
            Pos _ RBrace -> n - 1
            _            -> n

stripSemicolonsNotInBraces :: [Token] -> [Token]
stripSemicolonsNotInBraces =
    go False 0 0
  where
    go  :: Bool -- Whether inside let or where block or case expression
        -> Int -- Indent of last newline
        -> Int -- Parenthesis nesting depth
        -> [Token]
        -> [Token]
    go !_     !_ !_ []                                                       = []
    go !b     !k !n (tok@(Pos _ KWWhere)     : tok'@(Pos _ LBrace) : ts)     = tok : tok' : skipBalancedParens b k (inc n) ts
    go !_     !k !n (tok@(Pos _ KWWhere)     : ts)                           = tok : go True k n ts
    go !b     !k !n (tok@(Pos _ KWLet)       : tok'@(Pos _ LBrace) : ts)     = tok : tok' : skipBalancedParens b k (inc n) ts
    go !_     !k !n (tok@(Pos _ KWLet)       : ts)                           = tok : go True k n ts
    go !b     !k !n (tok@(Pos _ KWDo)        : tok'@(Pos _ LBrace) : ts)     = tok : tok' : skipBalancedParens b k (inc n) ts
    go !_     !k !n (tok@(Pos _ KWDo)        : ts)                           = tok : go True k n ts
    go !b     !k !n (tok@(Pos _ KWOf)        : tok'@(Pos _ LBrace) : ts)     = tok : tok' : skipBalancedParens b k (inc n) ts
    go !_     !k !n (tok@(Pos _ KWOf)        : ts)                           = tok : go True k n ts
    go !_     !k !n (tok@(Pos _ KWIn)        : ts)                           = tok : go False k n ts
    go !_     !_ !n (tok@(Pos _ (Newline k)) : ts)                           = tok : go False k n ts
    go !_     !_  0 (     Pos _ Semicolon    : tok@(Pos _ (Newline k)) : ts) = tok : go False k 0 ts
    go  False !k  0 (     Pos p Semicolon    : ts)                           = Pos p (Newline k) : go False k 0 ts
    go !b     !k !n (tok@(Pos _ LParen)      : ts)                           = tok : skipBalancedParens b k (inc n) ts
    go !b     !k !n (tok@(Pos _ LBracket)    : ts)                           = tok : skipBalancedParens b k (inc n) ts
    go !b     !k !n (tok@(Pos _ LBrace)      : ts)                           = tok : skipBalancedParens b k (inc n) ts
    go !b     !k !n (tok@(Pos _ RParen)      : ts)                           = tok : go b k (dec n) ts
    go !b     !k !n (tok@(Pos _ RBracket)    : ts)                           = tok : go b k (dec n) ts
    go !b     !k !n (tok@(Pos _ RBrace)      : ts)                           = tok : go b k (dec n) ts
    go !b     !k !n (tok : ts)                                               = tok : go b k n       ts

    skipBalancedParens
        :: Bool -- Whether inside where block or after equals sign
        -> Int -- Indent of last newline
        -> Int -- Parenthesis nesting depth
        -> [Token]
        -> [Token]
    skipBalancedParens b k = skip
        where
        skip :: Int -> [Token] -> [Token]
        skip _  []                          = []
        skip 0  ts                          = go b k 0 ts
        skip !n (tok@(Pos _ LParen)   : ts) = tok : skip (inc n) ts
        skip !n (tok@(Pos _ LBracket) : ts) = tok : skip (inc n) ts
        skip !n (tok@(Pos _ LBrace)   : ts) = tok : skip (inc n) ts
        skip !n (tok@(Pos _ RParen)   : ts) = tok : skip (dec n) ts
        skip !n (tok@(Pos _ RBracket) : ts) = tok : skip (dec n) ts
        skip !n (tok@(Pos _ RBrace)   : ts) = tok : skip (dec n) ts
        skip !n (tok : ts)                  = tok : skip n ts

    inc :: Int -> Int
    inc n = n + 1
    dec :: Int -> Int
    dec n = max 0 (n - 1)

explodeToplevelBracedBlocks :: [Token] -> [[Token]]
explodeToplevelBracedBlocks toks =
    case toks of
      Pos _ LBrace : toks' -> filter (not . null) $ go [] 1 toks'
      _                    -> [toks]
    where
    go :: [Token] -> Int -> [Token] -> [[Token]]
    go acc _    []                          = [reverse acc]
    go acc 0    ts                          = [reverse acc, ts]
    go acc !n   (tok@(Pos _ LBrace)   : ts) = go (tok : acc) (n + 1) ts
    go acc  1   (     Pos _ RBrace    : ts) = reverse acc : go [] 0 ts
    go acc !n   (tok@(Pos _ RBrace)   : ts) = go (tok : acc) (n - 1) ts
    go acc  n@1 (     Pos _ Semicolon : ts) = reverse acc : go [] n ts
    go acc !n   (tok                  : ts) = go (tok : acc) n ts

-- * extract tags

-- | Get all the tags in one indented block.
-- TODO clean this up to require less nesting, and dropDataContext duplication
blockTags :: UnstrippedTokens -> [Tag]
blockTags unstripped = case stripNewlines unstripped of
    [] -> []
    Pos _ SpliceStart : _ -> []
    Pos _ KWModule : Pos pos (T name) : _ ->
        [mkTag pos (snd (T.breakOnEnd "." name)) Module]
    stripped@(Pos _       (T "pattern") : Pos _ DoubleColon : _) ->
        toplevelFunctionTags stripped
    stripped@(Pos prevPos (T "pattern") : toks) ->
        case tag of
            Nothing -> toplevelFunctionTags stripped
            Just x  -> [x]
        where
        (tag, _, _) = recordVanillaOrInfixName isTypeName Pattern prevPos
            "pattern * =" toks
    Pos _ KWForeign : decl -> foreignTags decl
    -- newtype instance * = ...
    Pos prevPos KWNewtype : Pos _ KWInstance : toks ->
        map (addParent familyNameTag) $ newtypeTags pos $ dropTokens 2 unstripped
        where
        (familyNameTag, pos) = extractFamilyName prevPos "newtype instance * =" toks
    -- newtype X * = X *
    Pos prevPos KWNewtype : toks ->
        maybeToList tag ++ map (addParent tag) (newtypeTags pos (dropTokens 1 unstripped))
        where
        (tag, pos, _) =
            recordVanillaOrInfixName isTypeName Type prevPos "newtype * =" toks
    -- type family X ...
    Pos prevPos KWType : Pos _ KWFamily : toks -> maybeToList tag
        where
        (tag, _,  _) = recordVanillaOrInfixName isTypeFamilyName Family prevPos
            "type family * =" toks
    -- type instance X * = ...
    -- No tags in type family instances
    Pos _ KWType : Pos _ KWInstance : _ -> []
    -- type X * = ...
    Pos prevPos KWType : toks -> maybeToList tag
        where
        (tag, _, _) = recordVanillaOrInfixName isTypeName Type prevPos
            "type * =" toks
    -- data family X ...
    Pos prevPos KWData : Pos _ KWFamily : toks ->
        map (addParent tag) $ maybeToList tag
        where
        (tag, _, _) = recordVanillaOrInfixName isTypeFamilyName Family prevPos
            "data family * =" toks
    -- data instance * = ...
    -- data instance * where ...
    Pos prevPos KWData : Pos _ KWInstance : toks ->
        map (addParent familyNameTag) $ dataConstructorTags pos (dropTokens 2 unstripped)
        where
        (familyNameTag, pos) = extractFamilyName prevPos "data instance * =" toks
    -- data X * = X { X :: *, X :: * }
    -- data X * where ...
    Pos prevPos KWData : toks ->
        maybeToList tag ++ map (addParent tag) (dataConstructorTags pos (dropTokens 1 unstripped))
        where
        (tag, pos, _) = recordVanillaOrInfixName isTypeName Type prevPos
            "data * =" toks
    -- class * => X where X :: * ...
    Pos pos KWClass : _ -> classTags pos (dropTokens 1 unstripped)

    Pos _ KWInfix : _ -> []
    Pos _ KWInfixl : _ -> []
    Pos _ KWInfixr : _ -> []
    -- Deriving introduces no new names, just ignore it
    Pos _ KWDeriving : _ -> []
    -- instance * where data * = X :: * ...
    Pos pos KWInstance : _ ->
        instanceTags pos (dropTokens 1 unstripped)
    -- x, y, z :: *
    stripped -> toplevelFunctionTags stripped

isTypeFamilyName :: Text -> Bool
isTypeFamilyName =
    maybe False (\c -> Char.isUpper c || c == ':') . Util.headt

isTypeName  :: Text -> Bool
isTypeName x = case Util.headt x of
    Just c -> Char.isUpper c || c == ':'
    _ -> False

dropDataContext :: [Token] -> [Token]
dropDataContext = stripParensKindsTypeVars . stripOptContext

recordVanillaOrInfixName
    :: (Text -> Bool)               -- ^ Predicate for names to select
    -> Type                         -- ^ Tope of detecte tag
    -> SrcPos                       -- ^ Previous position to report in errors
    -> String                       -- ^ Context to report in errors
    -> [Token]                      -- ^ Tokens to analyze
    -> (Maybe Tag, SrcPos, [Token]) -- ^ Possibly detected tag and rest of the tokens
recordVanillaOrInfixName isVanillaName tokenType prevPos context tokens =
    case dropDataContext tokens of
        toks | Type <- tokenType
             , Just (pos, name, rest) <- extractSpecialTypeName toks ->
            (Just $ mkTag pos name tokenType, pos, rest)
        Pos _ RParen   : _                -> (Nothing, prevPos, tokens)
        Pos _ LBracket : _                -> (Nothing, prevPos, tokens)
        Pos _ Equals   : _                -> (Nothing, prevPos, tokens)
        Pos _ Comma    : _                -> (Nothing, prevPos, tokens)
        tok : toks ->
            case tok of
                Pos pos (tokToName -> Just name) | isVanillaName name ->
                    (Just $ mkTag pos name tokenType, pos, toks)
                _ -> case dropInfixTypeStart $ tok : toks of
                    Pos pos (tokToName -> Just name) : rest ->
                        (Just $ mkTag pos name tokenType, pos, rest)
                    rest -> (Just $ unexp pos rest, pos, tok : toks)
                        where pos = posOf tok
        [] -> (Just $ unexp prevPos [], prevPos, [])
    where
    unexp pos rest = unexpected pos (UnstrippedTokens tokens) rest context

extractSpecialTypeName :: [Token] -> Maybe (SrcPos, Text, [Token])
extractSpecialTypeName (Pos pos LBracket : Pos _ RBracket : rest) = Just (pos, "[]", rest)
extractSpecialTypeName (Pos pos LParen   : (tupleCommas -> (commas, Pos _ RParen : rest))) =
    Just (pos, "(" <> T.replicate commas "," <> ")", rest)
extractSpecialTypeName (tupleCommas -> (commas, Pos pos RParen : rest)) =
    Just (pos, "(" <> T.replicate commas "," <> ")", rest)
extractSpecialTypeName _ = Nothing

tupleCommas :: [Token] -> (Int, [Token])
tupleCommas = go 0 True
    where
    go :: Int -> Bool -> [Token] -> (Int, [Token])
    go !n False (Pos _ Comma : rest) = go (n + 1) True rest
    go !n False rest                 = (n, rest)
    go !n True  (Pos _ Comma : rest) =
        go (n + 1) True rest
    go !n True  rest'@(Pos _ (T name) : rest)
        | isTypeVarStart name = go n False rest
        | otherwise           = (n, rest')
    go !n _     rest = (n, rest)

-- same as dropWhile with counting
dropInfixTypeStart :: [Token] -> [Token]
dropInfixTypeStart = dropWhile f
    where
    f (Pos _ (T name)) = isInfixTypePrefix name
    f (Pos _ Backtick) = True
    f (Pos _ LParen)   = True
    f _                = False

    isInfixTypePrefix :: Text -> Bool
    isInfixTypePrefix = maybe False Char.isLower . Util.headt

-- | It's easier to scan for tokens without pesky newlines popping up
-- everywhere.  But I need to keep the newlines in in case I hit a @where@
-- and need to call 'breakBlocks' again.
stripNewlines :: UnstrippedTokens -> [Token]
stripNewlines = filter (not . isNewline) . unstrippedTokensOf

-- | Tags from foreign import.
--
-- e.g. @foreign import ccall safe \"name\" c_name :: ...@ will produce a tag
-- for @c_name@.
foreignTags :: [Token] -> [Tag]
foreignTags decl = case decl of
    Pos _ KWImport : decl'
        | Pos pos (T name) : _ <- Util.dropBefore isDoubleColon decl' ->
            [mkTag pos name Function]
    _ -> []
    where
    isDoubleColon (Pos _ DoubleColon) = True
    isDoubleColon _ = False

toplevelFunctionTags :: [Token] -> [Tag]
toplevelFunctionTags toks = case tags of
    -- Tags of toplevel functions are all repeatable, even the ones that come
    -- from the type signature because there will definitely be tags from the
    -- body and they should be sorted out if type signature is present.
    [] -> functionTagsNoSig toks
    ts -> map toRepeatableTag ts
    where
    -- first try to detect tags from type signature, if it fails then
    -- do the actual work of detecting from body
    (tags, _) = functionTags ExpectFunctions toks
    toRepeatableTag :: Tag -> Tag
    toRepeatableTag (Tag t) = RepeatableTag t
    toRepeatableTag t       = t

functionTagsNoSig :: [Token] -> [Tag]
functionTagsNoSig allToks = go' allToks
    where
    go' :: [Token] -> [Tag]
    go' (Pos _ T{} : Pos pos tok : _)
        | Just opName <- tokToOpNameExcludingBangPatSyms ExpectFunctions tok
        = [mkRepeatableTag pos opName Operator]
    go' ts = go ts

    go :: [Token] -> [Tag]
    go []                           = []
    go (Pos _ LParen : Pos _ T{} : Pos _ Backtick : Pos pos' (T name') : Pos _ Backtick : Pos _ T{} : Pos _ RParen : _)
        | functionName ExpectFunctions name' = [mkRepeatableTag pos' name' Function]
    go (Pos _ LParen : Pos _ T{} : Pos pos' tok : Pos _ T{} : Pos _ RParen : _)
        | Just name' <- tokToOpName ExpectFunctions tok
        = [mkRepeatableTag pos' name' Operator]
    go toks@(Pos _ LParen : _)      = go $ stripBalancedParens toks
    go toks@(Pos _ LBrace : _)      = go $ stripBalancedBraces toks
    go toks@(Pos _ LBracket : _)    = go $ stripBalancedBrackets toks
    -- This function does not analyze type signatures.
    go (Pos _ DoubleColon : _)      = []
    go (Pos _ ExclamationMark : ts) = go ts
    go (Pos _ Tilde : ts)           = go ts
    go (Pos _ At : ts)              = go ts
    go (Pos _ Equals : _)           = functionOrOp allToks
    go (Pos _ Pipe : _)             = functionOrOp allToks
    go (Pos _ Backtick : Pos pos' (T name') : _)
        | functionName ExpectFunctions name' = [mkRepeatableTag pos' name' Function]
    go (Pos pos tok : _)
        | Just name <- tokToOpNameExcludingBangPatSyms ExpectFunctions tok
        = [mkRepeatableTag pos name Operator]
    go (Pos pos Dot : _)            = [mkRepeatableTag pos "." Operator]
    go (_ : ts)                     = go ts
    stripOpeningParens :: [Token] -> [Token]
    stripOpeningParens = dropWhile ((== LParen) . valOf)
    functionOrOp :: [Token] -> [Tag]
    functionOrOp toks = case stripOpeningParens toks of
         Pos pos (T name) : _
             | functionName ExpectFunctions name -> [mkRepeatableTag pos name Function]
         Pos pos tok : _ -> case tokToOpName ExpectFunctions tok of
             Just name -> [mkRepeatableTag pos name Operator]
             Nothing   -> []
         [] -> []

tokToOpNameExcludingBangPatSyms :: ExpectedFuncName -> TokenVal -> Maybe Text
tokToOpNameExcludingBangPatSyms expectation tok = case (expectation, tokToNameExcludingBangPatSyms tok) of
    (ExpectFunctions, res@(Just name))
        | isHaskellOp name -> res
    (ExpectConstructors, res@(Just name))
        | isHaskellConstructorOp name -> res
    _ -> Nothing

tokToNameExcludingBangPatSyms :: TokenVal -> Maybe Text
tokToNameExcludingBangPatSyms (T "_")         = Nothing
tokToNameExcludingBangPatSyms (T name)        = Just name
tokToNameExcludingBangPatSyms Dot             = Just "."
tokToNameExcludingBangPatSyms _               = Nothing

tokToOpName :: ExpectedFuncName -> TokenVal -> Maybe Text
tokToOpName expectation tok = case (expectation, tokToName tok) of
    (ExpectFunctions, res@(Just name))
        | isHaskellOp name -> res
    (ExpectConstructors, res@(Just name))
        | isHaskellConstructorOp name -> res
    _ -> Nothing

tokToName :: TokenVal -> Maybe Text
tokToName ExclamationMark = Just "!"
tokToName Tilde           = Just "~"
tokToName x               = tokToNameExcludingBangPatSyms x

-- | Get tags from a function type declaration: token , token , token ::
-- Return the tokens left over.
functionTags :: ExpectedFuncName -- ^ expect constructors or functions
    -> [Token] -> ([Tag], [Token])
functionTags constructors = go []
    where
    (opTag, funcTag) = case constructors of
        ExpectConstructors -> (Constructor, Constructor)
        ExpectFunctions    -> (Operator, Function)
    go :: [Tag] -> [Token] -> ([Tag], [Token])
    go tags (Pos _ LParen : opTok : Pos _ RParen : Pos _ DoubleColon : rest) =
        (reverse $ mkOpTag tags opTag opTok, rest)
    go tags (Pos pos (T name) : Pos _ DoubleColon : rest)
        | functionName constructors name =
            (reverse $ mkTag pos name funcTag : tags, rest)
    go tags (Pos _ LParen : opTok : Pos _ RParen : Pos _ Comma : rest) =
        go (mkOpTag tags opTag opTok) rest
    go tags (Pos pos (T name) : Pos _ Comma : rest)
        | functionName constructors name =
            go (mkTag pos name funcTag : tags) rest
    go tags tokens = (tags, tokens)

    mkOpTag :: [Tag] -> Type -> Token -> [Tag]
    mkOpTag tags opTag' (Pos pos tok) =
      case tokToOpName constructors tok of
        Just name -> mkTag pos name opTag' : tags
        Nothing   -> tags

data ExpectedFuncName = ExpectFunctions | ExpectConstructors

functionName :: ExpectedFuncName -> Text -> Bool
functionName expect = isFunction
    where
    isFunction text = case T.uncons text of
        Just ('_', cs)
            | T.null cs -> False
        Just (c, cs) ->
            firstChar c && startIdentChar c && T.all (identChar True) cs
        Nothing      -> False
    firstChar = case expect of
        ExpectFunctions    -> \c -> Char.isLower c || c == '_'
        ExpectConstructors -> Char.isUpper

-- | * = X *
newtypeTags :: SrcPos -> UnstrippedTokens -> [Tag]
newtypeTags _ unstripped
    | any (\case { Pos _ KWWhere -> True; _ -> False })
            (unstrippedTokensOf unstripped) =
        concatMap gadtTags (whereBlock unstripped)
newtypeTags prevPos unstripped = case dropUntil Equals $ stripNewlines unstripped of
    Pos pos (T name) : rest ->
        let constructor = mkTag pos name Constructor
        in  case rest of
            Pos _ LBrace : Pos funcPos (T funcName) : _ ->
                [constructor, mkTag funcPos funcName Function]
            _ ->
                [constructor]
    rest -> [unexpected prevPos unstripped rest "newtype * ="]

-- | [] (empty data declaration)
-- * = X { X :: *, X :: * }
-- * where X :: * X :: *
-- * = X | X
dataConstructorTags :: SrcPos -> UnstrippedTokens -> [Tag]
dataConstructorTags prevPos unstripped
    -- GADT
    | any (\case { Pos _ KWWhere -> True; _ -> False })
            (unstrippedTokensOf unstripped) =
        concatMap gadtTags (whereBlock unstripped)
    -- plain ADT
    | otherwise = case strip unstripped of
        [] -> [] -- empty data declaration
        rest | Just (Pos pos (T name), rest') <- extractInfixConstructor rest ->
            mkTag pos name Constructor : collectRest rest'
        rest | Just (pos, name, rest') <- extractSpecialTypeName rest ->
            mkTag pos name Constructor : collectRest rest'
        Pos pos (T name) : rest ->
            mkTag pos name Constructor : collectRest rest
        Pos _ LParen : Pos pos (T name) : Pos _ RParen : rest ->
            mkTag pos name Constructor : collectRest rest
        rest -> [unexpected prevPos unstripped rest "data * = *"]
    where
    strip :: UnstrippedTokens -> [Token]
    strip = stripOptBang . stripDatatypeContext . dropUntil Equals
          . stripNewlines
    collectRest :: [Token] -> [Tag]
    collectRest tokens
        | (tags@(_:_), rest) <- functionTags ExpectFunctions tokens =
            tags ++ collectRest (dropUntilNextField rest)
    collectRest toks@(Pos _ LParen : _) = collectRest $ stripBalancedParens toks -- dropUntilNextField rest
    collectRest (Pos pipePos Pipe : rest)
        | Just (Pos pos (T name), rest'') <- extractInfixConstructor rest' =
            mkTag pos name Constructor : collectRest rest''
        | Just (pos, name, rest'') <- extractSpecialTypeName rest' =
            mkTag pos name Constructor : collectRest rest''
        | Pos pos (T name) : rest'' <- rest'
        , functionName ExpectConstructors name =
            mkTag pos name Constructor
                : collectRest (dropUntilNextCaseOrRecordStart rest'')
        | Pos _ LParen : Pos pos (T name) : Pos _ RParen : rest'' <- rest'
        , isHaskellConstructorOp name =
            mkTag pos name Constructor
                : collectRest (dropUntilNextCaseOrRecordStart rest'')
        | otherwise = [unexpected pipePos unstripped rest "| not followed by tokens"]
        where
        rest' = stripOptBang $ stripDatatypeContext rest
    collectRest (_ : rest) = collectRest rest
    collectRest [] = []

    stripOptBang :: [Token] -> [Token]
    stripOptBang (Pos _ ExclamationMark : rest) = rest
    stripOptBang ts = ts

    extractInfixConstructor :: [Token] -> Maybe (Token, [Token])
    extractInfixConstructor = extract . stripTypeParam
        where
        extract :: [Token] -> Maybe (Token, [Token])
        extract (tok@(Pos _ (T name)) : rest)
            | isHaskellConstructorOp name = Just (tok, stripTypeParam rest)
        extract (Pos _ Backtick : tok@(Pos _ _) : Pos _ Backtick : rest) =
            Just (tok, stripTypeParam rest)
        extract _ = Nothing

        stripTypeParam :: [Token] -> [Token]
        stripTypeParam input@(Pos _ LParen : _) =
            stripBalancedParens input
        stripTypeParam input@(Pos _ LBracket : _) =
            stripBalancedBrackets input
        stripTypeParam ts = dropWhile isTypeParam $ drop 1 ts

        isTypeParam :: Token -> Bool
        isTypeParam (Pos _ (T name)) = isTypeVarStart name
        isTypeParam _                = False

    dropUntilNextCaseOrRecordStart :: [Token] -> [Token]
    dropUntilNextCaseOrRecordStart = dropWithStrippingBalanced $
        not . \case { Pipe -> True; LBrace -> True; _ -> False }

    dropUntilNextField :: [Token] -> [Token]
    dropUntilNextField = dropWithStrippingBalanced $
        not . \case { Comma -> True; RBrace -> True; Pipe -> True; _ -> False }

stripDatatypeContext :: [Token] -> [Token]
stripDatatypeContext = stripOptContext . stripOptForall

stripOptForall :: [Token] -> [Token]
stripOptForall (Pos _ (T "forall") : rest) = dropUntil Dot rest
stripOptForall xs                          = xs

stripParensKindsTypeVars :: [Token] -> [Token]
stripParensKindsTypeVars (Pos _ LParen : xs)  =
    stripParensKindsTypeVars xs
stripParensKindsTypeVars (Pos _ DoubleColon : xs) =
    stripParensKindsTypeVars $ drop 1 $
    dropWithStrippingBalanced (\case { RParen -> False; _ -> True }) xs
stripParensKindsTypeVars (Pos _ (T name) : xs)
    | isTypeVarStart name = stripParensKindsTypeVars xs
stripParensKindsTypeVars xs = xs

stripOptContext :: [Token] -> [Token]
stripOptContext (stripBalancedParens -> Pos _ Implies : xs) = xs
stripOptContext (stripBalancedParens -> Pos _ Implies : xs) = xs
stripOptContext origToks = go origToks
    where
    go (Pos _ Implies : xs)    = xs
    go (Pos _ Equals : _)      = origToks
    go (Pos _ Pipe : _)        = origToks
    go (Pos _ LBrace : _)      = origToks
    go (Pos _ RBrace : _)      = origToks
    go toks@(Pos _ LParen : _) = go $ stripBalancedParens toks
    go (Pos _ DoubleColon : _) = origToks
    go (_ : xs)                = go xs
    go []                      = origToks

-- | Drop all tokens for which @pred@ returns True, also drop () or []
-- parenthesized expressions.
dropWithStrippingBalanced :: (TokenVal -> Bool) -> [Token] -> [Token]
dropWithStrippingBalanced p = go
    where
    go input@(Pos _ LParen : _)   = go $ stripBalancedParens input
    go input@(Pos _ LBracket : _) = go $ stripBalancedBrackets input
    go (Pos _ tok : xs) | p tok   = go xs
    go xs = xs

stripBalancedParens :: [Token] -> [Token]
stripBalancedParens = stripBalanced LParen RParen

stripBalancedBrackets :: [Token] -> [Token]
stripBalancedBrackets = stripBalanced LBracket RBracket

stripBalancedBraces :: [Token] -> [Token]
stripBalancedBraces = stripBalanced LBrace RBrace

stripBalanced :: TokenVal -> TokenVal -> [Token] -> [Token]
stripBalanced open close (Pos _ tok : xs)
    | tok == open = go 1 xs
    where
    go :: Int -> [Token] -> [Token]
    go 0 ys = ys
    go !n (Pos _ tok' : ys)
        | tok' == open  = go (n + 1) ys
        | tok' == close = go (n - 1) ys
    go !n (_: ys) = go n ys
    go _ []      = []
stripBalanced _ _ xs = xs

gadtTags :: UnstrippedTokens -> [Tag]
gadtTags unstripped = case rest of
    Pos _ LBrace : rest' -> constructorTag ++ collectFields rest'
    _                    -> constructorTag
    where
    (constructorTag, rest) = functionTags ExpectConstructors $ stripNewlines unstripped
    collectFields :: [Token] -> [Tag]
    collectFields (Pos _ Comma : rest) = collectFields rest
    collectFields (Pos _ RBrace : _)   = []
    collectFields tokens
        | (tags@(_:_), rest) <- functionTags ExpectFunctions tokens =
            tags ++ collectFields (dropUntilNextField rest)
        | otherwise = []
    dropUntilNextField :: [Token] -> [Token]
    dropUntilNextField = dropWithStrippingBalanced $
        not . \case { Comma -> True; RBrace -> True; _ -> False }

-- | * => X where X :: * ...
classTags :: SrcPos -> UnstrippedTokens -> [Tag]
classTags prevPos unstripped =
    maybeToList classTag ++
    map (addParent classTag) (concatMap classBodyTags (whereBlock wherePart))
    where
    (classPart, wherePart) = spanUntil KWWhere unstripped
    (classTag, _, _) = recordVanillaOrInfixName isTypeName Class prevPos
        "class * =>" $ stripUntilImplies $ stripNewlines classPart

stripUntilImplies :: [Token] -> [Token]
stripUntilImplies xs =
    case dropUntil Implies xs of
        []  -> xs
        xs' -> xs'

classBodyTags :: UnstrippedTokens -> [Tag]
classBodyTags unstripped = case stripNewlines unstripped of
    Pos _ KWType : Pos pos (T name) : _ -> [mkTag pos name Family]
    Pos _ KWData : Pos pos (T name) : _ -> [mkTag pos name Family]
    tokens                              -> fst $ functionTags ExpectFunctions tokens

-- | Skip to the where and split the indented block below it.
whereBlock :: UnstrippedTokens -> [UnstrippedTokens]
whereBlock =
    concatMap (breakBlocks . UnstrippedTokens) .
    explodeToplevelBracedBlocks .
    dropUntil KWWhere .
    unstrippedTokensOf

instanceTags :: SrcPos -> UnstrippedTokens -> [Tag]
instanceTags prevPos unstripped =
    -- instances can offer nothing but some fresh data constructors since
    -- the actual datatype is really declared in the class declaration
    concatMap (\toks ->
                   let (parent, pos) = extractFamilyName prevPos "newtype instance * =" (stripNewlines toks)
                   in map (addParent parent) $ newtypeTags pos toks)
        (map (dropTokens 1) (filter isNewtypeDecl block))
    ++ concatMap (\toks ->
                   let (parent, pos) = extractFamilyName prevPos "data instance * =" (stripNewlines toks)
                   in map (addParent parent) $ dataConstructorTags pos toks)
        (map (dropTokens 1) (filter isDataDecl block))
    where
    block = whereBlock unstripped

    isNewtypeDecl :: UnstrippedTokens -> Bool
    isNewtypeDecl (UnstrippedTokens (Pos _ KWNewtype : _)) = True
    isNewtypeDecl _ = False

    isDataDecl :: UnstrippedTokens -> Bool
    isDataDecl (UnstrippedTokens (Pos _ KWData : _)) = True
    isDataDecl _ = False

extractFamilyName :: SrcPos -> String -> [Token] -> (Maybe Tag, SrcPos)
extractFamilyName prevPos context toks = (tag, pos)
    where
    (tag, pos, _) = recordVanillaOrInfixName isTypeFamilyName Family prevPos context toks

-- * util

addParent :: Maybe Tag -> Tag -> Tag
addParent parent = onTagVal f
    where
    f (Pos pos (TagVal name typ _)) =
        Pos pos (TagVal name typ parentName)
    parentName :: Maybe Text
    parentName = join $ extractName <$> parent

mkTag :: SrcPos -> Text -> Type -> Tag
mkTag pos name typ = Tag $ Pos pos (TagVal name typ Nothing)

mkRepeatableTag :: SrcPos -> Text -> Type -> Tag
mkRepeatableTag pos name typ =
    RepeatableTag $ Pos pos TagVal
        { tvName   = name
        , tvType   = typ
        , tvParent = Nothing
        }

warning :: SrcPos -> String -> Tag
warning pos warn = Warning $ Pos pos $ show pos ++ ": " ++ warn

unexpected :: SrcPos -> UnstrippedTokens -> [Token] -> String -> Tag
unexpected prevPos (UnstrippedTokens tokensBefore) tokensHere declaration =
    warning pos ("unexpected " ++ thing ++ " after " ++ declaration)
    where
    thing = maybe "end of block" (show . valOf) (Util.mhead tokensHere)
    pos
        | Just t <- Util.mhead tokensHere = posOf t
        | Just t <- Util.mlast tokensBefore = posOf t
        | otherwise = prevPos

isNewline :: Token -> Bool
isNewline (Pos _ (Newline _)) = True
isNewline _                   = False

dropUntil :: TokenVal -> [Token] -> [Token]
dropUntil token = drop 1 . dropWhile (not . (== token) . valOf)

spanUntil :: TokenVal -> UnstrippedTokens
    -> (UnstrippedTokens, UnstrippedTokens)
spanUntil token =
    (UnstrippedTokens *** UnstrippedTokens)
    .  span (not . (== token) . valOf) . unstrippedTokensOf

-- | Crude predicate for Haskell files
isHsFile :: FilePath -> Bool
isHsFile = (`elem` [".hs", ".hsc", ".lhs"]) . FilePath.takeExtension

isLiterateFile :: FilePath -> Bool
isLiterateFile = (==".lhs") . FilePath.takeExtension