{-# LANGUAGE BangPatterns               #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings          #-}
{-# LANGUAGE PatternGuards              #-}
{-# LANGUAGE ScopedTypeVariables        #-}
{-# LANGUAGE ViewPatterns               #-}

{-# OPTIONS_GHC -funbox-strict-fields #-}

module FastTags
    ( isHsFile
    , isLiterateFile
    , merge
    , TokenVal(..)
    , TagVal(..)
    , Type(..)
    , Tag(..)
    , Pos(..)
    , SrcPos(..)
    , UnstrippedTokens(..)
    , breakString
    , stripComments
    , processFile
    , processAll
    , process
    , tokenize
    , stripCpp
    , annotate
    , stripNewlines
    , breakBlocks
    , unstrippedTokensOf
    , split
    )
where
import Control.Arrow ((***), (&&&))
import Control.DeepSeq (NFData, rnf)
import Control.Monad

import qualified Data.ByteString as ByteString
import qualified Data.Char as Char
import Data.Function (on)
import Data.Functor ((<$>))
import qualified Data.IntSet as IntSet
import qualified Data.List as List
import qualified Data.Map as Map
import Data.Monoid (Monoid, (<>), mconcat)
import qualified Data.Text as T
import Data.Text (Text)
import qualified Data.Text.Encoding as Encoding
import qualified Data.Text.Encoding.Error as Encoding.Error

import qualified Language.Preprocessor.Unlit as Unlit
import Text.Printf (printf)


-- * types

data TagVal = TagVal
    !Text -- ^ prefix
    !Text -- ^ name
    !Type -- ^ tag type
    deriving (Show)

instance NFData TagVal where
    rnf (TagVal x y z) = rnf x `seq` rnf y `seq` rnf z

instance Eq TagVal where
    TagVal _ name t == TagVal _ name' t' = name == name' && t == t'

instance Ord TagVal where
    compare (TagVal _ name t) (TagVal _ name' t') =
        name `compare` name' <> t `compare` t'

-- don't swap constructors since we rely that Type < Constructor == True holds
data Type =
    Function
    | Type
    | Constructor
    | Class
    | Module
    | Operator
    | Pattern
    deriving (Eq, Ord, Show)

instance NFData Type where
    rnf t = t `seq` ()

data TokenVal =
    Token !Text !Text
    | Newline !Int -- ^ indentation
    deriving (Show)

tokenName :: TokenVal -> Text
tokenName (Token _ name) = name
tokenName _ = error "cannot extract name from non-Token TokenVal"

data Tag =
    Tag !(Pos TagVal)
    | RepeatableTag !(Pos TagVal)
    | Warning !String
    deriving (Show, Eq, Ord)

partitionTags :: [Tag] -> ([Pos TagVal], [Pos TagVal], [String])
partitionTags ts = go ts [] [] []
    where
    go []                     xs ys zs = (xs, ys, reverse zs)
    go (Tag t : ts)           xs ys zs = go ts (t:xs) ys     zs
    go (RepeatableTag t : ts) xs ys zs = go ts xs     (t:ys) zs
    go (Warning warn : ts)    xs ys zs = go ts xs     ys     (warn:zs)

type Token = Pos TokenVal

-- | Newlines have to remain in the tokens because 'breakBlocks' relies on
-- them.  But they make pattern matching on the tokens unreliable because
-- newlines might be anywhere.  A newtype makes sure that the tokens only get
-- stripped once and that I don't do any pattern matching on unstripped tokens.
newtype UnstrippedTokens = UnstrippedTokens [Token]
    deriving (Show, Monoid)

mapTokens :: ([Token] -> [Token]) -> UnstrippedTokens -> UnstrippedTokens
mapTokens f (UnstrippedTokens tokens) = UnstrippedTokens (f tokens)

unstrippedTokensOf :: UnstrippedTokens -> [Token]
unstrippedTokensOf (UnstrippedTokens tokens) = tokens

-- | Drop @n@ non-newline tokens.
dropTokens :: Int -> UnstrippedTokens -> UnstrippedTokens
dropTokens n = mapTokens (f n)
    where
    f :: Int -> [Token] -> [Token]
    f 0 xs                       = xs
    f _ []                       = []
    f n (Pos _ (Newline _) : xs) = f n xs
    f n (Pos _ (Token _ _) : xs) = f (n - 1) xs

type Line = Pos Text

data Pos a = Pos {
    posOf :: !SrcPos
    , valOf :: !a
    } deriving (Eq, Ord)

instance (NFData a) => NFData (Pos a) where
    rnf (Pos x y) = rnf x `seq` rnf y

data SrcPos = SrcPos {
    _posFile :: !FilePath
    , posLine  :: !Int
    } deriving (Eq, Ord)

instance NFData SrcPos where
    rnf (SrcPos x y) = rnf x `seq` rnf y

instance Show a => Show (Pos a) where
    show (Pos pos val) = show pos ++ ":" ++ show val
instance Show SrcPos where
    show (SrcPos fn line) = fn ++ ":" ++ show line

-- * process

-- | Global processing for when all tags are together.
processAll :: [[Pos TagVal]] -> [Pos TagVal]
processAll =
    sortDups . dropDups isDuplicatePair
        . combineBalanced (mergeOn tagSortingKey)
        . map (dropDups isDuplicatePair)
    where
    isDuplicatePair :: Pos TagVal -> Pos TagVal -> Bool
    isDuplicatePair t t' =
        posOf t == posOf t'
        && tagText t == tagText t'
        && tagType t == tagType t'

combineBalanced :: forall a. (a -> a -> a) -> [a] -> a
combineBalanced f xs = go xs
    where
    go :: [a] -> a
    go [] = error "cannot combine empty list"
    go xs@(_:_) = case combine xs of
        []  -> error "unexpected empty list when combining nonempty lists"
        [x] -> x
        xs' -> go xs'
    combine :: [a] -> [a]
    combine []        = []
    combine [x]       = [x]
    combine (x:x':xs) = f x x' : combine xs

-- | Given multiple matches, vim will jump to the first one.  So sort adjacent
-- tags with the same text by their type.
--
-- Mostly this is so that given a type with the same name as its module,
-- the type will come first.
sortDups :: [Pos TagVal] -> [Pos TagVal]
sortDups = concatMap (sortOn tagType) .  Map.elems . Map.fromAscListWith (++)
    . map (fst . tagSortingKey &&& (:[]))

tagText :: Pos TagVal -> Text
tagText (Pos _ (TagVal _ text _)) = text

tagType :: Pos TagVal -> Type
tagType (Pos _ (TagVal _ _ t)) = t

tagLine :: Pos TagVal -> Int
tagLine (Pos (SrcPos _ line) _) = line

-- | Read tags from one file.
processFile :: FilePath -> Bool -> IO ([Pos TagVal], [String])
processFile fn trackPrefixes =
    process fn trackPrefixes <$> readFileLenient fn

-- | Read a UTF8 file, but don't crash on encoding errors.
readFileLenient :: FilePath -> IO Text
readFileLenient fname = do
    bytes <- ByteString.readFile fname
    return $ Encoding.decodeUtf8With Encoding.Error.lenientDecode bytes

tagSortingKey :: Pos TagVal -> (Text, Type)
tagSortingKey (Pos _ (TagVal _ name t)) = (name, t)

-- | Process one file's worth of tags.
process :: FilePath -> Bool -> Text -> ([Pos TagVal], [String])
process fn trackPrefixes =
    splitAndRemoveRepeats . concatMap blockTags . breakBlocks . stripComments
        . mconcat . map (tokenize trackPrefixes) . stripCpp . annotate fn
        . unlit'
    where
    splitAndRemoveRepeats :: [Tag] -> ([Pos TagVal], [String])
    splitAndRemoveRepeats tags =
        (mergeOn tagSortingKey (sortOn tagSortingKey newTags) earliestRepeats,
            warnings)
        where
        (newTags, repeatableTags, warnings) = partitionTags tags
        earliestRepeats :: [Pos TagVal]
        earliestRepeats = Map.elems $ Map.fromListWith minLine $
            map (tagSortingKey &&& id) repeatableTags
        minLine x y
            | tagLine x < tagLine y = x
            | otherwise             = y
    unlit' :: Text -> Text
    unlit' s
        | isLiterateFile fn = T.pack $ Unlit.unlit fn $ T.unpack s'
        | otherwise = s
        where
        s' :: Text
        s'  | "\\begin{code}" `T.isInfixOf` s
                    && "\\end{code}" `T.isInfixOf` s =
                T.unlines $ filter (not . birdLiterateLine) $ T.lines s
            | otherwise = s
        birdLiterateLine :: Text -> Bool
        birdLiterateLine xs
            | T.null xs = False
            | otherwise = case headt $ T.dropWhile Char.isSpace xs of
                Just '>' -> True
                _ -> False

-- * tokenize

annotate :: FilePath -> Text -> [Line]
annotate fn text =
    [Pos (SrcPos fn num) line | (num, line) <- zip [1..] (T.lines text)]

-- | Also strips out hsc detritus.
stripCpp :: [Line] -> [Line]
stripCpp = filter $ not . ("#" `T.isPrefixOf`) . valOf

tokenize :: Bool -> Line -> UnstrippedTokens
tokenize trackPrefixes (Pos pos line) =
    UnstrippedTokens $ map (Pos pos) (tokenizeLine trackPrefixes line)

spanToken :: Text -> (Text, Text)
spanToken text
    | T.null text = ("", "")
    -- Special case to prevent "--" from consuming too much input so that
    -- closing "-}" becomes unavailable.
    | Just rest <- T.stripPrefix "--}" text = ("-}", rest)
    | Just (sym, rest) <- consume comments text = (sym, rest)
    -- Find symbol that isn't followed by haskellOpChar.
    | Just (sym, rest) <- consume symbols text
            , maybe True (not . haskellOpChar) (headt rest)
        = (sym, rest)
    | c == '\'' = let (token, rest) = breakChar cs in (T.cons c token, rest)
    | c == '"' =
        let (token, rest) = breakString cs in (T.cons c token, rest)
    | state@(token, _) <- spanSymbol (haskellOpChar c) text,
        not (T.null token) = state
    -- This will tokenize differently than haskell should, e.g., 9x will
    -- be "9x" not "9" "x".  But I just need a wordlike chunk, not an
    -- actual token.  Otherwise I'd have to tokenize numbers.
    | otherwise = case T.span (identChar $ Char.isUpper c) text of
        ("", _)       -> (T.singleton c, cs)
        (token, rest) -> (token, rest)
    where
    -- Safe because of null check above.
    Just (c, cs) = T.uncons text
    comments = ["{-", "-}"]
    symbols = ["--", "=>", "->", "::"]

tokenizeLine :: Bool -> Text -> [TokenVal]
tokenizeLine trackPrefixes text = Newline nspaces : go spaces line
    where
    nspaces = fromIntegral $ T.count " " spaces + T.count "\t" spaces * 8
    (spaces, line) = T.break (not . Char.isSpace) text
    go :: Text -> Text -> [TokenVal]
    go oldPrefix unstripped
        | T.null stripped = []
        | otherwise       =
            let (token, rest) = spanToken stripped
                newPrefix     = if trackPrefixes
                                then oldPrefix <> spaces <> token
                                else T.empty
            in Token newPrefix token : go newPrefix rest
        where (spaces, stripped) = T.break (not . Char.isSpace) unstripped

startIdentChar :: Char -> Bool
startIdentChar c = Char.isAlpha c || c == '_'

identChar :: Bool -> Char -> Bool
identChar considerDot c = Char.isAlphaNum c || c == '\'' || c == '_'
    || c == '#' || considerDot && c == '.'

-- unicode operators are not supported yet
haskellOpChar :: Char -> Bool
haskellOpChar c = IntSet.member (Char.ord c) opChars
    where
    opChars :: IntSet.IntSet
    opChars = IntSet.fromList $ map Char.ord "-!#$%&*+./<=>?@^|~:\\"

isTypeVarStart :: Text -> Bool
isTypeVarStart x = case headt x of
    Just c -> Char.isLower c || c == '_'
    _ -> False

-- | Span a symbol, making sure to not eat comments.
spanSymbol :: Bool -> Text -> (Text, Text)
spanSymbol considerColon text
    | Just res <- haskellOp text [] = res
    | any (`T.isPrefixOf` post) [",", "--", "-}", "{-"] = split
    | Just (c, cs) <- T.uncons post, c == '-' || c == '{' =
        let (pre2, post2) = spanSymbol considerColon cs
        in (pre <> T.cons c pre2, post2)
    | otherwise = split
    where
    split@(pre, post) = T.break
        (\c -> T.any (==c) "-{," || not (symbolChar considerColon c)) text

    haskellOp :: Text -> [Char] -> Maybe (Text, Text)
    haskellOp txt op
      | Just (c, cs) <- T.uncons txt
      , haskellOpChar c
      , not $ "-}" `T.isPrefixOf` txt =
          haskellOp cs $ c : op
      | null op   = Nothing
      | otherwise = Just (T.pack $ reverse op, txt)

symbolChar :: Bool -> Char -> Bool
symbolChar considerColon c =
    (Char.isSymbol c || Char.isPunctuation c)
    && (not (c `elem` ("(),;[]`{}_:\"'"::String)) || considerColon && c == ':')

breakChar :: Text -> (Text, Text)
breakChar text = case headt text of
    Nothing -> ("", "")
    Just '\\' -> T.splitAt 3 text
    _ -> T.splitAt 2 text

-- TODO \ continuation isn't supported.  I'd have to tokenize at the file
-- level instead of the line level.
breakString :: Text -> (Text, Text)
breakString = (T.pack . reverse *** T.pack) . go [] . T.unpack
    where
    go :: String -> String -> (String, String)
    go s []             = (s, [])
    go s ('"':xs)       = ('"': s, xs)
    go s ('\\':[])      = (s, "\\")
    -- handle string continuation
    go s ('\\':'\n':xs) = go s $ dropBackslash $
        dropWhile (\c -> c /= '\\' && c /= '"') xs
    go s ('\\':x:xs)    = go (x : '\\': s) xs
    go s (x:xs)         = go (x : s) xs

    dropBackslash :: String -> String
    dropBackslash ('\\':xs) = xs
    dropBackslash xs        = xs

stripComments :: UnstrippedTokens -> UnstrippedTokens
stripComments = mapTokens (go 0)
    where
    go :: Int -> [Token] -> [Token]
    go _ [] = []
    go nest (pos@(Pos _ token) : rest)
        | token `nameStartsWith` "{-"                     = go (nest + 1) rest
        | token `nameEndsWith` "-}"                       = go (nest - 1) rest
        | nest == 0 && tokenNameSatisfies token isComment =
            go nest (dropLine rest)
        | nest > 0                                        = go nest rest
        | otherwise                                       = pos : go nest rest
    dropLine :: [Token] -> [Token]
    dropLine = dropWhile (not . isNewline)
    isComment :: Text -> Bool
    isComment name =
        "--" `T.isPrefixOf` name
        && T.all (\c -> not (haskellOpChar c) || c == '-') (T.drop 2 name)

-- | Break the input up into blocks based on indentation.
breakBlocks :: UnstrippedTokens -> [UnstrippedTokens]
breakBlocks =
    map UnstrippedTokens . filter (not . null)
        . go . filterBlank . unstrippedTokensOf
    where
    go :: [Token] -> [[Token]]
    go []     = []
    go tokens = pre : go post
        where (pre, post) = breakBlock tokens
    -- Blank lines mess up the indentation.
    filterBlank :: [Token] -> [Token]
    filterBlank [] = []
    filterBlank (Pos _ (Newline _) : xs@(Pos _ (Newline _) : _)) =
        filterBlank xs
    filterBlank (x:xs) = x : filterBlank xs

-- | Take until a newline, then take lines until the indent established after
-- that newline decreases. Or, alternatively, if "{" is encountered then count
-- it as a block until closing "}" is found taking nesting into account.
breakBlock :: [Token] -> ([Token], [Token])
breakBlock (t@(Pos _ tok) : ts) = case tok of
    Newline indent -> collectIndented indent ts
    Token _ "{"    -> collectBracedBlock breakBlock ts 1
    _              -> remember t $ breakBlock ts
    where
    collectIndented :: Int -> [Token] -> ([Token], [Token])
    collectIndented indent tsFull@(t@(Pos _ tok) : ts) = case tok of
        Newline n | n <= indent -> ([], tsFull)
        Token _ "{" ->
            remember t $ collectBracedBlock (collectIndented indent) ts 1
        _           -> remember t $ collectIndented indent ts
    collectIndented _ [] = ([], [])

    collectBracedBlock :: ([Token] -> ([Token], [Token])) -> [Token] -> Int
        -> ([Token], [Token])
    collectBracedBlock _    []                           _ = ([], [])
    collectBracedBlock cont ts                           0 = cont ts
    collectBracedBlock cont (t@(Pos _ (Token _ "{")) : ts) n =
      remember t $ collectBracedBlock cont ts $! n + 1
    collectBracedBlock cont (t@(Pos _ (Token _ "}")) : ts) n =
      remember t $ collectBracedBlock cont ts $! n - 1
    collectBracedBlock cont (t:ts)                       n =
      remember t $ collectBracedBlock cont ts n

    remember :: Token -> ([Token], [Token]) -> ([Token], [Token])
    remember t (xs, ys) = (t : xs, ys)
breakBlock [] = ([], [])

-- * extract tags

-- | Get all the tags in one indented block.
-- TODO clean this up to require less nesting, and dropDataContext duplication
blockTags :: UnstrippedTokens -> [Tag]
blockTags unstripped = case stripNewlines unstripped of
    [] -> []
    Pos _ (Token _ "module") : Pos pos (Token prefix name) : _ ->
        [mkTag pos prefix (snd (T.breakOnEnd "." name)) Module]
    Pos _ (Token _ "pattern") : Pos pos (Token prefix name) : _
        | maybe False Char.isUpper (headt name) ->
            [mkTag pos prefix name Pattern]
    Pos _ (Token _ "foreign") : decl -> foreignTags decl
    -- newtype instance * = ...
    Pos _ (Token _ "newtype") : Pos _ (Token _ "instance")
            : (dropDataContext -> Pos pos _: rest) ->
        newtypeTags pos rest
    -- newtype X * = X *
    Pos _ (Token _ "newtype")
            : (dropDataContext -> whole@(tok@(Pos pos (Token _ name)) : rest))
        | isTypeName name -> tokToTag tok Type : newtypeTags pos rest
        | otherwise -> tok' : newtypeTags pos' rest'
        where (pos', tok', rest') = recordInfixName Type whole
    -- type family X ...
    Pos _ (Token _ "type") : Pos _ (Token _ "family")
            : (dropDataContext -> whole@(tok@(Pos _ (Token _ name)) : _))
        | isTypeFamilyName name -> [tokToTag tok Type]
        | otherwise -> [tok']
        where (_, tok', _) = recordInfixName Type whole
    -- type X * = ...
    Pos _ (Token _ "type")
            : (dropDataContext -> whole@(tok@(Pos _ (Token _ name)) : _))
        | isTypeName name -> [tokToTag tok Type]
        | otherwise -> [tok']
        where (_, tok', _) = recordInfixName Type whole
    -- data family X ...
    Pos _ (Token _ "data") : Pos _ (Token _ "family")
            : (dropDataContext -> tok@(Pos _ (Token _ name)) : rest)
        | isTypeFamilyName name -> [tokToTag tok Type]
        | otherwise -> [tok']
        where (_, tok', _) = recordInfixName Type rest
    -- data instance * = ...
    -- data instance * where ...
    Pos _ (Token _ "data") : Pos _ (Token _ "instance")
            : (dropDataContext -> Pos pos _: _) ->
        dataConstructorTags pos (dropTokens 2 unstripped)
    -- data X * = X { X :: *, X :: * }
    -- data X * where ...
    Pos _ (Token _ "data")
            : (dropDataContext -> tok@(Pos pos (Token _ name)) : rest)
        | isTypeName name -> tokToTag tok Type
            : dataConstructorTags pos (dropTokens 2 unstripped)
      -- if token after data is not a type name then it isn't
      -- infix type as well since it may be only '(' or some
      -- lowercase name, either of which is not type constructor
        | otherwise -> let (pos', tok, _) = recordInfixName Type rest
           in tok : dataConstructorTags pos' (dropTokens 1 unstripped)
    -- class * => X where X :: * ...
    Pos pos (Token _ "class") : _ -> classTags pos (dropTokens 1 unstripped)

    Pos _ (Token _ "infix") : _ -> []
    Pos _ (Token _ "infixl") : _ -> []
    Pos _ (Token _ "infixr") : _ -> []
    -- instance * where data * = X :: * ...
    Pos pos (Token _ "instance") : _ ->
        instanceTags pos (dropTokens 1 unstripped)
    -- x, y, z :: *
    stripped -> toplevelFunctionTags stripped

isTypeFamilyName :: Text -> Bool
isTypeFamilyName = maybe False (\c -> Char.isUpper c || haskellOpChar c) . headt

isTypeName  :: Text -> Bool
isTypeName x = case headt x of
    Just c -> Char.isUpper c || c == ':'
    _ -> False

dropDataContext :: [Token] -> [Token]
dropDataContext = stripParensKindsTypeVars . stripOptContext

recordInfixName :: Type -> [Token] -> (SrcPos, Tag, [Token])
recordInfixName tokenType tokens = (pos, tokToTag tok tokenType, rest)
    where (tok@(Pos pos _) : rest) = dropInfixTypeStart tokens

-- same as dropWhile with counting
dropInfixTypeStart :: [Token] -> [Token]
dropInfixTypeStart tokens = dropWhile f tokens
    where
    f (Pos _ (Token _ name)) = isInfixTypePrefix name || name == "`"
    f _                      = False

    isInfixTypePrefix :: Text -> Bool
    isInfixTypePrefix =
        maybe False ((\c -> Char.isLower c || c == '(')) . headt

-- | It's easier to scan for tokens without pesky newlines popping up
-- everywhere.  But I need to keep the newlines in in case I hit a @where@
-- and need to call 'breakBlocks' again.
stripNewlines :: UnstrippedTokens -> [Token]
stripNewlines = filter (not . isNewline) . (\(UnstrippedTokens t) -> t)

-- | Tags from foreign import.
--
-- e.g. @foreign import ccall safe \"name\" c_name :: ...@ will produce a tag
-- for @c_name@.
foreignTags :: [Token] -> [Tag]
foreignTags decl = case decl of
    Pos _ (Token _ "import") : decl'
        | name : _ <- dropBefore ((=="::") . tokenName . valOf) decl' ->
            [tokToTag name Pattern]
    _ -> []

toplevelFunctionTags :: [Token] -> [Tag]
toplevelFunctionTags toks = case tags of
    -- Tags of toplevel functions are all repeatable, even the ones that come
    -- from the type signature because there will definitely be tags from the
    -- body and they should be sorted out if type signature is present.
    [] -> functionTagsNoSig toks
    _  -> map toRepeatableTag $ tags
    where
    -- first try to detect tags from type signature, if it fails then
    -- do the actual work of detecting from body
    (tags, _) = functionTags False toks
    toRepeatableTag :: Tag -> Tag
    toRepeatableTag (Tag t) = RepeatableTag t
    toRepeatableTag t       = t

functionTagsNoSig :: [Token] -> [Tag]
functionTagsNoSig toks = go toks
    where
    go :: [Token] -> [Tag]
    go []                           = []
    go toks@(Pos _ (Token _ "(") : _) = go $ stripBalancedParens toks
    go (Pos pos (Token prefix name) : ts)
        -- this function does not analyze type signatures
        | name == "::" = []
        | name == "!" || name == "~" || name == "@" = go ts
        | name == "=" || name == "|" = case stripParens toks of
            Pos pos (Token prefix name) : _
                | functionName False name  ->
                    [mkRepeatableTag pos prefix name Function]
                | T.all haskellOpChar name ->
                    [mkRepeatableTag pos prefix name Operator]
            _ -> []
        | name == "`" = case ts of
            Pos pos' (Token prefix' name') : _
                | functionName False name' ->
                    [mkRepeatableTag pos' prefix' name' Function]
            _ -> go ts
      | T.all haskellOpChar name   = [mkRepeatableTag pos prefix name Operator]
      | otherwise                  = go ts
    stripParens :: [Token] -> [Token]
    stripParens = dropWhile ((`hasName` "(") . valOf)

-- | Get tags from a function type declaration: token , token , token ::
-- Return the tokens left over.
functionTags :: Bool -- ^ expect constructors, not functions
    -> [Token] -> ([Tag], [Token])
functionTags constructors = go []
    where
    opTag   = if constructors then Constructor else Operator
    funcTag = if constructors then Constructor else Function
    go :: [Tag] -> [Token] -> ([Tag], [Token])
    go tags (Pos _ (Token _ "(") : Pos pos (Token _ name)
            : Pos _ (Token prefix ")") : Pos _ (Token _ "::") : rest) =
        (reverse $ mkTag pos prefix name opTag : tags, rest)
    go tags (Pos pos (Token prefix name) : Pos _ (Token _ "::") : rest)
        | functionName constructors name =
            (reverse $ mkTag pos prefix name funcTag : tags, rest)
    go tags (Pos _ (Token _ "(") : Pos pos (Token _ name)
            : Pos _ (Token prefix ")") : Pos _ (Token _ ",") : rest) =
        go (mkTag pos prefix name opTag : tags) rest
    go tags (Pos pos (Token prefix name) : Pos _ (Token _ ",") : rest)
        | functionName constructors name =
            go (mkTag pos prefix name funcTag : tags) rest
    go tags tokens = (tags, tokens)

functionName :: Bool -> Text -> Bool
functionName constructors text = isFunction text
    where
    isFunction text = case T.uncons text of
        Just (c, cs) ->
            firstChar c && startIdentChar c && T.all (identChar True) cs
        Nothing      -> False
    firstChar = if constructors
                then Char.isUpper
                else \c -> Char.isLower c || c == '_'

-- | * = X *
newtypeTags :: SrcPos -> [Token] -> [Tag]
newtypeTags prevPos tokens = case dropUntil "=" tokens of
    Pos pos (Token prefix name) : rest ->
        let constructor = mkTag pos prefix name Constructor
        in  case rest of
            Pos _ (Token _ "{") : Pos funcPos (Token funcPrefix funcName) : _ ->
                [constructor, mkTag funcPos funcPrefix funcName Function]
            _ ->
                [constructor]
    rest -> unexpected prevPos (UnstrippedTokens tokens) rest "newtype * ="

-- | [] (empty data declaration)
-- * = X { X :: *, X :: * }
-- * where X :: * X :: *
-- * = X | X
dataConstructorTags :: SrcPos -> UnstrippedTokens -> [Tag]
dataConstructorTags prevPos unstripped
    -- GADT
    | any ((`hasName` "where") . valOf) (unstrippedTokensOf unstripped) =
        concatMap gadtTags (whereBlock unstripped)
    -- plain ADT
    | otherwise = case strip unstripped of
        [] -> [] -- empty data declaration
        rest | Just (Pos pos (Token prefix name), rest') <-
                extractInfixConstructor rest ->
            mkTag pos prefix name Constructor : collectRest rest'
        Pos pos (Token prefix name) : rest ->
            mkTag pos prefix name Constructor : collectRest rest
        rest -> unexpected prevPos unstripped rest "data * ="
    where
    strip = stripOptBang . stripOptContext . stripOptForall . dropUntil "="
         . stripNewlines
    collectRest :: [Token] -> [Tag]
    collectRest tokens
        | (tags@(_:_), rest) <- functionTags False tokens =
            tags ++ collectRest (dropUntilNextField rest)
    collectRest (Pos pipePos (Token _ "|") : rest)
        | Just (Pos pos (Token prefix name), rest'') <-
                extractInfixConstructor rest' =
            mkTag pos prefix name Constructor : collectRest rest''
        | Pos pos (Token prefix name) : rest'' <- rest' =
            mkTag pos prefix name Constructor
                : collectRest (dropUntilNextCaseOrRecordStart rest'')
        | otherwise = error $
            printf "syntax error@%d: | not followed by tokens\n"
                (posLine pipePos)
        where
        rest' = stripOptBang $ stripOptContext $ stripOptForall rest
    collectRest (_ : rest) = collectRest rest
    collectRest [] = []

    stripOptBang :: [Token] -> [Token]
    stripOptBang ((Pos _ (Token _ "!")) : rest) = rest
    stripOptBang ts = ts

    extractInfixConstructor :: [Token] -> Maybe (Token, [Token])
    extractInfixConstructor = extract . stripTypeParam
        where
        extract :: [Token] -> Maybe (Token, [Token])
        extract (tok@(Pos _ (Token _ name)) : rest)
            | ":" `T.isPrefixOf` name = Just (tok, stripTypeParam rest)
        extract (Pos _ (Token _ "`") : tok@(Pos _ _) : Pos _ (Token _ "`")
                : rest) =
            Just (tok, stripTypeParam rest)
        extract _ = Nothing
        stripTypeParam :: [Token] -> [Token]
        stripTypeParam input@((Pos _ (Token _ "(")) : _) =
            stripBalancedParens input
        stripTypeParam input@((Pos _ (Token _ "[")) : _) =
            stripBalancedBrackets input
        stripTypeParam ts = drop 1 ts

    dropUntilNextCaseOrRecordStart :: [Token] -> [Token]
    dropUntilNextCaseOrRecordStart =
        dropWithStrippingBalanced (\x -> not $ x == "|" || x == "{")

    dropUntilNextField :: [Token] -> [Token]
    dropUntilNextField =
        dropWithStrippingBalanced (\x -> not $ x == "," || x == "}" || x == "|")

stripOptForall :: [Token] -> [Token]
stripOptForall (Pos _ (Token _ "forall") : rest) = dropUntil "." rest
stripOptForall xs                               = xs

stripParensKindsTypeVars :: [Token] -> [Token]
stripParensKindsTypeVars (Pos _ (Token _ "(") : xs)  =
    stripParensKindsTypeVars xs
stripParensKindsTypeVars (Pos _ (Token _ "::") : xs) =
    stripParensKindsTypeVars $ tail $ dropWithStrippingBalanced (/= ")") xs
stripParensKindsTypeVars (Pos _ (Token _ name) : xs)
    | isTypeVarStart name = stripParensKindsTypeVars xs
stripParensKindsTypeVars xs = xs

stripOptContext :: [Token] -> [Token]
stripOptContext (stripBalancedParens -> (Pos _ (Token _ "=>") : xs)) = xs
stripOptContext (stripSingleClassContext -> Pos _ (Token _ "=>") : xs) = xs
stripOptContext xs = xs

stripSingleClassContext :: [Token] -> [Token]
stripSingleClassContext (Pos _ (Token _ name) : xs)
    | maybe False Char.isUpper (headt name) =
        dropWithStrippingBalanced isTypeVarStart xs
stripSingleClassContext xs = xs

-- | Drop all tokens for which @pred@ returns True, also drop
-- any parenthesized expressions.
dropWithStrippingBalanced :: (Text -> Bool) -> [Token] -> [Token]
dropWithStrippingBalanced pred input@(Pos _ (Token _ name) : xs)
    | name == "(" = dropWithStrippingBalanced pred $ stripBalancedParens input
    | name == "[" = dropWithStrippingBalanced pred $ stripBalancedBrackets input
    | pred name   = dropWithStrippingBalanced pred xs
dropWithStrippingBalanced _ xs = xs

stripBalancedParens :: [Token] -> [Token]
stripBalancedParens = stripBalanced "(" ")"

stripBalancedBrackets :: [Token] -> [Token]
stripBalancedBrackets = stripBalanced "[" "]"

stripBalanced :: Text -> Text -> [Token] -> [Token]
stripBalanced open close (Pos _ (Token _ name) : xs)
    | name == open = go 1 xs
    where
    go :: Int -> [Token] -> [Token]
    go 0 xs = xs
    go !n (Pos _ (Token _ name) : xs)
        | name == open  = go (n + 1) xs
        | name == close = go (n - 1) xs
        | otherwise     = go n       xs
    go n (_: xs)                     = go n xs
    go _ []                          = []
stripBalanced _ _ xs = xs

gadtTags :: UnstrippedTokens -> [Tag]
gadtTags = fst . functionTags True . stripNewlines

-- | * => X where X :: * ...
classTags :: SrcPos -> UnstrippedTokens -> [Tag]
classTags prevPos unstripped =
    case dropDataContext $ stripNewlines unstripped of
        whole@(tok@(Pos _ (Token _ name)) : _)
          -- Drop the where and start expecting functions.
            | isTypeName name -> tokToTag tok Class : cont
            | otherwise ->
                let (_, tok, _) = recordInfixName Class whole
                in tok : concatMap classBodyTags (whereBlock unstripped)
            where cont = concatMap classBodyTags (whereBlock unstripped)
        rest -> unexpected prevPos unstripped rest "class * =>"

classBodyTags :: UnstrippedTokens -> [Tag]
classBodyTags unstripped = case stripNewlines unstripped of
    Pos _ (Token _ typedata) : Pos pos (Token prefix name) : _
        | typedata `elem` ["type", "data"] -> [mkTag pos prefix name Type]
    tokens -> fst $ functionTags False tokens

-- | Skip to the where and split the indented block below it.
whereBlock :: UnstrippedTokens -> [UnstrippedTokens]
whereBlock = breakBlocks . mapTokens (dropUntil "where")

instanceTags :: SrcPos -> UnstrippedTokens -> [Tag]
instanceTags prevPos unstripped =
    -- instances can offer nothing but some fresh data constructors since
    -- the actual datatype is really declared in the class declaration
    concatMap (newtypeTags prevPos . unstrippedTokensOf)
        (filter isNewtypeDecl block)
    ++ concatMap (dataConstructorTags prevPos)
        (filter isDataDecl block)
    where
    block = whereBlock unstripped

    isNewtypeDecl :: UnstrippedTokens -> Bool
    isNewtypeDecl (UnstrippedTokens (Pos _ (Token _ "newtype") : _)) = True
    isNewtypeDecl _ = False

    isDataDecl :: UnstrippedTokens -> Bool
    isDataDecl (UnstrippedTokens (Pos _ (Token _ "data") : _)) = True
    isDataDecl _ = False

-- * util

mkTag :: SrcPos -> Text -> Text -> Type -> Tag
mkTag pos prefix name typ = Tag $ Pos pos (TagVal prefix name typ)

mkRepeatableTag :: SrcPos -> Text -> Text -> Type -> Tag
mkRepeatableTag pos prefix name typ =
    RepeatableTag $ Pos pos (TagVal prefix name typ)

tokToTag :: Token -> Type -> Tag
tokToTag (Pos pos (Token prefix name)) t = mkTag pos prefix name t

warning :: SrcPos -> String -> Tag
warning pos warn = Warning $ show pos ++ ": " ++ warn

unexpected :: SrcPos -> UnstrippedTokens -> [Token] -> String -> [Tag]
unexpected prevPos (UnstrippedTokens tokensBefore) tokensHere declaration =
    [warning pos ("unexpected " ++ thing ++ " after " ++ declaration)]
    where
    thing = maybe "end of block" (show . valOf) (mhead tokensHere)
    pos
        | Just t <- mhead tokensHere = posOf t
        | Just t <- mlast tokensBefore = posOf t
        | otherwise = prevPos

isNewline :: Token -> Bool
isNewline (Pos _ (Newline _)) = True
isNewline _                   = False

hasName :: TokenVal -> Text -> Bool
hasName tok text = tokenNameSatisfies tok (== text)

nameStartsWith :: TokenVal -> Text -> Bool
nameStartsWith tok text = tokenNameSatisfies tok (text `T.isPrefixOf`)

nameEndsWith :: TokenVal -> Text -> Bool
nameEndsWith tok text = tokenNameSatisfies tok (text `T.isSuffixOf`)

tokenNameSatisfies :: TokenVal -> (Text -> Bool) -> Bool
tokenNameSatisfies (Token _ name) pred = pred name
tokenNameSatisfies _              _    = False

-- * generic utils

-- | Try to match one of the given prefixes.
consume :: [Text] -> Text -> Maybe (Text, Text) -- ^ (prefix, remainder)
consume prefixes t =
    msum [(,) prefix <$> T.stripPrefix prefix t | prefix <- prefixes]

-- | Drop until the element before the matching one.  Return [] if the function
-- never matches.
dropBefore :: (a -> Bool) -> [a] -> [a]
dropBefore f = go
    where
    go [] = []
    go [_] = []
    go xs@(_ : rest@(y:_))
        | f y = xs
        | otherwise = go rest

dropDups :: (a -> a -> Bool) -> [a] -> [a]
dropDups cmp (x:xs) = go x xs
    where
    go a [] = [a]
    go a (b:bs)
        | cmp a b   = go a bs
        | otherwise = a : go b bs
dropDups _ [] = []

dropUntil :: Text -> [Token] -> [Token]
dropUntil token = drop 1 . dropWhile (not . (`hasName` token) . valOf)

sortOn :: (Ord k) => (a -> k) -> [a] -> [a]
sortOn key = List.sortBy (compare `on` key)

-- | Split list into chunks delimited by specified element.
split :: (Eq a) => a -> [a] -> [[a]]
split _ [] = []
split x xs = xs': split x (drop 1 xs'')
    where (xs', xs'') = break (==x) xs

-- | Crude predicate for Haskell files
isHsFile :: FilePath -> Bool
isHsFile fn = ".hs" `List.isSuffixOf` fn  || ".hsc" `List.isSuffixOf` fn
    || isLiterateFile fn

isLiterateFile :: FilePath -> Bool
isLiterateFile fn = ".lhs" `List.isSuffixOf` fn

merge :: Ord a => [a] -> [a] -> [a]
merge = mergeBy compare

mergeOn :: (Ord b) => (a -> b) -> [a] -> [a] -> [a]
mergeOn f = mergeBy (compare `on` f)

mergeBy :: (a -> a -> Ordering) -> [a] -> [a] -> [a]
mergeBy f xs ys = go xs ys
    where
    go []     ys     = ys
    go xs     []     = xs
    go (x:xs) (y:ys) = case f x y of
        EQ -> x: y: go xs ys
        LT -> x: go xs (y:ys)
        GT -> y: go (x:xs) ys

headt :: Text -> Maybe Char
headt = fmap fst . T.uncons

mhead :: [a] -> Maybe a
mhead [] = Nothing
mhead (x:_) = Just x

mlast :: [a] -> Maybe a
mlast xs
    | null xs = Nothing
    | otherwise = Just (last xs)