{-# LANGUAGE BangPatterns #-}
-- | Character based scanners.
module Text.Lexer.Inchworm.Char
        ( module Text.Lexer.Inchworm

          -- * Driver
        , scanString
        , scanStringIO

          -- * Locations
        , Range (..), Location (..)
        , bumpLocationWithChar

          -- * Scanners
        , scanInteger
        , scanHaskellChar
        , scanHaskellString
        , scanHaskellCommentBlock
        , scanHaskellCommentLine)
where
import Text.Lexer.Inchworm
import Text.Lexer.Inchworm.Source
import System.IO.Unsafe
import qualified Data.Char              as Char
import qualified Data.List              as List
import qualified Numeric                as Numeric


-- Driver ---------------------------------------------------------------------
-- | Scan a string.
scanString
        :: String
        -> Scanner IO Location String a
        -> ([a], Location, String)

scanString str scanner
 = unsafePerformIO 
 $ scanListIO
        (Location 0 0)
        bumpLocationWithChar 
        str scanner


-- | Implementation for `scanString`,
--   that uses the IO monad to maintain internal state.
scanStringIO
        :: String
        -> Scanner IO Location String a
        -> IO ([a], Location, String)

scanStringIO str scanner
 = scanListIO
        (Location 0 0)
        bumpLocationWithChar 
        str scanner


-- Locations ------------------------------------------------------------------
-- | Bump a location using the given character,
--   updating the line and column number as appropriate. 
bumpLocationWithChar :: Char -> Location -> Location
bumpLocationWithChar c (Location line col)
 = case c of 
        '\n'    -> Location (line + 1) 0
        _       -> Location line (col + 1) 


-- Integers -------------------------------------------------------------------
-- | Scan a decimal integer, with optional @-@ and @+@ sign specifiers.
scanInteger 
        :: Monad m 
        => Scanner m loc [Char] (Range loc, Integer)

scanInteger 
 = munchPred Nothing matchInt acceptInt
 where
        matchInt  0 !c  
         = c == '-' || c == '+' || Char.isDigit c

        matchInt  _ !c  = Char.isDigit c

        acceptInt ('+' : cs)
         | null cs      = Nothing

        acceptInt ('-' : cs)
         | null cs      = Nothing

        acceptInt cs    = Just $ read cs

{-# SPECIALIZE INLINE
     scanInteger
        :: Scanner IO Location [Char] (Range Location, Integer)
  #-}

-- Strings --------------------------------------------------------------------
-- | Scan a literal string,    enclosed in double quotes.
-- 
--   We handle the escape codes listed in Section 2.6 of the Haskell Report.
--
scanHaskellString 
        :: Monad   m
        => Scanner m loc [Char] (Range loc, String)

scanHaskellString 
 = munchFold Nothing matchC (False, False) acceptC
 where
        -- Expect double quotes as first char.
        matchC 0 '\"' _                 = Just (False, False)

        -- Matcher is done.
        matchC _  _  (True, _bEscape)   = Nothing

        -- Match a character.
        matchC ix c  (False, bEscape)
         | ix < 1                       = Nothing
         | c == '"',  bEscape           = Just (False, False)
         | c == '"'                     = Just (True,  False)
         | c == '\\', bEscape           = Just (False, False)
         | c == '\\'                    = Just (False, True)
         | otherwise                    = Just (False, False)

        acceptC ('"' : cs)              
         = case decodeString cs of
                (str, ('"' : []))       -> Just str
                _                       -> Nothing

        acceptC _                       = Nothing

{-# SPECIALIZE INLINE
     scanHaskellString
        :: Scanner IO Location [Char] (Range Location, String)  
  #-}


-- Characters -----------------------------------------------------------------
-- | Scan a literal character, enclosed in single quotes.
--   
--   We handle the escape codes listed in Section 2.6 of the Haskell Report.
--
scanHaskellChar 
        :: Monad   m
        => Scanner m loc [Char] (Range loc, Char)

scanHaskellChar 
 = munchFold Nothing matchC (False, False) acceptC
 where
        matchC 0 '\'' _                 = Just (False, False)
        matchC _  _  (True,  _bEscape)  = Nothing

        matchC ix c  (False,  bEscape)
         | ix < 1                       = Nothing
         | c == '\'', bEscape           = Just (False, False)
         | c == '\''                    = Just (True,  False)
         | c == '\\', bEscape           = Just (False, False)
         | c == '\\'                    = Just (False, True)
         | otherwise                    = Just (False, False)

        acceptC ('\'' : cs)          
         = case readChar cs of
                -- Character literals do not support gaps or
                -- escape terminators
                Just (Just c, "\'")     -> Just c
                _                       -> Nothing

        acceptC _                       =  Nothing

{-# SPECIALIZE INLINE
     scanHaskellChar
        :: Scanner IO Location [Char] (Range Location, Char)
  #-}


-- Comments -------------------------------------------------------------------
-- | Scan a Haskell block comment.
scanHaskellCommentBlock 
        :: Monad   m
        => Scanner m loc [Char] (Range loc, String)

scanHaskellCommentBlock
 = munchFold Nothing matchC (' ', True) acceptC
 where
        matchC 0 '{' _                  = Just ('{', True)
        matchC 1 '-' ('{', True)        = Just ('-', True)

        matchC _   _  (_,     False)    = Nothing

        matchC ix  c  (cPrev, True)
         | ix < 2                       = Nothing
         | cPrev == '-' && c == '}'     = Just ('}', False)
         | otherwise                    = Just (c,   True)

        acceptC cc@('{' : '-' : _)      = Just cc
        acceptC _                       = Nothing

{-# SPECIALIZE INLINE
     scanHaskellCommentBlock 
        :: Scanner IO Location [Char] (Range Location, String)
  #-}


-- | Scan a Haskell line comment.
scanHaskellCommentLine 
        :: Monad   m
        => Scanner m loc [Char] (Range loc, String)

scanHaskellCommentLine 
 = munchPred Nothing matchC acceptC
 where
        matchC 0 '-'                    = True
        matchC 1 '-'                    = True
        matchC _ '\n'                   = False
        matchC ix _       
         | ix < 2                       = False
         | otherwise                    = True

        acceptC ('-' : '-' : cs)        = Just cs
        acceptC _                       = Nothing

{-# SPECIALIZE INLINE
     scanHaskellCommentLine
        :: Scanner IO Location [Char] (Range Location, String)
  #-}


-------------------------------------------------------------------------------
-- | Decode escape codes in a string.
decodeString :: String -> (String, String)
decodeString ss0
 = go [] ss0
 where
        go !acc []
         = (reverse acc, [])

        go !acc ss@('\"' : _)
         = (reverse acc, ss)

        go !acc ss@(c : cs)
         = case readChar ss of
                Just (Just c', cs')  -> go (c' : acc) cs'
                Just (Nothing, cs')  -> go       acc  cs'
                Nothing              -> go (c  : acc) cs

-- | Result of reading a character: either a real char, or an empty string
--   that is a successful read, but contains no characters.
--   These empty strings are sometimes required to remove ambiguity:
--   for example,'\SO' and '\SOH' are both valid escapes.
--   To distinguish between the strings ['\SO', 'H'] and ['\SOH'],
--   it is necessary to explicitly terminate the escape for the former:
--   '\SO\&H' means ['\SO', 'H'].
type CharGap = Maybe Char

-- | Read a character literal, handling escape codes.
readChar :: String -> Maybe (CharGap, String)

-- Control characters defined by hex escape codes.
readChar ('\\' : 'x' : cs)
 | [(x, rest)]  <- Numeric.readHex cs   = Just (Just $ Char.chr x, rest)
 | otherwise                            = Nothing

-- Control characters defined by octal escape codes.
readChar ('\\' : 'o' : cs)
 | [(x, rest)]  <- Numeric.readOct cs   = Just (Just $ Char.chr x, rest)
 | otherwise                            = Nothing

-- Control characters defined by carret characters, like \^G
readChar ('\\' : '^' : c : rest)
 | c >= 'A' && c <= 'Z' = Just (Just $ Char.chr (Char.ord c - 1), rest)
 | c == '@'             = Just (Just $ Char.chr 0,  rest)
 | c == '['             = Just (Just $ Char.chr 27, rest)
 | c == '\\'            = Just (Just $ Char.chr 28, rest)
 | c == ']'             = Just (Just $ Char.chr 29, rest)
 | c == '^'             = Just (Just $ Char.chr 30, rest)
 | c == '_'             = Just (Just $ Char.chr 31, rest)

-- Control characters defined by decimal escape codes.
readChar ('\\' : cs)
 | (csDigits, csRest)   <- List.span Char.isDigit cs
 , not $ null csDigits
 = Just (Just $ Char.chr (read csDigits), csRest)

-- Escape terminator '\&': see CharGap above
readChar ('\\' : '&' : rest)
 = Just (Nothing, rest)

-- String gap: two backslashes enclosing whitespace.
-- As above, this is equivalent to an empty string.
readChar ('\\' : cs)
 -- At least one character of whitespace
 | (_:_, '\\' : rest) <- List.span Char.isSpace cs
 = Just (Nothing, rest)

-- Control characters defined by ASCII escape codes.
readChar ('\\' : cs)                    
 = let  go [] = Nothing
        go ((str, c) : moar)
         = case List.stripPrefix str cs of
                Nothing   -> go moar
                Just rest -> Just (Just c, rest)

   in   go escapedChars

-- Just a regular character.
readChar (c : rest)     = Just (Just c, rest)

-- Nothing to read.
readChar _              = Nothing

escapedChars :: [(String, Char)]
escapedChars 
 =      [ ("a",   '\a'),   ("b", '\b'),     ("f",   '\f'),   ("n", '\n')
        , ("r",   '\r'),   ("t", '\t'),     ("v",   '\v'),   ("\\",  '\\')
        , ("\"",  '\"'),   ("\'",  '\'')
        , ("NUL", '\NUL'), ("SOH", '\SOH'), ("STX", '\STX'), ("ETX", '\ETX')
        , ("EOT", '\EOT'), ("ENQ", '\ENQ'), ("ACK", '\ACK'), ("BEL", '\BEL')
        , ("BS",  '\BS'),  ("HT",  '\HT'),  ("LF",  '\LF'),  ("VT",  '\VT')
        , ("FF",  '\FF'),  ("CR",  '\CR'),  ("SO",  '\SO'),  ("SI",  '\SI')
        , ("DLE", '\DLE'), ("DC1", '\DC1'), ("DC2", '\DC2'), ("DC3", '\DC3')
        , ("DC4", '\DC4'), ("NAK", '\NAK'), ("SYN", '\SYN'), ("ETB", '\ETB')
        , ("CAN", '\CAN'), ("EM",  '\EM'),  ("SUB", '\SUB'), ("ESC", '\ESC')
        , ("FS",  '\FS'),  ("GS",  '\GS'),  ("RS",  '\RS'),  ("US",  '\US')
        , ("SP",  '\SP'),  ("DEL", '\DEL')]