-------------------------------------------------------------------------------
-- |
-- Module      :  CCO.Lexing
-- Copyright   :  (c) 2008 Utrecht University
-- License     :  All rights reserved
--
-- Maintainer  :  stefan@cs.uu.nl
-- Stability   :  provisional
-- Portability :  portable
--
-- A library of lexer combinators that expose their functionality through an
-- 'Applicative' interface.
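--
-- For example (a minimal sketch), a lexer that recognises a letter followed
-- by a digit, or the keyword @nil@:
--
-- > item :: Lexer String
-- > item = (\c d -> [c, d]) <$> letter <*> digit <|> string "nil"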
--
-------------------------------------------------------------------------------

module CCO.Lexing (
    -- * Lexers
    Lexer                                -- abstract, instances: Functor,
                                         -- Applicative, Alternative
  , satisfy                              -- :: (Char -> Bool) -> Lexer Char
  , message                              -- :: String -> Lexer a

    -- * Derived lexers
  , anyChar                              -- :: Lexer Char
  , anyCharFrom                          -- :: [Char] -> Lexer Char
  , anyCharBut                           -- :: [Char] -> Lexer Char
  , range                                -- :: (Char, Char) -> Lexer Char
  , notInRange                           -- :: (Char, Char) -> Lexer Char
  , char                                 -- :: Char -> Lexer Char
  , string                               -- :: String -> Lexer String
  , space                                -- :: Lexer Char
  , lower                                -- :: Lexer Char
  , upper                                -- :: Lexer Char
  , letter                               -- :: Lexer Char
  , alpha                                -- :: Lexer Char
  , alphaNum                             -- :: Lexer Char
  , digit                                -- :: Lexer Char
  , digit_                               -- :: Lexer Int
  , binDigit                             -- :: Lexer Char
  , binDigit_                            -- :: Lexer Int
  , octDigit                             -- :: Lexer Char
  , octDigit_                            -- :: Lexer Int
  , hexDigit                             -- :: Lexer Char
  , hexDigit_                            -- :: Lexer Int

    -- * Ignoring recognised input
  , ignore                               -- :: Lexer a -> Lexer b

    -- * Symbols
  , LexicalUnit (Token, Error, Msg)
  , Symbols (Symbols)

    -- * Lexing
  , lex                                  -- :: Lexer a -> Source -> String ->
                                         --    Symbols a

    -- * Utilities
  , tokens                               -- :: Symbols a -> [(SourcePos, a)]
  , tokens_                              -- :: Symbols a -> [a]
) where

import Prelude hiding (lex)
import CCO.SourcePos (Source, Pos (Pos, EOF), SourcePos (SourcePos))
import Control.Applicative (Applicative (..), Alternative (..), (<$>))
import Data.Char

-------------------------------------------------------------------------------
-- Steps
-------------------------------------------------------------------------------

-- | A @Steps@ represents the tokenisation associated with the recognition of
-- a (part of a) lexeme.
data Steps a = None         -- ^ No tokenisation yet.
             | Return a     -- ^ Produces as a token a value of type @a@.
             | Ignore       -- ^ Produces a value of type @a@ but ignores it as
                            --   far as tokenisation is concerned.
                            --   (Typically used on lexemes that constitute
                            --   comments or whitespace.)
             | Fail String  -- ^ Fail with a specified error message.

instance Functor Steps where
  fmap _ None       = None
  fmap f (Return x) = Return (f x)
  fmap _ Ignore     = Ignore
  fmap _ (Fail msg) = Fail msg

instance Applicative Steps where
  pure x = Return x

  (Fail msg) <*> _          = Fail msg
  _          <*> (Fail msg) = Fail msg
  None       <*> _          = None
  _          <*> None       = None
  Return f   <*> Return x   = Return (f x)
  _          <*> _          = Ignore

instance Alternative Steps where
  empty = None

  steps@(Return _) <|> _                = steps
  steps@Ignore     <|> _                = steps
  _                <|> steps@(Return _) = steps
  _                <|> steps@Ignore     = steps
  None             <|> steps            = steps
  steps            <|> _                = steps

-------------------------------------------------------------------------------
-- Lexers
-------------------------------------------------------------------------------

-- | Type of lexers that produce tokens of type @a@.
data Lexer a = Trie (Steps a) [(Char -> Bool, Lexer (Char -> a))]
               -- ^ A @Lexer@ consists of a @Steps@-component that represents
               --   the tokenisation for the lexeme currently recognised and
               --   a multi-mapping from characters to lexers that represent
               --   the next state of the scanner.
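               --
               --   For example, @char 'x'@ unfolds to
               --   @Trie None [((== 'x'), Trie (Return id) [])]@: no
               --   tokenisation at the root and a single edge to a state
               --   that returns the recognised character.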

instance Functor Lexer where
  fmap f (Trie steps edges) =
    Trie (fmap f steps) [(p, fmap (f .) next) | (p, next) <- edges]

instance Applicative Lexer where
  pure x = Trie (pure x) []
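
  -- In @f <*> x@, a lexeme of @f@ is followed immediately by a lexeme of
  -- @x@; hence, the edges of the left-hand trie are extended with the
  -- complete right-hand lexer, while tokenisations already available on the
  -- left are pushed into the edges of the right-hand trie.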

  Trie None edges <*> lexer = Trie None
    [(p, flip <$> next <*> lexer) | (p, next) <- edges]
  Trie (Fail msg) edges <*> lexer = Trie (Fail msg)
    [(p, flip <$> next <*> lexer) | (p, next) <- edges]
  Trie steps edges <*> lexer@(Trie steps' edges') = Trie (steps <*> steps') $
    [(p, smap ((.) <$> steps) next) | (p, next) <- edges'] ++
    [(p, flip <$> next <*> lexer) | (p, next) <- edges]

instance Alternative Lexer where
  empty = Trie empty []
  Trie steps edges <|> Trie steps' edges' =
    Trie (steps <|> steps') (edges ++ edges')

-- | Maps a tokenisation over a 'Lexer'.  
smap :: Steps (a -> b) -> Lexer a -> Lexer b
smap steps (Trie steps' edges) = Trie (steps <*> steps')
  [(p, smap ((.) <$> steps) next) | (p, next) <- edges] 

-- | A 'Lexer' that recognises any character that satisfies a specified
-- predicate.
satisfy :: (Char -> Bool) -> Lexer Char
satisfy p = Trie None [(p, Trie (Return id) [])]

-- | A 'Lexer' that fails with a specified error message.
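--
-- For example (a sketch), 'message' can serve as a fallback that reports a
-- malformed lexeme:
--
-- > charLit :: Lexer Char
-- > charLit = char '\'' *> anyChar <* char '\'' <|>
-- >           char '\'' *> message "unterminated character literal"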
message :: String -> Lexer a
message msg = Trie (Fail msg) []

-------------------------------------------------------------------------------
-- Derived lexers
-------------------------------------------------------------------------------

-- | The trivial 'Lexer' that recognises any character.
anyChar :: Lexer Char
anyChar = satisfy (const True)

-- | A 'Lexer' that recognises any character that appears in a given list.
anyCharFrom :: [Char] -> Lexer Char
anyCharFrom cs = satisfy (`elem` cs)

-- | A 'Lexer' that recognises any character that does not appear in a given
-- list.
anyCharBut :: [Char] -> Lexer Char
anyCharBut tabus = satisfy (`notElem` tabus)

-- | A 'Lexer' that recognises any character that appears in a given range.
-- More efficient than @\\(low, up) -> anyCharFrom [low .. up]@.
range :: (Char, Char) -> Lexer Char
range (low, up) = satisfy (\c -> c >= low && c <= up)

-- | A 'Lexer' that recognises any character that does not appear in a given
-- range.
notInRange :: (Char, Char) -> Lexer Char
notInRange (low, up) = satisfy (\c -> c < low || c > up)

-- | A 'Lexer' that recognises a specified character.
char :: Char -> Lexer Char
char c = satisfy (== c)

-- | A 'Lexer' that recognises a specified 'String'.
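--
-- For example (a sketch), keywords can be recognised by combining 'string'
-- with alternation:
--
-- > keyword :: Lexer String
-- > keyword = string "let" <|> string "in"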
string :: String -> Lexer String
string []       = pure []
string (c : cs) = (:) <$> char c <*> string cs

-- | A 'Lexer' that recognises a whitespace character.
space :: Lexer Char
space = satisfy isSpace

-- | A 'Lexer' that recognises lower-case alphabetic characters.
lower :: Lexer Char
lower = satisfy isLower

-- | A 'Lexer' that recognises upper-case or title-case alphabetic characters.
upper :: Lexer Char
upper = satisfy isUpper

-- | A 'Lexer' that recognises alphabetic characters. Equivalent to 'alpha'.
letter :: Lexer Char
letter = alpha

-- | A 'Lexer' that recognises alphabetic characters. Equivalent to 'letter'.
alpha :: Lexer Char
alpha = satisfy isAlpha

-- | A 'Lexer' that recognises alphabetic or numeric characters.
alphaNum :: Lexer Char
alphaNum = satisfy isAlphaNum

-- | A 'Lexer' that recognises a digit.
digit :: Lexer Char
digit = satisfy isDigit

-- | A 'Lexer' that recognises a digit and tokenises it as an 'Int'.
digit_ :: Lexer Int
digit_ = (\c -> ord c - ordZero) <$> digit

-- | A 'Lexer' that recognises a binary digit.
binDigit :: Lexer Char
binDigit = range ('0', '1')

-- | A 'Lexer' that recognises a binary digit and tokenises it as an 'Int'.
binDigit_ :: Lexer Int
binDigit_ = (\c -> ord c - ordZero) <$> binDigit

-- | A 'Lexer' that recognises an octal digit.
octDigit :: Lexer Char
octDigit = satisfy isOctDigit

-- | A 'Lexer' that recognises an octal digit and tokenises it as an 'Int'.
octDigit_ :: Lexer Int
octDigit_ = (\c -> ord c - ordZero) <$> octDigit

-- | A 'Lexer' that recognises a hexadecimal digit.
hexDigit :: Lexer Char
hexDigit = satisfy isHexDigit

-- | A 'Lexer' that recognises a hexadecimal digit and tokenises it as an
-- 'Int'.
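--
-- For example (a sketch), the value of a two-digit hexadecimal numeral can be
-- computed by sequencing two such lexers:
--
-- > hexByte_ :: Lexer Int
-- > hexByte_ = (\h l -> 16 * h + l) <$> hexDigit_ <*> hexDigit_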
hexDigit_ :: Lexer Int
hexDigit_ = (\c -> ord c - ordZero) <$> digit <|>
            (\c -> 10 + (ord c - ordLowerA)) <$> range ('a', 'f') <|>
            (\c -> 10 + (ord c - ordUpperA)) <$> range ('A', 'F')

-------------------------------------------------------------------------------
-- Ignoring recognised input
-------------------------------------------------------------------------------

-- | Produces a 'Lexer' that recognises the same inputs as a given underlying
-- 'Lexer', but that does not result in any tokenisation.
--
-- The input recognised by a 'Lexer' constructed with @ignore@ is simply 
-- ignored when the 'Lexer' is used to turn a stream of characters into a
-- stream of 'LexicalUnit's.
-- 
-- Mainly used to suppress the generation of tokens for lexemes that
-- constitute lexical units like comments and whitespace.
ignore :: Lexer a -> Lexer b
ignore = smap Ignore
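
-- For example (a sketch; @Token@ and @TInt@ are hypothetical), whitespace can
-- be recognised but suppressed alongside ordinary tokens:
--
-- > data Token = TInt Int
-- > tokenOrSpace :: Lexer Token
-- > tokenOrSpace = ignore space <|> TInt <$> digit_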

-------------------------------------------------------------------------------
-- Symbols
-------------------------------------------------------------------------------

-- | The type of lexical units.
data LexicalUnit a
   = Token a Pos String String     -- ^ A tokenised lexeme: its token, its
                                   --   position, the characters it consists
                                   --   of, and its trailing characters in
                                   --   the input stream.
   | Error Pos String String       -- ^ An invalid lexeme: its position, the
                                   --   characters it consists of, and its
                                   --   trailing characters in the input
                                   --   stream.
   | Msg String Pos String String  -- ^ An invalid lexeme, labeled by a custom
                                   --   error message: the message, its
                                   --   position, the characters it consists
                                   --   of, and its trailing characters in the
                                   --   input stream.

-- | The type of streams of symbols described by tokens of type @a@.
data Symbols a = Symbols Source [LexicalUnit a]

-------------------------------------------------------------------------------
-- Lexing
-------------------------------------------------------------------------------

-- | A @Scan@: an accumulator for the input recognised, a tokenisation for the
-- input recognised, the current position, and the remaining input.
type Scan a = (String -> String, Steps a, Pos, String)

-- | Scanner: takes a 'Lexer', an input position, and an input stream and then
-- produces a 'Scan' for the next lexeme.
scan :: Lexer a -> Pos -> String -> Scan a
scan (Trie steps edges) pos input = (id, steps, pos, input) `sor` consume input
  where
    consume []       = (id, None, pos, input)
    consume (c : cs) =
      let lexer                       = choice [next | (p, next) <- edges, p c]
          (acc, steps', pos', input') =
            scan (lexer <*> pure c) (incrPos c pos) cs
      in  ((c :) . acc, steps', pos', input')

-- | Picks the \"best\" of two 'Scan's, favouring the second over the first,
-- except when the second does not allow any tokenisation.
sor :: Scan a -> Scan a -> Scan a
sor scn (_, None, _, _) = scn
sor _ scn               = scn

-- | Runs a lexer on a specified input stream.
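--
-- For example (a sketch; @Stdin@ is assumed here to be a constructor of
-- 'Source'):
--
-- > tokens_ (lex digit_ Stdin "4711")   -- yields [4, 7, 1, 1]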
lex :: Lexer a -> Source -> String -> Symbols a
lex lexer src = let initpos = initialPos
                in  Symbols src . tokenise initpos id initpos
  where
    tokenise errpos prefix pos input =
      let sentinel = case prefix "" of
                       []     -> id 
                       lexeme -> (Error errpos lexeme input :)
      in  case scan lexer pos input of
            (_, None, _, [])              -> sentinel []
            (acc, None, pos', c : input') -> tokenise errpos (prefix . (c :))
                                               (incrPos c pos') input'
            (acc, Return x, pos', input') -> sentinel $
                                               Token x pos (acc "") input' :
                                               tokenise pos' id pos' input'
            (acc, Fail msg, pos', input') -> Msg msg errpos (acc "") input' :
                                               tokenise pos' id pos' input'
            (_, _, pos', input')          -> sentinel
                                               (tokenise pos' id pos' input')

-------------------------------------------------------------------------------
-- Source positions
-------------------------------------------------------------------------------

-- | Retrieves the first position in an input stream.
initialPos :: Pos
initialPos = Pos 1 1

-- | Increments a 'Pos' based on the character consumed.
-- If a newline character is consumed, the line number is incremented and the
-- column number is reset to 1; for any other character, the line number is
-- kept unchanged and the column number is incremented by 1.
-- Fails if an attempt is made to increment an 'EOF'-value.
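--
-- For example, @incrPos 'a' (Pos 1 1)@ yields @Pos 1 2@, whereas
-- @incrPos '\n' (Pos 1 5)@ yields @Pos 2 1@.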
incrPos :: Char -> Pos -> Pos
incrPos '\n' (Pos line column) = Pos (line + 1) 1
incrPos _    (Pos line column) = Pos line (column + 1)

-------------------------------------------------------------------------------
-- Utilities
-------------------------------------------------------------------------------

-- | The 'ord' of '\'0\''.
ordZero :: Int
ordZero = ord '0'

-- | The 'ord' of '\'a\''.
ordLowerA :: Int
ordLowerA = ord 'a'

-- | The 'ord' of '\'A\''.
ordUpperA :: Int
ordUpperA = ord 'A'

-- | Combines multiple 'Alternative's by means of '<|>'.
choice :: Alternative f => [f a] -> f a
choice []  = empty
choice [x] = x
choice xs  = foldr1 (<|>) xs

-- | Retrieves all tokens, together with their source positions, from a
-- stream of 'Symbols'.
tokens :: Symbols a -> [(SourcePos, a)]
tokens (Symbols src units) =
  [(SourcePos src pos, x) | Token x pos _ _ <- units]

-- | Retrieves all tokens from a stream of 'Symbols'.
tokens_ :: Symbols a -> [a]
tokens_ (Symbols _ units) = [x | Token x _ _ _ <- units]