Codec/Phaser/Common.hs

{-|
Module:     Codec.Parser.Common
Copyright:  Jeremy List
License:    BSD-3
Maintainer: quick.dudley@gmail.com

Common functions which do not need to be in 'Phaser.Core', mostly for using
'Phase's and 'Automaton's as parsers.
-}
module Codec.Phaser.Common (
  Position(..),
  satisfy,
  match,
  char,
  iChar,
  string,
  iString,
  integerDecimal,
  positiveIntegerDecimal,
  decimal,
  directHex,
  hex,
  positiveInteger,
  integer,
  countChar,
  countLine,
  trackPosition,
  parse,
  sepBy,
  sepBy1,
  munch,
  munch1,
  parseFile,
  latin1
 ) where

import Data.Char
import Data.Word
import Control.Monad
import Control.Applicative

import Codec.Phaser.Core
import qualified Codec.Phaser.ByteString as BP

-- | A data type for describing a position in a text file. Constructor arguments
-- are row number and column number.
data Position = Position
  {-# UNPACK #-}!Int
  {-# UNPACK #-}!Int
 deriving (Eq,Ord)

instance Show Position where
  showsPrec p (Position r c) = b m where
    b a = if p > 0
      then ('(' :) . a . (')' :)
      else a
    m = ("Row " ++) . showsPrec 0 r . (", Column " ++) . showsPrec 0 c

instance Read Position where
  readsPrec p = toReadS (toAutomaton (go p)) where
    parenthes a = surround a
      (many (satisfy isSpace) >> char '(')
      (char ')' >> many (satisfy isSpace))
    go 0 = inner <|> parenthes (go 0)
    go _ = parenthes (go 0)
    inner = do
      many (satisfy isSpace)
      iString "row"
      some (satisfy isSpace)
      r <- integer
      many (satisfy isSpace)
      char ','
      many (satisfy isSpace)
      iString "column"
      some (satisfy isSpace)
      c <- integer
      return (Position r c)

-- | Consume one input, return it if it matches the predicate, otherwise fail.
satisfy :: (i -> Bool) -> Phase p i o i
satisfy p = get >>= \c -> if p c then return c else empty

-- | Consume one input, if it's equal to the parameter then return it, otherwise
-- fail.
match :: (Eq i) => i -> Phase p i o i
match t = satisfy (== t)

-- | 'match' specialized to 'Char'
char :: Char -> Phase p Char o Char
char = match

-- | Case insensitive version of 'char'
iChar :: Char -> Phase p Char o Char
iChar t = satisfy (\i -> toLower t == toLower i)

-- | Match a list of input values
string :: Eq i => [i] -> Phase p i o [i]
string t = go t where
  go [] = return t
  go (a:r) = get >>= \c -> if c == a then go r else empty

-- | Match a string (case insensitive)
iString :: String -> Phase p Char o String
iString = mapM iChar

-- | Parse a standard positive base 10 integer
positiveIntegerDecimal :: Num a => Phase p Char o a
positiveIntegerDecimal = go 0 where
  go acc = do
    d <- fmap (fromIntegral . digitToInt) $ satisfy isDigit
    let acc' = acc * 10 + d
    acc' `seq` go acc' <|> return acc'

-- | Parse a standard base 10 integer
integerDecimal :: Num a => Phase p Char o a
integerDecimal = (pure id <|> (char '-' *> munch isSpace *> pure negate)) <*>
  positiveIntegerDecimal

-- | Take some hexadecimal digits and parse a number from hexadecimal
directHex :: Num a => Phase p Char o a
directHex = go 0 where
  go acc = do
    d <- fmap (fromIntegral . digitToInt) $ satisfy isHexDigit
    let acc' = acc * 16 + d
    acc' `seq` go acc' <|> return acc'

-- | Parse a hexadecimal number prefixed with "Ox"
hex :: Num a => Phase p Char o a
hex = string "0x" >> directHex

-- | Parse a positive integer from either decimal or hexadecimal format
positiveInteger :: Num a => Phase p Char o a
positiveInteger = positiveIntegerDecimal <|> hex

-- | Parse a number either from decimal digits or from hexadecimal prefixed with
-- "0x"
integer :: Num a => Phase p Char o a
integer = integerDecimal <|> hex

-- | Parse a number from decimal digits and "."
decimal :: Fractional a => Phase p Char o a
decimal = do
  w <- integerDecimal
  (match '.' >> go True 0.1 w) <|> return w
 where
  go i s acc = do
    let
     p = if i
      then ("At least one digit required after decimal point" <?>)
      else id
    d <- p $ fmap (fromIntegral . digitToInt) $ satisfy isDigit
    let acc' = acc + d * s
    acc' `seq` go False (s / 10) acc' <|> return acc'

-- | Move the position counter one character to the right
countChar :: Phase Position i o ()
{-# INLINE countChar #-}
countChar = count (\(Position r c) -> Position r (c + 1))

-- | Move the position counter to the next line
countLine :: Phase Position i o ()
{-# INLINE countLine #-}
countLine = count (\(Position r _) -> Position (r + 1) 1)

-- | Count the lines and characters from the input before yielding them again.
-- If the phase pipeline does not include this or similar: parsing errors will
-- not report the correct position.
trackPosition :: Phase Position Char Char ()
{-# INLINABLE trackPosition #-}
trackPosition = goR where
  goR = flip (<|>) (return ()) $ get >>= \c -> yield c >> case c of
    '\r' -> countLine >> goN
    '\n' -> countLine >> goR
    _ -> countChar >> goR
  goN = flip (<|>) (return ()) $ get >>= \c -> yield c >> case c of
    '\r' -> countLine >> goN
    '\n' -> goR
    _ -> countChar >> goR

-- | Use a 'Phase' as a parser. Note that unlike other parsers the reported
-- position in the input when the parser fails is the position reached when
-- all parsing options are exhausted, not the beginning of the failing token.
-- Since the characters may be counted nondeterministically: if multiple errors
-- are returned the reported error position may be different for each error
-- report.
parse :: Phase Position i o a -> [i] -> Either [(Position,[String])] [a]
parse = parse_ (Position 1 1)

-- | sepBy p sep parses zero or more occurrences of p, separated by sep. Returns
-- a list of values returned by p.
sepBy :: Phase p i o a -> Phase p i o s -> Phase p i o [a]
sepBy p sep = sepBy1 p sep <|> return []

-- | sepBy1 p sep parses one or more occurrences of p, separated by sep. Returns
-- a list of values returned by p.
sepBy1 :: Phase p i o a -> Phase p i o s -> Phase p i o [a]
sepBy1 p sep = ((:) <$> p <*> many (sep >> p))

surround :: Phase p i o a -> Phase p i o b -> Phase p i o e -> Phase p i o a
surround m o c = (\_ r _ -> r) <$> o <*> m <*> c

-- | Parses the first zero or more values satisfying the predicate. Always
-- succeds, exactly once, having consumed all the characters Hence NOT the same
-- as (many (satisfy p))
munch :: (i -> Bool) -> Phase p i o [i]
munch p = munch1 p <|> (eof >> return [])

-- | Parses the first one or more values satisfying the predicate. Always
-- succeds, exactly once, having consumed all the characters Hence NOT the same
-- as (some (satisfy p))
munch1 :: (i -> Bool) -> Phase p i o [i]
munch1 p = go id where
  go acc = do
    c <- get
    if p c
      then go (acc . (c :)) <|> (eof >> return (acc [c]))
      else put1 c >> return (acc [])

-- | Run a parser on input from a file. Input is provided as bytes, if
-- characters are needed: a decoding phase such as
-- 'Codec.Phaser.UTF8.utf8_stream' or 'latin1' may be used
parseFile :: Phase Position Word8 o a -> FilePath ->
  IO (Either [(Position,[String])] [a])
parseFile = BP.parseFile_ (Position 1 1)

-- | Decode bytes to characters using the Latin1 (ISO8859-1) encoding
latin1 :: Phase p Word8 Char ()
latin1 = go where
  go = flip (<|>) (return ()) $ 
    fmap (toEnum . fromIntegral) get >>= yield >> go