{-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE ScopedTypeVariables #-} module Skylighting.Regex ( Regex , RegexException , RE(..) , compileRegex , matchRegex , convertOctalEscapes ) where import qualified Control.Exception as E import qualified Data.ByteString.Char8 as BS import Data.ByteString.UTF8 (toString) import GHC.Generics (Generic) import System.IO.Unsafe (unsafePerformIO) import Text.Printf import Text.Regex.PCRE.ByteString import Data.Typeable newtype RegexException = RegexException String deriving (Show, Typeable, Generic) instance E.Exception RegexException data RE = RE{ reString :: BS.ByteString , reCompiled :: Maybe Regex , reCaseSensitive :: Bool } instance Show RE where show re = "RE{ reString = " ++ show (reString re) ++ ", reCompiled = " ++ (case reCompiled re of Nothing -> "Nothing" Just _ -> "Just (compileRegex " ++ show (reCaseSensitive re) ++ " " ++ show (reString re) ++ ")") ++ ", reCaseSensitive = " ++ show (reCaseSensitive re) ++ "}" -- | Compile a PCRE regex. If the first parameter is True, the regex is -- case-sensitive, otherwise caseless. The regex is compiled from -- a bytestring interpreted as UTF-8. If the regex cannot be compiled, -- a 'RegexException' is thrown. compileRegex :: Bool -> BS.ByteString -> Regex compileRegex caseSensitive regexpStr = let opts = compAnchored + compUTF8 + if caseSensitive then 0 else compCaseless in case unsafePerformIO $ compile opts (execNotEmpty) (BS.cons '.' regexpStr) of Left (off,msg) -> E.throw $ RegexException $ "Error compiling regex /" ++ toString regexpStr ++ "/ at offset " ++ show off ++ "\n" ++ msg Right r -> r -- | Convert octal escapes to the form pcre wants. Note: -- need at least pcre 8.34 for the form \o{dddd}. -- So we prefer \ddd or \x{...}. convertOctalEscapes :: String -> String convertOctalEscapes [] = "" convertOctalEscapes ('\\':'0':x:y:z:rest) | all isOctalDigit [x,y,z] = '\\':x:y:z: convertOctalEscapes rest convertOctalEscapes ('\\':x:y:z:rest) | all isOctalDigit [x,y,z] ='\\':x:y:z: convertOctalEscapes rest convertOctalEscapes ('\\':'o':'{':zs) = case break (=='}') zs of (ds, '}':rest) | all isOctalDigit ds && not (null ds) -> case reads ('0':'o':ds) of ((n :: Int,[]):_) -> printf "\\x{%x}" n ++ convertOctalEscapes rest _ -> E.throw $ RegexException $ "Unable to read octal number: " ++ ds _ -> '\\':'o':'{': convertOctalEscapes zs convertOctalEscapes (x:xs) = x : convertOctalEscapes xs isOctalDigit :: Char -> Bool isOctalDigit c = c >= '0' && c <= '7' -- Match a 'Regex' against a bytestring. Returns 'Nothing' if -- no match, otherwise 'Just' a nonempty list of bytestrings. The first -- bytestring in the list is the match, the others the captures, if any. -- If there are errors in executing the regex, a 'RegexException' is -- thrown. matchRegex :: Regex -> BS.ByteString -> Maybe [BS.ByteString] matchRegex r s = case unsafePerformIO (regexec r s) of Right (Just (_, mat, _ , capts)) -> Just (mat : capts) Right Nothing -> Nothing Left (_rc, msg) -> E.throw $ RegexException msg