Example: NGINX Log Processor ============================ This example program reads lines from NGINX error-log files and access-log files converts them into a unified output format. It is an example of developing REs at scale using macros with the regex test bench. The tool is self-testing: run it with no arguments (or `cabal test`). \begin{code} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE OverloadedStrings #-} module Main ( main -- development , parse_a , parse_e ) where import Control.Applicative import Control.Exception import Control.Monad import qualified Data.ByteString.Lazy.Char8 as LBS import qualified Data.HashMap.Lazy as HML import Data.Functor.Identity import Data.Maybe import Data.String import qualified Data.Text as T import Data.Time import Prelude.Compat import qualified Shelly as SH import System.Directory import System.Environment import System.Exit import System.IO import Text.RE.Tools.Sed import Text.RE.Types.REOptions import Text.RE.TestBench.Parsers import Text.RE.TestBench import Text.RE.PCRE.ByteString.Lazy import qualified Text.RE.PCRE.String as S import Text.RE.Types.Capture import Text.RE.Types.Match import Text.RE.Types.Replace import Text.Printf \end{code} \begin{code} main :: IO () main = do as <- getArgs case as of ["--macro" ] -> putStr lp_macro_table ["--macro",mid_s] -> putStrLn $ lp_macro_summary $ MacroID mid_s ["--regex" ] -> putStr lp_macro_sources ["--regex",mid_s] -> putStrLn $ lp_macro_source $ MacroID mid_s ["--test" ] -> test [] -> test [in_file ] | is_file in_file -> go True in_file "-" [in_file,out_file ] | is_file in_file -> go True in_file out_file _ -> usage where is_file = not . (== "--") . take 2 usage = do pnm <- getProgName let prg = ((pnm++" ")++) putStr $ unlines [ "usage:" , prg " --help" , prg " --macro" , prg " --macro " , prg " --regex" , prg " --regex " , prg "[--test]" , prg "(-|) [-|]" ] \end{code} \begin{code} -- -- go -- test :: IO () test = do putStrLn "============================================================" putStrLn "Testing the macro environment." putStrLn "nginx-log-processor" dumpMacroTable "nginx-log-processor" regexType lp_env me_ok <- testMacroEnv "nginx-log-processor" regexType lp_env putStrLn "============================================================" putStrLn "Testing the log processor on reference data." putStrLn "" lp_ok <- test_log_processor putStrLn "============================================================" case me_ok && lp_ok of True -> return () False -> exitWith $ ExitFailure 1 test_log_processor :: IO Bool test_log_processor = do createDirectoryIfMissing False "tmp" go False "data/access-errors.log" "tmp/events.log" cmp "tmp/events.log" "data/events.log" \end{code} \begin{code} -- -- go -- go :: Bool -> FilePath -> FilePath -> IO () go rprt_flg in_file out_file = do ctx <- setup rprt_flg sed (script ctx) in_file out_file \end{code} \begin{code} script :: Ctx -> Edits IO RE LBS.ByteString script ctx = Select [ on [re_|@{access}|] ACC parse_access , on [re_|@{access_deg}|] AQQ parse_deg_access , on [re_|@{error}|] ERR parse_error , on [re_|.*|] QQQ parse_def ] where on rex src prs = Function (rex lpo) TOP $ process_line ctx src prs parse_def = fmap capturedText . matchCapture \end{code} \begin{code} process_line :: IsEvent a => Ctx -> Source -> (Match LBS.ByteString->Maybe a) -> LineNo -> Match LBS.ByteString -> Location -> Capture LBS.ByteString -> IO (Maybe LBS.ByteString) process_line ctx src prs lno cs _ _ = do when (event_is_notifiable event) $ flag_event ctx event return $ Just $ presentEvent event where event = maybe def_event (mkEvent lno src) $ prs cs def_event = Event { _event_line = lno , _event_source = src , _event_utc = read "1970-01-01 00:00:00" , _event_severity = Nothing , _event_address = (0,0,0,0) , _event_details = "" } -- -- Ctx, setup, event_is_notifiable, flag_event -- type Ctx = Bool setup :: Bool -> IO Ctx setup = return event_is_notifiable :: Event -> Bool event_is_notifiable Event{..} = fromEnum (fromMaybe Debug _event_severity) <= fromEnum Err flag_event :: Ctx -> Event -> IO () flag_event False = const $ return () flag_event True = LBS.hPutStrLn stderr . presentEvent -- -- Event, presentEvent, IsEvent -- data Event = Event { _event_line :: LineNo , _event_source :: Source , _event_utc :: UTCTime , _event_severity :: Maybe Severity , _event_address :: IPV4Address , _event_details :: LBS.ByteString } deriving (Show) data Source = ACC | AQQ | ERR | QQQ deriving (Show,Read) presentEvent :: Event -> LBS.ByteString presentEvent Event{..} = LBS.pack $ printf "%04d %s %s %-7s %3d.%3d.%3d.%3d [%s]" (getLineNo _event_line ) (show _event_source ) (show _event_utc ) (maybe "-" svrty_kw _event_severity) a b c d (LBS.unpack _event_details ) where (a,b,c,d) = _event_address svrty_kw = T.unpack . fst . severityKeywords class IsEvent a where mkEvent :: LineNo -> Source -> a -> Event instance IsEvent Access where mkEvent lno src Access{..} = Event { _event_line = lno , _event_source = src , _event_utc = _a_time_local , _event_severity = Nothing , _event_address = _a_remote_addr , _event_details = LBS.pack $ printf "%s %d %d %s %s %s" (T.unpack _a_request ) _a_status _a_body_bytes (T.unpack _a_http_referrer ) (T.unpack _a_http_user_agent) (T.unpack _a_other ) } instance IsEvent Error where mkEvent lno src ERROR{..} = Event { _event_line = lno , _event_source = src , _event_utc = UTCTime _e_date $ timeOfDayToTime _e_time , _event_severity = Just _e_severity , _event_address = (0,0,0,0) , _event_details = LBS.pack $ printf "%d#%d: %s" pid tid $ LBS.unpack _e_other } where (pid,tid) = _e_pid_tid instance IsEvent LBS.ByteString where mkEvent lno src lbs = Event { _event_line = lno , _event_source = src , _event_utc = read "1970-01-01 00:00:00" , _event_severity = Nothing , _event_address = (0,0,0,0) , _event_details = lbs } -- -- REOptions and Prelude -- lpo :: REOptions lpo = makeREOptions lp_prelude lp_prelude :: Macros RE lp_prelude = runIdentity $ mkMacros mk regexType ExclCaptures lp_env where mk = maybe oops Identity . compileRegexWithOptions noPreludeREOptions oops = error "lp_prelude" lp_macro_table :: String lp_macro_table = formatMacroTable regexType lp_env lp_macro_summary :: MacroID -> String lp_macro_summary = formatMacroSummary regexType lp_env lp_macro_sources :: String lp_macro_sources = formatMacroSources regexType ExclCaptures lp_env lp_macro_source :: MacroID -> String lp_macro_source = formatMacroSource regexType ExclCaptures lp_env lp_env :: MacroEnv lp_env = preludeEnv `HML.union` HML.fromList [ f "user" user_macro , f "pid#tid:" pid_tid_macro , f "access" access_macro , f "access_deg" access_deg_macro , f "error" error_macro ] where f mid mk = (mid, mk lp_env mid) -- -- The Macro Descriptors -- user_macro :: MacroEnv -> MacroID -> MacroDescriptor user_macro env mid = runTests regexType parse_user samples env mid MacroDescriptor { _md_source = "(?:-|[^[:space:]]+)" , _md_samples = map fst samples , _md_counter_samples = counter_samples , _md_test_results = [] , _md_parser = Just "parse_user" , _md_description = "a user ident (per RFC1413)" } where samples :: [(String,User)] samples = [ f "joe" ] where f nm = (nm,User $ LBS.pack nm) counter_samples = [ "joe user" ] pid_tid_macro :: MacroEnv -> MacroID -> MacroDescriptor pid_tid_macro env mid = runTests regexType parse_pid_tid samples env mid MacroDescriptor { _md_source = "(?:@{%nat})#(?:@{%nat}):" , _md_samples = map fst samples , _md_counter_samples = counter_samples , _md_test_results = [] , _md_parser = Just "parse_pid_tid" , _md_description = "#:" } where samples :: [(String,(Int,Int))] samples = [ f "1378#0:" (1378,0) ] where f = (,) counter_samples = [ "" , "24#:" , "24.365:" ] access_macro :: MacroEnv -> MacroID -> MacroDescriptor access_macro env mid = runTests' regexType (parse_access . fmap LBS.pack) samples env mid MacroDescriptor { _md_source = access_re , _md_samples = map fst samples , _md_counter_samples = counter_samples , _md_test_results = [] , _md_parser = Just "parse_a" , _md_description = "an Nginx access log file line" } where samples :: [(String,Access)] samples = [ (,) "192.168.100.200 - - [12/Jan/2016:12:08:36 +0000] \"GET / HTTP/1.1\" 200 3700 \"-\" \"My Agent\" \"-\"" Access { _a_remote_addr = (192,168,100,200) , _a_remote_user = "-" , _a_time_local = read "2016-01-12 12:08:36 UTC" , _a_request = "GET / HTTP/1.1" , _a_status = 200 , _a_body_bytes = 3700 , _a_http_referrer = "-" , _a_http_user_agent = "My Agent" , _a_other = "-" } ] counter_samples = [ "" , " - [] \"\" \"\" \"\" \"\"" ] access_deg_macro :: MacroEnv -> MacroID -> MacroDescriptor access_deg_macro env mid = runTests' regexType (parse_deg_access . fmap LBS.pack) samples env mid MacroDescriptor { _md_source = " - \\[\\] \"\" \"\" \"\" \"\"" , _md_samples = map fst samples , _md_counter_samples = counter_samples , _md_test_results = [] , _md_parser = Nothing , _md_description = "a degenerate Nginx access log file line" } where samples :: [(String,Access)] samples = [ (,) " - [] \"\" \"\" \"\" \"\"" deg_access ] counter_samples = [ "" , "foo" ] error_macro :: MacroEnv -> MacroID -> MacroDescriptor error_macro env mid = runTests' regexType (parse_error . fmap LBS.pack) samples env mid MacroDescriptor { _md_source = error_re , _md_samples = map fst samples , _md_counter_samples = counter_samples , _md_test_results = [] , _md_parser = Just "parse_e" , _md_description = "an Nginx error log file line" } where samples :: [(String,Error)] samples = [ (,) "2016/12/21 11:53:35 [emerg] 1378#0: foo" ERROR { _e_date = read "2016-12-21" , _e_time = read "11:53:35" , _e_severity = Emerg , _e_pid_tid = (1378,0) , _e_other = " foo" } , (,) "2017/01/04 05:40:19 [error] 31623#0: *1861296 no \"ssl_certificate\" is defined in server listening on SSL port while SSL handshaking, client: 192.168.31.38, server: 0.0.0.0:80" ERROR { _e_date = read "2017-01-04" , _e_time = read "05:40:19" , _e_severity = Err , _e_pid_tid = (31623,0) , _e_other = " *1861296 no \"ssl_certificate\" is defined in server listening on SSL port while SSL handshaking, client: 192.168.31.38, server: 0.0.0.0:80" } ] counter_samples = [ "" , "foo" ] \end{code} \begin{code} -- -- Access, access_re, deg_access, parse_deg_access, parse_access -- data Access = Access { _a_remote_addr :: !IPV4Address , _a_remote_user :: !User , _a_time_local :: !UTCTime , _a_request :: !T.Text , _a_status :: !Int , _a_body_bytes :: !Int , _a_http_referrer :: !T.Text , _a_http_user_agent :: !T.Text , _a_other :: !T.Text } deriving (Eq,Show) \end{code} \begin{code} access_re :: RegexSource access_re = RegexSource $ unwords [ "(@{%address.ipv4})" , "-" , "(@{user})" , "\\[(@{%datetime.clf})\\]" , "(@{%string.simple})" , "(@{%nat})" , "(@{%nat})" , "(@{%string.simple})" , "(@{%string.simple})" , "(@{%string.simple})" ] \end{code} \begin{code} deg_access :: Access deg_access = Access { _a_remote_addr = (0,0,0,0) , _a_remote_user = "-" , _a_time_local = read "1970-01-01 00:00:00" , _a_request = "" , _a_status = 0 , _a_body_bytes = 0 , _a_http_referrer = "" , _a_http_user_agent = "" , _a_other = "" } parse_deg_access :: Match LBS.ByteString -> Maybe Access parse_deg_access Match{..} = case matchSource == " - [] \"\" \"\" \"\" \"\"" of True -> Just deg_access False -> Nothing parse_a :: LBS.ByteString -> Maybe Access parse_a lbs = parse_access $ lbs ?=~ [re_|@{access}|] lpo parse_access :: Match LBS.ByteString -> Maybe Access parse_access cs = Access <$> f parseIPv4Address [cp|1|] <*> f parse_user [cp|2|] <*> f parseDateTimeCLF [cp|3|] <*> f parseSimpleString [cp|4|] <*> f parseInteger [cp|5|] <*> f parseInteger [cp|6|] <*> f parseSimpleString [cp|7|] <*> f parseSimpleString [cp|8|] <*> f parseSimpleString [cp|9|] where f psr i = psr $ capturedText $ capture i cs -- -- Error, error_re, parse_error -- data Error = ERROR { _e_date :: Day , _e_time :: TimeOfDay , _e_severity :: Severity , _e_pid_tid :: (Int,Int) , _e_other :: LBS.ByteString } deriving (Eq,Show) error_re :: RegexSource error_re = RegexSource $ unwords [ "(@{%date.slashes})" , "(@{%time})" , "\\[(@{%syslog.severity})\\]" , "(@{pid#tid:})(.*)" ] parse_e :: LBS.ByteString -> Maybe Error parse_e lbs = parse_error $ lbs ?=~ [re_|@{error}|] lpo parse_error :: Match LBS.ByteString -> Maybe Error parse_error cs = ERROR <$> f parseSlashesDate [cp|1|] <*> f parseTimeOfDay [cp|2|] <*> f parseSeverity [cp|3|] <*> f parse_pid_tid [cp|4|] <*> f Just [cp|5|] where f psr i = psr $ capturedText $ capture i cs -- -- User, parseUser -- newtype User = User { _User :: LBS.ByteString } deriving (IsString,Ord,Eq,Show) parse_user :: Replace a => a -> Maybe User parse_user = Just . User . LBS.pack . unpackR -- -- parse_pid_tid -- parse_pid_tid :: Replace a => a -> Maybe (Int,Int) parse_pid_tid x = case allMatches $ unpackR x S.*=~ [re|@{%nat}|] of [cs,cs'] -> (,) <$> p cs <*> p cs' _ -> Nothing where p cs = matchCapture cs >>= parseInteger . capturedText -- -- cmp -- cmp :: T.Text -> T.Text -> IO Bool cmp src dst = handle hdl $ do _ <- SH.shelly $ SH.verbosely $ SH.run "cmp" [src,dst] return True where hdl :: SomeException -> IO Bool hdl se = do hPutStrLn stderr $ "testing results against model answers failed: " ++ show se return False \end{code}