From 1b35bac62c7e6e3f501d2baecb9afb747419d04c Mon Sep 17 00:00:00 2001 From: Matthew Nibecker Date: Thu, 26 Oct 2023 12:22:18 -0700 Subject: [PATCH] Add grok function Closes #4140 --- docs/language/functions/grok.md | 44 +++++ pkg/grok/base.go | 75 ++++++++ pkg/grok/gen.go | 51 ++++++ pkg/grok/grok-patterns | 98 ++++++++++ pkg/grok/grok.go | 238 +++++++++++++++++++++++++ pkg/grok/host_test.go | 100 +++++++++++ pkg/grok/patterns_test.go | 72 ++++++++ runtime/expr/function/function.go | 3 + runtime/expr/function/grok.go | 111 ++++++++++++ runtime/expr/function/ztests/grok.yaml | 23 +++ 10 files changed, 815 insertions(+) create mode 100644 docs/language/functions/grok.md create mode 100644 pkg/grok/base.go create mode 100644 pkg/grok/gen.go create mode 100644 pkg/grok/grok-patterns create mode 100644 pkg/grok/grok.go create mode 100644 pkg/grok/host_test.go create mode 100644 pkg/grok/patterns_test.go create mode 100644 runtime/expr/function/grok.go create mode 100644 runtime/expr/function/ztests/grok.yaml diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md new file mode 100644 index 0000000000..9b822b6e34 --- /dev/null +++ b/docs/language/functions/grok.md @@ -0,0 +1,44 @@ +### Function + +  **grok** — parse a string using a grok pattern + +### Synopsis + +``` +grok(pattern: string, s: string) -> any +grok(extra: string, pattern: string, s: string) -> any +``` + +### Description + +The _grok_ function parses a string using a grok pattern and returns +a record containing the parsed fields. The syntax for a grok pattern +is `%{pattern:field_name}` where _pattern_ is the name of the pattern +to match text with and _field_name_ is the resultant field name of the captured +value. + +When provided with three arguments, the first argument, _extra_, is a list +of named patterns separated by newlines in the format `PATTERN_NAME PATTERN`. +The named patterns can then be used in the grok pattern. 
+ +#### Included Patterns + +The _grok_ function by default includes a set of builtin named patterns +that can be referenced in any pattern. The included named patterns can be seen +[here](https://raw.githubusercontent.com/brimdata/zed/main/pkg/grok/grok-patterns). + +### Examples + +Parsing a simple log line using the builtin named patterns: +```mdtest-command +echo '"2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message"' \ + | zq -Z 'yield grok("%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}", this)' - +``` +=> +```mdtest-output +{ + timestamp: "2020-09-16T04:20:42.45+01:00", + level: "DEBUG", + message: "This is a sample debug log message" +} +``` diff --git a/pkg/grok/base.go b/pkg/grok/base.go new file mode 100644 index 0000000000..ee89950f2e --- /dev/null +++ b/pkg/grok/base.go @@ -0,0 +1,75 @@ +// Code generated by gen.go; DO NOT EDIT. + +package grok + +func NewBase() Host { + h := New() + h.Must("USERNAME", "[a-zA-Z0-9._-]+") + h.Must("USER", "%{USERNAME}") + h.Must("INT", "(?:[+-]?(?:[0-9]+))") + h.Must("BASE10NUM", "([+-]?(?:[0-9]+(?:\\.[0-9]+)?)|\\.[0-9]+)") + h.Must("NUMBER", "(?:%{BASE10NUM})") + h.Must("BASE16NUM", "[+-]?(?:0x)?(?:[0-9A-Fa-f]+)") + h.Must("BASE16FLOAT", "\\b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\\.[0-9A-Fa-f]*)?)|(?:\\.[0-9A-Fa-f]+))\\b") + h.Must("POSINT", "\\b(?:[1-9][0-9]*)\\b") + h.Must("NONNEGINT", "\\b(?:[0-9]+)\\b") + h.Must("WORD", "\\b\\w+\\b") + h.Must("NOTSPACE", "\\S+") + h.Must("SPACE", "\\s*") + h.Must("DATA", ".*?") + h.Must("GREEDYDATA", ".*") + h.Must("QUOTEDSTRING", "\"([^\"\\\\]*(\\\\.[^\"\\\\]*)*)\"|\\'([^\\'\\\\]*(\\\\.[^\\'\\\\]*)*)\\'") + h.Must("UUID", "[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}") + h.Must("CISCOMAC", "(?:(?:[A-Fa-f0-9]{4}\\.){2}[A-Fa-f0-9]{4})") + h.Must("WINDOWSMAC", "(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})") + h.Must("COMMONMAC", "(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})") + h.Must("MAC", "(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})") + 
h.Must("IPV6", "((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?") + h.Must("IPV4", "(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)") + h.Must("IP", "(?:%{IPV6}|%{IPV4})") + h.Must("HOSTNAME", "\\b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\\.?|\\b)") + h.Must("HOST", "%{HOSTNAME}") + h.Must("IPORHOST", "(?:%{HOSTNAME}|%{IP})") + h.Must("HOSTPORT", "%{IPORHOST}:%{POSINT}") + h.Must("UNIXPATH", "(/[\\w_%!$@:.,-]?/?)(\\S+)?") + h.Must("WINPATH", "([A-Za-z]:|\\\\)(?:\\\\[^\\\\?*]*)+") + h.Must("PATH", "(?:%{UNIXPATH}|%{WINPATH})") + h.Must("TTY", "(?:/dev/(pts|tty([pq])?)(\\w+)?/?(?:[0-9]+))") + h.Must("URIPROTO", "[A-Za-z]+(\\+[A-Za-z+]+)?") + h.Must("URIHOST", "%{IPORHOST}(?::%{POSINT:port})?") + h.Must("URIPATH", "(?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\\-]*)+") + h.Must("URIPARAM", 
"\\?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\\-\\[\\]]*") + h.Must("URIPATHPARAM", "%{URIPATH}(?:%{URIPARAM})?") + h.Must("URI", "%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?") + h.Must("MONTH", "\\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\\b") + h.Must("MONTHNUM", "(?:0?[1-9]|1[0-2])") + h.Must("MONTHDAY", "(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])") + h.Must("DAY", "(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)") + h.Must("YEAR", "(\\d\\d){1,2}") + h.Must("HOUR", "(?:2[0123]|[01]?[0-9])") + h.Must("MINUTE", "(?:[0-5][0-9])") + h.Must("SECOND", "(?:(?:[0-5][0-9]|60)(?:[:.,][0-9]+)?)") + h.Must("TIME", "([^0-9]?)%{HOUR}:%{MINUTE}(?::%{SECOND})([^0-9]?)") + h.Must("DATE_US", "%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}") + h.Must("DATE_EU", "%{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}") + h.Must("ISO8601_TIMEZONE", "(?:Z|[+-]%{HOUR}(?::?%{MINUTE}))") + h.Must("ISO8601_SECOND", "(?:%{SECOND}|60)") + h.Must("TIMESTAMP_ISO8601", "%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?") + h.Must("DATE", "%{DATE_US}|%{DATE_EU}") + h.Must("DATESTAMP", "%{DATE}[- ]%{TIME}") + h.Must("TZ", "(?:[PMCE][SD]T|UTC|GMT)") + h.Must("DATESTAMP_RFC822", "%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}") + h.Must("DATESTAMP_OTHER", "%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}") + h.Must("SYSLOGTIMESTAMP", "%{MONTH} +%{MONTHDAY} %{TIME}") + h.Must("PROG", "(?:[\\w._/%-]+)") + h.Must("SYSLOGPROG", "%{PROG:program}(?:\\[%{POSINT:pid}\\])?") + h.Must("SYSLOGHOST", "%{IPORHOST}") + h.Must("SYSLOGFACILITY", "<%{NONNEGINT:facility}.%{NONNEGINT:priority}>") + h.Must("HTTPDATE", "%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}") + h.Must("QS", "%{QUOTEDSTRING}") + h.Must("SYSLOGBASE", "%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:") + 
h.Must("COMMONAPACHELOG", "%{IPORHOST:clientip} %{USER:ident} %{USER:auth} \\[%{HTTPDATE:timestamp}\\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\" %{NUMBER:response} (?:%{NUMBER:bytes}|-)") + h.Must("COMBINEDAPACHELOG", "%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}") + h.Must("LOGLEVEL", "([A-a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)") + return h +} diff --git a/pkg/grok/gen.go b/pkg/grok/gen.go new file mode 100644 index 0000000000..91b5d4dc87 --- /dev/null +++ b/pkg/grok/gen.go @@ -0,0 +1,51 @@ +//go:build ignore + +package main + +import ( + "bufio" + _ "embed" + "os" + "regexp" + "strings" + "text/template" +) + +//go:embed grok-patterns +var grokPatterns string + +const baseTemplate = `// Code generated by gen.go; DO NOT EDIT. + +package grok + +func NewBase() Host { + h := New(){{range .}}{{print "\n\t"}}h.Must({{printf "%q" .Name}}, {{printf "%q" .Pattern}}){{end}} + return h +} +` + +func must(err error) { + if err != nil { + panic(err) + } +} + +func main() { + type namedPattern struct{ Name, Pattern string } + var patterns []namedPattern + lineRegexp := regexp.MustCompile(`^(\w+)\s+(.+)$`) + scanner := bufio.NewScanner(strings.NewReader(grokPatterns)) + for scanner.Scan() { + sub := lineRegexp.FindStringSubmatch(scanner.Text()) + if len(sub) == 0 { // not match + continue + } + patterns = append(patterns, namedPattern{Name: sub[1], Pattern: sub[2]}) + } + must(scanner.Err()) + f, err := os.Create("base.go") + must(err) + defer f.Close() + t := template.Must(template.New("base").Parse(baseTemplate)) + must(t.Execute(f, patterns)) +} diff --git a/pkg/grok/grok-patterns b/pkg/grok/grok-patterns new file mode 100644 index 0000000000..40ec27920a --- /dev/null +++ b/pkg/grok/grok-patterns @@ -0,0 +1,98 @@ +# Adapted 
from https://github.com/vjeantet/grok/blob/master/patterns/grok-patterns +USERNAME [a-zA-Z0-9._-]+ +USER %{USERNAME} +INT (?:[+-]?(?:[0-9]+)) +BASE10NUM ([+-]?(?:[0-9]+(?:\.[0-9]+)?)|\.[0-9]+) +NUMBER (?:%{BASE10NUM}) +BASE16NUM [+-]?(?:0x)?(?:[0-9A-Fa-f]+) +BASE16FLOAT \b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+))\b + +POSINT \b(?:[1-9][0-9]*)\b +NONNEGINT \b(?:[0-9]+)\b +WORD \b\w+\b +NOTSPACE \S+ +SPACE \s* +DATA .*? +GREEDYDATA .* +QUOTEDSTRING "([^"\\]*(\\.[^"\\]*)*)"|\'([^\'\\]*(\\.[^\'\\]*)*)\' +UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} + +# Networking +CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) +WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) +COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) +MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) +IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? +IPV4 (?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) 
+IP (?:%{IPV6}|%{IPV4}) +HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) +HOST %{HOSTNAME} +IPORHOST (?:%{HOSTNAME}|%{IP}) +HOSTPORT %{IPORHOST}:%{POSINT} + +# paths +UNIXPATH (/[\w_%!$@:.,-]?/?)(\S+)? +WINPATH ([A-Za-z]:|\\)(?:\\[^\\?*]*)+ +PATH (?:%{UNIXPATH}|%{WINPATH}) +TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) + +URIPROTO [A-Za-z]+(\+[A-Za-z+]+)? +URIHOST %{IPORHOST}(?::%{POSINT:port})? +# uripath comes loosely from RFC1738, but mostly from what Firefox +# doesn't turn into %XX +URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+ +#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? +URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]* +URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? +URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? + +# Months: January, Feb, 3, 03, 12, December +MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b +MONTHNUM (?:0?[1-9]|1[0-2]) +MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) + +# Days: Monday, Tue, Thu, etc... +DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) + +# Years? +#YEAR (?>\d\d){1,2} +#c +YEAR (\d\d){1,2} + +HOUR (?:2[0123]|[01]?[0-9]) +MINUTE (?:[0-5][0-9]) +# '60' is a leap second in most time standards and thus is valid. +SECOND (?:(?:[0-5][0-9]|60)(?:[:.,][0-9]+)?) +#TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) +#c +TIME ([^0-9]?)%{HOUR}:%{MINUTE}(?::%{SECOND})([^0-9]?) +# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) +DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} +DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} +ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) +ISO8601_SECOND (?:%{SECOND}|60) +TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? 
+DATE %{DATE_US}|%{DATE_EU} +DATESTAMP %{DATE}[- ]%{TIME} +TZ (?:[PMCE][SD]T|UTC|GMT) +DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} +DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} + +# Syslog Dates: Month Day HH:MM:SS +SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} +PROG (?:[\w._/%-]+) +SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? +SYSLOGHOST %{IPORHOST} +SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> +HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} + +# Shortcuts +QS %{QUOTEDSTRING} + +# Log formats +SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: +COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-) +COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent} + +# Log Levels +LOGLEVEL ([A-a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) 
diff --git a/pkg/grok/grok.go b/pkg/grok/grok.go new file mode 100644 index 0000000000..d22ab6d09e --- /dev/null +++ b/pkg/grok/grok.go @@ -0,0 +1,238 @@ +//go:generate go run gen.go + +// Adapted from github.com/logrusorgru/grokky +package grok + +import ( + "bufio" + "errors" + "fmt" + "io" + "regexp" + "sort" + "strings" +) + +var ( + // ErrEmptyName arises when pattern name is an empty string + ErrEmptyName = errors.New("an empty name") + // ErrEmptyExpression arises when expression is an empty string + ErrEmptyExpression = errors.New("an empty expression") + // ErrAlreadyExist arises when pattern with given name already exists + ErrAlreadyExist = errors.New("the pattern already exist") + // ErrNotExist arises when pattern with given name doesn't exist + ErrNotExist = errors.New("pattern doesn't exist") +) + +// Host is a patterns collection. Host does not need to be kept around +// after all needed patterns are generated +type Host map[string]string + +// New returns new empty host +func New() Host { return make(Host) } + +// Add a new pattern to the Host. If a pattern name +// already exists then ErrAlreadyExist will be returned. 
+func (h Host) Add(name, expr string) error { + if name == "" { + return ErrEmptyName + } + if expr == "" { + return ErrEmptyExpression + } + if _, ok := h[name]; ok { + return ErrAlreadyExist + } + if _, err := h.compileExternal(expr); err != nil { + return err + } + h[name] = expr + return nil +} + +func (h Host) Must(name, expr string) { + if err := h.Add(name, expr); err != nil { + panic(fmt.Errorf("%s: %w", name, err)) + } +} + +func (h Host) compile(name string) (*Pattern, error) { + expr, ok := h[name] + if !ok { + return nil, ErrNotExist + } + return h.compileExternal(expr) +} + +var patternRegexp = regexp.MustCompile(`\%\{(\w+)(\:(\w+))?}`) + +func (h Host) compileExternal(expr string) (*Pattern, error) { + subs := patternRegexp.FindAllString(expr, -1) + ts := make(map[string]struct{}) + for _, s := range subs { + name, sem := split(s) + if _, ok := h[name]; !ok { + return nil, fmt.Errorf("the '%s' pattern doesn't exist", name) + } + ts[sem] = struct{}{} + } + if len(subs) == 0 { + r, err := regexp.Compile(expr) + if err != nil { + return nil, err + } + p := &Pattern{Regexp: r} + return p, nil + } + spl := patternRegexp.Split(expr, -1) + msi := make(map[string]int) + order := 1 // semantic order + var res string + for i := 0; i < len(spl)-1; i++ { + splPart := spl[i] + order += capCount(splPart) + sub := subs[i] + subName, subSem := split(sub) + p, err := h.compile(subName) + if err != nil { + return nil, err + } + sub = p.String() + subNumSubexp := p.NumSubexp() + subNumSubexp++ + sub = wrap(sub) + if subSem != "" { + msi[subSem] = order + } + res += splPart + sub + // add sub semantics to this semantics + for k, v := range p.s { + if _, ok := ts[k]; !ok { + msi[k] = order + v + } + } + order += subNumSubexp + } + res += spl[len(spl)-1] + r, err := regexp.Compile(res) + if err != nil { + return nil, err + } + p := &Pattern{Regexp: r} + p.s = msi + p.order = make(map[int]string) + for k, v := range msi { + p.order[v] = k + } + return p, nil +} + +func 
split(s string) (name, sem string) { + ss := patternRegexp.FindStringSubmatch(s) + if len(ss) >= 2 { + name = ss[1] + } + if len(ss) >= 4 { + sem = ss[3] + } + return +} + +func wrap(s string) string { return "(" + s + ")" } + +var ( + nonCapLeftRxp = regexp.MustCompile(`\(\?[imsU\-]*\:`) + nonCapFlagsRxp = regexp.MustCompile(`\(?[imsU\-]+\)`) +) + +func capCount(in string) int { + leftParens := strings.Count(in, "(") + nonCapLeft := len(nonCapLeftRxp.FindAllString(in, -1)) + nonCapBoth := len(nonCapFlagsRxp.FindAllString(in, -1)) + escapedLeftParens := strings.Count(in, `\(`) + return leftParens - nonCapLeft - nonCapBoth - escapedLeftParens +} + +// Get pattern by name from the Host. +func (h Host) Get(name string) (*Pattern, error) { + return h.compile(name) +} + +// Compile and get pattern without name (and without adding it to this Host) +func (h Host) Compile(expr string) (*Pattern, error) { + if expr == "" { + return nil, ErrEmptyExpression + } + return h.compileExternal(expr) +} + +type Pattern struct { + *regexp.Regexp + s map[string]int + order map[int]string + cache []string +} + +// Parse returns a map of matches on the input. The map can be empty. +func (p *Pattern) Parse(input string) map[string]string { + ss := p.FindStringSubmatch(input) + r := make(map[string]string) + if len(ss) <= 1 { + return r + } + for sem, order := range p.s { + r[sem] = ss[order] + } + return r +} + +func (p *Pattern) ParseValues(input string) []string { + a := p.FindStringSubmatchIndex(input) + if a == nil { + return nil + } + p.cache = p.cache[:0] + for i := 0; len(p.cache) < len(p.s); i++ { + if _, ok := p.order[i]; !ok { + continue + } + p.cache = append(p.cache, input[a[i*2]:a[i*2+1]]) + } + return p.cache +} + +// Names returns all names that this pattern has in order. 
+func (p *Pattern) Names() (ss []string) { + ss = make([]string, 0, len(p.s)) + for k := range p.s { + ss = append(ss, k) + } + sort.Slice(ss, func(i, j int) bool { + return p.s[ss[i]] < p.s[ss[j]] + }) + return +} + +// AddFromReader appends all patterns from the reader to this Host. +func (h Host) AddFromReader(r io.Reader) error { + scanner := bufio.NewScanner(r) + for scanner.Scan() { + if err := h.addFromLine(scanner.Text()); err != nil { + return err + } + } + if err := scanner.Err(); err != nil { + return err + } + return nil +} + +var lineRegexp = regexp.MustCompile(`^(\w+)\s+(.+)$`) + +func (h Host) addFromLine(line string) error { + sub := lineRegexp.FindStringSubmatch(line) + if len(sub) == 0 { // no match + return nil + } + return h.Add(sub[1], sub[2]) +} diff --git a/pkg/grok/host_test.go b/pkg/grok/host_test.go new file mode 100644 index 0000000000..acfd483e87 --- /dev/null +++ b/pkg/grok/host_test.go @@ -0,0 +1,100 @@ +package grok + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNew(t *testing.T) { + h := New() + require.Len(t, h, 0) + require.NotNil(t, h) +} + +func TestHost_Add(t *testing.T) { + h := New() + require.ErrorIs(t, h.Add("", "expr"), ErrEmptyName) + require.Len(t, h, 0) + require.ErrorIs(t, h.Add("name", ""), ErrEmptyExpression) + require.Len(t, h, 0) + require.NoError(t, h.Add("DIGIT", `\d`)) + require.Len(t, h, 1) + require.ErrorIs(t, h.Add("DIGIT", `[+-](0x)?\d`), ErrAlreadyExist) + require.Len(t, h, 1) + require.Error(t, h.Add("BAD", `(?![0-5])`)) + require.Len(t, h, 1) + require.NoError(t, h.Add("TWODIG", `%{DIGIT}-%{DIGIT}`)) + require.Len(t, h, 2) + require.Error(t, h.Add("THREE", `%{NOT}-%{EXIST}`)) + require.Len(t, h, 2) + require.NoError(t, h.Add("FOUR", `%{DIGIT:one}-%{DIGIT:two}`)) + require.Len(t, h, 3) + require.Error(t, h.Add("FIVE", `(?!\d)%{DIGIT}(?!\d)`)) + require.Len(t, h, 3) + require.NoError(t, h.Add("SIX", `%{FOUR:four}-%{DIGIT:six}`)) + require.Len(t, h, 4) +} + +func 
TestHost_Compile(t *testing.T) { + h := New() + _, err := h.Compile("") + require.ErrorIs(t, err, ErrEmptyExpression) + require.Len(t, h, 0) + p, err := h.Compile(`\d+`) + require.NoError(t, err) + require.NotNil(t, p) + require.Len(t, h, 0) +} + +func TestHost_Get(t *testing.T) { + h := New() + require.NoError(t, h.Add("DIG", `\d`)) + p, err := h.Get("DIG") + require.NoError(t, err) + require.NotNil(t, p) + p, err = h.Get("SEVEN") + require.ErrorIs(t, err, ErrNotExist) + require.Nil(t, p) +} + +func TestHost_AddFromReader(t *testing.T) { + s := `# +# for testing +# +ONE \d +TWO %{ONE:two} +THREE %{ONE:one}-%{TWO}-%{ONE:three} + +# +# enough +#` + h := New() + require.NoError(t, h.AddFromReader(strings.NewReader(s))) + require.Len(t, h, 3) + _, err := h.Get("ONE") + require.NoError(t, err) + _, err = h.Get("TWO") + require.NoError(t, err) + _, err = h.Get("THREE") + require.NoError(t, err) +} + +func TestHost_AddFromReader_malformedPatterns(t *testing.T) { + s := ` +ONE \d +TWO %{THREE:two}` + require.Error(t, New().AddFromReader(strings.NewReader(s))) +} + +func TestHost_inject(t *testing.T) { + h := New() + h["TWO"] = `(?!\d)` + require.Error(t, h.Add("ONE", `%{TWO:one}`)) +} + +func TestHost_addFromLine(t *testing.T) { + h := New() + require.Error(t, h.addFromLine("ONE (?!\\d)")) +} diff --git a/pkg/grok/patterns_test.go b/pkg/grok/patterns_test.go new file mode 100644 index 0000000000..449117f598 --- /dev/null +++ b/pkg/grok/patterns_test.go @@ -0,0 +1,72 @@ +package grok + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func mssTest(expect, got map[string]string) bool { + if len(expect) != len(got) { + return false + } + for k, v := range expect { + if v != got[k] { + return false + } + } + return true +} + +func TestPattern_Parse(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `%{ONE:one}-%{ONE:two}`)) + require.NoError(t, h.Add("THREE", `%{ONE:zero}-%{TWO:three}`)) + p, err := 
h.Get("ONE") + require.NoError(t, err) + require.NotNil(t, p.Parse("1")) + p, err = h.Get("TWO") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "2"}, p.Parse("1-2")) + p, err = h.Get("THREE") + require.NoError(t, err) + require.Equal(t, map[string]string{ + "one": "1", + "two": "2", + "zero": "0", + "three": "1-2", + }, p.Parse("0-1-2")) + require.NoError(t, h.Add("FOUR", `%{TWO:two}`)) + p, err = h.Get("FOUR") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "1-2"}, p.Parse("1-2")) +} + +func TestPattern_nestedGroups(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `(?:%{ONE:one})-(?:%{ONE:two})?`)) + p, err := h.Get("TWO") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "2"}, p.Parse("1-2")) + require.Equal(t, map[string]string{"one": "1", "two": ""}, p.Parse("1-")) +} + +func TestPattern_Names(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `%{ONE:one}-%{ONE:two}`)) + require.NoError(t, h.Add("THREE", `%{ONE:zero}-%{TWO:three}`)) + p, err := h.Get("THREE") + require.NoError(t, err) + require.Equal(t, []string{"zero", "three", "one", "two"}, p.Names()) +} + +func TestPattern_ParseValues(t *testing.T) { + b := NewBase() + p, err := b.Compile("%{TIMESTAMP_ISO8601:event_time} %{LOGLEVEL:log_level} %{GREEDYDATA:log_message}") + require.NoError(t, err) + ss := p.ParseValues("2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message") + require.Equal(t, []string{"2020-09-16T04:20:42.45+01:00", "DEBUG", "This is a sample debug log message"}, ss) +} diff --git a/runtime/expr/function/function.go b/runtime/expr/function/function.go index 886bc046f7..1ec4426a32 100644 --- a/runtime/expr/function/function.go +++ b/runtime/expr/function/function.go @@ -33,6 +33,9 @@ func New(zctx *zed.Context, name string, narg int) (expr.Function, field.Path, e // 
special grep form will make it look like a function call // and we don't want the error to say unknown function. return nil, nil, errors.New("syntax error") + case "grok": + argmin, argmax = 2, 3 + f = newGrok(zctx) case "len": f = &LenFn{zctx: zctx} case "abs": diff --git a/runtime/expr/function/grok.go b/runtime/expr/function/grok.go new file mode 100644 index 0000000000..3a04aa2a95 --- /dev/null +++ b/runtime/expr/function/grok.go @@ -0,0 +1,111 @@ +package function + +import ( + "errors" + "regexp" + "strings" + + "github.com/brimdata/zed" + "github.com/brimdata/zed/pkg/grok" + "github.com/brimdata/zed/zcode" + "golang.org/x/exp/slices" +) + +type Grok struct { + zctx *zed.Context + builder zcode.Builder + hosts map[string]*host +} + +func newGrok(zctx *zed.Context) *Grok { + return &Grok{ + zctx: zctx, + hosts: make(map[string]*host), + } +} + +func (g *Grok) Call(ectx zed.Allocator, vals []zed.Value) *zed.Value { + v, err := g.call(ectx, vals) + if err != nil { + return ectx.CopyValue(*g.zctx.NewErrorf("grok(): %s", err)) + } + return v +} + +func (g *Grok) call(ectx zed.Allocator, vals []zed.Value) (*zed.Value, error) { + extraArg, patternArg, inputArg := zed.NullString, vals[0], vals[1] + if len(vals) == 3 { + extraArg, patternArg, inputArg = &vals[0], vals[1], vals[2] + } + if zed.TypeUnder(patternArg.Type) != zed.TypeString { + return nil, errors.New("pattern argument be a string") + } + if zed.TypeUnder(inputArg.Type) != zed.TypeString { + return nil, errors.New("input argument be a string") + } + if zed.TypeUnder(extraArg.Type) != zed.TypeString { + return nil, errors.New("external pattern argument must be a string") + } + h, err := g.getHost(extraArg.AsString()) + if err != nil { + return nil, err + } + p, err := h.getPattern(g.zctx, patternArg.AsString()) + if err != nil { + return nil, err + } + ss := p.ParseValues(inputArg.AsString()) + if ss == nil { + return nil, errors.New("no match") + } + g.builder.Reset() + for _, s := range ss { + 
g.builder.Append([]byte(s)) + } + return ectx.NewValue(p.typ, slices.Clone(g.builder.Bytes())), nil +} + +func (g *Grok) getHost(extra string) (*host, error) { + h, ok := g.hosts[extra] + if !ok { + h = &host{Host: grok.NewBase(), patterns: make(map[string]*pattern)} + if err := h.AddFromReader(strings.NewReader(extra)); err != nil { + return nil, err + } + g.hosts[extra] = h + } + return h, nil +} + +type host struct { + grok.Host + patterns map[string]*pattern +} + +var lineRegexp = regexp.MustCompile(`^(\w+)\s+(.+)$`) + +func (h *host) getPattern(zctx *zed.Context, patternArg string) (*pattern, error) { + p, ok := h.patterns[patternArg] + if !ok { + pat, err := h.Host.Compile(patternArg) + if err != nil { + return nil, err + } + var fields []zed.Field + for _, name := range pat.Names() { + fields = append(fields, zed.NewField(name, zed.TypeString)) + } + typ, err := zctx.LookupTypeRecord(fields) + if err != nil { + return nil, err + } + p = &pattern{Pattern: pat, typ: typ} + h.patterns[patternArg] = p + } + return p, nil +} + +type pattern struct { + *grok.Pattern + typ zed.Type +} diff --git a/runtime/expr/function/ztests/grok.yaml b/runtime/expr/function/ztests/grok.yaml new file mode 100644 index 0000000000..a391ecf66f --- /dev/null +++ b/runtime/expr/function/ztests/grok.yaml @@ -0,0 +1,23 @@ +script: | + zq -z 'yield grok(pattern, field)' simple.zson + echo "// ===" + echo '"0-1-2"' | zq -z -I patterns.zed - + +inputs: + - name: simple.zson + data: | + { + field: "2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message", + pattern: "%{TIMESTAMP_ISO8601:event_time} %{LOGLEVEL:log_level} %{GREEDYDATA:log_message}" + } + - name: patterns.zed + data: | + const pattern = "ONE \\d\n" + "TWO %{ONE:one}-%{ONE:two}" + yield grok(pattern, "%{ONE:zero}-%{TWO:three}", this) + +outputs: + - name: stdout + data: | + {event_time:"2020-09-16T04:20:42.45+01:00",log_level:"DEBUG",log_message:"This is a sample debug log message"} + // === + 
{zero:"0",three:"1-2",one:"1",two:"2"}