diff --git a/README b/README deleted file mode 100644 index 06be63b..0000000 --- a/README +++ /dev/null @@ -1,4 +0,0 @@ -Python wrapper for semicomplete's Grok library. - -Grok allows you to easily parse logs and other files and turns the unstructured -log and event data into structured data. diff --git a/README.md b/README.md new file mode 100644 index 0000000..9e491f7 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +Python wrapper for semicomplete's Grok library. + +About +----- +Grok allows you to easily parse logs and other files and turns the unstructured log and event data into structured data. + +Installing +---------- + +You will need libgrok installed in order to use libgrok-py. On MacOSX, Grok is available via Homebrew: + +_Install Grok Dependencies_ + + brew install tokyo-cabinet pcre libevent + +_Install Grok_ + + brew install grok + +_Install libgrok-py_ + + python test/testlibgrok.py + python setup.py install + +Usage +----- + + >>> import libgrok + >>> grok = libgrok.Grok() + >>> grok.add_patterns_from_file('test/patterns/base') + >>> grok.compile('%{URI:foo}') + >>> match = grok("http://www.example.com/test/") + >>> match.captures.items() + [('USERNAME', ''), ('HOSTNAME', 'www.example.com'), ('URIPATH', '/test/'), ('IPORHOST', 'www.example.com'), ('POSINT:port', ''), ('URIPROTO', 'http'), ('IP', ''), ('URIHOST', 'www.example.com'), ('URIPATHPARAM', '/test/'), ('URI:foo', 'http://www.example.com/test/'), ('URIPARAM', ''), ('USER', '')] + >>> match["foo"] + 'http://www.example.com/test/' diff --git a/demo.py b/demo.py deleted file mode 100644 index aaac7bd..0000000 --- a/demo.py +++ /dev/null @@ -1,11 +0,0 @@ -import libgrok -import sys - -g = libgrok.Grok() - -for filename in sys.argv[1:]: - g.add_patterns_from_file(filename) - -g.compile(r"^%{NUMBER}$") -print g("200") -print g("404") diff --git a/libgrok.py b/libgrok.py deleted file mode 100644 index 5479e6f..0000000 --- a/libgrok.py +++ /dev/null @@ -1,50 +0,0 @@ -import
ctypes - - -_libgrok = ctypes.cdll.LoadLibrary('libgrok.so') - -_grok_new = _libgrok.grok_new -_grok_new.argtypes = [] -_grok_new.restype = ctypes.c_void_p - -_grok_free = _libgrok.grok_free -_grok_free.argtypes = [ctypes.c_void_p] - -_grok_compile = _libgrok.grok_compile -_grok_compile.argtypes = [ctypes.c_void_p, ctypes.c_char_p] -_grok_compile.restype = ctypes.c_int - -_grok_exec = _libgrok.grok_exec -_grok_exec.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] -_grok_exec.restype = ctypes.c_int - -_grok_pattern_add = _libgrok.grok_pattern_add -_grok_pattern_add.argtypes = [ctypes.c_void_p, - ctypes.c_char_p, ctypes.c_size_t, - ctypes.c_char_p, ctypes.c_size_t] -_grok_pattern_add.restype = ctypes.c_int - -_grok_patterns_import_from_file = _libgrok.grok_patterns_import_from_file -_grok_patterns_import_from_file.argtypes = [ctypes.c_void_p, ctypes.c_char_p] -_grok_patterns_import_from_file.restype = ctypes.c_int - - -class Grok(object): - - def __init__(self): - self._grok = _grok_new() - - def __del__(self): - _grok_free(self._grok) - - def add_pattern(self, name, pattern): - _grok_pattern_add(self._grok, name, len(name), pattern, len(pattern)) - - def add_patterns_from_file(self, filename): - _grok_patterns_import_from_file(self._grok, filename) - - def compile(self, pattern): - _grok_compile(self._grok, pattern) - - def __call__(self, text): - return _grok_exec(self._grok, text, None) diff --git a/libgrok/__init__.py b/libgrok/__init__.py new file mode 100644 index 0000000..3bfe365 --- /dev/null +++ b/libgrok/__init__.py @@ -0,0 +1 @@ +from grok import Grok, GrokError, GrokMatch diff --git a/libgrok/_libgrok.py b/libgrok/_libgrok.py new file mode 100644 index 0000000..98bc54a --- /dev/null +++ b/libgrok/_libgrok.py @@ -0,0 +1,60 @@ +import sys +import ctypes as CTYPES +from ctypes.util import find_library + +_libgrok_lib_name = find_library('grok') + +_libgrok_so = CTYPES.cdll.LoadLibrary(_libgrok_lib_name) + +GROK_OK = 0 + +class 
_grok_match(CTYPES.Structure): + _fields_ = [("grok_t", CTYPES.c_void_p), + ("subject", CTYPES.c_char_p), + ("start", CTYPES.c_int), + ("end", CTYPES.c_int)] + +_grok_match_p = CTYPES.POINTER(_grok_match) + +_grok_new = _libgrok_so.grok_new +_grok_new.argtypes = [] +_grok_new.restype = CTYPES.c_void_p + +_grok_free = _libgrok_so.grok_free +_grok_free.argtypes = [CTYPES.c_void_p] + +_grok_compile = _libgrok_so.grok_compile +_grok_compile.argtypes = [CTYPES.c_void_p, CTYPES.c_char_p] +_grok_compile.restype = CTYPES.c_int + +_grok_exec = _libgrok_so.grok_exec +_grok_exec.argtypes = [CTYPES.c_void_p, CTYPES.c_char_p, _grok_match_p] +_grok_exec.restype = CTYPES.c_int + +_grok_pattern_add = _libgrok_so.grok_pattern_add +_grok_pattern_add.argtypes = [CTYPES.c_void_p, + CTYPES.c_char_p, CTYPES.c_size_t, + CTYPES.c_char_p, CTYPES.c_size_t] +_grok_pattern_add.restype = CTYPES.c_int + +_grok_patterns_import_from_file = _libgrok_so.grok_patterns_import_from_file +_grok_patterns_import_from_file.argtypes = [CTYPES.c_void_p, CTYPES.c_char_p] +_grok_patterns_import_from_file.restype = CTYPES.c_int + +_grok_match_get_named_substring = _libgrok_so.grok_match_get_named_substring +_grok_match_get_named_substring.argtypes = [_grok_match_p, CTYPES.c_char_p, CTYPES.POINTER(CTYPES.c_char_p), CTYPES.POINTER(CTYPES.c_int)] +_grok_match_get_named_substring.restype = CTYPES.c_int + +_grok_match_walk_init = _libgrok_so.grok_match_walk_init +_grok_match_walk_init.argtypes = [_grok_match_p] + +_grok_match_walk_next = _libgrok_so.grok_match_walk_next +_grok_match_walk_next.argtypes = [_grok_match_p, + CTYPES.POINTER(CTYPES.c_char_p), + CTYPES.POINTER(CTYPES.c_int), + CTYPES.POINTER(CTYPES.c_char_p), + CTYPES.POINTER(CTYPES.c_int)] +_grok_match_walk_next.restype = CTYPES.c_int + +_grok_match_walk_end = _libgrok_so.grok_match_walk_end +_grok_match_walk_end.argtypes = [_grok_match_p] diff --git a/libgrok/grok.py b/libgrok/grok.py new file mode 100644 index 0000000..4d2b1b8 --- /dev/null +++ 
b/libgrok/grok.py @@ -0,0 +1,118 @@ +import ctypes as CTYPES +import _libgrok + +_fixed_buffer_size = 4096 + +class GrokError(Exception): + def __init__(self, message=None, err=0): + if message: + Exception.__init__(self, message) + else: + Exception.__init__(self, self.error_to_message(err)) + + def error_to_message(self, err): + if err == 1: + return "File not found" + if err == 2: + return "Pattern not found" + if err == 3: + return "Unexpected read size" + if err == 4: + return "Compile failed" + if err == 5: + return "Uninitialized" + if err == 6: + return "PCRE Error" + if err == 7: + return "No match" + else: + return "Unknown Error: %d" % (err) + +class GrokMatch(object): + def __init__(self): + self._grok_match = _libgrok._grok_match() + self._grok_match_ptr = CTYPES.pointer(self._grok_match) + self._captures = None + + @property + def subject(self): + return self._grok_match.subject + + @property + def start(self): + return self._grok_match.start + + @property + def end(self): + return self._grok_match.end + + @property + def captures(self): + if self._captures is None: + self._captures = dict() + for name, data in self.walk(): + self._captures[name] = data + return self._captures + + def walk(self): + _libgrok._grok_match_walk_init(self._grok_match_ptr) + # Create Buffers + name = CTYPES.create_string_buffer( _fixed_buffer_size) + name_ptr = CTYPES.c_char_p(CTYPES.addressof(name)) + name_len = CTYPES.c_int(0) + data = CTYPES.create_string_buffer( _fixed_buffer_size) + data_ptr = CTYPES.c_char_p(CTYPES.addressof(data)) + data_len = CTYPES.c_int(0) + while _libgrok._grok_match_walk_next(self._grok_match_ptr, + CTYPES.byref(name_ptr), + CTYPES.byref(name_len), + CTYPES.byref(data_ptr), + CTYPES.byref(data_len)) == _libgrok.GROK_OK: + yield CTYPES.string_at(name_ptr, name_len.value), CTYPES.string_at(data_ptr, data_len.value) + _libgrok._grok_match_walk_end(self._grok_match_ptr) + + def __getitem__(self, k): + # Create Buffer + substring = 
CTYPES.create_string_buffer( _fixed_buffer_size) + substring_ptr = CTYPES.c_char_p(CTYPES.addressof(substring)) + substring_len = CTYPES.c_int(0) + ret = _libgrok._grok_match_get_named_substring(self._grok_match_ptr, k, CTYPES.byref(substring_ptr), CTYPES.byref(substring_len)) + if ret != _libgrok.GROK_OK: + return None + return CTYPES.string_at(substring_ptr, substring_len.value) + +class Grok(object): + + def __init__(self): + self._grok = _libgrok._grok_new() + + def __del__(self): + _libgrok._grok_free(self._grok) + + def add_pattern(self, name, pattern): + ret = _libgrok._grok_pattern_add(self._grok, name, len(name), pattern, len(pattern)) + if ret != _libgrok.GROK_OK: + raise GrokError(err=ret) + + def add_patterns_from_file(self, filename): + ret = _libgrok._grok_patterns_import_from_file(self._grok, filename) + if ret != _libgrok.GROK_OK: + raise GrokError(err=ret) + + def compile(self, pattern): + ret = _libgrok._grok_compile(self._grok, pattern) + if ret != _libgrok.GROK_OK: + raise GrokError(err=ret) + + def execute(self, text, match=None): + grok_match_p = None if match is None else match._grok_match_ptr + ret = _libgrok._grok_exec(self._grok, text, grok_match_p) + return ret == _libgrok.GROK_OK + + def __call__(self, text): + match = GrokMatch() + if self.execute(text, match): + return match + else: + return None + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d432a88 --- /dev/null +++ b/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup + +setup( + name = "libgrok", + version = "0.0.1", + author = "Matt Goodall", + author_email = "matt.goodall@gmail.com", + description = ("Python wrapper for semicomplete's Grok"), + license = "BSD", + url = "https://github.com/emgee/libgrok-py", + packages=['libgrok'], +) diff --git a/test/patterns/base b/test/patterns/base new file mode 100755 index 0000000..95a9c4d --- /dev/null +++ b/test/patterns/base @@ -0,0 +1,97 @@ +USERNAME [a-zA-Z0-9_-]+ +USER %{USERNAME} +INT 
(?:[+-]?(?:[0-9]+)) +BASE10NUM (?[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))) +NUMBER (?:%{BASE10NUM}) +BASE16NUM (?(?"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``)) +UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} + +# Networking +MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) +CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) +WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) +COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) +IP (?/(?>[\w_%!$@:.,-]+|\\.)*)+ +#UNIXPATH (?/dev/pts/%{NONNEGINT}) +BSDTTY (?>/dev/tty[pq][a-z0-9]) +TTY (?:%{BSDTTY}|%{LINUXTTY}) +WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+ +URIPROTO [A-Za-z]+(\+[A-Za-z+]+)? +URIHOST %{IPORHOST}(?::%{POSINT:port})? +# uripath comes loosely from RFC1738, but mostly from what Firefox +# doesn't turn into %XX +URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=#%_-]*)+ +#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? +URIPARAM \?[A-Za-z0-9$.+!*'|(){},~#%&/=:;_?-\[\]]* +URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? +URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? + +# Months: January, Feb, 3, 03, 12, December +MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b +MONTHNUM (?:0?[1-9]|1[0-2]) +MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) + +# Days: Monday, Tue, Thu, etc... +DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) + +# Years? +YEAR (?>\d\d){1,2} +# Time: HH:MM:SS +#TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)? +# I'm still on the fence about using grok to perform the time match, +# since it's probably slower. +# TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)? +HOUR (?:2[0123]|[01][0-9]) +MINUTE (?:[0-5][0-9]) +# '60' is a leap second in most time standards and thus is valid. +SECOND (?:(?:[0-5][0-9]|60)(?:[.,][0-9]+)?) 
+TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) +# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) +DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} +DATE_EU %{YEAR}[/-]%{MONTHNUM}[/-]%{MONTHDAY} +ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) +ISO8601_SECOND (?:%{SECOND}|60) +TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? +DATE %{DATE_US}|%{DATE_EU} +DATESTAMP %{DATE}[- ]%{TIME} +TZ (?:[PMCE][SD]T) +DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} +DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} + +# Syslog Dates: Month Day HH:MM:SS +SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} +PROG (?:[\w._/%-]+) +SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? +SYSLOGHOST %{IPORHOST} +SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> +HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} + +# Shortcuts +QS %{QUOTEDSTRING} + +# Log formats +SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: +COMBINEDAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)" %{NUMBER:response} (?:%{NUMBER:bytes}|-) %{QS:referrer} %{QS:agent} + +# Log Levels +LOGLEVEL ([T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE) diff --git a/test/testlibgrok.py b/test/testlibgrok.py new file mode 100644 index 0000000..741a3a3 --- /dev/null +++ b/test/testlibgrok.py @@ -0,0 +1,66 @@ +import os +import sys +import unittest + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from libgrok import * + +class GrokTestCase(unittest.TestCase): + + def setUp(self): + self.grok = Grok() + self.grok.add_patterns_from_file('test/patterns/base') + + def tearDown(self): + 
del self.grok + + def test_grok_add_patterns_file(self): + self.assertRaises(GrokError, self.grok.add_patterns_from_file, 'nosuchfile') + + def test_grok_compile(self): + self.assertRaises(GrokError, self.grok.compile, '%{URI)') + + def test_grok_substring(self): + self.grok.compile("%{URI}") + match = self.grok('https://example.com/test/') + self.assertEquals(match['URIPROTO'], 'https') + self.assertEquals(match['URIPATH'], '/test/') + self.assertEquals(match['foo'], None) + + def test_grok_substring_named(self): + self.grok.compile("%{URI:foo}") + match = self.grok('https://example.com/test/') + self.assertEquals(match['URIPROTO'], 'https') + self.assertEquals(match['URIPATH'], '/test/') + self.assertEquals(match['foo'], 'https://example.com/test/') + self.assertEquals(match['bar'], None) + + def test_grok_captures(self): + self.grok.compile("%{URI}") + match = self.grok('https://example.com/test/') + self.assertTrue('HOSTNAME' in match.captures) + self.assertEquals(match.captures['HOSTNAME'], 'example.com') + self.assertTrue('URIPROTO' in match.captures) + self.assertEquals(match.captures['URIPROTO'], 'https') + self.assertTrue('URIPATH' in match.captures) + self.assertEquals(match.captures['URIPATH'], '/test/') + + def test_grok_captures_named(self): + self.grok.compile("%{URI:foo}") + match = self.grok('https://example.com/test/') + self.assertTrue('URI:foo' in match.captures) + self.assertEquals(match.captures['URI:foo'], 'https://example.com/test/') + + def test_grok_execute(self): + self.grok.compile("%{URI}") + self.assertTrue(self.grok.execute('https://example.com/test/')) + self.assertFalse(self.grok.execute('thisisnotauri.com')) + + def test_grok_call(self): + self.grok.compile("%{URI}") + self.assertTrue(self.grok('https://example.com/test/') != None) + self.assertTrue(self.grok('thisisnotauri.com') == None) + +if __name__ == "__main__": + unittest.main()