Skip to content

Commit 6fb7b58

Browse files
committed
Add finite-automaton simplifier, for re2 and graal
As I discovered a while ago, finite automaton engines are not very fond of large bounded repetitions. In re2 and regex, that mostly translates to increased memory consumption (e.g. in their default modes, converting `.*` to `.{0,500}` increases the pattern's size by 115x in re2 and 84x in regex; if a capture is added on top then regex balloons to 219x). There is a performance impact too, but it's in the high single digit to low double digit percent range, in regex at least (didn't test re2). However, as it turns out, Graal uses a JIT-ed DFA, and it *really* doesn't like these patterns: it spends a lot of time JIT-compiling them (this is apparently the source of the extra 300% CPU use I could observe on what are purely single-threaded workloads, the JIT desperately trying to optimise regexes) with no gain in performance. Down-converting the regex back to the sensible form increases performance by ~25%, though it doesn't seem to impact memory use... So... do that: `fa_simplifier` is the same idea as ua-parser/uap-rust@29b9195 but from the Python side, and applied to graal and re2 (not regex because it does that internally as linked above). Also switch Graal over to the lazy builtins: it kinda spreads the cost, but it seems stupid to compile the regexes only to immediately swap (fa_simplifier) and recompile them... so don't do that, especially as I couldn't be arsed to make the replacement conditional (so every eager regex is recompiled, even though only those which actually got modified by `fa_simplifier` need it...). Fixes #228
1 parent 1358e75 commit 6fb7b58

File tree

6 files changed

+82
-9
lines changed

6 files changed

+82
-9
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ module = [
110110
"test_core",
111111
"test_caches",
112112
"test_parsers_basics",
113+
"test_fa_simplifier",
113114
]
114115

115116
#check_untyped_defs = false

src/ua_parser/__init__.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
UserAgent,
5858
)
5959
from .loaders import load_builtins, load_lazy_builtins
60+
from .utils import IS_GRAAL
6061

6162
Re2Resolver: Optional[Callable[[Matchers], Resolver]] = None
6263
if importlib.util.find_spec("re2"):
@@ -132,10 +133,11 @@ def parse_device(self: Resolver, ua: str) -> Optional[Device]:
132133
def __getattr__(name: str) -> Parser:
    """Lazily create the module-level ``parser`` attribute (PEP 562).

    Module ``__getattr__`` is only invoked when normal attribute lookup
    fails, so the parser is built on first access to ``parser`` and then
    stored as a real module global; later accesses find the global and
    never reach this function again.

    :param name: the attribute being looked up on the module
    :returns: the shared default parser when ``name`` is ``"parser"``
    :raises AttributeError: for any other attribute name
    """
    global parser
    if name == "parser":
        # Lazy builtins are used when the regexes will not be consumed
        # as-is: either re2 is available, or we are running on Graal.
        if Re2Resolver or IS_GRAAL:
            matchers = load_lazy_builtins()
        else:
            matchers = load_builtins()
        # Bind the global so the parser is built once, not rebuilt on
        # every access to the ``parser`` attribute.
        parser = Parser.from_matchers(matchers)
        return parser
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
140142

141143

src/ua_parser/basic.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
__all__ = ["Resolver"]
22

3+
import re
4+
from itertools import chain
35
from operator import methodcaller
4-
from typing import List
6+
from typing import Any, List
57

68
from .core import (
79
Device,
@@ -12,6 +14,7 @@
1214
PartialResult,
1315
UserAgent,
1416
)
17+
from .utils import IS_GRAAL, fa_simplifier
1518

1619

1720
class Resolver:
@@ -30,6 +33,24 @@ def __init__(
3033
matchers: Matchers,
3134
) -> None:
3235
self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers
36+
if IS_GRAAL:
37+
matcher: Any
38+
kind = next(
39+
(
40+
"eager" if hasattr(type(m), "regex") else "lazy"
41+
for m in chain.from_iterable(matchers)
42+
),
43+
None,
44+
)
45+
if kind == "eager":
46+
for matcher in chain.from_iterable(matchers):
47+
matcher.pattern = re.compile(
48+
fa_simplifier(matcher.pattern.pattern),
49+
flags=matcher.pattern.flags,
50+
)
51+
elif kind == "lazy":
52+
for matcher in chain.from_iterable(matchers):
53+
matcher.regex = fa_simplifier(matcher.pattern.pattern)
3354

3455
def __call__(self, ua: str, domains: Domain, /) -> PartialResult:
3556
parse = methodcaller("__call__", ua)

src/ua_parser/re2.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
PartialResult,
1515
UserAgent,
1616
)
17+
from .utils import fa_simplifier
1718

1819

1920
class DummyFilter:
@@ -38,15 +39,15 @@ def __init__(
3839
if self.user_agent_matchers:
3940
self.ua = re2.Filter()
4041
for u in self.user_agent_matchers:
41-
self.ua.Add(u.regex)
42+
self.ua.Add(fa_simplifier(u.regex))
4243
self.ua.Compile()
4344
else:
4445
self.ua = DummyFilter()
4546

4647
if self.os_matchers:
4748
self.os = re2.Filter()
4849
for o in self.os_matchers:
49-
self.os.Add(o.regex)
50+
self.os.Add(fa_simplifier(o.regex))
5051
self.os.Compile()
5152
else:
5253
self.os = DummyFilter()
@@ -58,9 +59,9 @@ def __init__(
5859
# no pattern uses global flags, but since they're not
5960
# supported in JS that seems safe.
6061
if d.flags & re.IGNORECASE:
61-
self.devices.Add("(?i)" + d.regex)
62+
self.devices.Add("(?i)" + fa_simplifier(d.regex))
6263
else:
63-
self.devices.Add(d.regex)
64+
self.devices.Add(fa_simplifier(d.regex))
6465
self.devices.Compile()
6566
else:
6667
self.devices = DummyFilter()

src/ua_parser/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import platform
12
import re
23
from typing import Match, Optional
34

5+
IS_GRAAL: bool = platform.python_implementation() == "GraalVM"
6+
47

58
def get(m: Match[str], idx: int) -> Optional[str]:
    """Return capture group ``idx`` of ``m``, or ``None``.

    ``None`` is returned when ``idx`` is not a valid group number for
    the pattern (less than 1 or greater than the number of groups), and
    also when the group did not participate or matched the empty string.
    """
    if not 0 < idx <= m.re.groups:
        return None
    return m[idx] or None
@@ -28,3 +31,33 @@ def replacer(repl: str, m: Match[str]) -> Optional[str]:
2831
return None
2932

3033
return re.sub(r"\$(\d)", lambda n: get(m, int(n[1])) or "", repl).strip() or None
34+
35+
36+
# Large bounded repetitions with a lower bound of 0 or 1 and an upper
# bound of at least three digits, e.g. ``{0,500}`` or ``{1,100}``.
REPETITION_PATTERN = re.compile(r"\{(0|1)\s*,\s*\d{3,}\}")

# Tokeniser for the class rewriting pass. Alternatives are tried in
# order at every position:
#
# * group 1: a complete bracketed character class, whose body is made of
#   escape pairs and non-``]`` characters, so an escaped closing bracket
#   inside the class does not terminate it;
# * group 2: a bare digit / word shorthand outside of any class;
# * no group: any other escape pair, consumed untouched so that an
#   escaped (literal) opening bracket is never mistaken for the start of
#   a class and an escaped backslash never pairs with the next letter.
CLASS_PATTERN = re.compile(
    r"""
    (\[\^?\]?(?:\\.|[^]\\])*\])
    |
    \\(d|w)
    |
    \\.
    """,
    re.VERBOSE | re.DOTALL,
)

# One escape pair inside a character class.
_CLASS_ESCAPE_PATTERN = re.compile(r"\\(.)", re.DOTALL)
# ASCII expansions of the shorthands, as they appear *inside* a class.
_SHORTHAND_RANGES = {"d": "0-9", "w": "A-Za-z0-9_"}


def class_replacer(m: Match[str]) -> str:
    """Expand ``\\d`` / ``\\w`` in one token matched by ``CLASS_PATTERN``.

    :param m: a match produced by ``CLASS_PATTERN``
    :returns: the replacement text for the matched token: the class with
              its shorthands expanded to explicit ranges, a new bracketed
              class for a bare shorthand, or the token unchanged for any
              other escape sequence
    """
    if m[1] is not None:
        # Inside a class: walk escape pairs so that an escaped backslash
        # followed by a literal ``d``/``w`` is left alone.
        return _CLASS_ESCAPE_PATTERN.sub(
            lambda e: _SHORTHAND_RANGES.get(e[1], e[0]),
            m[1],
        )
    if m[2] is not None:
        # Bare shorthand: needs its own brackets.
        return "[" + _SHORTHAND_RANGES[m[2]] + "]"
    # Unrelated escape sequence, only matched to skip over it.
    return m[0]


def fa_simplifier(pattern: str) -> str:
    """uap-core makes significant use of large bounded repetitions, to
    mitigate catastrophic backtracking.

    However this explodes the number of states (and thus graph size)
    for finite automaton engines, which significantly increases their
    memory use, and for those which use JITs it can exceed the JIT
    threshold and force fallback to a slower engine (seems to be the
    case for graal's TRegex).

    Two rewrites are applied, in order:

    1. ``{0,N}`` becomes ``*`` and ``{1,N}`` becomes ``+`` when ``N``
       has three digits or more;
    2. ``\\d`` and ``\\w`` are replaced by the explicit ASCII ranges
       ``0-9`` and ``A-Za-z0-9_``, both inside and outside bracketed
       character classes. Escaped brackets and escaped backslashes are
       skipped so literal brackets are not confused with classes.

    :param pattern: the regex source to simplify
    :returns: the simplified regex source
    """
    pattern = REPETITION_PATTERN.sub(lambda m: "*" if m[1] == "0" else "+", pattern)
    return CLASS_PATTERN.sub(class_replacer, pattern)

tests/test_fa_simplifier.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import pytest # type: ignore
2+
3+
from ua_parser.utils import fa_simplifier
4+
5+
6+
@pytest.mark.parametrize(
    ("pattern", "expected"),
    [
        (r"\d", "[0-9]"),
        (r"[\d]", "[0-9]"),
        (r"[\d\.]", r"[0-9\.]"),
    ],
)
def test_classes(pattern, expected):
    """``\\d`` is expanded to an explicit range, bare or inside a class."""
    simplified = fa_simplifier(pattern)
    assert simplified == expected

0 commit comments

Comments
 (0)