WPT: Handle flake detection in ServoHandler (#37540)

With this PR, ServoHandler stores the state of the first suite and treats
every subsequent suite as if it had been launched by `--retry-unexpected`,
using its results to mark unexpected results from the first run as flaky.
The stats used to display the currently running tests are still reset per
suite. This lets us use `--retry-unexpected=1` for flake detection instead
of manually rerunning the unexpected tests, which will help with proper
subsuite support.
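
For orientation, here is a minimal, self-contained sketch of the idea using
hypothetical names (FlakeAwareHandler, Unexpected, test_end); it is not the
real ServoHandler, only an illustration of keeping the first suite's
unexpected results and treating later suites as retry passes:

from dataclasses import dataclass
from typing import List


@dataclass
class Unexpected:
    path: str
    flaky: bool = False


class FlakeAwareHandler:
    def __init__(self, detect_flakes: bool = False) -> None:
        self.detect_flakes = detect_flakes
        self.currently_detecting_flakes = False
        self.unexpected_results: List[Unexpected] = []

    def suite_start(self) -> None:
        # Unexpected results left over from an earlier suite mean this suite
        # is a retry pass launched only to detect flakes.
        if self.detect_flakes and self.unexpected_results:
            self.currently_detecting_flakes = True

    def test_end(self, path: str, unexpected: bool) -> None:
        if self.currently_detecting_flakes:
            if not unexpected:
                # The retried test passed, so its earlier failure was a flake.
                for result in self.unexpected_results:
                    if result.path == path:
                        result.flaky = True
            return
        if unexpected:
            self.unexpected_results.append(Unexpected(path))

    def any_stable_unexpected(self) -> bool:
        # Only results that stayed unexpected on retry should fail the run.
        return any(not result.flaky for result in self.unexpected_results)

Because wptrunner itself launches the retry suite when `--retry-unexpected=1`
is passed, run_tests no longer needs its own rerun loop (removed below).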

Testing: A manual CI run confirming that the end results stay the same:
https://github.com/sagudev/servo/actions/runs/15886712204
Fixes: #37319

---------

Signed-off-by: sagudev <16504129+sagudev@users.noreply.github.com>
sagudev 2025-06-26 10:53:07 +02:00 committed by GitHub
parent 3e1cdacd07
commit b9f9abee91
GPG key ID: B5690EEEBB952194
2 changed files with 56 additions and 36 deletions


@@ -112,7 +112,13 @@ class ServoHandler(mozlog.reader.LogHandler):
     """LogHandler designed to collect unexpected results for use by
     script or by the ServoFormatter output formatter."""

-    def __init__(self):
+    def __init__(self, detect_flakes=False):
+        """
+        Flake detection assumes first suite is actual run
+        and rest of the suites are retry-unexpected for flakes detection.
+        """
+        self.detect_flakes = detect_flakes
+        self.currently_detecting_flakes = False
         self.reset_state()

     def reset_state(self):
@@ -120,6 +126,9 @@ class ServoHandler(mozlog.reader.LogHandler):
         self.completed_tests = 0
         self.need_to_erase_last_line = False
         self.running_tests: Dict[str, str] = {}
+        if self.currently_detecting_flakes:
+            return
+        self.currently_detecting_flakes = False
         self.test_output = collections.defaultdict(str)
         self.subtest_failures = collections.defaultdict(list)
         self.tests_with_failing_subtests = []
@@ -146,8 +155,17 @@ class ServoHandler(mozlog.reader.LogHandler):
             "PRECONDITION_FAILED": [],
         }

+    def any_stable_unexpected(self) -> bool:
+        return any(not unexpected.flaky for unexpected in self.unexpected_results)
+
     def suite_start(self, data):
+        # If there were any unexpected results and we are starting another suite, assume
+        # that this suite has been launched to detect intermittent tests.
+        # TODO: Support running more than a single suite at once.
+        if self.unexpected_results:
+            self.currently_detecting_flakes = True
+
         self.reset_state()
         self.number_of_tests = sum(len(tests) for tests in itervalues(data["tests"]))
         self.suite_start_time = data["time"]
@@ -171,17 +189,43 @@ class ServoHandler(mozlog.reader.LogHandler):
         had_expected_test_result = self.data_was_for_expected_result(data)
         subtest_failures = self.subtest_failures.pop(test_path, [])
+        test_output = self.test_output.pop(test_path, "")

         if had_expected_test_result and not subtest_failures:
-            self.expected[test_status] += 1
+            if not self.currently_detecting_flakes:
+                self.expected[test_status] += 1
+            else:
+                # When `retry_unexpected` is passed and we are currently detecting flaky tests
+                # we assume that this suite only runs tests that have already been run and are
+                # in the list of unexpected results.
+                for unexpected in self.unexpected_results:
+                    if unexpected.path == test_path:
+                        unexpected.flaky = True
+                        break
             return None

+        # If we are currently detecting flakes and a test still had an unexpected
+        # result, it's enough to simply return the unexpected result. It isn't
+        # necessary to update any of the test counting data structures.
+        if self.currently_detecting_flakes:
+            return UnexpectedResult(
+                test_path,
+                test_status,
+                data.get("expected", test_status),
+                data.get("message", ""),
+                data["time"],
+                "",
+                subtest_failures,
+            )
+
         # If the test crashed or timed out, we also include any process output,
         # because there is a good chance that the test produced a stack trace
         # or other error messages.
         stack = data.get("stack", None)
         if test_status in ("CRASH", "TIMEOUT"):
             stack = f"\n{stack}" if stack else ""
-            stack = f"{self.test_output[test_path]}{stack}"
+            stack = f"{test_output}{stack}"

         result = UnexpectedResult(
             test_path,
@@ -285,10 +329,11 @@ class ServoFormatter(mozlog.formatters.base.BaseFormatter, ServoHandler):
     def suite_start(self, data):
         ServoHandler.suite_start(self, data)
+        maybe_flakes_msg = " to detect flaky tests" if self.currently_detecting_flakes else ""
         if self.number_of_tests == 0:
-            return "Running tests in %s\n\n" % data["source"]
+            return f"Running tests in {data['source']}{maybe_flakes_msg}\n\n"
         else:
-            return "Running %i tests in %s\n\n" % (self.number_of_tests, data["source"])
+            return f"Running {self.number_of_tests} tests in {data['source']}{maybe_flakes_msg}\n\n"

     def test_start(self, data):
         ServoHandler.test_start(self, data)


@@ -93,6 +93,8 @@ def run_tests(default_binary_path: str, **kwargs):
     filter_intermittents_output = kwargs.pop("filter_intermittents", None)
     unexpected_raw_log_output_file = kwargs.pop("log_raw_unexpected", None)
     raw_log_outputs = kwargs.get("log_raw", [])
+    if filter_intermittents_output and kwargs["retry_unexpected"] <= 0:
+        kwargs["retry_unexpected"] = 1

     wptcommandline.check_args(kwargs)
@@ -112,45 +114,18 @@ def run_tests(default_binary_path: str, **kwargs):
     else:
         logger = wptrunner.setup_logging(kwargs, {"servo": sys.stdout})
-    handler = ServoHandler()
+    handler = ServoHandler(detect_flakes=kwargs["retry_unexpected"] >= 1)
     logger.add_handler(handler)

     wptrunner.run_tests(**kwargs)
-    return_value = 0 if not handler.unexpected_results else 1
+    return_value = int(handler.any_stable_unexpected())

     # Filter intermittents if that was specified on the command-line.
-    if handler.unexpected_results and filter_intermittents_output:
-        # Copy the list of unexpected results from the first run, so that we
-        # can access them after the tests are rerun (which changes
-        # `handler.unexpected_results`). After rerunning some tests will be
-        # marked as flaky but otherwise the contents of this original list
-        # won't change.
-        unexpected_results = list(handler.unexpected_results)
-
-        # This isn't strictly necessary since `handler.suite_start()` clears
-        # the state, but make sure that we are starting with a fresh handler.
-        handler.reset_state()
-
-        print(80 * "=")
-        print(f"Rerunning {len(unexpected_results)} tests with unexpected results to detect flaky tests.")
-        unexpected_results_tests = [result.path for result in unexpected_results]
-        kwargs["test_list"] = unexpected_results_tests
-        kwargs["include"] = unexpected_results_tests
-        kwargs["pause_after_test"] = False
-        wptrunner.run_tests(**kwargs)
-
+    if filter_intermittents_output:
         if github_context:
             os.environ["GITHUB_CONTEXT"] = github_context

-        # Use the second run to mark tests from the first run as flaky, but
-        # discard the results otherwise.
-        # TODO: It might be a good idea to send the new results to the
-        # dashboard if they were also unexpected.
-        stable_tests = [result.path for result in handler.unexpected_results]
-        for result in unexpected_results:
-            result.flaky = result.path not in stable_tests
-
-        all_filtered = filter_intermittents(unexpected_results, filter_intermittents_output)
+        all_filtered = filter_intermittents(handler.unexpected_results, filter_intermittents_output)
         return_value = 0 if all_filtered else 1

     # Write the unexpected-only raw log if that was specified on the command-line.