Mirror of https://github.com/servo/servo.git, synced 2025-07-05 14:33:38 +01:00
WPT: Handle flake detection in ServoHandler (#37540)
With this PR, ServoHandler stores the state of the first suite and treats subsequent suites as if they were run via `--retry-unexpected`, using their results to mark unexpected results from the first run as flaky. The stats used to display the currently running tests are still reset per suite. This lets us use `--retry-unexpected=1` for flake detection instead of manually rerunning, which will help with proper subsuites support.

Testing: Manual CI run to ensure end results are still the same: https://github.com/sagudev/servo/actions/runs/15886712204

Fixes: #37319

---------

Signed-off-by: sagudev <16504129+sagudev@users.noreply.github.com>
Parent: 3e1cdacd07
Commit: b9f9abee91

2 changed files with 56 additions and 36 deletions
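To make the flow described in the commit message concrete, here is a minimal, self-contained sketch of the mechanism. The names (`FlakeAwareCollector`, the simplified `test_end` signature, the boolean `unexpected` flag) are invented for illustration and `UnexpectedResult` is reduced to the two fields the logic reads; only the general flow mirrors the ServoHandler changes in the diff below: the first suite collects unexpected results, and any later suite is treated as a retry-unexpected run whose expected results mark the corresponding first-run entries as flaky.

    from dataclasses import dataclass
    from typing import List


    @dataclass
    class UnexpectedResult:
        path: str
        flaky: bool = False


    class FlakeAwareCollector:
        """Illustrative stand-in for ServoHandler's new behaviour: the first
        suite is the real run; later suites are retry-unexpected reruns."""

        def __init__(self, detect_flakes: bool = False):
            self.detect_flakes = detect_flakes
            self.currently_detecting_flakes = False
            self.unexpected_results: List[UnexpectedResult] = []

        def suite_start(self) -> None:
            # A suite that starts while unexpected results are on record is
            # assumed to be a flake-detection rerun.
            if self.detect_flakes and self.unexpected_results:
                self.currently_detecting_flakes = True

        def test_end(self, path: str, unexpected: bool) -> None:
            if self.currently_detecting_flakes:
                # An expected result on retry means the first run was flaky;
                # a repeated unexpected result leaves the entry marked stable.
                if not unexpected:
                    for result in self.unexpected_results:
                        if result.path == path:
                            result.flaky = True
                            break
            elif unexpected:
                self.unexpected_results.append(UnexpectedResult(path))

        def any_stable_unexpected(self) -> bool:
            return any(not r.flaky for r in self.unexpected_results)


    collector = FlakeAwareCollector(detect_flakes=True)
    collector.suite_start()                          # first (real) suite
    collector.test_end("/a.html", unexpected=True)
    collector.test_end("/b.html", unexpected=True)
    collector.suite_start()                          # retry-unexpected suite
    collector.test_end("/a.html", unexpected=False)  # passed on retry: flaky
    collector.test_end("/b.html", unexpected=True)   # failed again: stable
    assert collector.any_stable_unexpected()         # /b.html still fails the run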
File 1 of 2 (ServoHandler / ServoFormatter):

@@ -112,7 +112,13 @@ class ServoHandler(mozlog.reader.LogHandler):
     """LogHandler designed to collect unexpected results for use by
     script or by the ServoFormatter output formatter."""
 
-    def __init__(self):
+    def __init__(self, detect_flakes=False):
+        """
+        Flake detection assumes the first suite is the actual run and the
+        rest of the suites are retry-unexpected runs for flake detection.
+        """
+        self.detect_flakes = detect_flakes
+        self.currently_detecting_flakes = False
         self.reset_state()
 
     def reset_state(self):
@@ -120,6 +126,9 @@ class ServoHandler(mozlog.reader.LogHandler):
         self.completed_tests = 0
         self.need_to_erase_last_line = False
         self.running_tests: Dict[str, str] = {}
+        if self.currently_detecting_flakes:
+            return
+        self.currently_detecting_flakes = False
         self.test_output = collections.defaultdict(str)
         self.subtest_failures = collections.defaultdict(list)
         self.tests_with_failing_subtests = []
@@ -146,8 +155,17 @@ class ServoHandler(mozlog.reader.LogHandler):
             "PRECONDITION_FAILED": [],
         }
 
+    def any_stable_unexpected(self) -> bool:
+        return any(not unexpected.flaky for unexpected in self.unexpected_results)
+
     def suite_start(self, data):
+        # If there were any unexpected results and we are starting another suite, assume
+        # that this suite has been launched to detect intermittent tests.
+        # TODO: Support running more than a single suite at once.
+        if self.unexpected_results:
+            self.currently_detecting_flakes = True
         self.reset_state()
 
         self.number_of_tests = sum(len(tests) for tests in itervalues(data["tests"]))
         self.suite_start_time = data["time"]
@@ -171,17 +189,43 @@ class ServoHandler(mozlog.reader.LogHandler):
         had_expected_test_result = self.data_was_for_expected_result(data)
         subtest_failures = self.subtest_failures.pop(test_path, [])
+        test_output = self.test_output.pop(test_path, "")
 
         if had_expected_test_result and not subtest_failures:
-            self.expected[test_status] += 1
+            if not self.currently_detecting_flakes:
+                self.expected[test_status] += 1
+            else:
+                # When `retry_unexpected` is passed and we are currently detecting flaky
+                # tests, we assume that this suite only runs tests that have already been
+                # run and are in the list of unexpected results.
+                for unexpected in self.unexpected_results:
+                    if unexpected.path == test_path:
+                        unexpected.flaky = True
+                        break
+
             return None
 
+        # If we are currently detecting flakes and a test still had an unexpected
+        # result, it's enough to simply return the unexpected result. It isn't
+        # necessary to update any of the test counting data structures.
+        if self.currently_detecting_flakes:
+            return UnexpectedResult(
+                test_path,
+                test_status,
+                data.get("expected", test_status),
+                data.get("message", ""),
+                data["time"],
+                "",
+                subtest_failures,
+            )
+
         # If the test crashed or timed out, we also include any process output,
         # because there is a good chance that the test produced a stack trace
         # or other error messages.
         stack = data.get("stack", None)
         if test_status in ("CRASH", "TIMEOUT"):
             stack = f"\n{stack}" if stack else ""
-            stack = f"{self.test_output[test_path]}{stack}"
+            stack = f"{test_output}{stack}"
 
         result = UnexpectedResult(
             test_path,
@@ -285,10 +329,11 @@ class ServoFormatter(mozlog.formatters.base.BaseFormatter, ServoHandler):
 
     def suite_start(self, data):
         ServoHandler.suite_start(self, data)
+        maybe_flakes_msg = " to detect flaky tests" if self.currently_detecting_flakes else ""
         if self.number_of_tests == 0:
-            return "Running tests in %s\n\n" % data["source"]
+            return f"Running tests in {data['source']}{maybe_flakes_msg}\n\n"
         else:
-            return "Running %i tests in %s\n\n" % (self.number_of_tests, data["source"])
+            return f"Running {self.number_of_tests} tests in {data['source']}{maybe_flakes_msg}\n\n"
 
     def test_start(self, data):
         ServoHandler.test_start(self, data)
File 2 of 2 (run_tests):

@@ -93,6 +93,8 @@ def run_tests(default_binary_path: str, **kwargs):
     filter_intermittents_output = kwargs.pop("filter_intermittents", None)
     unexpected_raw_log_output_file = kwargs.pop("log_raw_unexpected", None)
     raw_log_outputs = kwargs.get("log_raw", [])
+    if filter_intermittents_output and kwargs["retry_unexpected"] <= 0:
+        kwargs["retry_unexpected"] = 1
 
     wptcommandline.check_args(kwargs)
@@ -112,45 +114,18 @@ def run_tests(default_binary_path: str, **kwargs):
     else:
         logger = wptrunner.setup_logging(kwargs, {"servo": sys.stdout})
 
-    handler = ServoHandler()
+    handler = ServoHandler(detect_flakes=kwargs["retry_unexpected"] >= 1)
     logger.add_handler(handler)
 
     wptrunner.run_tests(**kwargs)
-    return_value = 0 if not handler.unexpected_results else 1
+    return_value = int(handler.any_stable_unexpected())
 
     # Filter intermittents if that was specified on the command-line.
-    if handler.unexpected_results and filter_intermittents_output:
-        # Copy the list of unexpected results from the first run, so that we
-        # can access them after the tests are rerun (which changes
-        # `handler.unexpected_results`). After rerunning some tests will be
-        # marked as flaky but otherwise the contents of this original list
-        # won't change.
-        unexpected_results = list(handler.unexpected_results)
-
-        # This isn't strictly necessary since `handler.suite_start()` clears
-        # the state, but make sure that we are starting with a fresh handler.
-        handler.reset_state()
-
-        print(80 * "=")
-        print(f"Rerunning {len(unexpected_results)} tests with unexpected results to detect flaky tests.")
-        unexpected_results_tests = [result.path for result in unexpected_results]
-        kwargs["test_list"] = unexpected_results_tests
-        kwargs["include"] = unexpected_results_tests
-        kwargs["pause_after_test"] = False
-        wptrunner.run_tests(**kwargs)
-
+    if filter_intermittents_output:
         if github_context:
             os.environ["GITHUB_CONTEXT"] = github_context
 
-        # Use the second run to mark tests from the first run as flaky, but
-        # discard the results otherwise.
-        # TODO: It might be a good idea to send the new results to the
-        # dashboard if they were also unexpected.
-        stable_tests = [result.path for result in handler.unexpected_results]
-        for result in unexpected_results:
-            result.flaky = result.path not in stable_tests
-
-        all_filtered = filter_intermittents(unexpected_results, filter_intermittents_output)
+        all_filtered = filter_intermittents(handler.unexpected_results, filter_intermittents_output)
         return_value = 0 if all_filtered else 1
 
     # Write the unexpected-only raw log if that was specified on the command-line.
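One consequence worth spelling out (my reading of the diff, not text from the PR): since `return_value` is now `int(handler.any_stable_unexpected())`, a run whose unexpected results all proved flaky on retry exits with 0. A minimal check of that predicate, with `UnexpectedResult` reduced to the fields it reads:

    from dataclasses import dataclass


    @dataclass
    class UnexpectedResult:
        path: str
        flaky: bool = False


    def any_stable_unexpected(results) -> bool:
        # Mirrors ServoHandler.any_stable_unexpected from the first changed file.
        return any(not r.flaky for r in results)


    # Every unexpected result was marked flaky by the retry suite: exit 0.
    assert int(any_stable_unexpected([UnexpectedResult("/a.html", flaky=True)])) == 0
    # A stable unexpected result remains: exit 1.
    assert int(any_stable_unexpected([UnexpectedResult("/b.html")])) == 1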