diff --git a/python/wpt/grouping_formatter.py b/python/wpt/grouping_formatter.py
index e6f0abe1cd5..541644e477f 100644
--- a/python/wpt/grouping_formatter.py
+++ b/python/wpt/grouping_formatter.py
@@ -112,7 +112,13 @@ class ServoHandler(mozlog.reader.LogHandler):
     """LogHandler designed to collect unexpected results for use by script
     or by the ServoFormatter output formatter."""
 
-    def __init__(self):
+    def __init__(self, detect_flakes=False):
+        """
+        Flake detection assumes first suite is actual run
+        and rest of the suites are retry-unexpected for flakes detection.
+        """
+        self.detect_flakes = detect_flakes
+        self.currently_detecting_flakes = False
         self.reset_state()
 
     def reset_state(self):
@@ -120,6 +126,9 @@ class ServoHandler(mozlog.reader.LogHandler):
         self.completed_tests = 0
         self.need_to_erase_last_line = False
         self.running_tests: Dict[str, str] = {}
+        if self.currently_detecting_flakes:
+            return
+        self.currently_detecting_flakes = False
         self.test_output = collections.defaultdict(str)
         self.subtest_failures = collections.defaultdict(list)
         self.tests_with_failing_subtests = []
@@ -146,8 +155,17 @@ class ServoHandler(mozlog.reader.LogHandler):
             "PRECONDITION_FAILED": [],
         }
 
+    def any_stable_unexpected(self) -> bool:
+        return any(not unexpected.flaky for unexpected in self.unexpected_results)
+
     def suite_start(self, data):
+        # If there were any unexpected results and we are starting another suite, assume
+        # that this suite has been launched to detect intermittent tests.
+        # TODO: Support running more than a single suite at once.
+        if self.unexpected_results:
+            self.currently_detecting_flakes = True
         self.reset_state()
+
         self.number_of_tests = sum(len(tests) for tests in itervalues(data["tests"]))
         self.suite_start_time = data["time"]
 
@@ -171,17 +189,43 @@ class ServoHandler(mozlog.reader.LogHandler):
         had_expected_test_result = self.data_was_for_expected_result(data)
         subtest_failures = self.subtest_failures.pop(test_path, [])
+        test_output = self.test_output.pop(test_path, "")
+
         if had_expected_test_result and not subtest_failures:
-            self.expected[test_status] += 1
+            if not self.currently_detecting_flakes:
+                self.expected[test_status] += 1
+            else:
+                # When `retry_unexpected` is passed and we are currently detecting flaky tests
+                # we assume that this suite only runs tests that have already been run and are
+                # in the list of unexpected results.
+                for unexpected in self.unexpected_results:
+                    if unexpected.path == test_path:
+                        unexpected.flaky = True
+                        break
             return None
 
+        # If we are currently detecting flakes and a test still had an unexpected
+        # result, it's enough to simply return the unexpected result. It isn't
+        # necessary to update any of the test counting data structures.
+        if self.currently_detecting_flakes:
+            return UnexpectedResult(
+                test_path,
+                test_status,
+                data.get("expected", test_status),
+                data.get("message", ""),
+                data["time"],
+                "",
+                subtest_failures,
+            )
+
         # If the test crashed or timed out, we also include any process output,
         # because there is a good chance that the test produced a stack trace
         # or other error messages.
         stack = data.get("stack", None)
         if test_status in ("CRASH", "TIMEOUT"):
             stack = f"\n{stack}" if stack else ""
-            stack = f"{self.test_output[test_path]}{stack}"
+            stack = f"{test_output}{stack}"
 
         result = UnexpectedResult(
             test_path,
@@ -285,10 +329,11 @@ class ServoFormatter(mozlog.formatters.base.BaseFormatter, ServoHandler):
 
     def suite_start(self, data):
         ServoHandler.suite_start(self, data)
+        maybe_flakes_msg = " to detect flaky tests" if self.currently_detecting_flakes else ""
         if self.number_of_tests == 0:
-            return "Running tests in %s\n\n" % data["source"]
+            return f"Running tests in {data['source']}{maybe_flakes_msg}\n\n"
         else:
-            return "Running %i tests in %s\n\n" % (self.number_of_tests, data["source"])
+            return f"Running {self.number_of_tests} tests in {data['source']}{maybe_flakes_msg}\n\n"
 
     def test_start(self, data):
         ServoHandler.test_start(self, data)
diff --git a/python/wpt/run.py b/python/wpt/run.py
index d44d24f5882..898a96b3600 100644
--- a/python/wpt/run.py
+++ b/python/wpt/run.py
@@ -93,6 +93,8 @@ def run_tests(default_binary_path: str, **kwargs):
     filter_intermittents_output = kwargs.pop("filter_intermittents", None)
     unexpected_raw_log_output_file = kwargs.pop("log_raw_unexpected", None)
     raw_log_outputs = kwargs.get("log_raw", [])
+    if filter_intermittents_output and kwargs["retry_unexpected"] <= 0:
+        kwargs["retry_unexpected"] = 1
 
     wptcommandline.check_args(kwargs)
 
@@ -112,45 +114,18 @@ def run_tests(default_binary_path: str, **kwargs):
     else:
         logger = wptrunner.setup_logging(kwargs, {"servo": sys.stdout})
 
-    handler = ServoHandler()
+    handler = ServoHandler(detect_flakes=kwargs["retry_unexpected"] >= 1)
     logger.add_handler(handler)
 
     wptrunner.run_tests(**kwargs)
-    return_value = 0 if not handler.unexpected_results else 1
+    return_value = int(handler.any_stable_unexpected())
 
     # Filter intermittents if that was specified on the command-line.
-    if handler.unexpected_results and filter_intermittents_output:
-        # Copy the list of unexpected results from the first run, so that we
-        # can access them after the tests are rerun (which changes
-        # `handler.unexpected_results`). After rerunning some tests will be
-        # marked as flaky but otherwise the contents of this original list
-        # won't change.
-        unexpected_results = list(handler.unexpected_results)
-
-        # This isn't strictly necessary since `handler.suite_start()` clears
-        # the state, but make sure that we are starting with a fresh handler.
-        handler.reset_state()
-
-        print(80 * "=")
-        print(f"Rerunning {len(unexpected_results)} tests with unexpected results to detect flaky tests.")
-        unexpected_results_tests = [result.path for result in unexpected_results]
-        kwargs["test_list"] = unexpected_results_tests
-        kwargs["include"] = unexpected_results_tests
-        kwargs["pause_after_test"] = False
-        wptrunner.run_tests(**kwargs)
-
+    if filter_intermittents_output:
         if github_context:
             os.environ["GITHUB_CONTEXT"] = github_context
 
-        # Use the second run to mark tests from the first run as flaky, but
-        # discard the results otherwise.
-        # TODO: It might be a good idea to send the new results to the
-        # dashboard if they were also unexpected.
-        stable_tests = [result.path for result in handler.unexpected_results]
-        for result in unexpected_results:
-            result.flaky = result.path not in stable_tests
-
-        all_filtered = filter_intermittents(unexpected_results, filter_intermittents_output)
+        all_filtered = filter_intermittents(handler.unexpected_results, filter_intermittents_output)
         return_value = 0 if all_filtered else 1
 
     # Write the unexpected-only raw log if that was specified on the command-line.
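For review context, here is a minimal, self-contained sketch of the control flow the patch relies on. It is not Servo code: `FlakeTrackingHandler`, `FakeUnexpectedResult`, and the test paths are invented stand-ins. The idea it illustrates is the one above: the first suite collects unexpected results, a later suite start flips the handler into flake-detection mode, retried tests that now pass are marked flaky, and the process exit code is derived from `any_stable_unexpected()`.

# Minimal sketch of the flake-detection flow (simplified stand-ins, not Servo's classes).
from dataclasses import dataclass
from typing import List


@dataclass
class FakeUnexpectedResult:          # stand-in for the wpt UnexpectedResult type
    path: str
    flaky: bool = False


class FlakeTrackingHandler:          # stand-in for ServoHandler(detect_flakes=True)
    def __init__(self):
        self.currently_detecting_flakes = False
        self.unexpected_results: List[FakeUnexpectedResult] = []

    def suite_start(self):
        # A second suite start with unexpected results pending means this
        # suite is the retry pass used to detect intermittent tests.
        if self.unexpected_results:
            self.currently_detecting_flakes = True

    def test_end(self, path: str, had_expected_result: bool):
        if had_expected_result:
            if self.currently_detecting_flakes:
                # The retry passed, so the original failure was intermittent.
                for unexpected in self.unexpected_results:
                    if unexpected.path == path:
                        unexpected.flaky = True
            return
        if not self.currently_detecting_flakes:
            # Only the first (real) run records unexpected results.
            self.unexpected_results.append(FakeUnexpectedResult(path))

    def any_stable_unexpected(self) -> bool:
        return any(not result.flaky for result in self.unexpected_results)


handler = FlakeTrackingHandler()
handler.suite_start()                                        # first (real) run
handler.test_end("/dom/a.html", had_expected_result=False)
handler.test_end("/dom/b.html", had_expected_result=False)

handler.suite_start()                                        # retry-unexpected run
handler.test_end("/dom/a.html", had_expected_result=True)    # flaky
handler.test_end("/dom/b.html", had_expected_result=False)   # still failing

return_value = int(handler.any_stable_unexpected())
print(return_value)  # 1, because /dom/b.html failed in both runs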