Auto merge of #24981 - servo:wpt-unexpected, r=jdm

Improve diagnostics for WPT failures

* Include the full output (including stdout/stderr) in the intermittent-filtered log
* Print the intermittent-filtered log at the end of the main log (which is one less click to reach from Taskcluster’s task view, compared to other task artifacts)
* <del>Fail with a specific message when a reftest screenshot is entirely white</del> (This caused over a hundred unexpected results, a few of them in reftests that use `about:blank` as a reference.)
* For failing reftests, add a message if the whole screenshot is a solid color, to help recognize instances of https://github.com/servo/servo/issues/24726

  ```
    ▶ FAIL [expected PASS] /css/CSS2/box-display/root-box-003.xht
    │   → /css/CSS2/box-display/root-box-003.xht 54a9df64f1476dd12020019d7cf22ac34d727bc0
    │   → /css/CSS2/box-display/root-box-003-ref.xht 636eb693bc214b6e1c64e6566c48e69e6777b946
    └   → Screenshot is solid color 0xFFFFFF for /css/CSS2/box-display/root-box-003.xht
  ```
  (The last line is new. A standalone sketch of the underlying check follows below.)
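
For reference, the check that bullet describes boils down to Pillow's `Image.getextrema()`: a screenshot is a single solid color exactly when every channel's (min, max) pair collapses to one value. A minimal standalone sketch; the helper name and the synthetic images are illustrative and not part of this patch:

```python
from PIL import Image

def solid_color_of(image):
    """Return an RRGGBB hex string if `image` is one solid color, else None."""
    # getextrema() on an RGB image yields one (min, max) pair per channel;
    # a solid-color image has min == max in every channel.
    extrema = image.convert("RGB").getextrema()
    if all(lo == hi for lo, hi in extrema):
        return ''.join('%02X' % lo for lo, _ in extrema)
    return None

print(solid_color_of(Image.new("RGB", (800, 600), "white")))    # FFFFFF, as in the log line above
print(solid_color_of(Image.new("RGB", (800, 600), (0, 0, 0))))  # 000000
```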
commit d99850ffa8 by bors-servo, 2019-12-03 14:31:08 -05:00 (committed by GitHub)
6 changed files with 84 additions and 51 deletions


```diff
@@ -4,6 +4,7 @@
 from mozlog.formatters import base
 import collections
+import json
 import os
 import sys
 import subprocess
```
```diff
@@ -14,7 +15,7 @@ DEFAULT_MOVE_UP_CODE = u"\x1b[A"
 DEFAULT_CLEAR_EOL_CODE = u"\x1b[K"

-class GroupingFormatter(base.BaseFormatter):
+class ServoFormatter(base.BaseFormatter):
     """Formatter designed to produce unexpected test results grouped
     together in a readable format."""

     def __init__(self):
@@ -77,7 +78,7 @@ class GroupingFormatter(base.BaseFormatter):
         return ((self.move_up + self.clear_eol) *
                 self.current_display.count('\n'))

-    def generate_output(self, text=None, new_display=None):
+    def generate_output(self, text=None, new_display=None, unexpected_in_test=None):
         if not self.interactive:
             return text
@@ -88,11 +89,14 @@ class GroupingFormatter(base.BaseFormatter):
             self.current_display = new_display
         return output + self.current_display

-    def build_status_line(self):
+    def test_counter(self):
         if self.number_of_tests == 0:
-            new_display = " [%i] " % self.completed_tests
+            return " [%i] " % self.completed_tests
         else:
-            new_display = " [%i/%i] " % (self.completed_tests, self.number_of_tests)
+            return " [%i/%i] " % (self.completed_tests, self.number_of_tests)
+
+    def build_status_line(self):
+        new_display = self.test_counter()

         if self.running_tests:
             indent = " " * len(new_display)
@@ -116,8 +120,8 @@ class GroupingFormatter(base.BaseFormatter):
     def test_start(self, data):
         self.running_tests[data['thread']] = data['test']
-        return self.generate_output(text=None,
-                                    new_display=self.build_status_line())
+        if self.interactive:
+            return self.generate_output(new_display=self.build_status_line())

     def wrap_and_indent_lines(self, lines, indent):
         assert(len(lines) > 0)
@@ -146,10 +150,11 @@ class GroupingFormatter(base.BaseFormatter):
         lines = [u"%s%s %s" % (status, expected_text, test_name)]
         if message:
-            lines.append(u"  \u2192 %s" % message)
+            for message_line in message.splitlines():
+                lines.append(u"  \u2192 %s" % message_line)
         if stack:
             lines.append("")
-            lines += [stackline for stackline in stack.splitlines()]
+            lines.extend(stack.splitlines())
         return lines

     def get_output_for_unexpected_subtests(self, test_name, unexpected_subtests):
```
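
Previously a multi-line failure message was appended as a single `→` entry; splitting it gives each line its own arrow. A small illustration, assuming a hypothetical two-line reftest message built from the values in the example output above:

```python
# -*- coding: utf-8 -*-
message = (u"/css/CSS2/box-display/root-box-003.xht 54a9df64f1476dd12020019d7cf22ac34d727bc0\n"
           u"Screenshot is solid color 0xFFFFFF for /css/CSS2/box-display/root-box-003.xht")

lines = []
for message_line in message.splitlines():
    lines.append(u"  \u2192 %s" % message_line)

print(u"\n".join(lines))
#   → /css/CSS2/box-display/root-box-003.xht 54a9df64f1476dd12020019d7cf22ac34d727bc0
#   → Screenshot is solid color 0xFFFFFF for /css/CSS2/box-display/root-box-003.xht
```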
```diff
@@ -195,15 +200,14 @@ class GroupingFormatter(base.BaseFormatter):
         subtest_failures = self.subtest_failures.pop(test_name, [])
         del self.running_tests[data['thread']]
-        new_display = self.build_status_line()

         if not had_unexpected_test_result and not subtest_failures:
             self.expected[test_status] += 1
             if self.interactive:
-                return self.generate_output(text=None, new_display=new_display)
+                new_display = self.build_status_line()
+                return self.generate_output(new_display=new_display)
             else:
-                return self.generate_output(text="  %s\n" % test_name,
-                                            new_display=new_display)
+                return self.generate_output(text="%s%s\n" % (self.test_counter(), test_name))

         # If the test crashed or timed out, we also include any process output,
         # because there is a good chance that the test produced a stack trace
```
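
With the status line factored out into `test_counter()`, the non-interactive path now prefixes each completed test with the running count, so an expected result prints something like this (the numbers here are illustrative):

```
 [54/120] /css/CSS2/box-display/root-box-003.xht
```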
```diff
@@ -230,7 +234,9 @@
                                                           subtest_failures)
             self.test_failure_text += output

-        return self.generate_output(text=output, new_display=new_display)
+        new_display = self.build_status_line()
+        return self.generate_output(text=output, new_display=new_display,
+                                    unexpected_in_test=test_name)

     def test_status(self, data):
         if "expected" in data:
```
```diff
@@ -289,3 +295,16 @@
         if data['level'] in ('CRITICAL', 'ERROR'):
             return self.generate_output(text=data['message'] + "\n")
+
+
+class ServoJsonFormatter(ServoFormatter):
+    def suite_start(self, data):
+        ServoFormatter.suite_start(self, data)
+        # Don't forward the return value
+
+    def generate_output(self, text=None, new_display=None, unexpected_in_test=None):
+        if unexpected_in_test:
+            return "%s\n" % json.dumps({"test": unexpected_in_test, "output": text})
+
+    def log(self, _):
+        return
```
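
Each unexpected result therefore comes out of the JSON formatter as a single line: a JSON object pairing the test name with the full formatted output block. Roughly like this, with illustrative values and non-ASCII escaped by `json.dumps`:

```
{"test": "/css/CSS2/box-display/root-box-003.xht", "output": "  \u25b6 FAIL [expected PASS] /css/CSS2/box-display/root-box-003.xht\n  ..."}
```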


```diff
@@ -711776,7 +711776,7 @@
    "support"
   ],
   "tools/wptrunner/wptrunner/executors/base.py": [
-   "713d85001135d0cdf23c7a06583bd03d4355d58e",
+   "06b1012ec95f552d104b6f416342aa973512c160",
    "support"
   ],
   "tools/wptrunner/wptrunner/executors/executorchrome.py": [
```


```diff
@@ -34,7 +34,9 @@ def run_tests(**kwargs):
     set_defaults(kwargs)

     mozlog.commandline.log_formatters["servo"] = \
-        (grouping_formatter.GroupingFormatter, "A grouping output formatter")
+        (grouping_formatter.ServoFormatter, "Servo's grouping output formatter")
+    mozlog.commandline.log_formatters["servojson"] = \
+        (grouping_formatter.ServoJsonFormatter, "Servo's JSON logger of unexpected results")

     use_mach_logging = False
     if len(kwargs["test_list"]) == 1:
```
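
mozlog normally derives a `--log-<name> FILE` option from each registered formatter, so, assuming that convention applies here (it is not shown in this diff), the new output could be captured with something like the following; the log file name is hypothetical:

```
./mach test-wpt --log-servojson unexpected.log css/CSS2/box-display/root-box-003.xht
```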


```diff
@@ -358,17 +358,17 @@ class RefTestImplementation(object):
     def reset(self):
         self.screenshot_cache.clear()

-    def is_pass(self, hashes, screenshots, relation, fuzzy):
+    def is_pass(self, hashes, screenshots, urls, relation, fuzzy):
         assert relation in ("==", "!=")
         if not fuzzy or fuzzy == ((0,0), (0,0)):
             equal = hashes[0] == hashes[1]
             # sometimes images can have different hashes, but pixels can be identical.
             if not equal:
                 self.logger.info("Image hashes didn't match, checking pixel differences")
-                max_per_channel, pixels_different = self.get_differences(screenshots)
+                max_per_channel, pixels_different = self.get_differences(screenshots, urls)
                 equal = pixels_different == 0 and max_per_channel == 0
         else:
-            max_per_channel, pixels_different = self.get_differences(screenshots)
+            max_per_channel, pixels_different = self.get_differences(screenshots, urls)
             allowed_per_channel, allowed_different = fuzzy
             self.logger.info("Allowed %s pixels different, maximum difference per channel %s" %
                              ("-".join(str(item) for item in allowed_different),
@@ -379,11 +379,13 @@
                                allowed_different[0] <= pixels_different <= allowed_different[1]))
         return equal if relation == "==" else not equal

-    def get_differences(self, screenshots):
+    def get_differences(self, screenshots, urls):
         from PIL import Image, ImageChops, ImageStat

         lhs = Image.open(io.BytesIO(base64.b64decode(screenshots[0]))).convert("RGB")
         rhs = Image.open(io.BytesIO(base64.b64decode(screenshots[1]))).convert("RGB")
+        self.check_if_solid_color(lhs, urls[0])
+        self.check_if_solid_color(rhs, urls[1])
         diff = ImageChops.difference(lhs, rhs)
         minimal_diff = diff.crop(diff.getbbox())
         mask = minimal_diff.convert("L", dither=None)
@@ -394,6 +396,12 @@
                          (count, per_channel))
         return per_channel, count

+    def check_if_solid_color(self, image, url):
+        extrema = image.getextrema()
+        if all(min == max for min, max in extrema):
+            color = ''.join('%02X' % value for value, _ in extrema)
+            self.message.append("Screenshot is solid color 0x%s for %s\n" % (color, url))
+
     def run_test(self, test):
         viewport_size = test.viewport_size
         dpi = test.dpi
@@ -406,6 +414,7 @@
         while stack:
             hashes = [None, None]
             screenshots = [None, None]
+            urls = [None, None]

             nodes, relation = stack.pop()
             fuzzy = self.get_fuzzy(test, nodes, relation)
@@ -416,8 +425,9 @@
                     return {"status": data[0], "message": data[1]}

                 hashes[i], screenshots[i] = data
+                urls[i] = node.url

-            if self.is_pass(hashes, screenshots, relation, fuzzy):
+            if self.is_pass(hashes, screenshots, urls, relation, fuzzy):
                 fuzzy = self.get_fuzzy(test, nodes, relation)
                 if nodes[1].references:
                     stack.extend(list(((nodes[1], item[0]), item[1]) for item in reversed(nodes[1].references)))
```