diff --git a/etc/taskcluster/decision_task.py b/etc/taskcluster/decision_task.py
index c9d7c4502b9..54479c20435 100644
--- a/etc/taskcluster/decision_task.py
+++ b/etc/taskcluster/decision_task.py
@@ -748,12 +748,12 @@ def wpt_chunks(platform, make_chunk_task, build_task, total_chunks, processes,
                     | cat
                 time ./mach test-wpt --release --processes $PROCESSES --timeout-multiplier=4 \
                     --headless --log-raw test-wdspec.log \
-                    --log-errorsummary wdspec-errorsummary.log \
+                    --log-servojson wdspec-jsonsummary.log \
                     --always-succeed \
                     webdriver \
                     | cat
                 ./mach filter-intermittents \
-                    wdspec-errorsummary.log \
+                    wdspec-jsonsummary.log \
                     --log-intermittents intermittents.log \
                     --log-filteredsummary filtered-wdspec-errorsummary.log \
                     --tracker-api default \
@@ -768,11 +768,11 @@ def wpt_chunks(platform, make_chunk_task, build_task, total_chunks, processes,
                     --total-chunks "$TOTAL_CHUNKS" \
                     --this-chunk "$THIS_CHUNK" \
                     --log-raw test-wpt.log \
-                    --log-errorsummary wpt-errorsummary.log \
+                    --log-servojson wpt-jsonsummary.log \
                     --always-succeed \
                     | cat
                 ./mach filter-intermittents \
-                    wpt-errorsummary.log \
+                    wpt-jsonsummary.log \
                     --log-intermittents intermittents.log \
                     --log-filteredsummary filtered-wpt-errorsummary.log \
                     --tracker-api default \
diff --git a/python/servo/testing_commands.py b/python/servo/testing_commands.py
index f094ad9bb1b..01cdc14db30 100644
--- a/python/servo/testing_commands.py
+++ b/python/servo/testing_commands.py
@@ -68,6 +68,7 @@ TEST_SUITES_BY_PREFIX = {path: k for k, v in iteritems(TEST_SUITES) if "paths" i
 
 
 def create_parser_wpt():
+    import mozlog.commandline
     parser = wptcommandline.create_parser()
     parser.add_argument('--release', default=False, action="store_true",
                         help="Run with a release build of servo")
@@ -77,6 +78,8 @@ def create_parser_wpt():
                         help="Pass preferences to servo")
     parser.add_argument('--layout-2020', default=False, action="store_true",
                         help="Use expected results for the 2020 layout engine")
+    parser.add_argument('--log-servojson', action="append", type=mozlog.commandline.log_file,
+                        help="Servo's JSON logger of unexpected results")
     parser.add_argument('--always-succeed', default=False, action="store_true",
                         help="Always yield exit code of zero")
     return parser
@@ -511,7 +514,7 @@ class MachCommands(CommandBase):
              description='Given a WPT error summary file, filter out intermittents and other cruft.',
              category='testing')
     @CommandArgument('summary',
-                     help="Error summary log to take un")
+                     help="Error summary log to take in")
     @CommandArgument('--log-filteredsummary', default=None,
                      help='Print filtered log to file')
     @CommandArgument('--log-intermittents', default=None,
@@ -529,10 +532,7 @@ class MachCommands(CommandBase):
                 encoded_auth = base64.encodestring(file.read().strip()).replace('\n', '')
         failures = []
         with open(summary, "r") as file:
-            for line in file:
-                line_json = json.loads(line)
-                if 'status' in line_json:
-                    failures += [line_json]
+            failures = [json.loads(line) for line in file]
         actual_failures = []
         intermittents = []
         for failure in failures:
@@ -546,10 +546,7 @@ class MachCommands(CommandBase):
                 request = urllib.request.Request("%s/query.py?name=%s" % (tracker_api, query))
                 search = urllib.request.urlopen(request)
                 data = json.load(search)
-                if len(data) == 0:
-                    actual_failures += [failure]
-                else:
-                    intermittents += [failure]
+                is_intermittent = len(data) > 0
             else:
                 qstr = "repo:servo/servo+label:I-intermittent+type:issue+state:open+%s" % failure['test']
                 # we want `/` to get quoted, but not `+` (github's API doesn't like that), so we set `safe` to `+`
@@ -559,28 +556,33 @@ class MachCommands(CommandBase):
                     request.add_header("Authorization", "Basic %s" % encoded_auth)
                 search = urllib.request.urlopen(request)
                 data = json.load(search)
-                if data['total_count'] == 0:
-                    actual_failures += [failure]
-                else:
-                    intermittents += [failure]
+                is_intermittent = data['total_count'] > 0
+
+            if is_intermittent:
+                intermittents.append(failure["output"])
+            else:
+                actual_failures.append(failure["output"])
+
+        def format(outputs, description, file=None):
+            formatted = "%s %s:\n%s" % (len(outputs), description, "\n".join(outputs))
+            if file:
+                file.write(formatted.encode("utf-8"))
+            else:
+                print(formatted)
 
         if log_intermittents:
-            with open(log_intermittents, "w") as intermittents_file:
-                for intermittent in intermittents:
-                    json.dump(intermittent, intermittents_file, indent=4)
-                    print("\n", end='', file=intermittents_file)
+            with open(log_intermittents, "wb") as file:
+                format(intermittents, "known-intermittent unexpected results", file)
 
-        output = open(log_filteredsummary, "w") if log_filteredsummary else sys.stdout
-        for failure in actual_failures:
-            json.dump(failure, output, indent=4)
-            print("\n", end='', file=output)
+        description = "unexpected results that are NOT known-intermittents"
+        if log_filteredsummary:
+            with open(log_filteredsummary, "wb") as file:
+                format(actual_failures, description, file)
 
-        if output is not sys.stdout:
-            output.close()
+        if actual_failures:
+            format(actual_failures, description)
 
-        if len(actual_failures) == 0:
-            return 0
-        return 1
+        return bool(actual_failures)
 
     @Command('test-android-startup',
              description='Extremely minimal testing of Servo for Android',
diff --git a/tests/wpt/grouping_formatter.py b/tests/wpt/grouping_formatter.py
index 955c85d916c..24073e10fe7 100644
--- a/tests/wpt/grouping_formatter.py
+++ b/tests/wpt/grouping_formatter.py
@@ -4,6 +4,7 @@
 
 from mozlog.formatters import base
 import collections
+import json
 import os
 import sys
 import subprocess
@@ -14,7 +15,7 @@ DEFAULT_MOVE_UP_CODE = u"\x1b[A"
 DEFAULT_CLEAR_EOL_CODE = u"\x1b[K"
 
 
-class GroupingFormatter(base.BaseFormatter):
+class ServoFormatter(base.BaseFormatter):
     """Formatter designed to produce unexpected test results grouped
        together in a readable format."""
     def __init__(self):
@@ -77,7 +78,7 @@ class GroupingFormatter(base.BaseFormatter):
         return ((self.move_up + self.clear_eol) *
                 self.current_display.count('\n'))
 
-    def generate_output(self, text=None, new_display=None):
+    def generate_output(self, text=None, new_display=None, unexpected_in_test=None):
         if not self.interactive:
             return text
 
@@ -88,11 +89,14 @@ class GroupingFormatter(base.BaseFormatter):
             self.current_display = new_display
         return output + self.current_display
 
-    def build_status_line(self):
+    def test_counter(self):
         if self.number_of_tests == 0:
-            new_display = "  [%i] " % self.completed_tests
+            return "  [%i] " % self.completed_tests
         else:
-            new_display = "  [%i/%i] " % (self.completed_tests, self.number_of_tests)
+            return "  [%i/%i] " % (self.completed_tests, self.number_of_tests)
+
+    def build_status_line(self):
+        new_display = self.test_counter()
 
         if self.running_tests:
             indent = " " * len(new_display)
@@ -116,8 +120,8 @@ class GroupingFormatter(base.BaseFormatter):
 
     def test_start(self, data):
         self.running_tests[data['thread']] = data['test']
-        return self.generate_output(text=None,
-                                    new_display=self.build_status_line())
+        if self.interactive:
+            return self.generate_output(new_display=self.build_status_line())
 
     def wrap_and_indent_lines(self, lines, indent):
         assert(len(lines) > 0)
@@ -146,10 +150,11 @@ class GroupingFormatter(base.BaseFormatter):
 
         lines = [u"%s%s %s" % (status, expected_text, test_name)]
         if message:
-            lines.append(u"  \u2192 %s" % message)
+            for message_line in message.splitlines():
+                lines.append(u"  \u2192 %s" % message_line)
         if stack:
             lines.append("")
-            lines += [stackline for stackline in stack.splitlines()]
+            lines.extend(stack.splitlines())
         return lines
 
     def get_output_for_unexpected_subtests(self, test_name, unexpected_subtests):
@@ -195,15 +200,14 @@ class GroupingFormatter(base.BaseFormatter):
         subtest_failures = self.subtest_failures.pop(test_name, [])
 
         del self.running_tests[data['thread']]
-        new_display = self.build_status_line()
 
         if not had_unexpected_test_result and not subtest_failures:
             self.expected[test_status] += 1
             if self.interactive:
-                return self.generate_output(text=None, new_display=new_display)
+                new_display = self.build_status_line()
+                return self.generate_output(new_display=new_display)
             else:
-                return self.generate_output(text="  %s\n" % test_name,
-                                            new_display=new_display)
+                return self.generate_output(text="%s%s\n" % (self.test_counter(), test_name))
 
         # If the test crashed or timed out, we also include any process output,
         # because there is a good chance that the test produced a stack trace
@@ -230,7 +234,9 @@ class GroupingFormatter(base.BaseFormatter):
                                                               subtest_failures)
         self.test_failure_text += output
 
-        return self.generate_output(text=output, new_display=new_display)
+        new_display = self.build_status_line()
+        return self.generate_output(text=output, new_display=new_display,
+                                    unexpected_in_test=test_name)
 
     def test_status(self, data):
         if "expected" in data:
@@ -289,3 +295,16 @@ class GroupingFormatter(base.BaseFormatter):
 
         if data['level'] in ('CRITICAL', 'ERROR'):
             return self.generate_output(text=data['message'] + "\n")
+
+
+class ServoJsonFormatter(ServoFormatter):
+    def suite_start(self, data):
+        ServoFormatter.suite_start(self, data)
+        # Deliberately drop the base class's return value so only JSON lines reach this log
+
+    def generate_output(self, text=None, new_display=None, unexpected_in_test=None):
+        if unexpected_in_test:
+            return "%s\n" % json.dumps({"test": unexpected_in_test, "output": text})
+
+    def log(self, _):
+        return
diff --git a/tests/wpt/metadata/MANIFEST.json b/tests/wpt/metadata/MANIFEST.json
index 89f1a3893da..f28477c82e8 100644
--- a/tests/wpt/metadata/MANIFEST.json
+++ b/tests/wpt/metadata/MANIFEST.json
@@ -711776,7 +711776,7 @@
    "support"
   ],
   "tools/wptrunner/wptrunner/executors/base.py": [
-   "713d85001135d0cdf23c7a06583bd03d4355d58e",
+   "06b1012ec95f552d104b6f416342aa973512c160",
    "support"
   ],
   "tools/wptrunner/wptrunner/executors/executorchrome.py": [
diff --git a/tests/wpt/run.py b/tests/wpt/run.py
index 819fbd744e4..cfc68ea45c3 100644
--- a/tests/wpt/run.py
+++ b/tests/wpt/run.py
@@ -34,7 +34,9 @@ def run_tests(**kwargs):
     set_defaults(kwargs)
 
     mozlog.commandline.log_formatters["servo"] = \
-        (grouping_formatter.GroupingFormatter, "A grouping output formatter")
+        (grouping_formatter.ServoFormatter, "Servo's grouping output formatter")
+    mozlog.commandline.log_formatters["servojson"] = \
+        (grouping_formatter.ServoJsonFormatter, "Servo's JSON logger of unexpected results")
 
     use_mach_logging = False
     if len(kwargs["test_list"]) == 1:
diff --git a/tests/wpt/web-platform-tests/tools/wptrunner/wptrunner/executors/base.py b/tests/wpt/web-platform-tests/tools/wptrunner/wptrunner/executors/base.py
index 713d8500113..06b1012ec95 100644
--- a/tests/wpt/web-platform-tests/tools/wptrunner/wptrunner/executors/base.py
+++ b/tests/wpt/web-platform-tests/tools/wptrunner/wptrunner/executors/base.py
@@ -358,17 +358,17 @@ class RefTestImplementation(object):
     def reset(self):
         self.screenshot_cache.clear()
 
-    def is_pass(self, hashes, screenshots, relation, fuzzy):
+    def is_pass(self, hashes, screenshots, urls, relation, fuzzy):
         assert relation in ("==", "!=")
         if not fuzzy or fuzzy == ((0,0), (0,0)):
             equal = hashes[0] == hashes[1]
             # sometimes images can have different hashes, but pixels can be identical.
             if not equal:
                 self.logger.info("Image hashes didn't match, checking pixel differences")
-                max_per_channel, pixels_different = self.get_differences(screenshots)
+                max_per_channel, pixels_different = self.get_differences(screenshots, urls)
                 equal = pixels_different == 0 and max_per_channel == 0
         else:
-            max_per_channel, pixels_different = self.get_differences(screenshots)
+            max_per_channel, pixels_different = self.get_differences(screenshots, urls)
             allowed_per_channel, allowed_different = fuzzy
             self.logger.info("Allowed %s pixels different, maximum difference per channel %s" %
                              ("-".join(str(item) for item in allowed_different),
@@ -379,11 +379,13 @@ class RefTestImplementation(object):
                       allowed_different[0] <= pixels_different <= allowed_different[1]))
         return equal if relation == "==" else not equal
 
-    def get_differences(self, screenshots):
+    def get_differences(self, screenshots, urls):
         from PIL import Image, ImageChops, ImageStat
 
         lhs = Image.open(io.BytesIO(base64.b64decode(screenshots[0]))).convert("RGB")
         rhs = Image.open(io.BytesIO(base64.b64decode(screenshots[1]))).convert("RGB")
+        self.check_if_solid_color(lhs, urls[0])
+        self.check_if_solid_color(rhs, urls[1])
         diff = ImageChops.difference(lhs, rhs)
         minimal_diff = diff.crop(diff.getbbox())
         mask = minimal_diff.convert("L", dither=None)
@@ -394,6 +396,12 @@ class RefTestImplementation(object):
                          (count, per_channel))
         return per_channel, count
 
+    def check_if_solid_color(self, image, url):
+        extrema = image.getextrema()
+        if all(min == max for min, max in extrema):
+            color = ''.join('%02X' % value for value, _ in extrema)
+            self.message.append("Screenshot is solid color 0x%s for %s\n" % (color, url))
+
     def run_test(self, test):
         viewport_size = test.viewport_size
         dpi = test.dpi
@@ -406,6 +414,7 @@ class RefTestImplementation(object):
         while stack:
             hashes = [None, None]
             screenshots = [None, None]
+            urls = [None, None]
 
             nodes, relation = stack.pop()
             fuzzy = self.get_fuzzy(test, nodes, relation)
@@ -416,8 +425,9 @@ class RefTestImplementation(object):
                     return {"status": data[0], "message": data[1]}
 
                 hashes[i], screenshots[i] = data
+                urls[i] = node.url
 
-            if self.is_pass(hashes, screenshots, relation, fuzzy):
+            if self.is_pass(hashes, screenshots, urls, relation, fuzzy):
                 fuzzy = self.get_fuzzy(test, nodes, relation)
                 if nodes[1].references:
                     stack.extend(list(((nodes[1], item[0]), item[1]) for item in reversed(nodes[1].references)))