Hack the HTML lexer to skip comments

2025-09-20 20:00:13 +01:00 · 2012-08-21 16:01:55 -07:00 · 2012-08-21 16:01:55 -07:00 · 05efc6a612
commit 05efc6a612
parent e15f2d50cb
1 changed files with 52 additions and 7 deletions
--- a/src/servo/parser/html_lexer.rs
+++ b/src/servo/parser/html_lexer.rs
@ -33,6 +33,7 @@ trait HtmlLexerMethods {
    fn parse_html() -> Token;
    fn parse_in_normal_state(c: u8) -> Token;
    fn parse_in_tag_state(c: u8) -> Token;
+    fn eat_until_end_of_comment();
 }

 impl HtmlLexer : HtmlLexerMethods {
@ -53,6 +54,7 @@ impl HtmlLexer : HtmlLexerMethods {

    fn parse_in_normal_state(c: u8) -> Token {
        let mut ch = c;
+
        if ch == ('<' as u8) {
            match self.input_state.get() {
              CoeChar(c) => { ch = c; }
@ -60,13 +62,30 @@ impl HtmlLexer : HtmlLexerMethods {
            }

            if ch == ('!' as u8) {
-                self.input_state.eat_whitespace();
-                self.input_state.expect_ident(~"DOCTYPE");
-                self.input_state.eat_whitespace();
-                self.input_state.expect_ident(~"html");
-                self.input_state.eat_whitespace();
-                self.input_state.expect('>' as u8);
-                return Doctype;
+                let ch = self.input_state.get();
+                // FIXME: This comment parsing is very hacky
+                if ch == CoeChar('-' as u8) {
+                    self.eat_until_end_of_comment();
+                    return match self.input_state.get() {
+                      CoeChar(c) => self.parse_in_normal_state(c),
+                      CoeEof => self.input_state.parse_err(~"FIXME")
+                    }
+                } else if ch == CoeChar('D' as u8) {
+                    self.input_state.expect_ident(~"OCTYPE");
+                    self.input_state.eat_whitespace();
+                    self.input_state.expect_ident(~"html");
+                    self.input_state.eat_whitespace();
+                    self.input_state.expect('>' as u8);
+                    return Doctype;
+                } else {
+                    self.input_state.eat_whitespace();
+                    self.input_state.expect_ident(~"DOCTYPE");
+                    self.input_state.eat_whitespace();
+                    self.input_state.expect_ident(~"html");
+                    self.input_state.eat_whitespace();
+                    self.input_state.expect('>' as u8);
+                    return Doctype;
+                }
            }

            if ch == ('/' as u8) {
@ -100,6 +119,32 @@ impl HtmlLexer : HtmlLexerMethods {
            }
        }
    }
+
+    fn eat_until_end_of_comment() {
+        let mut state = none;
+
+        loop {
+            match self.input_state.get() {
+              CoeChar(c) => {
+                match c {
+                  '-' as u8 if state == none => {
+                    state = some(~"-")
+                  }
+                  '-' as u8 if state == some(~"-") => {
+                    state = some(~"--")
+                  }
+                  '>' as u8 if state == some(~"--") => {
+                    return
+                  }
+                  _ => {
+                    state = none
+                  }
+                }
+              }
+              CoeEof => return
+            }
+        }
+    }
    
    fn parse_in_tag_state(c: u8) -> Token {
        let mut ch = c;