Hack the HTML lexer to skip comments

2025-09-21 12:20:20 +01:00 · 2012-08-21 16:01:55 -07:00 · 2012-08-21 16:01:55 -07:00 · 05efc6a612
commit 05efc6a612
parent e15f2d50cb
1 changed files with 52 additions and 7 deletions
--- a/src/servo/parser/html_lexer.rs
+++ b/src/servo/parser/html_lexer.rs
@ -33,6 +33,7 @@ trait HtmlLexerMethods {
    fn parse_html() -> Token;
    fn parse_in_normal_state(c: u8) -> Token;
    fn parse_in_tag_state(c: u8) -> Token;
    fn eat_until_end_of_comment();
 }
 impl HtmlLexer : HtmlLexerMethods {
@ -53,6 +54,7 @@ impl HtmlLexer : HtmlLexerMethods {
    fn parse_in_normal_state(c: u8) -> Token {
        let mut ch = c;
        if ch == ('<' as u8) {
            match self.input_state.get() {
              CoeChar(c) => { ch = c; }
@ -60,13 +62,30 @@ impl HtmlLexer : HtmlLexerMethods {
            }
            if ch == ('!' as u8) {
-                self.input_state.eat_whitespace();
+                let ch = self.input_state.get();
-                self.input_state.expect_ident(~"DOCTYPE");
+                // FIXME: This comment parsing is very hacky
-                self.input_state.eat_whitespace();
+                if ch == CoeChar('-' as u8) {
-                self.input_state.expect_ident(~"html");
+                    self.eat_until_end_of_comment();
-                self.input_state.eat_whitespace();
+                    return match self.input_state.get() {
-                self.input_state.expect('>' as u8);
+                      CoeChar(c) => self.parse_in_normal_state(c),
-                return Doctype;
+                      CoeEof => self.input_state.parse_err(~"FIXME")
                    }
                } else if ch == CoeChar('D' as u8) {
                    self.input_state.expect_ident(~"OCTYPE");
                    self.input_state.eat_whitespace();
                    self.input_state.expect_ident(~"html");
                    self.input_state.eat_whitespace();
                    self.input_state.expect('>' as u8);
                    return Doctype;
                } else {
                    self.input_state.eat_whitespace();
                    self.input_state.expect_ident(~"DOCTYPE");
                    self.input_state.eat_whitespace();
                    self.input_state.expect_ident(~"html");
                    self.input_state.eat_whitespace();
                    self.input_state.expect('>' as u8);
                    return Doctype;
                }
            }
            if ch == ('/' as u8) {
@ -100,6 +119,32 @@ impl HtmlLexer : HtmlLexerMethods {
            }
        }
    }
    fn eat_until_end_of_comment() {
        let mut state = none;
        loop {
            match self.input_state.get() {
              CoeChar(c) => {
                match c {
                  '-' as u8 if state == none => {
                    state = some(~"-")
                  }
                  '-' as u8 if state == some(~"-") => {
                    state = some(~"--")
                  }
                  '>' as u8 if state == some(~"--") => {
                    return
                  }
                  _ => {
                    state = none
                  }
                }
              }
              CoeEof => return
            }
        }
    }
    fn parse_in_tag_state(c: u8) -> Token {
        let mut ch = c;