Mirror of https://github.com/servo/servo.git (synced 2025-08-03 04:30:10 +01:00)
Refactored html and css lexing into separate files and capitalized those types
This commit is contained in:
parent b754510d53
commit e0ddaf50df
9 changed files with 588 additions and 584 deletions
@@ -14,7 +14,8 @@ import dom::base::NodeScope;
 import dom::rcu::WriterMethods;
 import dom::style;
 import style::print_sheet;
-import parser::lexer::{spawn_css_lexer_task, spawn_html_parser_task};
+import parser::css_lexer::spawn_css_lexer_task;
+import parser::html_lexer::spawn_html_lexer_task;
 import parser::css_builder::build_stylesheet;
 import parser::html_builder::build_dom;
 import layout::layout_task;
@@ -79,7 +80,7 @@ fn Content(layout: Layout) -> Content {
 
     // Note: we can parse the next document in parallel
    // with any previous documents.
-    let stream = spawn_html_parser_task(copy filename);
+    let stream = spawn_html_lexer_task(copy filename);
     let root = build_dom(scope, stream);
 
     // Collect the css stylesheet
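The two hunks above are the caller side of the refactor: the single parser::lexer module is gone and the HTML token stream now comes from spawn_html_lexer_task. A minimal sketch of the resulting pipeline, written in the same pre-1.0 Rust dialect as the diff (the scope, filename and css_filename values are assumed to exist as in the surrounding code, not shown in this commit):

    // Sketch only: wiring the split lexer and builder stages together.
    let html_stream = spawn_html_lexer_task(copy filename);    // port<Token>
    let root = build_dom(scope, html_stream);                  // DOM via html_builder

    let css_stream = spawn_css_lexer_task(copy css_filename);  // port<Token>
    let sheet = build_stylesheet(css_stream);                  // [~rule] via css_builder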
@@ -4,35 +4,35 @@
 // are not as expected
 
 import dom::style::*;
-import parser::lexer::css::{token, to_start_desc, to_end_desc,
-                            to_descendant, to_child, to_sibling,
-                            to_comma, to_elmt, to_attr, to_desc,
-                            to_eof};
+import parser::css_lexer::{Token, StartDescription, EndDescription,
+                           Descendant, Child, Sibling,
+                           Comma, Element, Attr, Description,
+                           Eof};
 import comm::recv;
 import option::is_none;
 import util::color::parsing::parse_color;
 
-type token_reader = {stream : port<token>, mut lookahead : option<token>};
+type TokenReader = {stream : port<Token>, mut lookahead : option<Token>};
 
-impl methods for token_reader {
-    fn get() -> token {
+impl methods for TokenReader {
+    fn get() -> Token {
         alt copy self.lookahead {
           some(tok) { self.lookahead = none; copy tok }
           none { recv(self.stream) }
         }
     }
 
-    fn unget(-tok : token) {
+    fn unget(-tok : Token) {
         assert is_none(self.lookahead);
         self.lookahead = some(tok);
     }
 }
 
-fn parse_element(reader : token_reader) -> option<~selector> {
+fn parse_element(reader : TokenReader) -> option<~selector> {
     // Get the current element type
     let elmt_name = alt reader.get() {
-      to_elmt(tag) { copy tag }
-      to_eof { ret none; }
+      Element(tag) { copy tag }
+      Eof { ret none; }
       _ { fail "Expected an element" }
     };
 
@@ -42,24 +42,23 @@ fn parse_element(reader : token_reader) -> option<~selector> {
     loop {
         let tok = reader.get();
         alt tok {
-          to_attr(attr) { attr_list += [copy attr]; }
-          to_start_desc | to_descendant | to_child | to_sibling
-          | to_comma {
+          Attr(attr) { attr_list += [copy attr]; }
+          StartDescription | Descendant | Child | Sibling | Comma {
             reader.unget(tok);
             break;
           }
-          to_eof { ret none; }
-          to_elmt(_) { fail "Unexpected second element without " +
+          Eof { ret none; }
+          Element(_) { fail "Unexpected second element without " +
               "relation to first element"; }
-          to_end_desc { fail "Unexpected '}'"; }
-          to_desc(_, _) { fail "Unexpected description"; }
+          EndDescription { fail "Unexpected '}'"; }
+          Description(_, _) { fail "Unexpected description"; }
         }
     }
 
     ret some(~element(elmt_name, attr_list));
 }
 
-fn parse_rule(reader : token_reader) -> option<~rule> {
+fn parse_rule(reader : TokenReader) -> option<~rule> {
     let mut sel_list = [];
     let mut desc_list = [];
 
@@ -75,7 +74,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
     loop {
         let tok = reader.get();
         alt tok {
-          to_descendant {
+          Descendant {
             alt parse_element(reader) {
               some(elmt) {
                 let built_sel <- cur_sel;
@@ -85,7 +84,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
               none { ret none; }
             }
           }
-          to_child {
+          Child {
             alt parse_element(reader) {
               some(elmt) {
                 let built_sel <- cur_sel;
@@ -95,7 +94,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
              none { ret none; }
             }
           }
-          to_sibling {
+          Sibling {
             alt parse_element(reader) {
               some(elmt) {
                 let built_sel <- cur_sel;
@@ -105,30 +104,30 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
               none { ret none; }
             }
           }
-          to_start_desc {
+          StartDescription {
             let built_sel <- cur_sel;
             sel_list += [built_sel];
-            reader.unget(to_start_desc);
+            reader.unget(StartDescription);
             break;
           }
-          to_comma {
+          Comma {
             let built_sel <- cur_sel;
             sel_list += [built_sel];
-            reader.unget(to_comma);
+            reader.unget(Comma);
             break;
           }
-          to_attr(_) | to_end_desc | to_elmt(_) | to_desc(_, _) {
+          Attr(_) | EndDescription | Element(_) | Description(_, _) {
             fail #fmt["Unexpected token %? in elements", tok];
           }
-          to_eof { ret none; }
+          Eof { ret none; }
         }
     }
 
    // check if we should break out of the nesting loop as well
    let tok = reader.get();
    alt tok {
-      to_start_desc { break; }
-      to_comma { }
+      StartDescription { break; }
+      Comma { }
       _ { reader.unget(tok); }
    }
 }
@@ -137,8 +136,8 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
     loop {
         let tok = reader.get();
         alt tok {
-          to_end_desc { break; }
-          to_desc(prop, val) {
+          EndDescription { break; }
+          Description(prop, val) {
             alt prop {
               "font-size" {
                 // TODO, support more ways to declare a font size than # pt
@@ -169,9 +168,9 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
                   val]; }
             }
           }
-          to_eof { ret none; }
-          to_start_desc | to_descendant | to_child | to_sibling
-          | to_comma | to_elmt(_) | to_attr(_) {
+          Eof { ret none; }
+          StartDescription | Descendant | Child | Sibling
+          | Comma | Element(_) | Attr(_) {
             fail #fmt["Unexpected token %? in description", tok];
           }
         }
@@ -180,7 +179,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> {
     ret some(~(sel_list, desc_list));
 }
 
-fn build_stylesheet(stream : port<token>) -> [~rule] {
+fn build_stylesheet(stream : port<Token>) -> [~rule] {
     let mut rule_list = [];
     let reader = {stream : stream, mut lookahead : none};
 
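The renamed TokenReader keeps the same one-token lookahead protocol as before: get() returns and clears the buffered token if there is one, otherwise it recv()s from the stream, and unget() may stash exactly one token back. A small illustrative sketch under those assumptions, in the diff's own dialect (stream is assumed to be the port<Token> handed to build_stylesheet):

    // Sketch only: peeking at the next CSS token.
    let reader = {stream : stream, mut lookahead : none};
    let tok = reader.get();          // lookahead first, else recv(stream)
    alt tok {
      Comma { /* consume the comma */ }
      _     { reader.unget(tok); }   // put it back; the next get() returns it
    }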
253  src/servo/parser/css_lexer.rs  Normal file
@@ -0,0 +1,253 @@
import comm::{port, chan};
import dom::style;
import option::is_none;

import lexer_util::*;

enum ParserState {
    CssElement,
    CssRelation,
    CssDescription,
    CssAttribute
}

type CssLexer = {
    input_state: InputState,
    mut parser_state: ParserState
};

enum Token {
    StartDescription,
    EndDescription,
    Descendant,
    Child,
    Sibling,
    Comma,
    Element(str),
    Attr(style::attr),
    Description(str, str),
    Eof
}

impl css_methods for CssLexer {
    fn parse_css() -> Token {
        let mut ch: u8;
        alt self.input_state.get() {
          CoeChar(c) { ch = c; }
          CoeEof { ret Eof; }
        }

        let token = alt self.parser_state {
          CssDescription { self.parse_css_description(ch) }
          CssAttribute { self.parse_css_attribute(ch) }
          CssElement { self.parse_css_element(ch) }
          CssRelation { self.parse_css_relation(ch) }
        };

        #debug["token=%?", token];
        ret token;
    }

    fn parse_css_relation(c : u8) -> Token {
        self.parser_state = CssElement;

        let token = alt c {
          '{' as u8 { self.parser_state = CssDescription; StartDescription }
          '>' as u8 { Child }
          '+' as u8 { Sibling }
          ',' as u8 { Comma }
          _ { self.input_state.unget(c); Descendant }
        };

        self.input_state.eat_whitespace();

        ret token;
    }

    fn parse_css_element(c : u8) -> Token {
        assert is_none(self.input_state.lookahead);

        /* Check for special attributes with an implied element,
           or a wildcard which is not a alphabet character.*/
        if c == '.' as u8 || c == '#' as u8 {
            self.parser_state = CssAttribute;
            self.input_state.unget(c);
            ret Element("*");
        } else if c == '*' as u8 {
            self.parser_state = CssAttribute;
            ret Element("*");
        }

        self.input_state.unget(c);
        let element = self.input_state.parse_ident();

        self.parser_state = CssAttribute;

        ret Element(element);
    }

    fn parse_css_attribute(c : u8) -> Token {
        let mut ch = c;

        /* If we've reached the end of this list of attributes,
           look for the relation to the next element.*/
        if c.is_whitespace() {
            self.parser_state = CssRelation;
            self.input_state.eat_whitespace();

            alt self.input_state.get() {
              CoeChar(c) { ch = c }
              CoeEof { fail "File ended before description of style" }
            }

            ret self.parse_css_relation(ch);
        }

        alt ch {
          '.' as u8 { ret Attr(
              style::includes("class", self.input_state.parse_ident())); }
          '#' as u8 { ret Attr(
              style::includes("id", self.input_state.parse_ident())); }
          '[' as u8 {
            let attr_name = self.input_state.parse_ident();

            alt self.input_state.get() {
              CoeChar(c) { ch = c; }
              CoeEof { fail "File ended before description finished"; }
            }

            if ch == ']' as u8 {
                ret Attr(style::exists(attr_name));
            } else if ch == '=' as u8 {
                let attr_val = self.input_state.parse_ident();
                self.input_state.expect(']' as u8);
                ret Attr(style::exact(attr_name, attr_val));
            } else if ch == '~' as u8 {
                self.input_state.expect('=' as u8);
                let attr_val = self.input_state.parse_ident();
                self.input_state.expect(']' as u8);
                ret Attr(style::includes(attr_name, attr_val));
            } else if ch == '|' as u8 {
                self.input_state.expect('=' as u8);
                let attr_val = self.input_state.parse_ident();
                self.input_state.expect(']' as u8);
                ret Attr(style::starts_with(attr_name, attr_val));
            }

            fail #fmt("Unexpected symbol %c in attribute", ch as char);
          }
          _ { fail #fmt("Unexpected symbol %c in attribute", ch as char); }
        }
    }

    fn parse_css_description(c: u8) -> Token {
        let mut ch = c;

        if ch == '}' as u8 {
            self.parser_state = CssElement;
            self.input_state.eat_whitespace();
            ret EndDescription;
        } else if ch.is_whitespace() {
            self.input_state.eat_whitespace();

            alt self.input_state.get() {
              CoeChar(c) { ch = c }
              CoeEof { fail "Reached end of file in CSS description" }
            }
        }

        let mut desc_name = [];

        // Get the name of the descriptor
        loop {
            if ch.is_whitespace() {
                self.input_state.eat_whitespace();
            } else if ch == ':' as u8 {
                if desc_name.len() == 0u {
                    fail "Expected descriptor name";
                } else {
                    break;
                }
            } else {
                desc_name += [ch];
            }

            alt self.input_state.get() {
              CoeChar(c) { ch = c }
              CoeEof { fail "Reached end of file in CSS description" }
            }
        }

        self.input_state.eat_whitespace();
        let mut desc_val = [];

        // Get the value of the descriptor
        loop {
            alt self.input_state.get() {
              CoeChar(c) { ch = c }
              CoeEof { fail "Reached end of file in CSS description" }
            }

            if ch.is_whitespace() {
                self.input_state.eat_whitespace();
            } else if ch == '}' as u8 {
                if desc_val.len() == 0u {
                    fail "Expected descriptor value";
                } else {
                    self.input_state.unget('}' as u8);
                    break;
                }
            } else if ch == ';' as u8 {
                if desc_val.len() == 0u {
                    fail "Expected descriptor value";
                } else {
                    break;
                }
            } else {
                desc_val += [ch];
            }
        }

        ret Description(desc_name.to_str(), desc_val.to_str());
    }
}

fn parser(reader: io::reader, state : ParserState) -> CssLexer {
    ret { input_state: {mut lookahead: none, reader: reader},
          mut parser_state: state };
}

#[warn(no_non_implicitly_copyable_typarams)]
fn spawn_css_lexer_task(-filename: ~str) -> port<Token> {
    let result_port = port();
    let result_chan = chan(result_port);

    task::spawn {||
        assert (*copy filename).ends_with(".css");
        let file_try = io::read_whole_file(*filename);

        // Check if the given css file existed, if it does, parse it,
        // otherwise just send an eof. This is a hack to allow
        // guessing that if foo.html exists, foo.css is the
        // corresponding stylesheet.
        if file_try.is_success() {
            #debug["Lexing css sheet %s", *copy filename];
            let file_data = file_try.get();
            let reader = io::bytes_reader(file_data);

            let lexer = parser(reader, CssElement);

            loop {
                let token = lexer.parse_css();
                let should_break = token == Eof;
                result_chan.send(token);
                if should_break { break; }
            }
        } else {
            #debug["Failed to open css sheet %s", *copy filename];
            result_chan.send(Eof);
        }
    };

    ret result_port;
}
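css_lexer.rs now owns only tokenization: spawn_css_lexer_task reads the whole file in a child task and streams Token values over a port, always terminating with Eof (including when the file is missing). A sketch of draining that port by hand, in the same dialect (css_filename is an assumed ~str ending in ".css"):

    // Sketch only: consuming the CSS lexer task's output directly.
    let stream = spawn_css_lexer_task(copy css_filename);
    loop {
        alt stream.recv() {
          Eof { break; }                        // the lexer always ends with Eof
          tok { #debug["css token: %?", tok]; }
        }
    }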
@@ -7,8 +7,8 @@ import dom::rcu::WriterMethods;
 import geom::size::Size2D;
 import gfx::geometry;
 import gfx::geometry::au;
-import parser = parser::lexer::html;
-import parser::token;
+import parser = parser::html_lexer;
+import parser::Token;
 
 import dvec::extensions;
 
@@ -66,41 +66,41 @@ fn build_element_kind(tag_name: str) -> ~ElementKind {
     }
 }
 
-fn build_dom(scope: NodeScope, stream: port<token>) -> Node {
+fn build_dom(scope: NodeScope, stream: port<Token>) -> Node {
     // The current reference node.
     let mut cur = scope.new_node(Element(ElementData("html", ~HTMLDivElement)));
     loop {
         let token = stream.recv();
         alt token {
-          parser::to_eof { break; }
-          parser::to_start_opening_tag(tag_name) {
+          parser::Eof { break; }
+          parser::StartOpeningTag(tag_name) {
            #debug["starting tag %s", tag_name];
            let element_kind = build_element_kind(tag_name);
            let new_node = scope.new_node(Element(ElementData(copy tag_name, element_kind)));
            scope.add_child(cur, new_node);
            cur = new_node;
          }
-          parser::to_attr(key, value) {
+          parser::Attr(key, value) {
            #debug["attr: %? = %?", key, value];
            link_up_attribute(scope, cur, copy key, copy value);
          }
-          parser::to_end_opening_tag {
+          parser::EndOpeningTag {
            #debug("end opening tag");
          }
-          parser::to_end_tag(_) | parser::to_self_close_tag {
+          parser::EndTag(_) | parser::SelfCloseTag {
            // TODO: Assert that the closing tag has the right name.
            // TODO: Fail more gracefully (i.e. according to the HTML5
            // spec) if we close more tags than we open.
            cur = scope.get_parent(cur).get();
          }
-          parser::to_text(s) if !s.is_whitespace() {
+          parser::Text(s) if !s.is_whitespace() {
            let new_node = scope.new_node(Text(copy s));
            scope.add_child(cur, new_node);
          }
-          parser::to_text(_) {
+          parser::Text(_) {
            // FIXME: Whitespace should not be ignored.
          }
-          parser::to_doctype {
+          parser::Doctype {
            // TODO: Do something here...
          }
        }
171  src/servo/parser/html_lexer.rs  Normal file
@@ -0,0 +1,171 @@
import comm::{port, chan};
import dom::style;
import option::is_none;
import lexer_util::*;

enum Token {
    StartOpeningTag(str),
    EndOpeningTag,
    EndTag(str),
    SelfCloseTag,
    Text(str),
    Attr(str, str),
    Doctype,
    Eof
}

enum ParseState {
    NormalHtml,
    TagHtml,
}

type HtmlLexer = {
    input_state: InputState,
    mut parser_state: ParseState
};

impl html_methods for HtmlLexer {
    fn parse_html() -> Token {
        let mut ch: u8;
        alt self.input_state.get() {
          CoeChar(c) { ch = c; }
          CoeEof { ret Eof; }
        }
        let token = alt self.parser_state {
          NormalHtml { self.parse_in_normal_state(ch) }
          TagHtml { self.parse_in_tag_state(ch) }
        };

        #debug["token=%?", token];
        ret token;
    }

    fn parse_in_normal_state(c: u8) -> Token {
        let mut ch = c;
        if ch == ('<' as u8) {
            alt self.input_state.get() {
              CoeChar(c) { ch = c; }
              CoeEof { self.input_state.parse_err("eof after '<'") }
            }

            if ch == ('!' as u8) {
                self.input_state.eat_whitespace();
                self.input_state.expect_ident("DOCTYPE");
                self.input_state.eat_whitespace();
                self.input_state.expect_ident("html");
                self.input_state.eat_whitespace();
                self.input_state.expect('>' as u8);
                ret Doctype;
            }

            if ch == ('/' as u8) {
                let ident = self.input_state.parse_ident();
                self.input_state.expect('>' as u8);
                ret EndTag(ident);
            }

            self.input_state.unget(ch);

            self.input_state.eat_whitespace();
            let ident = self.input_state.parse_ident();
            self.input_state.eat_whitespace();

            self.parser_state = TagHtml;
            ret StartOpeningTag(ident);
        }

        // Make a text node.
        let mut s: [u8] = [ch];
        loop {
            alt self.input_state.get() {
              CoeChar(c) {
                if c == ('<' as u8) {
                    self.input_state.unget(c);
                    ret s.to_html_token();
                }
                s += [c];
              }
              CoeEof { ret s.to_html_token(); }
            }
        }
    }

    fn parse_in_tag_state(c: u8) -> Token {
        let mut ch = c;

        if ch == ('>' as u8) {
            self.parser_state = NormalHtml;
            ret EndOpeningTag;
        }

        if ch == ('/' as u8) {
            self.parser_state = NormalHtml;
            ret SelfCloseTag;
        }

        if !ch.is_alpha() {
            fail #fmt("expected alphabetical in tag but found %c", ch as char);
        }

        // Parse an attribute.
        let mut attribute_name = [ch];
        loop {
            alt self.input_state.get() {
              CoeChar(c) {
                if c == ('=' as u8) { break; }
                attribute_name += [c];
              }
              CoeEof {
                ret Attr(attribute_name.to_str(),
                         attribute_name.to_str()); }
            }
        }

        // Parse the attribute value.
        self.input_state.expect('"' as u8);
        let mut attribute_value = [];
        loop {
            alt self.input_state.get() {
              CoeChar(c) {
                if c == ('"' as u8) { break; }
                attribute_value += [c];
              }
              CoeEof {
                ret Attr(attribute_name.to_str(),
                         attribute_value.to_str());
              }
            }
        }

        // Eat whitespacpe.
        self.input_state.eat_whitespace();

        ret Attr(attribute_name.to_str(), attribute_value.to_str());
    }
}

fn lexer(reader: io::reader, state : ParseState) -> HtmlLexer {
    ret { input_state: {mut lookahead: none, reader: reader},
          mut parser_state: state };
}

#[warn(no_non_implicitly_copyable_typarams)]
fn spawn_html_lexer_task(-filename: ~str) -> port<Token> {
    let result_port = port();
    let result_chan = chan(result_port);
    task::spawn {||
        assert (*copy filename).ends_with(".html");
        let file_data = io::read_whole_file(*filename).get();
        let reader = io::bytes_reader(file_data);

        let lexer = lexer(reader, NormalHtml);

        loop {
            let token = lexer.parse_html();
            let should_break = token == Eof;
            result_chan.send(token);
            if should_break { break; }
        }
    };
    ret result_port;
}
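The HTML lexer is a two-state machine: NormalHtml handles text, doctype and closing tags, and TagHtml handles attributes until the '>' or '/' that ends the opening tag. As an illustration (not part of the commit; html_filename is an assumed ~str), the port returned by spawn_html_lexer_task would yield roughly this sequence for a small document:

    // For:  <div id="a">x</div>
    //   StartOpeningTag("div")
    //   Attr("id", "a")
    //   EndOpeningTag
    //   Text("x")
    //   EndTag("div")
    //   Eof
    let stream = spawn_html_lexer_task(copy html_filename);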
@@ -1,533 +0,0 @@
import comm::{port, chan};
import html::html_methods;
import css::css_methods;
import dom::style;
import option::is_none;

enum parse_state {
    ps_html_normal,
    ps_html_tag,
    ps_css_elmt,
    ps_css_relation,
    ps_css_desc,
    ps_css_attribute
}

type parser = {
    mut lookahead: option<char_or_eof>,
    mut state: parse_state,
    reader: io::reader
};

enum char_or_eof {
    coe_char(u8),
    coe_eof
}

impl u8_methods for u8 {
    fn is_whitespace() -> bool {
        ret self == ' ' as u8 || self == '\n' as u8
            || self == '\t' as u8;
    }

    fn is_alpha() -> bool {
        ret (self >= ('A' as u8) && self <= ('Z' as u8)) ||
            (self >= ('a' as u8) && self <= ('z' as u8));
    }
}

impl u8_vec_methods for [u8] {
    fn to_str() -> str { ret str::from_bytes(self); }
    fn to_html_token() -> html::token { ret html::to_text(self.to_str()); }
    fn to_css_token() -> html::token { ret html::to_text(self.to_str()); }
}

impl util_methods for parser {
    fn get() -> char_or_eof {
        alt copy self.lookahead {
          some(coe) {
            let rv = coe;
            self.lookahead = none;
            ret rv;
          }
          none {
            /* fall through */
          }
        }

        if self.reader.eof() { ret coe_eof; }
        ret coe_char(self.reader.read_byte() as u8);
    }

    fn unget(ch: u8) {
        assert is_none(self.lookahead);
        self.lookahead = some(coe_char(ch));
    }

    fn parse_err(err: str) -> ! {
        fail err
    }

    fn expect(ch: u8) {
        alt self.get() {
          coe_char(c) {
            if c != ch {
                self.parse_err(#fmt("expected '%c'", ch as char));
            }
          }
          coe_eof {
            self.parse_err(#fmt("expected '%c' at eof", ch as char));
          }
        }
    }

    fn parse_ident() -> str {
        let mut result: [u8] = [];
        loop {
            alt self.get() {
              coe_char(c) {
                if (c.is_alpha()) {
                    result += [c];
                } else if result.len() == 0u {
                    self.parse_err("expected ident");
                } else {
                    self.unget(c);
                    break;
                }
              }
              coe_eof {
                self.parse_err("expected ident");
              }
            }
        }
        ret str::from_bytes(result);
    }

    fn expect_ident(expected: str) {
        let actual = self.parse_ident();
        if expected != actual {
            self.parse_err(#fmt("expected '%s' but found '%s'",
                                expected, actual));
        }
    }

    fn eat_whitespace() {
        loop {
            alt self.get() {
              coe_char(c) {
                if !c.is_whitespace() {
                    self.unget(c);
                    ret;
                }
              }
              coe_eof {
                ret;
              }
            }
        }
    }

    fn parse_html() -> html::token {
        let mut ch: u8;
        alt self.get() {
          coe_char(c) { ch = c; }
          coe_eof { ret html::to_eof; }
        }

        let token = alt self.state {
          ps_html_normal { self.parse_in_normal_state(ch) }
          ps_html_tag { self.parse_in_tag_state(ch) }
          _ { fail "Parsing in html mode when not in " +
              "an html state" }
        };

        #debug["token=%?", token];
        ret token;
    }

    fn parse_css() -> css::token {
        let mut ch: u8;
        alt self.get() {
          coe_char(c) { ch = c; }
          coe_eof { ret css::to_eof; }
        }

        let token = alt self.state {
          ps_css_desc { self.parse_css_description(ch) }
          ps_css_attribute { self.parse_css_attribute(ch) }
          ps_css_elmt { self.parse_css_element(ch) }
          ps_css_relation { self.parse_css_relation(ch) }
          _ { fail "Parsing in css mode when not in " +
              "a css state" }
        };

        #debug["token=%?", token];
        ret token;
    }
}

mod html {
    enum token {
        to_start_opening_tag(str),
        to_end_opening_tag,
        to_end_tag(str),
        to_self_close_tag,
        to_text(str),
        to_attr(str, str),
        to_doctype,
        to_eof
    }

    impl html_methods for parser {
        fn parse_in_normal_state(c: u8) -> token {
            let mut ch = c;
            if ch == ('<' as u8) {
                alt self.get() {
                  coe_char(c) { ch = c; }
                  coe_eof { self.parse_err("eof after '<'") }
                }

                if ch == ('!' as u8) {
                    self.eat_whitespace();
                    self.expect_ident("DOCTYPE");
                    self.eat_whitespace();
                    self.expect_ident("html");
                    self.eat_whitespace();
                    self.expect('>' as u8);
                    ret to_doctype;
                }

                if ch == ('/' as u8) {
                    let ident = self.parse_ident();
                    self.expect('>' as u8);
                    ret to_end_tag(ident);
                }

                self.unget(ch);

                self.eat_whitespace();
                let ident = self.parse_ident();
                self.eat_whitespace();

                self.state = ps_html_tag;
                ret to_start_opening_tag(ident);
            }

            // Make a text node.
            let mut s: [u8] = [ch];
            loop {
                alt self.get() {
                  coe_char(c) {
                    if c == ('<' as u8) {
                        self.unget(c);
                        ret s.to_html_token();
                    }
                    s += [c];
                  }
                  coe_eof { ret s.to_html_token(); }
                }
            }
        }

        fn parse_in_tag_state(c: u8) -> token {
            let mut ch = c;

            if ch == ('>' as u8) {
                self.state = ps_html_normal;
                ret to_end_opening_tag;
            }

            if ch == ('/' as u8) {
                self.state = ps_html_normal;
                ret to_self_close_tag;
            }

            if !ch.is_alpha() {
                fail #fmt("expected alphabetical in tag but found %c",
                          ch as char);
            }

            // Parse an attribute.
            let mut attribute_name = [ch];
            loop {
                alt self.get() {
                  coe_char(c) {
                    if c == ('=' as u8) { break; }
                    attribute_name += [c];
                  }
                  coe_eof {
                    ret to_attr(attribute_name.to_str(),
                                attribute_name.to_str()); }
                }
            }

            // Parse the attribute value.
            self.expect('"' as u8);
            let mut attribute_value = [];
            loop {
                alt self.get() {
                  coe_char(c) {
                    if c == ('"' as u8) { break; }
                    attribute_value += [c];
                  }
                  coe_eof {
                    ret to_attr(attribute_name.to_str(),
                                attribute_value.to_str());
                  }
                }
            }

            // Eat whitespacpe.
            self.eat_whitespace();

            ret to_attr(attribute_name.to_str(), attribute_value.to_str());
        }
    }
}

mod css {
    enum token {
        to_start_desc,
        to_end_desc,
        to_descendant,
        to_child,
        to_sibling,
        to_comma,
        to_elmt(str),
        to_attr(style::attr),
        to_desc(str, str),
        to_eof
    }

    impl css_methods for parser {
        fn parse_css_relation(c : u8) -> token {
            self.state = ps_css_elmt;

            let token = alt c {
              '{' as u8 { self.state = ps_css_desc; to_start_desc }
              '>' as u8 { to_child }
              '+' as u8 { to_sibling }
              ',' as u8 { to_comma }
              _ { self.unget(c); to_descendant }
            };

            self.eat_whitespace();

            ret token;
        }

        fn parse_css_element(c : u8) -> token {
            assert is_none(self.lookahead);

            /* Check for special attributes with an implied element,
               or a wildcard which is not a alphabet character.*/
            if c == '.' as u8 || c == '#' as u8 {
                self.state = ps_css_attribute;
                self.unget(c);
                ret to_elmt("*");
            } else if c == '*' as u8 {
                self.state = ps_css_attribute;
                ret to_elmt("*");
            }

            self.unget(c);
            let element = self.parse_ident();

            self.state = ps_css_attribute;

            ret to_elmt(element);
        }

        fn parse_css_attribute(c : u8) -> token {
            let mut ch = c;

            /* If we've reached the end of this list of attributes,
               look for the relation to the next element.*/
            if c.is_whitespace() {
                self.state = ps_css_relation;
                self.eat_whitespace();

                alt self.get() {
                  coe_char(c) { ch = c }
                  coe_eof { fail "File ended before description " +
                      "of style" }
                }

                ret self.parse_css_relation(ch);
            }

            alt ch {
              '.' as u8 { ret to_attr(
                  style::includes("class", self.parse_ident())); }
              '#' as u8 { ret to_attr(
                  style::includes("id", self.parse_ident())); }
              '[' as u8 {
                let attr_name = self.parse_ident();

                alt self.get() {
                  coe_char(c) { ch = c; }
                  coe_eof { fail "File ended before " +
                      "description finished"; }
                }

                if ch == ']' as u8 {
                    ret to_attr(style::exists(attr_name));
                } else if ch == '=' as u8 {
                    let attr_val = self.parse_ident();
                    self.expect(']' as u8);
                    ret to_attr(style::exact(attr_name, attr_val));
                } else if ch == '~' as u8 {
                    self.expect('=' as u8);
                    let attr_val = self.parse_ident();
                    self.expect(']' as u8);
                    ret to_attr(style::includes(attr_name, attr_val));
                } else if ch == '|' as u8 {
                    self.expect('=' as u8);
                    let attr_val = self.parse_ident();
                    self.expect(']' as u8);
                    ret to_attr(style::starts_with(attr_name, attr_val));
                }

                fail #fmt("Unexpected symbol %c in attribute", ch as char);
              }
              _ { fail #fmt("Unexpected symbol %c in attribute",
                            ch as char); }
            }
        }

        fn parse_css_description(c: u8) -> token {
            let mut ch = c;

            if ch == '}' as u8 {
                self.state = ps_css_elmt;
                self.eat_whitespace();
                ret to_end_desc;
            } else if ch.is_whitespace() {
                self.eat_whitespace();

                alt self.get() {
                  coe_char(c) { ch = c }
                  coe_eof { fail "Reached end of file " +
                      "in CSS description" }
                }
            }

            let mut desc_name = [];

            // Get the name of the descriptor
            loop {
                if ch.is_whitespace() {
                    self.eat_whitespace();
                } else if ch == ':' as u8 {
                    if desc_name.len() == 0u {
                        fail "Expected descriptor name";
                    } else {
                        break;
                    }
                } else {
                    desc_name += [ch];
                }

                alt self.get() {
                  coe_char(c) { ch = c }
                  coe_eof { fail "Reached end of file " +
                      "in CSS description" }
                }
            }

            self.eat_whitespace();
            let mut desc_val = [];

            // Get the value of the descriptor
            loop {
                alt self.get() {
                  coe_char(c) { ch = c }
                  coe_eof { fail "Reached end of file " +
                      "in CSS description" }
                }

                if ch.is_whitespace() {
                    self.eat_whitespace();
                } else if ch == '}' as u8 {
                    if desc_val.len() == 0u {
                        fail "Expected descriptor value";
                    } else {
                        self.unget('}' as u8);
                        break;
                    }
                } else if ch == ';' as u8 {
                    if desc_val.len() == 0u {
                        fail "Expected descriptor value";
                    } else {
                        break;
                    }
                } else {
                    desc_val += [ch];
                }
            }

            ret to_desc(desc_name.to_str(), desc_val.to_str());
        }
    }
}

fn parser(reader: io::reader, state : parse_state) -> parser {
    ret { mut lookahead: none, mut state: state, reader: reader };
}

#[warn(no_non_implicitly_copyable_typarams)]
fn spawn_html_parser_task(-filename: ~str) -> port<html::token> {
    let result_port = port();
    let result_chan = chan(result_port);
    task::spawn {||
        let filename = copy *filename;
        assert (copy filename).ends_with(".html");
        let file_data = io::read_whole_file(filename).get();
        let reader = io::bytes_reader(file_data);

        let parser = parser(reader, ps_html_normal);

        loop {
            let token = parser.parse_html();
            let should_break = token == html::to_eof;
            result_chan.send(token);
            if should_break { break; }
        }
    };
    ret result_port;
}

#[warn(no_non_implicitly_copyable_typarams)]
fn spawn_css_lexer_task(-filename: ~str) -> port<css::token> {
    let result_port = port();
    let result_chan = chan(result_port);
    task::spawn {||
        let filename = copy *filename;

        assert (copy filename).ends_with(".css");
        let file_try = io::read_whole_file(filename);

        // Check if the given css file existed, if it does, parse it,
        // otherwise just send an eof. This is a hack to allow
        // guessing that if foo.html exists, foo.css is the
        // corresponding stylesheet.
        if file_try.is_success() {
            #debug["Lexing css sheet %s", filename];
            let file_data = file_try.get();
            let reader = io::bytes_reader(file_data);

            let parser : parser = parser(reader, ps_css_elmt);

            loop {
                let token = parser.parse_css();
                let should_break = token == css::to_eof;
                result_chan.send(token);
                if should_break { break; }
            }
        } else {
            #debug["Failed to open css sheet %s", filename];
            result_chan.send(css::to_eof);
        }
    };
    ret result_port;
}
112  src/servo/parser/lexer_util.rs  Normal file
@@ -0,0 +1,112 @@
import option::is_none;

enum CharOrEof {
    CoeChar(u8),
    CoeEof
}

type InputState = {
    mut lookahead: option<CharOrEof>,
    reader: io::reader
};

impl u8_methods for u8 {
    fn is_whitespace() -> bool {
        ret self == ' ' as u8 || self == '\n' as u8 || self == '\t' as u8;
    }

    fn is_alpha() -> bool {
        ret (self >= ('A' as u8) && self <= ('Z' as u8)) ||
            (self >= ('a' as u8) && self <= ('z' as u8));
    }
}

impl u8_vec_methods for [u8] {
    fn to_html_token() -> html_lexer::Token { ret html_lexer::Text(self.to_str()); }
    fn to_str() -> str { ret str::from_bytes(self); }
}

impl util_methods for InputState {
    fn get() -> CharOrEof {
        alt copy self.lookahead {
          some(coe) {
            let rv = coe;
            self.lookahead = none;
            ret rv;
          }
          none {
            /* fall through */
          }
        }

        if self.reader.eof() { ret CoeEof; }
        ret CoeChar(self.reader.read_byte() as u8);
    }

    fn unget(ch: u8) {
        assert is_none(self.lookahead);
        self.lookahead = some(CoeChar(ch));
    }

    fn parse_err(err: str) -> ! {
        fail err
    }

    fn expect(ch: u8) {
        alt self.get() {
          CoeChar(c) {
            if c != ch {
                self.parse_err(#fmt("expected '%c'", ch as char));
            }
          }
          CoeEof {
            self.parse_err(#fmt("expected '%c' at eof", ch as char));
          }
        }
    }

    fn parse_ident() -> str {
        let mut result: [u8] = [];
        loop {
            alt self.get() {
              CoeChar(c) {
                if (c.is_alpha()) {
                    result += [c];
                } else if result.len() == 0u {
                    self.parse_err("expected ident");
                } else {
                    self.unget(c);
                    break;
                }
              }
              CoeEof {
                self.parse_err("expected ident");
              }
            }
        }
        ret str::from_bytes(result);
    }

    fn expect_ident(expected: str) {
        let actual = self.parse_ident();
        if expected != actual {
            self.parse_err(#fmt("expected '%s' but found '%s'", expected, actual));
        }
    }

    fn eat_whitespace() {
        loop {
            alt self.get() {
              CoeChar(c) {
                if !c.is_whitespace() {
                    self.unget(c);
                    ret;
                }
              }
              CoeEof {
                ret;
              }
            }
        }
    }
}
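lexer_util.rs is the part the two lexers now share: CharOrEof, the InputState record with its one-byte lookahead, and the expect/parse_ident/eat_whitespace helpers. A sketch of using it on its own, in the same dialect (data is an assumed [u8] buffer):

    // Sketch only: both lexers build on InputState exactly as their
    // parser()/lexer() constructors above do.
    let input = {mut lookahead: none, reader: io::bytes_reader(data)};
    input.eat_whitespace();
    let name = input.parse_ident();   // fails with "expected ident" at eof
    input.expect(':' as u8);          // fails unless the next byte is ':'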
@@ -51,7 +51,9 @@ mod layout {
 }
 
 mod parser {
-    mod lexer;
+    mod lexer_util;
+    mod css_lexer;
+    mod html_lexer;
     mod html_builder;
     mod css_builder;
 }
@@ -1,5 +1,4 @@
 import comm::*;
-import parser::lexer;
 import result::extensions;
 import gfx::renderer;
 import platform::osmain;