Added CSS data structures, pretty-printing, and a lexer

2025-08-06 14:10:11 +01:00 · 2012-05-24 11:08:03 -07:00 · 2012-05-24 11:08:03 -07:00 · d9bdfc01af
commit d9bdfc01af
parent 82d9ff3e56
8 changed files with 639 additions and 279 deletions
--- a/src/servo/content.rs
+++ b/src/servo/content.rs
@ -40,7 +40,7 @@ fn content(to_layout: chan<layout::msg>) -> chan<msg> {
                // Note: we can parse the next document in parallel
                // with any previous documents.
-                let stream = html::spawn_parser_task(filename);
+                let stream = lexer::spawn_html_parser_task(filename);
                let root = parser::html_builder::build_dom(scope, stream);
                // Now, join the layout so that they will see the latest
--- a/src/servo/dom/base.rs
+++ b/src/servo/dom/base.rs
@ -54,9 +54,10 @@ enum element_subclass {
    es_head
 }
-#[doc="The rd_aux data is a (weak) pointer to the layout data, which contains
+#[doc="The rd_aux data is a (weak) pointer to the layout data, which
-       the CSS info as well as the primary box.  Note that there may be multiple
+       contains the CSS info as well as the primary box.  Note that
-       boxes per DOM node."]
+       there may be multiple boxes per DOM node."]
 type node = rcu::handle<node_data, layout_data>;
 type node_scope = rcu::scope<node_data, layout_data>;
--- a/src/servo/dom/style.rs
+++ b/src/servo/dom/style.rs
@ -0,0 +1,125 @@
 import io::println;
 // Values that a `display` style declaration can carry.
 // print_display renders them as "block" and "inline".
 enum display_type{
    block,
    inline
 }
 // A single style declaration; each variant carries the value of one
 // property.  See print_style for how each one is rendered.
 enum style_decl{
    font_size(uint),            // printed as "<n> px"
    display(display_type),
    text_color(uint),           // printed as a 6-digit hex number
    background_color(uint)      // printed as a 6-digit hex number
 }
 // An attribute test attached to an element selector.  The first string
 // is the attribute name; the second, where present, is the value it is
 // compared against.  print_attr renders the variants as [a], [a = v],
 // [a ~= v] and [a |= v] respectively.
 enum attr{
    exists(str),
    exact(str, str),
    includes(str, str),
    starts_with(str, str)
 }
 // A selector tree.  `element` is a leaf holding an element name and its
 // attribute tests; the other variants combine two boxed sub-selectors
 // and are printed by print_selector with ">", " " and "+" between them.
 enum selector{
    element(str, [attr]),
    child(~selector, ~selector),
    descendant(~selector, ~selector),
    sibling(~selector, ~selector)
 }
 // A rule pairs one selector with the declarations listed for it.
 type rule = (selector, [style_decl]);
 // A stylesheet is a vector of rules.
 type stylesheet = [rule];
 fn print_list<T>(list : [T], print : fn(T) -> str) -> str {
    let l = vec::len(list);
    if l == 0u { ret "" }
    let mut res = print(list[0]);
    let mut i = 1u;
    while i < l { 
        res += ", ";
        res += print(list[i]);
        i += 1u;
    }
    ret res;
 } 
 fn print_display(dis_ty : display_type) -> str {
    alt dis_ty { 
      block { "block" } 
      inline { "inline" }      
    }
 }
 fn print_style(decl : style_decl) -> str{
    alt decl {
      font_size(s) { #fmt("Font size = %u px", s) }
      display(dis_ty) { #fmt("Display style = %s", print_display(dis_ty)) }
      text_color(c) { #fmt("Text color = 0x%06x", c) }
      background_color(c) { #fmt("Background color = 0x%06x", c) }
    }
 }
 fn print_attr(attribute : attr) -> str {
    alt attribute {
      exists(att) { #fmt("[%s]", att) }
      exact(att, val) { #fmt("[%s = %s]", att, val) }
      includes(att, val) { #fmt("[%s ~= %s]", att, val) }
      starts_with(att, val) { #fmt("[%s |= %s]", att, val) }
    }
 }
 // Pretty-print a selector tree.  Leaves show the element name and its
 // attribute tests; combinators show both sides in parentheses, joined
 // by ">" (child), a space (descendant) or "+" (sibling).
 fn print_selector(select : ~selector) -> str {
    alt *select {
      element(tag, attrs) {
        let attr_str = print_list(attrs, print_attr);
        #fmt("Element %s with attributes: %s", tag, attr_str)
      }
      child(lhs, rhs) {
        let left = print_selector(lhs);
        let right = print_selector(rhs);
        #fmt("(%s) > (%s)", left, right)
      }
      descendant(lhs, rhs) {
        let left = print_selector(lhs);
        let right = print_selector(rhs);
        #fmt("(%s) (%s)", left, right)
      }
      sibling(lhs, rhs) {
        let left = print_selector(lhs);
        let right = print_selector(rhs);
        #fmt("(%s) + (%s)", left, right)
      }
    }
 }
 // Pretty-print one rule as "Selector: <selector>, Style: {<decls>}".
 fn print_rule(rule : rule) -> str {
    alt rule {
      (sel, styles) {
        // print_selector wants a boxed selector, so box a copy.
        #fmt("Selector: %s, Style: {%s}",
             print_selector(~(copy sel)),
             print_list(styles, print_style))
      }
    }
 }
 fn print_sheet(sheet : stylesheet) -> str {
    #fmt("CSS Rules: %s", print_list(sheet, print_rule))
 }
 #[test]
 fn test_pretty_print() {
    // A single element selector with no attribute tests.
    let simple_sheet = [(element("p", []), [font_size(32u)])];
    let simple_out = print_sheet(simple_sheet);
    let simple_want = "CSS Rules: Selector: Element p with attributes: ," +
        " Style: {Font size = 32 px}";
    assert(simple_out == simple_want);
    // A descendant selector whose right-hand side has one attribute test.
    let ancestor = ~element("*", []);
    let target = ~element("body", [exact("class", "2")]);
    let nested_sheet = [(descendant(ancestor, target),
                         [display(block), text_color(0u)])];
    let nested_out = print_sheet(nested_sheet);
    let nested_want =  "CSS Rules: Selector: (Element * with attributes: ) " +
        "(Element body with attributes: [class = 2]), " +
        "Style: {Display style = block, Text color = 0x000000}";
    assert(nested_out == nested_want);
 }
--- a/src/servo/parser/html.rs
+++ b/src/servo/parser/html.rs
@ -1,268 +0,0 @@
 import comm::{port, chan};
 enum parse_state {
    ps_normal,
    ps_tag
 }
 type parser = {
    mut lookahead: option<char_or_eof>,
    mut state: parse_state,
    reader: io::reader
 };
 enum token {
    to_start_opening_tag(str),
    to_end_opening_tag,
    to_end_tag(str),
    to_self_close_tag,
    to_text(str),
    to_attr(str, str),
    to_doctype,
    to_eof
 }
 enum char_or_eof {
    coe_char(u8),
    coe_eof
 }
 impl u8_methods for u8 {
    fn is_alpha() -> bool {
        ret (self >= ('A' as u8) && self <= ('Z' as u8)) ||
            (self >= ('a' as u8) && self <= ('z' as u8));
    }
 }
 impl u8_vec_methods for [u8] {
    fn to_str() -> str { ret str::from_bytes(self); }
    fn to_str_token() -> token { ret to_text(self.to_str()); }
 }
 impl methods for parser {
    fn get() -> char_or_eof {
        alt self.lookahead {
            some(coe) {
                let rv = coe;
                self.lookahead = none;
                ret rv;
            }
            none {
                /* fall through */
            }
        }
        if self.reader.eof() { ret coe_eof; }
        ret coe_char(self.reader.read_byte() as u8);
    }
    fn unget(ch: u8) {
        assert self.lookahead.is_none();
        self.lookahead = some(coe_char(ch));
    }
    fn parse_err(err: str) -> ! {
        fail err
    }
    fn expect(ch: u8) {
        alt self.get() {
            coe_char(c) {
                if c != ch {
                    self.parse_err(#fmt("expected '%c'", ch as char));
                }
            }
            coe_eof {
                self.parse_err(#fmt("expected '%c' at eof", ch as char));
            }
        }
    }
    fn parse_ident() -> str {
        let mut result: [u8] = [];
        loop {
            alt self.get() {
                coe_char(c) {
                    if (c.is_alpha()) {
                        result += [c];
                    } else if result.len() == 0u {
                        self.parse_err("expected ident");
                    } else {
                        self.unget(c);
                        break;
                    }
                }
                coe_eof {
                    self.parse_err("expected ident");
                }
            }
        }
        ret str::from_bytes(result);
    }
    fn expect_ident(expected: str) {
        let actual = self.parse_ident();
        if expected != actual {
            self.parse_err(#fmt("expected '%s' but found '%s'",
                                expected, actual));
        }
    }
    fn eat_whitespace() {
        loop {
            alt self.get() {
                coe_char(c) {
                    if c != (' ' as u8) && c != ('\n' as u8) &&
                           c != ('\t' as u8) {
                        self.unget(c);
                        ret;
                    }
                }
                coe_eof {
                    ret;
                }
            }
        }
    }
    fn parse() -> token {
        let mut ch: u8;
        alt self.get() {
            coe_char(c) { ch = c; }
            coe_eof { ret to_eof; }
        }
        let token = alt self.state {
            ps_normal   { self.parse_in_normal_state(ch) }
            ps_tag      { self.parse_in_tag_state(ch)    }
        };
        #debug["token=%?", token];
        ret token;
    }
    fn parse_in_normal_state(c: u8) -> token {
        let mut ch = c;
        if ch == ('<' as u8) {
            alt self.get() {
                coe_char(c) { ch = c; }
                coe_eof { self.parse_err("eof after '<'") }
            }
            if ch == ('!' as u8) {
                self.eat_whitespace();
                self.expect_ident("DOCTYPE");
                self.eat_whitespace();
                self.expect_ident("html");
                self.eat_whitespace();
                self.expect('>' as u8);
                ret to_doctype;
            }
            if ch == ('/' as u8) {
                let ident = self.parse_ident();
                self.expect('>' as u8);
                ret to_end_tag(ident);
            }
            self.unget(ch);
            self.eat_whitespace();
            let ident = self.parse_ident();
            self.eat_whitespace();
            self.state = ps_tag;
            ret to_start_opening_tag(ident);
        }
        // Make a text node.
        let mut s: [u8] = [ch];
        loop {
            alt self.get() {
                coe_char(c) {
                    if c == ('<' as u8) {
                        self.unget(c);
                        ret s.to_str_token();
                    }
                    s += [c];
                }
                coe_eof { ret s.to_str_token(); }
            }
        }
    }
    fn parse_in_tag_state(c: u8) -> token {
        let mut ch = c;
        if ch == ('>' as u8) {
            self.state = ps_normal;
            ret to_end_opening_tag;
        }
        if ch == ('/' as u8) {
            self.state = ps_normal;
            ret to_self_close_tag;
        }
        if !ch.is_alpha() {
            fail #fmt("expected alphabetical in tag but found %c", ch as char);
        }
        // Parse an attribute.
        let mut attribute_name = [ch];
        loop {
            alt self.get() {
                coe_char(c) {
                    if c == ('=' as u8) { break; }
                    attribute_name += [c];
                }
                coe_eof {
                    ret to_attr(attribute_name.to_str(),
                                attribute_name.to_str()); }
            }
        }
        // Parse the attribute value.
        self.expect('"' as u8);
        let mut attribute_value = [];
        loop {
            alt self.get() {
                coe_char(c) {
                    if c == ('"' as u8) { break; }
                    attribute_value += [c];
                }
                coe_eof {
                    ret to_attr(attribute_name.to_str(),
                                attribute_value.to_str());
                }
            }
        }
        // Eat whitespace.
        self.eat_whitespace();
        ret to_attr(attribute_name.to_str(), attribute_value.to_str());
    }
 }
 fn parser(reader: io::reader) -> parser {
    ret { mut lookahead: none, mut state: ps_normal, reader: reader };
 }
 fn spawn_parser_task(filename: str) -> port<token> {
    let result_port = port();
    let result_chan = chan(result_port);
    task::spawn {||
        let file_data = io::read_whole_file(filename).get();
        let reader = io::bytes_reader(file_data);
        let parser = parser(reader);
        loop {
            let token = parser.parse();
            result_chan.send(token);
            if token == to_eof { break; }
        }
    };
    ret result_port;
 }
--- a/src/servo/parser/html_builder.rs
+++ b/src/servo/parser/html_builder.rs
@ -5,8 +5,8 @@ import dom::base::{attr, element, element_subclass, es_div, es_head, es_img};
 import dom::base::{es_unknown, methods, nk_element, nk_text, rd_tree_ops};
 import dom::base::{wr_tree_ops};
 import dom = dom::base;
-import parser = parser::html;
+import parser = parser::lexer::html;
-import html::token;
+import parser::token;
 import gfx::geom;
 import dvec::extensions;
--- a/src/servo/parser/lexer.rs
+++ b/src/servo/parser/lexer.rs
@ -0,0 +1,501 @@
 import comm::{port, chan};
 import html::html_methods;
 import css::css_methods;
 import dom::style;
 // Lexer modes.  parse_html dispatches on the ps_html_* states and
 // parse_css on the ps_css_* states; either entry point fails when the
 // parser is in a state belonging to the other.
 enum parse_state {
    ps_html_normal,     // handled by parse_in_normal_state
    ps_html_tag,        // handled by parse_in_tag_state
    ps_css_elmt,        // handled by parse_css_element
    ps_css_relation,    // handled by parse_css_relation
    ps_css_desc,        // handled by parse_css_description
    ps_css_attribute    // handled by parse_css_attribute
 }
 // Lexer state shared by the HTML and CSS lexers.
 type parser = {
    // At most one pushed-back item; filled by unget, drained by get.
    mut lookahead: option<char_or_eof>,
    // Current lexer mode.
    mut state: parse_state,
    // Source of input bytes.
    reader: io::reader
 };
 // Result of reading one item from the input: a byte, or end of input.
 enum char_or_eof {
    coe_char(u8),
    coe_eof
 }
 impl u8_methods for u8 {
    fn is_whitespace() -> bool {
        ret self == ' ' as u8 || self == '\n' as u8
            || self == '\t' as u8;
    }
    fn is_alpha() -> bool {
        ret (self >= ('A' as u8) && self <= ('Z' as u8)) ||
            (self >= ('a' as u8) && self <= ('z' as u8));
    }
 }
 impl u8_vec_methods for [u8] {
    fn to_str() -> str { ret str::from_bytes(self); }
    fn to_html_token() -> html::token { ret html::to_text(self.to_str()); }
    fn to_css_token() -> html::token { ret html::to_text(self.to_str()); }
 }
 impl util_methods for parser {
    // Return the next input item.  A pushed-back item, if any, is
    // returned first and the lookahead slot is emptied; otherwise one
    // byte is read from the reader, or coe_eof is returned at the end.
    fn get() -> char_or_eof {
        alt self.lookahead {
            some(coe) {
                // Take a copy before clearing the slot it came from.
                let rv = coe;
                self.lookahead = none;
                ret rv;
            }
            none {
                /* fall through */
            }
        }
        if self.reader.eof() { ret coe_eof; }
        ret coe_char(self.reader.read_byte() as u8);
    }
    // Push one byte back so the next get() returns it.  Only a single
    // byte of lookahead is supported: the slot must be empty on entry.
    fn unget(ch: u8) {
        assert self.lookahead.is_none();
        self.lookahead = some(coe_char(ch));
    }
    // Report a lexing error by failing with the given message; this
    // never returns.
    fn parse_err(err: str) -> ! {
        fail err
    }
    // Consume the next byte and fail via parse_err unless it is `ch`.
    // End of input is reported with its own message.
    fn expect(ch: u8) {
        alt self.get() {
            coe_eof {
                self.parse_err(#fmt("expected '%c' at eof", ch as char));
            }
            coe_char(found) {
                if found == ch { ret; }
                self.parse_err(#fmt("expected '%c'", ch as char));
            }
        }
    }
    // Read a run of ASCII letters and return it as a string.  The first
    // non-letter byte is pushed back.  Fails with "expected ident" when
    // no letter is found, and also when the input ends before a
    // terminating non-letter byte is seen.
    fn parse_ident() -> str {
        let mut bytes: [u8] = [];
        loop {
            alt self.get() {
                coe_eof {
                    self.parse_err("expected ident");
                }
                coe_char(c) {
                    if c.is_alpha() {
                        bytes += [c];
                    } else {
                        // A non-letter ends the identifier, which must
                        // not be empty.
                        if bytes.len() == 0u {
                            self.parse_err("expected ident");
                        }
                        self.unget(c);
                        break;
                    }
                }
            }
        }
        ret str::from_bytes(bytes);
    }
    fn expect_ident(expected: str) {
        let actual = self.parse_ident();
        if expected != actual {
            self.parse_err(#fmt("expected '%s' but found '%s'",
                                expected, actual));
        }
    }
    // Skip over any run of whitespace bytes (as defined by
    // u8::is_whitespace).  The first non-whitespace byte is pushed back
    // so the next get() returns it; reaching end of input simply stops.
    fn eat_whitespace() {
        loop {
            alt self.get() {
                coe_char(c) {
                    // Stop at the first byte that is NOT whitespace.
                    // (Testing for whitespace here would stop at once
                    // on a space and discard real input instead.)
                    if !c.is_whitespace() {
                        self.unget(c);
                        ret;
                    }
                }
                coe_eof {
                    ret;
                }
            }
        }
    }
    // Produce the next HTML token.  Reads one byte, returns html::to_eof
    // at end of input, and otherwise hands the byte to the handler for
    // the current HTML state.  Fails if the parser is in a CSS state.
    fn parse_html() -> html::token {
        let mut ch: u8;
        alt self.get() {
            coe_char(c) { ch = c; }
            coe_eof { ret html::to_eof; }
        }
        let token = alt self.state {
          ps_html_normal   { self.parse_in_normal_state(ch) }
          ps_html_tag      { self.parse_in_tag_state(ch) }
          _                { fail "Parsing in html mode when not in " + 
                                "an html state" }
        };
        #debug["token=%?", token];
        ret token;
    }
    // Produce the next CSS token.  Reads one byte, returns css::to_eof
    // at end of input, and otherwise hands the byte to the handler for
    // the current CSS state.  Fails if the parser is in an HTML state.
    fn parse_css() -> css::token {
        let mut ch: u8;
        alt self.get() {
            coe_char(c) { ch = c; }
            coe_eof { ret css::to_eof; }
        }
        let token = alt self.state {
          ps_css_desc        { self.parse_css_description(ch) }
          ps_css_attribute   { self.parse_css_attribute(ch) }
          ps_css_elmt        { self.parse_css_element(ch) }
          ps_css_relation    { self.parse_css_relation(ch) }
          _                  { fail "Parsing in css mode when not in " + 
                                  "a css state" }
        };
        #debug["token=%?", token];
        ret token;
    }
 }
 mod html {
    // Tokens produced by the HTML lexer.
    enum token {
        to_start_opening_tag(str),  // "<" followed by a tag name
        to_end_opening_tag,         // ">" seen while inside a tag
        to_end_tag(str),            // "</name>"
        to_self_close_tag,          // "/" seen while inside a tag
        to_text(str),               // text up to the next "<" or eof
        to_attr(str, str),          // attribute name and value
        to_doctype,                 // "<!DOCTYPE html>"
        to_eof                      // end of input
    }
    impl html_methods for parser {
        // Lex one token while outside a tag.  `c` is the byte already
        // read by parse_html.  A "<" starts a doctype, an end tag or an
        // opening tag; anything else starts a text token.
        fn parse_in_normal_state(c: u8) -> token {
            let mut ch = c;
            if ch == ('<' as u8) {
                // Look at the byte after '<' to decide what follows.
                alt self.get() {
                  coe_char(c) { ch = c; }
                  coe_eof { self.parse_err("eof after '<'") }
                }
                // "<!": only the exact form "DOCTYPE html" is accepted.
                if ch == ('!' as u8) {
                    self.eat_whitespace();
                    self.expect_ident("DOCTYPE");
                    self.eat_whitespace();
                    self.expect_ident("html");
                    self.eat_whitespace();
                    self.expect('>' as u8);
                    ret to_doctype;
                }
                // "</name>": an end tag.
                if ch == ('/' as u8) {
                    let ident = self.parse_ident();
                    self.expect('>' as u8);
                    ret to_end_tag(ident);
                }
                // Otherwise the byte belongs to the tag name: push it
                // back, read the name, and switch to tag state so that
                // the following calls lex the attributes.
                self.unget(ch);
                self.eat_whitespace();
                let ident = self.parse_ident();
                self.eat_whitespace();
                self.state = ps_html_tag;
                ret to_start_opening_tag(ident);
            }
            // Make a text node.
            let mut s: [u8] = [ch];
            loop {
                alt self.get() {
                  coe_char(c) {
                    // The next '<' ends the text; leave it for the
                    // following call.
                    if c == ('<' as u8) {
                        self.unget(c);
                        ret s.to_html_token();
                    }
                    s += [c];
                  }
                  coe_eof { ret s.to_html_token(); }
                }
            }
        }
        // Lex one token while inside an opening tag.  `c` is the byte
        // already read by parse_html.  ">" and "/" leave tag state;
        // a letter starts an attribute of the form name="value".
        fn parse_in_tag_state(c: u8) -> token {
            let mut ch = c;
            if ch == ('>' as u8) {
                self.state = ps_html_normal;
                ret to_end_opening_tag;
            }
            if ch == ('/' as u8) {
                self.state = ps_html_normal;
                ret to_self_close_tag;
            }
            if !ch.is_alpha() {
                fail #fmt("expected alphabetical in tag but found %c", 
                          ch as char);
            }
            // Parse an attribute.
            // Every byte up to the '=' is taken as part of the name.
            let mut attribute_name = [ch];
            loop {
                alt self.get() {
                  coe_char(c) {
                    if c == ('=' as u8) { break; }
                    attribute_name += [c];
                  }
                  // Input ended before '=': the name is used as the
                  // value too.
                  coe_eof {
                    ret to_attr(attribute_name.to_str(),
                                attribute_name.to_str()); }
                }
            }
            // Parse the attribute value.
            // The value must be double-quoted; bytes are collected up to
            // the closing quote (or end of input).
            self.expect('"' as u8);
            let mut attribute_value = [];
            loop {
                alt self.get() {
                  coe_char(c) {
                    if c == ('"' as u8) { break; }
                    attribute_value += [c];
                  }
                  coe_eof {
                    ret to_attr(attribute_name.to_str(),
                                attribute_value.to_str());
                  }
                }
            }
            // Eat whitespace.
            self.eat_whitespace();
            ret to_attr(attribute_name.to_str(), attribute_value.to_str());
        }
    }
 }
 mod css {
    // Tokens produced by the CSS lexer.
    enum token {
        to_start_desc,          // "{"
        // NOTE(review): no code in this module emits to_end_desc --
        // confirm how consumers detect the end of a description.
        to_end_desc,
        to_descendant,          // relation with no explicit symbol
        to_child,               // ">"
        to_sibling,             // "+"
        to_comma,               // ","
        to_elmt(str),           // element name ("*" when implied)
        to_attr(style::attr),   // one attribute test
        to_desc(str, str),      // descriptor name and value
        to_eof                  // end of input
    }
    impl css_methods for parser {
        // Lex the relation between two selectors.  `c` is the first
        // non-whitespace byte after the previous selector.  "{" opens a
        // description; ">", "+" and "," are explicit relations; any
        // other byte means the next selector follows directly, which is
        // the descendant relation.  Whitespace after the relation is
        // skipped, and the state is left ready for the next element (or
        // description).
        fn parse_css_relation(c : u8) -> token {
            self.state = ps_css_elmt;
            let token = alt c {
              '{' as u8  { self.state = ps_css_desc; to_start_desc }
              '>' as u8  { to_child }
              '+' as u8  { to_sibling }
              ',' as u8  { to_comma }
              _          {
                // There is no relation symbol: `c` is already the first
                // byte of the next selector.  Push it back so that the
                // element lexer sees it instead of losing it.
                self.unget(c);
                to_descendant
              }
            };
            self.eat_whitespace();
            ret token;
        }
        // Lex an element name.  `c` is the byte already read by
        // parse_css.  A selector starting with "." or "#" has an implied
        // "*" element, and "*" itself is the universal selector; in all
        // cases the lexer moves on to attribute state afterwards.
        fn parse_css_element(c : u8) -> token {
            /* Check for special attributes with an implied element,
            or the universal selector, which is not alphabetic.*/
            if c == '.' as u8 || c == '#' as u8 {
                self.state = ps_css_attribute;
                // Leave the symbol for the attribute lexer.
                self.unget(c);
                ret to_elmt("*");
            } else if c == '*' as u8 {
                self.state = ps_css_attribute;
                ret to_elmt("*");
            }
            // `c` is the first byte of the name; push it back so that
            // parse_ident reads the whole identifier rather than
            // starting from its second byte.
            self.unget(c);
            let element = self.parse_ident();
            self.state = ps_css_attribute;
            ret to_elmt(element);
        }
        // Lex one attribute test following an element name, or, when
        // whitespace is seen, the relation that follows the selector.
        // `c` is the byte already read by parse_css.
        //
        // ".name" and "#name" become includes("class", name) and
        // includes("id", name); "[a]", "[a=v]", "[a~=v]" and "[a|=v]"
        // become exists, exact, includes and starts_with.
        //
        // NOTE(review): any other non-whitespace byte fails, so a
        // relation symbol written directly after the selector (e.g.
        // "p{") is rejected -- confirm whether that is intended.
        fn parse_css_attribute(c : u8) -> token {
            let mut ch = c;
            /* If we've reached the end of this list of attributes,
            look for the relation to the next element.*/
            if c.is_whitespace() {
                self.state = ps_css_relation;
                self.eat_whitespace();
                alt self.get() {
                  coe_char(c)  { ch = c }
                  coe_eof      { fail "File ended before description " +
                                    "of style" }
                }
                ret self.parse_css_relation(ch);
            }
            alt ch {
              '.' as u8 { ret to_attr(
                  style::includes("class", self.parse_ident())); }
              '#' as u8 { ret to_attr(
                  style::includes("id", self.parse_ident())); }
              '[' as u8 {
                let attr_name = self.parse_ident();
                // The byte after the name selects the kind of test.
                alt self.get() {
                  coe_char(c)    { ch = c; }
                  coe_eof        { fail "File ended before " + 
                                      "description finished"; }
                }
                if ch == ']' as u8 {
                    ret to_attr(style::exists(attr_name));
                } else if ch == '=' as u8 {
                    let attr_val = self.parse_ident();
                    self.expect(']' as u8);
                    ret to_attr(style::exact(attr_name, attr_val));
                } else if ch == '~' as u8 {
                    self.expect('=' as u8);
                    let attr_val = self.parse_ident();
                    self.expect(']' as u8);
                    ret to_attr(style::includes(attr_name, attr_val));
                } else if ch == '|' as u8 {
                    self.expect('=' as u8);
                    let attr_val = self.parse_ident();
                    self.expect(']' as u8);
                    ret to_attr(style::starts_with(attr_name, attr_val));
                }
                fail #fmt("Unexpected symbol %c in attribute", ch as char);
              }
              _   { fail #fmt("Unexpected symbol %c in attribute", 
                              ch as char); }
            }
        }
        // Lex one "name: value" descriptor inside a description and
        // return it as to_desc(name, value).  `c` is the byte already
        // read by parse_css.  The value ends at ";" or "}"; a "}" also
        // switches the lexer back to element state.  Whitespace bytes
        // are never added to the name or the value.
        //
        // NOTE(review): nothing here returns to_end_desc, and a "}"
        // with no descriptor value before it fails -- confirm how an
        // empty or ";"-terminated description is meant to be closed.
        fn parse_css_description(c: u8) -> token {
            let mut ch = c;
            // Skip whitespace before the name.
            if ch.is_whitespace() {
                self.eat_whitespace();
                alt self.get() {
                  coe_char(c)  { ch = c }
                  coe_eof      { fail "Reached end of file " +  
                                    "in CSS description" }
                }
            }
            let mut desc_name = [];
            // Get the name of the descriptor
            loop {
                if ch.is_whitespace() {
                    self.eat_whitespace();
                } else if ch == ':' as u8 {
                    // ':' ends the name, which must not be empty.
                    if desc_name.len() == 0u {
                        fail "Expected descriptor name";
                    } else {
                        break;
                    }
                } else {
                    desc_name += [ch];
                }
                alt self.get() {
                  coe_char(c)  { ch = c }
                  coe_eof      { fail "Reached end of file " +  
                                    "in CSS description" }
                }
            }
            self.eat_whitespace();
            let mut desc_val = [];
            // Get the value of the descriptor
            loop {
                alt self.get() {
                  coe_char(c)  { ch = c }
                  coe_eof      { fail "Reached end of file " +  
                                    "in CSS description" }
                }
                if ch.is_whitespace() {
                    self.eat_whitespace();
                } else if ch == '}' as u8 {
                    // '}' ends both the value and the description.
                    if desc_val.len() == 0u {
                        fail "Expected descriptor value";
                    } else {
                        self.state = ps_css_elmt;
                        break;
                    }
                } else if ch == ';' as u8 {
                    // ';' ends the value; the lexer stays in
                    // description state for the next descriptor.
                    if desc_val.len() == 0u {
                        fail "Expected descriptor value";
                    } else {
                        break;
                    }
                } else {
                    desc_val += [ch];
                }
            }
            ret to_desc(desc_name.to_str(), desc_val.to_str());
        }
    }
 }
 // Build a parser over `reader` that starts in `state`, with no
 // pushed-back input.
 fn parser(reader: io::reader, state : parse_state) -> parser {
    ret { mut lookahead: none, mut state: state, reader: reader };
 }
 // Spawn a task that lexes the HTML file `filename` and sends each token
 // over the returned port, ending with html::to_eof.  The spawned task
 // fails if the name does not end in ".html" or the file cannot be read.
 fn spawn_html_parser_task(filename: str) -> port<html::token> {
    let result_port = port();
    let result_chan = chan(result_port);
    task::spawn {||
        // Check the extension first so that a wrongly named file is
        // rejected before any of it is read.
        assert filename.ends_with(".html");
        let file_data = io::read_whole_file(filename).get();
        let reader = io::bytes_reader(file_data);
        let parser = parser(reader, ps_html_normal);
        loop {
            let token = parser.parse_html();
            result_chan.send(token);
            // The eof token is sent too, so the receiver sees the end.
            if token == html::to_eof { break; }
        }
    };
    ret result_port;
 }
 // Spawn a task that lexes the CSS file `filename` and sends each token
 // over the returned port, ending with css::to_eof.  The spawned task
 // fails if the name does not end in ".css" or the file cannot be read.
 fn spawn_css_parser_task(filename: str) -> port<css::token> {
    let result_port = port();
    let result_chan = chan(result_port);
    task::spawn {||
        // Check the extension first so that a wrongly named file is
        // rejected before any of it is read.
        assert filename.ends_with(".css");
        let file_data = io::read_whole_file(filename).get();
        let reader = io::bytes_reader(file_data);
        let parser : parser = parser(reader, ps_css_elmt);
        loop {
            let token = parser.parse_css();
            result_chan.send(token);
            // The eof token is sent too, so the receiver sees the end.
            if token == css::to_eof { break; }
        }
    };
    ret result_port;
 }
--- a/src/servo/servo.rc
+++ b/src/servo/servo.rc
@ -16,6 +16,7 @@ use stb_image;
 mod dom {
    mod base;
    mod rcu;
    mod style;
 }
 mod gfx {
@ -26,7 +27,7 @@ mod gfx {
 }
 mod image {
-	mod base;
+    mod base;
    mod encode {
        mod tga;
    }
@ -34,7 +35,7 @@ mod image {
 mod layout {
    mod style {
-		mod apply;
+	mod apply;
        mod style;
    }
@ -48,7 +49,7 @@ mod layout {
 }
 mod parser {
-    mod html;
+    mod lexer;
    mod html_builder;
 }
--- a/src/servo/servo.rs
+++ b/src/servo/servo.rs
@ -1,6 +1,6 @@
 import comm::*;
-import parser::html;
+import parser::lexer;
-import parser::html::methods;
+//import parser::lexer::util_methods;
 import result::extensions;
 import gfx::renderer;
 import platform::osmain;