From e0ddaf50df95262a21024b655496399ad1397310 Mon Sep 17 00:00:00 2001 From: Margaret Meyerhofer Date: Wed, 20 Jun 2012 16:28:30 -0700 Subject: [PATCH] Refactored html and css lexing into separate files and capitalized those types --- src/servo/content.rs | 5 +- src/servo/parser/css_builder.rs | 71 ++-- src/servo/parser/css_lexer.rs | 253 +++++++++++++++ src/servo/parser/html_builder.rs | 22 +- src/servo/parser/html_lexer.rs | 171 ++++++++++ src/servo/parser/lexer.rs | 533 ------------------------------- src/servo/parser/lexer_util.rs | 112 +++++++ src/servo/servo.rc | 4 +- src/servo/servo.rs | 1 - 9 files changed, 588 insertions(+), 584 deletions(-) create mode 100644 src/servo/parser/css_lexer.rs create mode 100644 src/servo/parser/html_lexer.rs delete mode 100644 src/servo/parser/lexer.rs create mode 100644 src/servo/parser/lexer_util.rs diff --git a/src/servo/content.rs b/src/servo/content.rs index 190c9618708..d5adf45745c 100644 --- a/src/servo/content.rs +++ b/src/servo/content.rs @@ -14,7 +14,8 @@ import dom::base::NodeScope; import dom::rcu::WriterMethods; import dom::style; import style::print_sheet; -import parser::lexer::{spawn_css_lexer_task, spawn_html_parser_task}; +import parser::css_lexer::spawn_css_lexer_task; +import parser::html_lexer::spawn_html_lexer_task; import parser::css_builder::build_stylesheet; import parser::html_builder::build_dom; import layout::layout_task; @@ -79,7 +80,7 @@ fn Content(layout: Layout) -> Content { // Note: we can parse the next document in parallel // with any previous documents. 
- let stream = spawn_html_parser_task(copy filename); + let stream = spawn_html_lexer_task(copy filename); let root = build_dom(scope, stream); // Collect the css stylesheet diff --git a/src/servo/parser/css_builder.rs b/src/servo/parser/css_builder.rs index 19b1abcca82..5f5f0aac1d3 100644 --- a/src/servo/parser/css_builder.rs +++ b/src/servo/parser/css_builder.rs @@ -4,35 +4,35 @@ // are not as expected import dom::style::*; -import parser::lexer::css::{token, to_start_desc, to_end_desc, - to_descendant, to_child, to_sibling, - to_comma, to_elmt, to_attr, to_desc, - to_eof}; +import parser::css_lexer::{Token, StartDescription, EndDescription, + Descendant, Child, Sibling, + Comma, Element, Attr, Description, + Eof}; import comm::recv; import option::is_none; import util::color::parsing::parse_color; -type token_reader = {stream : port, mut lookahead : option}; +type TokenReader = {stream : port, mut lookahead : option}; -impl methods for token_reader { - fn get() -> token { +impl methods for TokenReader { + fn get() -> Token { alt copy self.lookahead { some(tok) { self.lookahead = none; copy tok } none { recv(self.stream) } } } - fn unget(-tok : token) { + fn unget(-tok : Token) { assert is_none(self.lookahead); self.lookahead = some(tok); } } -fn parse_element(reader : token_reader) -> option<~selector> { +fn parse_element(reader : TokenReader) -> option<~selector> { // Get the current element type let elmt_name = alt reader.get() { - to_elmt(tag) { copy tag } - to_eof { ret none; } + Element(tag) { copy tag } + Eof { ret none; } _ { fail "Expected an element" } }; @@ -42,24 +42,23 @@ fn parse_element(reader : token_reader) -> option<~selector> { loop { let tok = reader.get(); alt tok { - to_attr(attr) { attr_list += [copy attr]; } - to_start_desc | to_descendant | to_child | to_sibling - | to_comma { + Attr(attr) { attr_list += [copy attr]; } + StartDescription | Descendant | Child | Sibling | Comma { reader.unget(tok); break; } - to_eof { ret none; } - 
to_elmt(_) { fail "Unexpected second element without " + + Eof { ret none; } + Element(_) { fail "Unexpected second element without " + "relation to first element"; } - to_end_desc { fail "Unexpected '}'"; } - to_desc(_, _) { fail "Unexpected description"; } + EndDescription { fail "Unexpected '}'"; } + Description(_, _) { fail "Unexpected description"; } } } ret some(~element(elmt_name, attr_list)); } -fn parse_rule(reader : token_reader) -> option<~rule> { +fn parse_rule(reader : TokenReader) -> option<~rule> { let mut sel_list = []; let mut desc_list = []; @@ -75,7 +74,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> { loop { let tok = reader.get(); alt tok { - to_descendant { + Descendant { alt parse_element(reader) { some(elmt) { let built_sel <- cur_sel; @@ -85,7 +84,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> { none { ret none; } } } - to_child { + Child { alt parse_element(reader) { some(elmt) { let built_sel <- cur_sel; @@ -95,7 +94,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> { none { ret none; } } } - to_sibling { + Sibling { alt parse_element(reader) { some(elmt) { let built_sel <- cur_sel; @@ -105,30 +104,30 @@ fn parse_rule(reader : token_reader) -> option<~rule> { none { ret none; } } } - to_start_desc { + StartDescription { let built_sel <- cur_sel; sel_list += [built_sel]; - reader.unget(to_start_desc); + reader.unget(StartDescription); break; } - to_comma { + Comma { let built_sel <- cur_sel; sel_list += [built_sel]; - reader.unget(to_comma); + reader.unget(Comma); break; } - to_attr(_) | to_end_desc | to_elmt(_) | to_desc(_, _) { + Attr(_) | EndDescription | Element(_) | Description(_, _) { fail #fmt["Unexpected token %? 
in elements", tok]; } - to_eof { ret none; } + Eof { ret none; } } } // check if we should break out of the nesting loop as well let tok = reader.get(); alt tok { - to_start_desc { break; } - to_comma { } + StartDescription { break; } + Comma { } _ { reader.unget(tok); } } } @@ -137,8 +136,8 @@ fn parse_rule(reader : token_reader) -> option<~rule> { loop { let tok = reader.get(); alt tok { - to_end_desc { break; } - to_desc(prop, val) { + EndDescription { break; } + Description(prop, val) { alt prop { "font-size" { // TODO, support more ways to declare a font size than # pt @@ -169,9 +168,9 @@ fn parse_rule(reader : token_reader) -> option<~rule> { val]; } } } - to_eof { ret none; } - to_start_desc | to_descendant | to_child | to_sibling - | to_comma | to_elmt(_) | to_attr(_) { + Eof { ret none; } + StartDescription | Descendant | Child | Sibling + | Comma | Element(_) | Attr(_) { fail #fmt["Unexpected token %? in description", tok]; } } @@ -180,7 +179,7 @@ fn parse_rule(reader : token_reader) -> option<~rule> { ret some(~(sel_list, desc_list)); } -fn build_stylesheet(stream : port) -> [~rule] { +fn build_stylesheet(stream : port) -> [~rule] { let mut rule_list = []; let reader = {stream : stream, mut lookahead : none}; diff --git a/src/servo/parser/css_lexer.rs b/src/servo/parser/css_lexer.rs new file mode 100644 index 00000000000..bc0e5c3ed12 --- /dev/null +++ b/src/servo/parser/css_lexer.rs @@ -0,0 +1,253 @@ +import comm::{port, chan}; +import dom::style; +import option::is_none; + +import lexer_util::*; + +enum ParserState { + CssElement, + CssRelation, + CssDescription, + CssAttribute +} + +type CssLexer = { + input_state: InputState, + mut parser_state: ParserState +}; + +enum Token { + StartDescription, + EndDescription, + Descendant, + Child, + Sibling, + Comma, + Element(str), + Attr(style::attr), + Description(str, str), + Eof +} + +impl css_methods for CssLexer { + fn parse_css() -> Token { + let mut ch: u8; + alt self.input_state.get() { + CoeChar(c) { 
ch = c; } + CoeEof { ret Eof; } + } + + let token = alt self.parser_state { + CssDescription { self.parse_css_description(ch) } + CssAttribute { self.parse_css_attribute(ch) } + CssElement { self.parse_css_element(ch) } + CssRelation { self.parse_css_relation(ch) } + }; + + #debug["token=%?", token]; + ret token; + } + + fn parse_css_relation(c : u8) -> Token { + self.parser_state = CssElement; + + let token = alt c { + '{' as u8 { self.parser_state = CssDescription; StartDescription } + '>' as u8 { Child } + '+' as u8 { Sibling } + ',' as u8 { Comma } + _ { self.input_state.unget(c); Descendant } + }; + + self.input_state.eat_whitespace(); + + ret token; + } + + fn parse_css_element(c : u8) -> Token { + assert is_none(self.input_state.lookahead); + + /* Check for special attributes with an implied element, + or a wildcard which is not an alphabetic character.*/ + if c == '.' as u8 || c == '#' as u8 { + self.parser_state = CssAttribute; + self.input_state.unget(c); + ret Element("*"); + } else if c == '*' as u8 { + self.parser_state = CssAttribute; + ret Element("*"); + } + + self.input_state.unget(c); + let element = self.input_state.parse_ident(); + + self.parser_state = CssAttribute; + + ret Element(element); + } + + fn parse_css_attribute(c : u8) -> Token { + let mut ch = c; + + /* If we've reached the end of this list of attributes, + look for the relation to the next element.*/ + if c.is_whitespace() { + self.parser_state = CssRelation; + self.input_state.eat_whitespace(); + + alt self.input_state.get() { + CoeChar(c) { ch = c } + CoeEof { fail "File ended before description of style" } + } + + ret self.parse_css_relation(ch); + } + + alt ch { + '.'
as u8 { ret Attr( + style::includes("class", self.input_state.parse_ident())); } + '#' as u8 { ret Attr( + style::includes("id", self.input_state.parse_ident())); } + '[' as u8 { + let attr_name = self.input_state.parse_ident(); + + alt self.input_state.get() { + CoeChar(c) { ch = c; } + CoeEof { fail "File ended before description finished"; } + } + + if ch == ']' as u8 { + ret Attr(style::exists(attr_name)); + } else if ch == '=' as u8 { + let attr_val = self.input_state.parse_ident(); + self.input_state.expect(']' as u8); + ret Attr(style::exact(attr_name, attr_val)); + } else if ch == '~' as u8 { + self.input_state.expect('=' as u8); + let attr_val = self.input_state.parse_ident(); + self.input_state.expect(']' as u8); + ret Attr(style::includes(attr_name, attr_val)); + } else if ch == '|' as u8 { + self.input_state.expect('=' as u8); + let attr_val = self.input_state.parse_ident(); + self.input_state.expect(']' as u8); + ret Attr(style::starts_with(attr_name, attr_val)); + } + + fail #fmt("Unexpected symbol %c in attribute", ch as char); + } + _ { fail #fmt("Unexpected symbol %c in attribute", ch as char); } + } + } + + fn parse_css_description(c: u8) -> Token { + let mut ch = c; + + if ch == '}' as u8 { + self.parser_state = CssElement; + self.input_state.eat_whitespace(); + ret EndDescription; + } else if ch.is_whitespace() { + self.input_state.eat_whitespace(); + + alt self.input_state.get() { + CoeChar(c) { ch = c } + CoeEof { fail "Reached end of file in CSS description" } + } + } + + let mut desc_name = []; + + // Get the name of the descriptor + loop { + if ch.is_whitespace() { + self.input_state.eat_whitespace(); + } else if ch == ':' as u8 { + if desc_name.len() == 0u { + fail "Expected descriptor name"; + } else { + break; + } + } else { + desc_name += [ch]; + } + + alt self.input_state.get() { + CoeChar(c) { ch = c } + CoeEof { fail "Reached end of file in CSS description" } + } + } + + self.input_state.eat_whitespace(); + let mut desc_val = []; + + 
// Get the value of the descriptor + loop { + alt self.input_state.get() { + CoeChar(c) { ch = c } + CoeEof { fail "Reached end of file in CSS description" } + } + + if ch.is_whitespace() { + self.input_state.eat_whitespace(); + } else if ch == '}' as u8 { + if desc_val.len() == 0u { + fail "Expected descriptor value"; + } else { + self.input_state.unget('}' as u8); + break; + } + } else if ch == ';' as u8 { + if desc_val.len() == 0u { + fail "Expected descriptor value"; + } else { + break; + } + } else { + desc_val += [ch]; + } + } + + ret Description(desc_name.to_str(), desc_val.to_str()); + } +} + +fn parser(reader: io::reader, state : ParserState) -> CssLexer { + ret { input_state: {mut lookahead: none, reader: reader}, + mut parser_state: state }; +} + +#[warn(no_non_implicitly_copyable_typarams)] +fn spawn_css_lexer_task(-filename: ~str) -> port { + let result_port = port(); + let result_chan = chan(result_port); + + task::spawn {|| + assert (*copy filename).ends_with(".css"); + let file_try = io::read_whole_file(*filename); + + // Check if the given css file existed, if it does, parse it, + // otherwise just send an eof. This is a hack to allow + // guessing that if foo.html exists, foo.css is the + // corresponding stylesheet. 
+ if file_try.is_success() { + #debug["Lexing css sheet %s", *copy filename]; + let file_data = file_try.get(); + let reader = io::bytes_reader(file_data); + + let lexer = parser(reader, CssElement); + + loop { + let token = lexer.parse_css(); + let should_break = token == Eof; + result_chan.send(token); + if should_break { break; } + } + } else { + #debug["Failed to open css sheet %s", *copy filename]; + result_chan.send(Eof); + } + }; + + ret result_port; +} diff --git a/src/servo/parser/html_builder.rs b/src/servo/parser/html_builder.rs index db9df7a5e6f..eb02e142588 100644 --- a/src/servo/parser/html_builder.rs +++ b/src/servo/parser/html_builder.rs @@ -7,8 +7,8 @@ import dom::rcu::WriterMethods; import geom::size::Size2D; import gfx::geometry; import gfx::geometry::au; -import parser = parser::lexer::html; -import parser::token; +import parser = parser::html_lexer; +import parser::Token; import dvec::extensions; @@ -66,41 +66,41 @@ fn build_element_kind(tag_name: str) -> ~ElementKind { } } -fn build_dom(scope: NodeScope, stream: port) -> Node { +fn build_dom(scope: NodeScope, stream: port) -> Node { // The current reference node. let mut cur = scope.new_node(Element(ElementData("html", ~HTMLDivElement))); loop { let token = stream.recv(); alt token { - parser::to_eof { break; } - parser::to_start_opening_tag(tag_name) { + parser::Eof { break; } + parser::StartOpeningTag(tag_name) { #debug["starting tag %s", tag_name]; let element_kind = build_element_kind(tag_name); let new_node = scope.new_node(Element(ElementData(copy tag_name, element_kind))); scope.add_child(cur, new_node); cur = new_node; } - parser::to_attr(key, value) { + parser::Attr(key, value) { #debug["attr: %? 
= %?", key, value]; link_up_attribute(scope, cur, copy key, copy value); } - parser::to_end_opening_tag { + parser::EndOpeningTag { #debug("end opening tag"); } - parser::to_end_tag(_) | parser::to_self_close_tag { + parser::EndTag(_) | parser::SelfCloseTag { // TODO: Assert that the closing tag has the right name. // TODO: Fail more gracefully (i.e. according to the HTML5 // spec) if we close more tags than we open. cur = scope.get_parent(cur).get(); } - parser::to_text(s) if !s.is_whitespace() { + parser::Text(s) if !s.is_whitespace() { let new_node = scope.new_node(Text(copy s)); scope.add_child(cur, new_node); } - parser::to_text(_) { + parser::Text(_) { // FIXME: Whitespace should not be ignored. } - parser::to_doctype { + parser::Doctype { // TODO: Do something here... } } diff --git a/src/servo/parser/html_lexer.rs b/src/servo/parser/html_lexer.rs new file mode 100644 index 00000000000..75a1f7e966a --- /dev/null +++ b/src/servo/parser/html_lexer.rs @@ -0,0 +1,171 @@ +import comm::{port, chan}; +import dom::style; +import option::is_none; +import lexer_util::*; + +enum Token { + StartOpeningTag(str), + EndOpeningTag, + EndTag(str), + SelfCloseTag, + Text(str), + Attr(str, str), + Doctype, + Eof +} + +enum ParseState { + NormalHtml, + TagHtml, +} + +type HtmlLexer = { + input_state: InputState, + mut parser_state: ParseState +}; + +impl html_methods for HtmlLexer { + fn parse_html() -> Token { + let mut ch: u8; + alt self.input_state.get() { + CoeChar(c) { ch = c; } + CoeEof { ret Eof; } + } + let token = alt self.parser_state { + NormalHtml { self.parse_in_normal_state(ch) } + TagHtml { self.parse_in_tag_state(ch) } + }; + + #debug["token=%?", token]; + ret token; + } + + fn parse_in_normal_state(c: u8) -> Token { + let mut ch = c; + if ch == ('<' as u8) { + alt self.input_state.get() { + CoeChar(c) { ch = c; } + CoeEof { self.input_state.parse_err("eof after '<'") } + } + + if ch == ('!' 
as u8) { + self.input_state.eat_whitespace(); + self.input_state.expect_ident("DOCTYPE"); + self.input_state.eat_whitespace(); + self.input_state.expect_ident("html"); + self.input_state.eat_whitespace(); + self.input_state.expect('>' as u8); + ret Doctype; + } + + if ch == ('/' as u8) { + let ident = self.input_state.parse_ident(); + self.input_state.expect('>' as u8); + ret EndTag(ident); + } + + self.input_state.unget(ch); + + self.input_state.eat_whitespace(); + let ident = self.input_state.parse_ident(); + self.input_state.eat_whitespace(); + + self.parser_state = TagHtml; + ret StartOpeningTag(ident); + } + + // Make a text node. + let mut s: [u8] = [ch]; + loop { + alt self.input_state.get() { + CoeChar(c) { + if c == ('<' as u8) { + self.input_state.unget(c); + ret s.to_html_token(); + } + s += [c]; + } + CoeEof { ret s.to_html_token(); } + } + } + } + + fn parse_in_tag_state(c: u8) -> Token { + let mut ch = c; + + if ch == ('>' as u8) { + self.parser_state = NormalHtml; + ret EndOpeningTag; + } + + if ch == ('/' as u8) { + self.parser_state = NormalHtml; + ret SelfCloseTag; + } + + if !ch.is_alpha() { + fail #fmt("expected alphabetical in tag but found %c", ch as char); + } + + // Parse an attribute. + let mut attribute_name = [ch]; + loop { + alt self.input_state.get() { + CoeChar(c) { + if c == ('=' as u8) { break; } + attribute_name += [c]; + } + CoeEof { + ret Attr(attribute_name.to_str(), + attribute_name.to_str()); } + } + } + + // Parse the attribute value. + self.input_state.expect('"' as u8); + let mut attribute_value = []; + loop { + alt self.input_state.get() { + CoeChar(c) { + if c == ('"' as u8) { break; } + attribute_value += [c]; + } + CoeEof { + ret Attr(attribute_name.to_str(), + attribute_value.to_str()); + } + } + } + + // Eat whitespace.
+ self.input_state.eat_whitespace(); + + ret Attr(attribute_name.to_str(), attribute_value.to_str()); + } +} + +fn lexer(reader: io::reader, state : ParseState) -> HtmlLexer { + ret { input_state: {mut lookahead: none, reader: reader}, + mut parser_state: state }; +} + +#[warn(no_non_implicitly_copyable_typarams)] +fn spawn_html_lexer_task(-filename: ~str) -> port { + let result_port = port(); + let result_chan = chan(result_port); + task::spawn {|| + assert (*copy filename).ends_with(".html"); + let file_data = io::read_whole_file(*filename).get(); + let reader = io::bytes_reader(file_data); + + let lexer = lexer(reader, NormalHtml); + + loop { + let token = lexer.parse_html(); + let should_break = token == Eof; + result_chan.send(token); + if should_break { break; } + } + }; + ret result_port; +} diff --git a/src/servo/parser/lexer.rs b/src/servo/parser/lexer.rs deleted file mode 100644 index 0ed11685254..00000000000 --- a/src/servo/parser/lexer.rs +++ /dev/null @@ -1,533 +0,0 @@ -import comm::{port, chan}; -import html::html_methods; -import css::css_methods; -import dom::style; -import option::is_none; - -enum parse_state { - ps_html_normal, - ps_html_tag, - ps_css_elmt, - ps_css_relation, - ps_css_desc, - ps_css_attribute -} - -type parser = { - mut lookahead: option, - mut state: parse_state, - reader: io::reader -}; - -enum char_or_eof { - coe_char(u8), - coe_eof -} - -impl u8_methods for u8 { - fn is_whitespace() -> bool { - ret self == ' ' as u8 || self == '\n' as u8 - || self == '\t' as u8; - } - - fn is_alpha() -> bool { - ret (self >= ('A' as u8) && self <= ('Z' as u8)) || - (self >= ('a' as u8) && self <= ('z' as u8)); - } -} - -impl u8_vec_methods for [u8] { - fn to_str() -> str { ret str::from_bytes(self); } - fn to_html_token() -> html::token { ret html::to_text(self.to_str()); } - fn to_css_token() -> html::token { ret html::to_text(self.to_str()); } -} - -impl util_methods for parser { - fn get() -> char_or_eof { - alt copy self.lookahead { - 
some(coe) { - let rv = coe; - self.lookahead = none; - ret rv; - } - none { - /* fall through */ - } - } - - if self.reader.eof() { ret coe_eof; } - ret coe_char(self.reader.read_byte() as u8); - } - - fn unget(ch: u8) { - assert is_none(self.lookahead); - self.lookahead = some(coe_char(ch)); - } - - fn parse_err(err: str) -> ! { - fail err - } - - fn expect(ch: u8) { - alt self.get() { - coe_char(c) { - if c != ch { - self.parse_err(#fmt("expected '%c'", ch as char)); - } - } - coe_eof { - self.parse_err(#fmt("expected '%c' at eof", ch as char)); - } - } - } - - fn parse_ident() -> str { - let mut result: [u8] = []; - loop { - alt self.get() { - coe_char(c) { - if (c.is_alpha()) { - result += [c]; - } else if result.len() == 0u { - self.parse_err("expected ident"); - } else { - self.unget(c); - break; - } - } - coe_eof { - self.parse_err("expected ident"); - } - } - } - ret str::from_bytes(result); - } - - fn expect_ident(expected: str) { - let actual = self.parse_ident(); - if expected != actual { - self.parse_err(#fmt("expected '%s' but found '%s'", - expected, actual)); - } - } - - fn eat_whitespace() { - loop { - alt self.get() { - coe_char(c) { - if !c.is_whitespace() { - self.unget(c); - ret; - } - } - coe_eof { - ret; - } - } - } - } - - fn parse_html() -> html::token { - let mut ch: u8; - alt self.get() { - coe_char(c) { ch = c; } - coe_eof { ret html::to_eof; } - } - - let token = alt self.state { - ps_html_normal { self.parse_in_normal_state(ch) } - ps_html_tag { self.parse_in_tag_state(ch) } - _ { fail "Parsing in html mode when not in " + - "an html state" } - }; - - #debug["token=%?", token]; - ret token; - } - - fn parse_css() -> css::token { - let mut ch: u8; - alt self.get() { - coe_char(c) { ch = c; } - coe_eof { ret css::to_eof; } - } - - let token = alt self.state { - ps_css_desc { self.parse_css_description(ch) } - ps_css_attribute { self.parse_css_attribute(ch) } - ps_css_elmt { self.parse_css_element(ch) } - ps_css_relation { 
self.parse_css_relation(ch) } - _ { fail "Parsing in css mode when not in " + - "a css state" } - }; - - #debug["token=%?", token]; - ret token; - } -} - -mod html { - enum token { - to_start_opening_tag(str), - to_end_opening_tag, - to_end_tag(str), - to_self_close_tag, - to_text(str), - to_attr(str, str), - to_doctype, - to_eof - } - - impl html_methods for parser { - fn parse_in_normal_state(c: u8) -> token { - let mut ch = c; - if ch == ('<' as u8) { - alt self.get() { - coe_char(c) { ch = c; } - coe_eof { self.parse_err("eof after '<'") } - } - - if ch == ('!' as u8) { - self.eat_whitespace(); - self.expect_ident("DOCTYPE"); - self.eat_whitespace(); - self.expect_ident("html"); - self.eat_whitespace(); - self.expect('>' as u8); - ret to_doctype; - } - - if ch == ('/' as u8) { - let ident = self.parse_ident(); - self.expect('>' as u8); - ret to_end_tag(ident); - } - - self.unget(ch); - - self.eat_whitespace(); - let ident = self.parse_ident(); - self.eat_whitespace(); - - self.state = ps_html_tag; - ret to_start_opening_tag(ident); - } - - // Make a text node. - let mut s: [u8] = [ch]; - loop { - alt self.get() { - coe_char(c) { - if c == ('<' as u8) { - self.unget(c); - ret s.to_html_token(); - } - s += [c]; - } - coe_eof { ret s.to_html_token(); } - } - } - } - - fn parse_in_tag_state(c: u8) -> token { - let mut ch = c; - - if ch == ('>' as u8) { - self.state = ps_html_normal; - ret to_end_opening_tag; - } - - if ch == ('/' as u8) { - self.state = ps_html_normal; - ret to_self_close_tag; - } - - if !ch.is_alpha() { - fail #fmt("expected alphabetical in tag but found %c", - ch as char); - } - - // Parse an attribute. - let mut attribute_name = [ch]; - loop { - alt self.get() { - coe_char(c) { - if c == ('=' as u8) { break; } - attribute_name += [c]; - } - coe_eof { - ret to_attr(attribute_name.to_str(), - attribute_name.to_str()); } - } - } - - // Parse the attribute value. 
- self.expect('"' as u8); - let mut attribute_value = []; - loop { - alt self.get() { - coe_char(c) { - if c == ('"' as u8) { break; } - attribute_value += [c]; - } - coe_eof { - ret to_attr(attribute_name.to_str(), - attribute_value.to_str()); - } - } - } - - // Eat whitespacpe. - self.eat_whitespace(); - - ret to_attr(attribute_name.to_str(), attribute_value.to_str()); - } - } -} - -mod css { - enum token { - to_start_desc, - to_end_desc, - to_descendant, - to_child, - to_sibling, - to_comma, - to_elmt(str), - to_attr(style::attr), - to_desc(str, str), - to_eof - } - - impl css_methods for parser { - fn parse_css_relation(c : u8) -> token { - self.state = ps_css_elmt; - - let token = alt c { - '{' as u8 { self.state = ps_css_desc; to_start_desc } - '>' as u8 { to_child } - '+' as u8 { to_sibling } - ',' as u8 { to_comma } - _ { self.unget(c); to_descendant } - }; - - self.eat_whitespace(); - - ret token; - } - - fn parse_css_element(c : u8) -> token { - assert is_none(self.lookahead); - - /* Check for special attributes with an implied element, - or a wildcard which is not a alphabet character.*/ - if c == '.' as u8 || c == '#' as u8 { - self.state = ps_css_attribute; - self.unget(c); - ret to_elmt("*"); - } else if c == '*' as u8 { - self.state = ps_css_attribute; - ret to_elmt("*"); - } - - self.unget(c); - let element = self.parse_ident(); - - self.state = ps_css_attribute; - - ret to_elmt(element); - } - - fn parse_css_attribute(c : u8) -> token { - let mut ch = c; - - /* If we've reached the end of this list of attributes, - look for the relation to the next element.*/ - if c.is_whitespace() { - self.state = ps_css_relation; - self.eat_whitespace(); - - alt self.get() { - coe_char(c) { ch = c } - coe_eof { fail "File ended before description " + - "of style" } - } - - ret self.parse_css_relation(ch); - } - - alt ch { - '.' 
as u8 { ret to_attr( - style::includes("class", self.parse_ident())); } - '#' as u8 { ret to_attr( - style::includes("id", self.parse_ident())); } - '[' as u8 { - let attr_name = self.parse_ident(); - - alt self.get() { - coe_char(c) { ch = c; } - coe_eof { fail "File ended before " + - "description finished"; } - } - - if ch == ']' as u8 { - ret to_attr(style::exists(attr_name)); - } else if ch == '=' as u8 { - let attr_val = self.parse_ident(); - self.expect(']' as u8); - ret to_attr(style::exact(attr_name, attr_val)); - } else if ch == '~' as u8 { - self.expect('=' as u8); - let attr_val = self.parse_ident(); - self.expect(']' as u8); - ret to_attr(style::includes(attr_name, attr_val)); - } else if ch == '|' as u8 { - self.expect('=' as u8); - let attr_val = self.parse_ident(); - self.expect(']' as u8); - ret to_attr(style::starts_with(attr_name, attr_val)); - } - - fail #fmt("Unexpected symbol %c in attribute", ch as char); - } - _ { fail #fmt("Unexpected symbol %c in attribute", - ch as char); } - } - } - - fn parse_css_description(c: u8) -> token { - let mut ch = c; - - if ch == '}' as u8 { - self.state = ps_css_elmt; - self.eat_whitespace(); - ret to_end_desc; - } else if ch.is_whitespace() { - self.eat_whitespace(); - - alt self.get() { - coe_char(c) { ch = c } - coe_eof { fail "Reached end of file " + - "in CSS description" } - } - } - - let mut desc_name = []; - - // Get the name of the descriptor - loop { - if ch.is_whitespace() { - self.eat_whitespace(); - } else if ch == ':' as u8 { - if desc_name.len() == 0u { - fail "Expected descriptor name"; - } else { - break; - } - } else { - desc_name += [ch]; - } - - alt self.get() { - coe_char(c) { ch = c } - coe_eof { fail "Reached end of file " + - "in CSS description" } - } - } - - self.eat_whitespace(); - let mut desc_val = []; - - // Get the value of the descriptor - loop { - alt self.get() { - coe_char(c) { ch = c } - coe_eof { fail "Reached end of file " + - "in CSS description" } - } - - if 
ch.is_whitespace() { - self.eat_whitespace(); - } else if ch == '}' as u8 { - if desc_val.len() == 0u { - fail "Expected descriptor value"; - } else { - self.unget('}' as u8); - break; - } - } else if ch == ';' as u8 { - if desc_val.len() == 0u { - fail "Expected descriptor value"; - } else { - break; - } - } else { - desc_val += [ch]; - } - } - - ret to_desc(desc_name.to_str(), desc_val.to_str()); - } - } -} - -fn parser(reader: io::reader, state : parse_state) -> parser { - ret { mut lookahead: none, mut state: state, reader: reader }; -} - -#[warn(no_non_implicitly_copyable_typarams)] -fn spawn_html_parser_task(-filename: ~str) -> port { - let result_port = port(); - let result_chan = chan(result_port); - task::spawn {|| - let filename = copy *filename; - assert (copy filename).ends_with(".html"); - let file_data = io::read_whole_file(filename).get(); - let reader = io::bytes_reader(file_data); - - let parser = parser(reader, ps_html_normal); - - loop { - let token = parser.parse_html(); - let should_break = token == html::to_eof; - result_chan.send(token); - if should_break { break; } - } - }; - ret result_port; -} - -#[warn(no_non_implicitly_copyable_typarams)] -fn spawn_css_lexer_task(-filename: ~str) -> port { - let result_port = port(); - let result_chan = chan(result_port); - task::spawn {|| - let filename = copy *filename; - - assert (copy filename).ends_with(".css"); - let file_try = io::read_whole_file(filename); - - // Check if the given css file existed, if it does, parse it, - // otherwise just send an eof. This is a hack to allow - // guessing that if foo.html exists, foo.css is the - // corresponding stylesheet. 
- if file_try.is_success() { - #debug["Lexing css sheet %s", filename]; - let file_data = file_try.get(); - let reader = io::bytes_reader(file_data); - - let parser : parser = parser(reader, ps_css_elmt); - - loop { - let token = parser.parse_css(); - let should_break = token == css::to_eof; - result_chan.send(token); - if should_break { break; } - } - } else { - #debug["Failed to open css sheet %s", filename]; - result_chan.send(css::to_eof); - } - }; - ret result_port; -} diff --git a/src/servo/parser/lexer_util.rs b/src/servo/parser/lexer_util.rs new file mode 100644 index 00000000000..86f4082a49e --- /dev/null +++ b/src/servo/parser/lexer_util.rs @@ -0,0 +1,112 @@ +import option::is_none; + +enum CharOrEof { + CoeChar(u8), + CoeEof +} + +type InputState = { + mut lookahead: option, + reader: io::reader +}; + +impl u8_methods for u8 { + fn is_whitespace() -> bool { + ret self == ' ' as u8 || self == '\n' as u8 || self == '\t' as u8; + } + + fn is_alpha() -> bool { + ret (self >= ('A' as u8) && self <= ('Z' as u8)) || + (self >= ('a' as u8) && self <= ('z' as u8)); + } +} + +impl u8_vec_methods for [u8] { + fn to_html_token() -> html_lexer::Token { ret html_lexer::Text(self.to_str()); } + fn to_str() -> str { ret str::from_bytes(self); } +} + +impl util_methods for InputState { + fn get() -> CharOrEof { + alt copy self.lookahead { + some(coe) { + let rv = coe; + self.lookahead = none; + ret rv; + } + none { + /* fall through */ + } + } + + if self.reader.eof() { ret CoeEof; } + ret CoeChar(self.reader.read_byte() as u8); + } + + fn unget(ch: u8) { + assert is_none(self.lookahead); + self.lookahead = some(CoeChar(ch)); + } + + fn parse_err(err: str) -> ! 
{ + fail err + } + + fn expect(ch: u8) { + alt self.get() { + CoeChar(c) { + if c != ch { + self.parse_err(#fmt("expected '%c'", ch as char)); + } + } + CoeEof { + self.parse_err(#fmt("expected '%c' at eof", ch as char)); + } + } + } + + fn parse_ident() -> str { + let mut result: [u8] = []; + loop { + alt self.get() { + CoeChar(c) { + if (c.is_alpha()) { + result += [c]; + } else if result.len() == 0u { + self.parse_err("expected ident"); + } else { + self.unget(c); + break; + } + } + CoeEof { + self.parse_err("expected ident"); + } + } + } + ret str::from_bytes(result); + } + + fn expect_ident(expected: str) { + let actual = self.parse_ident(); + if expected != actual { + self.parse_err(#fmt("expected '%s' but found '%s'", expected, actual)); + } + } + + fn eat_whitespace() { + loop { + alt self.get() { + CoeChar(c) { + if !c.is_whitespace() { + self.unget(c); + ret; + } + } + CoeEof { + ret; + } + } + } + } +} diff --git a/src/servo/servo.rc b/src/servo/servo.rc index 4983a7edffd..99ca26c99ff 100755 --- a/src/servo/servo.rc +++ b/src/servo/servo.rc @@ -51,7 +51,9 @@ mod layout { } mod parser { - mod lexer; + mod lexer_util; + mod css_lexer; + mod html_lexer; mod html_builder; mod css_builder; } diff --git a/src/servo/servo.rs b/src/servo/servo.rs index b9ec97eb8b3..1e12780e7f3 100644 --- a/src/servo/servo.rs +++ b/src/servo/servo.rs @@ -1,5 +1,4 @@ import comm::*; -import parser::lexer; import result::extensions; import gfx::renderer; import platform::osmain;