author    Mike Gerwitz <mike.gerwitz@ryansg.com>	2022-03-17 23:22:38 -0400
committer Mike Gerwitz <mike.gerwitz@ryansg.com>	2022-03-17 23:22:38 -0400
commit    150b3b9aa4312a321c0bae0c1dabc32ab38719f2 (patch)
tree      2c3bc20f36735cf2e81af743f43fc8d29efdf7b0 /tamer
parent    f04d8454528223e955e06ff899470d5ac6986ad1 (diff)
tamer: xir::flat: Improve parser validation
This does a couple of things: it ensures that documents have one and only
one root node, and it properly handles dead transitions once parsing is
complete (allowing it to be composed). This should make XIRF
feature-complete for the time being.

It does rely on the assumption that the reader is stripping out any
trailing whitespace, so I guess we'll see if that's true as we proceed.

DEV-10863
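For readers skimming the diff below, here is a minimal sketch of the
document-level state progression this commit introduces. The names
(DocState, Tok, Step, step) are simplified stand-ins for illustration,
not the actual tamer::xir::flat types; the real parser also tracks spans,
attributes, and a fixed-capacity element stack.

// Illustrative sketch only (assumed, simplified names; not the actual
// tamer::xir::flat parser): a document begins in PreRoot, runs inside the
// root element, and once the root closes, further tokens become dead
// transitions so a composing parent parser can take them back.
#[derive(Debug, Default, PartialEq, Eq)]
enum DocState {
    /// Document parsing has not yet begun.
    #[default]
    PreRoot,
    /// Inside the root element; `depth` counts unclosed opening tags.
    InRoot { depth: usize },
    /// The root element has been closed; the document is complete.
    Done,
}

#[derive(Debug, PartialEq, Eq)]
enum Tok {
    Open,
    Close,
    Comment,
}

#[derive(Debug, PartialEq, Eq)]
enum Step {
    /// Token was consumed; continue in the given state.
    To(DocState),
    /// Dead transition: the token was not consumed and is handed back,
    /// allowing a parent parser to continue where this one left off.
    Dead(Tok),
    /// Token is invalid in the current state.
    Err(&'static str),
}

fn step(st: DocState, tok: Tok) -> Step {
    use DocState::*;

    match (st, tok) {
        // Comments are permitted both before and after the root element.
        (st @ (PreRoot | Done), Tok::Comment) => Step::To(st),

        // The first opening tag begins the document.
        (PreRoot, Tok::Open) => Step::To(InRoot { depth: 1 }),
        (PreRoot, _) => Step::Err("opening root element tag expected"),

        // Nested opens and closes adjust the depth;
        // closing the final open tag completes the document.
        (InRoot { depth }, Tok::Open) => Step::To(InRoot { depth: depth + 1 }),
        (InRoot { depth: 1 }, Tok::Close) => Step::To(Done),
        (InRoot { depth }, Tok::Close) => Step::To(InRoot { depth: depth - 1 }),
        (InRoot { depth }, Tok::Comment) => Step::To(InRoot { depth }),

        // Anything after the root has closed is a dead transition,
        // not an error at this level.
        (Done, tok) => Step::Dead(tok),
    }
}

fn main() {
    // <root/> followed by a stray opening tag:
    // the stray token is returned as a dead transition.
    assert_eq!(
        step(DocState::PreRoot, Tok::Open),
        Step::To(DocState::InRoot { depth: 1 })
    );
    assert_eq!(
        step(DocState::InRoot { depth: 1 }, Tok::Close),
        Step::To(DocState::Done)
    );
    assert_eq!(step(DocState::Done, Tok::Open), Step::Dead(Tok::Open));
}

In the actual parser, that dead transition is what surfaces as
ParseError::UnexpectedToken in the tests further down.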
Diffstat (limited to 'tamer')
-rw-r--r--  tamer/src/lib.rs             3
-rw-r--r--  tamer/src/xir/flat.rs       86
-rw-r--r--  tamer/src/xir/flat/test.rs 112
3 files changed, 152 insertions, 49 deletions
diff --git a/tamer/src/lib.rs b/tamer/src/lib.rs
index f20bc04..1510ede 100644
--- a/tamer/src/lib.rs
+++ b/tamer/src/lib.rs
@@ -51,6 +51,9 @@
// We _could_ do without,
// but this provides a nicer API.
#![feature(explicit_generic_args_with_impl_trait)]
+// This simply removes a boilerplate `Default` impl;
+// we can do without if this does not get finalized.
+#![feature(derive_default_enum)]
// We build docs for private items.
#![allow(rustdoc::private_intra_doc_links)]
diff --git a/tamer/src/xir/flat.rs b/tamer/src/xir/flat.rs
index 639c416..37e411d 100644
--- a/tamer/src/xir/flat.rs
+++ b/tamer/src/xir/flat.rs
@@ -28,8 +28,10 @@
//! 1. All closing tags must correspond to a matching opening tag at the
//! same depth;
//! 2. [`Object`] exposes the [`Depth`] of each opening/closing tag;
-//! 3. Attribute tokens are parsed into [`Attr`] objects; and
-//! 4. Parsing will fail if input ends before all elements have been
+//! 3. Attribute tokens are parsed into [`Attr`] objects;
+//! 4. Documents must begin with an element and end with the closing of
+//! that element;
+//! 5. Parsing will fail if input ends before all elements have been
//! closed.
//!
//! XIRF lowering does not perform any dynamic memory allocation;
@@ -118,29 +120,26 @@ where
/// allowing XIRF's parser to avoid memory allocation entirely.
type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;
-/// XIRF parser state.
+/// XIRF document parser state.
///
-/// This parser is a pushdown automaton.
-#[derive(Debug, PartialEq, Eq)]
+/// This parser is a pushdown automaton that parses a single XML document.
+#[derive(Debug, Default, PartialEq, Eq)]
pub enum State<const MAX_DEPTH: usize, SA = AttrParseState>
where
SA: FlatAttrParseState,
{
- // TODO: Ensure that non-comment nodes are not encountered before the
- // root,
- // and that we do not encounter any non-comment nodes after the
- // root.
+ /// Document parsing has not yet begun.
+ #[default]
+ PreRoot,
+
/// Parsing nodes.
NodeExpected(ElementStack<MAX_DEPTH>),
/// Delegating to attribute parser.
AttrExpected(ElementStack<MAX_DEPTH>, SA),
-}
-impl<const MD: usize, SA: FlatAttrParseState> Default for State<MD, SA> {
- fn default() -> Self {
- Self::NodeExpected(Default::default())
- }
+ /// End of document has been reached.
+ Done,
}
impl<const MAX_DEPTH: usize, SA> ParseState for State<MAX_DEPTH, SA>
@@ -152,9 +151,22 @@ where
fn parse_token(self, tok: Token) -> TransitionResult<Self> {
use ParseStatus::{Dead, Incomplete, Object as Obj};
- use State::{AttrExpected, NodeExpected};
+ use State::{AttrExpected, Done, NodeExpected, PreRoot};
match (self, tok) {
+ // Comments are permitted before and after the first root element.
+ (st @ (PreRoot | Done), Token::Comment(sym, span)) => {
+ Transition(st).with(Object::Comment(sym, span))
+ }
+
+ (PreRoot, tok @ Token::Open(..)) => {
+ Self::parse_node(Default::default(), tok)
+ }
+
+ (PreRoot, tok) => {
+ Transition(PreRoot).err(StateError::RootOpenExpected(tok))
+ }
+
(NodeExpected(stack), tok) => Self::parse_node(stack, tok),
(AttrExpected(stack, sa), tok) => match sa.parse_token(tok) {
@@ -169,6 +181,8 @@ where
Transition(AttrExpected(stack, sa)).err(x)
}
},
+
+ (Done, tok) => Transition(Done).dead(tok),
}
}
@@ -182,7 +196,7 @@ where
// TODO: It'd be nice if we could also return additional context to
// aid the user in diagnosing the problem,
// e.g. what element(s) still need closing.
- matches!(self, Self::NodeExpected(stack) if stack.len() == 0)
+ *self == State::Done
}
}
@@ -196,7 +210,7 @@ where
tok: Token,
) -> TransitionResult<Self> {
use Object::*;
- use State::{AttrExpected, NodeExpected};
+ use State::{AttrExpected, Done, NodeExpected};
match tok {
Token::Open(qname, span) if stack.len() == MAX_DEPTH => Transition(
@@ -221,9 +235,7 @@ where
Token::Close(close_oqname, close_span) => {
match (close_oqname, stack.pop()) {
- (_, None) => Transition(NodeExpected(stack)).err(
- StateError::ExtraClosingTag(close_oqname, close_span),
- ),
+ (_, None) => unreachable!("parser should be in Done state"),
(Some(qname), Some((open_qname, open_span)))
if qname != open_qname =>
@@ -236,6 +248,13 @@ where
)
}
+ // Final closing tag (for root node) completes the document.
+ (..) if stack.len() == 0 => Transition(Done).with(Close(
+ close_oqname,
+ close_span,
+ Depth(0),
+ )),
+
(..) => {
let depth = stack.len();
@@ -283,6 +302,9 @@ pub fn parse<const MAX_DEPTH: usize>(
/// Parsing error from [`State`].
#[derive(Debug, Eq, PartialEq)]
pub enum StateError {
+ /// Opening root element tag was expected.
+ RootOpenExpected(Token),
+
/// Opening tag exceeds the maximum nesting depth for this parser.
MaxDepthExceeded { open: (QName, Span), max: Depth },
@@ -293,10 +315,6 @@ pub enum StateError {
close: (QName, Span),
},
- /// Attempt to close a tag with no corresponding opening tag
- /// (which would result in a negative depth).
- ExtraClosingTag(Option<QName>, Span),
-
/// Error from the attribute parser.
AttrError(AttrParseError),
}
@@ -306,6 +324,14 @@ impl Display for StateError {
use StateError::*;
match self {
+ RootOpenExpected(tok) => {
+ write!(
+ f,
+ "opening root element tag expected, \
+ but found {tok}"
+ )
+ }
+
MaxDepthExceeded {
open: (name, span),
max,
@@ -329,18 +355,6 @@ impl Display for StateError {
)
}
- ExtraClosingTag(Some(name), span) => {
- write!(f, "closing tag `{name}` at {span} has no opening tag",)
- }
-
- // If this occurs, its likely that something generated invalid
- // XIR;
- // it should be a parsing error on read and no generator
- // should ever produce this.
- ExtraClosingTag(None, span) => {
- write!(f, "self-closing tag at {span} has no opening tag")
- }
-
AttrError(e) => Display::fmt(e, f),
}
}
diff --git a/tamer/src/xir/flat/test.rs b/tamer/src/xir/flat/test.rs
index 46fca05..2801cb9 100644
--- a/tamer/src/xir/flat/test.rs
+++ b/tamer/src/xir/flat/test.rs
@@ -71,19 +71,25 @@ fn empty_element_balanced_close() {
}
// More closing tags than opening.
+//
+// We cannot keep the token and throw our own error because this tag may be
+// part of a parent context.
#[test]
fn extra_closing_tag() {
let name = ("ns", "openclose").unwrap_into();
- let toks = [Token::Close(Some(name), S)].into_iter();
+ let toks = [
+ // We need an opening tag to actually begin document parsing.
+ Token::Open(name, S),
+ Token::Close(Some(name), S2),
+ Token::Close(Some(name), S3),
+ ]
+ .into_iter();
- let mut sut = parse::<1>(toks);
+ let sut = parse::<1>(toks);
assert_eq!(
- sut.next(),
- Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
- Some(name),
- S,
- ))))
+ Err(ParseError::UnexpectedToken(Token::Close(Some(name), S3),)),
+ sut.collect::<Result<Vec<Parsed<Object>>, _>>()
);
}
@@ -92,15 +98,20 @@ fn extra_closing_tag() {
// gotten to XIRF).
#[test]
fn extra_self_closing_tag() {
- let toks = [Token::Close(None, S)].into_iter();
+ let name = ("ns", "openclose").unwrap_into();
+ let toks = [
+ // We need an opening tag to actually begin document parsing.
+ Token::Open(name, S),
+ Token::Close(None, S2),
+ Token::Close(None, S3),
+ ]
+ .into_iter();
- let mut sut = parse::<1>(toks);
+ let sut = parse::<1>(toks);
assert_eq!(
- sut.next(),
- Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
- None, S,
- ))))
+ Err(ParseError::UnexpectedToken(Token::Close(None, S3),)),
+ sut.collect::<Result<Vec<Parsed<Object>>, _>>()
);
}
@@ -355,3 +366,78 @@ fn not_accepting_state_if_element_open() {
// Element was not closed.
assert_eq!(Some(Err(ParseError::UnexpectedEof(Some(S)))), sut.next());
}
+
+// XML permits comment nodes before and after the document root element.
+#[test]
+fn comment_before_or_after_root_ok() {
+ let name = "root".unwrap_into();
+ let cstart = "start comment".intern();
+ let cend = "end comment".intern();
+
+ let toks = [
+ Token::Comment(cstart, S),
+ Token::Open(name, S2),
+ Token::Close(None, S3),
+ Token::Comment(cend, S4),
+ ]
+ .into_iter();
+
+ let sut = parse::<1>(toks);
+
+ assert_eq!(
+ Ok(vec![
+ Parsed::Object(Object::Comment(cstart, S)),
+ Parsed::Object(Object::Open(name, S2, Depth(0))),
+ Parsed::Object(Object::Close(None, S3, Depth(0))),
+ Parsed::Object(Object::Comment(cend, S4)),
+ ]),
+ sut.collect(),
+ );
+}
+
+// But there must be no content at the end of the document after the closing
+// root node.
+// This does not test every applicable token;
+// you can easily verify the actual implementation at a glance.
+//
+// This is just a dead parser state,
+// since it's possible for XIRF to be composed and we want to return to
+// the parent parser.
+#[test]
+fn content_after_root_close_error() {
+ let name = "root".unwrap_into();
+
+ let toks = [
+ Token::Open(name, S),
+ Token::Close(None, S2),
+ // Document ends here
+ Token::Open(name, S3),
+ ]
+ .into_iter();
+
+ let sut = parse::<1>(toks);
+
+ assert_eq!(
+ Result::<Vec<Parsed<Object>>, _>::Err(ParseError::UnexpectedToken(
+ Token::Open(name, S3)
+ )),
+ sut.collect()
+ );
+}
+
+// Non-comment nodes cannot appear before the opening root tag.
+#[test]
+fn content_before_root_open_error() {
+ let text = "foo".intern();
+
+ let toks = [Token::Text(text, S)].into_iter();
+
+ let sut = parse::<1>(toks);
+
+ assert_eq!(
+ Result::<Vec<Parsed<Object>>, _>::Err(ParseError::StateError(
+ StateError::RootOpenExpected(Token::Text(text, S))
+ )),
+ sut.collect()
+ );
+}