Mike Gerwitz

Activist for User Freedom

aboutsummaryrefslogtreecommitdiffstats
path: root/tamer
diff options
context:
space:
mode:
author: Mike Gerwitz <mike.gerwitz@ryansg.com> 2022-04-07 12:08:51 -0400
committer: Mike Gerwitz <mike.gerwitz@ryansg.com> 2022-04-07 12:13:49 -0400
commit: 99aacaf7ca723e90c614b594593d9fd0bf1fa190 (patch)
tree: a0b680258cafbeffaaf8d480647db75d78d1facd /tamer
parent: b90bf9d8a8aad714bdbb989813df1652a337d583 (diff)
download: tame-99aacaf7ca723e90c614b594593d9fd0bf1fa190.tar.gz
tame-99aacaf7ca723e90c614b594593d9fd0bf1fa190.tar.bz2
tame-99aacaf7ca723e90c614b594593d9fd0bf1fa190.zip
tamer: tamec: Replace copy with XIR parsing/writing
When wip-frontends is on, this will parse the input file using XIR and then immediately output it again. This makes the necessary changes to be able to read every source file we have in our largest project, such that the output is identical after having been formatted with `xmllint --format -` (there are differences because e.g. whitespace between attributes is not yet maintained). This is performant too, with times remaining essentially identical despite the additional work. DEV-10413
Diffstat (limited to 'tamer')
-rw-r--r-- tamer/src/bin/tamec.rs 40
-rw-r--r-- tamer/src/xir/error.rs 42
-rw-r--r-- tamer/src/xir/reader.rs 57
-rw-r--r-- tamer/src/xir/reader/test.rs 49
4 files changed, 160 insertions, 28 deletions
diff --git a/tamer/src/bin/tamec.rs b/tamer/src/bin/tamec.rs
index 21c78d4..9dc9f84 100644
--- a/tamer/src/bin/tamec.rs
+++ b/tamer/src/bin/tamec.rs
@@ -23,18 +23,18 @@
extern crate tamer;
use getopts::{Fail, Options};
-use std::env;
use std::error::Error;
use std::ffi::OsStr;
use std::fs;
use std::path::Path;
+use std::{env, io::BufWriter};
+use tamer::{
+ iter::into_iter_while_ok,
+ xir::{reader::XmlXirReader, DefaultEscaper},
+};
#[cfg(feature = "wip-frontends")]
-use {
- std::io::BufReader,
- tamer::frontend::{FrontendEvent, FrontendParser, XmlFrontendParser},
- tamer::fs::File,
-};
+use {std::io::BufReader, tamer::fs::File};
/// Types of commands
enum Command {
@@ -58,24 +58,26 @@ pub fn main() -> Result<(), Box<dyn Error>> {
let dest = Path::new(&output);
- // This will eventually replace `fs::copy` below.
+ #[cfg(not(feature = "wip-frontends"))]
+ fs::copy(source, dest)?;
+
#[cfg(feature = "wip-frontends")]
{
+ use tamer::xir::writer::XmlWriter;
+
+ let escaper = DefaultEscaper::default();
let file: BufReader<fs::File> = File::open(source)?;
- let mut parser = XmlFrontendParser::new(file);
-
- // Parse all the way through, but don't do anything with it
- // yet.
- loop {
- match parser.parse_next()? {
- FrontendEvent::Eof => break,
- _ => continue,
- }
- }
+ let mut fout = BufWriter::new(fs::File::create(dest)?);
+
+ // Parse into XIR and re-lower into XML,
+ // which is similar to a copy but proves that we're able
+ // to parse source files.
+ into_iter_while_ok(
+ XmlXirReader::new(file, &escaper),
+ |toks| toks.write(&mut fout, Default::default(), &escaper),
+ )??;
}
- fs::copy(source, dest)?;
-
Ok(())
}
Ok(Command::Usage) => {
diff --git a/tamer/src/xir/error.rs b/tamer/src/xir/error.rs
index c31edbc..d2c218a 100644
--- a/tamer/src/xir/error.rs
+++ b/tamer/src/xir/error.rs
@@ -19,7 +19,7 @@
//! XIR error information.
-use crate::tpwrap::quick_xml;
+use crate::{span::Span, sym::SymbolId, tpwrap::quick_xml};
use std::{fmt::Display, str::Utf8Error};
/// Error attempting to produce a XIR object.
@@ -31,12 +31,23 @@ pub enum Error {
NotWhitespace(String),
/// Provided QName is not valid.
InvalidQName(Vec<u8>),
- // A UTF-8 error together with the byte slice that caused it.
- //
- // By storing the raw bytes instead of a string,
- // we allow the displayer to determine how to handle invalid UTF-8
- // encodings.
+ /// A UTF-8 error together with the byte slice that caused it.
+ ///
+ /// By storing the raw bytes instead of a string,
+ /// we allow the displayer to determine how to handle invalid UTF-8
+ /// encodings.
InvalidUtf8(Utf8Error, Vec<u8>),
+ /// XML 1.0 only.
+ ///
+ /// Other versions are not widely in use
+ /// (only 1.1 exists at the time of writing)
+ /// and providing that is either in error,
+ /// copy/paste,
+ /// or the user is expecting something they're not going to get.
+ UnsupportedXmlVersion(SymbolId, Span),
+ /// TAMER expects UTF-8 encoding for everything,
+ /// which should not be an unreasonable expectation.
+ UnsupportedEncoding(SymbolId, Span),
// TODO: Better error translation and spans.
QuickXmlError(quick_xml::Error),
@@ -66,6 +77,23 @@ impl Display for Error {
String::from_utf8_lossy(bytes)
)
}
+ Self::UnsupportedXmlVersion(ver, span) => {
+ write!(
+ f,
+ "expected XML version `1.0` at {span}, \
+ but found unsupported version `{ver}`"
+ )
+ }
+ Self::UnsupportedEncoding(enc, span) => {
+ // TODO: when we have hints,
+ // indicate that they can also entirely remove this
+ // attribute to resolve the error
+ write!(
+ f,
+ "expected `utf-8` or `UTF-8` encoding at {span}, \
+ but found unsupported encoding `{enc}`"
+ )
+ }
// TODO: See Error TODO
Self::QuickXmlError(inner) => {
write!(f, "internal parser error: {:?}", inner)
@@ -91,6 +119,6 @@ impl From<(Utf8Error, &[u8])> for Error {
impl<E: Into<quick_xml::Error>> From<E> for Error {
fn from(err: E) -> Self {
- Self::QuickXmlError(err.into().into())
+ Self::QuickXmlError(err.into())
}
}
diff --git a/tamer/src/xir/reader.rs b/tamer/src/xir/reader.rs
index 46e0e9d..141f9de 100644
--- a/tamer/src/xir/reader.rs
+++ b/tamer/src/xir/reader.rs
@@ -22,10 +22,15 @@
//! This uses [`quick_xml`] as the parser.
use super::{DefaultEscaper, Error, Escaper, Token};
-use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes};
+use crate::{
+ span::{DUMMY_SPAN, UNKNOWN_SPAN},
+ sym::GlobalSymbolInternBytes,
+};
use quick_xml::{
self,
- events::{attributes::Attributes, BytesStart, Event as QuickXmlEvent},
+ events::{
+ attributes::Attributes, BytesDecl, BytesStart, Event as QuickXmlEvent,
+ },
};
use std::{collections::VecDeque, io::BufRead, result};
@@ -163,11 +168,59 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
.map(|text| Token::Comment(text, DUMMY_SPAN)),
),
+ // TODO: This must appear in the Prolog.
+ QuickXmlEvent::Decl(decl) => match Self::validate_decl(&decl) {
+ Err(x) => Some(Err(x)),
+ Ok(()) => self.refill_buf(),
+ },
+
+ // We do not support processing instructions.
+ // TODO: Convert this into an error/warning?
+ // Previously `xml-stylesheet` was present in some older
+ // source files and may linger for a bit after cleanup.
+ QuickXmlEvent::PI(..) => self.refill_buf(),
+
x => todo!("event: {:?}", x),
},
}
}
+ /// Validate that an XML declaration contains expected values.
+ ///
+ /// A declaration looks like `<?xml version="1.0" encoding="utf-8"?>`,
+ /// where `@encoding` is optional but `@version` is not.
+ /// It may also contain `@standalone`,
+ /// but we do not check for that.
+ ///
+ /// We expect version 1.0 and UTF-8 encoding.
+ /// Failing when these expectations are violated helps to ensure that
+ /// people unfamiliar with the system do not have expectations that
+ /// are going to be unmet,
+ /// which may result in subtle (or even serious) problems.
+ fn validate_decl(decl: &BytesDecl) -> Result<()> {
+ // NB: `quick-xml` docs state that `version` returns the quotes,
+ // but it does not.
+ let ver = &decl.version()?[..];
+ if ver != b"1.0" {
+ Err(Error::UnsupportedXmlVersion(
+ ver.intern_utf8()?,
+ UNKNOWN_SPAN,
+ ))?
+ }
+
+ if let Some(enc) = decl.encoding() {
+ match &enc?[..] {
+ b"utf-8" | b"UTF-8" => (),
+ invalid => Err(Error::UnsupportedEncoding(
+ invalid.intern_utf8()?,
+ UNKNOWN_SPAN,
+ ))?,
+ }
+ }
+
+ Ok(())
+ }
+
/// Parse opening element and its attributes into a XIR [`Token`]
/// stream.
///
diff --git a/tamer/src/xir/reader/test.rs b/tamer/src/xir/reader/test.rs
index 1587ece..2d63f5a 100644
--- a/tamer/src/xir/reader/test.rs
+++ b/tamer/src/xir/reader/test.rs
@@ -431,3 +431,52 @@ fn attr_value_invalid_utf8() {
_ => panic!("unexpected failure"),
}
}
+
+#[test]
+fn valid_xml_decl_no_encoding() {
+ new_sut!(sut = r#"<?xml version="1.0"?><root />"#);
+
+ assert_eq!(
+ Ok(vec![
+ Token::Open("root".unwrap_into(), DUMMY_SPAN),
+ Token::Close(None, DUMMY_SPAN),
+ ]),
+ sut.collect()
+ );
+}
+
+#[test]
+fn valid_xml_decl_with_encoding_lower() {
+ new_sut!(sut = r#"<?xml version="1.0" encoding="utf-8"?>"#);
+
+ assert_eq!(Ok(vec![]), sut.collect());
+}
+
+#[test]
+fn valid_xml_decl_with_encoding_upper() {
+ new_sut!(sut = r#"<?xml version="1.0" encoding="UTF-8"?>"#);
+
+ assert_eq!(Ok(vec![]), sut.collect());
+}
+
+// Only 1.0 supported.
+#[test]
+fn invalid_xml_decl_version() {
+ new_sut!(sut = r#"<?xml version="1.1"?>"#);
+
+ assert_eq!(
+ Err(Error::UnsupportedXmlVersion("1.1".intern(), UNKNOWN_SPAN)),
+ sut.collect::<Result<Vec<_>>>()
+ );
+}
+
+// Only UTF-8 supported.
+#[test]
+fn invalid_xml_encoding() {
+ new_sut!(sut = r#"<?xml version="1.0" encoding="latin-1"?>"#);
+
+ assert_eq!(
+ Err(Error::UnsupportedEncoding("latin-1".intern(), UNKNOWN_SPAN)),
+ sut.collect::<Result<Vec<_>>>()
+ );
+}