Mike Gerwitz

Activist for User Freedom

aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tamer/Cargo.lock7
-rw-r--r--tamer/Cargo.toml3
-rw-r--r--tamer/benches/sym.rs223
-rw-r--r--tamer/src/ld/poc.rs2
-rw-r--r--tamer/src/lib.rs3
-rw-r--r--tamer/src/sym.rs674
6 files changed, 910 insertions, 2 deletions
diff --git a/tamer/Cargo.lock b/tamer/Cargo.lock
index 8db9877..9340d78 100644
--- a/tamer/Cargo.lock
+++ b/tamer/Cargo.lock
@@ -6,6 +6,11 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "fnv"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
name = "memchr"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -37,12 +42,14 @@ name = "tamer"
version = "0.0.0"
dependencies = [
"fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
"petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
"quick-xml 0.17.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "86d4de0081402f5e88cdac65c8dcdcc73118c1a7a465e2a05f0da05843a8ea33"
+"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
"checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
"checksum ordermap 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a86ed3f5f244b372d6b1a00b72ef7f8876d0bc6a78a4c9985c53614041512063"
"checksum petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3659d1ee90221741f65dd128d9998311b0e40c5d3c23a62445938214abce4f"
diff --git a/tamer/Cargo.toml b/tamer/Cargo.toml
index 406afcb..82cfaf5 100644
--- a/tamer/Cargo.toml
+++ b/tamer/Cargo.toml
@@ -23,7 +23,8 @@ lto = true
lto = true
[dependencies]
-quick-xml = ">= 0.17.0"
+fnv = ">= 1.0.3"
petgraph = ">= 0.4.13"
+quick-xml = ">= 0.17.0"
# used by petgraph
fixedbitset = ">= 0.1"
diff --git a/tamer/benches/sym.rs b/tamer/benches/sym.rs
new file mode 100644
index 0000000..c727188
--- /dev/null
+++ b/tamer/benches/sym.rs
@@ -0,0 +1,223 @@
+// String internment benchmarks and baselines
+//
+// Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// Note that the baseline tests have a _suffix_ rather than a prefix so that
+// they are still grouped with the associated test in the output, since it's
+// sorted lexically by function name.
+
+#![feature(test)]
+
+extern crate tamer;
+extern crate test;
+
+use std::rc::Rc;
+use tamer::sym::*;
+use test::Bencher;
+
+mod symbol {
+ use super::*;
+
+ /// This is our baseline. We should never be any slower than this.
+ #[bench]
+ fn new_raw_rc_1000_baseline(bench: &mut Bencher) {
+ let s = "foo bar baz";
+
+ bench.iter(|| {
+ (0..1000)
+ .map(|_| {
+ let _: Rc<str> = s.into();
+ })
+ .for_each(drop);
+ });
+ }
+
+ /// Using the `SymbolRc` wrapper should perform no differently than the
+ /// above test with `Rc<str>`.
+ #[bench]
+ fn new_symbol_rc_1000(bench: &mut Bencher) {
+ let s = "foo bar baz";
+
+ bench.iter(|| {
+ (0..1000).map(|_| SymbolRc::new(s)).for_each(drop);
+ });
+ }
+
+ /// Rc uses pointer comparisons when possible. We want to match that.
+ #[bench]
+ fn eq_check_rc_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let a: Rc<str> = "foobarbazquux".into();
+ let b: Rc<str> = "foobarbazquux".into();
+ let c: Rc<str> = "foobarbazquuxx".into();
+
+ let _ = a == b;
+ let _ = a == c;
+ });
+ }
+
+ #[bench]
+ fn eq_check_symbol_rc_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let a: SymbolRc = "foobarbazquux".into();
+ let b: SymbolRc = "foobarbazquux".into();
+ let c: SymbolRc = "foobarbazquuxx".into();
+
+ let _ = a == b;
+ let _ = a == c;
+ });
+ }
+}
+
+mod hash_set {
+ use super::*;
+ use std::collections::hash_map::RandomState;
+ use std::collections::HashSet;
+ use std::hash::BuildHasher;
+
+ pub struct HashSetSut<S = RandomState>
+ where
+ S: BuildHasher,
+ {
+ pub map: HashSet<Rc<str>, S>,
+ }
+
+ impl<S> HashSetSut<S>
+ where
+ S: BuildHasher + Default,
+ {
+ #[inline]
+ fn new() -> Self {
+ Self {
+ map: HashSet::with_hasher(Default::default()),
+ }
+ }
+
+ pub fn intern(&mut self, value: &str) -> Rc<str> {
+ if !self.map.contains(value) {
+ self.map.insert(value.into());
+ }
+
+ self.map.get(value).unwrap().clone()
+ }
+ }
+
+ fn gen_strs(n: usize) -> Vec<String> {
+ (0..n)
+ .map(|n| n.to_string() + "foobarbazquuxlongsymbol")
+ .collect()
+ }
+
+ /// This is our baseline with a raw Rc<str>.
+ #[bench]
+ fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+
+ bench.iter(|| {
+ let mut sut = HashSetSut::<RandomState>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_all_new_1000(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+
+ bench.iter(|| {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ /// This is our baseline with a raw Rc<str>.
+ fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut = HashSetSut::<RandomState>::new();
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_one_new_1000(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ mod fnv {
+ use super::*;
+ use ::fnv::FnvBuildHasher;
+
+ /// This is our baseline with a raw Rc<str>.
+ #[bench]
+ fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+ bench.iter(|| {
+ let mut sut = HashSetSut::<FnvBuildHasher>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_all_new_1000(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+
+ bench.iter(|| {
+ let mut sut =
+ HashSetInterner::<SymbolRc, FnvBuildHasher>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ /// This is our baseline with a raw Rc<str>.
+ fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut: HashSetSut<FnvBuildHasher> = HashSetSut {
+ map: HashSet::with_hasher(Default::default()),
+ };
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_one_new_1000(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut =
+ HashSetInterner::<SymbolRc, FnvBuildHasher>::new();
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ /// Since FNV is the best-performing, let's build upon it to demonstrate
+ /// the benefits of with_capacity
+ #[bench]
+ fn with_all_new_1000_with_capacity(bench: &mut Bencher) {
+ let n = 1000;
+ let strs = gen_strs(n);
+
+ bench.iter(|| {
+ let mut sut =
+ HashSetInterner::<SymbolRc, FnvBuildHasher>::with_capacity(
+ n,
+ );
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+ }
+}
diff --git a/tamer/src/ld/poc.rs b/tamer/src/ld/poc.rs
index d2b3fcd..d7f9733 100644
--- a/tamer/src/ld/poc.rs
+++ b/tamer/src/ld/poc.rs
@@ -446,7 +446,7 @@ fn discover_roots(depgraph: &DepGraph) -> Vec<SymRef> {
}
#[cfg(test)]
-mod tests {
+mod test {
#[test]
fn placeholder() {}
}
diff --git a/tamer/src/lib.rs b/tamer/src/lib.rs
index bf0f698..c372488 100644
--- a/tamer/src/lib.rs
+++ b/tamer/src/lib.rs
@@ -15,4 +15,7 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//! An incremental rewrite of TAME in Rust.
+
pub mod ld;
+pub mod sym;
diff --git a/tamer/src/sym.rs b/tamer/src/sym.rs
new file mode 100644
index 0000000..6134e51
--- /dev/null
+++ b/tamer/src/sym.rs
@@ -0,0 +1,674 @@
+// String internment
+//
+// Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! String internment system.
+//!
+//! This system is currently a thin wrapper providing an abstraction for
+//! future evolution.
+//!
+//! Interned strings are represented by [`Symbol`]:
+//!
+//! - [`SymbolRc`] - Wrapper around [`Rc`]-backed [`str`] slices.
+//!
+//! Symbols are created, stored, compared, and retrieved by
+//! an [`Interner`]:
+//!
+//! - [`DefaultSetInterner`] - The currently recommended intern pool
+//! configuration for symbol interning.
+//! - [`HashSetInterner`] - Intern pool backed by a [`HashSet`].
+//! - [`FnvHashSetInterner`] - Intern pool backed by a [`HashSet`] using the
+//! Fowler-Noll-Vo hashing algorithm.
+//!
+//! Note that the [`DefaultSetInterner`] has a configuration chosen for
+//! efficient _symbol interning_;
+//! see its documentation for more information and caveats,
+//! since it may not fit your particular use case.
+//!
+//! All [`Symbol`] values implement [`Clone`] and can be passed around
+//! freely without associated lifetimes.
+//!
+//! ```
+//! use tamer::sym::{Interner, DefaultSetInterner};
+//!
+//! // Inputs to be interned
+//! let a = "foo";
+//! let b = &"foo".to_string();
+//! let c = "foobar";
+//! let d = &c[0..3];
+//!
+//! let mut interner = DefaultSetInterner::new();
+//!
+//! let (ia, ib, ic, id) = (
+//! interner.intern(a),
+//! interner.intern(b),
+//! interner.intern(c),
+//! interner.intern(d),
+//! );
+//!
+//! assert_eq!(ia, ib);
+//! assert_eq!(ia, id);
+//! assert_eq!(ib, id);
+//! assert_ne!(ia, ic);
+//!
+//! // All interns can be cloned and clones are eq
+//! assert_eq!(ia, ia.clone());
+//!
+//! // Only "foo" and "foobar" are interned
+//! assert_eq!(2, interner.len());
+//! assert!(interner.contains("foo"));
+//! assert!(interner.contains("foobar"));
+//! assert!(!interner.contains("something else"));
+//! ```
+//!
+//! What Is String Interning?
+//! =========================
+//! _[String interning][]_ is a process by which a single copy of a string
+//! is stored immutably in memory as part of a _pool_.
+//! When the same string is later encountered,
+//! a reference to the string in the pool is used rather than allocating a
+//! new string.
+//! Interned strings are typically referred to as "symbols" or "atoms".
+//!
+//! String comparison then amounts to comparing pointers (`O(1)`) in the
+//! best case (see Equality of Interns below)
+//! rather than having to scan the string (`O(n)`).
+//! There is, however, a cost of interning strings,
+//! as well as comparing new strings to the intern pool.
+//!
+//! Symbols can be used as keys to a map to associate unique data with each
+//! intern,
+//! while at the same time doubling as an intern pool.
+//!
+//! From the perspective of Rust,
+//! this also provides a convenient alternative to passing around
+//! lifetimes,
+//! which becomes unwieldy when shared data is used in many places
+//! throughout the system.
+//! Each [`Symbol`] implements [`Clone`].
+//!
+//! [string interning]: https://en.wikipedia.org/wiki/String_interning
+//!
+//!
+//! Internment Mechanism
+//! ====================
+//! An internment mechanism is easily implemented in Rust using [`Rc`],
+//! which is the approach taken by [`SymbolRc`].
+//! While this does have a minor runtime overhead,
+//! the simplicity afforded by the implementation makes it attractive
+//! until it becomes a bottleneck.
+//! Considering all of the low-hanging fruit in the TAME→TAMER
+//! transition,
+//! this is not expected to be a bottleneck for quite some time.
+//!
+//! Rust implemented [`From`] for slices for [`Rc`] in [RFC 1845][rfc-1845],
+//! where it explicitly used string interning as a use case.
+//! A trivial string interner can be implemented using
+//! `HashSet<Rc<str>>`,
+//! which is done by [`HashSetInterner`] using [`SymbolRc`].
+//! However, the best mechanism by which to perform interning must be
+//! determined on a case-by-case basis.
+//! For example,
+//! if a mapping between strings and some value is required,
+//! then the pool can be combined with the mapping by using a
+//! [`HashMap`](std::collections::HashMap)
+//! with a `Rc<str>` key type.
+//!
+//! [rfc-1845]: https://rust-lang.github.io/rfcs/1845-shared-from-slice.html
+//!
+//! Further, we may want to implement a more robust internment system in the
+//! future,
+//! possibly using existing crates.
+//!
+//! To accommodate the above situations,
+//! this implementation is generalized and defers implementation details
+//! to individual implementers.
+//!
+//! [smart pointer]: https://doc.rust-lang.org/book/ch15-00-smart-pointers.html
+//!
+//!
+//! Lifetime of Interns
+//! -------------------
+//! Implementations need not bother with trying to free up space in the
+//! intern pool when strings are no longer in use,
+//! since compilation processes are short-lived.
+//! This gives much more flexibility in implementation.
+//!
+//!
+//! Equality of Interns
+//! -------------------
+//! Two [`Symbol`] values must be considered [`Eq`] if their wrapped
+//! values are `Eq`.
+//! This requirement,
+//! together with the requirement that equal values share the same hash
+//! (via [`Hash`]),
+//! permit use of hash tables
+//! (such as [`HashSet`] and [`HashMap`](std::collections::HashMap))
+//! to store interned values.
+//!
+//! Comparing [`Rc`] values is usually cheap because checks are first
+//! performed by comparing pointers to the underlying data.
+//! If they represent the same interned symbol,
+//! then pointers will match.
+//! If not,
+//! the underlying values will be compared.
+//! Since they are known to be different,
+//! this comparison should go quickly,
+//! unless the two strings being compared share a common prefix.
+//!
+//! In situations where it is not acceptable to fall back to a potentially
+//! expensive negative case,
+//! you may compare interned values by memory location using
+//! [`Symbol::ptr_eq`]
+//! (so named after [`Rc::ptr_eq`]):
+//!
+//! ```
+//! use tamer::sym::{Symbol, SymbolRc};
+//!
+//! let a = SymbolRc::new("foo");
+//! let b = SymbolRc::new(&"foo".to_string());
+//!
+//! assert_eq!(a, b);
+//! assert_eq!(*a, *b);
+//! assert!(a.ptr_eq(&a.clone()));
+//! assert!(!a.ptr_eq(&b));
+//! ```
+//!
+//!
+//! Name Mangling
+//! =============
+//! Interners do not perform [name mangling][],
+//! nor is it yet clear if they should.
+//! For future consideration,
+//! see [RFC 2603][rfc-2603] and the [Itanium ABI][itanium-abi].
+//!
+//! [name mangling]: https://en.wikipedia.org/wiki/Name_mangling
+//! [rfc-2603]: https://rust-lang.github.io/rfcs/2603-symbol-name-mangling-v2.html
+//! [itanium-abi]: http://refspecs.linuxbase.org/cxxabi-1.86.html#mangling
+//!
+//!
+//! Related Work and Further Reading
+//! ================================
+//! String interning is often tightly coupled with symbols (in the generic
+//! sense),
+//! sometimes called atoms.
+//! Symbols can often be either interned,
+//! and therefore compared for equivalency,
+//! or _uninterned_,
+//! which makes them unique even to symbols of the same name.
+//! Interning may also be done automatically by a language for performance.
+//! Languages listed below that allow for explicit interning may also
+//! perform automatic interning as well
+//! (for example, `'symbol` in Lisp and `lowercase_vars` as atoms in
+//! Erlang).
+//!
+//! | Language | Interned | Uninterned |
+//! | -------- | -------- | ---------- |
+//! | [Erlang][] | [`list_to_atom`][edt] | _(None)_ |
+//! | [GNU Emacs Lisp][] | [`intern`][es], [`intern-soft`][es] | [`make-symbol`][es], [`gensym`][es] |
+//! | [GNU Guile][] | [`string->symbol`][gs], [`gensym`][gs] | [`make-symbol`][gu] |
+//! | [JavaScript][] | [`Symbol.for`][js] | [`Symbol`][js] |
+//! | [Java][] | [`intern`][jvs] | _(None)_ |
+//! | [Lua][] | _(Automatic for string performance)_ | _(None)_ |
+//! | [MIT/GNU Scheme][] | [`intern`][ms], [`intern-soft`][ms], [`string->symbol`][ms] | [`string->uninterned-symbol`][ms], [`generate-uninterned-symbol`][ms] |
+//! | [PHP][] | _(Automatic for string [performance][pp])_ | _(None)_ |
+//! | [Python][] | [`sys.intern`][pys] | _(None)_ |
+//! | [R6RS Scheme][] | [`string->symbol`][r6s] | _(None)_ |
+//! | [Racket][] | [`string->symbol`][rs], [`string->unreadable-symbol`][rs] | [`string->uninterned-symbol`][rs], [`gensym`][rs] |
+//!
+//! [gnu guile]: https://www.gnu.org/software/guile/
+//! [gs]: https://www.gnu.org/software/guile/manual/html_node/Symbol-Primitives.html#Symbol-Primitives
+//! [gu]: https://www.gnu.org/software/guile/manual/html_node/Symbol-Uninterned.html#Symbol-Uninterned
+//! [gnu emacs lisp]: https://www.gnu.org/software/emacs/
+//! [es]: https://www.gnu.org/software/emacs/manual/html_node/elisp/Creating-Symbols.html
+//! [racket]: https://racket-lang.org/
+//! [rs]: https://docs.racket-lang.org/reference/symbols.html
+//! [r6rs scheme]: http://www.r6rs.org/
+//! [r6s]: http://www.r6rs.org/final/html/r6rs/r6rs-Z-H-14.html
+//! [mit/gnu scheme]: https://www.gnu.org/software/mit-scheme/
+//! [ms]: https://www.gnu.org/software/mit-scheme/documentation/mit-scheme-ref/Symbols.html
+//! [javascript]: https://developer.mozilla.org/en-US/docs/Web/JavaScript
+//! [js]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Symbol
+//! [java]: http://openjdk.java.net/
+//! [jvs]: https://cr.openjdk.java.net/~iris/se/12/latestSpec/api/java.base/java/lang/String.html#intern()
+//! [php]: https://www.php.net/
+//! [pp]: https://wiki.php.net/rfc/performanceimprovements
+//! [erlang]: https://erlang.org/
+//! [edt]: http://erlang.org/doc/reference_manual/data_types.html
+//! [lua]: https://www.lua.org/
+//! [python]: https://www.python.org/
+//! [pys]: https://docs.python.org/3/library/sys.html
+//!
+//! More information:
+//! - Wikipedia entry on [string interning][].
+//! - The [flyweight pattern][] in object-oriented programming is a type
+//! of interning.
+//! - [RFC 1845][rfc-1845] gives an example of string interning using
+//! `Rc<str>`,
+//! which is used by [`SymbolRc`].
+//! - Emacs directly exposes the intern pool at runtime as
+//! [`obarray`][es].
+//! - [`string-cache`][rust-string-cache] is a string interning system
+//! for Rust developed by Mozilla for Servo.
+//! - [`string-interner`][rust-string-interner] is another string
+//! interning library for Rust.
+//!
+//! [flyweight pattern]: https://en.wikipedia.org/wiki/Flyweight_pattern
+//! [rust-string-cache]: https://github.com/servo/string-cache
+//! [rust-string-interner]: https://github.com/robbepop/string-interner
+
+use ::fnv::FnvBuildHasher;
+use std::collections::hash_map::RandomState;
+use std::collections::HashSet;
+use std::hash::{BuildHasher, Hash};
+use std::ops::Deref;
+use std::rc::Rc;
+
+/// Reference to an interned string.
+///
+/// This encapsulates implementation details regarding the internment
+/// process and allows it to change over time,
+/// or even vary between subsystems.
+///
+/// An interned string must be treated as if it were the underlying
+/// interned type `T` by implementing [`Deref`].
+/// Further,
+/// since interned string should be mere references to data,
+/// they ought to be cheap to copy.
+/// However,
+/// since reference counting may be involved,
+/// we must use [`Clone`] instead of [`Copy`].
+///
+/// Storing Interns
+/// ===============
+/// _This is a wrapper for an interned value;
+/// it does not necessary mean that a value was actually interned!_
+/// It ought to be created using an [`Interner`],
+/// which is responsible for creation, storage, and retrieval of interned
+/// values.
+pub trait Symbol: Deref<Target = str> + Clone + Eq + Hash {
+ /// Represent `value` as an interned string.
+ ///
+ /// _This does not perform internment;_ see
+ /// [`Interner`]. This merely provides a wrapper for interned
+ /// values which are then stored by an `Interner`.
+ fn new(value: &str) -> Self;
+
+ /// Compare underlying memory location for equality with the memory
+ /// location of another interned value.
+ ///
+ /// Symbol strings are [`Eq`] one-another if their underlying values
+ /// are also `Eq`.
+ /// This allows comparing underlying memory locations for equality,
+ /// both to ensure that internment is actually being performed,
+ /// and to permit separate internment systems sharing the same string
+ /// set.
+ fn ptr_eq(&self, other: &Self) -> bool;
+}
+
+/// An `Rc`-backed reference to an interned string.
+///
+/// This is a zero-cost abstraction and should provide no additional
+/// overhead over using `Rc` directly.
+#[derive(Debug, Eq, PartialEq, Hash)]
+pub struct SymbolRc(Rc<str>);
+
+impl Symbol for SymbolRc {
+ /// Represent `value` as an interned string.
+ ///
+ /// _This does not perform internment;_ see
+ /// [`Interner`]. This merely provides a wrapper for interned
+ /// values which are then stored by an `Interner`.
+ #[inline]
+ fn new(value: &str) -> Self {
+ Self(value.into())
+ }
+
+ /// Compare underlying memory location for equality with the memory
+ /// location of another interned value.
+ ///
+ /// ```
+ /// use tamer::sym::{Symbol, SymbolRc};
+ ///
+ /// let a = SymbolRc::new("foo");
+ /// let b = SymbolRc::new(&"foo".to_string());
+ ///
+ /// assert!(a.ptr_eq(&a.clone()));
+ /// assert!(!a.ptr_eq(&b));
+ /// ```
+ ///
+ /// For more information,
+ /// see [`Symbol::ptr_eq`].
+ #[inline]
+ fn ptr_eq(&self, other: &Self) -> bool {
+ Rc::ptr_eq(&self.0, &other.0)
+ }
+}
+
+impl Deref for SymbolRc {
+ type Target = str;
+
+ /// Retrieve underlying string slice.
+ #[inline]
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl Clone for SymbolRc {
+ /// Clone a reference to the interned value.
+ ///
+ /// This creates another pointer to the same internal value, increasing
+ /// the strong reference count on the underlying [`Rc`].
+ #[inline]
+ fn clone(&self) -> Self {
+ Self(Rc::clone(&self.0))
+ }
+}
+
+impl From<&str> for SymbolRc {
+ #[inline]
+ fn from(value: &str) -> Self {
+ Self::new(value)
+ }
+}
+
+/// Create, store, compare, and retrieve [`Symbol`] values.
+///
+/// Interners are generic over the type of their interned values.
+/// They accept string slices and produce values of type [`Symbol`].
+///
+/// The intended use case is to attempt to blindly
+/// [`intern`](Interner::intern) slices;
+/// if it has already been interned,
+/// then you will receive a copy of the existing [`Symbol`] value.
+/// If it hasn't been interned,
+/// then it will be added.
+///
+/// If you care whether a value has been interned yet or not,
+/// use [`contains`](Interner::contains).
+///
+/// See the [module-level documentation](self) for an example.
+pub trait Interner<T: Symbol> {
+ /// Intern a string slice or return an existing matching intern.
+ ///
+ /// If the provided string has already been interned,
+ /// then the existing [`Symbol`] value will be cloned and returned.
+ /// Otherwise,
+ /// the string will be interned and a clone returned.
+ ///
+ /// To retrieve an existing intern _without_ interning,
+ /// see [`intern_soft`](Interner::intern_soft).
+ fn intern(&mut self, value: &str) -> T;
+
+ /// Retrieve an existing intern for the string slice `s`.
+ ///
+ /// Unlike [`intern`](Interner::intern),
+ /// this will _not_ intern the string if it has not already been
+ /// interned.
+ fn intern_soft(&mut self, value: &str) -> Option<T>;
+
+ /// Determine whether the given value has already been interned.
+ fn contains(&self, value: &str) -> bool;
+
+ /// Number of interned strings.
+ ///
+ /// This count will increase each time a unique string is interned.
+ /// It does not increase when a string is already interned.
+ fn len(&self) -> usize;
+}
+
+/// An interner backed by a [`HashSet`].
+///
+/// This interner is appropriate to be used when symbols need only be
+/// interned and have no associated data.
+///
+/// For the recommended configuration,
+/// see [`DefaultSetInterner`].
+///
+/// See the [module-level documentation](self) for an example.
+pub struct HashSetInterner<T = SymbolRc, S = RandomState>
+where
+ T: Symbol,
+ S: BuildHasher,
+{
+ /// Intern pool.
+ map: HashSet<T, S>,
+}
+
+impl<T, S> HashSetInterner<T, S>
+where
+ T: Symbol,
+ S: BuildHasher + Default,
+{
+ /// Create a new interner with an underlying [`HashSet`].
+ ///
+ /// Prefer [`with_capacity`](HashSetInterner::with_capacity) over this
+ /// function if capacity can be reasonably estimated.
+ #[inline]
+ pub fn new() -> Self {
+ Self {
+ map: HashSet::with_hasher(Default::default()),
+ }
+ }
+
+ /// Create a new interner with at least the specified capacity for the
+ /// underlying [`HashSet`].
+ ///
+ /// This method of construction should be used any time a capacity can
+ /// be predicated since it reduces the chance of reallocations as
+ /// interns exceed the capacity.
+ /// For example,
+ /// if interns are used to represent symbols and most packages contain
+ /// at least `N` symbols,
+ /// then `capacity` ought to be ≥N.
+ ///
+ /// For more information,
+ /// see [`HashSet::with_capacity`].
+ ///
+ /// See also `intern` benchmarks.
+ pub fn with_capacity(capacity: usize) -> Self {
+ Self {
+ map: HashSet::with_capacity_and_hasher(
+ capacity,
+ Default::default(),
+ ),
+ }
+ }
+}
+
+impl<T, S> Interner<T> for HashSetInterner<T, S>
+where
+ T: Symbol,
+ S: BuildHasher,
+{
+ fn intern(&mut self, value: &str) -> T {
+ let intern = T::new(value);
+
+ if !self.map.contains(&intern) {
+ self.map.insert(intern.clone());
+ return intern;
+ }
+
+ self.map.get(&intern).unwrap().clone()
+ }
+
+ #[inline]
+ fn intern_soft(&mut self, value: &str) -> Option<T> {
+ self.map.get(&T::new(value)).map(|s| s.clone())
+ }
+
+ #[inline]
+ fn contains(&self, value: &str) -> bool {
+ self.map.contains(&T::new(value))
+ }
+
+ #[inline]
+ fn len(&self) -> usize {
+ self.map.len()
+ }
+}
+
+/// Interner using the [Fowler-Noll-Vo][fnv] (FNV) hashing function.
+///
+/// _This is currently the hash function used by [`DefaultSetInterner`]._
+///
+/// If the intern pool will contains strings primarily of length ≤ 60,
+/// and if denial of service is not a concern,
+/// then use of the FNV hash function will outperform the default
+/// [`DefaultHasher`](std::collections::hash_map::DefaultHasher)
+/// (which uses SipHash at the time of writing).
+/// See intern benchmarks for a comparison.
+/// For a comparison of various hashing algorithms,
+/// see the [`hash-rs`][hash-rs] project.
+///
+/// [hash-rs]: https://github.com/Gankra/hash-rs
+pub type FnvHashSetInterner<T> = HashSetInterner<T, FnvBuildHasher>;
+
+/// Recommended configuration for set-based interners.
+///
+/// The choice of this default relies on certain assumptions:
+/// - Denial-of-service attacks against the hash function are not a
+/// concern; and
+/// - Strings to be interned are mostly ≤ 60 characters.
+///
+/// For more information on the hashing algorithm,
+/// see [`FnvHashSetInterner`].
+pub type DefaultSetInterner = FnvHashSetInterner<SymbolRc>;
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ mod interned_rc {
+ use super::*;
+
+ /// Dereferencing should fall through all layers to the underlying `T`.
+ #[test]
+ fn derefs_to_inner_value() {
+ let expected = "foo";
+ let sut = SymbolRc::new(expected);
+
+ assert_eq!(expected, &*sut);
+ }
+
+ /// Symbol string must be easily cloned and must represent the same
+ /// underlying data.
+ #[test]
+ fn clones_equal() {
+ let expected = "bar";
+ let sut = SymbolRc::new(expected);
+
+ // This is the true test
+ assert!(sut.ptr_eq(&sut.clone()));
+ assert_eq!(sut, sut.clone());
+
+ // But we also want to verify derefs.
+ assert_eq!(*sut, *sut.clone());
+ }
+
+ #[test]
+ fn not_ptr_eq_if_memory_locations_differ() {
+ let a = SymbolRc::new("foo");
+ let b = SymbolRc::new(&"foo".to_string());
+
+ // Pointers differ
+ assert!(!a.ptr_eq(&b));
+
+ // But values do not
+ assert_eq!(*a, *b);
+ }
+ }
+
+ mod hash_set_interner {
+ use super::*;
+
+ /// Two different string values in memory should be interned to the
+ /// same object if they represent the same data.
+ #[test]
+ fn recognizes_equal_strings() {
+ let a = "foo";
+ let b = a.to_string();
+ let c = "bar";
+ let d = c.to_string();
+
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ let (ia, ib, ic, id) =
+ (sut.intern(a), sut.intern(&b), sut.intern(c), sut.intern(&d));
+
+ assert!(ia.ptr_eq(&ib));
+ assert_eq!(ia, ib);
+ assert_eq!(&ia, &ib);
+ assert_eq!(*ia, *ib);
+
+ assert!(ic.ptr_eq(&id));
+ assert_eq!(ic, id);
+ assert_eq!(&ic, &id);
+ assert_eq!(*ic, *id);
+
+ assert!(!ia.ptr_eq(&ic));
+ assert_ne!(ia, ic);
+ assert_ne!(&ia, &ic);
+ assert_ne!(*ia, *ic);
+ }
+
+ #[test]
+ fn length_increases_with_each_new_intern() {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ assert_eq!(0, sut.len(), "invalid empty len");
+
+ sut.intern("foo");
+ assert_eq!(1, sut.len(), "increment len");
+
+ // duplicate
+ sut.intern("foo");
+ assert_eq!(1, sut.len(), "do not increment len on duplicates");
+
+ sut.intern("bar");
+ assert_eq!(2, sut.len(), "increment len (2)");
+ }
+
+ #[test]
+ fn can_check_wither_string_is_interned() {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ assert!(!sut.contains("foo"), "recognize missing value");
+ sut.intern("foo");
+ assert!(sut.contains("foo"), "recognize interned value");
+ }
+
+ #[test]
+ fn intern_soft() {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ assert_eq!(None, sut.intern_soft("foo"));
+
+ let foo = sut.intern("foo");
+ assert_eq!(Some(foo), sut.intern_soft("foo"));
+ }
+
+ #[test]
+ fn new_with_capacity() {
+ let n = 512;
+ let sut = HashSetInterner::<SymbolRc>::with_capacity(n);
+
+ // note that this is not publicly available
+ assert!(sut.map.capacity() >= n);
+ }
+ }
+}