Mike Gerwitz

Activist for User Freedom

aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMike Gerwitz <mike.gerwitz@ryansg.com>2019-12-06 15:03:29 -0500
committerMike Gerwitz <mike.gerwitz@ryansg.com>2020-02-24 14:56:28 -0500
commite4e0089815517502f861d196e8a4ed187cbd57ab (patch)
treec74c5bf02b2b6683345511b5d7073104c366547b
parent593faa3491713b2008946284632a6baa7e5ed481 (diff)
downloadtame-e4e0089815517502f861d196e8a4ed187cbd57ab.tar.gz
tame-e4e0089815517502f861d196e8a4ed187cbd57ab.tar.bz2
tame-e4e0089815517502f861d196e8a4ed187cbd57ab.zip
TAMER: Initial string interning abstraction
This is missing two key things that I'll add shortly: a HashMap-based one for use in the ASG for node mapping, and an entry-based system for manipulations. This has been a nice start for exploring various aspects of Rust development, as well as conventions that I'd like to implement. In particular: - Robust documentation intended to guide people through learning the necessary material about the compiler, as well as related work to rationalize design decisions; - Benchmarks; - TDD; - And just getting used to Rust in general. I've beat this one to death, so I'll commit this and make smaller changes going forward to show how easily it can evolve. (This module was originally named `intern` but this commit and those that follow rewrote it to `sym`.)
-rw-r--r--tamer/Cargo.lock7
-rw-r--r--tamer/Cargo.toml3
-rw-r--r--tamer/benches/sym.rs223
-rw-r--r--tamer/src/ld/poc.rs2
-rw-r--r--tamer/src/lib.rs3
-rw-r--r--tamer/src/sym.rs674
6 files changed, 910 insertions, 2 deletions
diff --git a/tamer/Cargo.lock b/tamer/Cargo.lock
index 8db9877..9340d78 100644
--- a/tamer/Cargo.lock
+++ b/tamer/Cargo.lock
@@ -6,6 +6,11 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "fnv"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
name = "memchr"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -37,12 +42,14 @@ name = "tamer"
version = "0.0.0"
dependencies = [
"fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
"petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
"quick-xml 0.17.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "86d4de0081402f5e88cdac65c8dcdcc73118c1a7a465e2a05f0da05843a8ea33"
+"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
"checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
"checksum ordermap 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a86ed3f5f244b372d6b1a00b72ef7f8876d0bc6a78a4c9985c53614041512063"
"checksum petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3659d1ee90221741f65dd128d9998311b0e40c5d3c23a62445938214abce4f"
diff --git a/tamer/Cargo.toml b/tamer/Cargo.toml
index 406afcb..82cfaf5 100644
--- a/tamer/Cargo.toml
+++ b/tamer/Cargo.toml
@@ -23,7 +23,8 @@ lto = true
lto = true
[dependencies]
-quick-xml = ">= 0.17.0"
+fnv = ">= 1.0.3"
petgraph = ">= 0.4.13"
+quick-xml = ">= 0.17.0"
# used by petgraph
fixedbitset = ">= 0.1"
diff --git a/tamer/benches/sym.rs b/tamer/benches/sym.rs
new file mode 100644
index 0000000..c727188
--- /dev/null
+++ b/tamer/benches/sym.rs
@@ -0,0 +1,223 @@
+// String internment benchmarks and baselines
+//
+// Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// Note that the baseline tests have a _suffix_ rather than a prefix so that
+// they are still grouped with the associated test in the output, since it's
+// sorted lexically by function name.
+
+#![feature(test)]
+
+extern crate tamer;
+extern crate test;
+
+use std::rc::Rc;
+use tamer::sym::*;
+use test::Bencher;
+
+mod symbol {
+ use super::*;
+
+ /// This is our baseline. We should never be any slower than this.
+ #[bench]
+ fn new_raw_rc_1000_baseline(bench: &mut Bencher) {
+ let s = "foo bar baz";
+
+ bench.iter(|| {
+ (0..1000)
+ .map(|_| {
+ let _: Rc<str> = s.into();
+ })
+ .for_each(drop);
+ });
+ }
+
+ /// Using the `SymbolRc` wrapper should perform no differently than the
+ /// above test with `Rc<str>`.
+ #[bench]
+ fn new_symbol_rc_1000(bench: &mut Bencher) {
+ let s = "foo bar baz";
+
+ bench.iter(|| {
+ (0..1000).map(|_| SymbolRc::new(s)).for_each(drop);
+ });
+ }
+
+ /// Rc uses pointer comparisons when possible. We want to match that.
+ #[bench]
+ fn eq_check_rc_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let a: Rc<str> = "foobarbazquux".into();
+ let b: Rc<str> = "foobarbazquux".into();
+ let c: Rc<str> = "foobarbazquuxx".into();
+
+ let _ = a == b;
+ let _ = a == c;
+ });
+ }
+
+ #[bench]
+ fn eq_check_symbol_rc_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let a: SymbolRc = "foobarbazquux".into();
+ let b: SymbolRc = "foobarbazquux".into();
+ let c: SymbolRc = "foobarbazquuxx".into();
+
+ let _ = a == b;
+ let _ = a == c;
+ });
+ }
+}
+
+mod hash_set {
+ use super::*;
+ use std::collections::hash_map::RandomState;
+ use std::collections::HashSet;
+ use std::hash::BuildHasher;
+
+ pub struct HashSetSut<S = RandomState>
+ where
+ S: BuildHasher,
+ {
+ pub map: HashSet<Rc<str>, S>,
+ }
+
+ impl<S> HashSetSut<S>
+ where
+ S: BuildHasher + Default,
+ {
+ #[inline]
+ fn new() -> Self {
+ Self {
+ map: HashSet::with_hasher(Default::default()),
+ }
+ }
+
+ pub fn intern(&mut self, value: &str) -> Rc<str> {
+ if !self.map.contains(value) {
+ self.map.insert(value.into());
+ }
+
+ self.map.get(value).unwrap().clone()
+ }
+ }
+
+ fn gen_strs(n: usize) -> Vec<String> {
+ (0..n)
+ .map(|n| n.to_string() + "foobarbazquuxlongsymbol")
+ .collect()
+ }
+
+ /// This is our baseline with a raw Rc<str>.
+ #[bench]
+ fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+
+ bench.iter(|| {
+ let mut sut = HashSetSut::<RandomState>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_all_new_1000(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+
+ bench.iter(|| {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ /// This is our baseline with a raw Rc<str>.
+ fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut = HashSetSut::<RandomState>::new();
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_one_new_1000(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ mod fnv {
+ use super::*;
+ use ::fnv::FnvBuildHasher;
+
+ /// This is our baseline with a raw Rc<str>.
+ #[bench]
+ fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+ bench.iter(|| {
+ let mut sut = HashSetSut::<FnvBuildHasher>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_all_new_1000(bench: &mut Bencher) {
+ let strs = gen_strs(1000);
+
+ bench.iter(|| {
+ let mut sut =
+ HashSetInterner::<SymbolRc, FnvBuildHasher>::new();
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+
+ #[bench]
+ /// This is our baseline with a raw Rc<str>.
+ fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut: HashSetSut<FnvBuildHasher> = HashSetSut {
+ map: HashSet::with_hasher(Default::default()),
+ };
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ #[bench]
+ fn with_one_new_1000(bench: &mut Bencher) {
+ bench.iter(|| {
+ let mut sut =
+ HashSetInterner::<SymbolRc, FnvBuildHasher>::new();
+ (0..1000).map(|_| sut.intern("first")).for_each(drop);
+ });
+ }
+
+ /// Since FNV is the best-performing, let's build upon it to demonstrate
+ /// the benefits of with_capacity
+ #[bench]
+ fn with_all_new_1000_with_capacity(bench: &mut Bencher) {
+ let n = 1000;
+ let strs = gen_strs(n);
+
+ bench.iter(|| {
+ let mut sut =
+ HashSetInterner::<SymbolRc, FnvBuildHasher>::with_capacity(
+ n,
+ );
+ strs.iter().map(|s| sut.intern(&s)).for_each(drop);
+ });
+ }
+ }
+}
diff --git a/tamer/src/ld/poc.rs b/tamer/src/ld/poc.rs
index d2b3fcd..d7f9733 100644
--- a/tamer/src/ld/poc.rs
+++ b/tamer/src/ld/poc.rs
@@ -446,7 +446,7 @@ fn discover_roots(depgraph: &DepGraph) -> Vec<SymRef> {
}
#[cfg(test)]
-mod tests {
+mod test {
#[test]
fn placeholder() {}
}
diff --git a/tamer/src/lib.rs b/tamer/src/lib.rs
index bf0f698..c372488 100644
--- a/tamer/src/lib.rs
+++ b/tamer/src/lib.rs
@@ -15,4 +15,7 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//! An incremental rewrite of TAME in Rust.
+
pub mod ld;
+pub mod sym;
diff --git a/tamer/src/sym.rs b/tamer/src/sym.rs
new file mode 100644
index 0000000..6134e51
--- /dev/null
+++ b/tamer/src/sym.rs
@@ -0,0 +1,674 @@
+// String internment
+//
+// Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! String internment system.
+//!
+//! This system is currently a thin wrapper providing an abstraction for
+//! future evolution.
+//!
+//! Interned strings are represented by [`Symbol`]:
+//!
+//! - [`SymbolRc`] - Wrapper around [`Rc`]-backed [`str`] slices.
+//!
+//! Symbols are created, stored, compared, and retrieved by
+//! an [`Interner`]:
+//!
+//! - [`DefaultSetInterner`] - The currently recommended intern pool
+//! configuration for symbol interning.
+//! - [`HashSetInterner`] - Intern pool backed by a [`HashSet`].
+//! - [`FnvHashSetInterner`] - Intern pool backed by a [`HashSet`] using the
+//! Fowler-Noll-Vo hashing algorithm.
+//!
+//! Note that the [`DefaultSetInterner`] has a configuration chosen for
+//! efficient _symbol interning_;
+//! see its documentation for more information and caveats,
+//! since it may not fit your particular use case.
+//!
+//! All [`Symbol`] values implement [`Clone`] and can be passed around
+//! freely without associated lifetimes.
+//!
+//! ```
+//! use tamer::sym::{Interner, DefaultSetInterner};
+//!
+//! // Inputs to be interned
+//! let a = "foo";
+//! let b = &"foo".to_string();
+//! let c = "foobar";
+//! let d = &c[0..3];
+//!
+//! let mut interner = DefaultSetInterner::new();
+//!
+//! let (ia, ib, ic, id) = (
+//! interner.intern(a),
+//! interner.intern(b),
+//! interner.intern(c),
+//! interner.intern(d),
+//! );
+//!
+//! assert_eq!(ia, ib);
+//! assert_eq!(ia, id);
+//! assert_eq!(ib, id);
+//! assert_ne!(ia, ic);
+//!
+//! // All interns can be cloned and clones are eq
+//! assert_eq!(ia, ia.clone());
+//!
+//! // Only "foo" and "foobar" are interned
+//! assert_eq!(2, interner.len());
+//! assert!(interner.contains("foo"));
+//! assert!(interner.contains("foobar"));
+//! assert!(!interner.contains("something else"));
+//! ```
+//!
+//! What Is String Interning?
+//! =========================
+//! _[String interning][]_ is a process by which a single copy of a string
+//! is stored immutably in memory as part of a _pool_.
+//! When the same string is later encountered,
+//! a reference to the string in the pool is used rather than allocating a
+//! new string.
+//! Interned strings are typically referred to as "symbols" or "atoms".
+//!
+//! String comparison then amounts to comparing pointers (`O(1)`) in the
+//! best case (see Equality of Interns below)
+//! rather than having to scan the string (`O(n)`).
+//! There is, however, a cost of interning strings,
+//! as well as comparing new strings to the intern pool.
+//!
+//! Symbols can be used as keys to a map to associate unique data with each
+//! intern,
+//! while at the same time doubling as an intern pool.
+//!
+//! From the perspective of Rust,
+//! this also provides a convenient alternative to passing around
+//! lifetimes,
+//! which becomes unwieldy when shared data is used in many places
+//! throughout the system.
+//! Each [`Symbol`] implements [`Clone`].
+//!
+//! [string interning]: https://en.wikipedia.org/wiki/String_interning
+//!
+//!
+//! Internment Mechanism
+//! ====================
+//! An internment mechanism is easily implemented in Rust using [`Rc`],
+//! which is the approach taken by [`SymbolRc`].
+//! While this does have a minor runtime overhead,
+//! the simplicity afforded by the implementation makes it attractive
+//! until it becomes a bottleneck.
+//! Considering all of the low-hanging fruit in the TAME→TAMER
+//! transition,
+//! this is not expected to be a bottleneck for quite some time.
+//!
+//! Rust implemented [`From`] for slices for [`Rc`] in [RFC 1845][rfc-1845],
+//! where it explicitly used string interning as a use case.
+//! A trivial string interner can be implemented using
+//! `HashSet<Rc<str>>`,
+//! which is done by [`HashSetInterner`] using [`SymbolRc`].
+//! However, the best mechanism by which to perform interning must be
+//! determined on a case-by-case basis.
+//! For example,
+//! if a mapping between strings and some value is required,
+//! then the pool can be combined with the mapping by using a
+//! [`HashMap`](std::collections::HashMap)
+//! with a `Rc<str>` key type.
+//!
+//! [rfc-1845]: https://rust-lang.github.io/rfcs/1845-shared-from-slice.html
+//!
+//! Further, we may want to implement a more robust internment system in the
+//! future,
+//! possibly using existing crates.
+//!
+//! To accommodate the above situations,
+//! this implementation is generalized and defers implementation details
+//! to individual implementers.
+//!
+//! [smart pointer]: https://doc.rust-lang.org/book/ch15-00-smart-pointers.html
+//!
+//!
+//! Lifetime of Interns
+//! -------------------
+//! Implementations need not bother with trying to free up space in the
+//! intern pool when strings are no longer in use,
+//! since compilation processes are short-lived.
+//! This gives much more flexibility in implementation.
+//!
+//!
+//! Equality of Interns
+//! -------------------
+//! Two [`Symbol`] values must be considered [`Eq`] if their wrapped
+//! values are `Eq`.
+//! This requirement,
+//! together with the requirement that equal values share the same hash
+//! (via [`Hash`]),
+//! permit use of hash tables
+//! (such as [`HashSet`] and [`HashMap`](std::collections::HashMap))
+//! to store interned values.
+//!
+//! Comparing [`Rc`] values is usually cheap because checks are first
+//! performed by comparing pointers to the underlying data.
+//! If they represent the same interned symbol,
+//! then pointers will match.
+//! If not,
+//! the underlying values will be compared.
+//! Since they are known to be different,
+//! this comparison should go quickly,
+//! unless the two strings being compared share a common prefix.
+//!
+//! In situations where it is not acceptable to fall back to a potentially
+//! expensive negative case,
+//! you may compare interned values by memory location using
+//! [`Symbol::ptr_eq`]
+//! (so named after [`Rc::ptr_eq`]):
+//!
+//! ```
+//! use tamer::sym::{Symbol, SymbolRc};
+//!
+//! let a = SymbolRc::new("foo");
+//! let b = SymbolRc::new(&"foo".to_string());
+//!
+//! assert_eq!(a, b);
+//! assert_eq!(*a, *b);
+//! assert!(a.ptr_eq(&a.clone()));
+//! assert!(!a.ptr_eq(&b));
+//! ```
+//!
+//!
+//! Name Mangling
+//! =============
+//! Interners do not perform [name mangling][],
+//! nor is it yet clear if they should.
+//! For future consideration,
+//! see [RFC 2603][rfc-2603] and the [Itanium ABI][itanium-abi].
+//!
+//! [name mangling]: https://en.wikipedia.org/wiki/Name_mangling
+//! [rfc-2603]: https://rust-lang.github.io/rfcs/2603-symbol-name-mangling-v2.html
+//! [itanium-abi]: http://refspecs.linuxbase.org/cxxabi-1.86.html#mangling
+//!
+//!
+//! Related Work and Further Reading
+//! ================================
+//! String interning is often tightly coupled with symbols (in the generic
+//! sense),
+//! sometimes called atoms.
+//! Symbols can often be either interned,
+//! and therefore compared for equivalency,
+//! or _uninterned_,
+//! which makes them unique even to symbols of the same name.
+//! Interning may also be done automatically by a language for performance.
+//! Languages listed below that allow for explicit interning may also
+//! perform automatic interning as well
+//! (for example, `'symbol` in Lisp and `lowercase_vars` as atoms in
+//! Erlang).
+//!
+//! | Language | Interned | Uninterned |
+//! | -------- | -------- | ---------- |
+//! | [Erlang][] | [`list_to_atom`][edt] | _(None)_ |
+//! | [GNU Emacs Lisp][] | [`intern`][es], [`intern-soft`][es] | [`make-symbol`][es], [`gensym`][es] |
+//! | [GNU Guile][] | [`string->symbol`][gs], [`gensym`][gs] | [`make-symbol`][gu] |
+//! | [JavaScript][] | [`Symbol.for`][js] | [`Symbol`][js] |
+//! | [Java][] | [`intern`][jvs] | _(None)_ |
+//! | [Lua][] | _(Automatic for string performance)_ | _(None)_ |
+//! | [MIT/GNU Scheme][] | [`intern`][ms], [`intern-soft`][ms], [`string->symbol`][ms] | [`string->uninterned-symbol`][ms], [`generate-uninterned-symbol`][ms] |
+//! | [PHP][] | _(Automatic for string [performance][pp])_ | _(None)_ |
+//! | [Python][] | [`sys.intern`][pys] | _(None)_ |
+//! | [R6RS Scheme][] | [`string->symbol`][r6s] | _(None)_ |
+//! | [Racket][] | [`string->symbol`][rs], [`string->unreadable-symbol`][rs] | [`string->uninterned-symbol`][rs], [`gensym`][rs] |
+//!
+//! [gnu guile]: https://www.gnu.org/software/guile/
+//! [gs]: https://www.gnu.org/software/guile/manual/html_node/Symbol-Primitives.html#Symbol-Primitives
+//! [gu]: https://www.gnu.org/software/guile/manual/html_node/Symbol-Uninterned.html#Symbol-Uninterned
+//! [gnu emacs lisp]: https://www.gnu.org/software/emacs/
+//! [es]: https://www.gnu.org/software/emacs/manual/html_node/elisp/Creating-Symbols.html
+//! [racket]: https://racket-lang.org/
+//! [rs]: https://docs.racket-lang.org/reference/symbols.html
+//! [r6rs scheme]: http://www.r6rs.org/
+//! [r6s]: http://www.r6rs.org/final/html/r6rs/r6rs-Z-H-14.html
+//! [mit/gnu scheme]: https://www.gnu.org/software/mit-scheme/
+//! [ms]: https://www.gnu.org/software/mit-scheme/documentation/mit-scheme-ref/Symbols.html
+//! [javascript]: https://developer.mozilla.org/en-US/docs/Web/JavaScript
+//! [js]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Symbol
+//! [java]: http://openjdk.java.net/
+//! [jvs]: https://cr.openjdk.java.net/~iris/se/12/latestSpec/api/java.base/java/lang/String.html#intern()
+//! [php]: https://www.php.net/
+//! [pp]: https://wiki.php.net/rfc/performanceimprovements
+//! [erlang]: https://erlang.org/
+//! [edt]: http://erlang.org/doc/reference_manual/data_types.html
+//! [lua]: https://www.lua.org/
+//! [python]: https://www.python.org/
+//! [pys]: https://docs.python.org/3/library/sys.html
+//!
+//! More information:
+//! - Wikipedia entry on [string interning][].
+//! - The [flyweight pattern][] in object-oriented programming is a type
+//! of interning.
+//! - [RFC 1845][rfc-1845] gives an example of string interning using
+//! `Rc<str>`,
+//! which is used by [`SymbolRc`].
+//! - Emacs directly exposes the intern pool at runtime as
+//! [`obarray`][es].
+//! - [`string-cache`][rust-string-cache] is a string interning system
+//! for Rust developed by Mozilla for Servo.
+//! - [`string-interner`][rust-string-interner] is another string
+//! interning library for Rust.
+//!
+//! [flyweight pattern]: https://en.wikipedia.org/wiki/Flyweight_pattern
+//! [rust-string-cache]: https://github.com/servo/string-cache
+//! [rust-string-interner]: https://github.com/robbepop/string-interner
+
+use ::fnv::FnvBuildHasher;
+use std::collections::hash_map::RandomState;
+use std::collections::HashSet;
+use std::hash::{BuildHasher, Hash};
+use std::ops::Deref;
+use std::rc::Rc;
+
+/// Reference to an interned string.
+///
+/// This encapsulates implementation details regarding the internment
+/// process and allows it to change over time,
+/// or even vary between subsystems.
+///
+/// An interned string must be treated as if it were the underlying
+/// interned type `T` by implementing [`Deref`].
+/// Further,
+/// since interned string should be mere references to data,
+/// they ought to be cheap to copy.
+/// However,
+/// since reference counting may be involved,
+/// we must use [`Clone`] instead of [`Copy`].
+///
+/// Storing Interns
+/// ===============
+/// _This is a wrapper for an interned value;
+/// it does not necessary mean that a value was actually interned!_
+/// It ought to be created using an [`Interner`],
+/// which is responsible for creation, storage, and retrieval of interned
+/// values.
+pub trait Symbol: Deref<Target = str> + Clone + Eq + Hash {
+ /// Represent `value` as an interned string.
+ ///
+ /// _This does not perform internment;_ see
+ /// [`Interner`]. This merely provides a wrapper for interned
+ /// values which are then stored by an `Interner`.
+ fn new(value: &str) -> Self;
+
+ /// Compare underlying memory location for equality with the memory
+ /// location of another interned value.
+ ///
+ /// Symbol strings are [`Eq`] one-another if their underlying values
+ /// are also `Eq`.
+ /// This allows comparing underlying memory locations for equality,
+ /// both to ensure that internment is actually being performed,
+ /// and to permit separate internment systems sharing the same string
+ /// set.
+ fn ptr_eq(&self, other: &Self) -> bool;
+}
+
+/// An `Rc`-backed reference to an interned string.
+///
+/// This is a zero-cost abstraction and should provide no additional
+/// overhead over using `Rc` directly.
+#[derive(Debug, Eq, PartialEq, Hash)]
+pub struct SymbolRc(Rc<str>);
+
+impl Symbol for SymbolRc {
+ /// Represent `value` as an interned string.
+ ///
+ /// _This does not perform internment;_ see
+ /// [`Interner`]. This merely provides a wrapper for interned
+ /// values which are then stored by an `Interner`.
+ #[inline]
+ fn new(value: &str) -> Self {
+ Self(value.into())
+ }
+
+ /// Compare underlying memory location for equality with the memory
+ /// location of another interned value.
+ ///
+ /// ```
+ /// use tamer::sym::{Symbol, SymbolRc};
+ ///
+ /// let a = SymbolRc::new("foo");
+ /// let b = SymbolRc::new(&"foo".to_string());
+ ///
+ /// assert!(a.ptr_eq(&a.clone()));
+ /// assert!(!a.ptr_eq(&b));
+ /// ```
+ ///
+ /// For more information,
+ /// see [`Symbol::ptr_eq`].
+ #[inline]
+ fn ptr_eq(&self, other: &Self) -> bool {
+ Rc::ptr_eq(&self.0, &other.0)
+ }
+}
+
+impl Deref for SymbolRc {
+ type Target = str;
+
+ /// Retrieve underlying string slice.
+ #[inline]
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl Clone for SymbolRc {
+ /// Clone a reference to the interned value.
+ ///
+ /// This creates another pointer to the same internal value, increasing
+ /// the strong reference count on the underlying [`Rc`].
+ #[inline]
+ fn clone(&self) -> Self {
+ Self(Rc::clone(&self.0))
+ }
+}
+
+impl From<&str> for SymbolRc {
+ #[inline]
+ fn from(value: &str) -> Self {
+ Self::new(value)
+ }
+}
+
+/// Create, store, compare, and retrieve [`Symbol`] values.
+///
+/// Interners are generic over the type of their interned values.
+/// They accept string slices and produce values of type [`Symbol`].
+///
+/// The intended use case is to attempt to blindly
+/// [`intern`](Interner::intern) slices;
+/// if it has already been interned,
+/// then you will receive a copy of the existing [`Symbol`] value.
+/// If it hasn't been interned,
+/// then it will be added.
+///
+/// If you care whether a value has been interned yet or not,
+/// use [`contains`](Interner::contains).
+///
+/// See the [module-level documentation](self) for an example.
+pub trait Interner<T: Symbol> {
+ /// Intern a string slice or return an existing matching intern.
+ ///
+ /// If the provided string has already been interned,
+ /// then the existing [`Symbol`] value will be cloned and returned.
+ /// Otherwise,
+ /// the string will be interned and a clone returned.
+ ///
+ /// To retrieve an existing intern _without_ interning,
+ /// see [`intern_soft`](Interner::intern_soft).
+ fn intern(&mut self, value: &str) -> T;
+
+ /// Retrieve an existing intern for the string slice `s`.
+ ///
+ /// Unlike [`intern`](Interner::intern),
+ /// this will _not_ intern the string if it has not already been
+ /// interned.
+ fn intern_soft(&mut self, value: &str) -> Option<T>;
+
+ /// Determine whether the given value has already been interned.
+ fn contains(&self, value: &str) -> bool;
+
+ /// Number of interned strings.
+ ///
+ /// This count will increase each time a unique string is interned.
+ /// It does not increase when a string is already interned.
+ fn len(&self) -> usize;
+}
+
+/// An interner backed by a [`HashSet`].
+///
+/// This interner is appropriate to be used when symbols need only be
+/// interned and have no associated data.
+///
+/// For the recommended configuration,
+/// see [`DefaultSetInterner`].
+///
+/// See the [module-level documentation](self) for an example.
+pub struct HashSetInterner<T = SymbolRc, S = RandomState>
+where
+ T: Symbol,
+ S: BuildHasher,
+{
+ /// Intern pool.
+ map: HashSet<T, S>,
+}
+
+impl<T, S> HashSetInterner<T, S>
+where
+ T: Symbol,
+ S: BuildHasher + Default,
+{
+ /// Create a new interner with an underlying [`HashSet`].
+ ///
+ /// Prefer [`with_capacity`](HashSetInterner::with_capacity) over this
+ /// function if capacity can be reasonably estimated.
+ #[inline]
+ pub fn new() -> Self {
+ Self {
+ map: HashSet::with_hasher(Default::default()),
+ }
+ }
+
+ /// Create a new interner with at least the specified capacity for the
+ /// underlying [`HashSet`].
+ ///
+ /// This method of construction should be used any time a capacity can
+ /// be predicated since it reduces the chance of reallocations as
+ /// interns exceed the capacity.
+ /// For example,
+ /// if interns are used to represent symbols and most packages contain
+ /// at least `N` symbols,
+ /// then `capacity` ought to be ≥N.
+ ///
+ /// For more information,
+ /// see [`HashSet::with_capacity`].
+ ///
+ /// See also `intern` benchmarks.
+ pub fn with_capacity(capacity: usize) -> Self {
+ Self {
+ map: HashSet::with_capacity_and_hasher(
+ capacity,
+ Default::default(),
+ ),
+ }
+ }
+}
+
+impl<T, S> Interner<T> for HashSetInterner<T, S>
+where
+ T: Symbol,
+ S: BuildHasher,
+{
+ fn intern(&mut self, value: &str) -> T {
+ let intern = T::new(value);
+
+ if !self.map.contains(&intern) {
+ self.map.insert(intern.clone());
+ return intern;
+ }
+
+ self.map.get(&intern).unwrap().clone()
+ }
+
+ #[inline]
+ fn intern_soft(&mut self, value: &str) -> Option<T> {
+ self.map.get(&T::new(value)).map(|s| s.clone())
+ }
+
+ #[inline]
+ fn contains(&self, value: &str) -> bool {
+ self.map.contains(&T::new(value))
+ }
+
+ #[inline]
+ fn len(&self) -> usize {
+ self.map.len()
+ }
+}
+
+/// Interner using the [Fowler-Noll-Vo][fnv] (FNV) hashing function.
+///
+/// _This is currently the hash function used by [`DefaultSetInterner`]._
+///
+/// If the intern pool will contains strings primarily of length ≤ 60,
+/// and if denial of service is not a concern,
+/// then use of the FNV hash function will outperform the default
+/// [`DefaultHasher`](std::collections::hash_map::DefaultHasher)
+/// (which uses SipHash at the time of writing).
+/// See intern benchmarks for a comparison.
+/// For a comparison of various hashing algorithms,
+/// see the [`hash-rs`][hash-rs] project.
+///
+/// [hash-rs]: https://github.com/Gankra/hash-rs
+pub type FnvHashSetInterner<T> = HashSetInterner<T, FnvBuildHasher>;
+
+/// Recommended configuration for set-based interners.
+///
+/// The choice of this default relies on certain assumptions:
+/// - Denial-of-service attacks against the hash function are not a
+/// concern; and
+/// - Strings to be interned are mostly ≤ 60 characters.
+///
+/// For more information on the hashing algorithm,
+/// see [`FnvHashSetInterner`].
+pub type DefaultSetInterner = FnvHashSetInterner<SymbolRc>;
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ mod interned_rc {
+ use super::*;
+
+ /// Dereferencing should fall through all layers to the underlying `T`.
+ #[test]
+ fn derefs_to_inner_value() {
+ let expected = "foo";
+ let sut = SymbolRc::new(expected);
+
+ assert_eq!(expected, &*sut);
+ }
+
+ /// Symbol string must be easily cloned and must represent the same
+ /// underlying data.
+ #[test]
+ fn clones_equal() {
+ let expected = "bar";
+ let sut = SymbolRc::new(expected);
+
+ // This is the true test
+ assert!(sut.ptr_eq(&sut.clone()));
+ assert_eq!(sut, sut.clone());
+
+ // But we also want to verify derefs.
+ assert_eq!(*sut, *sut.clone());
+ }
+
+ #[test]
+ fn not_ptr_eq_if_memory_locations_differ() {
+ let a = SymbolRc::new("foo");
+ let b = SymbolRc::new(&"foo".to_string());
+
+ // Pointers differ
+ assert!(!a.ptr_eq(&b));
+
+ // But values do not
+ assert_eq!(*a, *b);
+ }
+ }
+
+ mod hash_set_interner {
+ use super::*;
+
+ /// Two different string values in memory should be interned to the
+ /// same object if they represent the same data.
+ #[test]
+ fn recognizes_equal_strings() {
+ let a = "foo";
+ let b = a.to_string();
+ let c = "bar";
+ let d = c.to_string();
+
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ let (ia, ib, ic, id) =
+ (sut.intern(a), sut.intern(&b), sut.intern(c), sut.intern(&d));
+
+ assert!(ia.ptr_eq(&ib));
+ assert_eq!(ia, ib);
+ assert_eq!(&ia, &ib);
+ assert_eq!(*ia, *ib);
+
+ assert!(ic.ptr_eq(&id));
+ assert_eq!(ic, id);
+ assert_eq!(&ic, &id);
+ assert_eq!(*ic, *id);
+
+ assert!(!ia.ptr_eq(&ic));
+ assert_ne!(ia, ic);
+ assert_ne!(&ia, &ic);
+ assert_ne!(*ia, *ic);
+ }
+
+ #[test]
+ fn length_increases_with_each_new_intern() {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ assert_eq!(0, sut.len(), "invalid empty len");
+
+ sut.intern("foo");
+ assert_eq!(1, sut.len(), "increment len");
+
+ // duplicate
+ sut.intern("foo");
+ assert_eq!(1, sut.len(), "do not increment len on duplicates");
+
+ sut.intern("bar");
+ assert_eq!(2, sut.len(), "increment len (2)");
+ }
+
+ #[test]
+ fn can_check_wither_string_is_interned() {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ assert!(!sut.contains("foo"), "recognize missing value");
+ sut.intern("foo");
+ assert!(sut.contains("foo"), "recognize interned value");
+ }
+
+ #[test]
+ fn intern_soft() {
+ let mut sut = HashSetInterner::<SymbolRc>::new();
+
+ assert_eq!(None, sut.intern_soft("foo"));
+
+ let foo = sut.intern("foo");
+ assert_eq!(Some(foo), sut.intern_soft("foo"));
+ }
+
+ #[test]
+ fn new_with_capacity() {
+ let n = 512;
+ let sut = HashSetInterner::<SymbolRc>::with_capacity(n);
+
+ // note that this is not publicly available
+ assert!(sut.map.capacity() >= n);
+ }
+ }
+}