Mike Gerwitz

Activist for User Freedom

aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMike Gerwitz <mike.gerwitz@ryansg.com>2020-04-29 00:48:07 -0400
committerMike Gerwitz <mike.gerwitz@ryansg.com>2020-04-29 11:33:41 -0400
commit0127d4b6984a2512ea731c58c6afd41cabc9a2a3 (patch)
treea4e29fb759991cb368dd74490c36bc9edf1dc615
parent4b643385c810d61a5625d4968b3260b983bb9b74 (diff)
downloadtame-0127d4b6984a2512ea731c58c6afd41cabc9a2a3.tar.gz
tame-0127d4b6984a2512ea731c58c6afd41cabc9a2a3.tar.bz2
tame-0127d4b6984a2512ea731c58c6afd41cabc9a2a3.zip
TAMER: sym::Interner::index_lookup
This was originally omitted because there wasn't a use case for it. Now that we're adding context to errors, however, an owned value is highly desirable. This adds almost no measurable overhead to the internment system in benchmarks (largely within the margin of error).
-rw-r--r--tamer/benches/sym.rs15
-rw-r--r--tamer/src/lib.rs7
-rw-r--r--tamer/src/sym.rs105
3 files changed, 106 insertions, 21 deletions
diff --git a/tamer/benches/sym.rs b/tamer/benches/sym.rs
index b5f0320..120b10e 100644
--- a/tamer/benches/sym.rs
+++ b/tamer/benches/sym.rs
@@ -107,6 +107,21 @@ mod interner {
});
}
+ #[bench]
+ fn index_lookup_unique_1000(bench: &mut Bencher) {
+ let sut = ArenaInterner::<RandomState>::new();
+ let strs = gen_strs(1000);
+
+ let syms = strs
+ .iter()
+ .map(|s| sut.intern(s).index())
+ .collect::<Vec<_>>();
+
+ bench.iter(|| {
+ syms.iter().map(|si| sut.index_lookup(*si)).for_each(drop);
+ });
+ }
+
mod fx {
use super::*;
use fxhash::FxBuildHasher;
diff --git a/tamer/src/lib.rs b/tamer/src/lib.rs
index ce77d84..569d74d 100644
--- a/tamer/src/lib.rs
+++ b/tamer/src/lib.rs
@@ -22,6 +22,9 @@
pub mod global;
#[macro_use]
+extern crate lazy_static;
+
+#[macro_use]
pub mod sym;
pub mod fs;
@@ -30,8 +33,4 @@ pub mod ld;
pub mod obj;
#[cfg(test)]
-#[macro_use]
-extern crate lazy_static;
-
-#[cfg(test)]
pub mod test;
diff --git a/tamer/src/sym.rs b/tamer/src/sym.rs
index 8ed88f9..dbcbdb2 100644
--- a/tamer/src/sym.rs
+++ b/tamer/src/sym.rs
@@ -74,6 +74,9 @@
//! assert_eq!(SymbolIndex::from_u32(1), ib.index());
//! assert_eq!(SymbolIndex::from_u32(2), ic.index());
//! assert_eq!(SymbolIndex::from_u32(1), id.index());
+//!
+//! // Symbols can also be looked up by index.
+//! assert_eq!(Some(ia), interner.index_lookup(ia.index()));
//! ```
//!
//! What Is String Interning?
@@ -126,6 +129,8 @@
//! (see [`Symbol::index`]),
//! which provides a dense range of values suitable for use in vectors
//! as an alternative to [`HashMap`] for mapping symbol data.
+//! A [`SymbolIndex`] can be mapped back into its associated [`Symbol`]
+//! using [`Interner::index_lookup`].
//!
//! Since a reference to the same [`Symbol`] is returned for each
//! [`Interner::intern`] and [`Interner::intern_soft`] call,
@@ -264,7 +269,7 @@
use crate::global;
use bumpalo::Bump;
use fxhash::FxBuildHasher;
-use std::cell::{Cell, RefCell};
+use std::cell::RefCell;
use std::collections::HashMap;
use std::convert::TryInto;
use std::fmt;
@@ -286,12 +291,6 @@ use std::ops::Deref;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct SymbolIndex(global::NonZeroProgSymSize);
-impl From<SymbolIndex> for usize {
- fn from(value: SymbolIndex) -> usize {
- value.0.get().try_into().unwrap()
- }
-}
-
impl SymbolIndex {
/// Construct index from a non-zero `u32` value.
///
@@ -313,6 +312,18 @@ impl SymbolIndex {
}
}
+impl From<SymbolIndex> for usize {
+ fn from(value: SymbolIndex) -> usize {
+ value.0.get() as usize
+ }
+}
+
+impl<'i> From<&Symbol<'i>> for SymbolIndex {
+ fn from(sym: &Symbol<'i>) -> Self {
+ sym.index()
+ }
+}
+
/// Interned string.
///
/// A reference to this symbol is returned each time the same string is
@@ -330,6 +341,8 @@ impl SymbolIndex {
/// this is especially beneficial for portions of the system that make
/// use of nearly all interned symbols,
/// like the ASG.
+/// A [`SymbolIndex`] can be mapped back into its [`Symbol`] by calling
+/// [`Interner::index_lookup`] on the same interner that produced it.
///
/// The symbol also stores a string slice referencing the interned string
/// itself,
@@ -456,6 +469,21 @@ pub trait Interner<'i> {
/// It does not increase when a string is already interned.
fn len(&self) -> usize;
+ /// Look up a previously interned [`Symbol`] by its [`SymbolIndex`].
+ ///
+ /// This will always return a [`Symbol`] as long as the provided `index`
+ /// represents a symbol interned with this interner.
+ /// If the index is not found,
+ /// the result is [`None`].
+ ///
+ /// This method is most useful when storing [`Symbol`] is not possible
+ /// or desirable.
+ /// For example,
+ /// borrowed [`Symbol`] references require lifetimes,
+ /// whereas [`SymbolIndex`] is both owned _and_ [`Copy`].
+ /// [`SymbolIndex`] is also much smaller than [`Symbol`].
+ fn index_lookup(&'i self, index: SymbolIndex) -> Option<&'i Symbol<'i>>;
+
/// Intern an assumed-UTF8 slice of bytes or return an existing
/// [`Symbol`].
///
@@ -492,12 +520,14 @@ where
/// String and [`Symbol`] storage.
arena: Bump,
- /// Next available symbol index.
+ /// Symbol references by index.
+ ///
+ /// This vector enables looking up a [`Symbol`] using its
+ /// [`SymbolIndex`].
///
- /// This must always be ≥1.
- /// It is not defined as `NonZeroProgSymSize` because
- /// `intern` enforces the invariant.
- next_index: Cell<global::ProgSymSize>,
+ /// The first index must always be populated during initialization to
+ /// ensure that [`SymbolIndex`] will never be `0`.
+ indexes: RefCell<Vec<&'i Symbol<'i>>>,
/// Map of interned strings to their respective [`Symbol`].
///
@@ -505,6 +535,17 @@ where
map: RefCell<HashMap<&'i str, &'i Symbol<'i>, S>>,
}
+lazy_static! {
+ /// Dummy [`Symbol`] for use at index `0`.
+ ///
+ /// A symbol must never have an index of `0`,
+ /// so this can be used as a placeholder.
+ /// The chosen [`SymbolIndex`] here does not matter since this will
+ /// never be referenced.
+ static ref DUMMY_SYM: Symbol<'static> =
+ Symbol::new(SymbolIndex::from_u32(1), "!BADSYMREF!");
+}
+
impl<'i, S> ArenaInterner<'i, S>
where
S: BuildHasher + Default,
@@ -534,9 +575,14 @@ where
/// [consistent]: https://en.wikipedia.org/wiki/Consistent_hashing
#[inline]
pub fn with_capacity(capacity: usize) -> Self {
+ let mut indexes = Vec::<&'i Symbol<'i>>::with_capacity(capacity);
+
+ // The first index is not used since SymbolIndex cannot be 0.
+ indexes.push(&DUMMY_SYM);
+
Self {
arena: Bump::new(),
- next_index: Cell::new(1),
+ indexes: RefCell::new(indexes),
map: RefCell::new(HashMap::with_capacity_and_hasher(
capacity,
Default::default(),
@@ -556,9 +602,15 @@ where
return sym;
}
- let next_index = self.next_index.get();
+ let mut syms = self.indexes.borrow_mut();
+
+ let next_index: u32 = syms
+ .len()
+ .try_into()
+ .expect("internal error: SymbolIndex range exhausted");
- // Next_index should always be initialized to at least 1.
+ // This is not actually unsafe because next_index is always >0
+ // from initialization.
debug_assert!(next_index != 0);
let id = unsafe { SymbolIndex::from_u32_unchecked(next_index) };
@@ -571,10 +623,10 @@ where
// Symbols are also stored within the arena, adjacent to the
// string. This ensures that both have stable locations in memory.
- let sym: &'i Symbol<'i> = self.arena.alloc(Symbol::new(id, &clone));
+ let sym: &'i Symbol<'i> = self.arena.alloc(Symbol::new(id, clone));
map.insert(clone, sym);
- self.next_index.set(next_index + 1);
+ syms.push(sym);
sym
}
@@ -593,6 +645,13 @@ where
fn len(&self) -> usize {
self.map.borrow().len()
}
+
+ fn index_lookup(&'i self, index: SymbolIndex) -> Option<&'i Symbol<'i>> {
+ self.indexes
+ .borrow()
+ .get(index.0.get() as usize)
+ .map(|sym| *sym)
+ }
}
/// Interner using the [Fx Hash][fxhash] hashing function.
@@ -834,5 +893,17 @@ mod test {
assert_eq!(a, b);
}
+
+ #[test]
+ fn lookup_symbol_by_index() {
+ let sut = Sut::new();
+
+ // Symbol does not yet exist.
+ assert!(sut.index_lookup(SymbolIndex::from_u32(1)).is_none());
+
+ let sym = sut.intern("foo");
+ assert_eq!(Some(sym), sut.index_lookup(sym.index()));
+ assert_eq!(Some(sym), sut.index_lookup(sym.into()));
+ }
}
}