Neater hashing interface #4524

Merged Dec 18, 2024 (26 commits)

Commits
d071489
hashlib: redo interface for flexibility
widlarizer Oct 1, 2024
953508f
driver: add --hash-seed
widlarizer Oct 1, 2024
c10b3f5
abc: sort stats
widlarizer Oct 4, 2024
db04788
hashlib: fix pyosys
widlarizer Oct 9, 2024
c73c880
hashlib: only include in one place
widlarizer Oct 15, 2024
b8738e2
hashlib: use hash_t across the board
widlarizer Oct 18, 2024
582259f
hashlib: hash_t can be set to 64-bit
widlarizer Oct 18, 2024
209ab6f
hashlib: fudge always
widlarizer Oct 18, 2024
c1af19f
hashlib: don't xorshift in between upper and lower word
widlarizer Oct 30, 2024
4d14399
hashlib: allow forcing Hasher state, use it for IdString trivial hashing
widlarizer Oct 30, 2024
b7991ed
hashlib: prevent naive hashing of IdString when hashing SigBit
widlarizer Oct 30, 2024
52b0fc0
hash: solo hashing interface, override for SigBit
widlarizer Nov 4, 2024
704a58a
hashlib: restore hash_obj_ops for pointers to indexed types
widlarizer Nov 4, 2024
02a5783
hashlib: remove is_new from HasherDJB32, implement hash_top for IdString
widlarizer Nov 6, 2024
0dafe06
hashlib: run_hash uses hash_top_ops, not hash_ops
widlarizer Nov 6, 2024
ad0dc17
docs: document the ideas behind the hashing interface
widlarizer Nov 6, 2024
0454787
Docs: Formatting and fixes
KrystalDelusion Nov 6, 2024
1401906
docs: formatting and fixes
widlarizer Nov 6, 2024
6d53454
docs: move hashing-based container details into internal docs from gu…
widlarizer Nov 11, 2024
79acc14
hashlib: add deprecated mkhash function to prevent plugin breakage
widlarizer Nov 11, 2024
4e29ec1
hashlib: acc -> eat
widlarizer Nov 11, 2024
1df8a3e
hashlib: legacy mkhash_add -> djb2_add
widlarizer Nov 11, 2024
0a525f3
hashlib: declare YS_HASHING_VERSION = 1
widlarizer Nov 19, 2024
b9b9515
hashlib: hash_eat -> hash_into
widlarizer Nov 19, 2024
ed70038
hashlib: fixes from jix
widlarizer Nov 20, 2024
026e9da
hashlib: fixes from jix
widlarizer Nov 20, 2024
Files changed
26 changes: 13 additions & 13 deletions backends/cxxrtl/cxxrtl_backend.cc
@@ -47,7 +47,7 @@ struct Scheduler {
struct Vertex {
T *data;
Vertex *prev, *next;
pool<Vertex*, hash_ptr_ops> preds, succs;
pool<Vertex*> preds, succs;

Vertex() : data(NULL), prev(this), next(this) {}
Vertex(T *data) : data(data), prev(NULL), next(NULL) {}
@@ -300,10 +300,10 @@ struct FlowGraph {
};

std::vector<Node*> nodes;
dict<const RTLIL::Wire*, pool<Node*, hash_ptr_ops>> wire_comb_defs, wire_sync_defs, wire_uses;
dict<Node*, pool<const RTLIL::Wire*>, hash_ptr_ops> node_comb_defs, node_sync_defs, node_uses;
dict<const RTLIL::Wire*, pool<Node*>> wire_comb_defs, wire_sync_defs, wire_uses;
dict<Node*, pool<const RTLIL::Wire*>> node_comb_defs, node_sync_defs, node_uses;
dict<const RTLIL::Wire*, bool> wire_def_inlinable;
dict<const RTLIL::Wire*, dict<Node*, bool, hash_ptr_ops>> wire_use_inlinable;
dict<const RTLIL::Wire*, dict<Node*, bool>> wire_use_inlinable;
dict<RTLIL::SigBit, bool> bit_has_state;

~FlowGraph()
@@ -365,7 +365,7 @@ struct FlowGraph {
return false;
}

bool is_inlinable(const RTLIL::Wire *wire, const pool<Node*, hash_ptr_ops> &nodes) const
bool is_inlinable(const RTLIL::Wire *wire, const pool<Node*> &nodes) const
{
// Can the wire be inlined, knowing that the given nodes are reachable?
if (nodes.size() != 1)
@@ -3080,7 +3080,7 @@ struct CxxrtlWorker {
// without feedback arcs can generally be evaluated in a single pass, i.e. it always requires only
// a single delta cycle.
Scheduler<FlowGraph::Node> scheduler;
dict<FlowGraph::Node*, Scheduler<FlowGraph::Node>::Vertex*, hash_ptr_ops> node_vertex_map;
dict<FlowGraph::Node*, Scheduler<FlowGraph::Node>::Vertex*> node_vertex_map;
for (auto node : flow.nodes)
node_vertex_map[node] = scheduler.add(node);
for (auto node_comb_def : flow.node_comb_defs) {
@@ -3095,7 +3095,7 @@ struct CxxrtlWorker {

// Find out whether the order includes any feedback arcs.
std::vector<FlowGraph::Node*> node_order;
pool<FlowGraph::Node*, hash_ptr_ops> evaluated_nodes;
pool<FlowGraph::Node*> evaluated_nodes;
pool<const RTLIL::Wire*> feedback_wires;
for (auto vertex : scheduler.schedule()) {
auto node = vertex->data;
@@ -3139,7 +3139,7 @@ struct CxxrtlWorker {
}

// Discover nodes reachable from primary outputs (i.e. members) and collect reachable wire users.
pool<FlowGraph::Node*, hash_ptr_ops> worklist;
pool<FlowGraph::Node*> worklist;
for (auto node : flow.nodes) {
if (node->type == FlowGraph::Node::Type::CELL_EVAL && !is_internal_cell(node->cell->type))
worklist.insert(node); // node evaluates a submodule
@@ -3159,8 +3159,8 @@ struct CxxrtlWorker {
worklist.insert(node); // node drives public wires
}
}
dict<const RTLIL::Wire*, pool<FlowGraph::Node*, hash_ptr_ops>> live_wires;
pool<FlowGraph::Node*, hash_ptr_ops> live_nodes;
dict<const RTLIL::Wire*, pool<FlowGraph::Node*>> live_wires;
pool<FlowGraph::Node*> live_nodes;
while (!worklist.empty()) {
auto node = worklist.pop();
live_nodes.insert(node);
@@ -3290,15 +3290,15 @@ struct CxxrtlWorker {

// Discover nodes reachable from primary outputs (i.e. outlines) up until primary inputs (i.e. members)
// and collect reachable wire users.
pool<FlowGraph::Node*, hash_ptr_ops> worklist;
pool<FlowGraph::Node*> worklist;
for (auto node : flow.nodes) {
if (flow.node_comb_defs.count(node))
for (auto wire : flow.node_comb_defs[node])
if (debug_wire_types[wire].is_outline())
worklist.insert(node); // node drives outline
}
dict<const RTLIL::Wire*, pool<FlowGraph::Node*, hash_ptr_ops>> debug_live_wires;
pool<FlowGraph::Node*, hash_ptr_ops> debug_live_nodes;
dict<const RTLIL::Wire*, pool<FlowGraph::Node*>> debug_live_wires;
pool<FlowGraph::Node*> debug_live_nodes;
while (!worklist.empty()) {
auto node = worklist.pop();
debug_live_nodes.insert(node);
155 changes: 155 additions & 0 deletions docs/source/yosys_internals/hashing.rst
@@ -0,0 +1,155 @@
Hashing and associative data structures in Yosys
------------------------------------------------

[Review note from a maintainer: "Leaving a note that I read this file"]

Container classes based on hashing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Yosys uses ``dict<K, T>`` and ``pool<T>`` as its main container classes.
``dict<K, T>`` is essentially a replacement for ``std::unordered_map<K, T>``
and ``pool<T>`` is a replacement for ``std::unordered_set<T>``.
The main characteristics are:

* ``dict<K, T>`` and ``pool<T>`` are about 2x faster than the std containers
(though this claim hasn't been verified for over 10 years)

* references to elements in a ``dict<K, T>`` or ``pool<T>`` are invalidated by
insert and remove operations (similar to ``std::vector<T>`` on ``push_back()``).

* some iterators are invalidated by ``erase()``. Specifically, iterators
that have not yet passed the erased element are invalidated. (``erase()``
itself returns a valid iterator to the next element.)

* no iterators are invalidated by ``insert()``. Elements are inserted at
``begin()``, i.e. only a new iterator that starts at ``begin()`` will see the
inserted elements.

* the method ``.count(key, iterator)`` is like ``.count(key)`` but only
considers elements that can be reached via the iterator.

* iterators can be compared. ``it1 < it2`` means that the position of ``it2``
can be reached via ``it1`` but not vice versa.

* the method ``.sort()`` can be used to sort the elements in the container.
The container stays sorted until elements are added or removed.

* ``dict<K, T>`` and ``pool<T>`` will have the same order of iteration across
all compilers, standard libraries and architectures.
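
As a quick illustration of the interface described above, here is a minimal,
self-contained usage sketch (not taken from the Yosys sources; it assumes
``kernel/hashlib.h`` can be compiled on its own, which may require adjusting
include paths):

.. code-block:: cpp
   :caption: Illustrative sketch of basic ``dict`` and ``pool`` usage

   #include "kernel/hashlib.h"
   #include <cstdio>

   using namespace hashlib;

   int main()
   {
       dict<int, int> squares;
       pool<int> seen;

       for (int i = 0; i < 5; i++) {
           squares[i] = i * i; // operator[] inserts a default value if missing
           seen.insert(i);
       }

       squares.sort(); // stays sorted until the next insert or erase

       // .at() throws on a missing key, .count() reports membership
       printf("%d %d\n", squares.at(3), (int)seen.count(4)); // prints "9 1"
       return 0;
   }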

In addition to ``dict<K, T>`` and ``pool<T>`` there is also an ``idict<K>`` that
creates a bijective map from ``K`` to the integers. For example:

::

idict<string, 42> si;
log("%d\n", si("hello")); // will print 42
log("%d\n", si("world")); // will print 43
log("%d\n", si.at("world")); // will print 43
log("%d\n", si.at("dummy")); // will throw exception
log("%s\n", si[42].c_str()); // will print hello
log("%s\n", si[43].c_str()); // will print world
log("%s\n", si[44].c_str()); // will throw exception

It is not possible to remove elements from an idict.

Finally ``mfp<K>`` implements a merge-find set data structure (aka. disjoint-set
or union-find) over the type ``K`` ("mfp" = merge-find-promote).

The hash function
~~~~~~~~~~~~~~~~~

The hash function generally used in Yosys is the XOR version of DJB2:

::

state = ((state << 5) + state) ^ value

This is an old-school hash designed to hash ASCII characters. Yosys doesn't hash
a lot of ASCII text, but it still happens to be a local optimum due to factors
described later.
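
For reference, the XOR variant shown above and the ADD variant (used for some
types, as described later in this section) can be sketched as follows. This is
an illustrative sketch only, not the exact hashlib implementation; the helper
names are chosen here for exposition:

.. code-block:: cpp
   :caption: Sketch of the two DJB2 variants

   #include <cstdint>

   // Assuming a 32-bit hash_t here; hashlib's hash_t can also be set to 64-bit.
   using hash_t = uint32_t;

   // XOR variant: state * 33 ^ value
   static inline hash_t djb2_xor(hash_t state, hash_t value)
   {
       return ((state << 5) + state) ^ value;
   }

   // ADD variant: state * 33 + value
   static inline hash_t djb2_add(hash_t state, hash_t value)
   {
       return ((state << 5) + state) + value;
   }

With these definitions, ``djb2_add(1, 2)`` evaluates to 35 and ``djb2_add(1, 3)``
to 36, showing how closely related inputs stay close together in the output.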

Hash function quality is multi-faceted and highly dependent on what is being
hashed. Yosys isn't concerned with cryptographic qualities; instead, the goal is
to minimize the overall risk of hash collisions given the data patterns within
Yosys. In general, a good hash function folds values into a state accumulator
with a mathematical function that is fast to compute and has some beneficial
properties. One of these is the avalanche property, which demands that a small
change in the input, such as flipping a bit or incrementing by one, produces a
large, unpredictable change in the output. Additionally, the bit independence
criterion states that any pair of output bits should change independently when
any single input bit is inverted. These properties help avoid collisions on
common data patterns: for example, the hash of a sequence should not collide
with the hash of a permutation of that sequence, and the information contributed
by earlier elements should not be lost from the state.

DJB2 lacks these properties. Instead, since Yosys hashes large numbers of data
structures composed of incrementing integer IDs, Yosys abuses the predictability
of DJB2 to get fewer hash collisions, with the regular nature of the hashes
surviving through the interaction with the "modulo prime" operations in the
associative data structures. For example, among the most common objects in Yosys
are interned ``IdString``\ s with incrementing indices and ``SigBit``\ s with bit
offsets into a wire (represented by its unique ``IdString`` name). This is what
makes DJB2 a local optimum. Additionally, the ADD version of DJB2 (like above,
but with addition instead of XOR) is used to this end for some types, abandoning
the general pattern of folding values into a state value.

Review discussion attached to this paragraph:

* Member: Is it to get lower hash collisions or to get better locality?

* Collaborator (author): Hash collisions.

* Member: Does this come from observations, or something Claire mentioned was
  intentional? I know some of the primitives were used in a way to get better
  locality.

* Collaborator (author): This comes from my observations when counting hash
  collisions in hashlib per ``std::source_location`` and a clear correlation
  with extra runtime overhead in some ``opt`` and ``extract_fa`` passes where
  the collisions were happening.

* Member: FWIW, cpython also gives reducing collisions as a reason for using
  small integer values directly as hash value:
  https://github.com/python/cpython/blob/7538e7f5696408fa0aa02fce8a413a7dfac76a04/Objects/dictobject.c#L289

* Collaborator (author): Thanks for the resource. I'm perfectly fine with
  hashing an int trivially, but retaining patterns in the output when hashing a
  combination of them sketches me out. cpython:

      >>> bin(hash((1, 2)))
      '-0b11000101000100010101000101001111011111101000000110000010111101'
      >>> bin(hash((1, 3)))
      '-0b1001111111110101001100100011001110110010011111111100101100100'

  But for SigBit we get this:

      >>> print(djb2_add(1, 2))
      35
      >>> print(djb2_add(1, 3))
      36
      >>> print(djb2_add(2, 3))
      69

Making a type hashable
~~~~~~~~~~~~~~~~~~~~~~

Let's first take a look at the external interface at a simplified level.
Generally, to get the hash for ``T obj``, you would call the utility function
``run_hash<T>(const T& obj)``, which corresponds to ``hash_top_ops<T>::hash(obj)``,
the default implementation of which is ``hash_ops<T>::hash_into(Hasher(), obj)``.
``Hasher`` is the class that actually implements the hash function, hiding its
initialized internal state and returning it from ``hash_t yield()``, possibly
after some finalization steps.

``hash_ops<T>`` is the star of the show. By default it pulls the ``Hasher h``
through a ``Hasher T::hash_into(Hasher h)`` method. That's the method you have to
implement to make a record (class or struct) type easily hashable with Yosys
hashlib associative data structures.

``hash_ops<T>`` is specialized for built-in types like ``int`` or ``bool`` and
treats pointers the same as integers, so it doesn't dereference pointers. Since
many RTLIL data structures like ``RTLIL::Wire`` carry their own unique index
``Hasher::hash_t hashidx_;``, there are specializations for ``hash_ops<Wire*>``
and others in ``kernel/hashlib.h`` that actually dereference the pointers and
call ``hash_into`` on the instances pointed to.

``hash_ops<T>`` is also specialized for simple compound types like
``std::pair<U, V>`` by calling ``hash_into`` on its members in sequence. For
variable-size containers like ``std::vector<U>``, the size of the container is
hashed first. That is also how hashing should be implemented for a custom
record data type: unless there is a strong reason to do otherwise, call
``h.eat(m)`` on the ``Hasher h`` you have received for each member ``m`` in
sequence and ``return h;``. If you do have a strong reason, look at how
``hash_top_ops<RTLIL::SigBit>`` is implemented in ``kernel/rtlil.h``.
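
Putting this together, a custom record type can be made hashable roughly as
follows. This is a sketch based on the interface described above, not code from
the Yosys sources; the ``CellPortBit`` type and its members are invented for
illustration:

.. code-block:: cpp
   :caption: Sketch of making a custom record type hashable

   #include "kernel/yosys.h"

   USING_YOSYS_NAMESPACE

   struct CellPortBit {
       RTLIL::Cell *cell;    // RTLIL types with their own index have pointer specializations
       RTLIL::IdString port; // interned identifier
       int offset;

       // Fold each member into the received Hasher and pass it back out.
       Hasher hash_into(Hasher h) const {
           h.eat(cell);
           h.eat(port);
           h.eat(offset);
           return h;
       }

       // hashlib containers also need equality on the key type.
       bool operator==(const CellPortBit &other) const {
           return cell == other.cell && port == other.port && offset == other.offset;
       }
   };

   // With the above in place, the type can be used as a key or element, e.g.:
   //   dict<CellPortBit, int> users;
   //   pool<CellPortBit> visited;
   //   Hasher::hash_t h = run_hash(CellPortBit{some_cell, ID::A, 0});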

Porting plugins from the legacy interface
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Previously, the interface for implementing hashing on custom types was just
``unsigned int T::hash() const``. This meant hashes for members were computed
independently and then combined ad hoc with the hash function, with some
xorshift operations thrown in to mix the bits together somewhat. A plugin can
stay compatible with versions both before and after the break by implementing
both interfaces, selected on the existence and value of ``YS_HASHING_VERSION``.

.. code-block:: cpp
:caption: Example hash compatibility wrapper
:name: hash_plugin_compat

#ifndef YS_HASHING_VERSION
unsigned int T::hash() const {
return mkhash(a, b);
}
#elif YS_HASHING_VERSION == 1
Hasher T::hash_into(Hasher h) const {
h.eat(a);
h.eat(b);
return h;
}
#else
#error "Unsupported hashing interface"
#endif

Feel free to contact Yosys maintainers with related issues.
1 change: 1 addition & 0 deletions docs/source/yosys_internals/index.rst
@@ -39,3 +39,4 @@ as reference to implement a similar system in any language.
extending_yosys/index
techmap
verilog
hashing
2 changes: 1 addition & 1 deletion examples/cxx-api/scopeinfo_example.cc
@@ -90,7 +90,7 @@ struct ScopeinfoExamplePass : public Pass {

// Shuffle wires so this example produces more interesting outputs
std::sort(wires.begin(), wires.end(), [](Wire *a, Wire *b) {
return mkhash_xorshift(a->name.hash() * 0x2c9277b5) < mkhash_xorshift(b->name.hash() * 0x2c9277b5);
return mkhash_xorshift(run_hash(a->name) * 0x2c9277b5) < mkhash_xorshift(run_hash(b->name) * 0x2c9277b5);
});

ModuleHdlnameIndex index(module);
8 changes: 4 additions & 4 deletions flake.nix
@@ -14,15 +14,15 @@
};
# TODO: don't override src when ./abc is empty
# which happens when the command used is `nix build` and not `nix build ?submodules=1`
abc-verifier = pkgs.abc-verifier.overrideAttrs(x: y: {src = ./abc;});
abc-verifier = pkgs.abc-verifier;
yosys = pkgs.clangStdenv.mkDerivation {
name = "yosys";
src = ./. ;
buildInputs = with pkgs; [ clang bison flex libffi tcl readline python3 llvmPackages.libcxxClang zlib git pkg-configUpstream llvmPackages.bintools ];
buildInputs = with pkgs; [ clang bison flex libffi tcl readline python3 zlib git pkg-configUpstream llvmPackages.bintools ];
checkInputs = with pkgs; [ gtest ];
propagatedBuildInputs = [ abc-verifier ];
preConfigure = "make config-clang";
checkTarget = "test";
checkTarget = "unit-test";
installPhase = ''
make install PREFIX=$out ABCEXTERNAL=yosys-abc
ln -s ${abc-verifier}/bin/abc $out/bin/yosys-abc
@@ -41,7 +41,7 @@
packages.default = yosys;
defaultPackage = yosys;
devShell = pkgs.mkShell {
buildInputs = with pkgs; [ clang llvmPackages.bintools bison flex libffi tcl readline python3 llvmPackages.libcxxClang zlib git gtest abc-verifier ];
buildInputs = with pkgs; [ clang llvmPackages.bintools gcc bison flex libffi tcl readline python3 zlib git gtest abc-verifier verilog boost python3Packages.boost ];
};
}
);
2 changes: 1 addition & 1 deletion frontends/ast/ast.h
@@ -177,7 +177,7 @@ namespace AST
{
// for dict<> and pool<>
unsigned int hashidx_;
unsigned int hash() const { return hashidx_; }
Hasher hash_into(Hasher h) const { h.eat(hashidx_); return h; }

// this nodes type
AstNodeType type;
16 changes: 8 additions & 8 deletions frontends/verific/verific.cc
@@ -619,7 +619,7 @@ RTLIL::SigSpec VerificImporter::operatorInportCase(Instance *inst, const char *p
}
}

RTLIL::SigSpec VerificImporter::operatorOutput(Instance *inst, const pool<Net*, hash_ptr_ops> *any_all_nets)
RTLIL::SigSpec VerificImporter::operatorOutput(Instance *inst, const pool<Net*> *any_all_nets)
{
RTLIL::SigSpec sig;
RTLIL::Wire *dummy_wire = NULL;
@@ -1576,9 +1576,9 @@ void VerificImporter::import_netlist(RTLIL::Design *design, Netlist *nl, std::ma

module->fixup_ports();

dict<Net*, char, hash_ptr_ops> init_nets;
pool<Net*, hash_ptr_ops> anyconst_nets, anyseq_nets;
pool<Net*, hash_ptr_ops> allconst_nets, allseq_nets;
dict<Net*, char> init_nets;
pool<Net*> anyconst_nets, anyseq_nets;
pool<Net*> allconst_nets, allseq_nets;
any_all_nets.clear();

FOREACH_NET_OF_NETLIST(nl, mi, net)
@@ -1841,10 +1841,10 @@ void VerificImporter::import_netlist(RTLIL::Design *design, Netlist *nl, std::ma
module->connect(net_map_at(net), module->Anyseq(new_verific_id(net)));

#ifdef VERIFIC_SYSTEMVERILOG_SUPPORT
pool<Instance*, hash_ptr_ops> sva_asserts;
pool<Instance*, hash_ptr_ops> sva_assumes;
pool<Instance*, hash_ptr_ops> sva_covers;
pool<Instance*, hash_ptr_ops> sva_triggers;
pool<Instance*> sva_asserts;
pool<Instance*> sva_assumes;
pool<Instance*> sva_covers;
pool<Instance*> sva_triggers;
#endif

pool<RTLIL::Cell*> past_ffs;
4 changes: 2 additions & 2 deletions frontends/verific/verific.h
@@ -71,7 +71,7 @@ struct VerificImporter

std::map<Verific::Net*, RTLIL::SigBit> net_map;
std::map<Verific::Net*, Verific::Net*> sva_posedge_map;
pool<Verific::Net*, hash_ptr_ops> any_all_nets;
pool<Verific::Net*> any_all_nets;

bool mode_gates, mode_keep, mode_nosva, mode_names, mode_verific;
bool mode_autocover, mode_fullinit;
@@ -89,7 +89,7 @@ struct VerificImporter
RTLIL::SigSpec operatorInput2(Verific::Instance *inst);
RTLIL::SigSpec operatorInport(Verific::Instance *inst, const char *portname);
RTLIL::SigSpec operatorInportCase(Verific::Instance *inst, const char *portname);
RTLIL::SigSpec operatorOutput(Verific::Instance *inst, const pool<Verific::Net*, hash_ptr_ops> *any_all_nets = nullptr);
RTLIL::SigSpec operatorOutput(Verific::Instance *inst, const pool<Verific::Net*> *any_all_nets = nullptr);

bool import_netlist_instance_gates(Verific::Instance *inst, RTLIL::IdString inst_name);
bool import_netlist_instance_cells(Verific::Instance *inst, RTLIL::IdString inst_name);
2 changes: 1 addition & 1 deletion frontends/verific/verificsva.cc
@@ -1051,7 +1051,7 @@ struct VerificSvaImporter
msg.c_str(), inst->View()->Owner()->Name(), inst->Name()), inst->Linefile());
}

dict<Net*, bool, hash_ptr_ops> check_expression_cache;
dict<Net*, bool> check_expression_cache;

bool check_expression(Net *net, bool raise_error = false)
{