<script src="https://cdn.jsdelivr.net/npm/object-hash@2.0.3/dist/object_hash.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/lodash@4.17.4/lodash.min.js"></script>
// SHA-256 hex digest of a string using the Web Crypto API (crypto.subtle)
async function H(m) {
const msgUint8 = new TextEncoder().encode(m);
const hashBuffer = await crypto.subtle.digest("SHA-256", msgUint8);
const hashArray = Array.from(new Uint8Array(hashBuffer));
return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("");
}
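// Example (standard SHA-256 test vector): await H("abc") resolves to
// "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"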
// JSON.stringify with all keys (at every nesting depth) sorted, so that
// logically equal objects always serialize to the same string
function JSONStringifyOrdered(obj) {
const allKeys = new Set();
JSON.stringify(obj, (key, value) => (allKeys.add(key), value));
return JSON.stringify(obj, Array.from(allKeys).sort());
}
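// Key order no longer matters, e.g. (illustrative values):
// JSONStringifyOrdered({ b: 1, a: { d: 2, c: 3 } }) === JSONStringifyOrdered({ a: { c: 3, d: 2 }, b: 1 }) // true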
async function prepare() {
window.NUM_ITEMS = 1000;
window.NUM_TAKE = 20;
window.input = [];
window.data = [];
window.indices = [...Array(window.NUM_ITEMS).keys()];
// Populate data: entries with a nested array of NUM_ITEMS random ids
for (const i of window.indices) {
window.data[i] = {
ids: window.indices
.slice(0, 1000)
.map((x) => Math.floor(Math.random() * window.NUM_ITEMS * 10)),
counter: i,
counter2: Math.floor(Math.random() * i),
};
}
window.check = window.data.map((obj) => false);
window.stringified = window.data.map(JSONStringifyOrdered);
window.strings = new Set(window.stringified);
// calculate hashes for each data object
window.hashes = new Set(window.data.map(objectHash));
window.stringsHashes = new Set(window.stringified.map(objectHash));
window.shas = new Set(await Promise.all(
window.stringified.map(async (item) => await H(item))
));
window.TAKE_IDX = window.indices.reverse().slice(0, window.NUM_TAKE);
// Uncomment to choose random indices
//window.TAKE_IDX = window.TAKE_IDX.map((x) => Math.floor(Math.random() * window.NUM_ITEMS));
// Populate input: copies of the data
for (const i of window.TAKE_IDX) {
window.input[i] = {
ids: [...window.data[i].ids], // shallow copy of the nested array
counter: window.data[i].counter,
counter2: window.data[i].counter2,
};
}
console.log(window.TAKE_IDX);
console.log("END:prepare");
}
console.log("START:prepare");
prepare();
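// The test cases below assume prepare() has finished populating window.data,
// window.input, window.check and window.TAKE_IDX.
// Test case: Iteration + _.isEqual (deep-compare the input against every data object)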
for (const i of window.TAKE_IDX) {
window.check[i] = window.data.some((item) => _.isEqual(item, window.input[i]));
}
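// Test case: Iteration + JSON.stringify (re-serialize the input and compare against every precomputed string)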
for (const i of window.TAKE_IDX) {
window.check[i] = window.stringified.some((item) => item === JSONStringifyOrdered(window.input[i]));
}
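// Test case: Set + objectHash (hash the input, single Set lookup)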
for (const i of window.TAKE_IDX) {
window.check[i] = window.hashes.has(objectHash(window.input[i]));
}
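// Test case: Set + JSON.stringify (ordered stringify of the input, single Set lookup)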
for (const i of window.TAKE_IDX) {
window.check[i] = window.strings.has(JSONStringifyOrdered(window.input[i]));
}
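// Test case: Set + JSON.stringify + native SHA (SHA-256 of the ordered string, single Set lookup)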
async function testNativeHash() {
for (const i of window.TAKE_IDX) {
window.check[i] = window.shas.has(await H(JSONStringifyOrdered(window.input[i])));
}
}
testNativeHash();
| Test name | Executions per second |
|---|---|
| Iteration + _.isEqual | 53.6 Ops/sec |
| Iteration + JSON.stringify | 0.2 Ops/sec |
| Set + objectHash | 39.7 Ops/sec |
| Set + JSON.stringify | 144.0 Ops/sec |
| Set + JSON.stringify + native SHA | 2751.0 Ops/sec |
Let's break down the provided benchmark and explain what is being tested, what is being compared, and the trade-offs of each approach.
Benchmark Overview
The benchmark measures the performance of different approaches for detecting whether a given object already exists in a large array of objects with nested properties. The preparation code builds 1,000 data objects (each with an ids array of random numbers plus two counters), copies 20 of them into window.input, and precomputes key-sorted JSON strings, objectHash values, and SHA-256 digests for the whole dataset. Each test case then checks whether the 20 input objects can be found in the data.
Options Compared
The benchmark consists of five individual test cases:
- Iteration + _.isEqual: iterates over the whole dataset and uses Lodash's isEqual to deep-compare the input object against every data object.
- Iteration + JSON.stringify: serializes the input object with JSONStringifyOrdered (JSON.stringify with sorted keys) and compares the resulting string against every precomputed string in the dataset.
- Set + objectHash: hashes the input object with the objectHash library and checks whether that hash exists in a precomputed Set of hashes.
- Set + JSON.stringify: checks whether the input object's key-sorted JSON string exists in a precomputed Set of strings.
- Set + JSON.stringify + native SHA: hashes the key-sorted JSON string with the browser's native SHA-256 (crypto.subtle.digest) and checks whether the digest exists in a precomputed Set of digests.
Pros and Cons
Here are the main trade-offs of each option:
- Iteration + _.isEqual: needs no preprocessing and handles nested structures correctly, but every lookup is a full O(n) scan with an expensive deep comparison.
- Iteration + JSON.stringify: also O(n) per lookup, and in this implementation the input is re-serialized inside the comparison callback, so the costly stringify runs once per data object rather than once per input.
- Set + objectHash: turns each lookup into an O(1) Set membership test, but objectHash is a pure-JavaScript hash and is relatively heavy per object.
- Set + JSON.stringify: O(1) lookups with only one serialization of the input per check.
- Set + JSON.stringify + native SHA: the same O(1) lookup, with hashing delegated to the browser's native crypto implementation and only short fixed-length digests stored in the Set.
Benchmark Results
The benchmark results show that:
- Set + JSON.stringify + native SHA is reported as the fastest by far, at about 2751 Ops/sec (note that testNativeHash() is asynchronous and is started but not awaited by the test case, so part of its work may fall outside the measured window).
- Set + JSON.stringify follows at about 144 Ops/sec.
- Iteration + _.isEqual (about 53.6 Ops/sec) and Set + objectHash (about 39.7 Ops/sec) are roughly an order of magnitude slower.
- Iteration + JSON.stringify is slowest by a wide margin (about 0.2 Ops/sec), largely because the input is re-serialized for every comparison.
The fastest option combines a deterministic JSON.stringify with fast native hashing. In summary, this benchmark compares different approaches for detecting duplicate data in a large dataset, and the results show that precomputing a Set keyed by a stable serialization or hash (for example, a native SHA-256 digest of the key-sorted JSON) is far more efficient than scanning the array with deep comparisons, especially as the dataset grows.
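For reference, here is a minimal, self-contained sketch of that winning approach in browser JavaScript. The helper names (stableStringify, sha256Hex, buildDigestSet, isKnown) are illustrative and not part of the benchmark code:
// Deterministic stringify: collect every key, sort them, and use the sorted
// list as the replacer so key order never affects the output.
function stableStringify(obj) {
  const keys = new Set();
  JSON.stringify(obj, (key, value) => (keys.add(key), value));
  return JSON.stringify(obj, Array.from(keys).sort());
}
// SHA-256 hex digest of a string using the Web Crypto API.
async function sha256Hex(text) {
  const bytes = new TextEncoder().encode(text);
  const digest = await crypto.subtle.digest("SHA-256", bytes);
  return Array.from(new Uint8Array(digest))
    .map((b) => b.toString(16).padStart(2, "0"))
    .join("");
}
// Precompute one digest per known object (done once, outside the hot path).
async function buildDigestSet(objects) {
  const digests = await Promise.all(
    objects.map((obj) => sha256Hex(stableStringify(obj)))
  );
  return new Set(digests);
}
// Each duplicate check is then one serialization, one native hash,
// and one O(1) Set lookup.
async function isKnown(digestSet, candidate) {
  return digestSet.has(await sha256Hex(stableStringify(candidate)));
}
// Usage (illustrative):
// const known = await buildDigestSet(window.data);
// const duplicate = await isKnown(known, window.input[someIndex]);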