// Benchmark input: runs of ASCII (Latin letters, ';' and ',') interleaved with
// runs of Arabic letters, so its UTF-8 form mixes 1-byte and 2-byte sequences.
var arabic_str = "kksldfnjeoliwkfmirewogbregiojrmfikrefnهنصتةبهنمخصةبهخصثةبثصهنخقلةثقملىثقمةلكمسقنلمنكقةلمنيبةىلميبنكلكمطسيوبصثقنلحخثقتلثقنلوسيكملنسيكملklrmglkedrmg;ler,g;lkerdmglkermg;ler,g;lerkmglkerglk;ermg;lermg;hjfbwefseoifnuiwefصثعىبصثبىهصثىةبخهمصقاىلثخقتلةحخثقلةثقلةثقلةثقلةثخقلةحخثقلةخقثلةثحخقلةثينقىلنمقثيىلنمقىلمنيثقسىلقىلمقىلنمكقىلمنقىلمنيثقىلمنقىلمنقثىيةلمنخىثيقمنلةيمنلتىنثتقيىلخهمثقيةلخمثقيىلثقفيىلنثقيةلمنخىفقثيلىىىتنعصثىبمخنثةصبحةثصقخمبىثقخمنهلىةثحكقةلكحثقةلمنثقىلمنثقىلكمثةقلكمثقةلمنىثقلمنىثقلمنىثقلنمكثىقلنمكثقىلمنثيقىلمطنثقىلمنطثقىةسلبثمكسقيلةثطقكةلونطحكصثقنلخحثقةلحكخمقفةاقفهىاقفاىقفةلحخثقهخابهقعثالخهثقتلخهثقالخهاثقهعلخباثقحهخبةصحخثجؤوخحجصسثؤوحخصثرةىخهمثقيلىرخهمثقلىخهثقمللاخهثمقلتحخثقتلبحخصثتبحخصثستىبخهمثقىلخهمنثقىلحخثقىلحثقىلخهمنثقصىلثقةلحخثقىلمخهنثيقىلمصثويبجحصضثنيحخصثتبهختثصقخهلبةثقهنخلمةثقيرنمهىيؤنتءراىهخثقةرثقمرةثقيمخهنلىخهثقنلىخهمثقهتل"
// English sample text: ASCII apart from the « » quotation marks (U+00AB / U+00BB,
// two bytes each in UTF-8). None of the calls visible in this section use it.
var english_str = "UTF-8 encodes each of the 1,112,064 valid code points in the Unicode code space (1,114,112 code points minus 2,048 surrogate code points) using one to four 8-bit bytes (a group of 8 bits is known as an «octet» in the Unicode Standard). Code points wi"
/**
 * Computes the UTF-8 byte length of a string from its percent-encoded form.
 *
 * `encodeURIComponent` emits unreserved ASCII characters verbatim (one
 * character per byte) and every other UTF-8 byte as a three-character "%XX"
 * escape. A literal "%" is itself escaped as "%25", so every "%" in the output
 * starts an escape. The byte count is therefore the encoded length minus two
 * surplus characters per escape.
 *
 * Deriving the count from the encoded output alone, rather than from
 * `str.length` plus continuation bytes, keeps characters outside the Basic
 * Multilingual Plane correct: they are two UTF-16 code units but four UTF-8
 * bytes, which a `str.length`-based formula over-counts as five.
 *
 * @param {string} str - String to measure.
 * @returns {number} Number of bytes in the UTF-8 encoding of `str`.
 * @throws {URIError} If `str` contains a lone (unpaired) surrogate, because
 *   `encodeURIComponent` rejects such input.
 */
function lengthInUtf8Bytes(str) {
  const encoded = encodeURIComponent(str);
  // encodeURIComponent always writes escapes with upper-case hex digits.
  const escapes = encoded.match(/%[0-9A-F]{2}/g);
  const escapeCount = escapes ? escapes.length : 0;
  return encoded.length - 2 * escapeCount;
}
/**
 * Computes the UTF-8 byte length of a string by walking its UTF-16 code units,
 * without building an encoded copy.
 *
 * Byte cost per code point:
 *   U+0000..U+007F   -> 1 byte
 *   U+0080..U+07FF   -> 2 bytes
 *   U+0800..U+FFFF   -> 3 bytes
 *   U+10000 and up   -> 4 bytes (stored as a surrogate pair in the string)
 *
 * A valid high+low surrogate pair is counted once as 4 bytes; counting each
 * half as a 3-byte unit would report 6. A lone surrogate is counted as 3
 * bytes, the size of the U+FFFD replacement character that TextEncoder
 * substitutes for it.
 *
 * @param {string} str - String to measure.
 * @returns {number} Number of bytes in the UTF-8 encoding of `str`.
 */
function byteLength(str) {
  let bytes = 0;
  for (let i = 0; i < str.length; i++) {
    const code = str.charCodeAt(i);
    if (code <= 0x7f) {
      bytes += 1;
    } else if (code <= 0x7ff) {
      bytes += 2;
    } else if (code >= 0xd800 && code <= 0xdbff) {
      // High surrogate: look ahead for its low half. Past the end of the
      // string charCodeAt returns NaN, which fails the range test below.
      const next = str.charCodeAt(i + 1);
      if (next >= 0xdc00 && next <= 0xdfff) {
        bytes += 4;
        i++; // the low surrogate has been consumed as part of the pair
      } else {
        bytes += 3;
      }
    } else {
      bytes += 3;
    }
  }
  return bytes;
}
/**
 * Measures a string's UTF-8 size by encoding it with a fresh TextEncoder and
 * reading the length of the resulting byte array.
 *
 * @param {string} str - String to measure.
 * @returns {number} Length of the encoded byte array.
 */
function byteLengthTextEncoder(str) {
  const encoder = new TextEncoder();
  const utf8Bytes = encoder.encode(str);
  return utf8Bytes.length;
}
/**
 * Measures a string's size by wrapping it in a single-part Blob and reading
 * the Blob's reported size.
 *
 * @param {string} str - String to measure.
 * @returns {number} The `size` of a Blob whose only part is `str`.
 */
function byteLengthBlob(str) {
  const parts = [str];
  const blob = new Blob(parts);
  return blob.size;
}
// Benchmark cases: the same input measured by each strategy. Return values are
// discarded; only the execution time of each call is of interest.
byteLength(arabic_str); // loop over UTF-16 code units
lengthInUtf8Bytes(arabic_str); // regex over encodeURIComponent output
byteLengthTextEncoder(arabic_str); // TextEncoder
byteLengthBlob(arabic_str); // Blob size
Note: precise memory measurements require launching Chrome with the `--enable-precise-memory-info` flag.
Test case name | Result |
---|---|
with a loop Arabic | |
with a regex Arabic | |
with a TextEncoder Arabic | |
with a Blob Arabic | |
Test name | Executions per second |
---|---|
with a loop Arabic | 1397361.6 Ops/sec |
with a regex Arabic | 91756.8 Ops/sec |
with a TextEncoder Arabic | 567086.8 Ops/sec |
with a Blob Arabic | 19844.5 Ops/sec |
Let's break down the provided benchmark definition and explain what is being tested, the options compared, the pros and cons of each approach, library usage, special JS features or syntax (if any), and alternatives.
Benchmark Definition
The benchmark measures four different ways of computing the UTF-8 byte length of the string `arabic_str` (mixed Latin and Arabic text):
- `byteLength(arabic_str);`
- `lengthInUtf8Bytes(arabic_str);`
- `byteLengthTextEncoder(arabic_str);`
- `byteLengthBlob(arabic_str);`
Options Compared
The four options being compared are:
- `byteLength` function: This method uses a simple loop to iterate through the string's UTF-16 code units and calculate the byte length.
- `lengthInUtf8Bytes` function: This method percent-encodes the string with `encodeURIComponent` and applies a regex to the result to account for the extra bytes of multi-byte UTF-8 sequences.
- `byteLengthTextEncoder` function: This method uses the `TextEncoder` API to encode the string into an array of bytes and then returns the length of that array.
- `byteLengthBlob` function: This method creates a new `Blob` object from the string and returns the size of that blob.

Pros and Cons
- `lengthInUtf8Bytes`: Works from the actual UTF-8 encoding of the string, but is slower due to the percent-encoding and regex overhead (roughly 15× slower than the loop in the results above).

Library Usage
No third-party libraries are used in this benchmark. It relies only on built-in platform APIs (`TextEncoder`, `Blob` and `encodeURIComponent`), all of which are supported by modern browsers.
Special JS Features or Syntax (None)
No special JavaScript features or syntax are being used in this benchmark.
Alternatives
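If you wanted to add more alternatives to the benchmark, some options could be:
- `Buffer.byteLength(str, 'utf8')` (Node.js only).
- `unescape(encodeURIComponent(str)).length` (relies on the deprecated `unescape` function).

However, these alternatives might not be as efficient or accurate as the options already being compared in the benchmark.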