diff mbox series

PR d/89177 - Fix unaligned access in std.digest.murmurhash

Message ID 20190224153015.17733-1-johannespfau@gmail.com
State New
Headers show
Series PR d/89177 - Fix unaligned access in std.digest.murmurhash | expand

Commit Message

Johannes Pfau Feb. 24, 2019, 3:30 p.m. UTC
Backport latest murmurhash version from upstream (2.084.1). Ran gdc testsuite
on X86_64 linux and got feedback on the bugzilla this really fixes the issue.

libphobos/ChangeLog:

2019-02-24  Johannes Pfau  <johannespfau@gmail.com>

	* src/std/digest/murmurhash.d: PR d/89177: Backport from upstream.
	Fixes unaligned data access (PR d/89177).

---
 libphobos/src/std/digest/murmurhash.d | 122 +++++++++++++++++++++-----
 1 file changed, 101 insertions(+), 21 deletions(-)

Comments

Iain Buclaw March 1, 2019, 9:43 a.m. UTC | #1
On Sun, 24 Feb 2019 at 16:30, Johannes Pfau <johannespfau@gmail.com> wrote:
>
> Backport latest murmurhash version from upstream (2.084.1). Ran gdc testsuite
> on X86_64 linux and got feedback on the bugzilla this really fixes the issue.
>

Raise a pull request with upstream (dmd-cxx is the branch), then this
is OK to commit.
Johannes Pfau March 2, 2019, 7:20 p.m. UTC | #2
Am 01.03.19 um 10:43 schrieb Iain Buclaw:
> On Sun, 24 Feb 2019 at 16:30, Johannes Pfau <johannespfau@gmail.com> wrote:
>> Backport latest murmurhash version from upstream (2.084.1). Ran gdc testsuite
>> on X86_64 linux and got feedback on the bugzilla this really fixes the issue.
>>
> Raise a pull request with upstream (dmd-cxx is the branch), then this
> is OK to commit.
>
PR at https://github.com/dlang/phobos/pull/6886 , committed as r269343.
diff mbox series

Patch

diff --git a/libphobos/src/std/digest/murmurhash.d b/libphobos/src/std/digest/murmurhash.d
index 74efed56898..b5b5bc74f98 100644
--- a/libphobos/src/std/digest/murmurhash.d
+++ b/libphobos/src/std/digest/murmurhash.d
@@ -9,7 +9,7 @@  The older MurmurHash 1 and 2 are currently not supported.
 
 MurmurHash3 comes in three flavors, listed in increasing order of throughput:
 $(UL
-$(LI $(D MurmurHash3!32) produces a 32-bit value and is optimized for 32-bit architectures)
+$(LI `MurmurHash3!32` produces a 32-bit value and is optimized for 32-bit architectures)
 $(LI $(D MurmurHash3!(128, 32)) produces a 128-bit value and is optimized for 32-bit architectures)
 $(LI $(D MurmurHash3!(128, 64)) produces a 128-bit value and is optimized for 64-bit architectures)
 )
@@ -26,7 +26,7 @@  This module conforms to the APIs defined in $(MREF std, digest).
 
 This module publicly imports $(MREF std, digest) and can be used as a stand-alone module.
 
-Source: $(PHOBOSSRC std/digest/_murmurhash.d)
+Source: $(PHOBOSSRC std/digest/murmurhash.d)
 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 Authors: Guillaume Chatelet
 References: $(LINK2 https://github.com/aappleby/smhasher, Reference implementation)
@@ -38,6 +38,11 @@  $(BR) $(LINK2 https://en.wikipedia.org/wiki/MurmurHash, Wikipedia)
  */
 module std.digest.murmurhash;
 
+version (X86)
+    version = HaveUnalignedLoads;
+else version (X86_64)
+    version = HaveUnalignedLoads;
+
 ///
 @safe unittest
 {
@@ -500,28 +505,75 @@  struct MurmurHash3(uint size /* 32 or 128 */ , uint opt = size_t.sizeof == 8 ? 6
         // Buffer should never be full while entering this function.
         assert(bufferSize < Element.sizeof);
 
-        // Check if we have some leftover data in the buffer. Then fill the first block buffer.
+        // Check if the incoming data doesn't fill up a whole block buffer.
         if (bufferSize + data.length < Element.sizeof)
         {
             buffer.data[bufferSize .. bufferSize + data.length] = data[];
             bufferSize += data.length;
             return;
         }
-        const bufferLeeway = Element.sizeof - bufferSize;
-        assert(bufferLeeway <= Element.sizeof);
-        buffer.data[bufferSize .. $] = data[0 .. bufferLeeway];
-        putElement(buffer.block);
-        data = data[bufferLeeway .. $];
+
+        // Check if there's some leftover data in the first block buffer, and
+        // fill the remaining space first.
+        if (bufferSize != 0)
+        {
+            const bufferLeeway = Element.sizeof - bufferSize;
+            buffer.data[bufferSize .. $] = data[0 .. bufferLeeway];
+            putElement(buffer.block);
+            element_count += Element.sizeof;
+            data = data[bufferLeeway .. $];
+        }
 
         // Do main work: process chunks of `Element.sizeof` bytes.
         const numElements = data.length / Element.sizeof;
         const remainderStart = numElements * Element.sizeof;
-        foreach (ref const Element block; cast(const(Element[]))(data[0 .. remainderStart]))
+        version (HaveUnalignedLoads)
         {
-            putElement(block);
+            foreach (ref const Element block; cast(const(Element[])) data[0 .. remainderStart])
+            {
+                putElement(block);
+            }
         }
-        // +1 for bufferLeeway Element.
-        element_count += (numElements + 1) * Element.sizeof;
+        else
+        {
+            void processChunks(T)() @trusted
+            {
+                alias TChunk = T[Element.sizeof / T.sizeof];
+                foreach (ref const chunk; cast(const(TChunk[])) data[0 .. remainderStart])
+                {
+                    static if (T.alignof >= Element.alignof)
+                    {
+                        putElement(*cast(const(Element)*) chunk.ptr);
+                    }
+                    else
+                    {
+                        Element[1] alignedCopy = void;
+                        (cast(T[]) alignedCopy)[] = chunk[];
+                        putElement(alignedCopy[0]);
+                    }
+                }
+            }
+
+            const startAddress = cast(size_t) data.ptr;
+            static if (size >= 64)
+            {
+                if ((startAddress & 7) == 0)
+                {
+                    processChunks!ulong();
+                    goto L_end;
+                }
+            }
+            static assert(size >= 32);
+            if ((startAddress & 3) == 0)
+                processChunks!uint();
+            else if ((startAddress & 1) == 0)
+                processChunks!ushort();
+            else
+                processChunks!ubyte();
+
+L_end:
+        }
+        element_count += numElements * Element.sizeof;
         data = data[remainderStart .. $];
 
         // Now add remaining data to buffer.
@@ -532,8 +584,8 @@  struct MurmurHash3(uint size /* 32 or 128 */ , uint opt = size_t.sizeof == 8 ? 6
 
     /++
     Finalizes the computation of the hash and returns the computed value.
-    Note that $(D finish) can be called only once and that no subsequent calls
-    to $(D put) is allowed.
+    Note that `finish` can be called only once and that no subsequent calls
+    to `put` is allowed.
     +/
     ubyte[Element.sizeof] finish() pure nothrow
     {
@@ -558,7 +610,7 @@  struct MurmurHash3(uint size /* 32 or 128 */ , uint opt = size_t.sizeof == 8 ? 6
         static assert(isUnsigned!T);
         debug assert(y >= 0 && y <= (T.sizeof * 8));
     }
-    body
+    do
     {
         return ((x << y) | (x >> ((T.sizeof * 8) - y)));
     }
@@ -606,10 +658,35 @@  struct MurmurHash3(uint size /* 32 or 128 */ , uint opt = size_t.sizeof == 8 ? 6
     }
 }
 
-version (unittest)
+
+/// The convenient digest template allows for quick hashing of any data.
+@safe unittest
 {
-    import std.string : representation;
+    ubyte[4] hashed = digest!(MurmurHash3!32)([1, 2, 3, 4]);
+    assert(hashed == [0, 173, 69, 68]);
+}
 
+/**
+One can also hash ubyte data piecewise by instanciating a hasher and call
+the 'put' method.
+*/
+@safe unittest
+{
+    const(ubyte)[] data1 = [1, 2, 3];
+    const(ubyte)[] data2 = [4, 5, 6, 7];
+    // The incoming data will be buffered and hashed element by element.
+    MurmurHash3!32 hasher;
+    hasher.put(data1);
+    hasher.put(data2);
+    // The call to 'finish' ensures:
+    // - the remaining bits are processed
+    // - the hash gets finalized
+    auto hashed = hasher.finish();
+    assert(hashed == [181, 151, 88, 252]);
+}
+
+version (unittest)
+{
     private auto hash(H, Element = H.Element)(string data)
     {
         H hasher;
@@ -743,10 +820,13 @@  version (unittest)
     // Pushing unaligned data and making sure the result is still coherent.
     void testUnalignedHash(H)()
     {
-        immutable ubyte[1025] data = 0xAC;
-        immutable alignedHash = digest!H(data[0 .. $ - 1]); // 0 .. 1023
-        immutable unalignedHash = digest!H(data[1 .. $]); // 1 .. 1024
-        assert(alignedHash == unalignedHash);
+        immutable ubyte[1028] data = 0xAC;
+        immutable alignedHash = digest!H(data[0 .. 1024]);
+        foreach (i; 1 .. 5)
+        {
+            immutable unalignedHash = digest!H(data[i .. 1024 + i]);
+            assert(alignedHash == unalignedHash);
+        }
     }
 
     testUnalignedHash!(MurmurHash3!32)();