
final 2.53.19b1 patches (Hopefully)

Frank-Rainer Grahl 4 months ago
commit 9a5d2f4435

+ 82 - 0
mozilla-release/patches/1193389-125a1.patch

@@ -0,0 +1,82 @@
+# HG changeset patch
+# User Kershaw Chang <kershaw@mozilla.com>
+# Date 1708592526 0
+# Node ID 28503602f196ccd436dd8d85a66e42bcebff78c9
+# Parent  13f36fdafed0e954a0ad013eb3eda913d3da185c
+Bug 1193389 - nsTransportEventSinkProxy clean up, r=necko-reviewers,valentin
+
+This patch might not fix this issue, but we should try to avoid accessing raw pointer.
+
+Differential Revision: https://phabricator.services.mozilla.com/D199976
+
+diff --git a/netwerk/base/nsTransportUtils.cpp b/netwerk/base/nsTransportUtils.cpp
+--- a/netwerk/base/nsTransportUtils.cpp
++++ b/netwerk/base/nsTransportUtils.cpp
+@@ -21,36 +21,32 @@ class nsTransportEventSinkProxy : public
+ public:
+     NS_DECL_THREADSAFE_ISUPPORTS
+     NS_DECL_NSITRANSPORTEVENTSINK
+ 
+     nsTransportEventSinkProxy(nsITransportEventSink *sink,
+                               nsIEventTarget *target)
+         : mSink(sink)
+         , mTarget(target)
+-        , mLock("nsTransportEventSinkProxy.mLock")
+-        , mLastEvent(nullptr)
+-    {
+-        NS_ADDREF(mSink);
+-    }
++        , mLock("nsTransportEventSinkProxy.mLock") {}
+ 
+ private:
+     virtual ~nsTransportEventSinkProxy()
+     {
+         // our reference to mSink could be the last, so be sure to release
+         // it on the target thread.  otherwise, we could get into trouble.
+         NS_ProxyRelease(
+-          "nsTransportEventSinkProxy::mSink", mTarget, dont_AddRef(mSink));
++          "nsTransportEventSinkProxy::mSink", mTarget, mSink.forget());
+     }
+ 
+ public:
+-    nsITransportEventSink           *mSink;
++    nsCOMPtr<nsITransportEventSink> mSink;
+     nsCOMPtr<nsIEventTarget>         mTarget;
+     Mutex                            mLock;
+-    nsTransportStatusEvent          *mLastEvent;
++    RefPtr<nsTransportStatusEvent> mLastEvent;
+ };
+ 
+ class nsTransportStatusEvent : public Runnable
+ {
+ public:
+     nsTransportStatusEvent(nsTransportEventSinkProxy *proxy,
+                            nsITransport *transport,
+                            nsresult status,
+@@ -67,22 +63,24 @@ public:
+     ~nsTransportStatusEvent() {}
+ 
+     NS_IMETHOD Run() override
+     {
+         // since this event is being handled, we need to clear the proxy's ref.
+         // if not coalescing all, then last event may not equal self!
+         {
+             MutexAutoLock lock(mProxy->mLock);
+-            if (mProxy->mLastEvent == this)
+-                mProxy->mLastEvent = nullptr;
++            if (mProxy->mLastEvent == this) {
++              mProxy->mLastEvent = nullptr;
++            }
+         }
+ 
+         mProxy->mSink->OnTransportStatus(mTransport, mStatus, mProgress,
+                                          mProgressMax);
++        mProxy = nullptr;
+         return NS_OK;
+     }
+ 
+     RefPtr<nsTransportEventSinkProxy> mProxy;
+ 
+     // parameters to OnTransportStatus
+     nsCOMPtr<nsITransport> mTransport;
+     nsresult               mStatus;

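The 1193389 patch above swaps nsTransportEventSinkProxy's manually ref-counted raw pointers (NS_ADDREF in the constructor, dont_AddRef in the destructor) for nsCOMPtr/RefPtr members, so the destructor simply hands its strong reference to the target thread via mSink.forget(). As a rough, self-contained sketch of the same ownership idea outside XPCOM, using std::shared_ptr and std::thread as stand-ins for Mozilla's smart pointers and event targets (Sink, Proxy and Notify are hypothetical names, not anything from the tree):

    #include <cstdio>
    #include <memory>
    #include <thread>
    #include <utility>

    // Hypothetical stand-ins for nsITransportEventSink and the event target.
    struct Sink {
        ~Sink() { std::puts("Sink released on the target thread"); }
        void OnStatus(int status) { std::printf("status=%d\n", status); }
    };

    class Proxy {
    public:
        explicit Proxy(std::shared_ptr<Sink> sink) : mSink(std::move(sink)) {}

        ~Proxy() {
            // Analogous to NS_ProxyRelease(..., mSink.forget()): transfer the
            // last strong reference to another thread so the sink is
            // destroyed there rather than on whatever thread deletes Proxy.
            std::thread([sink = std::move(mSink)]() mutable { sink.reset(); }).join();
        }

        void Notify(int status) { mSink->OnStatus(status); }

    private:
        std::shared_ptr<Sink> mSink;  // strong reference, not a raw pointer
    };

    int main() {
        Proxy proxy(std::make_shared<Sink>());
        proxy.Notify(42);
        return 0;
    }  // ~Proxy() moves the reference off this thread before it goes away.

Holding the member as a strong reference means there is no window in which the proxy points at a sink it does not own, which is the point of the clean-up.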
File diff suppressed because it is too large
+ 215 - 477
mozilla-release/patches/1646299-1-79a1.patch


+ 4582 - 0
mozilla-release/patches/1743896-96a1.patch

@@ -0,0 +1,4582 @@
+# HG changeset patch
+# User Ryan VanderMeulen <ryanvm@gmail.com>
+# Date 1638410036 0
+#      Thu Dec 02 01:53:56 2021 +0000
+# Node ID 231183cd387e10347cc19d5b3e980c158ed3f23d
+# Parent  6c9b3bb2cebb7d45db150dc2dad811773bf980a5
+Bug 1743896 - Update xxHash to version 0.8.1. r=dthayer
+
+Differential Revision: https://phabricator.services.mozilla.com/D132631
+
+diff --git a/mfbt/lz4/xxhash.h b/mfbt/lz4/xxhash.h
+--- a/mfbt/lz4/xxhash.h
++++ b/mfbt/lz4/xxhash.h
+@@ -27,29 +27,34 @@
+  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *
+  * You can contact the author at:
+  *   - xxHash homepage: https://www.xxhash.com
+  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+  */
+-
++/*!
++ * @mainpage xxHash
++ *
++ * @file xxhash.h
++ * xxHash prototypes and implementation
++ */
+ /* TODO: update */
+ /* Notice extracted from xxHash homepage:
+ 
+ xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+ It also successfully passes all tests from the SMHasher suite.
+ 
+ Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+ 
+ Name            Speed       Q.Score   Author
+ xxHash          5.4 GB/s     10
+ CrapWow         3.2 GB/s      2       Andrew
+-MumurHash 3a    2.7 GB/s     10       Austin Appleby
++MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+ SpookyHash      2.0 GB/s     10       Bob Jenkins
+ SBox            1.4 GB/s      9       Bret Mulvey
+ Lookup3         1.2 GB/s      9       Bob Jenkins
+ SuperFastHash   1.2 GB/s      1       Paul Hsieh
+ CityHash64      1.05 GB/s    10       Pike & Alakuijala
+ FNV             0.55 GB/s     5       Fowler, Noll, Vo
+ CRC32           0.43 GB/s     9
+ MD5-32          0.33 GB/s    10       Ronald L. Rivest
+@@ -111,39 +116,90 @@ extern "C" {
+ #    define XXH_PUBLIC_API static __inline
+ #  else
+      /* note: this version may generate warnings for unused static functions */
+ #    define XXH_PUBLIC_API static
+ #  endif
+ 
+    /*
+     * This part deals with the special case where a unit wants to inline xxHash,
+-    * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
+-    * as part of some previously included *.h header file.
++    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
++    * such as part of some previously included *.h header file.
+     * Without further action, the new include would just be ignored,
+     * and functions would effectively _not_ be inlined (silent failure).
+     * The following macros solve this situation by prefixing all inlined names,
+     * avoiding naming collision with previous inclusions.
+     */
+-#  ifdef XXH_NAMESPACE
+-#    error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
+-     /*
+-      * Note: Alternative: #undef all symbols (it's a pretty large list).
+-      * Without #error: it compiles, but functions are actually not inlined.
+-      */
+-#  endif
++   /* Before that, we unconditionally #undef all symbols,
++    * in case they were already defined with XXH_NAMESPACE.
++    * They will then be redefined for XXH_INLINE_ALL
++    */
++#  undef XXH_versionNumber
++    /* XXH32 */
++#  undef XXH32
++#  undef XXH32_createState
++#  undef XXH32_freeState
++#  undef XXH32_reset
++#  undef XXH32_update
++#  undef XXH32_digest
++#  undef XXH32_copyState
++#  undef XXH32_canonicalFromHash
++#  undef XXH32_hashFromCanonical
++    /* XXH64 */
++#  undef XXH64
++#  undef XXH64_createState
++#  undef XXH64_freeState
++#  undef XXH64_reset
++#  undef XXH64_update
++#  undef XXH64_digest
++#  undef XXH64_copyState
++#  undef XXH64_canonicalFromHash
++#  undef XXH64_hashFromCanonical
++    /* XXH3_64bits */
++#  undef XXH3_64bits
++#  undef XXH3_64bits_withSecret
++#  undef XXH3_64bits_withSeed
++#  undef XXH3_64bits_withSecretandSeed
++#  undef XXH3_createState
++#  undef XXH3_freeState
++#  undef XXH3_copyState
++#  undef XXH3_64bits_reset
++#  undef XXH3_64bits_reset_withSeed
++#  undef XXH3_64bits_reset_withSecret
++#  undef XXH3_64bits_update
++#  undef XXH3_64bits_digest
++#  undef XXH3_generateSecret
++    /* XXH3_128bits */
++#  undef XXH128
++#  undef XXH3_128bits
++#  undef XXH3_128bits_withSeed
++#  undef XXH3_128bits_withSecret
++#  undef XXH3_128bits_reset
++#  undef XXH3_128bits_reset_withSeed
++#  undef XXH3_128bits_reset_withSecret
++#  undef XXH3_128bits_reset_withSecretandSeed
++#  undef XXH3_128bits_update
++#  undef XXH3_128bits_digest
++#  undef XXH128_isEqual
++#  undef XXH128_cmp
++#  undef XXH128_canonicalFromHash
++#  undef XXH128_hashFromCanonical
++    /* Finally, free the namespace itself */
++#  undef XXH_NAMESPACE
++
++    /* employ the namespace for XXH_INLINE_ALL */
+ #  define XXH_NAMESPACE XXH_INLINE_
+    /*
+-    * Some identifiers (enums, type names) are not symbols, but they must
+-    * still be renamed to avoid redeclaration.
++    * Some identifiers (enums, type names) are not symbols,
++    * but they must nonetheless be renamed to avoid redeclaration.
+     * Alternative solution: do not redeclare them.
+-    * However, this requires some #ifdefs, and is a more dispersed action.
+-    * Meanwhile, renaming can be achieved in a single block
++    * However, this requires some #ifdefs, and has a more dispersed impact.
++    * Meanwhile, renaming can be achieved in a single place.
+     */
+-#  define XXH_IPREF(Id)   XXH_INLINE_ ## Id
++#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
+ #  define XXH_OK XXH_IPREF(XXH_OK)
+ #  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+ #  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+ #  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+ #  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+ #  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+ #  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+ #  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+@@ -160,42 +216,53 @@ extern "C" {
+ 
+ 
+ /* ****************************************************************
+  *  Stable API
+  *****************************************************************/
+ #ifndef XXHASH_H_5627135585666179
+ #define XXHASH_H_5627135585666179 1
+ 
++
++/*!
++ * @defgroup public Public API
++ * Contains details on the public xxHash functions.
++ * @{
++ */
+ /* specific declaration modes for Windows */
+ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+ #  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+ #    ifdef XXH_EXPORT
+ #      define XXH_PUBLIC_API __declspec(dllexport)
+ #    elif XXH_IMPORT
+ #      define XXH_PUBLIC_API __declspec(dllimport)
+ #    endif
+ #  else
+ #    define XXH_PUBLIC_API   /* do nothing */
+ #  endif
+ #endif
+ 
++#ifdef XXH_DOXYGEN
+ /*!
+- * XXH_NAMESPACE, aka Namespace Emulation:
++ * @brief Emulate a namespace by transparently prefixing all symbols.
+  *
+  * If you want to include _and expose_ xxHash functions from within your own
+  * library, but also want to avoid symbol collisions with other libraries which
+  * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
+  * any public symbol from xxhash library with the value of XXH_NAMESPACE
+  * (therefore, avoid empty or numeric values).
+  *
+  * Note that no change is required within the calling program as long as it
+  * includes `xxhash.h`: Regular symbol names will be automatically translated
+  * by this header.
+  */
++#  define XXH_NAMESPACE /* YOUR NAME HERE */
++#  undef XXH_NAMESPACE
++#endif
++
+ #ifdef XXH_NAMESPACE
+ #  define XXH_CAT(A,B) A##B
+ #  define XXH_NAME2(A,B) XXH_CAT(A,B)
+ #  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+ /* XXH32 */
+ #  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+ #  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+ #  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+@@ -214,97 +281,146 @@ extern "C" {
+ #  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+ #  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+ #  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+ #  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+ /* XXH3_64bits */
+ #  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+ #  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+ #  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
++#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+ #  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+ #  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+ #  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+ #  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+ #  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+ #  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
++#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+ #  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+ #  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+ #  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
++#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+ /* XXH3_128bits */
+ #  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+ #  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+ #  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+ #  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
++#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+ #  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+ #  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+ #  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
++#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+ #  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+ #  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+ #  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+ #  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+ #  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+ #  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+ #endif
+ 
+ 
+ /* *************************************
+ *  Version
+ ***************************************/
+ #define XXH_VERSION_MAJOR    0
+ #define XXH_VERSION_MINOR    8
+-#define XXH_VERSION_RELEASE  0
++#define XXH_VERSION_RELEASE  1
+ #define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
++
++/*!
++ * @brief Obtains the xxHash version.
++ *
++ * This is mostly useful when xxHash is compiled as a shared library,
++ * since the returned value comes from the library, as opposed to header file.
++ *
++ * @return `XXH_VERSION_NUMBER` of the invoked library.
++ */
+ XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+ 
+ 
+ /* ****************************
+-*  Definitions
++*  Common basic types
+ ******************************/
+ #include <stddef.h>   /* size_t */
+ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+ 
+ 
+ /*-**********************************************************************
+ *  32-bit hash
+ ************************************************************************/
+-#if !defined (__VMS) \
++#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
++/*!
++ * @brief An unsigned 32-bit integer.
++ *
++ * Not necessarily defined to `uint32_t` but functionally equivalent.
++ */
++typedef uint32_t XXH32_hash_t;
++
++#elif !defined (__VMS) \
+   && (defined (__cplusplus) \
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+ #   include <stdint.h>
+     typedef uint32_t XXH32_hash_t;
++
+ #else
+ #   include <limits.h>
+ #   if UINT_MAX == 0xFFFFFFFFUL
+       typedef unsigned int XXH32_hash_t;
+ #   else
+ #     if ULONG_MAX == 0xFFFFFFFFUL
+         typedef unsigned long XXH32_hash_t;
+ #     else
+ #       error "unsupported platform: need a 32-bit type"
+ #     endif
+ #   endif
+ #endif
+ 
+ /*!
+- * XXH32():
+- *  Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+- *  The memory between input & input+length must be valid (allocated and read-accessible).
+- *  "seed" can be used to alter the result predictably.
+- *  Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+- *
+- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+- * and offers true 64/128 bit hash results. It provides a superior level of
+- * dispersion, and greatly reduces the risks of collisions.
++ * @}
++ *
++ * @defgroup xxh32_family XXH32 family
++ * @ingroup public
++ * Contains functions used in the classic 32-bit xxHash algorithm.
++ *
++ * @note
++ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
++ *   Note that @ref xxh3_family provides competitive speed
++ *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
++ *
++ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
++ * @see @ref xxh32_impl for implementation details
++ * @{
++ */
++
++/*!
++ * @brief Calculates the 32-bit hash of @p input using xxHash32.
++ *
++ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
++ *
++ * @param input The block of data to be hashed, at least @p length bytes in size.
++ * @param length The length of @p input, in bytes.
++ * @param seed The 32-bit seed to alter the hash's output predictably.
++ *
++ * @pre
++ *   The memory between @p input and @p input + @p length must be valid,
++ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
++ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
++ *
++ * @return The calculated 32-bit hash value.
++ *
++ * @see
++ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
++ *    Direct equivalents for the other variants of xxHash.
++ * @see
++ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+  */
+ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+ 
+-/*******   Streaming   *******/
+-
+-/*
+- * Streaming functions generate the xxHash value from an incrememtal input.
++/*!
++ * Streaming functions generate the xxHash value from an incremental input.
+  * This method is slower than single-call functions, due to state management.
+  * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+  *
+  * An XXH state must first be allocated using `XXH*_createState()`.
+  *
+  * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+  *
+  * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+@@ -314,25 +430,127 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const
+  *
+  * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+  * This function returns the nn-bits hash as an int or long long.
+  *
+  * It's still possible to continue inserting input into the hash state after a
+  * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+  *
+  * When done, release the state using `XXH*_freeState()`.
++ *
++ * Example code for incrementally hashing a file:
++ * @code{.c}
++ *    #include <stdio.h>
++ *    #include <xxhash.h>
++ *    #define BUFFER_SIZE 256
++ *
++ *    // Note: XXH64 and XXH3 use the same interface.
++ *    XXH32_hash_t
++ *    hashFile(FILE* stream)
++ *    {
++ *        XXH32_state_t* state;
++ *        unsigned char buf[BUFFER_SIZE];
++ *        size_t amt;
++ *        XXH32_hash_t hash;
++ *
++ *        state = XXH32_createState();       // Create a state
++ *        assert(state != NULL);             // Error check here
++ *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
++ *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
++ *            XXH32_update(state, buf, amt); // Hash the file in chunks
++ *        }
++ *        hash = XXH32_digest(state);        // Finalize the hash
++ *        XXH32_freeState(state);            // Clean up
++ *        return hash;
++ *    }
++ * @endcode
+  */
+ 
+-typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
++/*!
++ * @typedef struct XXH32_state_s XXH32_state_t
++ * @brief The opaque state struct for the XXH32 streaming API.
++ *
++ * @see XXH32_state_s for details.
++ */
++typedef struct XXH32_state_s XXH32_state_t;
++
++/*!
++ * @brief Allocates an @ref XXH32_state_t.
++ *
++ * Must be freed with XXH32_freeState().
++ * @return An allocated XXH32_state_t on success, `NULL` on failure.
++ */
+ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
++/*!
++ * @brief Frees an @ref XXH32_state_t.
++ *
++ * Must be allocated with XXH32_createState().
++ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
++ * @return XXH_OK.
++ */
+ XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
++/*!
++ * @brief Copies one @ref XXH32_state_t to another.
++ *
++ * @param dst_state The state to copy to.
++ * @param src_state The state to copy from.
++ * @pre
++ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
++ */
+ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+ 
++/*!
++ * @brief Resets an @ref XXH32_state_t to begin a new hash.
++ *
++ * This function resets and seeds a state. Call it before @ref XXH32_update().
++ *
++ * @param statePtr The state struct to reset.
++ * @param seed The 32-bit seed to alter the hash result predictably.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ */
+ XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
++
++/*!
++ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
++ *
++ * Call this to incrementally consume blocks of data.
++ *
++ * @param statePtr The state struct to update.
++ * @param input The block of data to be hashed, at least @p length bytes in size.
++ * @param length The length of @p input, in bytes.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ * @pre
++ *   The memory between @p input and @p input + @p length must be valid,
++ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
++ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ */
+ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
++
++/*!
++ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
++ *
++ * @note
++ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
++ *   digest, and update again.
++ *
++ * @param statePtr The state struct to calculate the hash from.
++ *
++ * @pre
++ *  @p statePtr must not be `NULL`.
++ *
++ * @return The calculated xxHash32 value from that state.
++ */
+ XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+ 
+ /*******   Canonical representation   *******/
+ 
+ /*
+  * The default return values from XXH functions are unsigned 32 and 64 bit
+  * integers.
+  * This the simplest and fastest format for further post-processing.
+@@ -346,159 +564,280 @@ XXH_PUBLIC_API XXH32_hash_t  XXH32_diges
+  * When writing hash values to storage, sending them over a network, or printing
+  * them, it's highly recommended to use the canonical representation to ensure
+  * portability across a wider range of systems, present and future.
+  *
+  * The following functions allow transformation of hash values to and from
+  * canonical format.
+  */
+ 
+-typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
++/*!
++ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
++ */
++typedef struct {
++    unsigned char digest[4]; /*!< Hash bytes, big endian */
++} XXH32_canonical_t;
++
++/*!
++ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
++ *
++ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
++ * @param hash The @ref XXH32_hash_t to be converted.
++ *
++ * @pre
++ *   @p dst must not be `NULL`.
++ */
+ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
++
++/*!
++ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
++ *
++ * @param src The @ref XXH32_canonical_t to convert.
++ *
++ * @pre
++ *   @p src must not be `NULL`.
++ *
++ * @return The converted hash.
++ */
+ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+ 
+ 
++#ifdef __has_attribute
++# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
++#else
++# define XXH_HAS_ATTRIBUTE(x) 0
++#endif
++
++/* C-language Attributes are added in C23. */
++#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
++# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
++#else
++# define XXH_HAS_C_ATTRIBUTE(x) 0
++#endif
++
++#if defined(__cplusplus) && defined(__has_cpp_attribute)
++# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
++#else
++# define XXH_HAS_CPP_ATTRIBUTE(x) 0
++#endif
++
++/*
++Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
++introduced in CPP17 and C23.
++CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
++C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
++*/
++#if XXH_HAS_C_ATTRIBUTE(x)
++# define XXH_FALLTHROUGH [[fallthrough]]
++#elif XXH_HAS_CPP_ATTRIBUTE(x)
++# define XXH_FALLTHROUGH [[fallthrough]]
++#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
++# define XXH_FALLTHROUGH __attribute__ ((fallthrough))
++#else
++# define XXH_FALLTHROUGH
++#endif
++
++/*!
++ * @}
++ * @ingroup public
++ * @{
++ */
++
+ #ifndef XXH_NO_LONG_LONG
+ /*-**********************************************************************
+ *  64-bit hash
+ ************************************************************************/
+-#if !defined (__VMS) \
++#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
++/*!
++ * @brief An unsigned 64-bit integer.
++ *
++ * Not necessarily defined to `uint64_t` but functionally equivalent.
++ */
++typedef uint64_t XXH64_hash_t;
++#elif !defined (__VMS) \
+   && (defined (__cplusplus) \
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+-#   include <stdint.h>
+-    typedef uint64_t XXH64_hash_t;
++#  include <stdint.h>
++   typedef uint64_t XXH64_hash_t;
+ #else
+-    /* the following type must have a width of 64-bit */
+-    typedef unsigned long long XXH64_hash_t;
++#  include <limits.h>
++#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
++     /* LP64 ABI says uint64_t is unsigned long */
++     typedef unsigned long XXH64_hash_t;
++#  else
++     /* the following type must have a width of 64-bit */
++     typedef unsigned long long XXH64_hash_t;
++#  endif
+ #endif
+ 
+ /*!
+- * XXH64():
+- * Returns the 64-bit hash of sequence of length @length stored at memory
+- * address @input.
+- * @seed can be used to alter the result predictably.
++ * @}
++ *
++ * @defgroup xxh64_family XXH64 family
++ * @ingroup public
++ * @{
++ * Contains functions used in the classic 64-bit xxHash algorithm.
++ *
++ * @note
++ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
++ *   and offers true 64/128 bit hash results.
++ *   It provides better speed for systems with vector processing capabilities.
++ */
++
++
++/*!
++ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+  *
+  * This function usually runs faster on 64-bit systems, but slower on 32-bit
+  * systems (see benchmark).
+  *
+- * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+- * and offers true 64/128 bit hash results. It provides a superior level of
+- * dispersion, and greatly reduces the risks of collisions.
++ * @param input The block of data to be hashed, at least @p length bytes in size.
++ * @param length The length of @p input, in bytes.
++ * @param seed The 64-bit seed to alter the hash's output predictably.
++ *
++ * @pre
++ *   The memory between @p input and @p input + @p length must be valid,
++ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
++ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
++ *
++ * @return The calculated 64-bit hash.
++ *
++ * @see
++ *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
++ *    Direct equivalents for the other variants of xxHash.
++ * @see
++ *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
+  */
+-XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed);
++XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
+ 
+ /*******   Streaming   *******/
++/*!
++ * @brief The opaque state struct for the XXH64 streaming API.
++ *
++ * @see XXH64_state_s for details.
++ */
+ typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+ XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+ 
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+ XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+ 
+ /*******   Canonical representation   *******/
+ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+ 
+-
+-/*-**********************************************************************
+-*  XXH3 64-bit variant
+-************************************************************************/
+-
+-/* ************************************************************************
+- * XXH3 is a new hash algorithm featuring:
++/*!
++ * @}
++ * ************************************************************************
++ * @defgroup xxh3_family XXH3 family
++ * @ingroup public
++ * @{
++ *
++ * XXH3 is a more recent hash algorithm featuring:
+  *  - Improved speed for both small and large inputs
+  *  - True 64-bit and 128-bit outputs
+  *  - SIMD acceleration
+  *  - Improved 32-bit viability
+  *
+  * Speed analysis methodology is explained here:
+  *
+  *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+  *
+- * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
+- * faster on small ones compared to XXH64, though exact differences depend on
+- * the platform.
+- *
+- * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
+- * on all platforms.
+- *
+- * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
+- *
+- * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
+- * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
+- * explained in the implementation.
++ * Compared to XXH64, expect XXH3 to run approximately
++ * ~2x faster on large inputs and >3x faster on small ones,
++ * exact differences vary depending on platform.
++ *
++ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
++ * but does not require it.
++ * Any 32-bit and 64-bit targets that can run XXH32 smoothly
++ * can run XXH3 at competitive speeds, even without vector support.
++ * Further details are explained in the implementation.
+  *
+  * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+- * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
++ * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
++ *
++ * XXH3 implementation is portable:
++ * it has a generic C90 formulation that can be compiled on any platform,
++ * all implementations generage exactly the same hash value on all platforms.
++ * Starting from v0.8.0, it's also labelled "stable", meaning that
++ * any future version will also generate the same hash value.
+  *
+  * XXH3 offers 2 variants, _64bits and _128bits.
+- * When only 64 bits are needed, prefer calling the _64bits variant, as it
++ *
++ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+  * reduces the amount of mixing, resulting in faster speed on small inputs.
+- *
+  * It's also generally simpler to manipulate a scalar return type than a struct.
+  *
+- * The 128-bit version adds additional strength, but it is slightly slower.
+- *
+- * The XXH3 algorithm is still in development.
+- * The results it produces may still change in future versions.
+- *
+- * Results produced by v0.7.x are not comparable with results from v0.7.y.
+- * However, the API is completely stable, and it can safely be used for
+- * ephemeral data (local sessions).
+- *
+- * Avoid storing values in long-term storage until the algorithm is finalized.
+- * XXH3's return values will be officially finalized upon reaching v0.8.0.
+- *
+- * After which, return values of XXH3 and XXH128 will no longer change in
+- * future versions.
+- *
+  * The API supports one-shot hashing, streaming mode, and custom secrets.
+  */
+ 
++/*-**********************************************************************
++*  XXH3 64-bit variant
++************************************************************************/
++
+ /* XXH3_64bits():
+  * default 64-bit variant, using default secret and default seed of 0.
+  * It's the fastest variant. */
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+ 
+ /*
+  * XXH3_64bits_withSeed():
+  * This variant generates a custom secret on the fly
+  * based on default secret altered using the `seed` value.
+  * While this operation is decently fast, note that it's not completely free.
+  * Note: seed==0 produces the same results as XXH3_64bits().
+  */
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+ 
++/*!
++ * The bare minimum size for a custom secret.
++ *
++ * @see
++ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
++ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
++ */
++#define XXH3_SECRET_SIZE_MIN 136
++
+ /*
+  * XXH3_64bits_withSecret():
+  * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+  * This makes it more difficult for an external actor to prepare an intentional collision.
+  * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+- * However, the quality of produced hash values depends on secret's entropy.
+- * Technically, the secret must look like a bunch of random bytes.
++ * However, the quality of the secret impacts the dispersion of the hash algorithm.
++ * Therefore, the secret _must_ look like a bunch of random bytes.
+  * Avoid "trivial" or structured data such as repeated sequences or a text document.
+- * Whenever unsure about the "randomness" of the blob of bytes,
+- * consider relabelling it as a "custom seed" instead,
+- * and employ "XXH3_generateSecret()" (see below)
+- * to generate a high entropy secret derived from the custom seed.
++ * Whenever in doubt about the "randomness" of the blob of bytes,
++ * consider employing "XXH3_generateSecret()" instead (see below).
++ * It will generate a proper high entropy secret derived from the blob of bytes.
++ * Another advantage of using XXH3_generateSecret() is that
++ * it guarantees that all bits within the initial blob of bytes
++ * will impact every bit of the output.
++ * This is not necessarily the case when using the blob of bytes directly
++ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+  */
+-#define XXH3_SECRET_SIZE_MIN 136
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+ 
+ 
+ /*******   Streaming   *******/
+ /*
+  * Streaming requires state maintenance.
+  * This operation costs memory and CPU.
+  * As a consequence, streaming is slower than one-shot hashing.
+  * For better performance, prefer one-shot functions whenever applicable.
+  */
++
++/*!
++ * @brief The state struct for the XXH3 streaming API.
++ *
++ * @see XXH3_state_s for details.
++ */
+ typedef struct XXH3_state_s XXH3_state_t;
+ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+ XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+ 
+ /*
+  * XXH3_64bits_reset():
+  * Initialize with default parameters.
+@@ -528,19 +867,25 @@ XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits
+ /* note : canonical representation of XXH3 is the same as XXH64
+  * since they both produce XXH64_hash_t values */
+ 
+ 
+ /*-**********************************************************************
+ *  XXH3 128-bit variant
+ ************************************************************************/
+ 
++/*!
++ * @brief The return value from 128-bit hashes.
++ *
++ * Stored in little endian order, although the fields themselves are in native
++ * endianness.
++ */
+ typedef struct {
+- XXH64_hash_t low64;
+- XXH64_hash_t high64;
++    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
++    XXH64_hash_t high64;  /*!< `value >> 64` */
+ } XXH128_hash_t;
+ 
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+ 
+ /*******   Streaming   *******/
+ /*
+@@ -587,16 +932,19 @@ XXH_PUBLIC_API int XXH128_cmp(const void
+ /*******   Canonical representation   *******/
+ typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+ 
+ 
+ #endif  /* XXH_NO_LONG_LONG */
+ 
++/*!
++ * @}
++ */
+ #endif /* XXHASH_H_5627135585666179 */
+ 
+ 
+ 
+ #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+ #define XXHASH_H_STATIC_13879238742
+ /* ****************************************************************************
+  * This section contains declarations which are not guaranteed to remain stable.
+@@ -607,137 +955,273 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hash
+  ***************************************************************************** */
+ 
+ /*
+  * These definitions are only present to allow static allocation
+  * of XXH states, on stack or in a struct, for example.
+  * Never **ever** access their members directly.
+  */
+ 
++/*!
++ * @internal
++ * @brief Structure for XXH32 streaming API.
++ *
++ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
++ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
++ * an opaque type. This allows fields to safely be changed.
++ *
++ * Typedef'd to @ref XXH32_state_t.
++ * Do not access the members of this struct directly.
++ * @see XXH64_state_s, XXH3_state_s
++ */
+ struct XXH32_state_s {
+-   XXH32_hash_t total_len_32;
+-   XXH32_hash_t large_len;
+-   XXH32_hash_t v1;
+-   XXH32_hash_t v2;
+-   XXH32_hash_t v3;
+-   XXH32_hash_t v4;
+-   XXH32_hash_t mem32[4];
+-   XXH32_hash_t memsize;
+-   XXH32_hash_t reserved;   /* never read nor write, might be removed in a future version */
++   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
++   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
++   XXH32_hash_t v[4];         /*!< Accumulator lanes */
++   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
++   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
++   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it, it may be removed. */
+ };   /* typedef'd to XXH32_state_t */
+ 
+ 
+ #ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+ 
++/*!
++ * @internal
++ * @brief Structure for XXH64 streaming API.
++ *
++ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
++ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
++ * an opaque type. This allows fields to safely be changed.
++ *
++ * Typedef'd to @ref XXH64_state_t.
++ * Do not access the members of this struct directly.
++ * @see XXH32_state_s, XXH3_state_s
++ */
+ struct XXH64_state_s {
+-   XXH64_hash_t total_len;
+-   XXH64_hash_t v1;
+-   XXH64_hash_t v2;
+-   XXH64_hash_t v3;
+-   XXH64_hash_t v4;
+-   XXH64_hash_t mem64[4];
+-   XXH32_hash_t memsize;
+-   XXH32_hash_t reserved32;  /* required for padding anyway */
+-   XXH64_hash_t reserved64;  /* never read nor write, might be removed in a future version */
++   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
++   XXH64_hash_t v[4];         /*!< Accumulator lanes */
++   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
++   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
++   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
++   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it, it may be removed. */
+ };   /* typedef'd to XXH64_state_t */
+ 
+-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
++#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+ #  include <stdalign.h>
+ #  define XXH_ALIGN(n)      alignas(n)
++#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
++/* In C++ alignas() is a keyword */
++#  define XXH_ALIGN(n)      alignas(n)
+ #elif defined(__GNUC__)
+ #  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+ #elif defined(_MSC_VER)
+ #  define XXH_ALIGN(n)      __declspec(align(n))
+ #else
+ #  define XXH_ALIGN(n)   /* disabled */
+ #endif
+ 
+ /* Old GCC versions only accept the attribute after the type in structures. */
+ #if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
++    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+     && defined(__GNUC__)
+ #   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+ #else
+ #   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+ #endif
+ 
++/*!
++ * @brief The size of the internal XXH3 buffer.
++ *
++ * This is the optimal update size for incremental hashing.
++ *
++ * @see XXH3_64b_update(), XXH3_128b_update().
++ */
+ #define XXH3_INTERNALBUFFER_SIZE 256
++
++/*!
++ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
++ *
++ * This is the size used in @ref XXH3_kSecret and the seeded functions.
++ *
++ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
++ */
+ #define XXH3_SECRET_DEFAULT_SIZE 192
++
++/*!
++ * @internal
++ * @brief Structure for XXH3 streaming API.
++ *
++ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
++ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
++ * Otherwise it is an opaque type.
++ * Never use this definition in combination with dynamic library.
++ * This allows fields to safely be changed in the future.
++ *
++ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
++ * Do not allocate this with `malloc()` or `new`,
++ * it will not be sufficiently aligned.
++ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
++ *
++ * Typedef'd to @ref XXH3_state_t.
++ * Do never access the members of this struct directly.
++ *
++ * @see XXH3_INITSTATE() for stack initialization.
++ * @see XXH3_createState(), XXH3_freeState().
++ * @see XXH32_state_s, XXH64_state_s
++ */
+ struct XXH3_state_s {
+    XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+-   /* used to store a custom secret generated from a seed */
++       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
+    XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
++       /*!< Used to store a custom secret generated from a seed. */
+    XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
++       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+    XXH32_hash_t bufferedSize;
+-   XXH32_hash_t reserved32;
++       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
++   XXH32_hash_t useSeed;
++       /*!< Reserved field. Needed for padding on 64-bit. */
+    size_t nbStripesSoFar;
++       /*!< Number or stripes processed. */
+    XXH64_hash_t totalLen;
++       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+    size_t nbStripesPerBlock;
++       /*!< Number of stripes per block. */
+    size_t secretLimit;
++       /*!< Size of @ref customSecret or @ref extSecret */
+    XXH64_hash_t seed;
++       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+    XXH64_hash_t reserved64;
+-   const unsigned char* extSecret;  /* reference to external secret;
+-                                     * if == NULL, use .customSecret instead */
++       /*!< Reserved field. */
++   const unsigned char* extSecret;
++       /*!< Reference to an external secret for the _withSecret variants, NULL
++        *   for other variants. */
+    /* note: there may be some padding at the end due to alignment on 64 bytes */
+ }; /* typedef'd to XXH3_state_t */
+ 
+ #undef XXH_ALIGN_MEMBER
+ 
+-/* When the XXH3_state_t structure is merely emplaced on stack,
++/*!
++ * @brief Initializes a stack-allocated `XXH3_state_s`.
++ *
++ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+  * it should be initialized with XXH3_INITSTATE() or a memset()
+  * in case its first reset uses XXH3_NNbits_reset_withSeed().
+  * This init can be omitted if the first reset uses default or _withSecret mode.
+  * This operation isn't necessary when the state is created with XXH3_createState().
+  * Note that this doesn't prepare the state for a streaming operation,
+  * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+  */
+ #define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
+ 
+ 
++/* XXH128() :
++ * simple alias to pre-selected XXH3_128bits variant
++ */
++XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
++
++
+ /* ===   Experimental API   === */
+ /* Symbols defined below must be considered tied to a specific library version. */
+ 
+ /*
+  * XXH3_generateSecret():
+  *
+  * Derive a high-entropy secret from any user-defined content, named customSeed.
+  * The generated secret can be used in combination with `*_withSecret()` functions.
+  * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
+  * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
+  *
+  * The function accepts as input a custom seed of any length and any content,
+- * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
+- * into an already allocated buffer secretBuffer.
+- * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long.
++ * and derives from it a high-entropy secret of length @secretSize
++ * into an already allocated buffer @secretBuffer.
++ * @secretSize must be >= XXH3_SECRET_SIZE_MIN
+  *
+  * The generated secret can then be used with any `*_withSecret()` variant.
+  * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
+  * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
+  * are part of this list. They all accept a `secret` parameter
+- * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
++ * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
+  * _and_ feature very high entropy (consist of random-looking bytes).
+  * These conditions can be a high bar to meet, so
+- * this function can be used to generate a secret of proper quality.
++ * XXH3_generateSecret() can be employed to ensure proper quality.
+  *
+  * customSeed can be anything. It can have any size, even small ones,
+- * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes.
+- * The resulting `secret` will nonetheless provide all expected qualities.
+- *
+- * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
++ * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
++ * The resulting `secret` will nonetheless provide all required qualities.
++ *
+  * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+  */
+-XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
+-
+-
+-/* simple short-cut to pre-selected XXH3_128bits variant */
+-XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
++XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
++
++
++/*
++ * XXH3_generateSecret_fromSeed():
++ *
++ * Generate the same secret as the _withSeed() variants.
++ *
++ * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
++ * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
++ *
++ * The generated secret can be used in combination with
++ *`*_withSecret()` and `_withSecretandSeed()` variants.
++ * This generator is notably useful in combination with `_withSecretandSeed()`,
++ * as a way to emulate a faster `_withSeed()` variant.
++ */
++XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
++
++/*
++ * *_withSecretandSeed() :
++ * These variants generate hash values using either
++ * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
++ * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
++ *
++ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
++ * `_withSeed()` has to generate the secret on the fly for "large" keys.
++ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
++ * `_withSecret()` has to generate the masks on the fly for "small" keys,
++ * which requires more instructions than _withSeed() variants.
++ * Therefore, _withSecretandSeed variant combines the best of both worlds.
++ *
++ * When @secret has been generated by XXH3_generateSecret_fromSeed(),
++ * this variant produces *exactly* the same results as `_withSeed()` variant,
++ * hence offering only a pure speed benefit on "large" input,
++ * by skipping the need to regenerate the secret for every large input.
++ *
++ * Another usage scenario is to hash the secret to a 64-bit hash value,
++ * for example with XXH3_64bits(), which then becomes the seed,
++ * and then employ both the seed and the secret in _withSecretandSeed().
++ * On top of speed, an added benefit is that each bit in the secret
++ * has a 50% chance to swap each bit in the output,
++ * via its impact to the seed.
++ * This is not guaranteed when using the secret directly in "small data" scenarios,
++ * because only portions of the secret are employed for small data.
++ */
++XXH_PUBLIC_API XXH64_hash_t
++XXH3_64bits_withSecretandSeed(const void* data, size_t len,
++                              const void* secret, size_t secretSize,
++                              XXH64_hash_t seed);
++
++XXH_PUBLIC_API XXH128_hash_t
++XXH3_128bits_withSecretandSeed(const void* data, size_t len,
++                               const void* secret, size_t secretSize,
++                               XXH64_hash_t seed64);
++
++XXH_PUBLIC_API XXH_errorcode
++XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
++                                    const void* secret, size_t secretSize,
++                                    XXH64_hash_t seed64);
++
++XXH_PUBLIC_API XXH_errorcode
++XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
++                                     const void* secret, size_t secretSize,
++                                     XXH64_hash_t seed64);
+ 
+ 
+ #endif  /* XXH_NO_LONG_LONG */
+-
+-
+ #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+ #  define XXH_IMPLEMENTATION
+ #endif
+ 
+ #endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+ 
+ 
+ /* ======================================================================== */
+@@ -769,97 +1253,114 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(cons
+ 
+ #if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+    || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+ #  define XXH_IMPLEM_13a8737387
+ 
+ /* *************************************
+ *  Tuning parameters
+ ***************************************/
++
+ /*!
+- * XXH_FORCE_MEMORY_ACCESS:
++ * @defgroup tuning Tuning parameters
++ * @{
++ *
++ * Various macros to control xxHash's behavior.
++ */
++#ifdef XXH_DOXYGEN
++/*!
++ * @brief Define this to disable 64-bit code.
++ *
++ * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
++ */
++#  define XXH_NO_LONG_LONG
++#  undef XXH_NO_LONG_LONG /* don't actually */
++/*!
++ * @brief Controls how unaligned memory is accessed.
++ *
+  * By default, access to unaligned memory is controlled by `memcpy()`, which is
+  * safe and portable.
+  *
+  * Unfortunately, on some target/compiler combinations, the generated assembly
+  * is sub-optimal.
+  *
+  * The below switch allow selection of a different access method
+  * in the search for improved performance.
+- * Method 0 (default):
+- *     Use `memcpy()`. Safe and portable. Default.
+- * Method 1:
+- *     `__attribute__((packed))` statement. It depends on compiler extensions
+- *     and is therefore not portable.
+- *     This method is safe if your compiler supports it, and *generally* as
+- *     fast or faster than `memcpy`.
+- * Method 2:
+- *     Direct access via cast. This method doesn't depend on the compiler but
+- *     violates the C standard.
+- *     It can generate buggy code on targets which do not support unaligned
+- *     memory accesses.
+- *     But in some circumstances, it's the only known way to get the most
+- *     performance (example: GCC + ARMv6)
+- * Method 3:
+- *     Byteshift. This can generate the best code on old compilers which don't
++ *
++ * @par Possible options:
++ *
++ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
++ *   @par
++ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
++ *     eliminate the function call and treat it as an unaligned access.
++ *
++ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
++ *   @par
++ *     Depends on compiler extensions and is therefore not portable.
++ *     This method is safe _if_ your compiler supports it,
++ *     and *generally* as fast or faster than `memcpy`.
++ *
++ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
++ *  @par
++ *     Casts directly and dereferences. This method doesn't depend on the
++ *     compiler, but it violates the C standard as it directly dereferences an
++ *     unaligned pointer. It can generate buggy code on targets which do not
++ *     support unaligned memory accesses, but in some circumstances, it's the
++ *     only known way to get the most performance.
++ *
++ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
++ *  @par
++ *     Also portable. This can generate the best code on old compilers which don't
+  *     inline small `memcpy()` calls, and it might also be faster on big-endian
+- *     systems which lack a native byteswap instruction.
+- * See https://stackoverflow.com/a/32095106/646947 for details.
+- * Prefer these methods in priority order (0 > 1 > 2 > 3)
++ *     systems which lack a native byteswap instruction. However, some compilers
++ *     will emit literal byteshifts even if the target supports unaligned access.
++ *  .
++ *
++ * @warning
++ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
++ *   care, as what works on one compiler/platform/optimization level may cause
++ *   another to read garbage data or even crash.
++ *
++ * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
++ *
++ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+  */
+-#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+-#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+-#    define XXH_FORCE_MEMORY_ACCESS 2
+-#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+-  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+-#    define XXH_FORCE_MEMORY_ACCESS 1
+-#  endif
+-#endif
++#  define XXH_FORCE_MEMORY_ACCESS 0
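/* Editorial sketch, not part of the patch: the four XXH_FORCE_MEMORY_ACCESS
 * strategies documented above, shown as standalone unaligned 32-bit loads.
 * The demo_* names are illustrative and do not exist in xxHash. */
#include <stdint.h>
#include <string.h>

static uint32_t demo_read32_memcpy(const void* p)        /* method 0 (default) */
{
    uint32_t v;
    memcpy(&v, p, sizeof(v));    /* most compilers lower this to a single load */
    return v;
}

#if defined(__GNUC__)
typedef union { uint32_t u32; } __attribute__((packed)) demo_unalign32;
static uint32_t demo_read32_packed(const void* p)        /* method 1: compiler extension */
{
    return ((const demo_unalign32*)p)->u32;
}
#endif

static uint32_t demo_read32_cast(const void* p)          /* method 2: violates the C standard */
{
    return *(const uint32_t*)p;
}

static uint32_t demo_read32_byteshift(const uint8_t* p)  /* method 3: explicit little-endian load */
{
    return (uint32_t)p[0]
         | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16)
         | ((uint32_t)p[3] << 24);
}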
+ 
+ /*!
+- * XXH_ACCEPT_NULL_INPUT_POINTER:
+- * If the input pointer is NULL, xxHash's default behavior is to dereference it,
+- * triggering a segfault.
+- * When this macro is enabled, xxHash actively checks the input for a null pointer.
+- * If it is, the result for null input pointers is the same as a zero-length input.
+- */
+-#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+-#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+-#endif
+-
+-/*!
+- * XXH_FORCE_ALIGN_CHECK:
+- * This is an important performance trick
+- * for architectures without decent unaligned memory access performance.
+- * It checks for input alignment, and when conditions are met,
+- * uses a "fast path" employing direct 32-bit/64-bit read,
+- * resulting in _dramatically faster_ read speed.
+- *
+- * The check costs one initial branch per hash, which is generally negligible, but not zero.
+- * Moreover, it's not useful to generate binary for an additional code path
+- * if memory access uses same instruction for both aligned and unaligned adresses.
++ * @def XXH_FORCE_ALIGN_CHECK
++ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
++ * and XXH64() only).
++ *
++ * This is an important performance trick for architectures without decent
++ * unaligned memory access performance.
++ *
++ * It checks for input alignment, and when conditions are met, uses a "fast
++ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
++ * faster_ read speed.
++ *
++ * The check costs one initial branch per hash, which is generally negligible,
++ * but not zero.
++ *
++ * Moreover, it's not useful to generate an additional code path if memory
++ * access uses the same instruction for both aligned and unaligned
++ * addresses (e.g. x86 and aarch64).
+  *
+  * In these cases, the alignment check can be removed by setting this macro to 0.
+  * Then the code will always use unaligned memory access.
+  * Align check is automatically disabled on x86, x64 & arm64,
+  * which are platforms known to offer good unaligned memory accesses performance.
+  *
+  * This option does not affect XXH3 (only XXH32 and XXH64).
+  */
+-#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+-#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
+-   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
+-#    define XXH_FORCE_ALIGN_CHECK 0
+-#  else
+-#    define XXH_FORCE_ALIGN_CHECK 1
+-#  endif
+-#endif
++#  define XXH_FORCE_ALIGN_CHECK 0
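/* Editorial sketch, not part of the patch: the alignment test behind the fast
 * path described above.  A pointer is suitable for a direct 32-bit read when
 * its two low address bits are zero; demo_is_4byte_aligned is illustrative. */
#include <stddef.h>   /* size_t */

static int demo_is_4byte_aligned(const void* p)
{
    return (((size_t)p) & 3) == 0;
}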
+ 
+ /*!
+- * XXH_NO_INLINE_HINTS:
++ * @def XXH_NO_INLINE_HINTS
++ * @brief When non-zero, sets all functions to `static`.
+  *
+  * By default, xxHash tries to force the compiler to inline almost all internal
+  * functions.
+  *
+  * This can usually improve performance due to reduced jumping and improved
+  * constant folding, but significantly increases the size of the binary which
+  * might not be favorable.
+  *
+@@ -867,99 +1368,168 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(cons
+  * depending on the architecture.
+  *
+  * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+  * compiler full control on whether to inline or not.
+  *
+  * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+  * -fno-inline with GCC or Clang, this will automatically be defined.
+  */
++#  define XXH_NO_INLINE_HINTS 0
++
++/*!
++ * @def XXH32_ENDJMP
++ * @brief Whether to use a jump for `XXH32_finalize`.
++ *
++ * `XXH32_finalize` uses multiple branches in the finalizer.
++ * This is generally preferable for performance,
++ * but depending on the exact architecture, a single jump may be preferable.
++ *
++ * This setting can only possibly make a difference for very small inputs.
++ */
++#  define XXH32_ENDJMP 0
++
++/*!
++ * @internal
++ * @brief Redefines old internal names.
++ *
++ * For compatibility with code that uses xxHash's internals before the names
++ * were changed to improve namespacing. There is no other reason to use this.
++ */
++#  define XXH_OLD_NAMES
++#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
++#endif /* XXH_DOXYGEN */
++/*!
++ * @}
++ */
++
++#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
++   /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
++#  if !defined(__clang__) && \
++( \
++    (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
++    ( \
++        defined(__GNUC__) && ( \
++            (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
++            ( \
++                defined(__mips__) && \
++                (__mips <= 5 || __mips_isa_rev < 6) && \
++                (!defined(__mips16) || defined(__mips_mips16e2)) \
++            ) \
++        ) \
++    ) \
++)
++#    define XXH_FORCE_MEMORY_ACCESS 1
++#  endif
++#endif
++
++#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
++#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
++   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
++#    define XXH_FORCE_ALIGN_CHECK 0
++#  else
++#    define XXH_FORCE_ALIGN_CHECK 1
++#  endif
++#endif
++
+ #ifndef XXH_NO_INLINE_HINTS
+ #  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+    || defined(__NO_INLINE__)     /* -O0, -fno-inline */
+ #    define XXH_NO_INLINE_HINTS 1
+ #  else
+ #    define XXH_NO_INLINE_HINTS 0
+ #  endif
+ #endif
+ 
++#ifndef XXH32_ENDJMP
++/* generally preferable for performance */
++#  define XXH32_ENDJMP 0
++#endif
++
+ /*!
+- * XXH_REROLL:
+- * Whether to reroll XXH32_finalize, and XXH64_finalize,
+- * instead of using an unrolled jump table/if statement loop.
+- *
+- * This is automatically defined on -Os/-Oz on GCC and Clang.
++ * @defgroup impl Implementation
++ * @{
+  */
+-#ifndef XXH_REROLL
+-#  if defined(__OPTIMIZE_SIZE__)
+-#    define XXH_REROLL 1
+-#  else
+-#    define XXH_REROLL 0
+-#  endif
+-#endif
+ 
+ 
+ /* *************************************
+ *  Includes & Memory related functions
+ ***************************************/
+-/*!
++/*
+  * Modify the local functions below should you wish to use
+  * different memory routines for malloc() and free()
+  */
+ #include <stdlib.h>
+ 
++/*!
++ * @internal
++ * @brief Modify this function to use a different routine than malloc().
++ */
+ static void* XXH_malloc(size_t s) { return malloc(s); }
++
++/*!
++ * @internal
++ * @brief Modify this function to use a different routine than free().
++ */
+ static void XXH_free(void* p) { free(p); }
+ 
+-/*! and for memcpy() */
+ #include <string.h>
++
++/*!
++ * @internal
++ * @brief Modify this function to use a different routine than memcpy().
++ */
+ static void* XXH_memcpy(void* dest, const void* src, size_t size)
+ {
+     return memcpy(dest,src,size);
+ }
+ 
+ #include <limits.h>   /* ULLONG_MAX */
+ 
+ 
+ /* *************************************
+ *  Compiler Specific Options
+ ***************************************/
+ #ifdef _MSC_VER /* Visual Studio warning fix */
+ #  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+ #endif
+ 
+ #if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+-#  if defined(__GNUC__)
++#  if defined(__GNUC__) || defined(__clang__)
+ #    define XXH_FORCE_INLINE static __attribute__((unused))
+ #  else
+ #    define XXH_FORCE_INLINE static
+ #  endif
+ #  define XXH_NO_INLINE static
+ /* enable inlining hints */
++#elif defined(__GNUC__) || defined(__clang__)
++#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
++#  define XXH_NO_INLINE static __attribute__((noinline))
+ #elif defined(_MSC_VER)  /* Visual Studio */
+ #  define XXH_FORCE_INLINE static __forceinline
+ #  define XXH_NO_INLINE static __declspec(noinline)
+-#elif defined(__GNUC__)
+-#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+-#  define XXH_NO_INLINE static __attribute__((noinline))
+ #elif defined (__cplusplus) \
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+ #  define XXH_FORCE_INLINE static inline
+ #  define XXH_NO_INLINE static
+ #else
+ #  define XXH_FORCE_INLINE static
+ #  define XXH_NO_INLINE static
+ #endif
+ 
+ 
+ 
+ /* *************************************
+ *  Debug
+ ***************************************/
+-/*
++/*!
++ * @ingroup tuning
++ * @def XXH_DEBUGLEVEL
++ * @brief Sets the debugging level.
++ *
+  * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+  * compiler's command line options. The value must be a number.
+  */
+ #ifndef XXH_DEBUGLEVEL
+ #  ifdef DEBUGLEVEL /* backwards compat */
+ #    define XXH_DEBUGLEVEL DEBUGLEVEL
+ #  else
+ #    define XXH_DEBUGLEVEL 0
+@@ -969,18 +1539,49 @@ static void* XXH_memcpy(void* dest, cons
+ #if (XXH_DEBUGLEVEL>=1)
+ #  include <assert.h>   /* note: can still be disabled with NDEBUG */
+ #  define XXH_ASSERT(c)   assert(c)
+ #else
+ #  define XXH_ASSERT(c)   ((void)0)
+ #endif
+ 
+ /* note: use after variable declarations */
+-#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+-
++#ifndef XXH_STATIC_ASSERT
++#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
++#    include <assert.h>
++#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
++#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
++#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
++#  else
++#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
++#  endif
++#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
++#endif
++
++/*!
++ * @internal
++ * @def XXH_COMPILER_GUARD(var)
++ * @brief Used to prevent unwanted optimizations for @p var.
++ *
++ * It uses an empty GCC inline assembly statement with a register constraint
++ * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
++ * on x86) and marks it as modified.
++ *
++ * This is used in a few places to avoid unwanted autovectorization (e.g.
++ * XXH32_round()). All vectorization we want is explicit via intrinsics,
++ * and _usually_ isn't wanted elsewhere.
++ *
++ * We also use it to prevent unwanted constant folding for AArch64 in
++ * XXH3_initCustomSecret_scalar().
++ */
++#if defined(__GNUC__) || defined(__clang__)
++#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
++#else
++#  define XXH_COMPILER_GUARD(var) ((void)0)
++#endif
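/* Editorial sketch, not part of the patch: how a guard like XXH_COMPILER_GUARD
 * is typically used.  The empty asm statement with a "+r" constraint tells the
 * optimizer that acc may have changed, so it cannot constant-fold or
 * autovectorize across this point.  demo_round is illustrative only. */
#include <stdint.h>

static uint32_t demo_round(uint32_t acc, uint32_t input)
{
    acc += input * 2654435761u;               /* arbitrary mixing step for the demo */
    acc  = (acc << 13) | (acc >> 19);         /* rotate left by 13 */
#if defined(__GNUC__) || defined(__clang__)
    __asm__ __volatile__("" : "+r" (acc));    /* same trick as XXH_COMPILER_GUARD(acc) */
#endif
    return acc;
}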
+ 
+ /* *************************************
+ *  Basic Types
+ ***************************************/
+ #if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+ # include <stdint.h>
+@@ -993,16 +1594,66 @@ typedef XXH32_hash_t xxh_u32;
+ #ifdef XXH_OLD_NAMES
+ #  define BYTE xxh_u8
+ #  define U8   xxh_u8
+ #  define U32  xxh_u32
+ #endif
+ 
+ /* ***   Memory access   *** */
+ 
++/*!
++ * @internal
++ * @fn xxh_u32 XXH_read32(const void* ptr)
++ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
++ *
++ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
++ *
++ * @param ptr The pointer to read from.
++ * @return The 32-bit native endian integer from the bytes at @p ptr.
++ */
++
++/*!
++ * @internal
++ * @fn xxh_u32 XXH_readLE32(const void* ptr)
++ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
++ *
++ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
++ *
++ * @param ptr The pointer to read from.
++ * @return The 32-bit little endian integer from the bytes at @p ptr.
++ */
++
++/*!
++ * @internal
++ * @fn xxh_u32 XXH_readBE32(const void* ptr)
++ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
++ *
++ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
++ *
++ * @param ptr The pointer to read from.
++ * @return The 32-bit big endian integer from the bytes at @p ptr.
++ */
++
++/*!
++ * @internal
++ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
++ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
++ *
++ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
++ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
++ * always @ref XXH_alignment::XXH_unaligned.
++ *
++ * @param ptr The pointer to read from.
++ * @param align Whether @p ptr is aligned.
++ * @pre
++ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
++ *   aligned.
++ * @return The 32-bit little endian integer from the bytes at @p ptr.
++ */
++
+ #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+ /*
+  * Manual byteshift. Best for old compilers which don't inline memcpy.
+  * We actually directly use XXH_readLE32 and XXH_readBE32.
+  */
+ #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+ 
+ /*
+@@ -1027,54 +1678,64 @@ static xxh_u32 XXH_read32(const void* pt
+     typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
+     return ((const xxh_unalign*)ptr)->u32;
+ }
+ 
+ #else
+ 
+ /*
+  * Portable and safe solution. Generally efficient.
+- * see: https://stackoverflow.com/a/32095106/646947
++ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+  */
+ static xxh_u32 XXH_read32(const void* memPtr)
+ {
+     xxh_u32 val;
+-    memcpy(&val, memPtr, sizeof(val));
++    XXH_memcpy(&val, memPtr, sizeof(val));
+     return val;
+ }
+ 
+ #endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+ 
+ 
+-/* ***   Endianess   *** */
+-typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
++/* ***   Endianness   *** */
+ 
+ /*!
+- * XXH_CPU_LITTLE_ENDIAN:
++ * @ingroup tuning
++ * @def XXH_CPU_LITTLE_ENDIAN
++ * @brief Whether the target is little endian.
++ *
+  * Defined to 1 if the target is little endian, or 0 if it is big endian.
+  * It can be defined externally, for example on the compiler command line.
+  *
+- * If it is not defined, a runtime check (which is usually constant folded)
+- * is used instead.
++ * If it is not defined,
++ * a runtime check (which is usually constant folded) is used instead.
++ *
++ * @note
++ *   This is not necessarily defined to an integer constant.
++ *
++ * @see XXH_isLittleEndian() for the runtime check.
+  */
+ #ifndef XXH_CPU_LITTLE_ENDIAN
+ /*
+  * Try to detect endianness automatically, to avoid the nonstandard behavior
+  * in `XXH_isLittleEndian()`
+  */
+ #  if defined(_WIN32) /* Windows is always little endian */ \
+      || defined(__LITTLE_ENDIAN__) \
+      || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #    define XXH_CPU_LITTLE_ENDIAN 1
+ #  elif defined(__BIG_ENDIAN__) \
+      || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #    define XXH_CPU_LITTLE_ENDIAN 0
+ #  else
+-/*
+- * runtime test, presumed to simplify to a constant by compiler
++/*!
++ * @internal
++ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
++ *
++ * Most compilers will constant fold this.
+  */
+ static int XXH_isLittleEndian(void)
+ {
+     /*
+      * Portable and well-defined behavior.
+      * Don't use static: it is detrimental to performance.
+      */
+     const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+@@ -1093,29 +1754,50 @@ static int XXH_isLittleEndian(void)
+ #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+ 
+ #ifdef __has_builtin
+ #  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+ #else
+ #  define XXH_HAS_BUILTIN(x) 0
+ #endif
+ 
++/*!
++ * @internal
++ * @def XXH_rotl32(x,r)
++ * @brief 32-bit rotate left.
++ *
++ * @param x The 32-bit integer to be rotated.
++ * @param r The number of bits to rotate.
++ * @pre
++ *   @p r > 0 && @p r < 32
++ * @note
++ *   @p x and @p r may be evaluated multiple times.
++ * @return The rotated result.
++ */
+ #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                                && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+ #  define XXH_rotl32 __builtin_rotateleft32
+ #  define XXH_rotl64 __builtin_rotateleft64
+ /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+ #elif defined(_MSC_VER)
+ #  define XXH_rotl32(x,r) _rotl(x,r)
+ #  define XXH_rotl64(x,r) _rotl64(x,r)
+ #else
+ #  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+ #  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+ #endif
+ 
++/*!
++ * @internal
++ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
++ * @brief A 32-bit byteswap.
++ *
++ * @param x The 32-bit integer to byteswap.
++ * @return @p x, byteswapped.
++ */
+ #if defined(_MSC_VER)     /* Visual Studio */
+ #  define XXH_swap32 _byteswap_ulong
+ #elif XXH_GCC_VERSION >= 403
+ #  define XXH_swap32 __builtin_bswap32
+ #else
+ static xxh_u32 XXH_swap32 (xxh_u32 x)
+ {
+     return  ((x << 24) & 0xff000000 ) |
+@@ -1124,17 +1806,25 @@ static xxh_u32 XXH_swap32 (xxh_u32 x)
+             ((x >> 24) & 0x000000ff );
+ }
+ #endif
+ 
+ 
+ /* ***************************
+ *  Memory reads
+ *****************************/
+-typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
++
++/*!
++ * @internal
++ * @brief Enum to indicate whether a pointer is aligned.
++ */
++typedef enum {
++    XXH_aligned,  /*!< Aligned */
++    XXH_unaligned /*!< Possibly unaligned */
++} XXH_alignment;
+ 
+ /*
+  * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+  *
+  * This is ideal for older compilers which don't inline memcpy.
+  */
+ #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+ 
+@@ -1177,48 +1867,66 @@ XXH_readLE32_align(const void* ptr, XXH_
+         return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+     }
+ }
+ 
+ 
+ /* *************************************
+ *  Misc
+ ***************************************/
++/*! @ingroup public */
+ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+ 
+ 
+ /* *******************************************************************
+ *  32-bit hash functions
+ *********************************************************************/
+-static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+-static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+-static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+-static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+-static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
++/*!
++ * @}
++ * @defgroup xxh32_impl XXH32 implementation
++ * @ingroup impl
++ * @{
++ */
++ /* #define instead of static const, to be used as initializers */
++#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
++#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
++#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
++#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
++#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define PRIME32_1 XXH_PRIME32_1
+ #  define PRIME32_2 XXH_PRIME32_2
+ #  define PRIME32_3 XXH_PRIME32_3
+ #  define PRIME32_4 XXH_PRIME32_4
+ #  define PRIME32_5 XXH_PRIME32_5
+ #endif
+ 
++/*!
++ * @internal
++ * @brief Normal stripe processing routine.
++ *
++ * This shuffles the bits so that any bit from @p input impacts several bits in
++ * @p acc.
++ *
++ * @param acc The accumulator lane.
++ * @param input The stripe of input to mix.
++ * @return The mixed accumulator lane.
++ */
+ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+ {
+     acc += input * XXH_PRIME32_2;
+     acc  = XXH_rotl32(acc, 13);
+     acc *= XXH_PRIME32_1;
+-#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
++#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+     /*
+      * UGLY HACK:
+-     * This inline assembly hack forces acc into a normal register. This is the
+-     * only thing that prevents GCC and Clang from autovectorizing the XXH32
+-     * loop (pragmas and attributes don't work for some resason) without globally
+-     * disabling SSE4.1.
++     * A compiler fence is the only thing that prevents GCC and Clang from
++     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
++     * reason) without globally disabling SSE4.1.
+      *
+      * The reason we want to avoid vectorization is because despite working on
+      * 4 integers at a time, there are multiple factors slowing XXH32 down on
+      * SSE4:
+      * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+      *   newer chips!) making it slightly slower to multiply four integers at
+      *   once compared to four integers independently. Even when pmulld was
+      *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+@@ -1233,142 +1941,159 @@ static xxh_u32 XXH32_round(xxh_u32 acc, 
+      *      roll   v, 13    // reliably fast across the board
+      *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+      *
+      * - Instruction level parallelism is actually more beneficial here because
+      *   the SIMD actually serializes this operation: While v1 is rotating, v2
+      *   can load data, while v3 can multiply. SSE forces them to operate
+      *   together.
+      *
+-     * How this hack works:
+-     * __asm__(""       // Declare an assembly block but don't declare any instructions
+-     *          :       // However, as an Input/Output Operand,
+-     *          "+r"    // constrain a read/write operand (+) as a general purpose register (r).
+-     *          (acc)   // and set acc as the operand
+-     * );
+-     *
+-     * Because of the 'r', the compiler has promised that seed will be in a
+-     * general purpose register and the '+' says that it will be 'read/write',
+-     * so it has to assume it has changed. It is like volatile without all the
+-     * loads and stores.
+-     *
+-     * Since the argument has to be in a normal register (not an SSE register),
+-     * each time XXH32_round is called, it is impossible to vectorize.
++     * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
++     * and it is pointless writing a NEON implementation that is basically the
++     * same speed as scalar for XXH32.
+      */
+-    __asm__("" : "+r" (acc));
++    XXH_COMPILER_GUARD(acc);
+ #endif
+     return acc;
+ }
+ 
+-/* mix all bits */
++/*!
++ * @internal
++ * @brief Mixes all bits to finalize the hash.
++ *
++ * The final mix ensures that all input bits have a chance to impact any bit in
++ * the output digest, resulting in an unbiased distribution.
++ *
++ * @param h32 The hash to avalanche.
++ * @return The avalanched hash.
++ */
+ static xxh_u32 XXH32_avalanche(xxh_u32 h32)
+ {
+     h32 ^= h32 >> 15;
+     h32 *= XXH_PRIME32_2;
+     h32 ^= h32 >> 13;
+     h32 *= XXH_PRIME32_3;
+     h32 ^= h32 >> 16;
+     return(h32);
+ }
+ 
+ #define XXH_get32bits(p) XXH_readLE32_align(p, align)
+ 
++/*!
++ * @internal
++ * @brief Processes the last 0-15 bytes of @p ptr.
++ *
++ * There may be up to 15 bytes remaining to consume from the input.
++ * This final stage will digest them to ensure that all input bytes are present
++ * in the final mix.
++ *
++ * @param h32 The hash to finalize.
++ * @param ptr The pointer to the remaining input.
++ * @param len The remaining length, modulo 16.
++ * @param align Whether @p ptr is aligned.
++ * @return The finalized hash.
++ */
+ static xxh_u32
+ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
+ {
+ #define XXH_PROCESS1 do {                           \
+     h32 += (*ptr++) * XXH_PRIME32_5;                \
+     h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1;      \
+ } while (0)
+ 
+ #define XXH_PROCESS4 do {                           \
+     h32 += XXH_get32bits(ptr) * XXH_PRIME32_3;      \
+     ptr += 4;                                   \
+     h32  = XXH_rotl32(h32, 17) * XXH_PRIME32_4;     \
+ } while (0)
+ 
+-    /* Compact rerolled version */
+-    if (XXH_REROLL) {
++    if (ptr==NULL) XXH_ASSERT(len == 0);
++
++    /* Compact rerolled version; generally faster */
++    if (!XXH32_ENDJMP) {
+         len &= 15;
+         while (len >= 4) {
+             XXH_PROCESS4;
+             len -= 4;
+         }
+         while (len > 0) {
+             XXH_PROCESS1;
+             --len;
+         }
+         return XXH32_avalanche(h32);
+     } else {
+          switch(len&15) /* or switch(bEnd - p) */ {
+            case 12:      XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 8:       XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 4:       XXH_PROCESS4;
+                          return XXH32_avalanche(h32);
+ 
+            case 13:      XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 9:       XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 5:       XXH_PROCESS4;
+                          XXH_PROCESS1;
+                          return XXH32_avalanche(h32);
+ 
+            case 14:      XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 10:      XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 6:       XXH_PROCESS4;
+                          XXH_PROCESS1;
+                          XXH_PROCESS1;
+                          return XXH32_avalanche(h32);
+ 
+            case 15:      XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 11:      XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 7:       XXH_PROCESS4;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 3:       XXH_PROCESS1;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 2:       XXH_PROCESS1;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 1:       XXH_PROCESS1;
+-                         /* fallthrough */
++                         XXH_FALLTHROUGH;
+            case 0:       return XXH32_avalanche(h32);
+         }
+         XXH_ASSERT(0);
+         return h32;   /* reaching this point is deemed impossible */
+     }
+ }
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define PROCESS1 XXH_PROCESS1
+ #  define PROCESS4 XXH_PROCESS4
+ #else
+ #  undef XXH_PROCESS1
+ #  undef XXH_PROCESS4
+ #endif
+ 
++/*!
++ * @internal
++ * @brief The implementation for @ref XXH32().
++ *
++ * @param input , len , seed Directly passed from @ref XXH32().
++ * @param align Whether @p input is aligned.
++ * @return The calculated hash.
++ */
+ XXH_FORCE_INLINE xxh_u32
+ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+ {
+-    const xxh_u8* bEnd = input + len;
+     xxh_u32 h32;
+ 
+-#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+-    if (input==NULL) {
+-        len=0;
+-        bEnd=input=(const xxh_u8*)(size_t)16;
+-    }
+-#endif
++    if (input==NULL) XXH_ASSERT(len == 0);
+ 
+     if (len>=16) {
++        const xxh_u8* const bEnd = input + len;
+         const xxh_u8* const limit = bEnd - 15;
+         xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+         xxh_u32 v2 = seed + XXH_PRIME32_2;
+         xxh_u32 v3 = seed + 0;
+         xxh_u32 v4 = seed - XXH_PRIME32_1;
+ 
+         do {
+             v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+@@ -1383,233 +2108,207 @@ XXH32_endian_align(const xxh_u8* input, 
+         h32  = seed + XXH_PRIME32_5;
+     }
+ 
+     h32 += (xxh_u32)len;
+ 
+     return XXH32_finalize(h32, input, len&15, align);
+ }
+ 
+-
++/*! @ingroup xxh32_family */
+ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+ {
+ #if 0
+     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+     XXH32_state_t state;
+     XXH32_reset(&state, seed);
+     XXH32_update(&state, (const xxh_u8*)input, len);
+     return XXH32_digest(&state);
+-
+ #else
+-
+     if (XXH_FORCE_ALIGN_CHECK) {
+         if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+             return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+     }   }
+ 
+     return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+ #endif
+ }
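/* Editorial sketch, not part of the patch: typical one-shot use of the XXH32()
 * entry point defined above (declared in xxhash.h). */
#include <stdio.h>

static void demo_xxh32_oneshot(void)
{
    const char msg[] = "hello world";
    XXH32_hash_t const h = XXH32(msg, sizeof(msg) - 1, 0 /* seed */);
    printf("XXH32 = %08x\n", (unsigned)h);
}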
+ 
+ 
+ 
+ /*******   Hash streaming   *******/
+-
++/*!
++ * @ingroup xxh32_family
++ */
+ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+ {
+     return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+ }
++/*! @ingroup xxh32_family */
+ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+ {
+     XXH_free(statePtr);
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh32_family */
+ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+ {
+-    memcpy(dstState, srcState, sizeof(*dstState));
++    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+ }
+ 
++/*! @ingroup xxh32_family */
+ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+ {
+     XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+     memset(&state, 0, sizeof(state));
+-    state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+-    state.v2 = seed + XXH_PRIME32_2;
+-    state.v3 = seed + 0;
+-    state.v4 = seed - XXH_PRIME32_1;
++    state.v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
++    state.v[1] = seed + XXH_PRIME32_2;
++    state.v[2] = seed + 0;
++    state.v[3] = seed - XXH_PRIME32_1;
+     /* do not write into reserved, planned to be removed in a future version */
+-    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
++    XXH_memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
+     return XXH_OK;
+ }
+ 
+ 
++/*! @ingroup xxh32_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+ {
+-    if (input==NULL)
+-#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
++    if (input==NULL) {
++        XXH_ASSERT(len == 0);
+         return XXH_OK;
+-#else
+-        return XXH_ERROR;
+-#endif
++    }
+ 
+     {   const xxh_u8* p = (const xxh_u8*)input;
+         const xxh_u8* const bEnd = p + len;
+ 
+         state->total_len_32 += (XXH32_hash_t)len;
+         state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+ 
+         if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+             XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+             state->memsize += (XXH32_hash_t)len;
+             return XXH_OK;
+         }
+ 
+         if (state->memsize) {   /* some data left from previous update */
+             XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+             {   const xxh_u32* p32 = state->mem32;
+-                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
+-                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
+-                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
+-                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
++                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
++                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
++                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
++                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
+             }
+             p += 16-state->memsize;
+             state->memsize = 0;
+         }
+ 
+         if (p <= bEnd-16) {
+             const xxh_u8* const limit = bEnd - 16;
+-            xxh_u32 v1 = state->v1;
+-            xxh_u32 v2 = state->v2;
+-            xxh_u32 v3 = state->v3;
+-            xxh_u32 v4 = state->v4;
+ 
+             do {
+-                v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
+-                v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
+-                v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
+-                v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
++                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
++                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
++                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
++                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
+             } while (p<=limit);
+ 
+-            state->v1 = v1;
+-            state->v2 = v2;
+-            state->v3 = v3;
+-            state->v4 = v4;
+         }
+ 
+         if (p < bEnd) {
+             XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+             state->memsize = (unsigned)(bEnd-p);
+         }
+     }
+ 
+     return XXH_OK;
+ }
+ 
+ 
+-XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state)
++/*! @ingroup xxh32_family */
++XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+ {
+     xxh_u32 h32;
+ 
+     if (state->large_len) {
+-        h32 = XXH_rotl32(state->v1, 1)
+-            + XXH_rotl32(state->v2, 7)
+-            + XXH_rotl32(state->v3, 12)
+-            + XXH_rotl32(state->v4, 18);
++        h32 = XXH_rotl32(state->v[0], 1)
++            + XXH_rotl32(state->v[1], 7)
++            + XXH_rotl32(state->v[2], 12)
++            + XXH_rotl32(state->v[3], 18);
+     } else {
+-        h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
++        h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+     }
+ 
+     h32 += state->total_len_32;
+ 
+     return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+ }
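/* Editorial sketch, not part of the patch: the streaming XXH32 API shown above,
 * used when the input arrives in chunks rather than as one buffer. */
static int demo_xxh32_streaming(const void* part1, size_t len1,
                                const void* part2, size_t len2,
                                XXH32_hash_t* out)
{
    XXH32_state_t* const st = XXH32_createState();
    if (st == NULL) return -1;
    if (XXH32_reset(st, 0) != XXH_OK
     || XXH32_update(st, part1, len1) != XXH_OK
     || XXH32_update(st, part2, len2) != XXH_OK) {
        XXH32_freeState(st);
        return -1;
    }
    *out = XXH32_digest(st);    /* same result as hashing both parts in one call */
    XXH32_freeState(st);
    return 0;
}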
+ 
+ 
+ /*******   Canonical representation   *******/
+ 
+-/*
++/*!
++ * @ingroup xxh32_family
+  * The default return values from XXH functions are unsigned 32 and 64 bit
+  * integers.
+  *
+  * The canonical representation uses big endian convention, the same convention
+  * as human-readable numbers (large digits first).
+  *
+  * This way, hash values can be written into a file or buffer, remaining
+  * comparable across different systems.
+  *
+  * The following functions allow transformation of hash values to and from their
+  * canonical format.
+  */
+ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+ {
+     XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+-    memcpy(dst, &hash, sizeof(*dst));
++    XXH_memcpy(dst, &hash, sizeof(*dst));
+ }
+-
++/*! @ingroup xxh32_family */
+ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+ {
+     return XXH_readBE32(src);
+ }
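/* Editorial sketch, not part of the patch: storing a hash in the canonical
 * (big-endian) byte order described above, so the stored bytes are identical
 * on little- and big-endian systems. */
#include <string.h>

static void demo_store_canonical(XXH32_hash_t h, unsigned char out[4])
{
    XXH32_canonical_t c;
    XXH32_canonicalFromHash(&c, h);
    memcpy(out, &c, sizeof(c));    /* byte order is now platform independent */
}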
+ 
+ 
+ #ifndef XXH_NO_LONG_LONG
+ 
+ /* *******************************************************************
+ *  64-bit hash functions
+ *********************************************************************/
+-
++/*!
++ * @}
++ * @ingroup impl
++ * @{
++ */
+ /*******   Memory access   *******/
+ 
+ typedef XXH64_hash_t xxh_u64;
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define U64 xxh_u64
+ #endif
+ 
+-/*!
+- * XXH_REROLL_XXH64:
+- * Whether to reroll the XXH64_finalize() loop.
+- *
+- * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
+- * performance gain on 64-bit hosts, as only one jump is required.
+- *
+- * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit
+- * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial
+- * to unroll. The code becomes ridiculously large (the largest function in the
+- * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is
+- * also slightly faster because it fits into cache better and is more likely
+- * to be inlined by the compiler.
+- *
+- * If XXH_REROLL is defined, this is ignored and the loop is always rerolled.
+- */
+-#ifndef XXH_REROLL_XXH64
+-#  if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
+-   || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
+-     || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
+-     || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
+-     || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
+-   || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
+-#    define XXH_REROLL_XXH64 1
+-#  else
+-#    define XXH_REROLL_XXH64 0
+-#  endif
+-#endif /* !defined(XXH_REROLL_XXH64) */
+-
+ #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+ /*
+  * Manual byteshift. Best for old compilers which don't inline memcpy.
+  * We actually directly use XXH_readLE64 and XXH_readBE64.
+  */
+ #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+ 
+ /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+-static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
++static xxh_u64 XXH_read64(const void* memPtr)
++{
++    return *(const xxh_u64*) memPtr;
++}
+ 
+ #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+ 
+ /*
+  * __pack instructions are safer, but compiler specific, hence potentially
+  * problematic for some compilers.
+  *
+  * Currently only defined for GCC and ICC.
+@@ -1622,33 +2321,33 @@ static xxh_u64 XXH_read64(const void* pt
+     typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
+     return ((const xxh_unalign64*)ptr)->u64;
+ }
+ 
+ #else
+ 
+ /*
+  * Portable and safe solution. Generally efficient.
+- * see: https://stackoverflow.com/a/32095106/646947
++ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+  */
+ static xxh_u64 XXH_read64(const void* memPtr)
+ {
+     xxh_u64 val;
+-    memcpy(&val, memPtr, sizeof(val));
++    XXH_memcpy(&val, memPtr, sizeof(val));
+     return val;
+ }
+ 
+ #endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+ 
+ #if defined(_MSC_VER)     /* Visual Studio */
+ #  define XXH_swap64 _byteswap_uint64
+ #elif XXH_GCC_VERSION >= 403
+ #  define XXH_swap64 __builtin_bswap64
+ #else
+-static xxh_u64 XXH_swap64 (xxh_u64 x)
++static xxh_u64 XXH_swap64(xxh_u64 x)
+ {
+     return  ((x << 56) & 0xff00000000000000ULL) |
+             ((x << 40) & 0x00ff000000000000ULL) |
+             ((x << 24) & 0x0000ff0000000000ULL) |
+             ((x << 8)  & 0x000000ff00000000ULL) |
+             ((x >> 8)  & 0x00000000ff000000ULL) |
+             ((x >> 24) & 0x0000000000ff0000ULL) |
+             ((x >> 40) & 0x000000000000ff00ULL) |
+@@ -1704,22 +2403,28 @@ XXH_readLE64_align(const void* ptr, XXH_
+     if (align==XXH_unaligned)
+         return XXH_readLE64(ptr);
+     else
+         return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+ }
+ 
+ 
+ /*******   xxh64   *******/
+-
+-static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+-static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+-static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+-static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+-static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
++/*!
++ * @}
++ * @defgroup xxh64_impl XXH64 implementation
++ * @ingroup impl
++ * @{
++ */
++/* #define rather than static const, to be used as initializers */
++#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
++#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
++#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
++#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
++#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define PRIME64_1 XXH_PRIME64_1
+ #  define PRIME64_2 XXH_PRIME64_2
+ #  define PRIME64_3 XXH_PRIME64_3
+ #  define PRIME64_4 XXH_PRIME64_4
+ #  define PRIME64_5 XXH_PRIME64_5
+ #endif
+@@ -1751,174 +2456,69 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h
+ }
+ 
+ 
+ #define XXH_get64bits(p) XXH_readLE64_align(p, align)
+ 
+ static xxh_u64
+ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
+ {
+-#define XXH_PROCESS1_64 do {                                   \
+-    h64 ^= (*ptr++) * XXH_PRIME64_5;                           \
+-    h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;                 \
+-} while (0)
+-
+-#define XXH_PROCESS4_64 do {                                   \
+-    h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;      \
+-    ptr += 4;                                              \
+-    h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;     \
+-} while (0)
+-
+-#define XXH_PROCESS8_64 do {                                   \
+-    xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
+-    ptr += 8;                                              \
+-    h64 ^= k1;                                             \
+-    h64  = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;     \
+-} while (0)
+-
+-    /* Rerolled version for 32-bit targets is faster and much smaller. */
+-    if (XXH_REROLL || XXH_REROLL_XXH64) {
+-        len &= 31;
+-        while (len >= 8) {
+-            XXH_PROCESS8_64;
+-            len -= 8;
+-        }
+-        if (len >= 4) {
+-            XXH_PROCESS4_64;
+-            len -= 4;
+-        }
+-        while (len > 0) {
+-            XXH_PROCESS1_64;
+-            --len;
+-        }
+-         return  XXH64_avalanche(h64);
+-    } else {
+-        switch(len & 31) {
+-           case 24: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 16: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case  8: XXH_PROCESS8_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 28: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 20: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 12: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case  4: XXH_PROCESS4_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 25: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 17: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case  9: XXH_PROCESS8_64;
+-                    XXH_PROCESS1_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 29: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 21: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 13: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case  5: XXH_PROCESS4_64;
+-                    XXH_PROCESS1_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 26: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 18: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 10: XXH_PROCESS8_64;
+-                    XXH_PROCESS1_64;
+-                    XXH_PROCESS1_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 30: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 22: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 14: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case  6: XXH_PROCESS4_64;
+-                    XXH_PROCESS1_64;
+-                    XXH_PROCESS1_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 27: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 19: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 11: XXH_PROCESS8_64;
+-                    XXH_PROCESS1_64;
+-                    XXH_PROCESS1_64;
+-                    XXH_PROCESS1_64;
+-                    return XXH64_avalanche(h64);
+-
+-           case 31: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 23: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case 15: XXH_PROCESS8_64;
+-                         /* fallthrough */
+-           case  7: XXH_PROCESS4_64;
+-                         /* fallthrough */
+-           case  3: XXH_PROCESS1_64;
+-                         /* fallthrough */
+-           case  2: XXH_PROCESS1_64;
+-                         /* fallthrough */
+-           case  1: XXH_PROCESS1_64;
+-                         /* fallthrough */
+-           case  0: return XXH64_avalanche(h64);
+-        }
++    if (ptr==NULL) XXH_ASSERT(len == 0);
++    len &= 31;
++    while (len >= 8) {
++        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
++        ptr += 8;
++        h64 ^= k1;
++        h64  = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
++        len -= 8;
+     }
+-    /* impossible to reach */
+-    XXH_ASSERT(0);
+-    return 0;  /* unreachable, but some compilers complain without it */
++    if (len >= 4) {
++        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
++        ptr += 4;
++        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
++        len -= 4;
++    }
++    while (len > 0) {
++        h64 ^= (*ptr++) * XXH_PRIME64_5;
++        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
++        --len;
++    }
++    return  XXH64_avalanche(h64);
+ }
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define PROCESS1_64 XXH_PROCESS1_64
+ #  define PROCESS4_64 XXH_PROCESS4_64
+ #  define PROCESS8_64 XXH_PROCESS8_64
+ #else
+ #  undef XXH_PROCESS1_64
+ #  undef XXH_PROCESS4_64
+ #  undef XXH_PROCESS8_64
+ #endif
+ 
+ XXH_FORCE_INLINE xxh_u64
+ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+ {
+-    const xxh_u8* bEnd = input + len;
+     xxh_u64 h64;
+-
+-#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+-    if (input==NULL) {
+-        len=0;
+-        bEnd=input=(const xxh_u8*)(size_t)32;
+-    }
+-#endif
++    if (input==NULL) XXH_ASSERT(len == 0);
+ 
+     if (len>=32) {
+-        const xxh_u8* const limit = bEnd - 32;
++        const xxh_u8* const bEnd = input + len;
++        const xxh_u8* const limit = bEnd - 31;
+         xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+         xxh_u64 v2 = seed + XXH_PRIME64_2;
+         xxh_u64 v3 = seed + 0;
+         xxh_u64 v4 = seed - XXH_PRIME64_1;
+ 
+         do {
+             v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+             v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+             v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+             v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+-        } while (input<=limit);
++        } while (input<limit);
+ 
+         h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+         h64 = XXH64_mergeRound(h64, v1);
+         h64 = XXH64_mergeRound(h64, v2);
+         h64 = XXH64_mergeRound(h64, v3);
+         h64 = XXH64_mergeRound(h64, v4);
+ 
+     } else {
+@@ -1926,204 +2526,208 @@ XXH64_endian_align(const xxh_u8* input, 
+     }
+ 
+     h64 += (xxh_u64) len;
+ 
+     return XXH64_finalize(h64, input, len, align);
+ }
+ 
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+ {
+ #if 0
+     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+     XXH64_state_t state;
+     XXH64_reset(&state, seed);
+     XXH64_update(&state, (const xxh_u8*)input, len);
+     return XXH64_digest(&state);
+-
+ #else
+-
+     if (XXH_FORCE_ALIGN_CHECK) {
+         if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+             return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+     }   }
+ 
+     return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+ 
+ #endif
+ }
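/* Editorial sketch, not part of the patch: one-shot use of XXH64() with a
 * non-zero seed; each seed value effectively selects a different hash. */
#include <stddef.h>   /* size_t */

static XXH64_hash_t demo_xxh64_seeded(const void* data, size_t len)
{
    XXH64_hash_t const seed = 0x1234567887654321ULL;   /* arbitrary demo seed */
    return XXH64(data, len, seed);
}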
+ 
+ /*******   Hash Streaming   *******/
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+ {
+     return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+ }
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+ {
+     XXH_free(statePtr);
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+ {
+-    memcpy(dstState, srcState, sizeof(*dstState));
++    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+ }
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+ {
+     XXH64_state_t state;   /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
+     memset(&state, 0, sizeof(state));
+-    state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+-    state.v2 = seed + XXH_PRIME64_2;
+-    state.v3 = seed + 0;
+-    state.v4 = seed - XXH_PRIME64_1;
++    state.v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
++    state.v[1] = seed + XXH_PRIME64_2;
++    state.v[2] = seed + 0;
++    state.v[3] = seed - XXH_PRIME64_1;
+      /* do not write into reserved64, might be removed in a future version */
+-    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
++    XXH_memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+ {
+-    if (input==NULL)
+-#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
++    if (input==NULL) {
++        XXH_ASSERT(len == 0);
+         return XXH_OK;
+-#else
+-        return XXH_ERROR;
+-#endif
++    }
+ 
+     {   const xxh_u8* p = (const xxh_u8*)input;
+         const xxh_u8* const bEnd = p + len;
+ 
+         state->total_len += len;
+ 
+         if (state->memsize + len < 32) {  /* fill in tmp buffer */
+             XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+             state->memsize += (xxh_u32)len;
+             return XXH_OK;
+         }
+ 
+         if (state->memsize) {   /* tmp buffer is full */
+             XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+-            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
+-            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
+-            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
+-            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
+-            p += 32-state->memsize;
++            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
++            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
++            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
++            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
++            p += 32 - state->memsize;
+             state->memsize = 0;
+         }
+ 
+         if (p+32 <= bEnd) {
+             const xxh_u8* const limit = bEnd - 32;
+-            xxh_u64 v1 = state->v1;
+-            xxh_u64 v2 = state->v2;
+-            xxh_u64 v3 = state->v3;
+-            xxh_u64 v4 = state->v4;
+ 
+             do {
+-                v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
+-                v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
+-                v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
+-                v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
++                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
++                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
++                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
++                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+             } while (p<=limit);
+ 
+-            state->v1 = v1;
+-            state->v2 = v2;
+-            state->v3 = v3;
+-            state->v4 = v4;
+         }
+ 
+         if (p < bEnd) {
+             XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+             state->memsize = (unsigned)(bEnd-p);
+         }
+     }
+ 
+     return XXH_OK;
+ }
+ 
+ 
+-XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)
++/*! @ingroup xxh64_family */
++XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+ {
+     xxh_u64 h64;
+ 
+     if (state->total_len >= 32) {
+-        xxh_u64 const v1 = state->v1;
+-        xxh_u64 const v2 = state->v2;
+-        xxh_u64 const v3 = state->v3;
+-        xxh_u64 const v4 = state->v4;
+-
+-        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+-        h64 = XXH64_mergeRound(h64, v1);
+-        h64 = XXH64_mergeRound(h64, v2);
+-        h64 = XXH64_mergeRound(h64, v3);
+-        h64 = XXH64_mergeRound(h64, v4);
++        h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
++        h64 = XXH64_mergeRound(h64, state->v[0]);
++        h64 = XXH64_mergeRound(h64, state->v[1]);
++        h64 = XXH64_mergeRound(h64, state->v[2]);
++        h64 = XXH64_mergeRound(h64, state->v[3]);
+     } else {
+-        h64  = state->v3 /*seed*/ + XXH_PRIME64_5;
++        h64  = state->v[2] /*seed*/ + XXH_PRIME64_5;
+     }
+ 
+     h64 += (xxh_u64) state->total_len;
+ 
+     return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+ }
+ 
+ 
+ /******* Canonical representation   *******/
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+ {
+     XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+-    memcpy(dst, &hash, sizeof(*dst));
++    XXH_memcpy(dst, &hash, sizeof(*dst));
+ }
+ 
++/*! @ingroup xxh64_family */
+ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+ {
+     return XXH_readBE64(src);
+ }
+ 
+-
++#ifndef XXH_NO_XXH3
+ 
+ /* *********************************************************************
+ *  XXH3
+ *  New generation hash designed for speed on small keys and vectorization
+ ************************************************************************ */
++/*!
++ * @}
++ * @defgroup xxh3_impl XXH3 implementation
++ * @ingroup impl
++ * @{
++ */
+ 
+ /* ===   Compiler specifics   === */
+ 
+-#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
++#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
++#  define XXH_RESTRICT /* disable */
++#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+ #  define XXH_RESTRICT   restrict
+ #else
+ /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+ #  define XXH_RESTRICT   /* disable */
+ #endif
+ 
+ #if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+   || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+   || defined(__clang__)
+ #    define XXH_likely(x) __builtin_expect(x, 1)
+ #    define XXH_unlikely(x) __builtin_expect(x, 0)
+ #else
+ #    define XXH_likely(x) (x)
+ #    define XXH_unlikely(x) (x)
+ #endif
+ 
+-#if defined(__GNUC__)
+-#  if defined(__AVX2__)
++#if defined(__GNUC__) || defined(__clang__)
++#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
++   || defined(__aarch64__)  || defined(_M_ARM) \
++   || defined(_M_ARM64)     || defined(_M_ARM64EC)
++#    define inline __inline__  /* circumvent a clang bug */
++#    include <arm_neon.h>
++#    undef inline
++#  elif defined(__AVX2__)
+ #    include <immintrin.h>
+ #  elif defined(__SSE2__)
+ #    include <emmintrin.h>
+-#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+-#    define inline __inline__  /* circumvent a clang bug */
+-#    include <arm_neon.h>
+-#    undef inline
+ #  endif
+-#elif defined(_MSC_VER)
++#endif
++
++#if defined(_MSC_VER)
+ #  include <intrin.h>
+ #endif
+ 
+ /*
+  * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+  * remaining a true 64-bit/128-bit hash function.
+  *
+  * This is done by prioritizing a subset of 64-bit operations that can be
+@@ -2193,35 +2797,88 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFr
+  */
+ #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+ #   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+ #endif
+ 
+ /* ==========================================
+  * Vectorization detection
+  * ========================================== */
+-#define XXH_SCALAR 0  /* Portable scalar version */
+-#define XXH_SSE2   1  /* SSE2 for Pentium 4 and all x86_64 */
+-#define XXH_AVX2   2  /* AVX2 for Haswell and Bulldozer */
+-#define XXH_AVX512 3  /* AVX512 for Skylake and Icelake */
+-#define XXH_NEON   4  /* NEON for most ARMv7-A and all AArch64 */
+-#define XXH_VSX    5  /* VSX and ZVector for POWER8/z13 */
++
++#ifdef XXH_DOXYGEN
++/*!
++ * @ingroup tuning
++ * @brief Overrides the vectorization implementation chosen for XXH3.
++ *
++ * Can be defined to 0 to disable SIMD or any of the values mentioned in
++ * @ref XXH_VECTOR_TYPE.
++ *
++ * If this is not defined, it uses predefined macros to determine the best
++ * implementation.
++ */
++#  define XXH_VECTOR XXH_SCALAR
++/*!
++ * @ingroup tuning
++ * @brief Possible values for @ref XXH_VECTOR.
++ *
++ * Note that these are actually implemented as macros.
++ *
++ * If this is not defined, it is detected automatically.
++ * @ref XXH_X86DISPATCH overrides this.
++ */
++enum XXH_VECTOR_TYPE /* fake enum */ {
++    XXH_SCALAR = 0,  /*!< Portable scalar version */
++    XXH_SSE2   = 1,  /*!<
++                      * SSE2 for Pentium 4, Opteron, all x86_64.
++                      *
++                      * @note SSE2 is also guaranteed on Windows 10, macOS, and
++                      * Android x86.
++                      */
++    XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
++    XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
++    XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
++    XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
++};
++/*!
++ * @ingroup tuning
++ * @brief Selects the minimum alignment for XXH3's accumulators.
++ *
++ * When using SIMD, this should match the alignment required for said vector

++ * type, so, for example, 32 for AVX2.
++ *
++ * Default: Auto detected.
++ */
++#  define XXH_ACC_ALIGN 8
++#endif
++
++/* Actual definition */
++#ifndef XXH_DOXYGEN
++#  define XXH_SCALAR 0
++#  define XXH_SSE2   1
++#  define XXH_AVX2   2
++#  define XXH_AVX512 3
++#  define XXH_NEON   4
++#  define XXH_VSX    5
++#endif
+ 
+ #ifndef XXH_VECTOR    /* can be defined on command line */
+-#  if defined(__AVX512F__)
++#  if ( \
++        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
++     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
++   ) && ( \
++        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
++    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
++   )
++#    define XXH_VECTOR XXH_NEON
++#  elif defined(__AVX512F__)
+ #    define XXH_VECTOR XXH_AVX512
+ #  elif defined(__AVX2__)
+ #    define XXH_VECTOR XXH_AVX2
+ #  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+ #    define XXH_VECTOR XXH_SSE2
+-#  elif defined(__GNUC__) /* msvc support maybe later */ \
+-  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+-  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+-    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+-#    define XXH_VECTOR XXH_NEON
+ #  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+      || (defined(__s390x__) && defined(__VEC__)) \
+      && defined(__GNUC__) /* TODO: IBM XL */
+ #    define XXH_VECTOR XXH_VSX
+ #  else
+ #    define XXH_VECTOR XXH_SCALAR
+ #  endif
+ #endif
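
As the detection block shows, XXH_VECTOR is only a default and can be overridden; a sketch of forcing the scalar path, assuming the single-header XXH_INLINE_ALL build style:

    /* Force the portable scalar code paths, e.g. for debugging or baseline
     * benchmarking. 0 corresponds to XXH_SCALAR; it may also be passed on
     * the compiler command line as -DXXH_VECTOR=0. */
    #define XXH_VECTOR 0
    #define XXH_INLINE_ALL
    #include "xxhash.h"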
+@@ -2351,28 +3008,28 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFr
+  * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
+  *
+  *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+  *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+  *
+  * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+  */
+ 
+-/*
++/*!
+  * Function-like macro:
+  * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+  * {
+  *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+  *     outHi = (uint32x2_t)(in >> 32);
+  *     in = UNDEFINED;
+  * }
+  */
+ # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+-   && defined(__GNUC__) \
+-   && !defined(__aarch64__) && !defined(__arm64__)
++   && (defined(__GNUC__) || defined(__clang__)) \
++   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
+ #  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                              \
+     do {                                                                                    \
+       /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+       /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */     \
+       /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+       __asm__("vzip.32  %e0, %f0" : "+w" (in));                                             \
+       (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in));                                   \
+       (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                   \
+@@ -2424,36 +3081,38 @@ typedef __vector unsigned xxh_u32x4;
+ #    warning "-maltivec=be is not recommended. Please use native endianness."
+ #    define XXH_VSX_BE 1
+ #  else
+ #    define XXH_VSX_BE 0
+ #  endif
+ # endif /* !defined(XXH_VSX_BE) */
+ 
+ # if XXH_VSX_BE
+-/* A wrapper for POWER9's vec_revb. */
+ #  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+ #    define XXH_vec_revb vec_revb
+ #  else
++/*!
++ * A polyfill for POWER9's vec_revb().
++ */
+ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+ {
+     xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                   0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+     return vec_perm(val, val, vByteSwap);
+ }
+ #  endif
+ # endif /* XXH_VSX_BE */
+ 
+-/*
+- * Performs an unaligned load and byte swaps it on big endian.
++/*!
++ * Performs an unaligned vector load and byte swaps it on big endian.
+  */
+ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+ {
+     xxh_u64x2 ret;
+-    memcpy(&ret, ptr, sizeof(xxh_u64x2));
++    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+ # if XXH_VSX_BE
+     ret = XXH_vec_revb(ret);
+ # endif
+     return ret;
+ }
+ 
+ /*
+  * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+@@ -2488,17 +3147,17 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(
+ #endif /* XXH_VECTOR == XXH_VSX */
+ 
+ 
+ /* prefetch
+  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+ #if defined(XXH_NO_PREFETCH)
+ #  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+ #else
+-#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
++#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+ #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+ #    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+ #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+ #    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+ #  else
+ #    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+ #  endif
+ #endif  /* XXH_NO_PREFETCH */
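
Similarly, the prefetch hints can be compiled out entirely; a sketch, again assuming the XXH_INLINE_ALL usage pattern:

    /* Turn XXH_PREFETCH() into a no-op (equivalent to -DXXH_NO_PREFETCH). */
    #define XXH_NO_PREFETCH
    #define XXH_INLINE_ALL
    #include "xxhash.h"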
+@@ -2509,17 +3168,17 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(
+  * ========================================== */
+ 
+ #define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+ 
+ #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+ #  error "default keyset is not large enough"
+ #endif
+ 
+-/* Pseudorandom secret taken directly from FARSH */
++/*! Pseudorandom secret taken directly from FARSH. */
+ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+     0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+     0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+     0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+     0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+     0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+     0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+     0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+@@ -2530,50 +3189,59 @@ XXH_ALIGN(64) static const xxh_u8 XXH3_k
+     0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+ };
+ 
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define kSecret XXH3_kSecret
+ #endif
+ 
+-/*
+- * Calculates a 32-bit to 64-bit long multiply.
+- *
+- * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
++#ifdef XXH_DOXYGEN
++/*!
++ * @brief Calculates a 32-bit to 64-bit long multiply.
++ *
++ * Implemented as a macro.
++ *
++ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+  * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+- * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
++ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+  * use that instead of the normal method.
+  *
+  * If you are compiling for platforms like Thumb-1 and don't have a better option,
+  * you may also want to write your own long multiply routine here.
+  *
+- * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+- * {
+- *    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+- * }
++ * @param x, y Numbers to be multiplied
++ * @return 64-bit product of the low 32 bits of @p x and @p y.
+  */
+-#if defined(_MSC_VER) && defined(_M_IX86)
+-#    include <intrin.h>
++XXH_FORCE_INLINE xxh_u64
++XXH_mult32to64(xxh_u64 x, xxh_u64 y)
++{
++   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
++}
++#elif defined(_MSC_VER) && defined(_M_IX86)
+ #    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+ #else
+ /*
+  * Downcast + upcast is usually better than masking on older compilers like
+  * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+  *
+  * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+  * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+  */
+ #    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+ #endif
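
For clarity, every variant of the macro multiplies only the low 32 bits of each operand into a full 64-bit product; a scalar reference sketch:

    #include <stdint.h>

    /* Reference semantics of XXH_mult32to64: the low 32 bits of x and y are
     * multiplied, and the full 64-bit product is kept. */
    static uint64_t mult32to64_ref(uint64_t x, uint64_t y)
    {
        return (uint64_t)(uint32_t)x * (uint64_t)(uint32_t)y;
    }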
+ 
+-/*
+- * Calculates a 64->128-bit long multiply.
+- *
+- * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
++/*!
++ * @brief Calculates a 64->128-bit long multiply.
++ *
++ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
++ * version.
++ *
++ * @param lhs , rhs The 64-bit integers to be multiplied
++ * @return The 128-bit result represented in an @ref XXH128_hash_t.
+  */
+ static XXH128_hash_t
+ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+ {
+     /*
+      * GCC/Clang __uint128_t method.
+      *
+      * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+@@ -2583,45 +3251,60 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs
+      * Usually.
+      *
+      * Despite being a 32-bit platform, Clang (and emscripten) define this type
+      * despite not having the arithmetic for it. This results in a laggy
+      * compiler builtin call which calculates a full 128-bit multiply.
+      * In that case it is best to use the portable one.
+      * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+      */
+-#if defined(__GNUC__) && !defined(__wasm__) \
++#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+     && defined(__SIZEOF_INT128__) \
+     || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+ 
+     __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+     XXH128_hash_t r128;
+     r128.low64  = (xxh_u64)(product);
+     r128.high64 = (xxh_u64)(product >> 64);
+     return r128;
+ 
+     /*
+      * MSVC for x64's _umul128 method.
+      *
+      * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+      *
+      * This compiles to single operand MUL on x64.
+      */
+-#elif defined(_M_X64) || defined(_M_IA64)
++#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+ 
+ #ifndef _MSC_VER
+ #   pragma intrinsic(_umul128)
+ #endif
+     xxh_u64 product_high;
+     xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+     XXH128_hash_t r128;
+     r128.low64  = product_low;
+     r128.high64 = product_high;
+     return r128;
+ 
++    /*
++     * MSVC for ARM64's __umulh method.
++     *
++     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
++     */
++#elif defined(_M_ARM64) || defined(_M_ARM64EC)
++
++#ifndef _MSC_VER
++#   pragma intrinsic(__umulh)
++#endif
++    XXH128_hash_t r128;
++    r128.low64  = lhs * rhs;
++    r128.high64 = __umulh(lhs, rhs);
++    return r128;
++
+ #else
+     /*
+      * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+      *
+      * This is a fast and simple grade school multiply, which is shown below
+      * with base 10 arithmetic instead of base 0x100000000.
+      *
+      *           9 3 // D2 lhs = 93
+@@ -2674,30 +3357,34 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs
+ 
+     XXH128_hash_t r128;
+     r128.low64  = lower;
+     r128.high64 = upper;
+     return r128;
+ #endif
+ }
+ 
+-/*
+- * Does a 64-bit to 128-bit multiply, then XOR folds it.
++/*!
++ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
+  *
+  * The reason for the separate function is to prevent passing too many structs
+  * around by value. This will hopefully inline the multiply, but we don't force it.
++ *
++ * @param lhs , rhs The 64-bit integers to multiply
++ * @return The low 64 bits of the product XOR'd by the high 64 bits.
++ * @see XXH_mult64to128()
+  */
+ static xxh_u64
+ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+ {
+     XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+     return product.low64 ^ product.high64;
+ }
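
A portable reference for the multiply-and-fold above, using the same grade-school decomposition the scalar fallback describes (sketch only; the intrinsic paths are what actually ship):

    #include <stdint.h>

    /* Portable 64x64->128 multiply, then XOR-fold the high and low halves,
     * mirroring XXH_mult64to128() + XXH3_mul128_fold64() without compiler
     * extensions. Cross products are split to avoid losing carries. */
    static uint64_t mul128_fold64_ref(uint64_t lhs, uint64_t rhs)
    {
        uint64_t const lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
        uint64_t const hi_lo = (lhs >> 32)        * (rhs & 0xFFFFFFFF);
        uint64_t const lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
        uint64_t const hi_hi = (lhs >> 32)        * (rhs >> 32);
        uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
        uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
        uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
        return lower ^ upper;
    }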
+ 
+-/* Seems to produce slightly better code on GCC for some reason. */
++/*! Seems to produce slightly better code on GCC for some reason. */
+ XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+ {
+     XXH_ASSERT(0 <= shift && shift < 64);
+     return v64 ^ (v64 >> shift);
+ }
+ 
+ /*
+  * This is a fast avalanche stage,
+@@ -2782,33 +3469,33 @@ XXH3_len_1to3_64b(const xxh_u8* input, s
+     }
+ }
+ 
+ XXH_FORCE_INLINE XXH64_hash_t
+ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(secret != NULL);
+-    XXH_ASSERT(4 <= len && len < 8);
++    XXH_ASSERT(4 <= len && len <= 8);
+     seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+     {   xxh_u32 const input1 = XXH_readLE32(input);
+         xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+         xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+         xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+         xxh_u64 const keyed = input64 ^ bitflip;
+         return XXH3_rrmxmx(keyed, len);
+     }
+ }
+ 
+ XXH_FORCE_INLINE XXH64_hash_t
+ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(secret != NULL);
+-    XXH_ASSERT(8 <= len && len <= 16);
++    XXH_ASSERT(9 <= len && len <= 16);
+     {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+         xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+         xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
+         xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+         xxh_u64 const acc = len
+                           + XXH_swap64(input_lo) + input_hi
+                           + XXH3_mul128_fold64(input_lo, input_hi);
+         return XXH3_avalanche(acc);
+@@ -2868,17 +3555,17 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(con
+      *
+      * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+      * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+      * GCC 9.2, despite both emitting scalar code.
+      *
+      * GCC generates much better scalar code than Clang for the rest of XXH3,
+      * which is why finding a more optimal codepath is an interest.
+      */
+-    __asm__ ("" : "+r" (seed64));
++    XXH_COMPILER_GUARD(seed64);
+ #endif
+     {   xxh_u64 const input_lo = XXH_readLE64(input);
+         xxh_u64 const input_hi = XXH_readLE64(input+8);
+         return XXH3_mul128_fold64(
+             input_lo ^ (XXH_readLE64(secret)   + seed64),
+             input_hi ^ (XXH_readLE64(secret+8) - seed64)
+         );
+     }
+@@ -2978,17 +3665,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_
+ #ifdef XXH_OLD_NAMES
+ #  define STRIPE_LEN XXH_STRIPE_LEN
+ #  define ACC_NB XXH_ACC_NB
+ #endif
+ 
+ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+ {
+     if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+-    memcpy(dst, &v64, sizeof(v64));
++    XXH_memcpy(dst, &v64, sizeof(v64));
+ }
+ 
+ /* Several intrinsic functions below are supposed to accept __int64 as argument,
+  * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+  * However, several environments do not define __int64 type,
+  * requiring a workaround.
+  */
+ #if !defined (__VMS) \
+@@ -3018,28 +3705,29 @@ XXH_FORCE_INLINE void XXH_writeLE64(void
+  * essentially independent.
+  *
+  * This doesn't matter on 64-bit hashes since they all get merged together in
+  * the end, so we skip the extra step.
+  *
+  * Both XXH3_64bits and XXH3_128bits use this subroutine.
+  */
+ 
+-#if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
++#if (XXH_VECTOR == XXH_AVX512) \
++     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+ 
+ #ifndef XXH_TARGET_AVX512
+ # define XXH_TARGET_AVX512  /* disable attribute target */
+ #endif
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
+                      const void* XXH_RESTRICT input,
+                      const void* XXH_RESTRICT secret)
+ {
+-    XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
++    __m512i* const xacc = (__m512i *) acc;
+     XXH_ASSERT((((size_t)acc) & 63) == 0);
+     XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+ 
+     {
+         /* data_vec    = input[0]; */
+         __m512i const data_vec    = _mm512_loadu_si512   (input);
+         /* key_vec     = secret[0]; */
+         __m512i const key_vec     = _mm512_loadu_si512   (secret);
+@@ -3078,17 +3766,17 @@ XXH3_accumulate_512_avx512(void* XXH_RES
+  * Both XXH3_64bits and XXH3_128bits use this subroutine.
+  */
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 63) == 0);
+     XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+-    {   XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
++    {   __m512i* const xacc = (__m512i*) acc;
+         const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+ 
+         /* xacc[0] ^= (xacc[0] >> 47) */
+         __m512i const acc_vec     = *xacc;
+         __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+         __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
+         /* xacc[0] ^= secret; */
+         __m512i const key_vec     = _mm512_loadu_si512   (secret);
+@@ -3105,48 +3793,51 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRI
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+ {
+     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+     XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+     XXH_ASSERT(((size_t)customSecret & 63) == 0);
+     (void)(&XXH_writeLE64);
+     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+-        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
+-
+-        XXH_ALIGN(64) const __m512i* const src  = (const __m512i*) XXH3_kSecret;
+-        XXH_ALIGN(64)       __m512i* const dest = (      __m512i*) customSecret;
++        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
++
++        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
++              __m512i* const dest = (      __m512i*) customSecret;
+         int i;
++        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
++        XXH_ASSERT(((size_t)dest & 63) == 0);
+         for (i=0; i < nbRounds; ++i) {
+             /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
+-             * this will warn "discards ‘const’ qualifier". */
++             * this will warn "discards 'const' qualifier". */
+             union {
+-                XXH_ALIGN(64) const __m512i* cp;
+-                XXH_ALIGN(64) void* p;
++                const __m512i* cp;
++                void* p;
+             } remote_const_void;
+             remote_const_void.cp = src + i;
+             dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+     }   }
+ }
+ 
+ #endif
+ 
+-#if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
++#if (XXH_VECTOR == XXH_AVX2) \
++    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+ 
+ #ifndef XXH_TARGET_AVX2
+ # define XXH_TARGET_AVX2  /* disable attribute target */
+ #endif
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 31) == 0);
+-    {   XXH_ALIGN(32) __m256i* const xacc    =       (__m256i *) acc;
++    {   __m256i* const xacc    =       (__m256i *) acc;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
+         const         __m256i* const xinput  = (const __m256i *) input;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+         const         __m256i* const xsecret = (const __m256i *) secret;
+ 
+         size_t i;
+@@ -3168,17 +3859,17 @@ XXH3_accumulate_512_avx2( void* XXH_REST
+             xacc[i] = _mm256_add_epi64(product, sum);
+     }   }
+ }
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 31) == 0);
+-    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
++    {   __m256i* const xacc = (__m256i*) acc;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+         const         __m256i* const xsecret = (const __m256i *) secret;
+         const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+ 
+         size_t i;
+         for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+             /* xacc[i] ^= (xacc[i] >> 47) */
+@@ -3200,60 +3891,59 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+ {
+     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+     XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+     (void)(&XXH_writeLE64);
+     XXH_PREFETCH(customSecret);
+-    {   __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64);
+-
+-        XXH_ALIGN(64) const __m256i* const src  = (const __m256i*) XXH3_kSecret;
+-        XXH_ALIGN(64)       __m256i*       dest = (      __m256i*) customSecret;
++    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
++
++        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
++              __m256i*       dest = (      __m256i*) customSecret;
+ 
+ #       if defined(__GNUC__) || defined(__clang__)
+         /*
+          * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+          *   - do not extract the secret from sse registers in the internal loop
+          *   - use less common registers, and avoid pushing these reg into stack
+-         * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
+-         * customSecret, and on aarch64, this prevented LDP from merging two
+-         * loads together for free. Putting the loads together before the stores
+-         * properly generates LDP.
+          */
+-        __asm__("" : "+r" (dest));
++        XXH_COMPILER_GUARD(dest);
+ #       endif
++        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
++        XXH_ASSERT(((size_t)dest & 31) == 0);
+ 
+         /* GCC -O2 need unroll loop manually */
+         dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
+         dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
+         dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
+         dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
+         dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
+         dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
+     }
+ }
+ 
+ #endif
+ 
++/* x86dispatch always generates SSE2 */
+ #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+ 
+ #ifndef XXH_TARGET_SSE2
+ # define XXH_TARGET_SSE2  /* disable attribute target */
+ #endif
+ 
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+ {
+     /* SSE2 is just a half-scale version of the AVX2 version. */
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+-    {   XXH_ALIGN(16) __m128i* const xacc    =       (__m128i *) acc;
++    {   __m128i* const xacc    =       (__m128i *) acc;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+         const         __m128i* const xinput  = (const __m128i *) input;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+         const         __m128i* const xsecret = (const __m128i *) secret;
+ 
+         size_t i;
+@@ -3275,17 +3965,17 @@ XXH3_accumulate_512_sse2( void* XXH_REST
+             xacc[i] = _mm_add_epi64(product, sum);
+     }   }
+ }
+ 
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+ XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+-    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
++    {   __m128i* const xacc = (__m128i*) acc;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+         const         __m128i* const xsecret = (const __m128i *) secret;
+         const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+ 
+         size_t i;
+         for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+             /* xacc[i] ^= (xacc[i] >> 47) */
+@@ -3307,52 +3997,54 @@ XXH3_scrambleAcc_sse2(void* XXH_RESTRICT
+ 
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+ {
+     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+     (void)(&XXH_writeLE64);
+     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+ 
+ #       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+-        // MSVC 32bit mode does not support _mm_set_epi64x before 2015
+-        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 };
++        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
++        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+         __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+ #       else
+-        __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
++        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+ #       endif
+         int i;
+ 
+-        XXH_ALIGN(64)        const float* const src  = (float const*) XXH3_kSecret;
+-        XXH_ALIGN(XXH_SEC_ALIGN) __m128i*       dest = (__m128i*) customSecret;
++        const void* const src16 = XXH3_kSecret;
++        __m128i* dst16 = (__m128i*) customSecret;
+ #       if defined(__GNUC__) || defined(__clang__)
+         /*
+          * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+          *   - do not extract the secret from sse registers in the internal loop
+          *   - use less common registers, and avoid pushing these reg into stack
+          */
+-        __asm__("" : "+r" (dest));
++        XXH_COMPILER_GUARD(dst16);
+ #       endif
++        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
++        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+ 
+         for (i=0; i < nbRounds; ++i) {
+-            dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);
++            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+     }   }
+ }
+ 
+ #endif
+ 
+ #if (XXH_VECTOR == XXH_NEON)
+ 
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+     {
+-        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
++        uint64x2_t* const xacc = (uint64x2_t *) acc;
+         /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+         uint8_t const* const xinput = (const uint8_t *) input;
+         uint8_t const* const xsecret  = (const uint8_t *) secret;
+ 
+         size_t i;
+         for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
+             /* data_vec = xinput[i]; */
+             uint8x16_t data_vec    = vld1q_u8(xinput  + (i * 16));
+@@ -3389,18 +4081,18 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT
+         size_t i;
+         for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
+             /* xacc[i] ^= (xacc[i] >> 47); */
+             uint64x2_t acc_vec  = xacc[i];
+             uint64x2_t shifted  = vshrq_n_u64 (acc_vec, 47);
+             uint64x2_t data_vec = veorq_u64   (acc_vec, shifted);
+ 
+             /* xacc[i] ^= xsecret[i]; */
+-            uint8x16_t key_vec  = vld1q_u8(xsecret + (i * 16));
+-            uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
++            uint8x16_t key_vec  = vld1q_u8    (xsecret + (i * 16));
++            uint64x2_t data_key = veorq_u64   (data_vec, vreinterpretq_u64_u8(key_vec));
+ 
+             /* xacc[i] *= XXH_PRIME32_1 */
+             uint32x2_t data_key_lo, data_key_hi;
+             /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
+              * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
+              * xacc[i] = UNDEFINED; */
+             XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+             {   /*
+@@ -3434,39 +4126,44 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT
+ 
+ #if (XXH_VECTOR == XXH_VSX)
+ 
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+ {
+-          xxh_u64x2* const xacc     =       (xxh_u64x2*) acc;    /* presumed aligned */
++    /* presumed aligned */
++    unsigned int* const xacc = (unsigned int*) acc;
+     xxh_u64x2 const* const xinput   = (xxh_u64x2 const*) input;   /* no alignment restriction */
+     xxh_u64x2 const* const xsecret  = (xxh_u64x2 const*) secret;    /* no alignment restriction */
+     xxh_u64x2 const v32 = { 32, 32 };
+     size_t i;
+     for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+         /* data_vec = xinput[i]; */
+         xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+         /* key_vec = xsecret[i]; */
+         xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
+         xxh_u64x2 const data_key = data_vec ^ key_vec;
+         /* shuffled = (data_key << 32) | (data_key >> 32); */
+         xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+         /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+         xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+-        xacc[i] += product;
++        /* acc_vec = xacc[i]; */
++        xxh_u64x2 acc_vec        = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
++        acc_vec += product;
+ 
+         /* swap high and low halves */
+ #ifdef __s390x__
+-        xacc[i] += vec_permi(data_vec, data_vec, 2);
++        acc_vec += vec_permi(data_vec, data_vec, 2);
+ #else
+-        xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
++        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+ #endif
++        /* xacc[i] = acc_vec; */
++        vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
+     }
+ }
+ 
+ XXH_FORCE_INLINE void
+ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+ 
+@@ -3499,33 +4196,33 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT 
+ 
+ /* scalar variants - universal */
+ 
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                      const void* XXH_RESTRICT input,
+                      const void* XXH_RESTRICT secret)
+ {
+-    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
++    xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
+     const xxh_u8* const xinput  = (const xxh_u8*) input;  /* no alignment restriction */
+     const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+     size_t i;
+     XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+     for (i=0; i < XXH_ACC_NB; i++) {
+         xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
+         xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
+         xacc[i ^ 1] += data_val; /* swap adjacent lanes */
+         xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+     }
+ }
+ 
+ XXH_FORCE_INLINE void
+ XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+-    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
++    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+     const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+     size_t i;
+     XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+     for (i=0; i < XXH_ACC_NB; i++) {
+         xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
+         xxh_u64 acc64 = xacc[i];
+         acc64 = XXH_xorshift64(acc64, 47);
+         acc64 ^= key64;
+@@ -3569,17 +4266,17 @@ XXH3_initCustomSecret_scalar(void* XXH_R
+      *      LDR
+      *  ADD LDR
+      *  SUB     STR
+      *          STR
+      * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+      *   without hack: 2654.4 MB/s
+      *   with hack:    3202.9 MB/s
+      */
+-    __asm__("" : "+r" (kSecretPtr));
++    XXH_COMPILER_GUARD(kSecretPtr);
+ #endif
+     /*
+      * Note: in debug mode, this overrides the asm optimization
+      * and Clang will emit MOVK chains again.
+      */
+     XXH_ASSERT(kSecretPtr == XXH3_kSecret);
+ 
+     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+@@ -3734,17 +4431,17 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRI
+         /*
+          * UGLY HACK:
+          * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+          * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+          * XXH3_64bits, len == 256, Snapdragon 835:
+          *   without hack: 2063.7 MB/s
+          *   with hack:    2560.7 MB/s
+          */
+-        __asm__("" : "+r" (result64));
++        XXH_COMPILER_GUARD(result64);
+ #endif
+     }
+ 
+     return XXH3_avalanche(result64);
+ }
+ 
+ #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                         XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+@@ -3763,32 +4460,33 @@ XXH3_hashLong_64b_internal(const void* X
+     XXH_STATIC_ASSERT(sizeof(acc) == 64);
+     /* do not align on 8, so that the secret is different from the accumulator */
+ #define XXH_SECRET_MERGEACCS_START 11
+     XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+     return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+ }
+ 
+ /*
+- * It's important for performance that XXH3_hashLong is not inlined.
++ * It's important for performance to pass the secret's size (when it's static)
++ * so that the compiler can properly optimize the vectorized loop.
++ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
+  */
+-XXH_NO_INLINE XXH64_hash_t
++XXH_FORCE_INLINE XXH64_hash_t
+ XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64;
+     return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
+ }
+ 
+ /*
+- * It's important for performance that XXH3_hashLong is not inlined.
+- * Since the function is not inlined, the compiler may not be able to understand that,
+- * in some scenarios, its `secret` argument is actually a compile time constant.
+- * This variant enforces that the compiler can detect that,
+- * and uses this opportunity to streamline the generated code for better performance.
++ * It's preferable for performance that XXH3_hashLong is not inlined,
++ * as it results in a smaller function for small data, which is easier on the instruction cache.
++ * Note that inside this no_inline function, we do inline the internal loop,
++ * and provide a statically defined secret size to allow optimization of the vector loop.
+  */
+ XXH_NO_INLINE XXH64_hash_t
+ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64; (void)secret; (void)secretLen;
+     return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
+ }
+@@ -3858,33 +4556,44 @@ XXH3_64bits_internal(const void* XXH_RES
+     if (len <= XXH3_MIDSIZE_MAX)
+         return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+     return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+ }
+ 
+ 
+ /* ===   Public entry point   === */
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
+ {
+     return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH64_hash_t
+ XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+ {
+     return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH64_hash_t
+ XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+ {
+     return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+ }
+ 
++XXH_PUBLIC_API XXH64_hash_t
++XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
++{
++    if (len <= XXH3_MIDSIZE_MAX)
++        return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
++    return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
++}
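
Typical one-shot calls into these entry points look as follows (sketch; the secret contents here are placeholders, and a real secret should be at least XXH3_SECRET_SIZE_MIN bytes of high-entropy data, e.g. produced by XXH3_generateSecret()):

    #include <string.h>
    #include "xxhash.h"

    static void xxh3_one_shot_examples(const void* data, size_t len)
    {
        /* Default secret, seed 0: the common fast path. */
        XXH64_hash_t const h1 = XXH3_64bits(data, len);

        /* Default secret blended with a runtime seed. */
        XXH64_hash_t const h2 = XXH3_64bits_withSeed(data, len, 0x9E3779B185EBCA87ULL);

        /* Caller-provided secret (>= XXH3_SECRET_SIZE_MIN bytes). */
        unsigned char secret[192];
        memset(secret, 0x5A, sizeof(secret));  /* placeholder, not a good secret */
        XXH64_hash_t const h3 = XXH3_64bits_withSecret(data, len, secret, sizeof(secret));

        (void)h1; (void)h2; (void)h3;
    }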
++
+ 
+ /* ===   XXH3 streaming   === */
+ 
+ /*
+  * Malloc's a pointer that is always aligned to align.
+  *
+  * This must be freed with `XXH_alignedFree()`.
+  *
+@@ -3943,87 +4652,107 @@ static void XXH_alignedFree(void* p)
+         xxh_u8* ptr = (xxh_u8*)p;
+         /* Get the offset byte we added in XXH_malloc. */
+         xxh_u8 offset = ptr[-1];
+         /* Free the original malloc'd pointer */
+         xxh_u8* base = ptr - offset;
+         XXH_free(base);
+     }
+ }
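
For readability, the offset byte consumed here is written by the matching aligned allocator; a sketch of that allocation side (illustrative only, not the patched code itself, assuming a power-of-two align no larger than 256):

    #include <stdlib.h>
    #include <stdint.h>

    /* Over-allocate by `align`, move the returned pointer up to the next
     * aligned address, and record the distance in the byte just before it,
     * so the free path can recover the original malloc'd base pointer. */
    static void* aligned_malloc_sketch(size_t size, size_t align)
    {
        uint8_t* const base = (uint8_t*)malloc(size + align);
        if (base == NULL) return NULL;
        {   size_t   const offset = align - ((size_t)base & (align - 1)); /* 1..align */
            uint8_t* const ptr    = base + offset;
            ptr[-1] = (uint8_t)offset;
            return ptr;
        }
    }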
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+ {
+     XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+     if (state==NULL) return NULL;
+     XXH3_INITSTATE(state);
+     return state;
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+ {
+     XXH_alignedFree(statePtr);
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API void
+ XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+ {
+-    memcpy(dst_state, src_state, sizeof(*dst_state));
++    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+ }
+ 
+ static void
+-XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
+-                           XXH64_hash_t seed,
+-                           const void* secret, size_t secretSize)
++XXH3_reset_internal(XXH3_state_t* statePtr,
++                    XXH64_hash_t seed,
++                    const void* secret, size_t secretSize)
+ {
+     size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+     size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+     XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+     XXH_ASSERT(statePtr != NULL);
+     /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+     memset((char*)statePtr + initStart, 0, initLength);
+     statePtr->acc[0] = XXH_PRIME32_3;
+     statePtr->acc[1] = XXH_PRIME64_1;
+     statePtr->acc[2] = XXH_PRIME64_2;
+     statePtr->acc[3] = XXH_PRIME64_3;
+     statePtr->acc[4] = XXH_PRIME64_4;
+     statePtr->acc[5] = XXH_PRIME32_2;
+     statePtr->acc[6] = XXH_PRIME64_5;
+     statePtr->acc[7] = XXH_PRIME32_1;
+     statePtr->seed = seed;
++    statePtr->useSeed = (seed != 0);
+     statePtr->extSecret = (const unsigned char*)secret;
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+     statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+     statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_64bits_reset(XXH3_state_t* statePtr)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+-    XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
++    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+-    XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
++    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+     if (secret == NULL) return XXH_ERROR;
+     if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+     if (seed==0) return XXH3_64bits_reset(statePtr);
+-    if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
+-    XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
++    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
++        XXH3_initCustomSecret(statePtr->customSecret, seed);
++    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
++    return XXH_OK;
++}
++
++/*! @ingroup xxh3_family */
++XXH_PUBLIC_API XXH_errorcode
++XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
++{
++    if (statePtr == NULL) return XXH_ERROR;
++    if (secret == NULL) return XXH_ERROR;
++    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
++    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
++    statePtr->useSeed = 1; /* always, even if seed64==0 */
+     return XXH_OK;
+ }
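
Streaming usage of the reset variants above follows the usual create/reset/update/digest pattern; a sketch with a fixed chunk size (error checks abbreviated):

    #include <assert.h>
    #include "xxhash.h"

    /* Stream a large input through XXH3 in fixed-size chunks. */
    static XXH64_hash_t xxh3_stream_example(const void* data, size_t len, XXH64_hash_t seed)
    {
        XXH3_state_t* const st = XXH3_createState();
        assert(st != NULL);
        XXH3_64bits_reset_withSeed(st, seed);  /* seed==0 falls back to the default secret */
        {   const unsigned char* p = (const unsigned char*)data;
            size_t remaining = len;
            while (remaining > 0) {
                size_t const chunk = remaining < 4096 ? remaining : 4096;
                XXH3_64bits_update(st, p, chunk);
                p += chunk; remaining -= chunk;
            }
        }
        {   XXH64_hash_t const h = XXH3_64bits_digest(st);
            XXH3_freeState(st);
            return h;
        }
    }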
+ 
+ /* Note : when XXH3_consumeStripes() is invoked,
+  * there must be a guarantee that at least one more byte must be consumed from input
+  * so that the function can blindly consume all stripes using the "normal" secret segment */
+ XXH_FORCE_INLINE void
+ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+@@ -4044,88 +4773,140 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRIC
+         XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
+         *nbStripesSoFarPtr = nbStripesAfterBlock;
+     } else {
+         XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
+         *nbStripesSoFarPtr += nbStripes;
+     }
+ }
+ 
++#ifndef XXH3_STREAM_USE_STACK
++# ifndef __clang__ /* clang doesn't need additional stack space */
++#   define XXH3_STREAM_USE_STACK 1
++# endif
++#endif
+ /*
+  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+  */
+ XXH_FORCE_INLINE XXH_errorcode
+-XXH3_update(XXH3_state_t* state,
+-            const xxh_u8* input, size_t len,
++XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
++            const xxh_u8* XXH_RESTRICT input, size_t len,
+             XXH3_f_accumulate_512 f_acc512,
+             XXH3_f_scrambleAcc f_scramble)
+ {
+-    if (input==NULL)
+-#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
++    if (input==NULL) {
++        XXH_ASSERT(len == 0);
+         return XXH_OK;
+-#else
+-        return XXH_ERROR;
+-#endif
+-
++    }
++
++    XXH_ASSERT(state != NULL);
+     {   const xxh_u8* const bEnd = input + len;
+         const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+-
++#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
++        /* For some reason, gcc and MSVC seem to suffer greatly
++         * when operating accumulators directly into state.
++         * Operating into stack space seems to enable proper optimization.
++         * clang, on the other hand, doesn't seem to need this trick */
++        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
++#else
++        xxh_u64* XXH_RESTRICT const acc = state->acc;
++#endif
+         state->totalLen += len;
+-
+-        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {  /* fill in tmp buffer */
++        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
++
++        /* small input : just fill in tmp buffer */
++        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
+             XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+             state->bufferedSize += (XXH32_hash_t)len;
+             return XXH_OK;
+         }
++
+         /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+-
+         #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+         XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+ 
+         /*
+          * Internal buffer is partially filled (always, except at beginning)
+          * Complete it, then consume it.
+          */
+         if (state->bufferedSize) {
+             size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+             XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+             input += loadSize;
+-            XXH3_consumeStripes(state->acc,
++            XXH3_consumeStripes(acc,
+                                &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                 state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                 secret, state->secretLimit,
+                                 f_acc512, f_scramble);
+             state->bufferedSize = 0;
+         }
+         XXH_ASSERT(input < bEnd);
+ 
+-        /* Consume input by a multiple of internal buffer size */
+-        if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
+-            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+-            do {
+-                XXH3_consumeStripes(state->acc,
+-                                   &state->nbStripesSoFar, state->nbStripesPerBlock,
+-                                    input, XXH3_INTERNALBUFFER_STRIPES,
+-                                    secret, state->secretLimit,
+-                                    f_acc512, f_scramble);
+-                input += XXH3_INTERNALBUFFER_SIZE;
+-            } while (input<limit);
+-            /* for last partial stripe */
+-            memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
++        /* large input to consume : ingest per full block */
++        if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
++            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
++            XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
++            /* join to current block's end */
++            {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
++                XXH_ASSERT(nbStripesToEnd <= nbStripes);
++                XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
++                f_scramble(acc, secret + state->secretLimit);
++                state->nbStripesSoFar = 0;
++                input += nbStripesToEnd * XXH_STRIPE_LEN;
++                nbStripes -= nbStripesToEnd;
++            }
++            /* consume per entire blocks */
++            while(nbStripes >= state->nbStripesPerBlock) {
++                XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
++                f_scramble(acc, secret + state->secretLimit);
++                input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
++                nbStripes -= state->nbStripesPerBlock;
++            }
++            /* consume last partial block */
++            XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
++            input += nbStripes * XXH_STRIPE_LEN;
++            XXH_ASSERT(input < bEnd);  /* at least some bytes left */
++            state->nbStripesSoFar = nbStripes;
++            /* buffer predecessor of last partial stripe */
++            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
++            XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
++        } else {
++            /* content to consume <= block size */
++            /* Consume input by a multiple of internal buffer size */
++            if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
++                const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
++                do {
++                    XXH3_consumeStripes(acc,
++                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
++                                        input, XXH3_INTERNALBUFFER_STRIPES,
++                                        secret, state->secretLimit,
++                                        f_acc512, f_scramble);
++                    input += XXH3_INTERNALBUFFER_SIZE;
++                } while (input<limit);
++                /* buffer predecessor of last partial stripe */
++                XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
++            }
+         }
+-        XXH_ASSERT(input < bEnd);
+ 
+         /* Some remaining input (always) : buffer it */
++        XXH_ASSERT(input < bEnd);
++        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
++        XXH_ASSERT(state->bufferedSize == 0);
+         XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+         state->bufferedSize = (XXH32_hash_t)(bEnd-input);
++#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
++        /* save stack accumulators into state */
++        memcpy(state->acc, acc, sizeof(acc));
++#endif
+     }
+ 
+     return XXH_OK;
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+ {
+     return XXH3_update(state, (const xxh_u8*)input, len,
+                        XXH3_accumulate_512, XXH3_scrambleAcc);
+ }
+ 
+ 
+@@ -4133,103 +4914,60 @@ XXH_FORCE_INLINE void
+ XXH3_digest_long (XXH64_hash_t* acc,
+                   const XXH3_state_t* state,
+                   const unsigned char* secret)
+ {
+     /*
+      * Digest on a local copy. This way, the state remains unaltered, and it can
+      * continue ingesting more input afterwards.
+      */
+-    memcpy(acc, state->acc, sizeof(state->acc));
++    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+     if (state->bufferedSize >= XXH_STRIPE_LEN) {
+         size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+         size_t nbStripesSoFar = state->nbStripesSoFar;
+         XXH3_consumeStripes(acc,
+                            &nbStripesSoFar, state->nbStripesPerBlock,
+                             state->buffer, nbStripes,
+                             secret, state->secretLimit,
+                             XXH3_accumulate_512, XXH3_scrambleAcc);
+         /* last stripe */
+         XXH3_accumulate_512(acc,
+                             state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+                             secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+     } else {  /* bufferedSize < XXH_STRIPE_LEN */
+         xxh_u8 lastStripe[XXH_STRIPE_LEN];
+         size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+         XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+-        memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+-        memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
++        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
++        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+         XXH3_accumulate_512(acc,
+                             lastStripe,
+                             secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+     }
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+ {
+     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+     if (state->totalLen > XXH3_MIDSIZE_MAX) {
+         XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+         XXH3_digest_long(acc, state, secret);
+         return XXH3_mergeAccs(acc,
+                               secret + XXH_SECRET_MERGEACCS_START,
+                               (xxh_u64)state->totalLen * XXH_PRIME64_1);
+     }
+     /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+-    if (state->seed)
++    if (state->useSeed)
+         return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+     return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+ }
+ 
+ 
+-#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+-
+-XXH_PUBLIC_API void
+-XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
+-{
+-    XXH_ASSERT(secretBuffer != NULL);
+-    if (customSeedSize == 0) {
+-        memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+-        return;
+-    }
+-    XXH_ASSERT(customSeed != NULL);
+-
+-    {   size_t const segmentSize = sizeof(XXH128_hash_t);
+-        size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
+-        XXH128_canonical_t scrambler;
+-        XXH64_hash_t seeds[12];
+-        size_t segnb;
+-        XXH_ASSERT(nbSegments == 12);
+-        XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
+-        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+-
+-        /*
+-        * Copy customSeed to seeds[], truncating or repeating as necessary.
+-        */
+-        {   size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
+-            size_t filled = toFill;
+-            memcpy(seeds, customSeed, toFill);
+-            while (filled < sizeof(seeds)) {
+-                toFill = XXH_MIN(filled, sizeof(seeds) - filled);
+-                memcpy((char*)seeds + filled, seeds, toFill);
+-                filled += toFill;
+-        }   }
+-
+-        /* generate secret */
+-        memcpy(secretBuffer, &scrambler, sizeof(scrambler));
+-        for (segnb=1; segnb < nbSegments; segnb++) {
+-            size_t const segmentStart = segnb * segmentSize;
+-            XXH128_canonical_t segment;
+-            XXH128_canonicalFromHash(&segment,
+-                XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
+-            memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
+-    }   }
+-}
+-
+ 
+ /* ==========================================
+  * XXH3 128 bits (a.k.a XXH128)
+  * ==========================================
+  * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+  * even without counting the significantly larger output size.
+  *
+  * For example, extra steps are taken to avoid the seed-dependent collisions
+@@ -4521,19 +5259,20 @@ XXH3_hashLong_128b_default(const void* X
+                            const void* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64; (void)secret; (void)secretLen;
+     return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                        XXH3_accumulate_512, XXH3_scrambleAcc);
+ }
+ 
+ /*
+- * It's important for performance that XXH3_hashLong is not inlined.
++ * It's important for performance to pass @secretLen (when it's static)
++ * to the compiler, so that it can properly optimize the vectorized loop.
+  */
+-XXH_NO_INLINE XXH128_hash_t
++XXH_FORCE_INLINE XXH128_hash_t
+ XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                               XXH64_hash_t seed64,
+                               const void* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64;
+     return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                        XXH3_accumulate_512, XXH3_scrambleAcc);
+ }
+@@ -4590,96 +5329,103 @@ XXH3_128bits_internal(const void* input,
+     if (len <= XXH3_MIDSIZE_MAX)
+         return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+     return f_hl128(input, len, seed64, secret, secretLen);
+ }
+ 
+ 
+ /* ===   Public XXH128 API   === */
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+ {
+     return XXH3_128bits_internal(input, len, 0,
+                                  XXH3_kSecret, sizeof(XXH3_kSecret),
+                                  XXH3_hashLong_128b_default);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+ XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+ {
+     return XXH3_128bits_internal(input, len, 0,
+                                  (const xxh_u8*)secret, secretSize,
+                                  XXH3_hashLong_128b_withSecret);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+ {
+     return XXH3_128bits_internal(input, len, seed,
+                                  XXH3_kSecret, sizeof(XXH3_kSecret),
+                                  XXH3_hashLong_128b_withSeed);
+ }
+ 
++/*! @ingroup xxh3_family */
++XXH_PUBLIC_API XXH128_hash_t
++XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
++{
++    if (len <= XXH3_MIDSIZE_MAX)
++        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
++    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
++}
++
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+ XXH128(const void* input, size_t len, XXH64_hash_t seed)
+ {
+     return XXH3_128bits_withSeed(input, len, seed);
+ }
+ 
+ 
+ /* ===   XXH3 128-bit streaming   === */
+ 
+ /*
+- * All the functions are actually the same as for 64-bit streaming variant.
+- * The only difference is the finalizatiom routine.
++ * All initialization and update functions are identical to 64-bit streaming variant.
++ * The only difference is the finalization routine.
+  */
+ 
+-static void
+-XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+-                            XXH64_hash_t seed,
+-                            const void* secret, size_t secretSize)
+-{
+-    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+-}
+-
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_128bits_reset(XXH3_state_t* statePtr)
+ {
+-    if (statePtr == NULL) return XXH_ERROR;
+-    XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+-    return XXH_OK;
++    return XXH3_64bits_reset(statePtr);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+ {
+-    if (statePtr == NULL) return XXH_ERROR;
+-    XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
+-    if (secret == NULL) return XXH_ERROR;
+-    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+-    return XXH_OK;
++    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+ {
+-    if (statePtr == NULL) return XXH_ERROR;
+-    if (seed==0) return XXH3_128bits_reset(statePtr);
+-    if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
+-    XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+-    return XXH_OK;
++    return XXH3_64bits_reset_withSeed(statePtr, seed);
+ }
+ 
++/*! @ingroup xxh3_family */
++XXH_PUBLIC_API XXH_errorcode
++XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
++{
++    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
++}
++
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+ {
+     return XXH3_update(state, (const xxh_u8*)input, len,
+                        XXH3_accumulate_512, XXH3_scrambleAcc);
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+ {
+     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+     if (state->totalLen > XXH3_MIDSIZE_MAX) {
+         XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+         XXH3_digest_long(acc, state, secret);
+         XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+         {   XXH128_hash_t h128;
+@@ -4700,67 +5446,138 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bit
+                                    secret, state->secretLimit + XXH_STRIPE_LEN);
+ }
+ 
+ /* 128-bit utility functions */
+ 
+ #include <string.h>   /* memcmp, memcpy */
+ 
+ /* return : 1 is equal, 0 if different */
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+ {
+     /* note : XXH128_hash_t is compact, it has no padding byte */
+     return !(memcmp(&h1, &h2, sizeof(h1)));
+ }
+ 
+ /* This prototype is compatible with stdlib's qsort().
+  * return : >0 if *h128_1  > *h128_2
+  *          <0 if *h128_1  < *h128_2
+  *          =0 if *h128_1 == *h128_2  */
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+ {
+     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+     int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+     /* note : bets that, in most cases, hash values are different */
+     if (hcmp) return hcmp;
+     return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+ }
+ 
+ 
+ /*======   Canonical representation   ======*/
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API void
+ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+ {
+     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+     if (XXH_CPU_LITTLE_ENDIAN) {
+         hash.high64 = XXH_swap64(hash.high64);
+         hash.low64  = XXH_swap64(hash.low64);
+     }
+-    memcpy(dst, &hash.high64, sizeof(hash.high64));
+-    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
++    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
++    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+ }
+ 
++/*! @ingroup xxh3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+ XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+ {
+     XXH128_hash_t h;
+     h.high64 = XXH_readBE64(src);
+     h.low64  = XXH_readBE64(src->digest + 8);
+     return h;
+ }
+ 
++
++
++/* ==========================================
++ * Secret generators
++ * ==========================================
++ */
++#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
++
++static void XXH3_combine16(void* dst, XXH128_hash_t h128)
++{
++    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
++    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
++}
++
++/*! @ingroup xxh3_family */
++XXH_PUBLIC_API XXH_errorcode
++XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
++{
++    XXH_ASSERT(secretBuffer != NULL);
++    if (secretBuffer == NULL) return XXH_ERROR;
++    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
++    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
++    if (customSeedSize == 0) {
++        customSeed = XXH3_kSecret;
++        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
++    }
++    XXH_ASSERT(customSeed != NULL);
++    if (customSeed == NULL) return XXH_ERROR;
++
++    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
++    {   size_t pos = 0;
++        while (pos < secretSize) {
++            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
++            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
++            pos += toCopy;
++    }   }
++
++    {   size_t const nbSeg16 = secretSize / 16;
++        size_t n;
++        XXH128_canonical_t scrambler;
++        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
++        for (n=0; n<nbSeg16; n++) {
++            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
++            XXH3_combine16((char*)secretBuffer + n*16, h128);
++        }
++        /* last segment */
++        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
++    }
++    return XXH_OK;
++}
++
++/*! @ingroup xxh3_family */
++XXH_PUBLIC_API void
++XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
++{
++    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
++    XXH3_initCustomSecret(secret, seed);
++    XXH_ASSERT(secretBuffer != NULL);
++    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
++}
++
++
++
+ /* Pop our optimization override from above */
+ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+   && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+   && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
+ #  pragma GCC pop_options
+ #endif
+ 
+ #endif  /* XXH_NO_LONG_LONG */
+ 
+-
++#endif  /* XXH_NO_XXH3 */
++
++/*!
++ * @}
++ */
+ #endif  /* XXH_IMPLEMENTATION */
+ 
+ 
+ #if defined (__cplusplus)
+ }
+ #endif
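The xxHash hunks above replace the old void-returning XXH3_generateSecret() with a size-checked variant and add the *_withSecretandSeed entry points. The sketch below is a hypothetical standalone consumer of those new signatures and is not part of the patch set: it assumes the experimental declarations are exposed via XXH_STATIC_LINKING_ONLY and that xxhash 0.8.1 is compiled into the same program, and the 192-byte buffer size, seed value, and input strings are purely illustrative.

```cpp
// Hedged usage sketch of the 0.8.1 secret API (not part of the patch).
#define XXH_STATIC_LINKING_ONLY  /* assumed: new secret API is experimental */
#include "xxhash.h"

#include <cstdio>
#include <cstring>

int main() {
    // Derive a full-entropy secret from low-entropy seed material. The new
    // signature takes the buffer size explicitly and reports errors.
    unsigned char secret[192];  /* illustrative; must be >= XXH3_SECRET_SIZE_MIN */
    const char seedMaterial[] = "not very random";
    if (XXH3_generateSecret(secret, sizeof(secret),
                            seedMaterial, std::strlen(seedMaterial)) != XXH_OK) {
        return 1;
    }

    // Hash with both a secret and a seed, via the entry point added in 0.8.1.
    const char data[] = "hello, xxh3";
    XXH128_hash_t const h = XXH3_128bits_withSecretandSeed(
        data, std::strlen(data), secret, sizeof(secret), /*seed=*/42);
    std::printf("%016llx%016llx\n",
                (unsigned long long)h.high64, (unsigned long long)h.low64);
    return 0;
}
```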

+ 118 - 0
mozilla-release/patches/1779993-PARTIAL-NOTESTS-105a1.patch

@@ -0,0 +1,118 @@
+# HG changeset patch
+# User Valentin Gosu <valentin.gosu@gmail.com>
+# Date 1662577462 0
+# Node ID 3f589e5decaed63e287149ff52adcd8252061138
+# Parent  573f6da1def084d91ec7cb82a8d5aa4225c70d9c
+Bug 1779993 - Reject cookies with no name and a __Secure- or __Host- prefix r=necko-reviewers,kershaw a=RyanVM
+
+Differential Revision: https://phabricator.services.mozilla.com/D156554
+
+diff --git a/netwerk/cookie/nsCookieService.cpp b/netwerk/cookie/nsCookieService.cpp
+--- a/netwerk/cookie/nsCookieService.cpp
++++ b/netwerk/cookie/nsCookieService.cpp
+@@ -3356,16 +3356,29 @@ nsCookieService::CanSetCookie(nsIURI*   
+   if (!CheckDomain(aCookieAttributes, aHostURI, aKey.mBaseDomain, aRequireHostMatch)) {
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader, "failed the domain tests");
+     return newCookie;
+   }
+   if (!CheckPath(aCookieAttributes, aHostURI)) {
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader, "failed the path tests");
+     return newCookie;
+   }
++
++  if (!CheckHiddenPrefix(aCookieData)) {
++    COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader,
++                      "failed the CheckHiddenPrefix tests");
++    // CookieLogging::LogMessageToConsole(
++    //     aCRC, aHostURI, nsIScriptError::warningFlag, CONSOLE_REJECTION_CATEGORY,
++    //     "CookieRejectedInvalidPrefix",
++    //     AutoTArray<nsString, 1>{
++    //         NS_ConvertUTF8toUTF16(aCookieData.name()),
++    //     });
++    return newCookie;
++  }
++
+   // magic prefix checks. MUST be run after CheckDomain() and CheckPath()
+   if (!CheckPrefixes(aCookieAttributes, isHTTPS)) {
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader, "failed the prefix tests");
+     return newCookie;
+   }
+ 
+   // reject cookie if value contains an RFC 6265 disallowed character - see
+   // https://bugzilla.mozilla.org/show_bug.cgi?id=1191423
+@@ -4201,16 +4214,35 @@ nsCookieService::CheckDomain(nsCookieAtt
+     return false;
+   }
+ 
+   // no domain specified, use hostFromURI
+   aCookieAttributes.host = hostFromURI;
+   return true;
+ }
+ 
++// static
++bool nsCookieService::CheckHiddenPrefix(CookieStruct& aCookieData) {
++  // If a cookie is nameless, then its value must not start with
++  // `__Host-` or `__Secure-`
++  if (!aCookieData.name().IsEmpty()) {
++    return true;
++  }
++
++  if (StringBeginsWith(aCookieData.value(), "__Host-")) {
++    return false;
++  }
++
++  if (StringBeginsWith(aCookieData.value(), "__Secure-")) {
++    return false;
++  }
++
++  return true;
++}
++
+ nsAutoCString
+ nsCookieService::GetPathFromURI(nsIURI* aHostURI)
+ {
+   // strip down everything after the last slash to get the path,
+   // ignoring slashes in the query string part.
+   // if we can QI to nsIURL, that'll take care of the query string portion.
+   // otherwise, it's not an nsIURL and can't have a query string, so just find the last slash.
+   nsAutoCString path;
+@@ -4257,17 +4289,17 @@ nsCookieService::CheckPath(nsCookieAttri
+     return false;
+ 
+   return true;
+ }
+ 
+ // CheckPrefixes
+ //
+ // Reject cookies whose name starts with the magic prefixes from
+-// https://tools.ietf.org/html/draft-ietf-httpbis-cookie-prefixes-00
++// https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis
+ // if they do not meet the criteria required by the prefix.
+ //
+ // Must not be called until after CheckDomain() and CheckPath() have
+ // regularized and validated the nsCookieAttributes values!
+ bool
+ nsCookieService::CheckPrefixes(nsCookieAttributes &aCookieAttributes,
+                                bool aSecureRequest)
+ {
+diff --git a/netwerk/cookie/nsCookieService.h b/netwerk/cookie/nsCookieService.h
+--- a/netwerk/cookie/nsCookieService.h
++++ b/netwerk/cookie/nsCookieService.h
+@@ -306,16 +306,17 @@ class nsCookieService final : public nsI
+     void                          AddInternal(const nsCookieKey& aKey, nsCookie *aCookie, int64_t aCurrentTimeInUsec, nsIURI *aHostURI, const char *aCookieHeader, bool aFromHttp);
+     void                          RemoveCookieFromList(const nsListIter &aIter, mozIStorageBindingParamsArray *aParamsArray = nullptr);
+     void                          AddCookieToList(const nsCookieKey& aKey, nsCookie *aCookie, DBState *aDBState, mozIStorageBindingParamsArray *aParamsArray, bool aWriteToDB = true);
+     void                          UpdateCookieInList(nsCookie *aCookie, int64_t aLastAccessed, mozIStorageBindingParamsArray *aParamsArray);
+     static bool                   GetTokenValue(nsACString::const_char_iterator &aIter, nsACString::const_char_iterator &aEndIter, nsDependentCSubstring &aTokenString, nsDependentCSubstring &aTokenValue, bool &aEqualsFound);
+     static bool                   ParseAttributes(nsDependentCString &aCookieHeader, nsCookieAttributes &aCookie);
+     bool                          RequireThirdPartyCheck();
+     static bool                   CheckDomain(nsCookieAttributes &aCookie, nsIURI *aHostURI, const nsCString &aBaseDomain, bool aRequireHostMatch);
++    static bool CheckHiddenPrefix(CookieStruct& aCookieData);
+     static bool                   CheckPath(nsCookieAttributes &aCookie, nsIURI *aHostURI);
+     static bool                   CheckPrefixes(nsCookieAttributes &aCookie, bool aSecureRequest);
+     static bool                   GetExpiry(nsCookieAttributes &aCookie, int64_t aServerTime, int64_t aCurrentTime);
+     void                          RemoveAllFromMemory();
+     already_AddRefed<nsIArray>    PurgeCookies(int64_t aCurrentTimeInUsec);
+     bool                          FindCookie(const nsCookieKey& aKey, const nsCString& aHost, const nsCString& aName, const nsCString& aPath, nsListIter &aIter);
+     bool                          FindSecureCookie(const nsCookieKey& aKey, nsCookie* aCookie);
+     int64_t                       FindStaleCookie(nsCookieEntry *aEntry, int64_t aCurrentTime, nsIURI* aSource, const mozilla::Maybe<bool> &aIsSecure, nsListIter &aIter);
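The patch above reduces to one rule: a cookie with an empty name must not carry a value that begins with `__Host-` or `__Secure-`. Below is a minimal sketch of that rule in plain C++, using std::string instead of Gecko's string classes; the helper name is hypothetical and the code is not part of the patch.

```cpp
#include <string>

// Mirrors the check added as CheckHiddenPrefix() above (illustrative only).
bool IsHiddenPrefixAllowed(const std::string& name, const std::string& value) {
    if (!name.empty()) {
        return true;  // named cookies go through the regular prefix checks
    }
    // Nameless cookie: reject if the value smuggles in a special prefix.
    return value.rfind("__Host-", 0) != 0 && value.rfind("__Secure-", 0) != 0;
}
```

With this, `IsHiddenPrefixAllowed("", "__Secure-id=1")` is false, matching the new rejection path, while any named cookie falls through to the existing CheckPrefixes() logic.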

+ 5687 - 0
mozilla-release/patches/1784990-106a1.patch

@@ -0,0 +1,5687 @@
+# HG changeset patch
+# User Ryan VanderMeulen <ryanvm@gmail.com>
+# Date 1661962245 0
+#      Wed Aug 31 16:10:45 2022 +0000
+# Node ID 7eef756241bc1408a297d251f396657fda0b5a55
+# Parent  5be68f0fff6d9abd730db7c0921c37c2992cba7b
+Bug 1784990 - Update lz4 to 1.9.4. r=glandium
+
+Differential Revision: https://phabricator.services.mozilla.com/D154770
+
+diff --git a/mfbt/lz4/LICENSE b/mfbt/lz4/LICENSE
+--- a/mfbt/lz4/LICENSE
++++ b/mfbt/lz4/LICENSE
+@@ -1,10 +1,10 @@
+ LZ4 Library
+-Copyright (c) 2011-2016, Yann Collet
++Copyright (c) 2011-2020, Yann Collet
+ All rights reserved.
+ 
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+ 
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ 
+diff --git a/mfbt/lz4/README.md b/mfbt/lz4/README.md
+--- a/mfbt/lz4/README.md
++++ b/mfbt/lz4/README.md
+@@ -1,46 +1,65 @@
+ LZ4 - Library Files
+ ================================
+ 
+ The `/lib` directory contains many files, but depending on project's objectives,
+-not all of them are necessary.
++not all of them are required.
++Limited systems may want to reduce the nb of source files to include
++as a way to reduce binary size and dependencies.
+ 
+-#### Minimal LZ4 build
++Capabilities are added at the "level" granularity, detailed below.
++
++#### Level 1 : Minimal LZ4 build
+ 
+ The minimum required is **`lz4.c`** and **`lz4.h`**,
+ which provides the fast compression and decompression algorithms.
+ They generate and decode data using the [LZ4 block format].
+ 
+ 
+-#### High Compression variant
++#### Level 2 : High Compression variant
+ 
+ For more compression ratio at the cost of compression speed,
+ the High Compression variant called **lz4hc** is available.
+ Add files **`lz4hc.c`** and **`lz4hc.h`**.
+ This variant also compresses data using the [LZ4 block format],
+ and depends on regular `lib/lz4.*` source files.
+ 
+ 
+-#### Frame support, for interoperability
++#### Level 3 : Frame support, for interoperability
+ 
+ In order to produce compressed data compatible with `lz4` command line utility,
+ it's necessary to use the [official interoperable frame format].
+ This format is generated and decoded automatically by the **lz4frame** library.
+ Its public API is described in `lib/lz4frame.h`.
+ In order to work properly, lz4frame needs all other modules present in `/lib`,
+ including, lz4 and lz4hc, and also **xxhash**.
+-So it's necessary to include all `*.c` and `*.h` files present in `/lib`.
++So it's necessary to also include `xxhash.c` and `xxhash.h`.
++
++
++#### Level 4 : File compression operations
++
++As a helper around file operations,
++the library has been recently extended with `lz4file.c` and `lz4file.h`
++(still considered experimental at the time of this writing).
++These helpers allow opening, reading, writing, and closing files
++using transparent LZ4 compression / decompression.
++As a consequence, using `lz4file` adds a dependency on `<stdio.h>`.
++
++`lz4file` relies on `lz4frame` in order to produce compressed data
++conformant to the [LZ4 Frame format] specification.
++Consequently, to enable this capability,
++it's necessary to include all `*.c` and `*.h` files from `lib/` directory.
+ 
+ 
+ #### Advanced / Experimental API
+ 
+ Definitions which are not guaranteed to remain stable in future versions,
+ are protected behind macros, such as `LZ4_STATIC_LINKING_ONLY`.
+-As the name strongly implies, these definitions should only be invoked
++As the name suggests, these definitions should only be invoked
+ in the context of static linking ***only***.
+ Otherwise, dependent application may fail on API or ABI break in the future.
+ The associated symbols are also not exposed by the dynamic library by default.
+ Should they be nonetheless needed, it's possible to force their publication
+ by using build macros `LZ4_PUBLISH_STATIC_FUNCTIONS`
+ and `LZ4F_PUBLISH_STATIC_FUNCTIONS`.
+ 
+ 
+@@ -53,60 +72,72 @@ The following build macro can be selecte
+   It's also possible to enable or disable it manually, by passing `LZ4_FAST_DEC_LOOP=1` or `0` to the preprocessor.
+   For example, with `gcc` : `-DLZ4_FAST_DEC_LOOP=1`,
+   and with `make` : `CPPFLAGS+=-DLZ4_FAST_DEC_LOOP=1 make lz4`.
+ 
+ - `LZ4_DISTANCE_MAX` : control the maximum offset that the compressor will allow.
+   Set to 65535 by default, which is the maximum value supported by lz4 format.
+   Reducing maximum distance will reduce opportunities for LZ4 to find matches,
+   hence will produce a worse compression ratio.
+-  However, a smaller max distance can allow compatibility with specific decoders using limited memory budget.
++  Setting a smaller max distance could allow compatibility with specific decoders with limited memory budget.
+   This build macro only influences the compressed output of the compressor.
+ 
+ - `LZ4_DISABLE_DEPRECATE_WARNINGS` : invoking a deprecated function will make the compiler generate a warning.
+   This is meant to invite users to update their source code.
+   Should this be a problem, it's generally possible to make the compiler ignore these warnings,
+   for example with `-Wno-deprecated-declarations` on `gcc`,
+   or `_CRT_SECURE_NO_WARNINGS` for Visual Studio.
+   This build macro offers another project-specific method
+   by defining `LZ4_DISABLE_DEPRECATE_WARNINGS` before including the LZ4 header files.
+ 
+-- `LZ4_USER_MEMORY_FUNCTIONS` : replace calls to <stdlib>'s `malloc`, `calloc` and `free`
+-  by user-defined functions, which must be called `LZ4_malloc()`, `LZ4_calloc()` and `LZ4_free()`.
+-  User functions must be available at link time.
+-
+ - `LZ4_FORCE_SW_BITCOUNT` : by default, the compression algorithm tries to determine lengths
+   by using bitcount instructions, generally implemented as fast single instructions in many cpus.
+   In case the target cpus doesn't support it, or compiler intrinsic doesn't work, or feature bad performance,
+   it's possible to use an optimized software path instead.
+-  This is achieved by setting this build macros .
++  This is achieved by setting this build macros.
+   In most cases, it's not expected to be necessary,
+   but it can be legitimately considered for less common platforms.
+ 
+ - `LZ4_ALIGN_TEST` : alignment test ensures that the memory area
+   passed as argument to become a compression state is suitably aligned.
+   This test can be disabled if it proves flaky, by setting this value to 0.
+ 
++- `LZ4_USER_MEMORY_FUNCTIONS` : replace calls to `<stdlib,h>`'s `malloc()`, `calloc()` and `free()`
++  by user-defined functions, which must be named `LZ4_malloc()`, `LZ4_calloc()` and `LZ4_free()`.
++  User functions must be available at link time.
++
++- `LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION` :
++  Remove support of dynamic memory allocation.
++  For more details, see description of this macro in `lib/lz4.c`.
++
++- `LZ4_FREESTANDING` : by setting this build macro to 1,
++  LZ4/HC removes dependencies on the C standard library,
++  including allocation functions and `memmove()`, `memcpy()`, and `memset()`.
++  This build macro is designed to help use LZ4/HC in restricted environments
++  (embedded, bootloader, etc).
++  For more details, see description of this macro in `lib/lz4.h`.
++
++
+ 
+ #### Amalgamation
+ 
+ lz4 source code can be amalgamated into a single file.
+ One can combine all source code into `lz4_all.c` by using following command:
+ ```
+ cat lz4.c lz4hc.c lz4frame.c > lz4_all.c
+ ```
+ (`cat` file order is important) then compile `lz4_all.c`.
+ All `*.h` files present in `/lib` remain necessary to compile `lz4_all.c`.
+ 
+ 
+ #### Windows : using MinGW+MSYS to create DLL
+ 
+ DLL can be created using MinGW+MSYS with the `make liblz4` command.
+ This command creates `dll\liblz4.dll` and the import library `dll\liblz4.lib`.
+-To override the `dlltool` command  when cross-compiling on Linux, just set the `DLLTOOL` variable. Example of cross compilation on Linux with mingw-w64 64 bits:
++To override the `dlltool` command when cross-compiling on Linux, just set the `DLLTOOL` variable. Example of cross compilation on Linux with mingw-w64 64 bits:
+ ```
+ make BUILD_STATIC=no CC=x86_64-w64-mingw32-gcc DLLTOOL=x86_64-w64-mingw32-dlltool OS=Windows_NT
+ ```
+ The import library is only required with Visual C++.
+ The header files `lz4.h`, `lz4hc.h`, `lz4frame.h` and the dynamic library
+ `dll\liblz4.dll` are required to compile a project using gcc/MinGW.
+ The dynamic library has to be added to linking options.
+ It means that if a project that uses LZ4 consists of a single `test-dll.c`
+@@ -122,16 +153,17 @@ The compiled executable will require LZ4
+ Other files present in the directory are not source code. They are :
+ 
+  - `LICENSE` : contains the BSD license text
+  - `Makefile` : `make` script to compile and install lz4 library (static and dynamic)
+  - `liblz4.pc.in` : for `pkg-config` (used in `make install`)
+  - `README.md` : this file
+ 
+ [official interoperable frame format]: ../doc/lz4_Frame_format.md
++[LZ4 Frame format]: ../doc/lz4_Frame_format.md
+ [LZ4 block format]: ../doc/lz4_Block_format.md
+ 
+ 
+ #### License
+ 
+ All source material within __lib__ directory are BSD 2-Clause licensed.
+ See [LICENSE](LICENSE) for details.
+ The license is also reminded at the top of each source file.
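The README hunk above describes `LZ4_USER_MEMORY_FUNCTIONS` only in prose. The following is a sketch of what a project might supply when building with that macro: the three names and signatures come from the declarations added to lz4.c later in this patch, while the extern "C" wrapper (needed when the providing translation unit is C++) and the allocation counter are illustrative assumptions, not part of the patch.

```cpp
// Hypothetical allocator overrides for a build with -DLZ4_USER_MEMORY_FUNCTIONS.
#include <atomic>
#include <cstddef>
#include <cstdlib>

static std::atomic<std::size_t> g_lz4LiveAllocs{0};  // illustrative bookkeeping

extern "C" {

void* LZ4_malloc(size_t s) {
    void* p = std::malloc(s);
    if (p) ++g_lz4LiveAllocs;
    return p;
}

void* LZ4_calloc(size_t n, size_t s) {
    void* p = std::calloc(n, s);
    if (p) ++g_lz4LiveAllocs;
    return p;
}

void LZ4_free(void* p) {
    if (p) --g_lz4LiveAllocs;
    std::free(p);
}

}  // extern "C"
```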
+diff --git a/mfbt/lz4/README.mozilla b/mfbt/lz4/README.mozilla
+new file mode 100644
+--- /dev/null
++++ b/mfbt/lz4/README.mozilla
+@@ -0,0 +1,18 @@
++This directory contains the LZ4 source from the upstream repo:
++https://github.com/lz4/lz4/
++
++Current version: 1.9.4 [5ff839680134437dbf4678f3d0c7b371d84f4964]
++
++Our in-tree copy of LZ4 does not depend on any generated files from the
++upstream build system, only the lz4*.{c,h} files found in the lib
++sub-directory. Therefore, it should be sufficient to simply overwrite
++the in-tree files with the updated ones from upstream.
++
++If the collection of source files changes, manual updates to moz.build may be
++needed as we don't use the upstream makefiles.
++
++Note that we do NOT use the copy of xxhash.{c,h} from the LZ4 repo. We
++instead use the newer release from that project's upstream repo:
++https://github.com/Cyan4973/xxHash
++
++Current version: 0.8.1 [35b0373c697b5f160d3db26b1cbb45a0d5ba788c]
+diff --git a/mfbt/lz4/lz4.c b/mfbt/lz4/lz4.c
+--- a/mfbt/lz4/lz4.c
++++ b/mfbt/lz4/lz4.c
+@@ -1,11 +1,11 @@
+ /*
+    LZ4 - Fast LZ compression algorithm
+-   Copyright (C) 2011-present, Yann Collet.
++   Copyright (C) 2011-2020, Yann Collet.
+ 
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ 
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+ 
+        * Redistributions of source code must retain the above copyright
+@@ -119,16 +119,17 @@
+ 
+ 
+ /*-************************************
+ *  Compiler Options
+ **************************************/
+ #if defined(_MSC_VER) && (_MSC_VER >= 1400)  /* Visual Studio 2005+ */
+ #  include <intrin.h>               /* only present in VS2005+ */
+ #  pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
++#  pragma warning(disable : 6237)   /* disable: C6237: conditional expression is always 0 */
+ #endif  /* _MSC_VER */
+ 
+ #ifndef LZ4_FORCE_INLINE
+ #  ifdef _MSC_VER    /* Visual Studio */
+ #    define LZ4_FORCE_INLINE static __forceinline
+ #  else
+ #    if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+ #      ifdef __GNUC__
+@@ -182,35 +183,60 @@
+ #ifndef LZ4_ALIGN_TEST  /* can be externally provided */
+ # define LZ4_ALIGN_TEST 1
+ #endif
+ 
+ 
+ /*-************************************
+ *  Memory routines
+ **************************************/
+-#ifdef LZ4_USER_MEMORY_FUNCTIONS
++
++/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION :
++ *  Disable relatively high-level LZ4/HC functions that use dynamic memory
++ *  allocation functions (malloc(), calloc(), free()).
++ *
++ *  Note that this is a compile-time switch. And since it disables
++ *  public/stable LZ4 v1 API functions, we don't recommend using this
++ *  symbol to generate a library for distribution.
++ *
++ *  The following public functions are removed when this symbol is defined.
++ *  - lz4   : LZ4_createStream, LZ4_freeStream,
++ *            LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create (deprecated)
++ *  - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC,
++ *            LZ4_createHC (deprecated), LZ4_freeHC  (deprecated)
++ *  - lz4frame, lz4file : All LZ4F_* functions
++ */
++#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
++#  define ALLOC(s)          lz4_error_memory_allocation_is_disabled
++#  define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled
++#  define FREEMEM(p)        lz4_error_memory_allocation_is_disabled
++#elif defined(LZ4_USER_MEMORY_FUNCTIONS)
+ /* memory management functions can be customized by user project.
+  * Below functions must exist somewhere in the Project
+  * and be available at link time */
+ void* LZ4_malloc(size_t s);
+ void* LZ4_calloc(size_t n, size_t s);
+ void  LZ4_free(void* p);
+ # define ALLOC(s)          LZ4_malloc(s)
+ # define ALLOC_AND_ZERO(s) LZ4_calloc(1,s)
+ # define FREEMEM(p)        LZ4_free(p)
+ #else
+ # include <stdlib.h>   /* malloc, calloc, free */
+ # define ALLOC(s)          malloc(s)
+ # define ALLOC_AND_ZERO(s) calloc(1,s)
+ # define FREEMEM(p)        free(p)
+ #endif
+ 
+-#include <string.h>   /* memset, memcpy */
+-#define MEM_INIT(p,v,s)   memset((p),(v),(s))
++#if ! LZ4_FREESTANDING
++#  include <string.h>   /* memset, memcpy */
++#endif
++#if !defined(LZ4_memset)
++#  define LZ4_memset(p,v,s) memset((p),(v),(s))
++#endif
++#define MEM_INIT(p,v,s)   LZ4_memset((p),(v),(s))
+ 
+ 
+ /*-************************************
+ *  Common Constants
+ **************************************/
+ #define MINMATCH 4
+ 
+ #define WILDCOPYLENGTH 8
+@@ -311,20 +337,30 @@ typedef enum {
+ /**
+  * LZ4 relies on memcpy with a constant size being inlined. In freestanding
+  * environments, the compiler can't assume the implementation of memcpy() is
+  * standard compliant, so it can't apply its specialized memcpy() inlining
+  * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze
+  * memcpy() as if it were standard compliant, so it can inline it in freestanding
+  * environments. This is needed when decompressing the Linux Kernel, for example.
+  */
+-#if defined(__GNUC__) && (__GNUC__ >= 4)
+-#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
+-#else
+-#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
++#if !defined(LZ4_memcpy)
++#  if defined(__GNUC__) && (__GNUC__ >= 4)
++#    define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
++#  else
++#    define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
++#  endif
++#endif
++
++#if !defined(LZ4_memmove)
++#  if defined(__GNUC__) && (__GNUC__ >= 4)
++#    define LZ4_memmove __builtin_memmove
++#  else
++#    define LZ4_memmove memmove
++#  endif
+ #endif
+ 
+ static unsigned LZ4_isLittleEndian(void)
+ {
+     const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental */
+     return one.c[0];
+ }
+ 
+@@ -338,24 +374,24 @@ static reg_t LZ4_read_ARCH(const void* m
+ 
+ static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+ static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+ 
+ #elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1)
+ 
+ /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+ /* currently only defined for gcc and icc */
+-typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign;
++typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) LZ4_unalign;
+ 
+-static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+-static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+-static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; }
++static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign*)ptr)->u16; }
++static U32 LZ4_read32(const void* ptr) { return ((const LZ4_unalign*)ptr)->u32; }
++static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalign*)ptr)->uArch; }
+ 
+-static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+-static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
++static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign*)memPtr)->u16 = value; }
++static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign*)memPtr)->u32 = value; }
+ 
+ #else  /* safe and portable access using memcpy() */
+ 
+ static U16 LZ4_read16(const void* memPtr)
+ {
+     U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
+ }
+ 
+@@ -416,20 +452,22 @@ void LZ4_wildCopy8(void* dstPtr, const v
+ 
+ static const unsigned inc32table[8] = {0, 1, 2,  1,  0,  4, 4, 4};
+ static const int      dec64table[8] = {0, 0, 0, -1, -4,  1, 2, 3};
+ 
+ 
+ #ifndef LZ4_FAST_DEC_LOOP
+ #  if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64
+ #    define LZ4_FAST_DEC_LOOP 1
++#  elif defined(__aarch64__) && defined(__APPLE__)
++#    define LZ4_FAST_DEC_LOOP 1
+ #  elif defined(__aarch64__) && !defined(__clang__)
+-     /* On aarch64, we disable this optimization for clang because on certain
+-      * mobile chipsets, performance is reduced with clang. For information
+-      * refer to https://github.com/lz4/lz4/pull/707 */
++     /* On non-Apple aarch64, we disable this optimization for clang because
++      * on certain mobile chipsets, performance is reduced with clang. For
++      * more information refer to https://github.com/lz4/lz4/pull/707 */
+ #    define LZ4_FAST_DEC_LOOP 1
+ #  else
+ #    define LZ4_FAST_DEC_LOOP 0
+ #  endif
+ #endif
+ 
+ #if LZ4_FAST_DEC_LOOP
+ 
+@@ -481,17 +519,24 @@ LZ4_memcpy_using_offset(BYTE* dstPtr, co
+ 
+     switch(offset) {
+     case 1:
+         MEM_INIT(v, *srcPtr, 8);
+         break;
+     case 2:
+         LZ4_memcpy(v, srcPtr, 2);
+         LZ4_memcpy(&v[2], srcPtr, 2);
++#if defined(_MSC_VER) && (_MSC_VER <= 1933) /* MSVC 2022 ver 17.3 or earlier */
++#  pragma warning(push)
++#  pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */
++#endif
+         LZ4_memcpy(&v[4], v, 4);
++#if defined(_MSC_VER) && (_MSC_VER <= 1933) /* MSVC 2022 ver 17.3 or earlier */
++#  pragma warning(pop)
++#endif
+         break;
+     case 4:
+         LZ4_memcpy(v, srcPtr, 4);
+         LZ4_memcpy(&v[4], srcPtr, 4);
+         break;
+     default:
+         LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+         return;
+@@ -510,19 +555,30 @@ LZ4_memcpy_using_offset(BYTE* dstPtr, co
+ /*-************************************
+ *  Common functions
+ **************************************/
+ static unsigned LZ4_NbCommonBytes (reg_t val)
+ {
+     assert(val != 0);
+     if (LZ4_isLittleEndian()) {
+         if (sizeof(val) == 8) {
+-#       if defined(_MSC_VER) && (_MSC_VER >= 1800) && defined(_M_AMD64) && !defined(LZ4_FORCE_SW_BITCOUNT)
++#       if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(LZ4_FORCE_SW_BITCOUNT)
++/*-*************************************************************************************************
++* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11.
++* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics
++* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC.
++****************************************************************************************************/
++#         if defined(__clang__) && (__clang_major__ < 10)
++            /* Avoid undefined clang-cl intrinsics issue.
++             * See https://github.com/lz4/lz4/pull/1017 for details. */
++            return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3;
++#         else
+             /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */
+             return (unsigned)_tzcnt_u64(val) >> 3;
++#         endif
+ #       elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+             unsigned long r = 0;
+             _BitScanForward64(&r, (U64)val);
+             return (unsigned)r >> 3;
+ #       elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \
+                             ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \
+                                         !defined(LZ4_FORCE_SW_BITCOUNT)
+             return (unsigned)__builtin_ctzll((U64)val) >> 3;
+@@ -647,52 +703,54 @@ typedef enum { clearedTable = 0, byPtr, 
+  * - withPrefix64k : Table entries up to ctx->dictSize before the current blob
+  *                   blob being compressed are valid and refer to the preceding
+  *                   content (of length ctx->dictSize), which is available
+  *                   contiguously preceding in memory the content currently
+  *                   being compressed.
+  * - usingExtDict  : Like withPrefix64k, but the preceding content is somewhere
+  *                   else in memory, starting at ctx->dictionary with length
+  *                   ctx->dictSize.
+- * - usingDictCtx  : Like usingExtDict, but everything concerning the preceding
+- *                   content is in a separate context, pointed to by
+- *                   ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table
+- *                   entries in the current context that refer to positions
++ * - usingDictCtx  : Everything concerning the preceding content is
++ *                   in a separate context, pointed to by ctx->dictCtx.
++ *                   ctx->dictionary, ctx->dictSize, and table entries
++ *                   in the current context that refer to positions
+  *                   preceding the beginning of the current compression are
+  *                   ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx
+  *                   ->dictSize describe the location and size of the preceding
+  *                   content, and matches are found by looking in the ctx
+  *                   ->dictCtx->hashTable.
+  */
+ typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive;
+ typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
+ 
+ 
+ /*-************************************
+ *  Local Utils
+ **************************************/
+ int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
+ const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; }
+ int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
+-int LZ4_sizeofState(void) { return LZ4_STREAMSIZE; }
++int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); }
+ 
+ 
+-/*-************************************
+-*  Internal Definitions used in Tests
+-**************************************/
++/*-****************************************
++*  Internal Definitions, used only in Tests
++*******************************************/
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+ 
+ int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize);
+ 
+ int LZ4_decompress_safe_forceExtDict(const char* source, char* dest,
+                                      int compressedSize, int maxOutputSize,
+                                      const void* dictStart, size_t dictSize);
+-
++int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest,
++                                     int compressedSize, int targetOutputSize, int dstCapacity,
++                                     const void* dictStart, size_t dictSize);
+ #if defined (__cplusplus)
+ }
+ #endif
+ 
+ /*-******************************
+ *  Compression functions
+ ********************************/
+ LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType)
+@@ -822,19 +880,20 @@ LZ4_prepareTable(LZ4_stream_t_internal* 
+             MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE);
+             cctx->currentOffset = 0;
+             cctx->tableType = (U32)clearedTable;
+         } else {
+             DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)");
+         }
+     }
+ 
+-    /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, is faster
+-     * than compressing without a gap. However, compressing with
+-     * currentOffset == 0 is faster still, so we preserve that case.
++    /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back,
++     * is faster than compressing without a gap.
++     * However, compressing with currentOffset == 0 is faster still,
++     * so we preserve that case.
+      */
+     if (cctx->currentOffset != 0 && tableType == byU32) {
+         DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset");
+         cctx->currentOffset += 64 KB;
+     }
+ 
+     /* Finally, clear history */
+     cctx->dictCtx = NULL;
+@@ -848,17 +907,17 @@ LZ4_prepareTable(LZ4_stream_t_internal* 
+  *  - source != NULL
+  *  - inputSize > 0
+  */
+ LZ4_FORCE_INLINE int LZ4_compress_generic_validated(
+                  LZ4_stream_t_internal* const cctx,
+                  const char* const source,
+                  char* const dest,
+                  const int inputSize,
+-                 int *inputConsumed, /* only written when outputDirective == fillOutput */
++                 int*  inputConsumed, /* only written when outputDirective == fillOutput */
+                  const int maxOutputSize,
+                  const limitedOutput_directive outputDirective,
+                  const tableType_t tableType,
+                  const dict_directive dictDirective,
+                  const dictIssue_directive dictIssue,
+                  const int acceleration)
+ {
+     int result;
+@@ -880,17 +939,18 @@ LZ4_FORCE_INLINE int LZ4_compress_generi
+     const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary;
+     const BYTE* anchor = (const BYTE*) source;
+     const BYTE* const iend = ip + inputSize;
+     const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1;
+     const BYTE* const matchlimit = iend - LASTLITERALS;
+ 
+     /* the dictCtx currentOffset is indexed on the start of the dictionary,
+      * while a dictionary in the current context precedes the currentOffset */
+-    const BYTE* dictBase = !dictionary ? NULL : (dictDirective == usingDictCtx) ?
++    const BYTE* dictBase = (dictionary == NULL) ? NULL :
++                           (dictDirective == usingDictCtx) ?
+                             dictionary + dictSize - dictCtx->currentOffset :
+                             dictionary + dictSize - startIndex;
+ 
+     BYTE* op = (BYTE*) dest;
+     BYTE* const olimit = op + maxOutputSize;
+ 
+     U32 offset = 0;
+     U32 forwardH;
+@@ -976,20 +1036,21 @@ LZ4_FORCE_INLINE int LZ4_compress_generi
+                         matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32);
+                         match = dictBase + matchIndex;
+                         matchIndex += dictDelta;   /* make dictCtx index comparable with current context */
+                         lowLimit = dictionary;
+                     } else {
+                         match = base + matchIndex;
+                         lowLimit = (const BYTE*)source;
+                     }
+-                } else if (dictDirective==usingExtDict) {
++                } else if (dictDirective == usingExtDict) {
+                     if (matchIndex < startIndex) {
+                         DEBUGLOG(7, "extDict candidate: matchIndex=%5u  <  startIndex=%5u", matchIndex, startIndex);
+                         assert(startIndex - matchIndex >= MINMATCH);
++                        assert(dictBase);
+                         match = dictBase + matchIndex;
+                         lowLimit = dictionary;
+                     } else {
+                         match = base + matchIndex;
+                         lowLimit = (const BYTE*)source;
+                     }
+                 } else {   /* single continuous memory segment */
+                     match = base + matchIndex;
+@@ -1043,17 +1104,17 @@ LZ4_FORCE_INLINE int LZ4_compress_generi
+             op+=litLength;
+             DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i",
+                         (int)(anchor-(const BYTE*)source), litLength, (int)(ip-(const BYTE*)source));
+         }
+ 
+ _next_match:
+         /* at this stage, the following variables must be correctly set :
+          * - ip : at start of LZ operation
+-         * - match : at start of previous pattern occurence; can be within current prefix, or within extDict
++         * - match : at start of previous pattern occurrence; can be within current prefix, or within extDict
+          * - offset : if maybe_ext_memSegment==1 (constant)
+          * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise
+          * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written
+          */
+ 
+         if ((outputDirective == fillOutput) &&
+             (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit)) {
+             /* the match was too close to the end, rewind and go to last literals */
+@@ -1168,16 +1229,17 @@ LZ4_FORCE_INLINE int LZ4_compress_generi
+                     lowLimit = dictionary;   /* required for match length counter */
+                     matchIndex += dictDelta;
+                 } else {
+                     match = base + matchIndex;
+                     lowLimit = (const BYTE*)source;  /* required for match length counter */
+                 }
+             } else if (dictDirective==usingExtDict) {
+                 if (matchIndex < startIndex) {
++                    assert(dictBase);
+                     match = dictBase + matchIndex;
+                     lowLimit = dictionary;   /* required for match length counter */
+                 } else {
+                     match = base + matchIndex;
+                     lowLimit = (const BYTE*)source;   /* required for match length counter */
+                 }
+             } else {   /* single memory segment */
+                 match = base + matchIndex;
+@@ -1350,17 +1412,17 @@ int LZ4_compress_fast_extState_fastReset
+     }
+ }
+ 
+ 
+ int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+ {
+     int result;
+ #if (LZ4_HEAPMODE)
+-    LZ4_stream_t* ctxPtr = ALLOC(sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
++    LZ4_stream_t* ctxPtr = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+     if (ctxPtr == NULL) return 0;
+ #else
+     LZ4_stream_t ctx;
+     LZ4_stream_t* const ctxPtr = &ctx;
+ #endif
+     result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration);
+ 
+ #if (LZ4_HEAPMODE)
+@@ -1415,25 +1477,27 @@ int LZ4_compress_destSize(const char* sr
+ }
+ 
+ 
+ 
+ /*-******************************
+ *  Streaming functions
+ ********************************/
+ 
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ LZ4_stream_t* LZ4_createStream(void)
+ {
+     LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t));
+-    LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal));    /* A compilation error here means LZ4_STREAMSIZE is not large enough */
++    LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal));
+     DEBUGLOG(4, "LZ4_createStream %p", lz4s);
+     if (lz4s == NULL) return NULL;
+     LZ4_initStream(lz4s, sizeof(*lz4s));
+     return lz4s;
+ }
++#endif
+ 
+ static size_t LZ4_stream_t_alignment(void)
+ {
+ #if LZ4_ALIGN_TEST
+     typedef struct { char c; LZ4_stream_t t; } t_a;
+     return sizeof(t_a) - sizeof(LZ4_stream_t);
+ #else
+     return 1;  /* effectively disabled */
+@@ -1457,23 +1521,25 @@ void LZ4_resetStream (LZ4_stream_t* LZ4_
+     DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream);
+     MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal));
+ }
+ 
+ void LZ4_resetStream_fast(LZ4_stream_t* ctx) {
+     LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32);
+ }
+ 
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
+ {
+     if (!LZ4_stream) return 0;   /* support free on NULL */
+     DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream);
+     FREEMEM(LZ4_stream);
+     return (0);
+ }
++#endif
+ 
+ 
+ #define HASH_UNIT sizeof(reg_t)
+ int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
+ {
+     LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse;
+     const tableType_t tableType = byU32;
+     const BYTE* p = (const BYTE*)dictionary;
+@@ -1509,18 +1575,19 @@ int LZ4_loadDict (LZ4_stream_t* LZ4_dict
+     while (p <= dictEnd-HASH_UNIT) {
+         LZ4_putPosition(p, dict->hashTable, tableType, base);
+         p+=3;
+     }
+ 
+     return (int)dict->dictSize;
+ }
+ 
+-void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) {
+-    const LZ4_stream_t_internal* dictCtx = dictionaryStream == NULL ? NULL :
++void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream)
++{
++    const LZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? NULL :
+         &(dictionaryStream->internal_donotuse);
+ 
+     DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)",
+              workingStream, dictionaryStream,
+              dictCtx != NULL ? dictCtx->dictSize : 0);
+ 
+     if (dictCtx != NULL) {
+         /* If the current offset is zero, we will never look in the
+@@ -1563,46 +1630,50 @@ static void LZ4_renormDictT(LZ4_stream_t
+ 
+ 
+ int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream,
+                                 const char* source, char* dest,
+                                 int inputSize, int maxOutputSize,
+                                 int acceleration)
+ {
+     const tableType_t tableType = byU32;
+-    LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse;
+-    const BYTE* dictEnd = streamPtr->dictionary + streamPtr->dictSize;
++    LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse;
++    const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL;
+ 
+-    DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i)", inputSize);
++    DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize);
+ 
+-    LZ4_renormDictT(streamPtr, inputSize);   /* avoid index overflow */
++    LZ4_renormDictT(streamPtr, inputSize);   /* fix index overflow */
+     if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT;
+     if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX;
+ 
+     /* invalidate tiny dictionaries */
+-    if ( (streamPtr->dictSize-1 < 4-1)   /* intentional underflow */
+-      && (dictEnd != (const BYTE*)source) ) {
++    if ( (streamPtr->dictSize < 4)     /* tiny dictionary : not enough for a hash */
++      && (dictEnd != source)           /* prefix mode */
++      && (inputSize > 0)               /* tolerance : don't lose history, in case next invocation would use prefix mode */
++      && (streamPtr->dictCtx == NULL)  /* usingDictCtx */
++      ) {
+         DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary);
++        /* remove dictionary existence from history, to employ faster prefix mode */
+         streamPtr->dictSize = 0;
+         streamPtr->dictionary = (const BYTE*)source;
+-        dictEnd = (const BYTE*)source;
++        dictEnd = source;
+     }
+ 
+     /* Check overlapping input/dictionary space */
+-    {   const BYTE* sourceEnd = (const BYTE*) source + inputSize;
+-        if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) {
++    {   const char* const sourceEnd = source + inputSize;
++        if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) {
+             streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+             if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+             if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+-            streamPtr->dictionary = dictEnd - streamPtr->dictSize;
++            streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize;
+         }
+     }
+ 
+     /* prefix mode : source data follows dictionary */
+-    if (dictEnd == (const BYTE*)source) {
++    if (dictEnd == source) {
+         if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+             return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration);
+         else
+             return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration);
+     }
+ 
+     /* external dictionary mode */
+     {   int result;
+@@ -1618,17 +1689,17 @@ int LZ4_compress_fast_continue (LZ4_stre
+                  * cost to copy the dictionary's tables into the active context,
+                  * so that the compression loop is only looking into one table.
+                  */
+                 LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr));
+                 result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration);
+             } else {
+                 result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration);
+             }
+-        } else {
++        } else {  /* small data <= 4 KB */
+             if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) {
+                 result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration);
+             } else {
+                 result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration);
+             }
+         }
+         streamPtr->dictionary = (const BYTE*)source;
+         streamPtr->dictSize = (U32)inputSize;
+@@ -1656,77 +1727,205 @@ int LZ4_compress_forceExtDict (LZ4_strea
+ 
+     return result;
+ }
+ 
+ 
+ /*! LZ4_saveDict() :
+  *  If previously compressed data block is not guaranteed to remain available at its memory location,
+  *  save it into a safer place (char* safeBuffer).
+- *  Note : you don't need to call LZ4_loadDict() afterwards,
+- *         dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue().
+- *  Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error.
++ *  Note : no need to call LZ4_loadDict() afterwards, dictionary is immediately usable,
++ *         one can therefore call LZ4_compress_fast_continue() right after.
++ * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error.
+  */
+ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
+ {
+     LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse;
+-    const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize;
++
++    DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, safeBuffer);
+ 
+     if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */
+     if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; }
+ 
+     if (safeBuffer == NULL) assert(dictSize == 0);
+-    if (dictSize > 0)
+-        memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
++    if (dictSize > 0) {
++        const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize;
++        assert(dict->dictionary);
++        LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize);
++    }
+ 
+     dict->dictionary = (const BYTE*)safeBuffer;
+     dict->dictSize = (U32)dictSize;
+ 
+     return dictSize;
+ }
+ 
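/* A rough usage sketch of the streaming pattern LZ4_saveDict() serves:
 * dependent blocks are compressed with LZ4_compress_fast_continue(), and the
 * last 64 KB of history is moved into a stable buffer before the source
 * chunk is recycled.  CHUNK_SIZE and the two extern I/O helpers are
 * assumptions made for illustration only. */
#include "lz4.h"

#define CHUNK_SIZE (64 * 1024)

extern int  get_next_chunk(char* buf, int maxSize);            /* hypothetical input helper  */
extern void emit_compressed_block(const char* buf, int size);  /* hypothetical output helper */

int compress_chunks_sketch(void)
{
    static char src[CHUNK_SIZE];
    static char dst[LZ4_COMPRESSBOUND(CHUNK_SIZE)];
    static char dictBuffer[64 * 1024];   /* stable home for the 64 KB history window */
    LZ4_stream_t* const stream = LZ4_createStream();
    int srcSize;

    if (stream == NULL) return -1;
    while ((srcSize = get_next_chunk(src, CHUNK_SIZE)) > 0) {
        int const cSize = LZ4_compress_fast_continue(stream, src, dst, srcSize,
                                                     (int)sizeof(dst), 1);
        if (cSize <= 0) break;
        emit_compressed_block(dst, cSize);
        /* src is about to be reused : keep the dictionary window alive */
        (void)LZ4_saveDict(stream, dictBuffer, (int)sizeof(dictBuffer));
    }
    LZ4_freeStream(stream);
    return 0;
}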
+ 
+ 
+ /*-*******************************
+  *  Decompression functions
+  ********************************/
+ 
+-typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+ typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive;
+ 
+ #undef MIN
+ #define MIN(a,b)    ( (a) < (b) ? (a) : (b) )
+ 
++
++/* variant for decompress_unsafe()
++ * does not know end of input
++ * presumes input is well formed
++ * note : will consume at least one byte */
++size_t read_long_length_no_check(const BYTE** pp)
++{
++    size_t b, l = 0;
++    do { b = **pp; (*pp)++; l += b; } while (b==255);
++    DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1)
++    return l;
++}
++
++/* core decoder variant for LZ4_decompress_fast*()
++ * for legacy support only : these entry points are deprecated.
++ * - Presumes input is correctly formed (no defense vs malformed inputs)
++ * - Does not know input size (presume input buffer is "large enough")
++ * - Decompress a full block (only)
++ * @return : nb of bytes read from input.
++ * Note : this variant is not optimized for speed, just for maintenance.
++ *        the goal is to remove support of decompress_fast*() variants by v2.0
++**/
++LZ4_FORCE_INLINE int
++LZ4_decompress_unsafe_generic(
++                 const BYTE* const istart,
++                 BYTE* const ostart,
++                 int decompressedSize,
++
++                 size_t prefixSize,
++                 const BYTE* const dictStart,  /* only if dict==usingExtDict */
++                 const size_t dictSize         /* note: =0 if dictStart==NULL */
++                 )
++{
++    const BYTE* ip = istart;
++    BYTE* op = (BYTE*)ostart;
++    BYTE* const oend = ostart + decompressedSize;
++    const BYTE* const prefixStart = ostart - prefixSize;
++
++    DEBUGLOG(5, "LZ4_decompress_unsafe_generic");
++    if (dictStart == NULL) assert(dictSize == 0);
++
++    while (1) {
++        /* start new sequence */
++        unsigned token = *ip++;
++
++        /* literals */
++        {   size_t ll = token >> ML_BITS;
++            if (ll==15) {
++                /* long literal length */
++                ll += read_long_length_no_check(&ip);
++            }
++            if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */
++            LZ4_memmove(op, ip, ll); /* support in-place decompression */
++            op += ll;
++            ip += ll;
++            if ((size_t)(oend-op) < MFLIMIT) {
++                if (op==oend) break;  /* end of block */
++                DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op);
++                /* incorrect end of block :
++                 * last match must start at least MFLIMIT==12 bytes before end of output block */
++                return -1;
++        }   }
++
++        /* match */
++        {   size_t ml = token & 15;
++            size_t const offset = LZ4_readLE16(ip);
++            ip+=2;
++
++            if (ml==15) {
++                /* long match length */
++                ml += read_long_length_no_check(&ip);
++            }
++            ml += MINMATCH;
++
++            if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */
++
++            {   const BYTE* match = op - offset;
++
++                /* out of range */
++                if (offset > (size_t)(op - prefixStart) + dictSize) {
++                    DEBUGLOG(6, "offset out of range");
++                    return -1;
++                }
++
++                /* check special case : extDict */
++                if (offset > (size_t)(op - prefixStart)) {
++                    /* extDict scenario */
++                    const BYTE* const dictEnd = dictStart + dictSize;
++                    const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart));
++                    size_t const extml = (size_t)(dictEnd - extMatch);
++                    if (extml > ml) {
++                        /* match entirely within extDict */
++                        LZ4_memmove(op, extMatch, ml);
++                        op += ml;
++                        ml = 0;
++                    } else {
++                        /* match split between extDict & prefix */
++                        LZ4_memmove(op, extMatch, extml);
++                        op += extml;
++                        ml -= extml;
++                    }
++                    match = prefixStart;
++                }
++
++                /* match copy - slow variant, supporting overlap copy */
++                {   size_t u;
++                    for (u=0; u<ml; u++) {
++                        op[u] = match[u];
++            }   }   }
++            op += ml;
++            if ((size_t)(oend-op) < LASTLITERALS) {
++                DEBUGLOG(5, "invalid: match ends at distance %zi from end of block", oend-op);
++                /* incorrect end of block :
++                 * last match must stop at least LASTLITERALS==5 bytes before end of output block */
++                return -1;
++            }
++        } /* match */
++    } /* main loop */
++    return (int)(ip - istart);
++}
++
++
+ /* Read the variable-length literal or match length.
+  *
+- * ip - pointer to use as input.
+- * lencheck - end ip.  Return an error if ip advances >= lencheck.
+- * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
+- * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
+- * error (output) - error code.  Should be set to 0 before call.
+- */
+-typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error;
+-LZ4_FORCE_INLINE unsigned
+-read_variable_length(const BYTE**ip, const BYTE* lencheck,
+-                     int loop_check, int initial_check,
+-                     variable_length_error* error)
++ * @ip : input pointer
++ * @ilimit : position after which if length is not decoded, the input is necessarily corrupted.
++ * @initial_check - check ip >= ipmax before start of loop.  Returns initial_error if so.
++ * @error (output) - error code.  Must be set to 0 before call.
++**/
++typedef size_t Rvl_t;
++static const Rvl_t rvl_error = (Rvl_t)(-1);
++LZ4_FORCE_INLINE Rvl_t
++read_variable_length(const BYTE** ip, const BYTE* ilimit,
++                     int initial_check)
+ {
+-    U32 length = 0;
+-    U32 s;
+-    if (initial_check && unlikely((*ip) >= lencheck)) {    /* overflow detection */
+-        *error = initial_error;
+-        return length;
++    Rvl_t s, length = 0;
++    assert(ip != NULL);
++    assert(*ip !=  NULL);
++    assert(ilimit != NULL);
++    if (initial_check && unlikely((*ip) >= ilimit)) {    /* read limit reached */
++        return rvl_error;
+     }
+     do {
+         s = **ip;
+         (*ip)++;
+         length += s;
+-        if (loop_check && unlikely((*ip) >= lencheck)) {    /* overflow detection */
+-            *error = loop_error;
+-            return length;
++        if (unlikely((*ip) > ilimit)) {    /* read limit reached */
++            return rvl_error;
++        }
++        /* accumulator overflow detection (32-bit mode only) */
++        if ((sizeof(length)<8) && unlikely(length > ((Rvl_t)(-1)/2)) ) {
++            return rvl_error;
+         }
+     } while (s==255);
+ 
+     return length;
+ }
+ 
+ /*! LZ4_decompress_generic() :
+  *  This generic decompression function covers all use cases.
+@@ -1736,167 +1935,153 @@ read_variable_length(const BYTE**ip, con
+  */
+ LZ4_FORCE_INLINE int
+ LZ4_decompress_generic(
+                  const char* const src,
+                  char* const dst,
+                  int srcSize,
+                  int outputSize,         /* If endOnInput==endOnInputSize, this value is `dstCapacity` */
+ 
+-                 endCondition_directive endOnInput,   /* endOnOutputSize, endOnInputSize */
+                  earlyEnd_directive partialDecoding,  /* full, partial */
+                  dict_directive dict,                 /* noDict, withPrefix64k, usingExtDict */
+                  const BYTE* const lowPrefix,  /* always <= dst, == dst when no prefix */
+                  const BYTE* const dictStart,  /* only if dict==usingExtDict */
+                  const size_t dictSize         /* note : = 0 if noDict */
+                  )
+ {
+-    if (src == NULL) { return -1; }
++    if ((src == NULL) || (outputSize < 0)) { return -1; }
+ 
+     {   const BYTE* ip = (const BYTE*) src;
+         const BYTE* const iend = ip + srcSize;
+ 
+         BYTE* op = (BYTE*) dst;
+         BYTE* const oend = op + outputSize;
+         BYTE* cpy;
+ 
+         const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize;
+ 
+-        const int safeDecode = (endOnInput==endOnInputSize);
+-        const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
++        const int checkOffset = (dictSize < (int)(64 KB));
+ 
+ 
+         /* Set up the "end" pointers for the shortcut. */
+-        const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+-        const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
++        const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/;
++        const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/;
+ 
+         const BYTE* match;
+         size_t offset;
+         unsigned token;
+         size_t length;
+ 
+ 
+         DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize);
+ 
+         /* Special cases */
+         assert(lowPrefix <= op);
+-        if ((endOnInput) && (unlikely(outputSize==0))) {
++        if (unlikely(outputSize==0)) {
+             /* Empty output buffer */
+             if (partialDecoding) return 0;
+             return ((srcSize==1) && (*ip==0)) ? 0 : -1;
+         }
+-        if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 1 : -1); }
+-        if ((endOnInput) && unlikely(srcSize==0)) { return -1; }
++        if (unlikely(srcSize==0)) { return -1; }
+ 
+-	/* Currently the fast loop shows a regression on qualcomm arm chips. */
++    /* LZ4_FAST_DEC_LOOP:
++     * designed for modern OoO performance cpus,
++     * where reliably copying 32 bytes is preferable to an unpredictable branch.
++     * note : fast loop may show a regression for some client arm chips. */
+ #if LZ4_FAST_DEC_LOOP
+         if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
+             DEBUGLOG(6, "skip fast decode loop");
+             goto safe_decode;
+         }
+ 
+-        /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
++        /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */
+         while (1) {
+             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
+             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
+-            if (endOnInput) { assert(ip < iend); }
++            assert(ip < iend);
+             token = *ip++;
+             length = token >> ML_BITS;  /* literal length */
+ 
+-            assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
+-
+             /* decode literal length */
+             if (length == RUN_MASK) {
+-                variable_length_error error = ok;
+-                length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error);
+-                if (error == initial_error) { goto _output_error; }
+-                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
+-                if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
++                size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1);
++                if (addl == rvl_error) { goto _output_error; }
++                length += addl;
++                if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
++                if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
+ 
+                 /* copy literals */
+                 cpy = op+length;
+                 LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+-                if (endOnInput) {  /* LZ4_decompress_safe() */
+-                    if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; }
+-                    LZ4_wildCopy32(op, ip, cpy);
+-                } else {   /* LZ4_decompress_fast() */
+-                    if (cpy>oend-8) { goto safe_literal_copy; }
+-                    LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
+-                                                 * it doesn't know input length, and only relies on end-of-block properties */
+-                }
++                if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; }
++                LZ4_wildCopy32(op, ip, cpy);
+                 ip += length; op = cpy;
+             } else {
+                 cpy = op+length;
+-                if (endOnInput) {  /* LZ4_decompress_safe() */
+-                    DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length);
+-                    /* We don't need to check oend, since we check it once for each loop below */
+-                    if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; }
+-                    /* Literals can only be 14, but hope compilers optimize if we copy by a register size */
+-                    LZ4_memcpy(op, ip, 16);
+-                } else {  /* LZ4_decompress_fast() */
+-                    /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
+-                     * it doesn't know input length, and relies on end-of-block properties */
+-                    LZ4_memcpy(op, ip, 8);
+-                    if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); }
+-                }
++                DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length);
++                /* We don't need to check oend, since we check it once for each loop below */
++                if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; }
++                /* Literals can only be <= 14, but hope compilers optimize better when copying by a register size */
++                LZ4_memcpy(op, ip, 16);
+                 ip += length; op = cpy;
+             }
+ 
+             /* get offset */
+             offset = LZ4_readLE16(ip); ip+=2;
+             match = op - offset;
+-            assert(match <= op);
++            assert(match <= op);  /* overflow check */
+ 
+             /* get matchlength */
+             length = token & ML_MASK;
+ 
+             if (length == ML_MASK) {
+-                variable_length_error error = ok;
++                size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0);
++                if (addl == rvl_error) { goto _output_error; }
++                length += addl;
++                length += MINMATCH;
++                if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */
+                 if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */
+-                length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error);
+-                if (error != ok) { goto _output_error; }
+-                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */
+-                length += MINMATCH;
+                 if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+                     goto safe_match_copy;
+                 }
+             } else {
+                 length += MINMATCH;
+                 if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+                     goto safe_match_copy;
+                 }
+ 
+-                /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
++                /* Fastpath check: skip LZ4_wildCopy32 when true */
+                 if ((dict == withPrefix64k) || (match >= lowPrefix)) {
+                     if (offset >= 8) {
+                         assert(match >= lowPrefix);
+                         assert(match <= op);
+                         assert(op + 18 <= oend);
+ 
+                         LZ4_memcpy(op, match, 8);
+                         LZ4_memcpy(op+8, match+8, 8);
+                         LZ4_memcpy(op+16, match+16, 2);
+                         op += length;
+                         continue;
+             }   }   }
+ 
+             if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */
+             /* match starting within external dictionary */
+             if ((dict==usingExtDict) && (match < lowPrefix)) {
++                assert(dictEnd != NULL);
+                 if (unlikely(op+length > oend-LASTLITERALS)) {
+                     if (partialDecoding) {
+                         DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd");
+                         length = MIN(length, (size_t)(oend-op));
+                     } else {
+                         goto _output_error;  /* end-of-block condition violated */
+                 }   }
+ 
+                 if (length <= (size_t)(lowPrefix-match)) {
+                     /* match fits entirely within external dictionary : just copy */
+-                    memmove(op, dictEnd - (lowPrefix-match), length);
++                    LZ4_memmove(op, dictEnd - (lowPrefix-match), length);
+                     op += length;
+                 } else {
+                     /* match stretches into both external dictionary and current block */
+                     size_t const copySize = (size_t)(lowPrefix - match);
+                     size_t const restSize = length - copySize;
+                     LZ4_memcpy(op, dictEnd - copySize, copySize);
+                     op += copySize;
+                     if (restSize > (size_t)(op - lowPrefix)) {  /* overlap copy */
+@@ -1922,35 +2107,34 @@ LZ4_decompress_generic(
+ 
+             op = cpy;   /* wildcopy correction */
+         }
+     safe_decode:
+ #endif
+ 
+         /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */
+         while (1) {
++            assert(ip < iend);
+             token = *ip++;
+             length = token >> ML_BITS;  /* literal length */
+ 
+-            assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
+-
+             /* A two-stage shortcut for the most common case:
+              * 1) If the literal length is 0..14, and there is enough space,
+              * enter the shortcut and copy 16 bytes on behalf of the literals
+              * (in the fast mode, only 8 bytes can be safely copied this way).
+              * 2) Further if the match length is 4..18, copy 18 bytes in a similar
+              * manner; but we ensure that there's enough space in the output for
+              * those 18 bytes earlier, upon entering the shortcut (in other words,
+              * there is a combined check for both stages).
+              */
+-            if ( (endOnInput ? length != RUN_MASK : length <= 8)
++            if ( (length != RUN_MASK)
+                 /* strictly "less than" on input, to re-enter the loop with at least one byte */
+-              && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
++              && likely((ip < shortiend) & (op <= shortoend)) ) {
+                 /* Copy the literals */
+-                LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
++                LZ4_memcpy(op, ip, 16);
+                 op += length; ip += length;
+ 
+                 /* The second stage: prepare for match copying, decode full info.
+                  * If it doesn't work out, the info won't be wasted. */
+                 length = token & ML_MASK; /* match length */
+                 offset = LZ4_readLE16(ip); ip += 2;
+                 match = op - offset;
+                 assert(match <= op); /* check overflow */
+@@ -1970,42 +2154,39 @@ LZ4_decompress_generic(
+ 
+                 /* The second stage didn't work out, but the info is ready.
+                  * Propel it right to the point of match copying. */
+                 goto _copy_match;
+             }
+ 
+             /* decode literal length */
+             if (length == RUN_MASK) {
+-                variable_length_error error = ok;
+-                length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error);
+-                if (error == initial_error) { goto _output_error; }
+-                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
+-                if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
++                size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1);
++                if (addl == rvl_error) { goto _output_error; }
++                length += addl;
++                if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
++                if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
+             }
+ 
+             /* copy literals */
+             cpy = op+length;
+ #if LZ4_FAST_DEC_LOOP
+         safe_literal_copy:
+ #endif
+             LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+-            if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) )
+-              || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) )
+-            {
++            if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) {
+                 /* We've either hit the input parsing restriction or the output parsing restriction.
+                  * In the normal scenario, decoding a full block, it must be the last sequence,
+                  * otherwise it's an error (invalid input or dimensions).
+                  * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow.
+                  */
+                 if (partialDecoding) {
+                     /* Since we are partial decoding we may be in this block because of the output parsing
+                      * restriction, which is not valid since the output buffer is allowed to be undersized.
+                      */
+-                    assert(endOnInput);
+                     DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end")
+                     DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length);
+                     DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op));
+                     DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip));
+                     /* Finishing in the middle of a literals segment,
+                      * due to lack of input.
+                      */
+                     if (ip+length > iend) {
+@@ -2016,76 +2197,73 @@ LZ4_decompress_generic(
+                      * due to lack of output space.
+                      */
+                     if (cpy > oend) {
+                         cpy = oend;
+                         assert(op<=oend);
+                         length = (size_t)(oend-op);
+                     }
+                 } else {
+-                    /* We must be on the last sequence because of the parsing limitations so check
+-                     * that we exactly regenerate the original size (must be exact when !endOnInput).
+-                     */
+-                    if ((!endOnInput) && (cpy != oend)) { goto _output_error; }
+                      /* We must be on the last sequence (or invalid) because of the parsing limitations
+                       * so check that we exactly consume the input and don't overrun the output buffer.
+                       */
+-                    if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) {
++                    if ((ip+length != iend) || (cpy > oend)) {
+                         DEBUGLOG(6, "should have been last run of literals")
+                         DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend);
+                         DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend);
+                         goto _output_error;
+                     }
+                 }
+-                memmove(op, ip, length);  /* supports overlapping memory regions; only matters for in-place decompression scenarios */
++                LZ4_memmove(op, ip, length);  /* supports overlapping memory regions, for in-place decompression scenarios */
+                 ip += length;
+                 op += length;
+                 /* Necessarily EOF when !partialDecoding.
+                  * When partialDecoding, it is EOF if we've either
+                  * filled the output buffer or
+                  * can't proceed with reading an offset for following match.
+                  */
+                 if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) {
+                     break;
+                 }
+             } else {
+-                LZ4_wildCopy8(op, ip, cpy);   /* may overwrite up to WILDCOPYLENGTH beyond cpy */
++                LZ4_wildCopy8(op, ip, cpy);   /* can overwrite up to 8 bytes beyond cpy */
+                 ip += length; op = cpy;
+             }
+ 
+             /* get offset */
+             offset = LZ4_readLE16(ip); ip+=2;
+             match = op - offset;
+ 
+             /* get matchlength */
+             length = token & ML_MASK;
+ 
+     _copy_match:
+             if (length == ML_MASK) {
+-              variable_length_error error = ok;
+-              length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error);
+-              if (error != ok) goto _output_error;
+-                if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error;   /* overflow detection */
++                size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0);
++                if (addl == rvl_error) { goto _output_error; }
++                length += addl;
++                if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error;   /* overflow detection */
+             }
+             length += MINMATCH;
+ 
+ #if LZ4_FAST_DEC_LOOP
+         safe_match_copy:
+ #endif
+             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error;   /* Error : offset outside buffers */
+             /* match starting within external dictionary */
+             if ((dict==usingExtDict) && (match < lowPrefix)) {
++                assert(dictEnd != NULL);
+                 if (unlikely(op+length > oend-LASTLITERALS)) {
+                     if (partialDecoding) length = MIN(length, (size_t)(oend-op));
+                     else goto _output_error;   /* doesn't respect parsing restriction */
+                 }
+ 
+                 if (length <= (size_t)(lowPrefix-match)) {
+                     /* match fits entirely within external dictionary : just copy */
+-                    memmove(op, dictEnd - (lowPrefix-match), length);
++                    LZ4_memmove(op, dictEnd - (lowPrefix-match), length);
+                     op += length;
+                 } else {
+                     /* match stretches into both external dictionary and current block */
+                     size_t const copySize = (size_t)(lowPrefix - match);
+                     size_t const restSize = length - copySize;
+                     LZ4_memcpy(op, dictEnd - copySize, copySize);
+                     op += copySize;
+                     if (restSize > (size_t)(op - lowPrefix)) {  /* overlap copy */
+@@ -2146,152 +2324,176 @@ LZ4_decompress_generic(
+             } else {
+                 LZ4_memcpy(op, match, 8);
+                 if (length > 16)  { LZ4_wildCopy8(op+8, match+8, cpy); }
+             }
+             op = cpy;   /* wildcopy correction */
+         }
+ 
+         /* end of decoding */
+-        if (endOnInput) {
+-            DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst));
+-           return (int) (((char*)op)-dst);     /* Nb of output bytes decoded */
+-       } else {
+-           return (int) (((const char*)ip)-src);   /* Nb of input bytes read */
+-       }
++        DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst));
++        return (int) (((char*)op)-dst);     /* Nb of output bytes decoded */
+ 
+         /* Overflow error detected */
+     _output_error:
+         return (int) (-(((const char*)ip)-src))-1;
+     }
+ }
+ 
+ 
+ /*===== Instantiate the API decoding functions. =====*/
+ 
+ LZ4_FORCE_O2
+ int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
+ {
+     return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize,
+-                                  endOnInputSize, decode_full_block, noDict,
++                                  decode_full_block, noDict,
+                                   (BYTE*)dest, NULL, 0);
+ }
+ 
+ LZ4_FORCE_O2
+ int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity)
+ {
+     dstCapacity = MIN(targetOutputSize, dstCapacity);
+     return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity,
+-                                  endOnInputSize, partial_decode,
++                                  partial_decode,
+                                   noDict, (BYTE*)dst, NULL, 0);
+ }
+ 
+ LZ4_FORCE_O2
+ int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
+ {
+-    return LZ4_decompress_generic(source, dest, 0, originalSize,
+-                                  endOnOutputSize, decode_full_block, withPrefix64k,
+-                                  (BYTE*)dest - 64 KB, NULL, 0);
++    DEBUGLOG(5, "LZ4_decompress_fast");
++    return LZ4_decompress_unsafe_generic(
++                (const BYTE*)source, (BYTE*)dest, originalSize,
++                0, NULL, 0);
+ }
+ 
+ /*===== Instantiate a few more decoding cases, used more than once. =====*/
+ 
+ LZ4_FORCE_O2 /* Exported, an obsolete API function. */
+ int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
+ {
+     return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+-                                  endOnInputSize, decode_full_block, withPrefix64k,
++                                  decode_full_block, withPrefix64k,
++                                  (BYTE*)dest - 64 KB, NULL, 0);
++}
++
++LZ4_FORCE_O2
++static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity)
++{
++    dstCapacity = MIN(targetOutputSize, dstCapacity);
++    return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity,
++                                  partial_decode, withPrefix64k,
+                                   (BYTE*)dest - 64 KB, NULL, 0);
+ }
+ 
+ /* Another obsolete API function, paired with the previous one. */
+ int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
+ {
+-    /* LZ4_decompress_fast doesn't validate match offsets,
+-     * and thus serves well with any prefixed dictionary. */
+-    return LZ4_decompress_fast(source, dest, originalSize);
++    return LZ4_decompress_unsafe_generic(
++                (const BYTE*)source, (BYTE*)dest, originalSize,
++                64 KB, NULL, 0);
+ }
+ 
+ LZ4_FORCE_O2
+ static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize,
+                                                size_t prefixSize)
+ {
+     return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+-                                  endOnInputSize, decode_full_block, noDict,
++                                  decode_full_block, noDict,
++                                  (BYTE*)dest-prefixSize, NULL, 0);
++}
++
++LZ4_FORCE_O2
++static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity,
++                                               size_t prefixSize)
++{
++    dstCapacity = MIN(targetOutputSize, dstCapacity);
++    return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity,
++                                  partial_decode, noDict,
+                                   (BYTE*)dest-prefixSize, NULL, 0);
+ }
+ 
+ LZ4_FORCE_O2
+ int LZ4_decompress_safe_forceExtDict(const char* source, char* dest,
+                                      int compressedSize, int maxOutputSize,
+                                      const void* dictStart, size_t dictSize)
+ {
+     return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+-                                  endOnInputSize, decode_full_block, usingExtDict,
++                                  decode_full_block, usingExtDict,
++                                  (BYTE*)dest, (const BYTE*)dictStart, dictSize);
++}
++
++LZ4_FORCE_O2
++int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest,
++                                     int compressedSize, int targetOutputSize, int dstCapacity,
++                                     const void* dictStart, size_t dictSize)
++{
++    dstCapacity = MIN(targetOutputSize, dstCapacity);
++    return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity,
++                                  partial_decode, usingExtDict,
+                                   (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+ }
+ 
+ LZ4_FORCE_O2
+ static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize,
+                                        const void* dictStart, size_t dictSize)
+ {
+-    return LZ4_decompress_generic(source, dest, 0, originalSize,
+-                                  endOnOutputSize, decode_full_block, usingExtDict,
+-                                  (BYTE*)dest, (const BYTE*)dictStart, dictSize);
++    return LZ4_decompress_unsafe_generic(
++                (const BYTE*)source, (BYTE*)dest, originalSize,
++                0, (const BYTE*)dictStart, dictSize);
+ }
+ 
+ /* The "double dictionary" mode, for use with e.g. ring buffers: the first part
+  * of the dictionary is passed as prefix, and the second via dictStart + dictSize.
+  * These routines are used only once, in LZ4_decompress_*_continue().
+  */
+ LZ4_FORCE_INLINE
+ int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize,
+                                    size_t prefixSize, const void* dictStart, size_t dictSize)
+ {
+     return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+-                                  endOnInputSize, decode_full_block, usingExtDict,
+-                                  (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize);
+-}
+-
+-LZ4_FORCE_INLINE
+-int LZ4_decompress_fast_doubleDict(const char* source, char* dest, int originalSize,
+-                                   size_t prefixSize, const void* dictStart, size_t dictSize)
+-{
+-    return LZ4_decompress_generic(source, dest, 0, originalSize,
+-                                  endOnOutputSize, decode_full_block, usingExtDict,
++                                  decode_full_block, usingExtDict,
+                                   (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize);
+ }
+ 
+ /*===== streaming decompression functions =====*/
+ 
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ LZ4_streamDecode_t* LZ4_createStreamDecode(void)
+ {
+-    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t));
+-    LZ4_STATIC_ASSERT(LZ4_STREAMDECODESIZE >= sizeof(LZ4_streamDecode_t_internal));    /* A compilation error here means LZ4_STREAMDECODESIZE is not large enough */
+-    return lz4s;
++    LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= sizeof(LZ4_streamDecode_t_internal));
++    return (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t));
+ }
+ 
+ int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
+ {
+     if (LZ4_stream == NULL) { return 0; }  /* support free on NULL */
+     FREEMEM(LZ4_stream);
+     return 0;
+ }
++#endif
+ 
+ /*! LZ4_setStreamDecode() :
+  *  Use this function to instruct where to find the dictionary.
+  *  This function is not necessary if previous data is still available where it was decoded.
+  *  Loading a size of 0 is allowed (same effect as no dictionary).
+  * @return : 1 if OK, 0 if error
+  */
+ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
+ {
+     LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse;
+-    lz4sd->prefixSize = (size_t) dictSize;
+-    lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
++    lz4sd->prefixSize = (size_t)dictSize;
++    if (dictSize) {
++        assert(dictionary != NULL);
++        lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
++    } else {
++        lz4sd->prefixEnd = (const BYTE*) dictionary;
++    }
+     lz4sd->externalDict = NULL;
+     lz4sd->extDictSize  = 0;
+     return 1;
+ }
+ 
+ /*! LZ4_decoderRingBufferSize() :
+  *  when setting a ring buffer for streaming decompression (optional scenario),
+  *  provides the minimum size of this ring buffer
+@@ -2353,39 +2555,45 @@ int LZ4_decompress_safe_continue (LZ4_st
+         if (result <= 0) return result;
+         lz4sd->prefixSize = (size_t)result;
+         lz4sd->prefixEnd  = (BYTE*)dest + result;
+     }
+ 
+     return result;
+ }
+ 
+-LZ4_FORCE_O2
+-int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
++LZ4_FORCE_O2 int
++LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode,
++                        const char* source, char* dest, int originalSize)
+ {
+-    LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse;
++    LZ4_streamDecode_t_internal* const lz4sd =
++        (assert(LZ4_streamDecode!=NULL), &LZ4_streamDecode->internal_donotuse);
+     int result;
++
++    DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize);
+     assert(originalSize >= 0);
+ 
+     if (lz4sd->prefixSize == 0) {
++        DEBUGLOG(5, "first invocation : no prefix nor extDict");
+         assert(lz4sd->extDictSize == 0);
+         result = LZ4_decompress_fast(source, dest, originalSize);
+         if (result <= 0) return result;
+         lz4sd->prefixSize = (size_t)originalSize;
+         lz4sd->prefixEnd = (BYTE*)dest + originalSize;
+     } else if (lz4sd->prefixEnd == (BYTE*)dest) {
+-        if (lz4sd->prefixSize >= 64 KB - 1 || lz4sd->extDictSize == 0)
+-            result = LZ4_decompress_fast(source, dest, originalSize);
+-        else
+-            result = LZ4_decompress_fast_doubleDict(source, dest, originalSize,
+-                                                    lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
++        DEBUGLOG(5, "continue using existing prefix");
++        result = LZ4_decompress_unsafe_generic(
++                        (const BYTE*)source, (BYTE*)dest, originalSize,
++                        lz4sd->prefixSize,
++                        lz4sd->externalDict, lz4sd->extDictSize);
+         if (result <= 0) return result;
+         lz4sd->prefixSize += (size_t)originalSize;
+         lz4sd->prefixEnd  += originalSize;
+     } else {
++        DEBUGLOG(5, "prefix becomes extDict");
+         lz4sd->extDictSize = lz4sd->prefixSize;
+         lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
+         result = LZ4_decompress_fast_extDict(source, dest, originalSize,
+                                              lz4sd->externalDict, lz4sd->extDictSize);
+         if (result <= 0) return result;
+         lz4sd->prefixSize = (size_t)originalSize;
+         lz4sd->prefixEnd  = (BYTE*)dest + originalSize;
+     }
+@@ -2411,20 +2619,37 @@ int LZ4_decompress_safe_usingDict(const 
+         }
+         assert(dictSize >= 0);
+         return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize);
+     }
+     assert(dictSize >= 0);
+     return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize);
+ }
+ 
++int LZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize)
++{
++    if (dictSize==0)
++        return LZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity);
++    if (dictStart+dictSize == dest) {
++        if (dictSize >= 64 KB - 1) {
++            return LZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity);
++        }
++        assert(dictSize >= 0);
++        return LZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize);
++    }
++    assert(dictSize >= 0);
++    return LZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize);
++}
++
+ int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
+ {
+     if (dictSize==0 || dictStart+dictSize == dest)
+-        return LZ4_decompress_fast(source, dest, originalSize);
++        return LZ4_decompress_unsafe_generic(
++                        (const BYTE*)source, (BYTE*)dest, originalSize,
++                        (size_t)dictSize, NULL, 0);
+     assert(dictSize >= 0);
+     return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize);
+ }
+ 
+ 
+ /*=*************************************************
+ *  Obsolete Functions
+ ***************************************************/
+@@ -2466,30 +2691,32 @@ int LZ4_uncompress (const char* source, 
+ }
+ int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize)
+ {
+     return LZ4_decompress_safe(source, dest, isize, maxOutputSize);
+ }
+ 
+ /* Obsolete Streaming functions */
+ 
+-int LZ4_sizeofStreamState(void) { return LZ4_STREAMSIZE; }
++int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); }
+ 
+ int LZ4_resetStreamState(void* state, char* inputBuffer)
+ {
+     (void)inputBuffer;
+     LZ4_resetStream((LZ4_stream_t*)state);
+     return 0;
+ }
+ 
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ void* LZ4_create (char* inputBuffer)
+ {
+     (void)inputBuffer;
+     return LZ4_createStream();
+ }
++#endif
+ 
+ char* LZ4_slideInputBuffer (void* state)
+ {
+     /* avoid const char * -> char * conversion warning */
+     return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary;
+ }
+ 
+ #endif   /* LZ4_COMMONDEFS_ONLY */
+diff --git a/mfbt/lz4/lz4.h b/mfbt/lz4/lz4.h
+--- a/mfbt/lz4/lz4.h
++++ b/mfbt/lz4/lz4.h
+@@ -1,12 +1,12 @@
+ /*
+  *  LZ4 - Fast LZ compression algorithm
+  *  Header File
+- *  Copyright (C) 2011-present, Yann Collet.
++ *  Copyright (C) 2011-2020, Yann Collet.
+ 
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ 
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+ 
+        * Redistributions of source code must retain the above copyright
+@@ -92,46 +92,87 @@ extern "C" {
+ #if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
+ #  define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY
+ #elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
+ #  define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+ #else
+ #  define LZ4LIB_API LZ4LIB_VISIBILITY
+ #endif
+ 
++/*! LZ4_FREESTANDING :
++ *  When this macro is set to 1, it enables "freestanding mode" that is
++ *  suitable for a typical freestanding environment which doesn't support
++ *  the standard C library.
++ *
++ *  - LZ4_FREESTANDING is a compile-time switch.
++ *  - It requires the following macros to be defined:
++ *    LZ4_memcpy, LZ4_memmove, LZ4_memset.
++ *  - It only enables LZ4/HC functions which don't use heap.
++ *    All LZ4F_* functions are not supported.
++ *  - See tests/freestanding.c to check its basic setup.
++ */
++#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1)
++#  define LZ4_HEAPMODE 0
++#  define LZ4HC_HEAPMODE 0
++#  define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1
++#  if !defined(LZ4_memcpy)
++#    error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'."
++#  endif
++#  if !defined(LZ4_memset)
++#    error "LZ4_FREESTANDING requires macro 'LZ4_memset'."
++#  endif
++#  if !defined(LZ4_memmove)
++#    error "LZ4_FREESTANDING requires macro 'LZ4_memmove'."
++#  endif
++#elif ! defined(LZ4_FREESTANDING)
++#  define LZ4_FREESTANDING 0
++#endif
++
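/* A hedged configuration sketch for the LZ4_FREESTANDING contract described
 * above, e.g. in a wrapper header (or equivalent -D flags) used when building
 * lz4.c for a bare-metal target.  Only the LZ4_* macro names are mandated by
 * this header; my_memcpy/my_memmove/my_memset stand in for whatever routines
 * the target environment actually provides. */
#define LZ4_FREESTANDING 1
#define LZ4_memcpy(dst, src, n)   my_memcpy((dst), (src), (n))
#define LZ4_memmove(dst, src, n)  my_memmove((dst), (src), (n))
#define LZ4_memset(p, value, n)   my_memset((p), (value), (n))
#include "lz4.h"   /* allocation-based entry points are disabled, as documented above */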
++
+ /*------   Version   ------*/
+ #define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
+ #define LZ4_VERSION_MINOR    9    /* for new (non-breaking) interface capabilities */
+-#define LZ4_VERSION_RELEASE  3    /* for tweaks, bug-fixes, or development */
++#define LZ4_VERSION_RELEASE  4    /* for tweaks, bug-fixes, or development */
+ 
+ #define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
+ 
+ #define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE
+ #define LZ4_QUOTE(str) #str
+ #define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str)
+-#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION)
++#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION)  /* requires v1.7.3+ */
+ 
+-LZ4LIB_API int LZ4_versionNumber (void);  /**< library version number; useful to check dll version */
+-LZ4LIB_API const char* LZ4_versionString (void);   /**< library version string; useful to check dll version */
++LZ4LIB_API int LZ4_versionNumber (void);  /**< library version number; useful to check dll version; requires v1.3.0+ */
++LZ4LIB_API const char* LZ4_versionString (void);   /**< library version string; useful to check dll version; requires v1.7.5+ */
+ 
+ 
+ /*-************************************
+ *  Tuning parameter
+ **************************************/
++#define LZ4_MEMORY_USAGE_MIN 10
++#define LZ4_MEMORY_USAGE_DEFAULT 14
++#define LZ4_MEMORY_USAGE_MAX 20
++
+ /*!
+  * LZ4_MEMORY_USAGE :
+- * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+- * Increasing memory usage improves compression ratio.
+- * Reduced memory usage may improve speed, thanks to better cache locality.
++ * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; )
++ * Increasing memory usage improves compression ratio, at the cost of speed.
++ * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality.
+  * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+  */
+ #ifndef LZ4_MEMORY_USAGE
+-# define LZ4_MEMORY_USAGE 14
++# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT
+ #endif
+ 
++#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN)
++#  error "LZ4_MEMORY_USAGE is too small !"
++#endif
++
++#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX)
++#  error "LZ4_MEMORY_USAGE is too large !"
++#endif
+ 
+ /*-************************************
+ *  Simple Functions
+ **************************************/
+ /*! LZ4_compress_default() :
+  *  Compresses 'srcSize' bytes from buffer 'src'
+  *  into already allocated 'dst' buffer of size 'dstCapacity'.
+  *  Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize).
+@@ -265,18 +306,35 @@ LZ4LIB_API int LZ4_compress_destSize (co
+ LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity);
+ 
+ 
+ /*-*********************************************
+ *  Streaming Compression Functions
+ ***********************************************/
+ typedef union LZ4_stream_u LZ4_stream_t;  /* incomplete type (defined later) */
+ 
++/**
++ Note about RC_INVOKED
++
++ - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio).
++   https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros
++
++ - Since rc.exe is a legacy compiler, it truncates long symbols (> 30 chars)
++   and reports warning "RC4011: identifier truncated".
++
++ - To eliminate the warning, we surround long preprocessor symbols with a
++   "#if !defined(RC_INVOKED) ... #endif" block that means
++   "skip this block when rc.exe is trying to read it".
++*/
++#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ LZ4LIB_API LZ4_stream_t* LZ4_createStream(void);
+ LZ4LIB_API int           LZ4_freeStream (LZ4_stream_t* streamPtr);
++#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */
++#endif
+ 
+ /*! LZ4_resetStream_fast() : v1.9.0+
+  *  Use this to prepare an LZ4_stream_t for a new chain of dependent blocks
+  *  (e.g., LZ4_compress_fast_continue()).
+  *
+  *  An LZ4_stream_t must be initialized once before usage.
+  *  This is automatically done when created by LZ4_createStream().
+  *  However, should the LZ4_stream_t be simply declared on stack (for example),
+@@ -350,18 +408,22 @@ LZ4LIB_API int LZ4_saveDict (LZ4_stream_
+ *  Bufferless synchronous API
+ ************************************************/
+ typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* tracking context */
+ 
+ /*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() :
+  *  creation / destruction of streaming decompression tracking context.
+  *  A tracking context can be re-used multiple times.
+  */
++#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+ LZ4LIB_API int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
++#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */
++#endif
+ 
+ /*! LZ4_setStreamDecode() :
+  *  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
+  *  Use this function to start decompression of a new stream of blocks.
+  *  A dictionary can optionally be set. Use NULL or size 0 for a reset order.
+  *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
+  * @return : 1 if OK, 0 if error
+  */
+@@ -401,28 +463,40 @@ LZ4LIB_API int LZ4_decoderRingBufferSize
+  *  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+  *    In which case, encoding and decoding buffers do not need to be synchronized,
+  *    and encoding ring buffer can have any size, including small ones ( < 64 KB).
+  *
+  *  Whenever these conditions are not possible,
+  *  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+  *  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
+ */
+-LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity);
++LZ4LIB_API int
++LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode,
++                        const char* src, char* dst,
++                        int srcSize, int dstCapacity);
+ 
+ 
+ /*! LZ4_decompress_*_usingDict() :
+  *  These decoding functions work the same as
+  *  a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
+  *  They are stand-alone, and don't need an LZ4_streamDecode_t structure.
+  *  Dictionary is presumed stable : it must remain accessible and unmodified during decompression.
+  *  Performance tip : Decompression speed can be substantially increased
+  *                    when dst == dictStart + dictSize.
+  */
+-LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize);
++LZ4LIB_API int
++LZ4_decompress_safe_usingDict(const char* src, char* dst,
++                              int srcSize, int dstCapacity,
++                              const char* dictStart, int dictSize);
++
++LZ4LIB_API int
++LZ4_decompress_safe_partial_usingDict(const char* src, char* dst,
++                                      int compressedSize,
++                                      int targetOutputSize, int maxOutputSize,
++                                      const char* dictStart, int dictSize);
+ 
+ #endif /* LZ4_H_2983827168210 */
+ 
+ 
+ /*^*************************************
+  * !!!!!!   STATIC LINKING ONLY   !!!!!!
+  ***************************************/
+ 
+@@ -491,23 +565,25 @@ LZ4LIB_STATIC_API int LZ4_compress_fast_
+  *  logically immediately precede the data compressed in the first subsequent
+  *  compression call.
+  *
+  *  The dictionary will only remain attached to the working stream through the
+  *  first compression call, at the end of which it is cleared. The dictionary
+  *  stream (and source buffer) must remain in-place / accessible / unchanged
+  *  through the completion of the first compression call on the stream.
+  */
+-LZ4LIB_STATIC_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream);
++LZ4LIB_STATIC_API void
++LZ4_attach_dictionary(LZ4_stream_t* workingStream,
++                const LZ4_stream_t* dictionaryStream);
+ 
+ 
+ /*! In-place compression and decompression
+  *
+  * It's possible to have input and output sharing the same buffer,
+- * for highly contrained memory environments.
++ * for highly constrained memory environments.
+  * In both cases, it requires input to lay at the end of the buffer,
+  * and decompression to start at beginning of the buffer.
+  * Buffer size must feature some margin, hence be larger than final size.
+  *
+  * |<------------------------buffer--------------------------------->|
+  *                             |<-----------compressed data--------->|
+  * |<-----------decompressed size------------------>|
+  *                                                  |<----margin---->|
+@@ -587,48 +663,36 @@ LZ4LIB_STATIC_API void LZ4_attach_dictio
+   typedef uint32_t LZ4_u32;
+ #else
+   typedef   signed char  LZ4_i8;
+   typedef unsigned char  LZ4_byte;
+   typedef unsigned short LZ4_u16;
+   typedef unsigned int   LZ4_u32;
+ #endif
+ 
++/*! LZ4_stream_t :
++ *  Never ever use below internal definitions directly !
++ *  These definitions are not API/ABI safe, and may change in future versions.
++ *  If you need static allocation, declare or allocate an LZ4_stream_t object.
++**/
++
+ typedef struct LZ4_stream_t_internal LZ4_stream_t_internal;
+ struct LZ4_stream_t_internal {
+     LZ4_u32 hashTable[LZ4_HASH_SIZE_U32];
++    const LZ4_byte* dictionary;
++    const LZ4_stream_t_internal* dictCtx;
+     LZ4_u32 currentOffset;
+     LZ4_u32 tableType;
+-    const LZ4_byte* dictionary;
+-    const LZ4_stream_t_internal* dictCtx;
+     LZ4_u32 dictSize;
++    /* Implicit padding to ensure structure is aligned */
+ };
+ 
+-typedef struct {
+-    const LZ4_byte* externalDict;
+-    size_t extDictSize;
+-    const LZ4_byte* prefixEnd;
+-    size_t prefixSize;
+-} LZ4_streamDecode_t_internal;
+-
+-
+-/*! LZ4_stream_t :
+- *  Do not use below internal definitions directly !
+- *  Declare or allocate an LZ4_stream_t instead.
+- *  LZ4_stream_t can also be created using LZ4_createStream(), which is recommended.
+- *  The structure definition can be convenient for static allocation
+- *  (on stack, or as part of larger structure).
+- *  Init this structure with LZ4_initStream() before first use.
+- *  note : only use this definition in association with static linking !
+- *  this definition is not API/ABI safe, and may change in future versions.
+- */
+-#define LZ4_STREAMSIZE       16416  /* static size, for inter-version compatibility */
+-#define LZ4_STREAMSIZE_VOIDP (LZ4_STREAMSIZE / sizeof(void*))
++#define LZ4_STREAM_MINSIZE  ((1UL << LZ4_MEMORY_USAGE) + 32)  /* static size, for inter-version compatibility */
+ union LZ4_stream_u {
+-    void* table[LZ4_STREAMSIZE_VOIDP];
++    char minStateSize[LZ4_STREAM_MINSIZE];
+     LZ4_stream_t_internal internal_donotuse;
+ }; /* previously typedef'd to LZ4_stream_t */
+ 
+ 
+ /*! LZ4_initStream() : v1.9.0+
+  *  An LZ4_stream_t structure must be initialized at least once.
+  *  This is automatically done when invoking LZ4_createStream(),
+  *  but it's not when the structure is simply declared on stack (for example).
+@@ -636,31 +700,35 @@ union LZ4_stream_u {
+  *  Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t.
+  *  It can also initialize any arbitrary buffer of sufficient size,
+  *  and will @return a pointer of proper type upon initialization.
+  *
+  *  Note : initialization fails if size and alignment conditions are not respected.
+  *         In which case, the function will @return NULL.
+  *  Note2: An LZ4_stream_t structure guarantees correct alignment and size.
+  *  Note3: Before v1.9.0, use LZ4_resetStream() instead
+- */
++**/
+ LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size);
+ 
+ 
+ /*! LZ4_streamDecode_t :
+- *  information structure to track an LZ4 stream during decompression.
+- *  init this structure  using LZ4_setStreamDecode() before first use.
+- *  note : only use in association with static linking !
+- *         this definition is not API/ABI safe,
+- *         and may change in a future version !
+- */
+-#define LZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ )
+-#define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
++ *  Never ever use below internal definitions directly !
++ *  These definitions are not API/ABI safe, and may change in future versions.
++ *  If you need static allocation, declare or allocate an LZ4_streamDecode_t object.
++**/
++typedef struct {
++    const LZ4_byte* externalDict;
++    const LZ4_byte* prefixEnd;
++    size_t extDictSize;
++    size_t prefixSize;
++} LZ4_streamDecode_t_internal;
++
++#define LZ4_STREAMDECODE_MINSIZE 32
+ union LZ4_streamDecode_u {
+-    unsigned long long table[LZ4_STREAMDECODESIZE_U64];
++    char minStateSize[LZ4_STREAMDECODE_MINSIZE];
+     LZ4_streamDecode_t_internal internal_donotuse;
+ } ;   /* previously typedef'd to LZ4_streamDecode_t */
+ 
+ 
+ 
+ /*-************************************
+ *  Obsolete Functions
+ **************************************/
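[Editorial note, not part of the patch] The lz4.h changes above replace the old LZ4_STREAMSIZE layout with an LZ4_STREAM_MINSIZE-based union, but static (stack) allocation plus LZ4_initStream() still works the same way. A minimal sketch under that assumption; the buffer sizes and the two-block chaining below are illustrative only:

    /* Sketch: stack-allocate an LZ4_stream_t and chain two linked blocks.
     * LZ4_initStream() is mandatory for a stream that was not created by
     * LZ4_createStream(). */
    #include "lz4.h"

    static int compress_two_chained_blocks(const char* blockA, int sizeA,
                                           const char* blockB, int sizeB,
                                           char* dst, int dstCapacity)
    {
        LZ4_stream_t state;                                 /* static allocation */
        if (LZ4_initStream(&state, sizeof(state)) == NULL)  /* size/alignment check */
            return -1;

        /* First block; the second block may reference it (linked blocks),
         * so blockA must stay accessible until the second call returns. */
        int w1 = LZ4_compress_fast_continue(&state, blockA, dst,
                                            sizeA, dstCapacity, 1);
        if (w1 <= 0) return -1;

        int w2 = LZ4_compress_fast_continue(&state, blockB, dst + w1,
                                            sizeB, dstCapacity - w1, 1);
        if (w2 <= 0) return -1;
        return w1 + w2;
    }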
+diff --git a/mfbt/lz4/lz4file.c b/mfbt/lz4/lz4file.c
+new file mode 100644
+--- /dev/null
++++ b/mfbt/lz4/lz4file.c
+@@ -0,0 +1,311 @@
++/*
++ * LZ4 file library
++ * Copyright (C) 2022, Xiaomi Inc.
++ *
++ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are
++ * met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ *   notice, this list of conditions and the following disclaimer.
++ * - Redistributions in binary form must reproduce the above
++ *   copyright notice, this list of conditions and the following disclaimer
++ *   in the documentation and/or other materials provided with the
++ *   distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * You can contact the author at :
++ * - LZ4 homepage : http://www.lz4.org
++ * - LZ4 source repository : https://github.com/lz4/lz4
++ */
++#include <stdlib.h>
++#include <string.h>
++#include "lz4.h"
++#include "lz4file.h"
++
++struct LZ4_readFile_s {
++  LZ4F_dctx* dctxPtr;
++  FILE* fp;
++  LZ4_byte* srcBuf;
++  size_t srcBufNext;
++  size_t srcBufSize;
++  size_t srcBufMaxSize;
++};
++
++struct LZ4_writeFile_s {
++  LZ4F_cctx* cctxPtr;
++  FILE* fp;
++  LZ4_byte* dstBuf;
++  size_t maxWriteSize;
++  size_t dstBufMaxSize;
++  LZ4F_errorCode_t errCode;
++};
++
++LZ4F_errorCode_t LZ4F_readOpen(LZ4_readFile_t** lz4fRead, FILE* fp)
++{
++  char buf[LZ4F_HEADER_SIZE_MAX];
++  size_t consumedSize;
++  LZ4F_errorCode_t ret;
++  LZ4F_frameInfo_t info;
++
++  if (fp == NULL || lz4fRead == NULL) {
++    return -LZ4F_ERROR_GENERIC;
++  }
++
++  *lz4fRead = (LZ4_readFile_t*)calloc(1, sizeof(LZ4_readFile_t));
++  if (*lz4fRead == NULL) {
++    return -LZ4F_ERROR_allocation_failed;
++  }
++
++  ret = LZ4F_createDecompressionContext(&(*lz4fRead)->dctxPtr, LZ4F_getVersion());
++  if (LZ4F_isError(ret)) {
++    free(*lz4fRead);
++    return ret;
++  }
++
++  (*lz4fRead)->fp = fp;
++  consumedSize = fread(buf, 1, sizeof(buf), (*lz4fRead)->fp);
++  if (consumedSize != sizeof(buf)) {
++    free(*lz4fRead);
++    return -LZ4F_ERROR_GENERIC;
++  }
++
++  ret = LZ4F_getFrameInfo((*lz4fRead)->dctxPtr, &info, buf, &consumedSize);
++  if (LZ4F_isError(ret)) {
++      LZ4F_freeDecompressionContext((*lz4fRead)->dctxPtr);
++      free(*lz4fRead);
++      return ret;
++    }
++
++  switch (info.blockSizeID) {
++    case LZ4F_default :
++    case LZ4F_max64KB :
++      (*lz4fRead)->srcBufMaxSize = 64 * 1024;
++      break;
++    case LZ4F_max256KB:
++      (*lz4fRead)->srcBufMaxSize = 256 * 1024;
++      break;
++    case LZ4F_max1MB:
++      (*lz4fRead)->srcBufMaxSize = 1 * 1024 * 1024;
++      break;
++    case LZ4F_max4MB:
++      (*lz4fRead)->srcBufMaxSize = 4 * 1024 * 1024;
++      break;
++    default:
++      LZ4F_freeDecompressionContext((*lz4fRead)->dctxPtr);
++      free(*lz4fRead);
++      return -LZ4F_ERROR_maxBlockSize_invalid;
++  }
++
++  (*lz4fRead)->srcBuf = (LZ4_byte*)malloc((*lz4fRead)->srcBufMaxSize);
++  if ((*lz4fRead)->srcBuf == NULL) {
++    LZ4F_freeDecompressionContext((*lz4fRead)->dctxPtr);
++    free(lz4fRead);
++    return -LZ4F_ERROR_allocation_failed;
++  }
++
++  (*lz4fRead)->srcBufSize = sizeof(buf) - consumedSize;
++  memcpy((*lz4fRead)->srcBuf, buf + consumedSize, (*lz4fRead)->srcBufSize);
++
++  return ret;
++}
++
++size_t LZ4F_read(LZ4_readFile_t* lz4fRead, void* buf, size_t size)
++{
++  LZ4_byte* p = (LZ4_byte*)buf;
++  size_t next = 0;
++
++  if (lz4fRead == NULL || buf == NULL)
++    return -LZ4F_ERROR_GENERIC;
++
++  while (next < size) {
++    size_t srcsize = lz4fRead->srcBufSize - lz4fRead->srcBufNext;
++    size_t dstsize = size - next;
++    size_t ret;
++
++    if (srcsize == 0) {
++      ret = fread(lz4fRead->srcBuf, 1, lz4fRead->srcBufMaxSize, lz4fRead->fp);
++      if (ret > 0) {
++        lz4fRead->srcBufSize = ret;
++        srcsize = lz4fRead->srcBufSize;
++        lz4fRead->srcBufNext = 0;
++      }
++      else if (ret == 0) {
++        break;
++      }
++      else {
++        return -LZ4F_ERROR_GENERIC;
++      }
++    }
++
++    ret = LZ4F_decompress(lz4fRead->dctxPtr,
++                          p, &dstsize,
++                          lz4fRead->srcBuf + lz4fRead->srcBufNext,
++                          &srcsize,
++                          NULL);
++    if (LZ4F_isError(ret)) {
++        return ret;
++    }
++
++    lz4fRead->srcBufNext += srcsize;
++    next += dstsize;
++    p += dstsize;
++  }
++
++  return next;
++}
++
++LZ4F_errorCode_t LZ4F_readClose(LZ4_readFile_t* lz4fRead)
++{
++  if (lz4fRead == NULL)
++    return -LZ4F_ERROR_GENERIC;
++  LZ4F_freeDecompressionContext(lz4fRead->dctxPtr);
++  free(lz4fRead->srcBuf);
++  free(lz4fRead);
++  return LZ4F_OK_NoError;
++}
++
++LZ4F_errorCode_t LZ4F_writeOpen(LZ4_writeFile_t** lz4fWrite, FILE* fp, const LZ4F_preferences_t* prefsPtr)
++{
++  LZ4_byte buf[LZ4F_HEADER_SIZE_MAX];
++  size_t ret;
++
++  if (fp == NULL || lz4fWrite == NULL)
++    return -LZ4F_ERROR_GENERIC;
++
++  *lz4fWrite = (LZ4_writeFile_t*)malloc(sizeof(LZ4_writeFile_t));
++  if (*lz4fWrite == NULL) {
++    return -LZ4F_ERROR_allocation_failed;
++  }
++  if (prefsPtr != NULL) {
++    switch (prefsPtr->frameInfo.blockSizeID) {
++      case LZ4F_default :
++      case LZ4F_max64KB :
++        (*lz4fWrite)->maxWriteSize = 64 * 1024;
++        break;
++      case LZ4F_max256KB:
++        (*lz4fWrite)->maxWriteSize = 256 * 1024;
++        break;
++      case LZ4F_max1MB:
++        (*lz4fWrite)->maxWriteSize = 1 * 1024 * 1024;
++        break;
++      case LZ4F_max4MB:
++        (*lz4fWrite)->maxWriteSize = 4 * 1024 * 1024;
++        break;
++      default:
++        free(lz4fWrite);
++        return -LZ4F_ERROR_maxBlockSize_invalid;
++      }
++    } else {
++      (*lz4fWrite)->maxWriteSize = 64 * 1024;
++    }
++
++  (*lz4fWrite)->dstBufMaxSize = LZ4F_compressBound((*lz4fWrite)->maxWriteSize, prefsPtr);
++  (*lz4fWrite)->dstBuf = (LZ4_byte*)malloc((*lz4fWrite)->dstBufMaxSize);
++  if ((*lz4fWrite)->dstBuf == NULL) {
++    free(*lz4fWrite);
++    return -LZ4F_ERROR_allocation_failed;
++  }
++
++  ret = LZ4F_createCompressionContext(&(*lz4fWrite)->cctxPtr, LZ4F_getVersion());
++  if (LZ4F_isError(ret)) {
++      free((*lz4fWrite)->dstBuf);
++      free(*lz4fWrite);
++      return ret;
++  }
++
++  ret = LZ4F_compressBegin((*lz4fWrite)->cctxPtr, buf, LZ4F_HEADER_SIZE_MAX, prefsPtr);
++  if (LZ4F_isError(ret)) {
++      LZ4F_freeCompressionContext((*lz4fWrite)->cctxPtr);
++      free((*lz4fWrite)->dstBuf);
++      free(*lz4fWrite);
++      return ret;
++  }
++
++  if (ret != fwrite(buf, 1, ret, fp)) {
++    LZ4F_freeCompressionContext((*lz4fWrite)->cctxPtr);
++    free((*lz4fWrite)->dstBuf);
++    free(*lz4fWrite);
++    return -LZ4F_ERROR_GENERIC;
++  }
++
++  (*lz4fWrite)->fp = fp;
++  (*lz4fWrite)->errCode = LZ4F_OK_NoError;
++  return LZ4F_OK_NoError;
++}
++
++size_t LZ4F_write(LZ4_writeFile_t* lz4fWrite, void* buf, size_t size)
++{
++  LZ4_byte* p = (LZ4_byte*)buf;
++  size_t remain = size;
++  size_t chunk;
++  size_t ret;
++
++  if (lz4fWrite == NULL || buf == NULL)
++    return -LZ4F_ERROR_GENERIC;
++  while (remain) {
++    if (remain > lz4fWrite->maxWriteSize)
++      chunk = lz4fWrite->maxWriteSize;
++    else
++      chunk = remain;
++
++    ret = LZ4F_compressUpdate(lz4fWrite->cctxPtr,
++                              lz4fWrite->dstBuf, lz4fWrite->dstBufMaxSize,
++                              p, chunk,
++                              NULL);
++    if (LZ4F_isError(ret)) {
++      lz4fWrite->errCode = ret;
++      return ret;
++    }
++
++    if(ret != fwrite(lz4fWrite->dstBuf, 1, ret, lz4fWrite->fp)) {
++      lz4fWrite->errCode = -LZ4F_ERROR_GENERIC;
++      return -LZ4F_ERROR_GENERIC;
++    }
++
++    p += chunk;
++    remain -= chunk;
++  }
++
++  return size;
++}
++
++LZ4F_errorCode_t LZ4F_writeClose(LZ4_writeFile_t* lz4fWrite)
++{
++  LZ4F_errorCode_t ret = LZ4F_OK_NoError;
++
++  if (lz4fWrite == NULL)
++    return -LZ4F_ERROR_GENERIC;
++
++  if (lz4fWrite->errCode == LZ4F_OK_NoError) {
++    ret =  LZ4F_compressEnd(lz4fWrite->cctxPtr,
++                            lz4fWrite->dstBuf, lz4fWrite->dstBufMaxSize,
++                            NULL);
++    if (LZ4F_isError(ret)) {
++      goto out;
++    }
++
++    if (ret != fwrite(lz4fWrite->dstBuf, 1, ret, lz4fWrite->fp)) {
++      ret = -LZ4F_ERROR_GENERIC;
++    }
++  }
++
++out:
++  LZ4F_freeCompressionContext(lz4fWrite->cctxPtr);
++  free(lz4fWrite->dstBuf);
++  free(lz4fWrite);
++  return ret;
++}
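[Editorial note, not part of the patch] For orientation, a minimal sketch of how the reader side of the new lz4file API added above might be driven; the 64 KB chunk size and the abbreviated error handling are illustrative only:

    /* Sketch: stream-decompress an .lz4 file to stdout with LZ4F_readOpen/
     * LZ4F_read/LZ4F_readClose.  LZ4F_readClose() frees the reader, not the FILE*. */
    #include <stdio.h>
    #include "lz4file.h"

    int dump_lz4_file(const char* path)
    {
        FILE* fp = fopen(path, "rb");
        if (fp == NULL) return -1;

        LZ4_readFile_t* reader = NULL;
        if (LZ4F_isError(LZ4F_readOpen(&reader, fp))) { fclose(fp); return -1; }

        char buf[64 * 1024];
        for (;;) {
            size_t r = LZ4F_read(reader, buf, sizeof(buf));
            if (LZ4F_isError(r)) { LZ4F_readClose(reader); fclose(fp); return -1; }
            if (r == 0) break;                 /* frame fully consumed */
            fwrite(buf, 1, r, stdout);
        }

        LZ4F_readClose(reader);
        fclose(fp);
        return 0;
    }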
+diff --git a/mfbt/lz4/lz4file.h b/mfbt/lz4/lz4file.h
+new file mode 100644
+--- /dev/null
++++ b/mfbt/lz4/lz4file.h
+@@ -0,0 +1,93 @@
++/*
++   LZ4 file library
++   Header File
++   Copyright (C) 2022, Xiaomi Inc.
++   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
++
++   Redistribution and use in source and binary forms, with or without
++   modification, are permitted provided that the following conditions are
++   met:
++
++       * Redistributions of source code must retain the above copyright
++   notice, this list of conditions and the following disclaimer.
++       * Redistributions in binary form must reproduce the above
++   copyright notice, this list of conditions and the following disclaimer
++   in the documentation and/or other materials provided with the
++   distribution.
++
++   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++   You can contact the author at :
++   - LZ4 source repository : https://github.com/lz4/lz4
++   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
++*/
++#if defined (__cplusplus)
++extern "C" {
++#endif
++
++#ifndef LZ4FILE_H
++#define LZ4FILE_H
++
++#include <stdio.h>
++#include "lz4frame_static.h"
++
++typedef struct LZ4_readFile_s LZ4_readFile_t;
++typedef struct LZ4_writeFile_s LZ4_writeFile_t;
++
++/*! LZ4F_readOpen() :
++ * Set read lz4file handle.
++ * `lz4f` will set a lz4file handle.
++ * `fp` must be the return value of the lz4 file opened by fopen.
++ */
++LZ4FLIB_STATIC_API LZ4F_errorCode_t LZ4F_readOpen(LZ4_readFile_t** lz4fRead, FILE* fp);
++
++/*! LZ4F_read() :
++ * Read lz4file content to buffer.
++ * `lz4f` must use LZ4_readOpen to set first.
++ * `buf` read data buffer.
++ * `size` read data buffer size.
++ */
++LZ4FLIB_STATIC_API size_t LZ4F_read(LZ4_readFile_t* lz4fRead, void* buf, size_t size);
++
++/*! LZ4F_readClose() :
++ * Close lz4file handle.
++ * `lz4f` must use LZ4_readOpen to set first.
++ */
++LZ4FLIB_STATIC_API LZ4F_errorCode_t LZ4F_readClose(LZ4_readFile_t* lz4fRead);
++
++/*! LZ4F_writeOpen() :
++ * Set write lz4file handle.
++ * `lz4f` will set a lz4file handle.
++ * `fp` must be the return value of the lz4 file opened by fopen.
++ */
++LZ4FLIB_STATIC_API LZ4F_errorCode_t LZ4F_writeOpen(LZ4_writeFile_t** lz4fWrite, FILE* fp, const LZ4F_preferences_t* prefsPtr);
++
++/*! LZ4F_write() :
++ * Write buffer to lz4file.
++ * `lz4f` must use LZ4F_writeOpen to set first.
++ * `buf` write data buffer.
++ * `size` write data buffer size.
++ */
++LZ4FLIB_STATIC_API size_t LZ4F_write(LZ4_writeFile_t* lz4fWrite, void* buf, size_t size);
++
++/*! LZ4F_writeClose() :
++ * Close lz4file handle.
++ * `lz4f` must use LZ4F_writeOpen to set first.
++ */
++LZ4FLIB_STATIC_API LZ4F_errorCode_t LZ4F_writeClose(LZ4_writeFile_t* lz4fWrite);
++
++#endif /* LZ4FILE_H */
++
++#if defined (__cplusplus)
++}
++#endif
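[Editorial note, not part of the patch] A matching sketch for the writer side of the lz4file API declared above; the 256 KB block-size preference is an arbitrary example:

    /* Sketch: compress a memory buffer into an already-opened FILE*.
     * LZ4F_write() chunks the input internally (maxWriteSize), so a single
     * call is sufficient; LZ4F_writeClose() flushes the last block and the
     * frame footer, and frees the writer (but not the FILE*). */
    #include <stdio.h>
    #include "lz4file.h"

    int write_lz4_file(FILE* fp, void* data, size_t size)
    {
        LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES;
        prefs.frameInfo.blockSizeID = LZ4F_max256KB;   /* illustrative choice */

        LZ4_writeFile_t* writer = NULL;
        if (LZ4F_isError(LZ4F_writeOpen(&writer, fp, &prefs))) return -1;

        size_t r = LZ4F_write(writer, data, size);
        if (LZ4F_isError(r)) { LZ4F_writeClose(writer); return -1; }

        return LZ4F_isError(LZ4F_writeClose(writer)) ? -1 : 0;
    }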
+diff --git a/mfbt/lz4/lz4frame.c b/mfbt/lz4/lz4frame.c
+--- a/mfbt/lz4/lz4frame.c
++++ b/mfbt/lz4/lz4frame.c
+@@ -40,17 +40,17 @@
+  * (see Memory Routines below).
+  */
+ 
+ 
+ /*-************************************
+ *  Compiler Options
+ **************************************/
+ #ifdef _MSC_VER    /* Visual Studio */
+-#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
++#  pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
+ #endif
+ 
+ 
+ /*-************************************
+ *  Tuning parameters
+ **************************************/
+ /*
+  * LZ4F_HEAPMODE :
+@@ -58,51 +58,89 @@
+  * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()).
+  */
+ #ifndef LZ4F_HEAPMODE
+ #  define LZ4F_HEAPMODE 0
+ #endif
+ 
+ 
+ /*-************************************
+-*  Memory routines
+-**************************************/
+-/*
+- * User may redirect invocations of
+- * malloc(), calloc() and free()
+- * towards another library or solution of their choice
+- * by modifying below section.
+- */
+-#ifndef LZ4_SRC_INCLUDED   /* avoid redefinition when sources are coalesced */
+-#  include <stdlib.h>   /* malloc, calloc, free */
+-#  define ALLOC(s)          malloc(s)
+-#  define ALLOC_AND_ZERO(s) calloc(1,(s))
+-#  define FREEMEM(p)        free(p)
+-#endif
+-
+-#include <string.h>   /* memset, memcpy, memmove */
+-#ifndef LZ4_SRC_INCLUDED  /* avoid redefinition when sources are coalesced */
+-#  define MEM_INIT(p,v,s)   memset((p),(v),(s))
+-#endif
+-
+-
+-/*-************************************
+ *  Library declarations
+ **************************************/
+ #define LZ4F_STATIC_LINKING_ONLY
+ #include "lz4frame.h"
+ #define LZ4_STATIC_LINKING_ONLY
+ #include "lz4.h"
+ #define LZ4_HC_STATIC_LINKING_ONLY
+ #include "lz4hc.h"
+ #define XXH_STATIC_LINKING_ONLY
+ #include "xxhash.h"
+ 
+ 
+ /*-************************************
++*  Memory routines
++**************************************/
++/*
++ * User may redirect invocations of
++ * malloc(), calloc() and free()
++ * towards another library or solution of their choice
++ * by modifying below section.
++**/
++
++#include <string.h>   /* memset, memcpy, memmove */
++#ifndef LZ4_SRC_INCLUDED  /* avoid redefinition when sources are coalesced */
++#  define MEM_INIT(p,v,s)   memset((p),(v),(s))
++#endif
++
++#ifndef LZ4_SRC_INCLUDED   /* avoid redefinition when sources are coalesced */
++#  include <stdlib.h>   /* malloc, calloc, free */
++#  define ALLOC(s)          malloc(s)
++#  define ALLOC_AND_ZERO(s) calloc(1,(s))
++#  define FREEMEM(p)        free(p)
++#endif
++
++static void* LZ4F_calloc(size_t s, LZ4F_CustomMem cmem)
++{
++    /* custom calloc defined : use it */
++    if (cmem.customCalloc != NULL) {
++        return cmem.customCalloc(cmem.opaqueState, s);
++    }
++    /* nothing defined : use default <stdlib.h>'s calloc() */
++    if (cmem.customAlloc == NULL) {
++        return ALLOC_AND_ZERO(s);
++    }
++    /* only custom alloc defined : use it, and combine it with memset() */
++    {   void* const p = cmem.customAlloc(cmem.opaqueState, s);
++        if (p != NULL) MEM_INIT(p, 0, s);
++        return p;
++}   }
++
++static void* LZ4F_malloc(size_t s, LZ4F_CustomMem cmem)
++{
++    /* custom malloc defined : use it */
++    if (cmem.customAlloc != NULL) {
++        return cmem.customAlloc(cmem.opaqueState, s);
++    }
++    /* nothing defined : use default <stdlib.h>'s malloc() */
++    return ALLOC(s);
++}
++
++static void LZ4F_free(void* p, LZ4F_CustomMem cmem)
++{
++    /* custom malloc defined : use it */
++    if (cmem.customFree != NULL) {
++        cmem.customFree(cmem.opaqueState, p);
++        return;
++    }
++    /* nothing defined : use default <stdlib.h>'s free() */
++    FREEMEM(p);
++}
++
++
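[Editorial note, not part of the patch] As a sketch of how these allocation hooks can be used: a counting allocator plugged into LZ4F_CustomMem and handed to LZ4F_createCompressionContext_advanced(), which this patch adds further down in this file. The field names follow the wrappers above; the struct and typedefs themselves are assumed to come from the LZ4F_STATIC_LINKING_ONLY section of lz4frame.h.

    /* Sketch: route LZ4F allocations through a counting allocator.
     * customCalloc is left NULL, so LZ4F_calloc() above falls back to
     * customAlloc + memset. */
    #include <stdlib.h>
    #define LZ4F_STATIC_LINKING_ONLY
    #include "lz4frame.h"

    static size_t g_liveAllocs = 0;

    static void* counting_alloc(void* opaque, size_t s)
    {
        (void)opaque;
        g_liveAllocs++;
        return malloc(s);
    }

    static void counting_free(void* opaque, void* p)
    {
        (void)opaque;
        if (p != NULL) g_liveAllocs--;
        free(p);
    }

    static LZ4F_cctx* create_counting_cctx(void)
    {
        LZ4F_CustomMem cmem = {
            .customAlloc  = counting_alloc,
            .customCalloc = NULL,            /* use the alloc + memset fallback */
            .customFree   = counting_free,
            .opaqueState  = NULL,
        };
        return LZ4F_createCompressionContext_advanced(cmem, LZ4F_getVersion());
    }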
++/*-************************************
+ *  Debug
+ **************************************/
+ #if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1)
+ #  include <assert.h>
+ #else
+ #  ifndef assert
+ #    define assert(condition) ((void)0)
+ #  endif
+@@ -138,17 +176,17 @@ static int g_debuglog_enable = 1;
+   typedef unsigned char       BYTE;
+   typedef unsigned short      U16;
+   typedef unsigned int        U32;
+   typedef   signed int        S32;
+   typedef unsigned long long  U64;
+ #endif
+ 
+ 
+-/* unoptimized version; solves endianess & alignment issues */
++/* unoptimized version; solves endianness & alignment issues */
+ static U32 LZ4F_readLE32 (const void* src)
+ {
+     const BYTE* const srcPtr = (const BYTE*)src;
+     U32 value32 = srcPtr[0];
+     value32 += ((U32)srcPtr[1])<< 8;
+     value32 += ((U32)srcPtr[2])<<16;
+     value32 += ((U32)srcPtr[3])<<24;
+     return value32;
+@@ -201,46 +239,49 @@ static void LZ4F_writeLE64 (void* dst, U
+ #endif
+ 
+ #define _1BIT  0x01
+ #define _2BITS 0x03
+ #define _3BITS 0x07
+ #define _4BITS 0x0F
+ #define _8BITS 0xFF
+ 
+-#define LZ4F_MAGIC_SKIPPABLE_START 0x184D2A50U
+-#define LZ4F_MAGICNUMBER 0x184D2204U
+ #define LZ4F_BLOCKUNCOMPRESSED_FLAG 0x80000000U
+ #define LZ4F_BLOCKSIZEID_DEFAULT LZ4F_max64KB
+ 
+ static const size_t minFHSize = LZ4F_HEADER_SIZE_MIN;   /*  7 */
+ static const size_t maxFHSize = LZ4F_HEADER_SIZE_MAX;   /* 19 */
+ static const size_t BHSize = LZ4F_BLOCK_HEADER_SIZE;  /* block header : size, and compress flag */
+ static const size_t BFSize = LZ4F_BLOCK_CHECKSUM_SIZE;  /* block footer : checksum (optional) */
+ 
+ 
+ /*-************************************
+ *  Structures and local types
+ **************************************/
++
++typedef enum { LZ4B_COMPRESSED, LZ4B_UNCOMPRESSED} LZ4F_blockCompression_t;
++
+ typedef struct LZ4F_cctx_s
+ {
++    LZ4F_CustomMem cmem;
+     LZ4F_preferences_t prefs;
+     U32    version;
+     U32    cStage;
+     const LZ4F_CDict* cdict;
+     size_t maxBlockSize;
+     size_t maxBufferSize;
+-    BYTE*  tmpBuff;
+-    BYTE*  tmpIn;
+-    size_t tmpInSize;
++    BYTE*  tmpBuff;    /* internal buffer, for streaming */
++    BYTE*  tmpIn;      /* starting position of data compress within internal buffer (>= tmpBuff) */
++    size_t tmpInSize;  /* amount of data to compress after tmpIn */
+     U64    totalInSize;
+     XXH32_state_t xxh;
+     void*  lz4CtxPtr;
+     U16    lz4CtxAlloc; /* sized for: 0 = none, 1 = lz4 ctx, 2 = lz4hc ctx */
+     U16    lz4CtxState; /* in use as: 0 = none, 1 = lz4 ctx, 2 = lz4hc ctx */
++    LZ4F_blockCompression_t  blockCompression;
+ } LZ4F_cctx_t;
+ 
+ 
+ /*-************************************
+ *  Error management
+ **************************************/
+ #define LZ4F_GENERATE_STRING(STRING) #STRING,
+ static const char* LZ4F_errorStrings[] = { LZ4F_LIST_ERRORS(LZ4F_GENERATE_STRING) };
+@@ -259,37 +300,43 @@ const char* LZ4F_getErrorName(LZ4F_error
+ }
+ 
+ LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult)
+ {
+     if (!LZ4F_isError(functionResult)) return LZ4F_OK_NoError;
+     return (LZ4F_errorCodes)(-(ptrdiff_t)functionResult);
+ }
+ 
+-static LZ4F_errorCode_t err0r(LZ4F_errorCodes code)
++static LZ4F_errorCode_t LZ4F_returnErrorCode(LZ4F_errorCodes code)
+ {
+     /* A compilation error here means sizeof(ptrdiff_t) is not large enough */
+     LZ4F_STATIC_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
+     return (LZ4F_errorCode_t)-(ptrdiff_t)code;
+ }
+ 
++#define RETURN_ERROR(e) return LZ4F_returnErrorCode(LZ4F_ERROR_ ## e)
++
++#define RETURN_ERROR_IF(c,e) if (c) RETURN_ERROR(e)
++
++#define FORWARD_IF_ERROR(r)  if (LZ4F_isError(r)) return (r)
++
+ unsigned LZ4F_getVersion(void) { return LZ4F_VERSION; }
+ 
+ int LZ4F_compressionLevel_max(void) { return LZ4HC_CLEVEL_MAX; }
+ 
+-size_t LZ4F_getBlockSize(unsigned blockSizeID)
++size_t LZ4F_getBlockSize(LZ4F_blockSizeID_t blockSizeID)
+ {
+     static const size_t blockSizes[4] = { 64 KB, 256 KB, 1 MB, 4 MB };
+ 
+     if (blockSizeID == 0) blockSizeID = LZ4F_BLOCKSIZEID_DEFAULT;
+     if (blockSizeID < LZ4F_max64KB || blockSizeID > LZ4F_max4MB)
+-        return err0r(LZ4F_ERROR_maxBlockSize_invalid);
+-    blockSizeID -= LZ4F_max64KB;
+-    return blockSizes[blockSizeID];
+-}
++        RETURN_ERROR(maxBlockSize_invalid);
++    {   int const blockSizeIdx = (int)blockSizeID - (int)LZ4F_max64KB;
++        return blockSizes[blockSizeIdx];
++}   }
+ 
+ /*-************************************
+ *  Private functions
+ **************************************/
+ #define MIN(a,b)   ( (a) < (b) ? (a) : (b) )
+ 
+ static BYTE LZ4F_headerChecksum (const void* header, size_t length)
+ {
+@@ -392,31 +439,30 @@ size_t LZ4F_compressFrame_usingCDict(LZ4
+     prefs.frameInfo.blockSizeID = LZ4F_optimalBSID(prefs.frameInfo.blockSizeID, srcSize);
+     prefs.autoFlush = 1;
+     if (srcSize <= LZ4F_getBlockSize(prefs.frameInfo.blockSizeID))
+         prefs.frameInfo.blockMode = LZ4F_blockIndependent;   /* only one block => no need for inter-block link */
+ 
+     MEM_INIT(&options, 0, sizeof(options));
+     options.stableSrc = 1;
+ 
+-    if (dstCapacity < LZ4F_compressFrameBound(srcSize, &prefs))  /* condition to guarantee success */
+-        return err0r(LZ4F_ERROR_dstMaxSize_tooSmall);
++    RETURN_ERROR_IF(dstCapacity < LZ4F_compressFrameBound(srcSize, &prefs), dstMaxSize_tooSmall);
+ 
+     { size_t const headerSize = LZ4F_compressBegin_usingCDict(cctx, dstBuffer, dstCapacity, cdict, &prefs);  /* write header */
+-      if (LZ4F_isError(headerSize)) return headerSize;
++      FORWARD_IF_ERROR(headerSize);
+       dstPtr += headerSize;   /* header size */ }
+ 
+     assert(dstEnd >= dstPtr);
+     { size_t const cSize = LZ4F_compressUpdate(cctx, dstPtr, (size_t)(dstEnd-dstPtr), srcBuffer, srcSize, &options);
+-      if (LZ4F_isError(cSize)) return cSize;
++      FORWARD_IF_ERROR(cSize);
+       dstPtr += cSize; }
+ 
+     assert(dstEnd >= dstPtr);
+     { size_t const tailSize = LZ4F_compressEnd(cctx, dstPtr, (size_t)(dstEnd-dstPtr), &options);   /* flush last block, and generate suffix */
+-      if (LZ4F_isError(tailSize)) return tailSize;
++      FORWARD_IF_ERROR(tailSize);
+       dstPtr += tailSize; }
+ 
+     assert(dstEnd >= dstStart);
+     return (size_t)(dstPtr - dstStart);
+ }
+ 
+ 
+ /*! LZ4F_compressFrame() :
+@@ -427,139 +473,162 @@ size_t LZ4F_compressFrame_usingCDict(LZ4
+  *           or an error code if it fails (can be tested using LZ4F_isError())
+  */
+ size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity,
+                     const void* srcBuffer, size_t srcSize,
+                     const LZ4F_preferences_t* preferencesPtr)
+ {
+     size_t result;
+ #if (LZ4F_HEAPMODE)
+-    LZ4F_cctx_t *cctxPtr;
++    LZ4F_cctx_t* cctxPtr;
+     result = LZ4F_createCompressionContext(&cctxPtr, LZ4F_VERSION);
+-    if (LZ4F_isError(result)) return result;
++    FORWARD_IF_ERROR(result);
+ #else
+     LZ4F_cctx_t cctx;
+     LZ4_stream_t lz4ctx;
+-    LZ4F_cctx_t *cctxPtr = &cctx;
++    LZ4F_cctx_t* const cctxPtr = &cctx;
+ 
+-    DEBUGLOG(4, "LZ4F_compressFrame");
+     MEM_INIT(&cctx, 0, sizeof(cctx));
+     cctx.version = LZ4F_VERSION;
+     cctx.maxBufferSize = 5 MB;   /* mess with real buffer size to prevent dynamic allocation; works only because autoflush==1 & stableSrc==1 */
+-    if (preferencesPtr == NULL ||
+-        preferencesPtr->compressionLevel < LZ4HC_CLEVEL_MIN)
+-    {
++    if ( preferencesPtr == NULL
++      || preferencesPtr->compressionLevel < LZ4HC_CLEVEL_MIN ) {
+         LZ4_initStream(&lz4ctx, sizeof(lz4ctx));
+         cctxPtr->lz4CtxPtr = &lz4ctx;
+         cctxPtr->lz4CtxAlloc = 1;
+         cctxPtr->lz4CtxState = 1;
+     }
+ #endif
++    DEBUGLOG(4, "LZ4F_compressFrame");
+ 
+     result = LZ4F_compressFrame_usingCDict(cctxPtr, dstBuffer, dstCapacity,
+                                            srcBuffer, srcSize,
+                                            NULL, preferencesPtr);
+ 
+ #if (LZ4F_HEAPMODE)
+     LZ4F_freeCompressionContext(cctxPtr);
+ #else
+-    if (preferencesPtr != NULL &&
+-        preferencesPtr->compressionLevel >= LZ4HC_CLEVEL_MIN)
+-    {
+-        FREEMEM(cctxPtr->lz4CtxPtr);
++    if ( preferencesPtr != NULL
++      && preferencesPtr->compressionLevel >= LZ4HC_CLEVEL_MIN ) {
++        LZ4F_free(cctxPtr->lz4CtxPtr, cctxPtr->cmem);
+     }
+ #endif
+     return result;
+ }
+ 
+ 
+ /*-***************************************************
+ *   Dictionary compression
+ *****************************************************/
+ 
+ struct LZ4F_CDict_s {
++    LZ4F_CustomMem cmem;
+     void* dictContent;
+     LZ4_stream_t* fastCtx;
+     LZ4_streamHC_t* HCCtx;
+ }; /* typedef'd to LZ4F_CDict within lz4frame_static.h */
+ 
+-/*! LZ4F_createCDict() :
+- *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
+- *  LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+- *  LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+- * `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict
+- * @return : digested dictionary for compression, or NULL if failed */
+-LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize)
++LZ4F_CDict*
++LZ4F_createCDict_advanced(LZ4F_CustomMem cmem, const void* dictBuffer, size_t dictSize)
+ {
+     const char* dictStart = (const char*)dictBuffer;
+-    LZ4F_CDict* cdict = (LZ4F_CDict*) ALLOC(sizeof(*cdict));
+-    DEBUGLOG(4, "LZ4F_createCDict");
++    LZ4F_CDict* const cdict = (LZ4F_CDict*)LZ4F_malloc(sizeof(*cdict), cmem);
++    DEBUGLOG(4, "LZ4F_createCDict_advanced");
+     if (!cdict) return NULL;
++    cdict->cmem = cmem;
+     if (dictSize > 64 KB) {
+         dictStart += dictSize - 64 KB;
+         dictSize = 64 KB;
+     }
+-    cdict->dictContent = ALLOC(dictSize);
+-    cdict->fastCtx = LZ4_createStream();
+-    cdict->HCCtx = LZ4_createStreamHC();
++    cdict->dictContent = LZ4F_malloc(dictSize, cmem);
++    cdict->fastCtx = (LZ4_stream_t*)LZ4F_malloc(sizeof(LZ4_stream_t), cmem);
++    if (cdict->fastCtx)
++        LZ4_initStream(cdict->fastCtx, sizeof(LZ4_stream_t));
++    cdict->HCCtx = (LZ4_streamHC_t*)LZ4F_malloc(sizeof(LZ4_streamHC_t), cmem);
++    if (cdict->HCCtx)
++        LZ4_initStream(cdict->HCCtx, sizeof(LZ4_streamHC_t));
+     if (!cdict->dictContent || !cdict->fastCtx || !cdict->HCCtx) {
+         LZ4F_freeCDict(cdict);
+         return NULL;
+     }
+     memcpy(cdict->dictContent, dictStart, dictSize);
+     LZ4_loadDict (cdict->fastCtx, (const char*)cdict->dictContent, (int)dictSize);
+     LZ4_setCompressionLevel(cdict->HCCtx, LZ4HC_CLEVEL_DEFAULT);
+     LZ4_loadDictHC(cdict->HCCtx, (const char*)cdict->dictContent, (int)dictSize);
+     return cdict;
+ }
+ 
++/*! LZ4F_createCDict() :
++ *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
++ *  LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
++ *  LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
++ * @dictBuffer can be released after LZ4F_CDict creation, since its content is copied within CDict
++ * @return : digested dictionary for compression, or NULL if failed */
++LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize)
++{
++    DEBUGLOG(4, "LZ4F_createCDict");
++    return LZ4F_createCDict_advanced(LZ4F_defaultCMem, dictBuffer, dictSize);
++}
++
+ void LZ4F_freeCDict(LZ4F_CDict* cdict)
+ {
+     if (cdict==NULL) return;  /* support free on NULL */
+-    FREEMEM(cdict->dictContent);
+-    LZ4_freeStream(cdict->fastCtx);
+-    LZ4_freeStreamHC(cdict->HCCtx);
+-    FREEMEM(cdict);
++    LZ4F_free(cdict->dictContent, cdict->cmem);
++    LZ4F_free(cdict->fastCtx, cdict->cmem);
++    LZ4F_free(cdict->HCCtx, cdict->cmem);
++    LZ4F_free(cdict, cdict->cmem);
+ }
+ 
+ 
+ /*-*********************************
+ *  Advanced compression functions
+ ***********************************/
+ 
++LZ4F_cctx*
++LZ4F_createCompressionContext_advanced(LZ4F_CustomMem customMem, unsigned version)
++{
++    LZ4F_cctx* const cctxPtr =
++        (LZ4F_cctx*)LZ4F_calloc(sizeof(LZ4F_cctx), customMem);
++    if (cctxPtr==NULL) return NULL;
++
++    cctxPtr->cmem = customMem;
++    cctxPtr->version = version;
++    cctxPtr->cStage = 0;   /* Uninitialized. Next stage : init cctx */
++
++    return cctxPtr;
++}
++
+ /*! LZ4F_createCompressionContext() :
+  *  The first thing to do is to create a compressionContext object, which will be used in all compression operations.
+  *  This is achieved using LZ4F_createCompressionContext(), which takes as argument a version and an LZ4F_preferences_t structure.
+  *  The version provided MUST be LZ4F_VERSION. It is intended to track potential incompatible differences between different binaries.
+  *  The function will provide a pointer to an allocated LZ4F_compressionContext_t object.
+  *  If the result LZ4F_errorCode_t is not OK_NoError, there was an error during context creation.
+  *  Object can release its memory using LZ4F_freeCompressionContext();
+- */
+-LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** LZ4F_compressionContextPtr, unsigned version)
++**/
++LZ4F_errorCode_t
++LZ4F_createCompressionContext(LZ4F_cctx** LZ4F_compressionContextPtr, unsigned version)
+ {
+-    LZ4F_cctx_t* const cctxPtr = (LZ4F_cctx_t*)ALLOC_AND_ZERO(sizeof(LZ4F_cctx_t));
+-    if (cctxPtr==NULL) return err0r(LZ4F_ERROR_allocation_failed);
++    assert(LZ4F_compressionContextPtr != NULL); /* considered a violation of narrow contract */
++    /* in case it nonetheless happen in production */
++    RETURN_ERROR_IF(LZ4F_compressionContextPtr == NULL, parameter_null);
+ 
+-    cctxPtr->version = version;
+-    cctxPtr->cStage = 0;   /* Next stage : init stream */
+-
+-    *LZ4F_compressionContextPtr = cctxPtr;
+-
++    *LZ4F_compressionContextPtr = LZ4F_createCompressionContext_advanced(LZ4F_defaultCMem, version);
++    RETURN_ERROR_IF(*LZ4F_compressionContextPtr==NULL, allocation_failed);
+     return LZ4F_OK_NoError;
+ }
+ 
+ 
+ LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctxPtr)
+ {
+     if (cctxPtr != NULL) {  /* support free on NULL */
+-       FREEMEM(cctxPtr->lz4CtxPtr);  /* note: LZ4_streamHC_t and LZ4_stream_t are simple POD types */
+-       FREEMEM(cctxPtr->tmpBuff);
+-       FREEMEM(cctxPtr);
++       LZ4F_free(cctxPtr->lz4CtxPtr, cctxPtr->cmem);  /* note: LZ4_streamHC_t and LZ4_stream_t are simple POD types */
++       LZ4F_free(cctxPtr->tmpBuff, cctxPtr->cmem);
++       LZ4F_free(cctxPtr, cctxPtr->cmem);
+     }
+-
+     return LZ4F_OK_NoError;
+ }
+ 
+ 
+ /**
+  * This function prepares the internal LZ4(HC) stream for a new compression,
+  * resetting the context and attaching the dictionary, if there is one.
+  *
+@@ -583,78 +652,93 @@ static void LZ4F_initStream(void* ctx,
+         }
+         LZ4_attach_dictionary((LZ4_stream_t *)ctx, cdict ? cdict->fastCtx : NULL);
+     } else {
+         LZ4_resetStreamHC_fast((LZ4_streamHC_t*)ctx, level);
+         LZ4_attach_HC_dictionary((LZ4_streamHC_t *)ctx, cdict ? cdict->HCCtx : NULL);
+     }
+ }
+ 
++static int ctxTypeID_to_size(int ctxTypeID) {
++    switch(ctxTypeID) {
++    case 1:
++        return LZ4_sizeofState();
++    case 2:
++        return LZ4_sizeofStateHC();
++    default:
++        return 0;
++    }
++}
+ 
+ /*! LZ4F_compressBegin_usingCDict() :
+- *  init streaming compression and writes frame header into dstBuffer.
+- *  dstBuffer must be >= LZ4F_HEADER_SIZE_MAX bytes.
+- * @return : number of bytes written into dstBuffer for the header
++ *  init streaming compression AND writes frame header into @dstBuffer.
++ * @dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
++ * @return : number of bytes written into @dstBuffer for the header
+  *           or an error code (can be tested using LZ4F_isError())
+  */
+ size_t LZ4F_compressBegin_usingCDict(LZ4F_cctx* cctxPtr,
+                           void* dstBuffer, size_t dstCapacity,
+                           const LZ4F_CDict* cdict,
+                           const LZ4F_preferences_t* preferencesPtr)
+ {
+-    LZ4F_preferences_t prefNull;
++    LZ4F_preferences_t const prefNull = LZ4F_INIT_PREFERENCES;
+     BYTE* const dstStart = (BYTE*)dstBuffer;
+     BYTE* dstPtr = dstStart;
+-    BYTE* headerStart;
+ 
+-    if (dstCapacity < maxFHSize) return err0r(LZ4F_ERROR_dstMaxSize_tooSmall);
+-    MEM_INIT(&prefNull, 0, sizeof(prefNull));
++    RETURN_ERROR_IF(dstCapacity < maxFHSize, dstMaxSize_tooSmall);
+     if (preferencesPtr == NULL) preferencesPtr = &prefNull;
+     cctxPtr->prefs = *preferencesPtr;
+ 
+-    /* Ctx Management */
++    /* cctx Management */
+     {   U16 const ctxTypeID = (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) ? 1 : 2;
+-        if (cctxPtr->lz4CtxAlloc < ctxTypeID) {
+-            FREEMEM(cctxPtr->lz4CtxPtr);
++        int requiredSize = ctxTypeID_to_size(ctxTypeID);
++        int allocatedSize = ctxTypeID_to_size(cctxPtr->lz4CtxAlloc);
++        if (allocatedSize < requiredSize) {
++            /* not enough space allocated */
++            LZ4F_free(cctxPtr->lz4CtxPtr, cctxPtr->cmem);
+             if (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) {
+-                cctxPtr->lz4CtxPtr = LZ4_createStream();
++                /* must take ownership of memory allocation,
++                 * in order to respect custom allocator contract */
++                cctxPtr->lz4CtxPtr = LZ4F_malloc(sizeof(LZ4_stream_t), cctxPtr->cmem);
++                if (cctxPtr->lz4CtxPtr)
++                    LZ4_initStream(cctxPtr->lz4CtxPtr, sizeof(LZ4_stream_t));
+             } else {
+-                cctxPtr->lz4CtxPtr = LZ4_createStreamHC();
++                cctxPtr->lz4CtxPtr = LZ4F_malloc(sizeof(LZ4_streamHC_t), cctxPtr->cmem);
++                if (cctxPtr->lz4CtxPtr)
++                    LZ4_initStreamHC(cctxPtr->lz4CtxPtr, sizeof(LZ4_streamHC_t));
+             }
+-            if (cctxPtr->lz4CtxPtr == NULL)
+-                return err0r(LZ4F_ERROR_allocation_failed);
++            RETURN_ERROR_IF(cctxPtr->lz4CtxPtr == NULL, allocation_failed);
+             cctxPtr->lz4CtxAlloc = ctxTypeID;
+             cctxPtr->lz4CtxState = ctxTypeID;
+         } else if (cctxPtr->lz4CtxState != ctxTypeID) {
+-            /* otherwise, a sufficient buffer is allocated, but we need to
+-             * reset it to the correct context type */
++            /* otherwise, a sufficient buffer is already allocated,
++             * but we need to reset it to the correct context type */
+             if (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN) {
+-                LZ4_initStream((LZ4_stream_t *) cctxPtr->lz4CtxPtr, sizeof (LZ4_stream_t));
++                LZ4_initStream((LZ4_stream_t*)cctxPtr->lz4CtxPtr, sizeof(LZ4_stream_t));
+             } else {
+-                LZ4_initStreamHC((LZ4_streamHC_t *) cctxPtr->lz4CtxPtr, sizeof(LZ4_streamHC_t));
+-                LZ4_setCompressionLevel((LZ4_streamHC_t *) cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel);
++                LZ4_initStreamHC((LZ4_streamHC_t*)cctxPtr->lz4CtxPtr, sizeof(LZ4_streamHC_t));
++                LZ4_setCompressionLevel((LZ4_streamHC_t*)cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel);
+             }
+             cctxPtr->lz4CtxState = ctxTypeID;
+-        }
+-    }
++    }   }
+ 
+     /* Buffer Management */
+     if (cctxPtr->prefs.frameInfo.blockSizeID == 0)
+         cctxPtr->prefs.frameInfo.blockSizeID = LZ4F_BLOCKSIZEID_DEFAULT;
+     cctxPtr->maxBlockSize = LZ4F_getBlockSize(cctxPtr->prefs.frameInfo.blockSizeID);
+ 
+     {   size_t const requiredBuffSize = preferencesPtr->autoFlush ?
+                 ((cctxPtr->prefs.frameInfo.blockMode == LZ4F_blockLinked) ? 64 KB : 0) :  /* only needs past data up to window size */
+                 cctxPtr->maxBlockSize + ((cctxPtr->prefs.frameInfo.blockMode == LZ4F_blockLinked) ? 128 KB : 0);
+ 
+         if (cctxPtr->maxBufferSize < requiredBuffSize) {
+             cctxPtr->maxBufferSize = 0;
+-            FREEMEM(cctxPtr->tmpBuff);
+-            cctxPtr->tmpBuff = (BYTE*)ALLOC_AND_ZERO(requiredBuffSize);
+-            if (cctxPtr->tmpBuff == NULL) return err0r(LZ4F_ERROR_allocation_failed);
++            LZ4F_free(cctxPtr->tmpBuff, cctxPtr->cmem);
++            cctxPtr->tmpBuff = (BYTE*)LZ4F_calloc(requiredBuffSize, cctxPtr->cmem);
++            RETURN_ERROR_IF(cctxPtr->tmpBuff == NULL, allocation_failed);
+             cctxPtr->maxBufferSize = requiredBuffSize;
+     }   }
+     cctxPtr->tmpIn = cctxPtr->tmpBuff;
+     cctxPtr->tmpInSize = 0;
+     (void)XXH32_reset(&(cctxPtr->xxh), 0);
+ 
+     /* context init */
+     cctxPtr->cdict = cdict;
+@@ -664,51 +748,52 @@ size_t LZ4F_compressBegin_usingCDict(LZ4
+     }
+     if (preferencesPtr->compressionLevel >= LZ4HC_CLEVEL_MIN) {
+         LZ4_favorDecompressionSpeed((LZ4_streamHC_t*)cctxPtr->lz4CtxPtr, (int)preferencesPtr->favorDecSpeed);
+     }
+ 
+     /* Magic Number */
+     LZ4F_writeLE32(dstPtr, LZ4F_MAGICNUMBER);
+     dstPtr += 4;
+-    headerStart = dstPtr;
++    {   BYTE* const headerStart = dstPtr;
+ 
+-    /* FLG Byte */
+-    *dstPtr++ = (BYTE)(((1 & _2BITS) << 6)    /* Version('01') */
+-        + ((cctxPtr->prefs.frameInfo.blockMode & _1BIT ) << 5)
+-        + ((cctxPtr->prefs.frameInfo.blockChecksumFlag & _1BIT ) << 4)
+-        + ((unsigned)(cctxPtr->prefs.frameInfo.contentSize > 0) << 3)
+-        + ((cctxPtr->prefs.frameInfo.contentChecksumFlag & _1BIT ) << 2)
+-        +  (cctxPtr->prefs.frameInfo.dictID > 0) );
+-    /* BD Byte */
+-    *dstPtr++ = (BYTE)((cctxPtr->prefs.frameInfo.blockSizeID & _3BITS) << 4);
+-    /* Optional Frame content size field */
+-    if (cctxPtr->prefs.frameInfo.contentSize) {
+-        LZ4F_writeLE64(dstPtr, cctxPtr->prefs.frameInfo.contentSize);
+-        dstPtr += 8;
+-        cctxPtr->totalInSize = 0;
++        /* FLG Byte */
++        *dstPtr++ = (BYTE)(((1 & _2BITS) << 6)    /* Version('01') */
++            + ((cctxPtr->prefs.frameInfo.blockMode & _1BIT ) << 5)
++            + ((cctxPtr->prefs.frameInfo.blockChecksumFlag & _1BIT ) << 4)
++            + ((unsigned)(cctxPtr->prefs.frameInfo.contentSize > 0) << 3)
++            + ((cctxPtr->prefs.frameInfo.contentChecksumFlag & _1BIT ) << 2)
++            +  (cctxPtr->prefs.frameInfo.dictID > 0) );
++        /* BD Byte */
++        *dstPtr++ = (BYTE)((cctxPtr->prefs.frameInfo.blockSizeID & _3BITS) << 4);
++        /* Optional Frame content size field */
++        if (cctxPtr->prefs.frameInfo.contentSize) {
++            LZ4F_writeLE64(dstPtr, cctxPtr->prefs.frameInfo.contentSize);
++            dstPtr += 8;
++            cctxPtr->totalInSize = 0;
++        }
++        /* Optional dictionary ID field */
++        if (cctxPtr->prefs.frameInfo.dictID) {
++            LZ4F_writeLE32(dstPtr, cctxPtr->prefs.frameInfo.dictID);
++            dstPtr += 4;
++        }
++        /* Header CRC Byte */
++        *dstPtr = LZ4F_headerChecksum(headerStart, (size_t)(dstPtr - headerStart));
++        dstPtr++;
+     }
+-    /* Optional dictionary ID field */
+-    if (cctxPtr->prefs.frameInfo.dictID) {
+-        LZ4F_writeLE32(dstPtr, cctxPtr->prefs.frameInfo.dictID);
+-        dstPtr += 4;
+-    }
+-    /* Header CRC Byte */
+-    *dstPtr = LZ4F_headerChecksum(headerStart, (size_t)(dstPtr - headerStart));
+-    dstPtr++;
+ 
+     cctxPtr->cStage = 1;   /* header written, now request input data block */
+     return (size_t)(dstPtr - dstStart);
+ }
+ 
+ 
+ /*! LZ4F_compressBegin() :
+- *  init streaming compression and writes frame header into dstBuffer.
+- *  dstBuffer must be >= LZ4F_HEADER_SIZE_MAX bytes.
+- *  preferencesPtr can be NULL, in which case default parameters are selected.
++ *  init streaming compression AND writes frame header into @dstBuffer.
++ * @dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
++ * @preferencesPtr can be NULL, in which case default parameters are selected.
+  * @return : number of bytes written into dstBuffer for the header
+  *        or an error code (can be tested using LZ4F_isError())
+  */
+ size_t LZ4F_compressBegin(LZ4F_cctx* cctxPtr,
+                           void* dstBuffer, size_t dstCapacity,
+                           const LZ4F_preferences_t* preferencesPtr)
+ {
+     return LZ4F_compressBegin_usingCDict(cctxPtr, dstBuffer, dstCapacity,
+@@ -739,21 +824,23 @@ typedef int (*compressFunc_t)(void* ctx,
+  */
+ static size_t LZ4F_makeBlock(void* dst,
+                        const void* src, size_t srcSize,
+                              compressFunc_t compress, void* lz4ctx, int level,
+                        const LZ4F_CDict* cdict,
+                              LZ4F_blockChecksum_t crcFlag)
+ {
+     BYTE* const cSizePtr = (BYTE*)dst;
+-    U32 cSize = (U32)compress(lz4ctx, (const char*)src, (char*)(cSizePtr+BHSize),
+-                                      (int)(srcSize), (int)(srcSize-1),
+-                                      level, cdict);
+-    if (cSize == 0) {  /* compression failed */
+-        DEBUGLOG(5, "LZ4F_makeBlock: compression failed, creating a raw block (size %u)", (U32)srcSize);
++    U32 cSize;
++    assert(compress != NULL);
++    cSize = (U32)compress(lz4ctx, (const char*)src, (char*)(cSizePtr+BHSize),
++                          (int)(srcSize), (int)(srcSize-1),
++                          level, cdict);
++
++    if (cSize == 0 || cSize >= srcSize) {
+         cSize = (U32)srcSize;
+         LZ4F_writeLE32(cSizePtr, cSize | LZ4F_BLOCKUNCOMPRESSED_FLAG);
+         memcpy(cSizePtr+BHSize, src, srcSize);
+     } else {
+         LZ4F_writeLE32(cSizePtr, cSize);
+     }
+     if (crcFlag) {
+         U32 const crc32 = XXH32(cSizePtr+BHSize, cSize, 0);  /* checksum of compressed data */
+@@ -761,28 +848,30 @@ static size_t LZ4F_makeBlock(void* dst,
+     }
+     return BHSize + cSize + ((U32)crcFlag)*BFSize;
+ }
+ 
+ 
+ static int LZ4F_compressBlock(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict)
+ {
+     int const acceleration = (level < 0) ? -level + 1 : 1;
++    DEBUGLOG(5, "LZ4F_compressBlock (srcSize=%i)", srcSize);
+     LZ4F_initStream(ctx, cdict, level, LZ4F_blockIndependent);
+     if (cdict) {
+         return LZ4_compress_fast_continue((LZ4_stream_t*)ctx, src, dst, srcSize, dstCapacity, acceleration);
+     } else {
+         return LZ4_compress_fast_extState_fastReset(ctx, src, dst, srcSize, dstCapacity, acceleration);
+     }
+ }
+ 
+ static int LZ4F_compressBlock_continue(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict)
+ {
+     int const acceleration = (level < 0) ? -level + 1 : 1;
+     (void)cdict; /* init once at beginning of frame */
++    DEBUGLOG(5, "LZ4F_compressBlock_continue (srcSize=%i)", srcSize);
+     return LZ4_compress_fast_continue((LZ4_stream_t*)ctx, src, dst, srcSize, dstCapacity, acceleration);
+ }
+ 
+ static int LZ4F_compressBlockHC(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict)
+ {
+     LZ4F_initStream(ctx, cdict, level, LZ4F_blockIndependent);
+     if (cdict) {
+         return LZ4_compress_HC_continue((LZ4_streamHC_t*)ctx, src, dst, srcSize, dstCapacity);
+@@ -791,67 +880,94 @@ static int LZ4F_compressBlockHC(void* ct
+ }
+ 
+ static int LZ4F_compressBlockHC_continue(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict)
+ {
+     (void)level; (void)cdict; /* init once at beginning of frame */
+     return LZ4_compress_HC_continue((LZ4_streamHC_t*)ctx, src, dst, srcSize, dstCapacity);
+ }
+ 
+-static compressFunc_t LZ4F_selectCompression(LZ4F_blockMode_t blockMode, int level)
++static int LZ4F_doNotCompressBlock(void* ctx, const char* src, char* dst, int srcSize, int dstCapacity, int level, const LZ4F_CDict* cdict)
+ {
++    (void)ctx; (void)src; (void)dst; (void)srcSize; (void)dstCapacity; (void)level; (void)cdict;
++    return 0;
++}
++
++static compressFunc_t LZ4F_selectCompression(LZ4F_blockMode_t blockMode, int level, LZ4F_blockCompression_t  compressMode)
++{
++    if (compressMode == LZ4B_UNCOMPRESSED) return LZ4F_doNotCompressBlock;
+     if (level < LZ4HC_CLEVEL_MIN) {
+         if (blockMode == LZ4F_blockIndependent) return LZ4F_compressBlock;
+         return LZ4F_compressBlock_continue;
+     }
+     if (blockMode == LZ4F_blockIndependent) return LZ4F_compressBlockHC;
+     return LZ4F_compressBlockHC_continue;
+ }
+ 
++/* Save history (up to 64KB) into @tmpBuff */
+ static int LZ4F_localSaveDict(LZ4F_cctx_t* cctxPtr)
+ {
+     if (cctxPtr->prefs.compressionLevel < LZ4HC_CLEVEL_MIN)
+         return LZ4_saveDict ((LZ4_stream_t*)(cctxPtr->lz4CtxPtr), (char*)(cctxPtr->tmpBuff), 64 KB);
+     return LZ4_saveDictHC ((LZ4_streamHC_t*)(cctxPtr->lz4CtxPtr), (char*)(cctxPtr->tmpBuff), 64 KB);
+ }
+ 
+ typedef enum { notDone, fromTmpBuffer, fromSrcBuffer } LZ4F_lastBlockStatus;
+ 
+-/*! LZ4F_compressUpdate() :
++static const LZ4F_compressOptions_t k_cOptionsNull = { 0, { 0, 0, 0 } };
++
++
++ /*! LZ4F_compressUpdateImpl() :
+  *  LZ4F_compressUpdate() can be called repetitively to compress as much data as necessary.
+- *  dstBuffer MUST be >= LZ4F_compressBound(srcSize, preferencesPtr).
+- *  LZ4F_compressOptions_t structure is optional : you can provide NULL as argument.
++ *  When successful, the function always entirely consumes @srcBuffer.
++ *  src data is either buffered or compressed into @dstBuffer.
++ *  If the block compression does not match the compression of the previous block, the old data is flushed
++ *  and operations continue with the new compression mode.
++ * @dstCapacity MUST be >= LZ4F_compressBound(srcSize, preferencesPtr) when block compression is turned on.
++ * @compressOptionsPtr is optional : provide NULL to mean "default".
+  * @return : the number of bytes written into dstBuffer. It can be zero, meaning input data was just buffered.
+  *           or an error code if it fails (which can be tested using LZ4F_isError())
++ *  After an error, the state is left in a UB state, and must be re-initialized.
+  */
+-size_t LZ4F_compressUpdate(LZ4F_cctx* cctxPtr,
+-                           void* dstBuffer, size_t dstCapacity,
++static size_t LZ4F_compressUpdateImpl(LZ4F_cctx* cctxPtr,
++                     void* dstBuffer, size_t dstCapacity,
+                      const void* srcBuffer, size_t srcSize,
+-                     const LZ4F_compressOptions_t* compressOptionsPtr)
+-{
+-    LZ4F_compressOptions_t cOptionsNull;
++                     const LZ4F_compressOptions_t* compressOptionsPtr,
++                     LZ4F_blockCompression_t blockCompression)
++  {
+     size_t const blockSize = cctxPtr->maxBlockSize;
+     const BYTE* srcPtr = (const BYTE*)srcBuffer;
+     const BYTE* const srcEnd = srcPtr + srcSize;
+     BYTE* const dstStart = (BYTE*)dstBuffer;
+     BYTE* dstPtr = dstStart;
+     LZ4F_lastBlockStatus lastBlockCompressed = notDone;
+-    compressFunc_t const compress = LZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel);
+-
++    compressFunc_t const compress = LZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel, blockCompression);
++    size_t bytesWritten;
+     DEBUGLOG(4, "LZ4F_compressUpdate (srcSize=%zu)", srcSize);
+ 
+-    if (cctxPtr->cStage != 1) return err0r(LZ4F_ERROR_GENERIC);
++    RETURN_ERROR_IF(cctxPtr->cStage != 1, compressionState_uninitialized);   /* state must be initialized and waiting for next block */
+     if (dstCapacity < LZ4F_compressBound_internal(srcSize, &(cctxPtr->prefs), cctxPtr->tmpInSize))
+-        return err0r(LZ4F_ERROR_dstMaxSize_tooSmall);
+-    MEM_INIT(&cOptionsNull, 0, sizeof(cOptionsNull));
+-    if (compressOptionsPtr == NULL) compressOptionsPtr = &cOptionsNull;
++        RETURN_ERROR(dstMaxSize_tooSmall);
++
++    if (blockCompression == LZ4B_UNCOMPRESSED && dstCapacity < srcSize)
++        RETURN_ERROR(dstMaxSize_tooSmall);
++
++    /* flush currently written block, to continue with new block compression */
++    if (cctxPtr->blockCompression != blockCompression) {
++        bytesWritten = LZ4F_flush(cctxPtr, dstBuffer, dstCapacity, compressOptionsPtr);
++        dstPtr += bytesWritten;
++        cctxPtr->blockCompression = blockCompression;
++    }
++
++    if (compressOptionsPtr == NULL) compressOptionsPtr = &k_cOptionsNull;
+ 
+     /* complete tmp buffer */
+     if (cctxPtr->tmpInSize > 0) {   /* some data already within tmp buffer */
+         size_t const sizeToCopy = blockSize - cctxPtr->tmpInSize;
++        assert(blockSize > cctxPtr->tmpInSize);
+         if (sizeToCopy > srcSize) {
+             /* add src to tmpIn buffer */
+             memcpy(cctxPtr->tmpIn + cctxPtr->tmpInSize, srcBuffer, srcSize);
+             srcPtr = srcEnd;
+             cctxPtr->tmpInSize += srcSize;
+             /* still needs some CRC */
+         } else {
+             /* complete tmpIn block and then compress it */
+@@ -859,61 +975,64 @@ size_t LZ4F_compressUpdate(LZ4F_cctx* cc
+             memcpy(cctxPtr->tmpIn + cctxPtr->tmpInSize, srcBuffer, sizeToCopy);
+             srcPtr += sizeToCopy;
+ 
+             dstPtr += LZ4F_makeBlock(dstPtr,
+                                      cctxPtr->tmpIn, blockSize,
+                                      compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel,
+                                      cctxPtr->cdict,
+                                      cctxPtr->prefs.frameInfo.blockChecksumFlag);
+-
+             if (cctxPtr->prefs.frameInfo.blockMode==LZ4F_blockLinked) cctxPtr->tmpIn += blockSize;
+             cctxPtr->tmpInSize = 0;
+-        }
+-    }
++    }   }
+ 
+     while ((size_t)(srcEnd - srcPtr) >= blockSize) {
+         /* compress full blocks */
+         lastBlockCompressed = fromSrcBuffer;
+         dstPtr += LZ4F_makeBlock(dstPtr,
+                                  srcPtr, blockSize,
+                                  compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel,
+                                  cctxPtr->cdict,
+                                  cctxPtr->prefs.frameInfo.blockChecksumFlag);
+         srcPtr += blockSize;
+     }
+ 
+     if ((cctxPtr->prefs.autoFlush) && (srcPtr < srcEnd)) {
+-        /* compress remaining input < blockSize */
++        /* autoFlush : remaining input (< blockSize) is compressed */
+         lastBlockCompressed = fromSrcBuffer;
+         dstPtr += LZ4F_makeBlock(dstPtr,
+                                  srcPtr, (size_t)(srcEnd - srcPtr),
+                                  compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel,
+                                  cctxPtr->cdict,
+                                  cctxPtr->prefs.frameInfo.blockChecksumFlag);
+-        srcPtr  = srcEnd;
++        srcPtr = srcEnd;
+     }
+ 
+-    /* preserve dictionary if necessary */
++    /* preserve dictionary within @tmpBuff whenever necessary */
+     if ((cctxPtr->prefs.frameInfo.blockMode==LZ4F_blockLinked) && (lastBlockCompressed==fromSrcBuffer)) {
++        /* linked blocks are only supported in compressed mode, see LZ4F_uncompressedUpdate */
++        assert(blockCompression == LZ4B_COMPRESSED);
+         if (compressOptionsPtr->stableSrc) {
+-            cctxPtr->tmpIn = cctxPtr->tmpBuff;
++            cctxPtr->tmpIn = cctxPtr->tmpBuff;  /* src is stable : dictionary remains in src across invocations */
+         } else {
+             int const realDictSize = LZ4F_localSaveDict(cctxPtr);
+-            if (realDictSize==0) return err0r(LZ4F_ERROR_GENERIC);
++            assert(0 <= realDictSize && realDictSize <= 64 KB);
+             cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize;
+         }
+     }
+ 
+     /* keep tmpIn within limits */
+-    if ((cctxPtr->tmpIn + blockSize) > (cctxPtr->tmpBuff + cctxPtr->maxBufferSize)   /* necessarily LZ4F_blockLinked && lastBlockCompressed==fromTmpBuffer */
+-        && !(cctxPtr->prefs.autoFlush))
++    if (!(cctxPtr->prefs.autoFlush)  /* no autoflush : there may be some data left within internal buffer */
++      && (cctxPtr->tmpIn + blockSize) > (cctxPtr->tmpBuff + cctxPtr->maxBufferSize) )  /* not enough room to store next block */
+     {
++        /* only preserve 64KB within internal buffer. Ensures there is enough room for next block.
++         * note: this situation necessarily implies lastBlockCompressed==fromTmpBuffer */
+         int const realDictSize = LZ4F_localSaveDict(cctxPtr);
+         cctxPtr->tmpIn = cctxPtr->tmpBuff + realDictSize;
++        assert((cctxPtr->tmpIn + blockSize) <= (cctxPtr->tmpBuff + cctxPtr->maxBufferSize));
+     }
+ 
+     /* some input data left, necessarily < blockSize */
+     if (srcPtr < srcEnd) {
+         /* fill tmp buffer */
+         size_t const sizeToCopy = (size_t)(srcEnd - srcPtr);
+         memcpy(cctxPtr->tmpIn, srcPtr, sizeToCopy);
+         cctxPtr->tmpInSize = sizeToCopy;
+@@ -921,16 +1040,63 @@ size_t LZ4F_compressUpdate(LZ4F_cctx* cc
+ 
+     if (cctxPtr->prefs.frameInfo.contentChecksumFlag == LZ4F_contentChecksumEnabled)
+         (void)XXH32_update(&(cctxPtr->xxh), srcBuffer, srcSize);
+ 
+     cctxPtr->totalInSize += srcSize;
+     return (size_t)(dstPtr - dstStart);
+ }
+ 
++/*! LZ4F_compressUpdate() :
++ *  LZ4F_compressUpdate() can be called repetitively to compress as much data as necessary.
++ *  When successful, the function always entirely consumes @srcBuffer.
++ *  src data is either buffered or compressed into @dstBuffer.
++ *  If an uncompressed block was written previously, buffered data is flushed
++ *  before compressed data is appended.
++ * @dstCapacity MUST be >= LZ4F_compressBound(srcSize, preferencesPtr).
++ * @compressOptionsPtr is optional : provide NULL to mean "default".
++ * @return : the number of bytes written into dstBuffer. It can be zero, meaning input data was just buffered.
++ *           or an error code if it fails (which can be tested using LZ4F_isError())
++ *  After an error, the state is left in an undefined state, and must be re-initialized.
++ */
++size_t LZ4F_compressUpdate(LZ4F_cctx* cctxPtr,
++                           void* dstBuffer, size_t dstCapacity,
++                     const void* srcBuffer, size_t srcSize,
++                     const LZ4F_compressOptions_t* compressOptionsPtr)
++{
++     return LZ4F_compressUpdateImpl(cctxPtr,
++                                   dstBuffer, dstCapacity,
++                                   srcBuffer, srcSize,
++                                   compressOptionsPtr, LZ4B_COMPRESSED);
++}
++
++/*! LZ4F_uncompressedUpdate() :
++ *  LZ4F_uncompressedUpdate() can be called repetitively to add as much uncompressed data as necessary.
++ *  When successful, the function always entirely consumes @srcBuffer.
++ *  src data is either buffered or stored as uncompressed blocks into @dstBuffer.
++ *  If a compressed block was written previously, buffered data is flushed
++ *  before uncompressed data is appended.
++ *  This is only supported when LZ4F_blockIndependent is used.
++ * @dstCapacity MUST be >= LZ4F_compressBound(srcSize, preferencesPtr).
++ * @compressOptionsPtr is optional : provide NULL to mean "default".
++ * @return : the number of bytes written into dstBuffer. It can be zero, meaning input data was just buffered.
++ *           or an error code if it fails (which can be tested using LZ4F_isError())
++ *  After an error, the state is left in an undefined state, and must be re-initialized.
++ */
++size_t LZ4F_uncompressedUpdate(LZ4F_cctx* cctxPtr,
++                               void* dstBuffer, size_t dstCapacity,
++                         const void* srcBuffer, size_t srcSize,
++                         const LZ4F_compressOptions_t* compressOptionsPtr) {
++    RETURN_ERROR_IF(cctxPtr->prefs.frameInfo.blockMode != LZ4F_blockIndependent, blockMode_invalid);
++    return LZ4F_compressUpdateImpl(cctxPtr,
++                                   dstBuffer, dstCapacity,
++                                   srcBuffer, srcSize,
++                                   compressOptionsPtr, LZ4B_UNCOMPRESSED);
++}
++
+ 
+ /*! LZ4F_flush() :
+  *  When compressed data must be sent immediately, without waiting for a block to be filled,
+  *  invoke LZ4_flush(), which will immediately compress any remaining data stored within LZ4F_cctx.
+  *  The result of the function is the number of bytes written into dstBuffer.
+  *  It can be zero, this means there was no data left within LZ4F_cctx.
+  *  The function outputs an error code if it fails (can be tested using LZ4F_isError())
+  *  LZ4F_compressOptions_t* is optional. NULL is a valid argument.
+@@ -939,23 +1105,22 @@ size_t LZ4F_flush(LZ4F_cctx* cctxPtr,
+                   void* dstBuffer, size_t dstCapacity,
+             const LZ4F_compressOptions_t* compressOptionsPtr)
+ {
+     BYTE* const dstStart = (BYTE*)dstBuffer;
+     BYTE* dstPtr = dstStart;
+     compressFunc_t compress;
+ 
+     if (cctxPtr->tmpInSize == 0) return 0;   /* nothing to flush */
+-    if (cctxPtr->cStage != 1) return err0r(LZ4F_ERROR_GENERIC);
+-    if (dstCapacity < (cctxPtr->tmpInSize + BHSize + BFSize))
+-        return err0r(LZ4F_ERROR_dstMaxSize_tooSmall);
+-    (void)compressOptionsPtr;   /* not yet useful */
++    RETURN_ERROR_IF(cctxPtr->cStage != 1, compressionState_uninitialized);
++    RETURN_ERROR_IF(dstCapacity < (cctxPtr->tmpInSize + BHSize + BFSize), dstMaxSize_tooSmall);
++    (void)compressOptionsPtr;   /* not useful (yet) */
+ 
+     /* select compression function */
+-    compress = LZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel);
++    compress = LZ4F_selectCompression(cctxPtr->prefs.frameInfo.blockMode, cctxPtr->prefs.compressionLevel, cctxPtr->blockCompression);
+ 
+     /* compress tmp buffer */
+     dstPtr += LZ4F_makeBlock(dstPtr,
+                              cctxPtr->tmpIn, cctxPtr->tmpInSize,
+                              compress, cctxPtr->lz4CtxPtr, cctxPtr->prefs.compressionLevel,
+                              cctxPtr->cdict,
+                              cctxPtr->prefs.frameInfo.blockChecksumFlag);
+     assert(((void)"flush overflows dstBuffer!", (size_t)(dstPtr - dstStart) <= dstCapacity));
+@@ -987,40 +1152,40 @@ size_t LZ4F_compressEnd(LZ4F_cctx* cctxP
+                         void* dstBuffer, size_t dstCapacity,
+                   const LZ4F_compressOptions_t* compressOptionsPtr)
+ {
+     BYTE* const dstStart = (BYTE*)dstBuffer;
+     BYTE* dstPtr = dstStart;
+ 
+     size_t const flushSize = LZ4F_flush(cctxPtr, dstBuffer, dstCapacity, compressOptionsPtr);
+     DEBUGLOG(5,"LZ4F_compressEnd: dstCapacity=%u", (unsigned)dstCapacity);
+-    if (LZ4F_isError(flushSize)) return flushSize;
++    FORWARD_IF_ERROR(flushSize);
+     dstPtr += flushSize;
+ 
+     assert(flushSize <= dstCapacity);
+     dstCapacity -= flushSize;
+ 
+-    if (dstCapacity < 4) return err0r(LZ4F_ERROR_dstMaxSize_tooSmall);
++    RETURN_ERROR_IF(dstCapacity < 4, dstMaxSize_tooSmall);
+     LZ4F_writeLE32(dstPtr, 0);
+     dstPtr += 4;   /* endMark */
+ 
+     if (cctxPtr->prefs.frameInfo.contentChecksumFlag == LZ4F_contentChecksumEnabled) {
+         U32 const xxh = XXH32_digest(&(cctxPtr->xxh));
+-        if (dstCapacity < 8) return err0r(LZ4F_ERROR_dstMaxSize_tooSmall);
++        RETURN_ERROR_IF(dstCapacity < 8, dstMaxSize_tooSmall);
+         DEBUGLOG(5,"Writing 32-bit content checksum");
+         LZ4F_writeLE32(dstPtr, xxh);
+         dstPtr+=4;   /* content Checksum */
+     }
+ 
+     cctxPtr->cStage = 0;   /* state is now re-usable (with identical preferences) */
+     cctxPtr->maxBufferSize = 0;  /* reuse HC context */
+ 
+     if (cctxPtr->prefs.frameInfo.contentSize) {
+         if (cctxPtr->prefs.frameInfo.contentSize != cctxPtr->totalInSize)
+-            return err0r(LZ4F_ERROR_frameSize_wrong);
++            RETURN_ERROR(frameSize_wrong);
+     }
+ 
+     return (size_t)(dstPtr - dstStart);
+ }
+ 
+ 
+ /*-***************************************************
+ *   Frame Decompression
+@@ -1034,16 +1199,17 @@ typedef enum {
+     dstage_getCBlock, dstage_storeCBlock,
+     dstage_flushOut,
+     dstage_getSuffix, dstage_storeSuffix,
+     dstage_getSFrameSize, dstage_storeSFrameSize,
+     dstage_skipSkippable
+ } dStage_t;
+ 
+ struct LZ4F_dctx_s {
++    LZ4F_CustomMem cmem;
+     LZ4F_frameInfo_t frameInfo;
+     U32    version;
+     dStage_t dStage;
+     U64    frameRemainingSize;
+     size_t maxBlockSize;
+     size_t maxBufferSize;
+     BYTE*  tmpIn;
+     size_t tmpInSize;
+@@ -1051,59 +1217,71 @@ struct LZ4F_dctx_s {
+     BYTE*  tmpOutBuffer;
+     const BYTE* dict;
+     size_t dictSize;
+     BYTE*  tmpOut;
+     size_t tmpOutSize;
+     size_t tmpOutStart;
+     XXH32_state_t xxh;
+     XXH32_state_t blockChecksum;
++    int    skipChecksum;
+     BYTE   header[LZ4F_HEADER_SIZE_MAX];
+ };  /* typedef'd to LZ4F_dctx in lz4frame.h */
+ 
+ 
++LZ4F_dctx* LZ4F_createDecompressionContext_advanced(LZ4F_CustomMem customMem, unsigned version)
++{
++    LZ4F_dctx* const dctx = (LZ4F_dctx*)LZ4F_calloc(sizeof(LZ4F_dctx), customMem);
++    if (dctx == NULL) return NULL;
++
++    dctx->cmem = customMem;
++    dctx->version = version;
++    return dctx;
++}
++
+ /*! LZ4F_createDecompressionContext() :
+  *  Create a decompressionContext object, which will track all decompression operations.
+  *  Provides a pointer to a fully allocated and initialized LZ4F_decompressionContext object.
+  *  Object can later be released using LZ4F_freeDecompressionContext().
+  * @return : if != 0, there was an error during context creation.
+  */
+-LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** LZ4F_decompressionContextPtr, unsigned versionNumber)
++LZ4F_errorCode_t
++LZ4F_createDecompressionContext(LZ4F_dctx** LZ4F_decompressionContextPtr, unsigned versionNumber)
+ {
+-    LZ4F_dctx* const dctx = (LZ4F_dctx*)ALLOC_AND_ZERO(sizeof(LZ4F_dctx));
+-    if (dctx == NULL) {  /* failed allocation */
+-        *LZ4F_decompressionContextPtr = NULL;
+-        return err0r(LZ4F_ERROR_allocation_failed);
++    assert(LZ4F_decompressionContextPtr != NULL);  /* violation of narrow contract */
++    RETURN_ERROR_IF(LZ4F_decompressionContextPtr == NULL, parameter_null);  /* in case it nonetheless happens in production */
++
++    *LZ4F_decompressionContextPtr = LZ4F_createDecompressionContext_advanced(LZ4F_defaultCMem, versionNumber);
++    if (*LZ4F_decompressionContextPtr == NULL) {  /* failed allocation */
++        RETURN_ERROR(allocation_failed);
+     }
+-
+-    dctx->version = versionNumber;
+-    *LZ4F_decompressionContextPtr = dctx;
+     return LZ4F_OK_NoError;
+ }
+ 
+ LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx)
+ {
+     LZ4F_errorCode_t result = LZ4F_OK_NoError;
+     if (dctx != NULL) {   /* can accept NULL input, like free() */
+       result = (LZ4F_errorCode_t)dctx->dStage;
+-      FREEMEM(dctx->tmpIn);
+-      FREEMEM(dctx->tmpOutBuffer);
+-      FREEMEM(dctx);
++      LZ4F_free(dctx->tmpIn, dctx->cmem);
++      LZ4F_free(dctx->tmpOutBuffer, dctx->cmem);
++      LZ4F_free(dctx, dctx->cmem);
+     }
+     return result;
+ }
+ 
+ 
+ /*==---   Streaming Decompression operations   ---==*/
+ 
+ void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx)
+ {
+     dctx->dStage = dstage_getFrameHeader;
+     dctx->dict = NULL;
+     dctx->dictSize = 0;
++    dctx->skipChecksum = 0;
+ }
+ 
+ 
+ /*! LZ4F_decodeHeader() :
+  *  input   : `src` points at the **beginning of the frame**
+  *  output  : set internal values of dctx, such as
+  *            dctx->frameInfo and dctx->dStage.
+  *            Also allocates internal buffers.
+@@ -1113,53 +1291,52 @@ void LZ4F_resetDecompressionContext(LZ4F
+ static size_t LZ4F_decodeHeader(LZ4F_dctx* dctx, const void* src, size_t srcSize)
+ {
+     unsigned blockMode, blockChecksumFlag, contentSizeFlag, contentChecksumFlag, dictIDFlag, blockSizeID;
+     size_t frameHeaderSize;
+     const BYTE* srcPtr = (const BYTE*)src;
+ 
+     DEBUGLOG(5, "LZ4F_decodeHeader");
+     /* need to decode header to get frameInfo */
+-    if (srcSize < minFHSize) return err0r(LZ4F_ERROR_frameHeader_incomplete);   /* minimal frame header size */
++    RETURN_ERROR_IF(srcSize < minFHSize, frameHeader_incomplete);   /* minimal frame header size */
+     MEM_INIT(&(dctx->frameInfo), 0, sizeof(dctx->frameInfo));
+ 
+     /* special case : skippable frames */
+     if ((LZ4F_readLE32(srcPtr) & 0xFFFFFFF0U) == LZ4F_MAGIC_SKIPPABLE_START) {
+         dctx->frameInfo.frameType = LZ4F_skippableFrame;
+         if (src == (void*)(dctx->header)) {
+             dctx->tmpInSize = srcSize;
+             dctx->tmpInTarget = 8;
+             dctx->dStage = dstage_storeSFrameSize;
+             return srcSize;
+         } else {
+             dctx->dStage = dstage_getSFrameSize;
+             return 4;
+-        }
+-    }
++    }   }
+ 
+     /* control magic number */
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+     if (LZ4F_readLE32(srcPtr) != LZ4F_MAGICNUMBER) {
+         DEBUGLOG(4, "frame header error : unknown magic number");
+-        return err0r(LZ4F_ERROR_frameType_unknown);
++        RETURN_ERROR(frameType_unknown);
+     }
+ #endif
+     dctx->frameInfo.frameType = LZ4F_frame;
+ 
+     /* Flags */
+     {   U32 const FLG = srcPtr[4];
+         U32 const version = (FLG>>6) & _2BITS;
+         blockChecksumFlag = (FLG>>4) & _1BIT;
+         blockMode = (FLG>>5) & _1BIT;
+         contentSizeFlag = (FLG>>3) & _1BIT;
+         contentChecksumFlag = (FLG>>2) & _1BIT;
+         dictIDFlag = FLG & _1BIT;
+         /* validate */
+-        if (((FLG>>1)&_1BIT) != 0) return err0r(LZ4F_ERROR_reservedFlag_set); /* Reserved bit */
+-        if (version != 1) return err0r(LZ4F_ERROR_headerVersion_wrong);        /* Version Number, only supported value */
++        if (((FLG>>1)&_1BIT) != 0) RETURN_ERROR(reservedFlag_set); /* Reserved bit */
++        if (version != 1) RETURN_ERROR(headerVersion_wrong);       /* Version Number, only supported value */
+     }
+ 
+     /* Frame Header Size */
+     frameHeaderSize = minFHSize + (contentSizeFlag?8:0) + (dictIDFlag?4:0);
+ 
+     if (srcSize < frameHeaderSize) {
+         /* not enough input to fully decode frame header */
+         if (srcPtr != dctx->header)
+@@ -1168,68 +1345,66 @@ static size_t LZ4F_decodeHeader(LZ4F_dct
+         dctx->tmpInTarget = frameHeaderSize;
+         dctx->dStage = dstage_storeFrameHeader;
+         return srcSize;
+     }
+ 
+     {   U32 const BD = srcPtr[5];
+         blockSizeID = (BD>>4) & _3BITS;
+         /* validate */
+-        if (((BD>>7)&_1BIT) != 0) return err0r(LZ4F_ERROR_reservedFlag_set);   /* Reserved bit */
+-        if (blockSizeID < 4) return err0r(LZ4F_ERROR_maxBlockSize_invalid);    /* 4-7 only supported values for the time being */
+-        if (((BD>>0)&_4BITS) != 0) return err0r(LZ4F_ERROR_reservedFlag_set);  /* Reserved bits */
++        if (((BD>>7)&_1BIT) != 0) RETURN_ERROR(reservedFlag_set);   /* Reserved bit */
++        if (blockSizeID < 4) RETURN_ERROR(maxBlockSize_invalid);    /* 4-7 only supported values for the time being */
++        if (((BD>>0)&_4BITS) != 0) RETURN_ERROR(reservedFlag_set);  /* Reserved bits */
+     }
+ 
+     /* check header */
+     assert(frameHeaderSize > 5);
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+     {   BYTE const HC = LZ4F_headerChecksum(srcPtr+4, frameHeaderSize-5);
+-        if (HC != srcPtr[frameHeaderSize-1])
+-            return err0r(LZ4F_ERROR_headerChecksum_invalid);
++        RETURN_ERROR_IF(HC != srcPtr[frameHeaderSize-1], headerChecksum_invalid);
+     }
+ #endif
+ 
+     /* save */
+     dctx->frameInfo.blockMode = (LZ4F_blockMode_t)blockMode;
+     dctx->frameInfo.blockChecksumFlag = (LZ4F_blockChecksum_t)blockChecksumFlag;
+     dctx->frameInfo.contentChecksumFlag = (LZ4F_contentChecksum_t)contentChecksumFlag;
+     dctx->frameInfo.blockSizeID = (LZ4F_blockSizeID_t)blockSizeID;
+-    dctx->maxBlockSize = LZ4F_getBlockSize(blockSizeID);
++    dctx->maxBlockSize = LZ4F_getBlockSize((LZ4F_blockSizeID_t)blockSizeID);
+     if (contentSizeFlag)
+-        dctx->frameRemainingSize =
+-            dctx->frameInfo.contentSize = LZ4F_readLE64(srcPtr+6);
++        dctx->frameRemainingSize = dctx->frameInfo.contentSize = LZ4F_readLE64(srcPtr+6);
+     if (dictIDFlag)
+         dctx->frameInfo.dictID = LZ4F_readLE32(srcPtr + frameHeaderSize - 5);
+ 
+     dctx->dStage = dstage_init;
+ 
+     return frameHeaderSize;
+ }
+ 
+ 
+ /*! LZ4F_headerSize() :
+  * @return : size of frame header
+  *           or an error code, which can be tested using LZ4F_isError()
+  */
+ size_t LZ4F_headerSize(const void* src, size_t srcSize)
+ {
+-    if (src == NULL) return err0r(LZ4F_ERROR_srcPtr_wrong);
++    RETURN_ERROR_IF(src == NULL, srcPtr_wrong);
+ 
+     /* minimal srcSize to determine header size */
+     if (srcSize < LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH)
+-        return err0r(LZ4F_ERROR_frameHeader_incomplete);
++        RETURN_ERROR(frameHeader_incomplete);
+ 
+     /* special case : skippable frames */
+     if ((LZ4F_readLE32(src) & 0xFFFFFFF0U) == LZ4F_MAGIC_SKIPPABLE_START)
+         return 8;
+ 
+     /* control magic number */
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+     if (LZ4F_readLE32(src) != LZ4F_MAGICNUMBER)
+-        return err0r(LZ4F_ERROR_frameType_unknown);
++        RETURN_ERROR(frameType_unknown);
+ #endif
+ 
+     /* Frame Header Size */
+     {   BYTE const FLG = ((const BYTE*)src)[4];
+         U32 const contentSizeFlag = (FLG>>3) & _1BIT;
+         U32 const dictIDFlag = FLG & _1BIT;
+         return minFHSize + (contentSizeFlag?8:0) + (dictIDFlag?4:0);
+     }
+@@ -1261,23 +1436,23 @@ LZ4F_errorCode_t LZ4F_getFrameInfo(LZ4F_
+         *srcSizePtr = 0;
+         *frameInfoPtr = dctx->frameInfo;
+         /* returns : recommended nb of bytes for LZ4F_decompress() */
+         return LZ4F_decompress(dctx, NULL, &o, NULL, &i, NULL);
+     } else {
+         if (dctx->dStage == dstage_storeFrameHeader) {
+             /* frame decoding already started, in the middle of header => automatic fail */
+             *srcSizePtr = 0;
+-            return err0r(LZ4F_ERROR_frameDecoding_alreadyStarted);
++            RETURN_ERROR(frameDecoding_alreadyStarted);
+         } else {
+             size_t const hSize = LZ4F_headerSize(srcBuffer, *srcSizePtr);
+             if (LZ4F_isError(hSize)) { *srcSizePtr=0; return hSize; }
+             if (*srcSizePtr < hSize) {
+                 *srcSizePtr=0;
+-                return err0r(LZ4F_ERROR_frameHeader_incomplete);
++                RETURN_ERROR(frameHeader_incomplete);
+             }
+ 
+             {   size_t decodeResult = LZ4F_decodeHeader(dctx, srcBuffer, hSize);
+                 if (LZ4F_isError(decodeResult)) {
+                     *srcSizePtr = 0;
+                 } else {
+                     *srcSizePtr = decodeResult;
+                     decodeResult = BHSize;   /* block header size */
+@@ -1285,26 +1460,24 @@ LZ4F_errorCode_t LZ4F_getFrameInfo(LZ4F_
+                 *frameInfoPtr = dctx->frameInfo;
+                 return decodeResult;
+     }   }   }
+ }
+ 
+ 
+ /* LZ4F_updateDict() :
+  * only used for LZ4F_blockLinked mode
+- * Condition : dstPtr != NULL
++ * Condition : @dstPtr != NULL
+  */
+ static void LZ4F_updateDict(LZ4F_dctx* dctx,
+                       const BYTE* dstPtr, size_t dstSize, const BYTE* dstBufferStart,
+                       unsigned withinTmp)
+ {
+     assert(dstPtr != NULL);
+-    if (dctx->dictSize==0) {
+-        dctx->dict = (const BYTE*)dstPtr;   /* priority to prefix mode */
+-    }
++    if (dctx->dictSize==0) dctx->dict = (const BYTE*)dstPtr;  /* will lead to prefix mode */
+     assert(dctx->dict != NULL);
+ 
+     if (dctx->dict + dctx->dictSize == dstPtr) {  /* prefix mode, everything within dstBuffer */
+         dctx->dictSize += dstSize;
+         return;
+     }
+ 
+     assert(dstPtr >= dstBufferStart);
+@@ -1357,17 +1530,16 @@ static void LZ4F_updateDict(LZ4F_dctx* d
+         memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - preserveSize, preserveSize);
+         memcpy(dctx->tmpOutBuffer + preserveSize, dstPtr, dstSize);
+         dctx->dict = dctx->tmpOutBuffer;
+         dctx->dictSize = preserveSize + dstSize;
+     }
+ }
+ 
+ 
+-
+ /*! LZ4F_decompress() :
+  *  Call this function repetitively to regenerate compressed data in srcBuffer.
+  *  The function will attempt to decode up to *srcSizePtr bytes from srcBuffer
+  *  into dstBuffer of capacity *dstSizePtr.
+  *
+  *  The number of bytes regenerated into dstBuffer will be provided within *dstSizePtr (necessarily <= original value).
+  *
+  *  The number of bytes effectively read from srcBuffer will be provided within *srcSizePtr (necessarily <= original value).
+@@ -1401,29 +1573,30 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+     DEBUGLOG(5, "LZ4F_decompress : %p,%u => %p,%u",
+             srcBuffer, (unsigned)*srcSizePtr, dstBuffer, (unsigned)*dstSizePtr);
+     if (dstBuffer == NULL) assert(*dstSizePtr == 0);
+     MEM_INIT(&optionsNull, 0, sizeof(optionsNull));
+     if (decompressOptionsPtr==NULL) decompressOptionsPtr = &optionsNull;
+     *srcSizePtr = 0;
+     *dstSizePtr = 0;
+     assert(dctx != NULL);
++    dctx->skipChecksum |= (decompressOptionsPtr->skipChecksums != 0); /* once set, disable for the remainder of the frame */
+ 
+     /* behaves as a state machine */
+ 
+     while (doAnotherStage) {
+ 
+         switch(dctx->dStage)
+         {
+ 
+         case dstage_getFrameHeader:
+             DEBUGLOG(6, "dstage_getFrameHeader");
+             if ((size_t)(srcEnd-srcPtr) >= maxFHSize) {  /* enough to decode - shortcut */
+                 size_t const hSize = LZ4F_decodeHeader(dctx, srcPtr, (size_t)(srcEnd-srcPtr));  /* will update dStage appropriately */
+-                if (LZ4F_isError(hSize)) return hSize;
++                FORWARD_IF_ERROR(hSize);
+                 srcPtr += hSize;
+                 break;
+             }
+             dctx->tmpInSize = 0;
+             if (srcEnd-srcPtr == 0) return minFHSize;   /* 0-size input */
+             dctx->tmpInTarget = minFHSize;   /* minimum size to decode header */
+             dctx->dStage = dstage_storeFrameHeader;
+             /* fall-through */
+@@ -1435,37 +1608,33 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                 dctx->tmpInSize += sizeToCopy;
+                 srcPtr += sizeToCopy;
+             }
+             if (dctx->tmpInSize < dctx->tmpInTarget) {
+                 nextSrcSizeHint = (dctx->tmpInTarget - dctx->tmpInSize) + BHSize;   /* rest of header + nextBlockHeader */
+                 doAnotherStage = 0;   /* not enough src data, ask for some more */
+                 break;
+             }
+-            {   size_t const hSize = LZ4F_decodeHeader(dctx, dctx->header, dctx->tmpInTarget);  /* will update dStage appropriately */
+-                if (LZ4F_isError(hSize)) return hSize;
+-            }
++            FORWARD_IF_ERROR( LZ4F_decodeHeader(dctx, dctx->header, dctx->tmpInTarget) ); /* will update dStage appropriately */
+             break;
+ 
+         case dstage_init:
+             DEBUGLOG(6, "dstage_init");
+             if (dctx->frameInfo.contentChecksumFlag) (void)XXH32_reset(&(dctx->xxh), 0);
+             /* internal buffers allocation */
+             {   size_t const bufferNeeded = dctx->maxBlockSize
+                     + ((dctx->frameInfo.blockMode==LZ4F_blockLinked) ? 128 KB : 0);
+                 if (bufferNeeded > dctx->maxBufferSize) {   /* tmp buffers too small */
+                     dctx->maxBufferSize = 0;   /* ensure allocation will be re-attempted on next entry*/
+-                    FREEMEM(dctx->tmpIn);
+-                    dctx->tmpIn = (BYTE*)ALLOC(dctx->maxBlockSize + BFSize /* block checksum */);
+-                    if (dctx->tmpIn == NULL)
+-                        return err0r(LZ4F_ERROR_allocation_failed);
+-                    FREEMEM(dctx->tmpOutBuffer);
+-                    dctx->tmpOutBuffer= (BYTE*)ALLOC(bufferNeeded);
+-                    if (dctx->tmpOutBuffer== NULL)
+-                        return err0r(LZ4F_ERROR_allocation_failed);
++                    LZ4F_free(dctx->tmpIn, dctx->cmem);
++                    dctx->tmpIn = (BYTE*)LZ4F_malloc(dctx->maxBlockSize + BFSize /* block checksum */, dctx->cmem);
++                    RETURN_ERROR_IF(dctx->tmpIn == NULL, allocation_failed);
++                    LZ4F_free(dctx->tmpOutBuffer, dctx->cmem);
++                    dctx->tmpOutBuffer= (BYTE*)LZ4F_malloc(bufferNeeded, dctx->cmem);
++                    RETURN_ERROR_IF(dctx->tmpOutBuffer== NULL, allocation_failed);
+                     dctx->maxBufferSize = bufferNeeded;
+             }   }
+             dctx->tmpInSize = 0;
+             dctx->tmpInTarget = 0;
+             dctx->tmpOut = dctx->tmpOutBuffer;
+             dctx->tmpOutStart = 0;
+             dctx->tmpOutSize = 0;
+ 
+@@ -1504,17 +1673,17 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                 size_t const nextCBlockSize = blockHeader & 0x7FFFFFFFU;
+                 size_t const crcSize = dctx->frameInfo.blockChecksumFlag * BFSize;
+                 if (blockHeader==0) {  /* frameEnd signal, no more block */
+                     DEBUGLOG(5, "end of frame");
+                     dctx->dStage = dstage_getSuffix;
+                     break;
+                 }
+                 if (nextCBlockSize > dctx->maxBlockSize) {
+-                    return err0r(LZ4F_ERROR_maxBlockSize_invalid);
++                    RETURN_ERROR(maxBlockSize_invalid);
+                 }
+                 if (blockHeader & LZ4F_BLOCKUNCOMPRESSED_FLAG) {
+                     /* next block is uncompressed */
+                     dctx->tmpInTarget = nextCBlockSize;
+                     DEBUGLOG(5, "next block is uncompressed (size %u)", (U32)nextCBlockSize);
+                     if (dctx->frameInfo.blockChecksumFlag) {
+                         (void)XXH32_reset(&dctx->blockChecksum, 0);
+                     }
+@@ -1535,21 +1704,23 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+             DEBUGLOG(6, "dstage_copyDirect");
+             {   size_t sizeToCopy;
+                 if (dstPtr == NULL) {
+                     sizeToCopy = 0;
+                 } else {
+                     size_t const minBuffSize = MIN((size_t)(srcEnd-srcPtr), (size_t)(dstEnd-dstPtr));
+                     sizeToCopy = MIN(dctx->tmpInTarget, minBuffSize);
+                     memcpy(dstPtr, srcPtr, sizeToCopy);
+-                    if (dctx->frameInfo.blockChecksumFlag) {
+-                        (void)XXH32_update(&dctx->blockChecksum, srcPtr, sizeToCopy);
++                    if (!dctx->skipChecksum) {
++                        if (dctx->frameInfo.blockChecksumFlag) {
++                            (void)XXH32_update(&dctx->blockChecksum, srcPtr, sizeToCopy);
++                        }
++                        if (dctx->frameInfo.contentChecksumFlag)
++                            (void)XXH32_update(&dctx->xxh, srcPtr, sizeToCopy);
+                     }
+-                    if (dctx->frameInfo.contentChecksumFlag)
+-                        (void)XXH32_update(&dctx->xxh, srcPtr, sizeToCopy);
+                     if (dctx->frameInfo.contentSize)
+                         dctx->frameRemainingSize -= sizeToCopy;
+ 
+                     /* history management (linked blocks only)*/
+                     if (dctx->frameInfo.blockMode == LZ4F_blockLinked) {
+                         LZ4F_updateDict(dctx, dstPtr, sizeToCopy, dstStart, 0);
+                 }   }
+ 
+@@ -1585,24 +1756,25 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                     dctx->tmpInSize += sizeToCopy;
+                     srcPtr += sizeToCopy;
+                     if (dctx->tmpInSize < 4) {  /* all input consumed */
+                         doAnotherStage = 0;
+                         break;
+                     }
+                     crcSrc = dctx->header;
+                 }
+-                {   U32 const readCRC = LZ4F_readLE32(crcSrc);
++                if (!dctx->skipChecksum) {
++                    U32 const readCRC = LZ4F_readLE32(crcSrc);
+                     U32 const calcCRC = XXH32_digest(&dctx->blockChecksum);
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+                     DEBUGLOG(6, "compare block checksum");
+                     if (readCRC != calcCRC) {
+                         DEBUGLOG(4, "incorrect block checksum: %08X != %08X",
+                                 readCRC, calcCRC);
+-                        return err0r(LZ4F_ERROR_blockChecksum_invalid);
++                        RETURN_ERROR(blockChecksum_invalid);
+                     }
+ #else
+                     (void)readCRC;
+                     (void)calcCRC;
+ #endif
+             }   }
+             dctx->dStage = dstage_getBlockHeader;  /* new block */
+             break;
+@@ -1632,91 +1804,99 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                                     + BHSize /* next header size */;
+                     doAnotherStage = 0;
+                     break;
+                 }
+                 selectedIn = dctx->tmpIn;
+             }
+ 
+             /* At this stage, input is large enough to decode a block */
++
++            /* First, decode and control block checksum if it exists */
+             if (dctx->frameInfo.blockChecksumFlag) {
++                assert(dctx->tmpInTarget >= 4);
+                 dctx->tmpInTarget -= 4;
+                 assert(selectedIn != NULL);  /* selectedIn is defined at this stage (either srcPtr, or dctx->tmpIn) */
+                 {   U32 const readBlockCrc = LZ4F_readLE32(selectedIn + dctx->tmpInTarget);
+                     U32 const calcBlockCrc = XXH32(selectedIn, dctx->tmpInTarget, 0);
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+-                    if (readBlockCrc != calcBlockCrc)
+-                        return err0r(LZ4F_ERROR_blockChecksum_invalid);
++                    RETURN_ERROR_IF(readBlockCrc != calcBlockCrc, blockChecksum_invalid);
+ #else
+                     (void)readBlockCrc;
+                     (void)calcBlockCrc;
+ #endif
+             }   }
+ 
+-            if ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) {
++            /* decode directly into destination buffer if there is enough room */
++            if ( ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize)
++                 /* unless the dictionary is stored in tmpOut:
++                  * in which case it's faster to decode within tmpOut
++                  * to benefit from prefix speedup */
++              && !(dctx->dict!= NULL && (const BYTE*)dctx->dict + dctx->dictSize == dctx->tmpOut) )
++            {
+                 const char* dict = (const char*)dctx->dict;
+                 size_t dictSize = dctx->dictSize;
+                 int decodedSize;
+                 assert(dstPtr != NULL);
+                 if (dict && dictSize > 1 GB) {
+-                    /* the dictSize param is an int, avoid truncation / sign issues */
++                    /* overflow control : dctx->dictSize is an int, avoid truncation / sign issues */
+                     dict += dictSize - 64 KB;
+                     dictSize = 64 KB;
+                 }
+-                /* enough capacity in `dst` to decompress directly there */
+                 decodedSize = LZ4_decompress_safe_usingDict(
+                         (const char*)selectedIn, (char*)dstPtr,
+                         (int)dctx->tmpInTarget, (int)dctx->maxBlockSize,
+                         dict, (int)dictSize);
+-                if (decodedSize < 0) return err0r(LZ4F_ERROR_GENERIC);   /* decompression failed */
+-                if (dctx->frameInfo.contentChecksumFlag)
++                RETURN_ERROR_IF(decodedSize < 0, decompressionFailed);
++                if ((dctx->frameInfo.contentChecksumFlag) && (!dctx->skipChecksum))
+                     XXH32_update(&(dctx->xxh), dstPtr, (size_t)decodedSize);
+                 if (dctx->frameInfo.contentSize)
+                     dctx->frameRemainingSize -= (size_t)decodedSize;
+ 
+                 /* dictionary management */
+                 if (dctx->frameInfo.blockMode==LZ4F_blockLinked) {
+                     LZ4F_updateDict(dctx, dstPtr, (size_t)decodedSize, dstStart, 0);
+                 }
+ 
+                 dstPtr += decodedSize;
+-                dctx->dStage = dstage_getBlockHeader;
++                dctx->dStage = dstage_getBlockHeader;  /* end of block, let's get another one */
+                 break;
+             }
+ 
+             /* not enough place into dst : decode into tmpOut */
+-            /* ensure enough place for tmpOut */
++
++            /* manage dictionary */
+             if (dctx->frameInfo.blockMode == LZ4F_blockLinked) {
+                 if (dctx->dict == dctx->tmpOutBuffer) {
++                    /* truncate dictionary to 64 KB if too big */
+                     if (dctx->dictSize > 128 KB) {
+                         memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - 64 KB, 64 KB);
+                         dctx->dictSize = 64 KB;
+                     }
+                     dctx->tmpOut = dctx->tmpOutBuffer + dctx->dictSize;
+-                } else {  /* dict not within tmp */
++                } else {  /* dict not within tmpOut */
+                     size_t const reservedDictSpace = MIN(dctx->dictSize, 64 KB);
+                     dctx->tmpOut = dctx->tmpOutBuffer + reservedDictSpace;
+             }   }
+ 
+-            /* Decode block */
++            /* Decode block into tmpOut */
+             {   const char* dict = (const char*)dctx->dict;
+                 size_t dictSize = dctx->dictSize;
+                 int decodedSize;
+                 if (dict && dictSize > 1 GB) {
+                     /* the dictSize param is an int, avoid truncation / sign issues */
+                     dict += dictSize - 64 KB;
+                     dictSize = 64 KB;
+                 }
+                 decodedSize = LZ4_decompress_safe_usingDict(
+                         (const char*)selectedIn, (char*)dctx->tmpOut,
+                         (int)dctx->tmpInTarget, (int)dctx->maxBlockSize,
+                         dict, (int)dictSize);
+-                if (decodedSize < 0)  /* decompression failed */
+-                    return err0r(LZ4F_ERROR_decompressionFailed);
+-                if (dctx->frameInfo.contentChecksumFlag)
++                RETURN_ERROR_IF(decodedSize < 0, decompressionFailed);
++                if (dctx->frameInfo.contentChecksumFlag && !dctx->skipChecksum)
+                     XXH32_update(&(dctx->xxh), dctx->tmpOut, (size_t)decodedSize);
+                 if (dctx->frameInfo.contentSize)
+                     dctx->frameRemainingSize -= (size_t)decodedSize;
+                 dctx->tmpOutSize = (size_t)decodedSize;
+                 dctx->tmpOutStart = 0;
+                 dctx->dStage = dstage_flushOut;
+             }
+             /* fall-through */
+@@ -1739,18 +1919,17 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                 break;
+             }
+             /* could not flush everything : stop there, just request a block header */
+             doAnotherStage = 0;
+             nextSrcSizeHint = BHSize;
+             break;
+ 
+         case dstage_getSuffix:
+-            if (dctx->frameRemainingSize)
+-                return err0r(LZ4F_ERROR_frameSize_wrong);   /* incorrect frame size decoded */
++            RETURN_ERROR_IF(dctx->frameRemainingSize, frameSize_wrong);   /* incorrect frame size decoded */
+             if (!dctx->frameInfo.contentChecksumFlag) {  /* no checksum, frame is completed */
+                 nextSrcSizeHint = 0;
+                 LZ4F_resetDecompressionContext(dctx);
+                 doAnotherStage = 0;
+                 break;
+             }
+             if ((srcEnd - srcPtr) < 4) {  /* not enough size for entire CRC */
+                 dctx->tmpInSize = 0;
+@@ -1772,30 +1951,30 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                     nextSrcSizeHint = 4 - dctx->tmpInSize;
+                     doAnotherStage=0;
+                     break;
+                 }
+                 selectedIn = dctx->tmpIn;
+             }   /* if (dctx->dStage == dstage_storeSuffix) */
+ 
+         /* case dstage_checkSuffix: */   /* no direct entry, avoid initialization risks */
+-            {   U32 const readCRC = LZ4F_readLE32(selectedIn);
++            if (!dctx->skipChecksum) {
++                U32 const readCRC = LZ4F_readLE32(selectedIn);
+                 U32 const resultCRC = XXH32_digest(&(dctx->xxh));
+ #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+-                if (readCRC != resultCRC)
+-                    return err0r(LZ4F_ERROR_contentChecksum_invalid);
++                RETURN_ERROR_IF(readCRC != resultCRC, contentChecksum_invalid);
+ #else
+                 (void)readCRC;
+                 (void)resultCRC;
+ #endif
+-                nextSrcSizeHint = 0;
+-                LZ4F_resetDecompressionContext(dctx);
+-                doAnotherStage = 0;
+-                break;
+             }
++            nextSrcSizeHint = 0;
++            LZ4F_resetDecompressionContext(dctx);
++            doAnotherStage = 0;
++            break;
+ 
+         case dstage_getSFrameSize:
+             if ((srcEnd - srcPtr) >= 4) {
+                 selectedIn = srcPtr;
+                 srcPtr += 4;
+             } else {
+                 /* not enough input to read cBlockSize field */
+                 dctx->tmpInSize = 4;
+@@ -1836,17 +2015,17 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
+                 if (nextSrcSizeHint) break;  /* still more to skip */
+                 /* frame fully skipped : prepare context for a new frame */
+                 LZ4F_resetDecompressionContext(dctx);
+                 break;
+             }
+         }   /* switch (dctx->dStage) */
+     }   /* while (doAnotherStage) */
+ 
+-    /* preserve history within tmp whenever necessary */
++    /* preserve history within tmpOut whenever necessary */
+     LZ4F_STATIC_ASSERT((unsigned)dstage_init == 2);
+     if ( (dctx->frameInfo.blockMode==LZ4F_blockLinked)  /* next block will use up to 64KB from previous ones */
+       && (dctx->dict != dctx->tmpOutBuffer)             /* dictionary is not already within tmp */
+       && (dctx->dict != NULL)                           /* dictionary exists */
+       && (!decompressOptionsPtr->stableDst)             /* cannot rely on dst data to remain there for next call */
+       && ((unsigned)(dctx->dStage)-2 < (unsigned)(dstage_getSuffix)-2) )  /* valid stages : [init ... getSuffix[ */
+     {
+         if (dctx->dStage == dstage_flushOut) {
+diff --git a/mfbt/lz4/lz4frame.h b/mfbt/lz4/lz4frame.h
+--- a/mfbt/lz4/lz4frame.h
++++ b/mfbt/lz4/lz4frame.h
+@@ -1,12 +1,12 @@
+ /*
+-   LZ4 auto-framing library
++   LZ4F - LZ4-Frame library
+    Header File
+-   Copyright (C) 2011-2017, Yann Collet.
++   Copyright (C) 2011-2020, Yann Collet.
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ 
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+ 
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+@@ -34,37 +34,37 @@
+ 
+ /* LZ4F is a stand-alone API able to create and decode LZ4 frames
+  * conformant with specification v1.6.1 in doc/lz4_Frame_format.md .
+  * Generated frames are compatible with `lz4` CLI.
+  *
+  * LZ4F also offers streaming capabilities.
+  *
+  * lz4.h is not required when using lz4frame.h,
+- * except to extract common constant such as LZ4_VERSION_NUMBER.
++ * except to extract common constants such as LZ4_VERSION_NUMBER.
+  * */
+ 
+ #ifndef LZ4F_H_09782039843
+ #define LZ4F_H_09782039843
+ 
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+ 
+ /* ---   Dependency   --- */
+ #include <stddef.h>   /* size_t */
+ 
+ 
+ /**
+-  Introduction
+-
+-  lz4frame.h implements LZ4 frame specification (doc/lz4_Frame_format.md).
+-  lz4frame.h provides frame compression functions that take care
+-  of encoding standard metadata alongside LZ4-compressed blocks.
+-*/
++ * Introduction
++ *
++ * lz4frame.h implements LZ4 frame specification: see doc/lz4_Frame_format.md .
++ * LZ4 Frames are compatible with `lz4` CLI,
++ * and designed to be interoperable with any system.
++**/
+ 
+ /*-***************************************************************
+  *  Compiler specifics
+  *****************************************************************/
+ /*  LZ4_DLL_EXPORT :
+  *  Enable exporting of functions when building a Windows DLL
+  *  LZ4FLIB_VISIBILITY :
+  *  Control library symbols visibility.
+@@ -205,17 +205,17 @@ typedef struct {
+ ***********************************/
+ 
+ LZ4FLIB_API int LZ4F_compressionLevel_max(void);   /* v1.8.0+ */
+ 
+ /*! LZ4F_compressFrameBound() :
+  *  Returns the maximum possible compressed size with LZ4F_compressFrame() given srcSize and preferences.
+  * `preferencesPtr` is optional. It can be replaced by NULL, in which case, the function will assume default preferences.
+  *  Note : this result is only usable with LZ4F_compressFrame().
+- *         It may also be used with LZ4F_compressUpdate() _if no flush() operation_ is performed.
++ *         It may also be relevant to LZ4F_compressUpdate() _only if_ no flush() operation is ever performed.
+  */
+ LZ4FLIB_API size_t LZ4F_compressFrameBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr);
+ 
+ /*! LZ4F_compressFrame() :
+  *  Compress an entire srcBuffer into a valid LZ4 frame.
+  *  dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
+  *  The LZ4F_preferences_t structure is optional : you can provide NULL as argument. All preferences will be set to default.
+  * @return : number of bytes written into dstBuffer.
+@@ -225,43 +225,50 @@ LZ4FLIB_API size_t LZ4F_compressFrame(vo
+                                 const void* srcBuffer, size_t srcSize,
+                                 const LZ4F_preferences_t* preferencesPtr);
+ 
+ 
+ /*-***********************************
+ *  Advanced compression functions
+ *************************************/
+ typedef struct LZ4F_cctx_s LZ4F_cctx;   /* incomplete type */
+-typedef LZ4F_cctx* LZ4F_compressionContext_t;   /* for compatibility with previous API version */
++typedef LZ4F_cctx* LZ4F_compressionContext_t;  /* for compatibility with older APIs, prefer using LZ4F_cctx */
+ 
+ typedef struct {
+   unsigned stableSrc;    /* 1 == src content will remain present on future calls to LZ4F_compress(); skip copying src content within tmp buffer */
+   unsigned reserved[3];
+ } LZ4F_compressOptions_t;
+ 
+ /*---   Resource Management   ---*/
+ 
+ #define LZ4F_VERSION 100    /* This number can be used to check for an incompatible API breaking change */
+ LZ4FLIB_API unsigned LZ4F_getVersion(void);
+ 
+ /*! LZ4F_createCompressionContext() :
+- * The first thing to do is to create a compressionContext object, which will be used in all compression operations.
+- * This is achieved using LZ4F_createCompressionContext(), which takes as argument a version.
+- * The version provided MUST be LZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL.
+- * The function will provide a pointer to a fully allocated LZ4F_cctx object.
+- * If @return != zero, there was an error during context creation.
+- * Object can release its memory using LZ4F_freeCompressionContext();
+- */
++ *  The first thing to do is to create a compressionContext object,
++ *  which will keep track of operation state during streaming compression.
++ *  This is achieved using LZ4F_createCompressionContext(), which takes as argument a version,
++ *  and a pointer to LZ4F_cctx*, to write the resulting pointer into.
++ *  @version provided MUST be LZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL.
++ *  The function provides a pointer to a fully allocated LZ4F_cctx object.
++ *  @cctxPtr MUST be != NULL.
++ *  If @return != zero, context creation failed.
++ *  A created compression context can be employed multiple times for consecutive streaming operations.
++ *  Once all streaming compression jobs are completed,
++ *  the state object can be released using LZ4F_freeCompressionContext().
++ *  Note1 : LZ4F_freeCompressionContext() is always successful. Its return value can be ignored.
++ *  Note2 : LZ4F_freeCompressionContext() works fine with NULL input pointers (do nothing).
++**/
+ LZ4FLIB_API LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** cctxPtr, unsigned version);
+ LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctx);
+ 
+ 
+ /*----    Compression    ----*/
+ 
+-#define LZ4F_HEADER_SIZE_MIN  7   /* LZ4 Frame header size can vary, depending on selected paramaters */
++#define LZ4F_HEADER_SIZE_MIN  7   /* LZ4 Frame header size can vary, depending on selected parameters */
+ #define LZ4F_HEADER_SIZE_MAX 19
+ 
+ /* Size in bytes of a block header in little-endian format. Highest bit indicates if block data is uncompressed */
+ #define LZ4F_BLOCK_HEADER_SIZE 4
+ 
+ /* Size in bytes of a block checksum footer in little-endian format. */
+ #define LZ4F_BLOCK_CHECKSUM_SIZE 4
+ 
+@@ -296,18 +303,19 @@ LZ4FLIB_API size_t LZ4F_compressBegin(LZ
+  */
+ LZ4FLIB_API size_t LZ4F_compressBound(size_t srcSize, const LZ4F_preferences_t* prefsPtr);
+ 
+ /*! LZ4F_compressUpdate() :
+  *  LZ4F_compressUpdate() can be called repetitively to compress as much data as necessary.
+  *  Important rule: dstCapacity MUST be large enough to ensure operation success even in worst case situations.
+  *  This value is provided by LZ4F_compressBound().
+  *  If this condition is not respected, LZ4F_compress() will fail (result is an errorCode).
+- *  LZ4F_compressUpdate() doesn't guarantee error recovery.
+- *  When an error occurs, compression context must be freed or resized.
++ *  After an error, the state is left in an undefined state, and must be re-initialized or freed.
++ *  If an uncompressed block was written previously, buffered data is flushed
++ *  before compressed data is appended.
+  * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default.
+  * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered).
+  *           or an error code if it fails (which can be tested using LZ4F_isError())
+  */
+ LZ4FLIB_API size_t LZ4F_compressUpdate(LZ4F_cctx* cctx,
+                                        void* dstBuffer, size_t dstCapacity,
+                                  const void* srcBuffer, size_t srcSize,
+                                  const LZ4F_compressOptions_t* cOptPtr);
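For illustration, a minimal file-to-file streaming sketch built only from the entry points documented above (LZ4F_compressBegin / LZ4F_compressUpdate / LZ4F_compressEnd); the 64 KB chunk size, the stdio-based I/O, the helper name and the condensed error handling are illustrative choices, not requirements of the API.

#include <lz4frame.h>
#include <stdio.h>
#include <stdlib.h>

/* Stream `fin` into `fout` as a single LZ4 frame; returns 0 on success.
 * dstCap (worst case for one 64 KB update) is also large enough for
 * LZ4F_compressBegin() and LZ4F_compressEnd(). */
static int compress_file(FILE* fin, FILE* fout)
{
    enum { CHUNK = 64 * 1024 };                          /* arbitrary read size */
    size_t const dstCap = LZ4F_compressBound(CHUNK, NULL);
    char* const src = (char*)malloc(CHUNK);
    char* const dst = (char*)malloc(dstCap);
    LZ4F_cctx* cctx = NULL;
    int err = 1;

    if (src && dst && !LZ4F_isError(LZ4F_createCompressionContext(&cctx, LZ4F_VERSION))) {
        size_t n = LZ4F_compressBegin(cctx, dst, dstCap, NULL);          /* frame header */
        if (!LZ4F_isError(n) && fwrite(dst, 1, n, fout) == n) {
            size_t readSize;
            err = 0;
            while (!err && (readSize = fread(src, 1, CHUNK, fin)) != 0) {
                n = LZ4F_compressUpdate(cctx, dst, dstCap, src, readSize, NULL);
                if (LZ4F_isError(n) || fwrite(dst, 1, n, fout) != n) err = 1;
            }
            if (!err) {
                n = LZ4F_compressEnd(cctx, dst, dstCap, NULL);           /* endmark + checksum */
                if (LZ4F_isError(n) || fwrite(dst, 1, n, fout) != n) err = 1;
            }
        }
    }
    LZ4F_freeCompressionContext(cctx);   /* NULL-safe */
    free(src); free(dst);
    return err;
}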
+@@ -342,56 +350,63 @@ LZ4FLIB_API size_t LZ4F_compressEnd(LZ4F
+ 
+ /*-*********************************
+ *  Decompression functions
+ ***********************************/
+ typedef struct LZ4F_dctx_s LZ4F_dctx;   /* incomplete type */
+ typedef LZ4F_dctx* LZ4F_decompressionContext_t;   /* compatibility with previous API versions */
+ 
+ typedef struct {
+-  unsigned stableDst;    /* pledges that last 64KB decompressed data will remain available unmodified. This optimization skips storage operations in tmp buffers. */
+-  unsigned reserved[3];  /* must be set to zero for forward compatibility */
++  unsigned stableDst;     /* pledges that last 64KB decompressed data will remain available unmodified between invocations.
++                           * This optimization skips storage operations in tmp buffers. */
++  unsigned skipChecksums; /* disable checksum calculation and verification, even when one is present in frame, to save CPU time.
++                           * Setting this option to 1 once disables all checksums for the rest of the frame. */
++  unsigned reserved1;     /* must be set to zero for forward compatibility */
++  unsigned reserved0;     /* idem */
+ } LZ4F_decompressOptions_t;
+ 
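Since a zero-initialized options struct keeps the previous behaviour, callers opt in to skipping checksums explicitly; a small sketch (the helper name is hypothetical):

#include <string.h>
#include <lz4frame.h>

/* Build a zero-initialized options struct; zero keeps checksum verification on
 * and leaves the reserved fields at zero, as required. */
static LZ4F_decompressOptions_t make_decompress_options(int skipChecksums)
{
    LZ4F_decompressOptions_t o;
    memset(&o, 0, sizeof(o));
    o.skipChecksums = skipChecksums ? 1u : 0u;  /* once set, sticky for the rest of the frame */
    return o;
}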
+ 
+ /* Resource management */
+ 
+ /*! LZ4F_createDecompressionContext() :
+  *  Create an LZ4F_dctx object, to track all decompression operations.
+- *  The version provided MUST be LZ4F_VERSION.
+- *  The function provides a pointer to an allocated and initialized LZ4F_dctx object.
+- *  The result is an errorCode, which can be tested using LZ4F_isError().
++ *  @version provided MUST be LZ4F_VERSION.
++ *  @dctxPtr MUST be valid.
++ *  The function fills @dctxPtr with the value of a pointer to an allocated and initialized LZ4F_dctx object.
++ *  The @return is an errorCode, which can be tested using LZ4F_isError().
+  *  dctx memory can be released using LZ4F_freeDecompressionContext();
+  *  Result of LZ4F_freeDecompressionContext() indicates current state of decompressionContext when being released.
+  *  That is, it should be == 0 if decompression has been completed fully and correctly.
+  */
+ LZ4FLIB_API LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** dctxPtr, unsigned version);
+ LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx);
+ 
+ 
+ /*-***********************************
+ *  Streaming decompression functions
+ *************************************/
+ 
++#define LZ4F_MAGICNUMBER 0x184D2204U
++#define LZ4F_MAGIC_SKIPPABLE_START 0x184D2A50U
+ #define LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH 5
+ 
+ /*! LZ4F_headerSize() : v1.9.0+
+  *  Provide the header size of a frame starting at `src`.
+  * `srcSize` must be >= LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH,
+  *  which is enough to decode the header length.
+  * @return : size of frame header
+  *           or an error code, which can be tested using LZ4F_isError()
+  *  note : Frame header size is variable, but is guaranteed to be
+  *         >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes.
+  */
+ LZ4FLIB_API size_t LZ4F_headerSize(const void* src, size_t srcSize);
+ 
+ /*! LZ4F_getFrameInfo() :
+  *  This function extracts frame parameters (max blockSize, dictID, etc.).
+- *  Its usage is optional: user can call LZ4F_decompress() directly.
++ *  Its usage is optional: user can also invoke LZ4F_decompress() directly.
+  *
+  *  Extracted information will fill an existing LZ4F_frameInfo_t structure.
+  *  This can be useful for allocation and dictionary identification purposes.
+  *
+  *  LZ4F_getFrameInfo() can work in the following situations :
+  *
+  *  1) At the beginning of a new frame, before any invocation of LZ4F_decompress().
+  *     It will decode header from `srcBuffer`,
+@@ -422,19 +437,20 @@ LZ4FLIB_API size_t LZ4F_headerSize(const
+  *  and when decoding the header has been successful.
+  *  Decompression must then resume from (srcBuffer + *srcSizePtr).
+  *
+  * @return : a hint about how many srcSize bytes LZ4F_decompress() expects for next call,
+  *           or an error code which can be tested using LZ4F_isError().
+  *  note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely.
+  *  note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure.
+  */
+-LZ4FLIB_API size_t LZ4F_getFrameInfo(LZ4F_dctx* dctx,
+-                                     LZ4F_frameInfo_t* frameInfoPtr,
+-                                     const void* srcBuffer, size_t* srcSizePtr);
++LZ4FLIB_API size_t
++LZ4F_getFrameInfo(LZ4F_dctx* dctx,
++                  LZ4F_frameInfo_t* frameInfoPtr,
++            const void* srcBuffer, size_t* srcSizePtr);
+ 
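A short sketch of the probing sequence described above; the helper name and the printf reporting are illustrative, and LZ4F_STATIC_LINKING_ONLY is assumed to be needed to expose the LZ4FLIB_STATIC_API declarations such as LZ4F_getBlockSize().

#define LZ4F_STATIC_LINKING_ONLY   /* for LZ4F_getBlockSize() */
#include <lz4frame.h>
#include <stdio.h>

/* Probe a frame header: `hdr` holds the first `hdrSize` bytes of the frame
 * (ideally >= LZ4F_HEADER_SIZE_MAX). Returns bytes consumed, or 0 on error;
 * decompression must then resume at hdr + consumed. */
static size_t probe_frame(LZ4F_dctx* dctx, const void* hdr, size_t hdrSize)
{
    LZ4F_frameInfo_t info;
    size_t consumed = hdrSize;                       /* in: available, out: consumed */
    size_t const hint = LZ4F_getFrameInfo(dctx, &info, hdr, &consumed);
    if (LZ4F_isError(hint)) return 0;

    printf("blockSizeID=%u  maxBlockSize=%zu  linked=%d  contentSize=%llu\n",
           (unsigned)info.blockSizeID,
           LZ4F_getBlockSize(info.blockSizeID),      /* uses the new typed prototype */
           info.blockMode == LZ4F_blockLinked,
           (unsigned long long)info.contentSize);
    return consumed;
}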
+ /*! LZ4F_decompress() :
+  *  Call this function repetitively to regenerate data compressed in `srcBuffer`.
+  *
+  *  The function requires a valid dctx state.
+  *  It will read up to *srcSizePtr bytes from srcBuffer,
+  *  and decompress data into dstBuffer, of capacity *dstSizePtr.
+  *
+@@ -457,20 +473,21 @@ LZ4FLIB_API size_t LZ4F_getFrameInfo(LZ4
+  *  LZ4F_decompress() will stop reading exactly at end of current frame, and @return 0.
+  *
+  *  If decompression failed, @return is an error code, which can be tested using LZ4F_isError().
+  *  After a decompression error, the `dctx` context is not resumable.
+  *  Use LZ4F_resetDecompressionContext() to return to clean state.
+  *
+  *  After a frame is fully decoded, dctx can be used again to decompress another frame.
+  */
+-LZ4FLIB_API size_t LZ4F_decompress(LZ4F_dctx* dctx,
+-                                   void* dstBuffer, size_t* dstSizePtr,
+-                                   const void* srcBuffer, size_t* srcSizePtr,
+-                                   const LZ4F_decompressOptions_t* dOptPtr);
++LZ4FLIB_API size_t
++LZ4F_decompress(LZ4F_dctx* dctx,
++                void* dstBuffer, size_t* dstSizePtr,
++          const void* srcBuffer, size_t* srcSizePtr,
++          const LZ4F_decompressOptions_t* dOptPtr);
+ 
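A compact frame-decoding loop following the contract above (read, decompress, write, repeat until the function returns 0); buffer sizes and the helper name are illustrative, and real code would size the output buffer from LZ4F_getFrameInfo().

#include <lz4frame.h>
#include <stdio.h>

/* Decompress exactly one LZ4 frame from `fin` to `fout`; returns 0 on success. */
static int decompress_file(FILE* fin, FILE* fout)
{
    char srcBuf[16 * 1024];
    char dstBuf[64 * 1024];
    LZ4F_dctx* dctx = NULL;
    size_t hint = 1;   /* non-zero : the frame is not finished yet */
    int err = 0;

    if (LZ4F_isError(LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION))) return 1;

    while (hint != 0 && !err) {
        size_t const readSize = fread(srcBuf, 1, sizeof(srcBuf), fin);
        const char* srcPtr = srcBuf;
        const char* const srcEnd = srcBuf + readSize;
        if (readSize == 0) { err = 1; break; }   /* input ended before the frame did */

        while (srcPtr < srcEnd && hint != 0) {   /* one read may feed several calls */
            size_t dstSize = sizeof(dstBuf);
            size_t srcSize = (size_t)(srcEnd - srcPtr);
            hint = LZ4F_decompress(dctx, dstBuf, &dstSize, srcPtr, &srcSize, NULL);
            if (LZ4F_isError(hint)) { err = 1; break; }
            if (dstSize != 0 && fwrite(dstBuf, 1, dstSize, fout) != dstSize) { err = 1; break; }
            srcPtr += srcSize;   /* advance by the amount actually consumed */
        }
    }
    LZ4F_freeDecompressionContext(dctx);   /* NULL-safe */
    return err;
}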
+ 
+ /*! LZ4F_resetDecompressionContext() : added in v1.8.0
+  *  In case of an error, the context is left in "undefined" state.
+  *  In which case, it's necessary to reset it, before re-using it.
+  *  This method can also be used to abruptly stop any unfinished decompression,
+  *  and start a new one using same context resources. */
+ LZ4FLIB_API void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx);   /* always successful */
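For reference, a minimal decompression loop over this API looks roughly like the sketch below (illustrative only; read_chunk/write_chunk are hypothetical caller-supplied I/O helpers and the buffer sizes are arbitrary):

    #include "lz4frame.h"

    /* Hypothetical caller-supplied I/O helpers (not part of lz4frame.h). */
    extern size_t read_chunk(void* buf, size_t capacity);     /* returns 0 at end of input */
    extern void   write_chunk(const void* buf, size_t size);

    /* Decompress one LZ4 frame arriving in chunks; returns 0 on success. */
    static int decompress_stream(void)
    {
        LZ4F_dctx* dctx = NULL;
        if (LZ4F_isError(LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION))) return -1;

        char src[64 * 1024];
        char dst[256 * 1024];
        size_t hint = 1;                              /* "how many srcSize bytes expected next" */
        while (hint != 0) {
            size_t const filled = read_chunk(src, sizeof(src));
            if (filled == 0) break;                   /* input ended before the frame did */
            const char* srcPtr = src;
            const char* const srcEnd = src + filled;
            while (srcPtr < srcEnd && hint != 0) {
                size_t dstSize = sizeof(dst);
                size_t srcSize = (size_t)(srcEnd - srcPtr);
                hint = LZ4F_decompress(dctx, dst, &dstSize, srcPtr, &srcSize, NULL);
                if (LZ4F_isError(hint)) { LZ4F_freeDecompressionContext(dctx); return -1; }
                write_chunk(dst, dstSize);
                srcPtr += srcSize;                    /* srcSize now holds bytes consumed */
            }
        }
        LZ4F_freeDecompressionContext(dctx);
        return (hint == 0) ? 0 : -1;                  /* 0 means end of frame reached */
    }

Note that the error path simply frees the context: as stated above, a dctx is not resumable after a decompression error.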
+@@ -524,27 +541,53 @@ extern "C" {
+         ITEM(ERROR_frameHeader_incomplete) \
+         ITEM(ERROR_frameType_unknown) \
+         ITEM(ERROR_frameSize_wrong) \
+         ITEM(ERROR_srcPtr_wrong) \
+         ITEM(ERROR_decompressionFailed) \
+         ITEM(ERROR_headerChecksum_invalid) \
+         ITEM(ERROR_contentChecksum_invalid) \
+         ITEM(ERROR_frameDecoding_alreadyStarted) \
++        ITEM(ERROR_compressionState_uninitialized) \
++        ITEM(ERROR_parameter_null) \
+         ITEM(ERROR_maxCode)
+ 
+ #define LZ4F_GENERATE_ENUM(ENUM) LZ4F_##ENUM,
+ 
+ /* enum list is exposed, to handle specific errors */
+ typedef enum { LZ4F_LIST_ERRORS(LZ4F_GENERATE_ENUM)
+               _LZ4F_dummy_error_enum_for_c89_never_used } LZ4F_errorCodes;
+ 
+ LZ4FLIB_STATIC_API LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult);
+ 
+-LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(unsigned);
++
++/*! LZ4F_getBlockSize() :
++ *  Return, in scalar format (size_t),
++ *  the maximum block size associated with blockSizeID.
++**/
++LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(LZ4F_blockSizeID_t blockSizeID);
++
++/*! LZ4F_uncompressedUpdate() :
++ *  LZ4F_uncompressedUpdate() can be called repetitively to add as much uncompressed data as necessary.
++ *  Important rule: dstCapacity MUST be large enough to store the entire source buffer,
++ *  as no compression is performed for this operation.
++ *  If this condition is not respected, LZ4F_uncompressedUpdate() will fail (result is an errorCode).
++ *  After an error, the state is left undefined, and must be re-initialized or freed.
++ *  If a compressed block was previously written, buffered data is flushed
++ *  before the uncompressed data is appended.
++ *  This is only supported when LZ4F_blockIndependent is used.
++ * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default.
++ * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered).
++ *           or an error code if it fails (which can be tested using LZ4F_isError())
++ */
++LZ4FLIB_STATIC_API size_t
++LZ4F_uncompressedUpdate(LZ4F_cctx* cctx,
++                        void* dstBuffer, size_t dstCapacity,
++                  const void* srcBuffer, size_t srcSize,
++                  const LZ4F_compressOptions_t* cOptPtr);
+ 
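Tying the two declarations above together, a frame that embeds a buffer verbatim can be produced roughly as follows (a sketch, not a canonical recipe; LZ4F_compressBegin(), LZ4F_compressEnd() and LZ4F_INIT_PREFERENCES come from the stable part of lz4frame.h, and error handling is reduced to early returns):

    /* Write header + one uncompressed block + end mark into dst.
     * Caller guarantees dstCapacity >= LZ4F_HEADER_SIZE_MAX + srcSize + a few bytes of framing. */
    static size_t store_uncompressed(LZ4F_cctx* cctx,
                                     void* dst, size_t dstCapacity,
                                     const void* src, size_t srcSize)
    {
        LZ4F_preferences_t prefs = LZ4F_INIT_PREFERENCES;
        prefs.frameInfo.blockMode = LZ4F_blockIndependent;   /* required for uncompressedUpdate */

        size_t pos = LZ4F_compressBegin(cctx, dst, dstCapacity, &prefs);
        if (LZ4F_isError(pos)) return pos;

        size_t r = LZ4F_uncompressedUpdate(cctx, (char*)dst + pos, dstCapacity - pos,
                                           src, srcSize, NULL);
        if (LZ4F_isError(r)) return r;
        pos += r;

        r = LZ4F_compressEnd(cctx, (char*)dst + pos, dstCapacity - pos, NULL);
        if (LZ4F_isError(r)) return r;
        return pos + r;                                      /* total frame size */
    }

The flushing rule above matters when LZ4F_compressUpdate() and LZ4F_uncompressedUpdate() are mixed on the same frame: any pending compressed data is emitted before the raw block starts.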
+ /**********************************
+  *  Bulk processing dictionary API
+  *********************************/
+ 
+ /* A Dictionary is useful for the compression of small messages (KB range).
+  * It dramatically improves compression efficiency.
+  *
+@@ -578,46 +621,72 @@ LZ4FLIB_STATIC_API void        LZ4F_free
+  *  cctx must point to a context created by LZ4F_createCompressionContext().
+  *  If cdict==NULL, compress without a dictionary.
+  *  dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
+  *  If this condition is not respected, function will fail (@return an errorCode).
+  *  The LZ4F_preferences_t structure is optional : you may provide NULL as argument,
+  *  but it's not recommended, as it's the only way to provide dictID in the frame header.
+  * @return : number of bytes written into dstBuffer.
+  *           or an error code if it fails (can be tested using LZ4F_isError()) */
+-LZ4FLIB_STATIC_API size_t LZ4F_compressFrame_usingCDict(
+-    LZ4F_cctx* cctx,
+-    void* dst, size_t dstCapacity,
+-    const void* src, size_t srcSize,
+-    const LZ4F_CDict* cdict,
+-    const LZ4F_preferences_t* preferencesPtr);
++LZ4FLIB_STATIC_API size_t
++LZ4F_compressFrame_usingCDict(LZ4F_cctx* cctx,
++                              void* dst, size_t dstCapacity,
++                        const void* src, size_t srcSize,
++                        const LZ4F_CDict* cdict,
++                        const LZ4F_preferences_t* preferencesPtr);
+ 
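A typical use of the declaration above pairs it with LZ4F_createCDict()/LZ4F_freeCDict() from the same static section; a rough sketch follows (errors are just returned to the caller, and dstCapacity is assumed to satisfy LZ4F_compressFrameBound(); passing a real LZ4F_preferences_t with dictID set is preferable to NULL, per the note above):

    /* Compress one small message against a shared dictionary.
     * In real code the cctx and cdict would be created once and reused across messages. */
    static size_t compress_msg_with_dict(LZ4F_cctx* cctx, const LZ4F_CDict* cdict,
                                         void* dst, size_t dstCapacity,
                                         const void* msg, size_t msgSize)
    {
        return LZ4F_compressFrame_usingCDict(cctx, dst, dstCapacity,
                                             msg, msgSize, cdict, NULL /* default prefs */);
    }

    /* Setup, done once:
     *   LZ4F_cctx* cctx; LZ4F_createCompressionContext(&cctx, LZ4F_VERSION);
     *   LZ4F_CDict* cdict = LZ4F_createCDict(dictBuffer, dictSize);
     * Teardown: LZ4F_freeCDict(cdict); LZ4F_freeCompressionContext(cctx);
     */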
+ 
+ /*! LZ4F_compressBegin_usingCDict() :
+  *  Inits streaming dictionary compression, and writes the frame header into dstBuffer.
+  *  dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+  * `prefsPtr` is optional : you may provide NULL as argument,
+  *  however, it's the only way to provide dictID in the frame header.
+  * @return : number of bytes written into dstBuffer for the header,
+  *           or an error code (which can be tested using LZ4F_isError()) */
+-LZ4FLIB_STATIC_API size_t LZ4F_compressBegin_usingCDict(
+-    LZ4F_cctx* cctx,
+-    void* dstBuffer, size_t dstCapacity,
+-    const LZ4F_CDict* cdict,
+-    const LZ4F_preferences_t* prefsPtr);
++LZ4FLIB_STATIC_API size_t
++LZ4F_compressBegin_usingCDict(LZ4F_cctx* cctx,
++                              void* dstBuffer, size_t dstCapacity,
++                        const LZ4F_CDict* cdict,
++                        const LZ4F_preferences_t* prefsPtr);
+ 
+ 
+ /*! LZ4F_decompress_usingDict() :
+  *  Same as LZ4F_decompress(), using a predefined dictionary.
+  *  Dictionary is used "in place", without any preprocessing.
+- *  It must remain accessible throughout the entire frame decoding. */
+-LZ4FLIB_STATIC_API size_t LZ4F_decompress_usingDict(
+-    LZ4F_dctx* dctxPtr,
+-    void* dstBuffer, size_t* dstSizePtr,
+-    const void* srcBuffer, size_t* srcSizePtr,
+-    const void* dict, size_t dictSize,
+-    const LZ4F_decompressOptions_t* decompressOptionsPtr);
++**  It must remain accessible throughout the entire frame decoding. */
++LZ4FLIB_STATIC_API size_t
++LZ4F_decompress_usingDict(LZ4F_dctx* dctxPtr,
++                          void* dstBuffer, size_t* dstSizePtr,
++                    const void* srcBuffer, size_t* srcSizePtr,
++                    const void* dict, size_t dictSize,
++                    const LZ4F_decompressOptions_t* decompressOptionsPtr);
++
++
++/*! Custom memory allocation :
++ *  These prototypes make it possible to pass custom allocation/free functions.
++ *  LZ4F_customMem is provided at state creation time, using LZ4F_create*_advanced() listed below.
++ *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
++ */
++typedef void* (*LZ4F_AllocFunction) (void* opaqueState, size_t size);
++typedef void* (*LZ4F_CallocFunction) (void* opaqueState, size_t size);
++typedef void  (*LZ4F_FreeFunction) (void* opaqueState, void* address);
++typedef struct {
++    LZ4F_AllocFunction customAlloc;
++    LZ4F_CallocFunction customCalloc; /* optional; when not defined, uses customAlloc + memset */
++    LZ4F_FreeFunction customFree;
++    void* opaqueState;
++} LZ4F_CustomMem;
++static
++#ifdef __GNUC__
++__attribute__((__unused__))
++#endif
++LZ4F_CustomMem const LZ4F_defaultCMem = { NULL, NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
++
++LZ4FLIB_STATIC_API LZ4F_cctx* LZ4F_createCompressionContext_advanced(LZ4F_CustomMem customMem, unsigned version);
++LZ4FLIB_STATIC_API LZ4F_dctx* LZ4F_createDecompressionContext_advanced(LZ4F_CustomMem customMem, unsigned version);
++LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict_advanced(LZ4F_CustomMem customMem, const void* dictBuffer, size_t dictSize);
++
+ 
+ #if defined (__cplusplus)
+ }
+ #endif
+ 
+ #endif  /* defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) */
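To make the new allocator hooks concrete, here is a minimal sketch of wiring them up; the counting wrapper and its globals are invented for illustration, and only the LZ4F_CustomMem layout and the *_advanced constructors shown above are taken from the header:

    #include <stdlib.h>

    /* Track how many bytes LZ4F has requested, via the opaqueState pointer. */
    static void* counting_alloc(void* opaque, size_t size)
    {
        *(size_t*)opaque += size;
        return malloc(size);
    }
    static void counting_free(void* opaque, void* address)
    {
        (void)opaque;            /* a real tracker would subtract the block size here */
        free(address);
    }

    static size_t total_requested = 0;
    static const LZ4F_CustomMem counting_mem = {
        counting_alloc,
        NULL,                    /* customCalloc == NULL: library falls back to alloc + memset */
        counting_free,
        &total_requested
    };

    /* Usage:
     *   LZ4F_cctx* cctx = LZ4F_createCompressionContext_advanced(counting_mem, LZ4F_VERSION);
     *   LZ4F_dctx* dctx = LZ4F_createDecompressionContext_advanced(counting_mem, LZ4F_VERSION);
     */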
+diff --git a/mfbt/lz4/lz4frame_static.h b/mfbt/lz4/lz4frame_static.h
+--- a/mfbt/lz4/lz4frame_static.h
++++ b/mfbt/lz4/lz4frame_static.h
+@@ -1,12 +1,12 @@
+ /*
+    LZ4 auto-framing library
+    Header File for static linking only
+-   Copyright (C) 2011-2016, Yann Collet.
++   Copyright (C) 2011-2020, Yann Collet.
+ 
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ 
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+ 
+        * Redistributions of source code must retain the above copyright
+diff --git a/mfbt/lz4/lz4hc.c b/mfbt/lz4/lz4hc.c
+--- a/mfbt/lz4/lz4hc.c
++++ b/mfbt/lz4/lz4hc.c
+@@ -1,11 +1,11 @@
+ /*
+     LZ4 HC - High Compression Mode of LZ4
+-    Copyright (C) 2011-2017, Yann Collet.
++    Copyright (C) 2011-2020, Yann Collet.
+ 
+     BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ 
+     Redistribution and use in source and binary forms, with or without
+     modification, are permitted provided that the following conditions are
+     met:
+ 
+     * Redistributions of source code must retain the above copyright
+@@ -37,17 +37,17 @@
+ /* *************************************
+ *  Tuning Parameter
+ ***************************************/
+ 
+ /*! HEAPMODE :
+  *  Select how default compression function will allocate workplace memory,
+  *  in stack (0:fastest), or in heap (1:requires malloc()).
+  *  Since workplace is rather large, heap mode is recommended.
+- */
++**/
+ #ifndef LZ4HC_HEAPMODE
+ #  define LZ4HC_HEAPMODE 1
+ #endif
+ 
+ 
+ /*===    Dependency    ===*/
+ #define LZ4_HC_STATIC_LINKING_ONLY
+ #include "lz4hc.h"
+@@ -94,42 +94,47 @@ static U32 LZ4HC_hashPtr(const void* ptr
+ static void LZ4HC_clearTables (LZ4HC_CCtx_internal* hc4)
+ {
+     MEM_INIT(hc4->hashTable, 0, sizeof(hc4->hashTable));
+     MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable));
+ }
+ 
+ static void LZ4HC_init_internal (LZ4HC_CCtx_internal* hc4, const BYTE* start)
+ {
+-    uptrval startingOffset = (uptrval)(hc4->end - hc4->base);
+-    if (startingOffset > 1 GB) {
++    size_t const bufferSize = (size_t)(hc4->end - hc4->prefixStart);
++    size_t newStartingOffset = bufferSize + hc4->dictLimit;
++    assert(newStartingOffset >= bufferSize);  /* check overflow */
++    if (newStartingOffset > 1 GB) {
+         LZ4HC_clearTables(hc4);
+-        startingOffset = 0;
++        newStartingOffset = 0;
+     }
+-    startingOffset += 64 KB;
+-    hc4->nextToUpdate = (U32) startingOffset;
+-    hc4->base = start - startingOffset;
++    newStartingOffset += 64 KB;
++    hc4->nextToUpdate = (U32)newStartingOffset;
++    hc4->prefixStart = start;
+     hc4->end = start;
+-    hc4->dictBase = start - startingOffset;
+-    hc4->dictLimit = (U32) startingOffset;
+-    hc4->lowLimit = (U32) startingOffset;
++    hc4->dictStart = start;
++    hc4->dictLimit = (U32)newStartingOffset;
++    hc4->lowLimit = (U32)newStartingOffset;
+ }
+ 
+ 
+ /* Update chains up to ip (excluded) */
+ LZ4_FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip)
+ {
+     U16* const chainTable = hc4->chainTable;
+     U32* const hashTable  = hc4->hashTable;
+-    const BYTE* const base = hc4->base;
+-    U32 const target = (U32)(ip - base);
++    const BYTE* const prefixPtr = hc4->prefixStart;
++    U32 const prefixIdx = hc4->dictLimit;
++    U32 const target = (U32)(ip - prefixPtr) + prefixIdx;
+     U32 idx = hc4->nextToUpdate;
++    assert(ip >= prefixPtr);
++    assert(target >= prefixIdx);
+ 
+     while (idx < target) {
+-        U32 const h = LZ4HC_hashPtr(base+idx);
++        U32 const h = LZ4HC_hashPtr(prefixPtr+idx-prefixIdx);
+         size_t delta = idx - hashTable[h];
+         if (delta>LZ4_DISTANCE_MAX) delta = LZ4_DISTANCE_MAX;
+         DELTANEXTU16(chainTable, idx) = (U16)delta;
+         hashTable[h] = idx;
+         idx++;
+     }
+ 
+     hc4->nextToUpdate = target;
+@@ -188,35 +193,34 @@ LZ4HC_countPattern(const BYTE* ip, const
+             ip++; patternByte >>= 8;
+         }
+     } else {  /* big endian */
+         U32 bitOffset = (sizeof(pattern)*8) - 8;
+         while (ip < iEnd) {
+             BYTE const byte = (BYTE)(pattern >> bitOffset);
+             if (*ip != byte) break;
+             ip ++; bitOffset -= 8;
+-        }
+-    }
++    }   }
+ 
+     return (unsigned)(ip - iStart);
+ }
+ 
+ /* LZ4HC_reverseCountPattern() :
+  * pattern must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!)
+- * read using natural platform endianess */
++ * read using natural platform endianness */
+ static unsigned
+ LZ4HC_reverseCountPattern(const BYTE* ip, const BYTE* const iLow, U32 pattern)
+ {
+     const BYTE* const iStart = ip;
+ 
+     while (likely(ip >= iLow+4)) {
+         if (LZ4_read32(ip-4) != pattern) break;
+         ip -= 4;
+     }
+-    {   const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianess */
++    {   const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianness */
+         while (likely(ip>iLow)) {
+             if (ip[-1] != *bytePtr) break;
+             ip--; bytePtr--;
+     }   }
+     return (unsigned)(iStart - ip);
+ }
+ 
+ /* LZ4HC_protectDictEnd() :
+@@ -229,38 +233,38 @@ static int LZ4HC_protectDictEnd(U32 cons
+     return ((U32)((dictLimit - 1) - matchIndex) >= 3);
+ }
+ 
+ typedef enum { rep_untested, rep_not, rep_confirmed } repeat_state_e;
+ typedef enum { favorCompressionRatio=0, favorDecompressionSpeed } HCfavor_e;
+ 
+ LZ4_FORCE_INLINE int
+ LZ4HC_InsertAndGetWiderMatch (
+-    LZ4HC_CCtx_internal* hc4,
+-    const BYTE* const ip,
+-    const BYTE* const iLowLimit,
+-    const BYTE* const iHighLimit,
+-    int longest,
+-    const BYTE** matchpos,
+-    const BYTE** startpos,
+-    const int maxNbAttempts,
+-    const int patternAnalysis,
+-    const int chainSwap,
+-    const dictCtx_directive dict,
+-    const HCfavor_e favorDecSpeed)
++        LZ4HC_CCtx_internal* const hc4,
++        const BYTE* const ip,
++        const BYTE* const iLowLimit, const BYTE* const iHighLimit,
++        int longest,
++        const BYTE** matchpos,
++        const BYTE** startpos,
++        const int maxNbAttempts,
++        const int patternAnalysis, const int chainSwap,
++        const dictCtx_directive dict,
++        const HCfavor_e favorDecSpeed)
+ {
+     U16* const chainTable = hc4->chainTable;
+     U32* const HashTable = hc4->hashTable;
+     const LZ4HC_CCtx_internal * const dictCtx = hc4->dictCtx;
+-    const BYTE* const base = hc4->base;
+-    const U32 dictLimit = hc4->dictLimit;
+-    const BYTE* const lowPrefixPtr = base + dictLimit;
+-    const U32 ipIndex = (U32)(ip - base);
+-    const U32 lowestMatchIndex = (hc4->lowLimit + (LZ4_DISTANCE_MAX + 1) > ipIndex) ? hc4->lowLimit : ipIndex - LZ4_DISTANCE_MAX;
+-    const BYTE* const dictBase = hc4->dictBase;
++    const BYTE* const prefixPtr = hc4->prefixStart;
++    const U32 prefixIdx = hc4->dictLimit;
++    const U32 ipIndex = (U32)(ip - prefixPtr) + prefixIdx;
++    const int withinStartDistance = (hc4->lowLimit + (LZ4_DISTANCE_MAX + 1) > ipIndex);
++    const U32 lowestMatchIndex = (withinStartDistance) ? hc4->lowLimit : ipIndex - LZ4_DISTANCE_MAX;
++    const BYTE* const dictStart = hc4->dictStart;
++    const U32 dictIdx = hc4->lowLimit;
++    const BYTE* const dictEnd = dictStart + prefixIdx - dictIdx;
+     int const lookBackLength = (int)(ip-iLowLimit);
+     int nbAttempts = maxNbAttempts;
+     U32 matchChainPos = 0;
+     U32 const pattern = LZ4_read32(ip);
+     U32 matchIndex;
+     repeat_state_e repeat = rep_untested;
+     size_t srcPatternLength = 0;
+ 
+@@ -272,67 +276,66 @@ LZ4HC_InsertAndGetWiderMatch (
+                 matchIndex, lowestMatchIndex);
+ 
+     while ((matchIndex>=lowestMatchIndex) && (nbAttempts>0)) {
+         int matchLength=0;
+         nbAttempts--;
+         assert(matchIndex < ipIndex);
+         if (favorDecSpeed && (ipIndex - matchIndex < 8)) {
+             /* do nothing */
+-        } else if (matchIndex >= dictLimit) {   /* within current Prefix */
+-            const BYTE* const matchPtr = base + matchIndex;
+-            assert(matchPtr >= lowPrefixPtr);
++        } else if (matchIndex >= prefixIdx) {   /* within current Prefix */
++            const BYTE* const matchPtr = prefixPtr + matchIndex - prefixIdx;
+             assert(matchPtr < ip);
+             assert(longest >= 1);
+             if (LZ4_read16(iLowLimit + longest - 1) == LZ4_read16(matchPtr - lookBackLength + longest - 1)) {
+                 if (LZ4_read32(matchPtr) == pattern) {
+-                    int const back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, lowPrefixPtr) : 0;
++                    int const back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, prefixPtr) : 0;
+                     matchLength = MINMATCH + (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit);
+                     matchLength -= back;
+                     if (matchLength > longest) {
+                         longest = matchLength;
+                         *matchpos = matchPtr + back;
+                         *startpos = ip + back;
+             }   }   }
+         } else {   /* lowestMatchIndex <= matchIndex < dictLimit */
+-            const BYTE* const matchPtr = dictBase + matchIndex;
+-            if (LZ4_read32(matchPtr) == pattern) {
+-                const BYTE* const dictStart = dictBase + hc4->lowLimit;
++            const BYTE* const matchPtr = dictStart + (matchIndex - dictIdx);
++            assert(matchIndex >= dictIdx);
++            if ( likely(matchIndex <= prefixIdx - 4)
++              && (LZ4_read32(matchPtr) == pattern) ) {
+                 int back = 0;
+-                const BYTE* vLimit = ip + (dictLimit - matchIndex);
++                const BYTE* vLimit = ip + (prefixIdx - matchIndex);
+                 if (vLimit > iHighLimit) vLimit = iHighLimit;
+                 matchLength = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH;
+                 if ((ip+matchLength == vLimit) && (vLimit < iHighLimit))
+-                    matchLength += LZ4_count(ip+matchLength, lowPrefixPtr, iHighLimit);
++                    matchLength += LZ4_count(ip+matchLength, prefixPtr, iHighLimit);
+                 back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictStart) : 0;
+                 matchLength -= back;
+                 if (matchLength > longest) {
+                     longest = matchLength;
+-                    *matchpos = base + matchIndex + back;   /* virtual pos, relative to ip, to retrieve offset */
++                    *matchpos = prefixPtr - prefixIdx + matchIndex + back;   /* virtual pos, relative to ip, to retrieve offset */
+                     *startpos = ip + back;
+         }   }   }
+ 
+-        if (chainSwap && matchLength==longest) {    /* better match => select a better chain */
++        if (chainSwap && matchLength==longest) {   /* better match => select a better chain */
+             assert(lookBackLength==0);   /* search forward only */
+             if (matchIndex + (U32)longest <= ipIndex) {
+                 int const kTrigger = 4;
+                 U32 distanceToNextMatch = 1;
+                 int const end = longest - MINMATCH + 1;
+                 int step = 1;
+                 int accel = 1 << kTrigger;
+                 int pos;
+                 for (pos = 0; pos < end; pos += step) {
+                     U32 const candidateDist = DELTANEXTU16(chainTable, matchIndex + (U32)pos);
+                     step = (accel++ >> kTrigger);
+                     if (candidateDist > distanceToNextMatch) {
+                         distanceToNextMatch = candidateDist;
+                         matchChainPos = (U32)pos;
+                         accel = 1 << kTrigger;
+-                    }
+-                }
++                }   }
+                 if (distanceToNextMatch > 1) {
+                     if (distanceToNextMatch > matchIndex) break;   /* avoid overflow */
+                     matchIndex -= distanceToNextMatch;
+                     continue;
+         }   }   }
+ 
+         {   U32 const distNextMatch = DELTANEXTU16(chainTable, matchIndex);
+             if (patternAnalysis && distNextMatch==1 && matchChainPos==0) {
+@@ -342,64 +345,65 @@ LZ4HC_InsertAndGetWiderMatch (
+                     if ( ((pattern & 0xFFFF) == (pattern >> 16))
+                       &  ((pattern & 0xFF)   == (pattern >> 24)) ) {
+                         repeat = rep_confirmed;
+                         srcPatternLength = LZ4HC_countPattern(ip+sizeof(pattern), iHighLimit, pattern) + sizeof(pattern);
+                     } else {
+                         repeat = rep_not;
+                 }   }
+                 if ( (repeat == rep_confirmed) && (matchCandidateIdx >= lowestMatchIndex)
+-                  && LZ4HC_protectDictEnd(dictLimit, matchCandidateIdx) ) {
+-                    const int extDict = matchCandidateIdx < dictLimit;
+-                    const BYTE* const matchPtr = (extDict ? dictBase : base) + matchCandidateIdx;
++                  && LZ4HC_protectDictEnd(prefixIdx, matchCandidateIdx) ) {
++                    const int extDict = matchCandidateIdx < prefixIdx;
++                    const BYTE* const matchPtr = (extDict ? dictStart - dictIdx : prefixPtr - prefixIdx) + matchCandidateIdx;
+                     if (LZ4_read32(matchPtr) == pattern) {  /* good candidate */
+-                        const BYTE* const dictStart = dictBase + hc4->lowLimit;
+-                        const BYTE* const iLimit = extDict ? dictBase + dictLimit : iHighLimit;
++                        const BYTE* const iLimit = extDict ? dictEnd : iHighLimit;
+                         size_t forwardPatternLength = LZ4HC_countPattern(matchPtr+sizeof(pattern), iLimit, pattern) + sizeof(pattern);
+                         if (extDict && matchPtr + forwardPatternLength == iLimit) {
+                             U32 const rotatedPattern = LZ4HC_rotatePattern(forwardPatternLength, pattern);
+-                            forwardPatternLength += LZ4HC_countPattern(lowPrefixPtr, iHighLimit, rotatedPattern);
++                            forwardPatternLength += LZ4HC_countPattern(prefixPtr, iHighLimit, rotatedPattern);
+                         }
+-                        {   const BYTE* const lowestMatchPtr = extDict ? dictStart : lowPrefixPtr;
++                        {   const BYTE* const lowestMatchPtr = extDict ? dictStart : prefixPtr;
+                             size_t backLength = LZ4HC_reverseCountPattern(matchPtr, lowestMatchPtr, pattern);
+                             size_t currentSegmentLength;
+-                            if (!extDict && matchPtr - backLength == lowPrefixPtr && hc4->lowLimit < dictLimit) {
++                            if (!extDict
++                              && matchPtr - backLength == prefixPtr
++                              && dictIdx < prefixIdx) {
+                                 U32 const rotatedPattern = LZ4HC_rotatePattern((U32)(-(int)backLength), pattern);
+-                                backLength += LZ4HC_reverseCountPattern(dictBase + dictLimit, dictStart, rotatedPattern);
++                                backLength += LZ4HC_reverseCountPattern(dictEnd, dictStart, rotatedPattern);
+                             }
+                             /* Limit backLength not go further than lowestMatchIndex */
+                             backLength = matchCandidateIdx - MAX(matchCandidateIdx - (U32)backLength, lowestMatchIndex);
+                             assert(matchCandidateIdx - backLength >= lowestMatchIndex);
+                             currentSegmentLength = backLength + forwardPatternLength;
+                             /* Adjust to end of pattern if the source pattern fits, otherwise the beginning of the pattern */
+                             if ( (currentSegmentLength >= srcPatternLength)   /* current pattern segment large enough to contain full srcPatternLength */
+                               && (forwardPatternLength <= srcPatternLength) ) { /* haven't reached this position yet */
+                                 U32 const newMatchIndex = matchCandidateIdx + (U32)forwardPatternLength - (U32)srcPatternLength;  /* best position, full pattern, might be followed by more match */
+-                                if (LZ4HC_protectDictEnd(dictLimit, newMatchIndex))
++                                if (LZ4HC_protectDictEnd(prefixIdx, newMatchIndex))
+                                     matchIndex = newMatchIndex;
+                                 else {
+                                     /* Can only happen if started in the prefix */
+-                                    assert(newMatchIndex >= dictLimit - 3 && newMatchIndex < dictLimit && !extDict);
+-                                    matchIndex = dictLimit;
++                                    assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict);
++                                    matchIndex = prefixIdx;
+                                 }
+                             } else {
+                                 U32 const newMatchIndex = matchCandidateIdx - (U32)backLength;   /* farthest position in current segment, will find a match of length currentSegmentLength + maybe some back */
+-                                if (!LZ4HC_protectDictEnd(dictLimit, newMatchIndex)) {
+-                                    assert(newMatchIndex >= dictLimit - 3 && newMatchIndex < dictLimit && !extDict);
+-                                    matchIndex = dictLimit;
++                                if (!LZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) {
++                                    assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict);
++                                    matchIndex = prefixIdx;
+                                 } else {
+                                     matchIndex = newMatchIndex;
+                                     if (lookBackLength==0) {  /* no back possible */
+                                         size_t const maxML = MIN(currentSegmentLength, srcPatternLength);
+                                         if ((size_t)longest < maxML) {
+-                                            assert(base + matchIndex != ip);
+-                                            if ((size_t)(ip - base) - matchIndex > LZ4_DISTANCE_MAX) break;
++                                            assert(prefixPtr - prefixIdx + matchIndex != ip);
++                                            if ((size_t)(ip - prefixPtr) + prefixIdx - matchIndex > LZ4_DISTANCE_MAX) break;
+                                             assert(maxML < 2 GB);
+                                             longest = (int)maxML;
+-                                            *matchpos = base + matchIndex;   /* virtual pos, relative to ip, to retrieve offset */
++                                            *matchpos = prefixPtr - prefixIdx + matchIndex;   /* virtual pos, relative to ip, to retrieve offset */
+                                             *startpos = ip;
+                                         }
+                                         {   U32 const distToNextPattern = DELTANEXTU16(chainTable, matchIndex);
+                                             if (distToNextPattern > matchIndex) break;  /* avoid overflow */
+                                             matchIndex -= distToNextPattern;
+                         }   }   }   }   }
+                         continue;
+                 }   }
+@@ -408,52 +412,52 @@ LZ4HC_InsertAndGetWiderMatch (
+         /* follow current chain */
+         matchIndex -= DELTANEXTU16(chainTable, matchIndex + matchChainPos);
+ 
+     }  /* while ((matchIndex>=lowestMatchIndex) && (nbAttempts)) */
+ 
+     if ( dict == usingDictCtxHc
+       && nbAttempts > 0
+       && ipIndex - lowestMatchIndex < LZ4_DISTANCE_MAX) {
+-        size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->base);
++        size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->prefixStart) + dictCtx->dictLimit;
+         U32 dictMatchIndex = dictCtx->hashTable[LZ4HC_hashPtr(ip)];
+         assert(dictEndOffset <= 1 GB);
+         matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset;
+         while (ipIndex - matchIndex <= LZ4_DISTANCE_MAX && nbAttempts--) {
+-            const BYTE* const matchPtr = dictCtx->base + dictMatchIndex;
++            const BYTE* const matchPtr = dictCtx->prefixStart - dictCtx->dictLimit + dictMatchIndex;
+ 
+             if (LZ4_read32(matchPtr) == pattern) {
+                 int mlt;
+                 int back = 0;
+                 const BYTE* vLimit = ip + (dictEndOffset - dictMatchIndex);
+                 if (vLimit > iHighLimit) vLimit = iHighLimit;
+                 mlt = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH;
+-                back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->base + dictCtx->dictLimit) : 0;
++                back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->prefixStart) : 0;
+                 mlt -= back;
+                 if (mlt > longest) {
+                     longest = mlt;
+-                    *matchpos = base + matchIndex + back;
++                    *matchpos = prefixPtr - prefixIdx + matchIndex + back;
+                     *startpos = ip + back;
+             }   }
+ 
+             {   U32 const nextOffset = DELTANEXTU16(dictCtx->chainTable, dictMatchIndex);
+                 dictMatchIndex -= nextOffset;
+                 matchIndex -= nextOffset;
+     }   }   }
+ 
+     return longest;
+ }
+ 
+-LZ4_FORCE_INLINE
+-int LZ4HC_InsertAndFindBestMatch(LZ4HC_CCtx_internal* const hc4,   /* Index table will be updated */
+-                                 const BYTE* const ip, const BYTE* const iLimit,
+-                                 const BYTE** matchpos,
+-                                 const int maxNbAttempts,
+-                                 const int patternAnalysis,
+-                                 const dictCtx_directive dict)
++LZ4_FORCE_INLINE int
++LZ4HC_InsertAndFindBestMatch(LZ4HC_CCtx_internal* const hc4,   /* Index table will be updated */
++                       const BYTE* const ip, const BYTE* const iLimit,
++                       const BYTE** matchpos,
++                       const int maxNbAttempts,
++                       const int patternAnalysis,
++                       const dictCtx_directive dict)
+ {
+     const BYTE* uselessPtr = ip;
+     /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos),
+      * but this won't be the case here, as we define iLowLimit==ip,
+      * so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */
+     return LZ4HC_InsertAndGetWiderMatch(hc4, ip, ip, iLimit, MINMATCH-1, matchpos, &uselessPtr, maxNbAttempts, patternAnalysis, 0 /*chainSwap*/, dict, favorCompressionRatio);
+ }
+ 
+@@ -746,17 +750,17 @@ LZ4_FORCE_INLINE int LZ4HC_compress_hash
+         if (lastRunSize >= RUN_MASK) {
+             size_t accumulator = lastRunSize - RUN_MASK;
+             *op++ = (RUN_MASK << ML_BITS);
+             for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255;
+             *op++ = (BYTE) accumulator;
+         } else {
+             *op++ = (BYTE)(lastRunSize << ML_BITS);
+         }
+-        memcpy(op, anchor, lastRunSize);
++        LZ4_memcpy(op, anchor, lastRunSize);
+         op += lastRunSize;
+     }
+ 
+     /* End */
+     *srcSizePtr = (int) (((const char*)ip) - source);
+     return (int) (((char*)op)-dest);
+ 
+ _dest_overflow:
+@@ -879,23 +883,23 @@ LZ4HC_compress_generic_dictCtx (
+         const char* const src,
+         char* const dst,
+         int* const srcSizePtr,
+         int const dstCapacity,
+         int cLevel,
+         limitedOutput_directive limit
+         )
+ {
+-    const size_t position = (size_t)(ctx->end - ctx->base) - ctx->lowLimit;
++    const size_t position = (size_t)(ctx->end - ctx->prefixStart) + (ctx->dictLimit - ctx->lowLimit);
+     assert(ctx->dictCtx != NULL);
+     if (position >= 64 KB) {
+         ctx->dictCtx = NULL;
+         return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit);
+     } else if (position == 0 && *srcSizePtr > 4 KB) {
+-        memcpy(ctx, ctx->dictCtx, sizeof(LZ4HC_CCtx_internal));
++        LZ4_memcpy(ctx, ctx->dictCtx, sizeof(LZ4HC_CCtx_internal));
+         LZ4HC_setExternalDict(ctx, (const BYTE *)src);
+         ctx->compressionLevel = (short)cLevel;
+         return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit);
+     } else {
+         return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, usingDictCtxHc);
+     }
+ }
+ 
+@@ -948,23 +952,25 @@ int LZ4_compress_HC_extStateHC (void* st
+ {
+     LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx));
+     if (ctx==NULL) return 0;   /* init failure */
+     return LZ4_compress_HC_extStateHC_fastReset(state, src, dst, srcSize, dstCapacity, compressionLevel);
+ }
+ 
+ int LZ4_compress_HC(const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel)
+ {
++    int cSize;
+ #if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
+     LZ4_streamHC_t* const statePtr = (LZ4_streamHC_t*)ALLOC(sizeof(LZ4_streamHC_t));
++    if (statePtr==NULL) return 0;
+ #else
+     LZ4_streamHC_t state;
+     LZ4_streamHC_t* const statePtr = &state;
+ #endif
+-    int const cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel);
++    cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel);
+ #if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
+     FREEMEM(statePtr);
+ #endif
+     return cSize;
+ }
+ 
+ /* state is presumed sized correctly (>= sizeof(LZ4_streamHC_t)) */
+ int LZ4_compress_HC_destSize(void* state, const char* source, char* dest, int* sourceSizePtr, int targetDestSize, int cLevel)
+@@ -977,39 +983,39 @@ int LZ4_compress_HC_destSize(void* state
+ }
+ 
+ 
+ 
+ /**************************************
+ *  Streaming Functions
+ **************************************/
+ /* allocation */
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ LZ4_streamHC_t* LZ4_createStreamHC(void)
+ {
+     LZ4_streamHC_t* const state =
+         (LZ4_streamHC_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamHC_t));
+     if (state == NULL) return NULL;
+     LZ4_setCompressionLevel(state, LZ4HC_CLEVEL_DEFAULT);
+     return state;
+ }
+ 
+ int LZ4_freeStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr)
+ {
+     DEBUGLOG(4, "LZ4_freeStreamHC(%p)", LZ4_streamHCPtr);
+     if (!LZ4_streamHCPtr) return 0;  /* support free on NULL */
+     FREEMEM(LZ4_streamHCPtr);
+     return 0;
+ }
++#endif
+ 
+ 
+ LZ4_streamHC_t* LZ4_initStreamHC (void* buffer, size_t size)
+ {
+     LZ4_streamHC_t* const LZ4_streamHCPtr = (LZ4_streamHC_t*)buffer;
+-    /* if compilation fails here, LZ4_STREAMHCSIZE must be increased */
+-    LZ4_STATIC_ASSERT(sizeof(LZ4HC_CCtx_internal) <= LZ4_STREAMHCSIZE);
+     DEBUGLOG(4, "LZ4_initStreamHC(%p, %u)", buffer, (unsigned)size);
+     /* check conditions */
+     if (buffer == NULL) return NULL;
+     if (size < sizeof(LZ4_streamHC_t)) return NULL;
+     if (!LZ4_isAligned(buffer, LZ4_streamHC_t_alignment())) return NULL;
+     /* init */
+     { LZ4HC_CCtx_internal* const hcstate = &(LZ4_streamHCPtr->internal_donotuse);
+       MEM_INIT(hcstate, 0, sizeof(*hcstate)); }
+@@ -1025,23 +1031,23 @@ void LZ4_resetStreamHC (LZ4_streamHC_t* 
+ }
+ 
+ void LZ4_resetStreamHC_fast (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel)
+ {
+     DEBUGLOG(4, "LZ4_resetStreamHC_fast(%p, %d)", LZ4_streamHCPtr, compressionLevel);
+     if (LZ4_streamHCPtr->internal_donotuse.dirty) {
+         LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr));
+     } else {
+-        /* preserve end - base : can trigger clearTable's threshold */
++        /* preserve end - prefixStart : can trigger clearTable's threshold */
+         if (LZ4_streamHCPtr->internal_donotuse.end != NULL) {
+-            LZ4_streamHCPtr->internal_donotuse.end -= (uptrval)LZ4_streamHCPtr->internal_donotuse.base;
++            LZ4_streamHCPtr->internal_donotuse.end -= (uptrval)LZ4_streamHCPtr->internal_donotuse.prefixStart;
+         } else {
+-            assert(LZ4_streamHCPtr->internal_donotuse.base == NULL);
++            assert(LZ4_streamHCPtr->internal_donotuse.prefixStart == NULL);
+         }
+-        LZ4_streamHCPtr->internal_donotuse.base = NULL;
++        LZ4_streamHCPtr->internal_donotuse.prefixStart = NULL;
+         LZ4_streamHCPtr->internal_donotuse.dictCtx = NULL;
+     }
+     LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel);
+ }
+ 
+ void LZ4_setCompressionLevel(LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel)
+ {
+     DEBUGLOG(5, "LZ4_setCompressionLevel(%p, %d)", LZ4_streamHCPtr, compressionLevel);
+@@ -1082,24 +1088,24 @@ void LZ4_attach_HC_dictionary(LZ4_stream
+     working_stream->internal_donotuse.dictCtx = dictionary_stream != NULL ? &(dictionary_stream->internal_donotuse) : NULL;
+ }
+ 
+ /* compression */
+ 
+ static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock)
+ {
+     DEBUGLOG(4, "LZ4HC_setExternalDict(%p, %p)", ctxPtr, newBlock);
+-    if (ctxPtr->end >= ctxPtr->base + ctxPtr->dictLimit + 4)
++    if (ctxPtr->end >= ctxPtr->prefixStart + 4)
+         LZ4HC_Insert (ctxPtr, ctxPtr->end-3);   /* Referencing remaining dictionary content */
+ 
+     /* Only one memory segment for extDict, so any previous extDict is lost at this stage */
+     ctxPtr->lowLimit  = ctxPtr->dictLimit;
+-    ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base);
+-    ctxPtr->dictBase  = ctxPtr->base;
+-    ctxPtr->base = newBlock - ctxPtr->dictLimit;
++    ctxPtr->dictStart  = ctxPtr->prefixStart;
++    ctxPtr->dictLimit += (U32)(ctxPtr->end - ctxPtr->prefixStart);
++    ctxPtr->prefixStart = newBlock;
+     ctxPtr->end  = newBlock;
+     ctxPtr->nextToUpdate = ctxPtr->dictLimit;   /* match referencing will resume from there */
+ 
+     /* cannot reference an extDict and a dictCtx at the same time */
+     ctxPtr->dictCtx = NULL;
+ }
+ 
+ static int
+@@ -1108,38 +1114,41 @@ LZ4_compressHC_continue_generic (LZ4_str
+                                  int* srcSizePtr, int dstCapacity,
+                                  limitedOutput_directive limit)
+ {
+     LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse;
+     DEBUGLOG(5, "LZ4_compressHC_continue_generic(ctx=%p, src=%p, srcSize=%d, limit=%d)",
+                 LZ4_streamHCPtr, src, *srcSizePtr, limit);
+     assert(ctxPtr != NULL);
+     /* auto-init if forgotten */
+-    if (ctxPtr->base == NULL) LZ4HC_init_internal (ctxPtr, (const BYTE*) src);
++    if (ctxPtr->prefixStart == NULL) LZ4HC_init_internal (ctxPtr, (const BYTE*) src);
+ 
+     /* Check overflow */
+-    if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) {
+-        size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) - ctxPtr->dictLimit;
++    if ((size_t)(ctxPtr->end - ctxPtr->prefixStart) + ctxPtr->dictLimit > 2 GB) {
++        size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->prefixStart);
+         if (dictSize > 64 KB) dictSize = 64 KB;
+         LZ4_loadDictHC(LZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize);
+     }
+ 
+     /* Check if blocks follow each other */
+     if ((const BYTE*)src != ctxPtr->end)
+         LZ4HC_setExternalDict(ctxPtr, (const BYTE*)src);
+ 
+     /* Check overlapping input/dictionary space */
+     {   const BYTE* sourceEnd = (const BYTE*) src + *srcSizePtr;
+-        const BYTE* const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit;
+-        const BYTE* const dictEnd   = ctxPtr->dictBase + ctxPtr->dictLimit;
++        const BYTE* const dictBegin = ctxPtr->dictStart;
++        const BYTE* const dictEnd   = ctxPtr->dictStart + (ctxPtr->dictLimit - ctxPtr->lowLimit);
+         if ((sourceEnd > dictBegin) && ((const BYTE*)src < dictEnd)) {
+             if (sourceEnd > dictEnd) sourceEnd = dictEnd;
+-            ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase);
+-            if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) ctxPtr->lowLimit = ctxPtr->dictLimit;
+-    }   }
++            ctxPtr->lowLimit += (U32)(sourceEnd - ctxPtr->dictStart);
++            ctxPtr->dictStart += (U32)(sourceEnd - ctxPtr->dictStart);
++            if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) {
++                ctxPtr->lowLimit = ctxPtr->dictLimit;
++                ctxPtr->dictStart = ctxPtr->prefixStart;
++    }   }   }
+ 
+     return LZ4HC_compress_generic (ctxPtr, src, dst, srcSizePtr, dstCapacity, ctxPtr->compressionLevel, limit);
+ }
+ 
+ int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int srcSize, int dstCapacity)
+ {
+     if (dstCapacity < LZ4_compressBound(srcSize))
+         return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, limitedOutput);
+@@ -1157,30 +1166,31 @@ int LZ4_compress_HC_continue_destSize (L
+ /* LZ4_saveDictHC :
+  * save history content
+  * into a user-provided buffer
+  * which is then used to continue compression
+  */
+ int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize)
+ {
+     LZ4HC_CCtx_internal* const streamPtr = &LZ4_streamHCPtr->internal_donotuse;
+-    int const prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit));
++    int const prefixSize = (int)(streamPtr->end - streamPtr->prefixStart);
+     DEBUGLOG(5, "LZ4_saveDictHC(%p, %p, %d)", LZ4_streamHCPtr, safeBuffer, dictSize);
+     assert(prefixSize >= 0);
+     if (dictSize > 64 KB) dictSize = 64 KB;
+     if (dictSize < 4) dictSize = 0;
+     if (dictSize > prefixSize) dictSize = prefixSize;
+     if (safeBuffer == NULL) assert(dictSize == 0);
+     if (dictSize > 0)
+-        memmove(safeBuffer, streamPtr->end - dictSize, dictSize);
+-    {   U32 const endIndex = (U32)(streamPtr->end - streamPtr->base);
++        LZ4_memmove(safeBuffer, streamPtr->end - dictSize, dictSize);
++    {   U32 const endIndex = (U32)(streamPtr->end - streamPtr->prefixStart) + streamPtr->dictLimit;
+         streamPtr->end = (const BYTE*)safeBuffer + dictSize;
+-        streamPtr->base = streamPtr->end - endIndex;
++        streamPtr->prefixStart = streamPtr->end - dictSize;
+         streamPtr->dictLimit = endIndex - (U32)dictSize;
+         streamPtr->lowLimit = endIndex - (U32)dictSize;
++        streamPtr->dictStart = streamPtr->prefixStart;
+         if (streamPtr->nextToUpdate < streamPtr->dictLimit)
+             streamPtr->nextToUpdate = streamPtr->dictLimit;
+     }
+     return dictSize;
+ }
+ 
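LZ4_saveDictHC() is easiest to understand next to the streaming loop it serves; a rough double-buffered HC compression sketch follows (illustrative only; get_next_block/emit are hypothetical helpers and the block and output sizes are arbitrary):

    #include "lz4hc.h"

    /* Hypothetical helpers supplied by the caller. */
    extern int  get_next_block(char* buf, int capacity);   /* returns bytes read, 0 at EOF */
    extern void emit(const char* buf, int size);

    static void stream_compress_hc(void)
    {
        LZ4_streamHC_t* const hc = LZ4_createStreamHC();
        char inBuf[64 * 1024];
        char dict[64 * 1024];                               /* history carried between blocks */
        char outBuf[LZ4_COMPRESSBOUND(64 * 1024)];
        if (hc == NULL) return;

        for (;;) {
            int const inSize = get_next_block(inBuf, (int)sizeof(inBuf));
            if (inSize == 0) break;
            int const cSize = LZ4_compress_HC_continue(hc, inBuf, outBuf, inSize, (int)sizeof(outBuf));
            if (cSize <= 0) break;                          /* compression error */
            emit(outBuf, cSize);
            /* Preserve up to 64 KB of history so the next block can reference it,
             * even though inBuf is about to be overwritten. */
            LZ4_saveDictHC(hc, dict, (int)sizeof(dict));
        }
        LZ4_freeStreamHC(hc);
    }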
+ 
+ /***************************************************
+@@ -1198,60 +1208,62 @@ int LZ4_compressHC_withStateHC (void* st
+ int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, maxDstSize, 0); }
+ int LZ4_compressHC2_withStateHC (void* state, const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); }
+ int LZ4_compressHC2_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, maxDstSize, cLevel); }
+ int LZ4_compressHC_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, LZ4_compressBound(srcSize)); }
+ int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, maxDstSize); }
+ 
+ 
+ /* Deprecated streaming functions */
+-int LZ4_sizeofStreamStateHC(void) { return LZ4_STREAMHCSIZE; }
++int LZ4_sizeofStreamStateHC(void) { return sizeof(LZ4_streamHC_t); }
+ 
+ /* state is presumed correctly sized, aka >= sizeof(LZ4_streamHC_t)
+  * @return : 0 on success, !=0 if error */
+ int LZ4_resetStreamStateHC(void* state, char* inputBuffer)
+ {
+     LZ4_streamHC_t* const hc4 = LZ4_initStreamHC(state, sizeof(*hc4));
+     if (hc4 == NULL) return 1;   /* init failed */
+     LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer);
+     return 0;
+ }
+ 
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ void* LZ4_createHC (const char* inputBuffer)
+ {
+     LZ4_streamHC_t* const hc4 = LZ4_createStreamHC();
+     if (hc4 == NULL) return NULL;   /* not enough memory */
+     LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer);
+     return hc4;
+ }
+ 
+ int LZ4_freeHC (void* LZ4HC_Data)
+ {
+     if (!LZ4HC_Data) return 0;  /* support free on NULL */
+     FREEMEM(LZ4HC_Data);
+     return 0;
+ }
++#endif
+ 
+ int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int cLevel)
+ {
+     return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, 0, cLevel, notLimited);
+ }
+ 
+ int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int dstCapacity, int cLevel)
+ {
+     return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, dstCapacity, cLevel, limitedOutput);
+ }
+ 
+ char* LZ4_slideInputBufferHC(void* LZ4HC_Data)
+ {
+-    LZ4_streamHC_t *ctx = (LZ4_streamHC_t*)LZ4HC_Data;
+-    const BYTE *bufferStart = ctx->internal_donotuse.base + ctx->internal_donotuse.lowLimit;
++    LZ4_streamHC_t* const ctx = (LZ4_streamHC_t*)LZ4HC_Data;
++    const BYTE* bufferStart = ctx->internal_donotuse.prefixStart - ctx->internal_donotuse.dictLimit + ctx->internal_donotuse.lowLimit;
+     LZ4_resetStreamHC_fast(ctx, ctx->internal_donotuse.compressionLevel);
+     /* avoid const char * -> char * conversion warning :( */
+-    return (char *)(uptrval)bufferStart;
++    return (char*)(uptrval)bufferStart;
+ }
+ 
+ 
+ /* ================================================
+  *  LZ4 Optimal parser (levels [LZ4HC_CLEVEL_OPT_MIN - LZ4HC_CLEVEL_MAX])
+  * ===============================================*/
+ typedef struct {
+     int price;
+@@ -1324,17 +1336,17 @@ static int LZ4HC_compress_optimal ( LZ4H
+                                     size_t sufficient_len,
+                                     const limitedOutput_directive limit,
+                                     int const fullUpdate,
+                                     const dictCtx_directive dict,
+                                     const HCfavor_e favorDecSpeed)
+ {
+     int retval = 0;
+ #define TRAILING_LITERALS 3
+-#ifdef LZ4HC_HEAPMODE
++#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
+     LZ4HC_optimal_t* const opt = (LZ4HC_optimal_t*)ALLOC(sizeof(LZ4HC_optimal_t) * (LZ4_OPT_NUM + TRAILING_LITERALS));
+ #else
+     LZ4HC_optimal_t opt[LZ4_OPT_NUM + TRAILING_LITERALS];   /* ~64 KB, which is a bit large for stack... */
+ #endif
+ 
+     const BYTE* ip = (const BYTE*) source;
+     const BYTE* anchor = ip;
+     const BYTE* const iend = ip + *srcSizePtr;
+@@ -1342,17 +1354,17 @@ static int LZ4HC_compress_optimal ( LZ4H
+     const BYTE* const matchlimit = iend - LASTLITERALS;
+     BYTE* op = (BYTE*) dst;
+     BYTE* opSaved = (BYTE*) dst;
+     BYTE* oend = op + dstCapacity;
+     int ovml = MINMATCH;  /* overflow - last sequence */
+     const BYTE* ovref = NULL;
+ 
+     /* init */
+-#ifdef LZ4HC_HEAPMODE
++#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
+     if (opt == NULL) goto _return_label;
+ #endif
+     DEBUGLOG(5, "LZ4HC_compress_optimal(dst=%p, dstCapa=%u)", dst, (unsigned)dstCapacity);
+     *srcSizePtr = 0;
+     if (limit == fillOutput) oend -= LASTLITERALS;   /* Hack for support LZ4 format restriction */
+     if (sufficient_len >= LZ4_OPT_NUM) sufficient_len = LZ4_OPT_NUM-1;
+ 
+     /* Main Loop */
+@@ -1574,17 +1586,17 @@ encode: /* cur, last_match_pos, best_mle
+          if (lastRunSize >= RUN_MASK) {
+              size_t accumulator = lastRunSize - RUN_MASK;
+              *op++ = (RUN_MASK << ML_BITS);
+              for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255;
+              *op++ = (BYTE) accumulator;
+          } else {
+              *op++ = (BYTE)(lastRunSize << ML_BITS);
+          }
+-         memcpy(op, anchor, lastRunSize);
++         LZ4_memcpy(op, anchor, lastRunSize);
+          op += lastRunSize;
+      }
+ 
+      /* End */
+      *srcSizePtr = (int) (((const char*)ip) - source);
+      retval = (int) ((char*)op-dst);
+      goto _return_label;
+ 
+@@ -1607,13 +1619,13 @@ if (limit == fillOutput) {
+              DEBUGLOG(6, "Space to end : %i + ml (%i)", (int)((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1), ovml);
+              DEBUGLOG(6, "Before : ip = %p, anchor = %p", ip, anchor);
+              LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ovml, ovref, notLimited, oend);
+              DEBUGLOG(6, "After : ip = %p, anchor = %p", ip, anchor);
+      }   }
+      goto _last_literals;
+ }
+ _return_label:
+-#ifdef LZ4HC_HEAPMODE
++#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
+      FREEMEM(opt);
+ #endif
+      return retval;
+ }
+diff --git a/mfbt/lz4/lz4hc.h b/mfbt/lz4/lz4hc.h
+--- a/mfbt/lz4/lz4hc.h
++++ b/mfbt/lz4/lz4hc.h
+@@ -1,12 +1,12 @@
+ /*
+    LZ4 HC - High Compression Mode of LZ4
+    Header File
+-   Copyright (C) 2011-2017, Yann Collet.
++   Copyright (C) 2011-2020, Yann Collet.
+    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ 
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+ 
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+@@ -193,63 +193,61 @@ LZ4LIB_API int LZ4_saveDictHC (LZ4_strea
+ #define LZ4HC_MAXD (1<<LZ4HC_DICTIONARY_LOGSIZE)
+ #define LZ4HC_MAXD_MASK (LZ4HC_MAXD - 1)
+ 
+ #define LZ4HC_HASH_LOG 15
+ #define LZ4HC_HASHTABLESIZE (1 << LZ4HC_HASH_LOG)
+ #define LZ4HC_HASH_MASK (LZ4HC_HASHTABLESIZE - 1)
+ 
+ 
++/* Never ever use these definitions directly !
++ * Declare or allocate an LZ4_streamHC_t instead.
++**/
+ typedef struct LZ4HC_CCtx_internal LZ4HC_CCtx_internal;
+ struct LZ4HC_CCtx_internal
+ {
+     LZ4_u32   hashTable[LZ4HC_HASHTABLESIZE];
+     LZ4_u16   chainTable[LZ4HC_MAXD];
+     const LZ4_byte* end;       /* next block here to continue on current prefix */
+-    const LZ4_byte* base;      /* All index relative to this position */
+-    const LZ4_byte* dictBase;  /* alternate base for extDict */
++    const LZ4_byte* prefixStart;  /* Indexes relative to this position */
++    const LZ4_byte* dictStart; /* alternate reference for extDict */
+     LZ4_u32   dictLimit;       /* below that point, need extDict */
+     LZ4_u32   lowLimit;        /* below that point, no more dict */
+     LZ4_u32   nextToUpdate;    /* index from which to continue dictionary update */
+     short     compressionLevel;
+     LZ4_i8    favorDecSpeed;   /* favor decompression speed if this flag set,
+                                   otherwise, favor compression ratio */
+     LZ4_i8    dirty;           /* stream has to be fully reset if this flag is set */
+     const LZ4HC_CCtx_internal* dictCtx;
+ };
+ 
+-
+-/* Do not use these definitions directly !
+- * Declare or allocate an LZ4_streamHC_t instead.
+- */
+-#define LZ4_STREAMHCSIZE       262200  /* static size, for inter-version compatibility */
+-#define LZ4_STREAMHCSIZE_VOIDP (LZ4_STREAMHCSIZE / sizeof(void*))
++#define LZ4_STREAMHC_MINSIZE  262200  /* static size, for inter-version compatibility */
+ union LZ4_streamHC_u {
+-    void* table[LZ4_STREAMHCSIZE_VOIDP];
++    char minStateSize[LZ4_STREAMHC_MINSIZE];
+     LZ4HC_CCtx_internal internal_donotuse;
+ }; /* previously typedef'd to LZ4_streamHC_t */
+ 
+ /* LZ4_streamHC_t :
+  * This structure allows static allocation of LZ4 HC streaming state.
+- * This can be used to allocate statically, on state, or as part of a larger structure.
++ * This can be used to allocate statically on stack, or as part of a larger structure.
+  *
+  * Such state **must** be initialized using LZ4_initStreamHC() before first use.
+  *
+  * Note that invoking LZ4_initStreamHC() is not required when
+  * the state was created using LZ4_createStreamHC() (which is recommended).
+  * Using the normal builder, a newly created state is automatically initialized.
+  *
+  * Static allocation shall only be used in combination with static linking.
+  */
+ 
+ /* LZ4_initStreamHC() : v1.9.0+
+  * Required before first use of a statically allocated LZ4_streamHC_t.
+  * Before v1.9.0 : use LZ4_resetStreamHC() instead
+  */
+-LZ4LIB_API LZ4_streamHC_t* LZ4_initStreamHC (void* buffer, size_t size);
++LZ4LIB_API LZ4_streamHC_t* LZ4_initStreamHC(void* buffer, size_t size);
+ 
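In code terms, the static-allocation path described above reduces to the following sketch (the one-shot helper name is invented; LZ4_compress_HC_extStateHC() is the existing entry point for caller-provided state):

    /* One-shot HC compression using caller-owned state: no heap allocation.
     * A file-scope static keeps the ~256 KB state off the stack; not thread-safe as written. */
    static int compress_hc_static(const char* src, char* dst, int srcSize, int dstCapacity, int level)
    {
        static LZ4_streamHC_t state;
        if (LZ4_initStreamHC(&state, sizeof(state)) == NULL)
            return 0;                                       /* misaligned or too small */
        return LZ4_compress_HC_extStateHC(&state, src, dst, srcSize, dstCapacity, level);
    }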
+ 
+ /*-************************************
+ *  Deprecated Functions
+ **************************************/
+ /* see lz4.h LZ4_DISABLE_DEPRECATE_WARNINGS to turn off deprecation warnings */
+ 
+ /* deprecated compression functions */
+@@ -267,19 +265,21 @@ LZ4_DEPRECATED("use LZ4_compress_HC_cont
+ /* Obsolete streaming functions; degraded functionality; do not use!
+  *
+  * In order to perform streaming compression, these functions depended on data
+  * that is no longer tracked in the state. They have been preserved as well as
+  * possible: using them will still produce a correct output. However, use of
+  * LZ4_slideInputBufferHC() will truncate the history of the stream, rather
+  * than preserve a window-sized chunk of history.
+  */
++#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+ LZ4_DEPRECATED("use LZ4_createStreamHC() instead") LZ4LIB_API void* LZ4_createHC (const char* inputBuffer);
++LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") LZ4LIB_API   int   LZ4_freeHC (void* LZ4HC_Data);
++#endif
+ LZ4_DEPRECATED("use LZ4_saveDictHC() instead") LZ4LIB_API     char* LZ4_slideInputBufferHC (void* LZ4HC_Data);
+-LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") LZ4LIB_API   int   LZ4_freeHC (void* LZ4HC_Data);
+ LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC2_continue               (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel);
+ LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
+ LZ4_DEPRECATED("use LZ4_createStreamHC() instead") LZ4LIB_API int   LZ4_sizeofStreamStateHC(void);
+ LZ4_DEPRECATED("use LZ4_initStreamHC() instead") LZ4LIB_API  int   LZ4_resetStreamStateHC(void* state, char* inputBuffer);
+ 
+ 
+ /* LZ4_resetStreamHC() is now replaced by LZ4_initStreamHC().
+  * The intention is to emphasize the difference with LZ4_resetStreamHC_fast(),
+@@ -300,17 +300,17 @@ LZ4LIB_API void LZ4_resetStreamHC (LZ4_s
+ 
+ 
+ /*-**************************************************
+  * !!!!!     STATIC LINKING ONLY     !!!!!
+  * Following definitions are considered experimental.
+  * They should not be linked from DLL,
+  * as there is no guarantee of API stability yet.
+  * Prototypes will be promoted to "stable" status
+- * after successfull usage in real-life scenarios.
++ * after successful usage in real-life scenarios.
+  ***************************************************/
+ #ifdef LZ4_HC_STATIC_LINKING_ONLY   /* protection macro */
+ #ifndef LZ4_HC_SLO_098092834
+ #define LZ4_HC_SLO_098092834
+ 
+ #define LZ4_STATIC_LINKING_ONLY   /* LZ4LIB_STATIC_API */
+ #include "lz4.h"
+ 
+diff --git a/mfbt/moz.build b/mfbt/moz.build
+--- a/mfbt/moz.build
++++ b/mfbt/moz.build
+@@ -146,16 +146,17 @@ if CONFIG['MOZ_BUILD_APP'] != 'tools/cra
+     # tools/crashreporter.
+     TEST_DIRS += ['tests']
+ 
+ DEFINES['IMPL_MFBT'] = True
+ 
+ SOURCES += [
+     'decimal/Decimal.cpp',
+     'lz4/lz4.c',
++    "lz4/lz4file.c",
+     'lz4/lz4frame.c',
+     'lz4/lz4hc.c',
+     'lz4/xxhash.c',
+ ]
+ 
+ if CONFIG['CC_TYPE'] != 'msvc':
+     SOURCES["lz4/xxhash.c"].flags += ["-Wno-unused-function"]
+ 

+ 6059 - 0
mozilla-release/patches/1845018-117a1.patch

@@ -0,0 +1,6059 @@
+# HG changeset patch
+# User Ryan VanderMeulen <ryanvm@gmail.com>
+# Date 1690388471 0
+#      Wed Jul 26 16:21:11 2023 +0000
+# Node ID 5e47300c17df4b438fd098e3e6cc37d4397b6e5d
+# Parent  d9fe2d7384cbe9f161a14cfad4f6e9f21ec57938
+Bug 1845018 - Update xxHash to version 0.8.2. r=dthayer
+
+Differential Revision: https://phabricator.services.mozilla.com/D184325
+
+diff --git a/mfbt/lz4/xxhash.c b/mfbt/lz4/xxhash.c
+--- a/mfbt/lz4/xxhash.c
++++ b/mfbt/lz4/xxhash.c
+@@ -1,11 +1,11 @@
+ /*
+  * xxHash - Extremely Fast Hash algorithm
+- * Copyright (C) 2012-2020 Yann Collet
++ * Copyright (C) 2012-2021 Yann Collet
+  *
+  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are
+  * met:
+  *
+  *    * Redistributions of source code must retain the above copyright
+diff --git a/mfbt/lz4/xxhash.h b/mfbt/lz4/xxhash.h
+--- a/mfbt/lz4/xxhash.h
++++ b/mfbt/lz4/xxhash.h
+@@ -1,12 +1,12 @@
+ /*
+  * xxHash - Extremely Fast Hash algorithm
+  * Header File
+- * Copyright (C) 2012-2020 Yann Collet
++ * Copyright (C) 2012-2021 Yann Collet
+  *
+  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are
+  * met:
+  *
+  *    * Redistributions of source code must retain the above copyright
+@@ -27,83 +27,235 @@
+  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *
+  * You can contact the author at:
+  *   - xxHash homepage: https://www.xxhash.com
+  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+  */
++
+ /*!
+  * @mainpage xxHash
+  *
++ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
++ * limits.
++ *
++ * It is proposed in four flavors, in three families:
++ * 1. @ref XXH32_family
++ *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
++ *     32-bit and 64-bit systems.
++ * 2. @ref XXH64_family
++ *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
++ *     64-bit systems (but _not_ 32-bit systems).
++ * 3. @ref XXH3_family
++ *   - Modern 64-bit and 128-bit hash function family which features improved
++ *     strength and performance across the board, especially on smaller data.
++ *     It benefits greatly from SIMD and 64-bit without requiring it.
++ *
++ * Benchmarks
++ * ---
++ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
++ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
++ *
++ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
++ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
++ * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
++ * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
++ * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
++ * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
++ * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
++ * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
++ * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
++ * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
++ * | City64               |         |    64 |        22.0 GB/s |                76.6 |
++ * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
++ * | City128              |         |   128 |        21.7 GB/s |                57.7 |
++ * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
++ * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
++ * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
++ * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
++ * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
++ * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
++ * | City32               |         |    32 |         9.1 GB/s |                66.0 |
++ * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
++ * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
++ * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
++ * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
++ * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
++ * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
++ * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
++ * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
++ * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
++ * @note
++ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
++ *     even though it is mandatory on x64.
++ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
++ *     by modern standards.
++ *   - Small data velocity is a rough average of algorithm's efficiency for small
++ *     data. For more accurate information, see the wiki.
++ *   - More benchmarks and strength tests are found on the wiki:
++ *         https://github.com/Cyan4973/xxHash/wiki
++ *
++ * Usage
++ * ------
++ * All xxHash variants use a similar API. Changing the algorithm is a trivial
++ * substitution.
++ *
++ * @pre
++ *    For functions which take an input and length parameter, the following
++ *    requirements are assumed:
++ *    - The range from [`input`, `input + length`) is valid, readable memory.
++ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
++ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
++ *      functions access bytes directly as if it was an array of `unsigned char`.
++ *
++ * @anchor single_shot_example
++ * **Single Shot**
++ *
++ * These functions are stateless functions which hash a contiguous block of memory,
++ * immediately returning the result. They are the easiest and usually the fastest
++ * option.
++ *
++ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
++ *
++ * @code{.c}
++ *   #include <string.h>
++ *   #include "xxhash.h"
++ *
++ *   // Example for a function which hashes a null terminated string with XXH32().
++ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
++ *   {
++ *       // NULL pointers are only valid if the length is zero
++ *       size_t length = (string == NULL) ? 0 : strlen(string);
++ *       return XXH32(string, length, seed);
++ *   }
++ * @endcode
++ *
++ * @anchor streaming_example
++ * **Streaming**
++ *
++ * These groups of functions allow incremental hashing of unknown size, even
++ * more than what would fit in a size_t.
++ *
++ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
++ *
++ * @code{.c}
++ *   #include <stdio.h>
++ *   #include <assert.h>
++ *   #include "xxhash.h"
++ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
++ *   XXH64_hash_t hashFile(FILE* f)
++ *   {
++ *       // Allocate a state struct. Do not just use malloc() or new.
++ *       XXH3_state_t* state = XXH3_createState();
++ *       assert(state != NULL && "Out of memory!");
++ *       // Reset the state to start a new hashing session.
++ *       XXH3_64bits_reset(state);
++ *       char buffer[4096];
++ *       size_t count;
++ *       // Read the file in chunks
++ *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
++ *           // Run update() as many times as necessary to process the data
++ *           XXH3_64bits_update(state, buffer, count);
++ *       }
++ *       // Retrieve the finalized hash. This will not change the state.
++ *       XXH64_hash_t result = XXH3_64bits_digest(state);
++ *       // Free the state. Do not use free().
++ *       XXH3_freeState(state);
++ *       return result;
++ *   }
++ * @endcode
++ *
+  * @file xxhash.h
+  * xxHash prototypes and implementation
+  */
+-/* TODO: update */
+-/* Notice extracted from xxHash homepage:
+-
+-xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+-It also successfully passes all tests from the SMHasher suite.
+-
+-Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+-
+-Name            Speed       Q.Score   Author
+-xxHash          5.4 GB/s     10
+-CrapWow         3.2 GB/s      2       Andrew
+-MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+-SpookyHash      2.0 GB/s     10       Bob Jenkins
+-SBox            1.4 GB/s      9       Bret Mulvey
+-Lookup3         1.2 GB/s      9       Bob Jenkins
+-SuperFastHash   1.2 GB/s      1       Paul Hsieh
+-CityHash64      1.05 GB/s    10       Pike & Alakuijala
+-FNV             0.55 GB/s     5       Fowler, Noll, Vo
+-CRC32           0.43 GB/s     9
+-MD5-32          0.33 GB/s    10       Ronald L. Rivest
+-SHA1-32         0.28 GB/s    10
+-
+-Q.Score is a measure of quality of the hash function.
+-It depends on successfully passing SMHasher test set.
+-10 is a perfect score.
+-
+-Note: SMHasher's CRC32 implementation is not the fastest one.
+-Other speed-oriented implementations can be faster,
+-especially in combination with PCLMUL instruction:
+-https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+-
+-A 64-bit version, named XXH64, is available since r35.
+-It offers much better speed, but for 64-bit applications only.
+-Name     Speed on 64 bits    Speed on 32 bits
+-XXH64       13.8 GB/s            1.9 GB/s
+-XXH32        6.8 GB/s            6.0 GB/s
+-*/
+ 
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+ 
+ /* ****************************
+  *  INLINE mode
+  ******************************/
+ /*!
+- * XXH_INLINE_ALL (and XXH_PRIVATE_API)
++ * @defgroup public Public API
++ * Contains details on the public xxHash functions.
++ * @{
++ */
++#ifdef XXH_DOXYGEN
++/*!
++ * @brief Gives access to internal state declaration, required for static allocation.
++ *
++ * Incompatible with dynamic linking, due to risks of ABI changes.
++ *
++ * Usage:
++ * @code{.c}
++ *     #define XXH_STATIC_LINKING_ONLY
++ *     #include "xxhash.h"
++ * @endcode
++ */
++#  define XXH_STATIC_LINKING_ONLY
++/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */
++
++/*!
++ * @brief Gives access to internal definitions.
++ *
++ * Usage:
++ * @code{.c}
++ *     #define XXH_STATIC_LINKING_ONLY
++ *     #define XXH_IMPLEMENTATION
++ *     #include "xxhash.h"
++ * @endcode
++ */
++#  define XXH_IMPLEMENTATION
++/* Do not undef XXH_IMPLEMENTATION for Doxygen */
++
++/*!
++ * @brief Exposes the implementation and marks all functions as `inline`.
++ *
+  * Use these build macros to inline xxhash into the target unit.
+  * Inlining improves performance on small inputs, especially when the length is
+  * expressed as a compile-time constant:
+  *
+- *      https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
++ *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+  *
+  * It also keeps xxHash symbols private to the unit, so they are not exported.
+  *
+  * Usage:
++ * @code{.c}
+  *     #define XXH_INLINE_ALL
+  *     #include "xxhash.h"
+- *
++ * @endcode
+  * Do not compile and link xxhash.o as a separate object, as it is not useful.
+  */
++#  define XXH_INLINE_ALL
++#  undef XXH_INLINE_ALL
++/*!
++ * @brief Exposes the implementation without marking functions as inline.
++ */
++#  define XXH_PRIVATE_API
++#  undef XXH_PRIVATE_API
++/*!
++ * @brief Emulate a namespace by transparently prefixing all symbols.
++ *
++ * If you want to include _and expose_ xxHash functions from within your own
++ * library, but also want to avoid symbol collisions with other libraries which
++ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
++ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
++ * (therefore, avoid empty or numeric values).
++ *
++ * Note that no change is required within the calling program as long as it
++ * includes `xxhash.h`: Regular symbol names will be automatically translated
++ * by this header.
++ */
++#  define XXH_NAMESPACE /* YOUR NAME HERE */
++#  undef XXH_NAMESPACE
++#endif
++
+ #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+     && !defined(XXH_INLINE_ALL_31684351384)
+    /* this section should be traversed only once */
+ #  define XXH_INLINE_ALL_31684351384
+    /* give access to the advanced API, required to compile implementations */
+ #  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+ #  define XXH_STATIC_LINKING_ONLY
+    /* make all functions private */
+@@ -208,61 +360,35 @@ extern "C" {
+ #  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+ #  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+ #  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+    /* Ensure the header is parsed again, even if it was previously included */
+ #  undef XXHASH_H_5627135585666179
+ #  undef XXHASH_H_STATIC_13879238742
+ #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+ 
+-
+-
+ /* ****************************************************************
+  *  Stable API
+  *****************************************************************/
+ #ifndef XXHASH_H_5627135585666179
+ #define XXHASH_H_5627135585666179 1
+ 
+-
+-/*!
+- * @defgroup public Public API
+- * Contains details on the public xxHash functions.
+- * @{
+- */
+-/* specific declaration modes for Windows */
++/*! @brief Marks a global symbol. */
+ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+ #  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+ #    ifdef XXH_EXPORT
+ #      define XXH_PUBLIC_API __declspec(dllexport)
+ #    elif XXH_IMPORT
+ #      define XXH_PUBLIC_API __declspec(dllimport)
+ #    endif
+ #  else
+ #    define XXH_PUBLIC_API   /* do nothing */
+ #  endif
+ #endif
+ 
+-#ifdef XXH_DOXYGEN
+-/*!
+- * @brief Emulate a namespace by transparently prefixing all symbols.
+- *
+- * If you want to include _and expose_ xxHash functions from within your own
+- * library, but also want to avoid symbol collisions with other libraries which
+- * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
+- * any public symbol from xxhash library with the value of XXH_NAMESPACE
+- * (therefore, avoid empty or numeric values).
+- *
+- * Note that no change is required within the calling program as long as it
+- * includes `xxhash.h`: Regular symbol names will be automatically translated
+- * by this header.
+- */
+-#  define XXH_NAMESPACE /* YOUR NAME HERE */
+-#  undef XXH_NAMESPACE
+-#endif
+-
+ #ifdef XXH_NAMESPACE
+ #  define XXH_CAT(A,B) A##B
+ #  define XXH_NAME2(A,B) XXH_CAT(A,B)
+ #  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+ /* XXH32 */
+ #  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+ #  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+ #  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+@@ -313,39 +439,73 @@ extern "C" {
+ #  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+ #  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+ #  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+ #  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+ #endif
+ 
+ 
+ /* *************************************
++*  Compiler specifics
++***************************************/
++
++/* specific declaration modes for Windows */
++#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
++#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
++#    ifdef XXH_EXPORT
++#      define XXH_PUBLIC_API __declspec(dllexport)
++#    elif XXH_IMPORT
++#      define XXH_PUBLIC_API __declspec(dllimport)
++#    endif
++#  else
++#    define XXH_PUBLIC_API   /* do nothing */
++#  endif
++#endif
++
++#if defined (__GNUC__)
++# define XXH_CONSTF  __attribute__((const))
++# define XXH_PUREF   __attribute__((pure))
++# define XXH_MALLOCF __attribute__((malloc))
++#else
++# define XXH_CONSTF  /* disable */
++# define XXH_PUREF
++# define XXH_MALLOCF
++#endif
++
++/* *************************************
+ *  Version
+ ***************************************/
+ #define XXH_VERSION_MAJOR    0
+ #define XXH_VERSION_MINOR    8
+-#define XXH_VERSION_RELEASE  1
++#define XXH_VERSION_RELEASE  2
++/*! @brief Version number, encoded as two digits each */
+ #define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+ 
+ /*!
+  * @brief Obtains the xxHash version.
+  *
+  * This is mostly useful when xxHash is compiled as a shared library,
+  * since the returned value comes from the library, as opposed to header file.
+  *
+- * @return `XXH_VERSION_NUMBER` of the invoked library.
+- */
+-XXH_PUBLIC_API unsigned XXH_versionNumber (void);
++ * @return @ref XXH_VERSION_NUMBER of the invoked library.
++ */
++XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+ 
+ 
+ /* ****************************
+ *  Common basic types
+ ******************************/
+ #include <stddef.h>   /* size_t */
+-typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
++/*!
++ * @brief Exit code for the streaming API.
++ */
++typedef enum {
++    XXH_OK = 0, /*!< OK */
++    XXH_ERROR   /*!< Error */
++} XXH_errorcode;
+ 
+ 
+ /*-**********************************************************************
+ *  32-bit hash
+ ************************************************************************/
+ #if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+ /*!
+  * @brief An unsigned 32-bit integer.
+@@ -359,47 +519,47 @@ typedef uint32_t XXH32_hash_t;
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+ #   include <stdint.h>
+     typedef uint32_t XXH32_hash_t;
+ 
+ #else
+ #   include <limits.h>
+ #   if UINT_MAX == 0xFFFFFFFFUL
+       typedef unsigned int XXH32_hash_t;
++#   elif ULONG_MAX == 0xFFFFFFFFUL
++      typedef unsigned long XXH32_hash_t;
+ #   else
+-#     if ULONG_MAX == 0xFFFFFFFFUL
+-        typedef unsigned long XXH32_hash_t;
+-#     else
+-#       error "unsupported platform: need a 32-bit type"
+-#     endif
++#     error "unsupported platform: need a 32-bit type"
+ #   endif
+ #endif
+ 
+ /*!
+  * @}
+  *
+- * @defgroup xxh32_family XXH32 family
++ * @defgroup XXH32_family XXH32 family
+  * @ingroup public
+  * Contains functions used in the classic 32-bit xxHash algorithm.
+  *
+  * @note
+  *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+- *   Note that @ref xxh3_family provides competitive speed
+- *   for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
+- *
+- * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+- * @see @ref xxh32_impl for implementation details
++ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
++ *   and 64-bit systems, and offers true 64/128 bit hash results.
++ *
++ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
++ * @see @ref XXH32_impl for implementation details
+  * @{
+  */
+ 
+ /*!
+  * @brief Calculates the 32-bit hash of @p input using xxHash32.
+  *
+  * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+  *
++ * See @ref single_shot_example "Single Shot Example" for an example.
++ *
+  * @param input The block of data to be hashed, at least @p length bytes in size.
+  * @param length The length of @p input, in bytes.
+  * @param seed The 32-bit seed to alter the hash's output predictably.
+  *
+  * @pre
+  *   The memory between @p input and @p input + @p length must be valid,
+  *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+  *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+@@ -407,18 +567,19 @@ typedef uint32_t XXH32_hash_t;
+  * @return The calculated 32-bit hash value.
+  *
+  * @see
+  *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+  *    Direct equivalents for the other variants of xxHash.
+  * @see
+  *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+  */
+-XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+-
++XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
++
++#ifndef XXH_NO_STREAM
+ /*!
+  * Streaming functions generate the xxHash value from an incremental input.
+  * This method is slower than single-call functions, due to state management.
+  * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+  *
+  * An XXH state must first be allocated using `XXH*_createState()`.
+  *
+  * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+@@ -431,59 +592,34 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const
+  * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+  * This function returns the nn-bits hash as an int or long long.
+  *
+  * It's still possible to continue inserting input into the hash state after a
+  * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+  *
+  * When done, release the state using `XXH*_freeState()`.
+  *
+- * Example code for incrementally hashing a file:
+- * @code{.c}
+- *    #include <stdio.h>
+- *    #include <xxhash.h>
+- *    #define BUFFER_SIZE 256
+- *
+- *    // Note: XXH64 and XXH3 use the same interface.
+- *    XXH32_hash_t
+- *    hashFile(FILE* stream)
+- *    {
+- *        XXH32_state_t* state;
+- *        unsigned char buf[BUFFER_SIZE];
+- *        size_t amt;
+- *        XXH32_hash_t hash;
+- *
+- *        state = XXH32_createState();       // Create a state
+- *        assert(state != NULL);             // Error check here
+- *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
+- *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+- *            XXH32_update(state, buf, amt); // Hash the file in chunks
+- *        }
+- *        hash = XXH32_digest(state);        // Finalize the hash
+- *        XXH32_freeState(state);            // Clean up
+- *        return hash;
+- *    }
+- * @endcode
++ * @see streaming_example at the top of @ref xxhash.h for an example.
+  */
+ 
+ /*!
+  * @typedef struct XXH32_state_s XXH32_state_t
+  * @brief The opaque state struct for the XXH32 streaming API.
+  *
+  * @see XXH32_state_s for details.
+  */
+ typedef struct XXH32_state_s XXH32_state_t;
+ 
+ /*!
+  * @brief Allocates an @ref XXH32_state_t.
+  *
+  * Must be freed with XXH32_freeState().
+  * @return An allocated XXH32_state_t on success, `NULL` on failure.
+  */
+-XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
++XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+ /*!
+  * @brief Frees an @ref XXH32_state_t.
+  *
+  * Must be allocated with XXH32_createState().
+  * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+  * @return XXH_OK.
+  */
+ XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
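The streaming notes above outline the create, reset, update, digest, free workflow (the inline file-hashing example now lives at the top of the header). A minimal sketch of that workflow with XXH32 over an in-memory buffer, shown for illustration only and not part of the patch:

#include <assert.h>
#include "xxhash.h"

/* Hash a buffer in two XXH32_update() calls to exercise the streaming API. */
static XXH32_hash_t hash_in_two_parts(const unsigned char* buf, size_t len)
{
    XXH32_hash_t hash;
    XXH32_state_t* state = XXH32_createState();        /* allocate a state */
    assert(state != NULL);
    XXH32_reset(state, 0);                              /* seed of 0 */
    XXH32_update(state, buf, len / 2);                  /* first half */
    XXH32_update(state, buf + len / 2, len - len / 2);  /* remainder */
    hash = XXH32_digest(state);                         /* finalize */
    XXH32_freeState(state);
    return hash;
}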
+@@ -541,17 +677,18 @@ XXH_PUBLIC_API XXH_errorcode XXH32_updat
+  *
+  * @param statePtr The state struct to calculate the hash from.
+  *
+  * @pre
+  *  @p statePtr must not be `NULL`.
+  *
+  * @return The calculated xxHash32 value from that state.
+  */
+-XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
++XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
++#endif /* !XXH_NO_STREAM */
+ 
+ /*******   Canonical representation   *******/
+ 
+ /*
+  * The default return values from XXH functions are unsigned 32 and 64 bit
+  * integers.
+  * This the simplest and fastest format for further post-processing.
+  *
+@@ -592,53 +729,82 @@ XXH_PUBLIC_API void XXH32_canonicalFromH
+  *
+  * @param src The @ref XXH32_canonical_t to convert.
+  *
+  * @pre
+  *   @p src must not be `NULL`.
+  *
+  * @return The converted hash.
+  */
+-XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+-
+-
++XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
++
++
++/*! @cond Doxygen ignores this part */
+ #ifdef __has_attribute
+ # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+ #else
+ # define XXH_HAS_ATTRIBUTE(x) 0
+ #endif
+-
++/*! @endcond */
++
++/*! @cond Doxygen ignores this part */
++/*
++ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
++ * leave as `201711L` (C17 + 1).
++ * TODO: Update to correct value when its been specified.
++ */
++#define XXH_C23_VN 201711L
++/*! @endcond */
++
++/*! @cond Doxygen ignores this part */
+ /* C-language Attributes are added in C23. */
+-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
++#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
+ # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+ #else
+ # define XXH_HAS_C_ATTRIBUTE(x) 0
+ #endif
+-
++/*! @endcond */
++
++/*! @cond Doxygen ignores this part */
+ #if defined(__cplusplus) && defined(__has_cpp_attribute)
+ # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+ #else
+ # define XXH_HAS_CPP_ATTRIBUTE(x) 0
+ #endif
+-
++/*! @endcond */
++
++/*! @cond Doxygen ignores this part */
+ /*
+-Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+-introduced in CPP17 and C23.
+-CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+-C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+-*/
+-#if XXH_HAS_C_ATTRIBUTE(x)
+-# define XXH_FALLTHROUGH [[fallthrough]]
+-#elif XXH_HAS_CPP_ATTRIBUTE(x)
++ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
++ * introduced in CPP17 and C23.
++ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
++ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
++ */
++#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+ # define XXH_FALLTHROUGH [[fallthrough]]
+ #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+-# define XXH_FALLTHROUGH __attribute__ ((fallthrough))
++# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+ #else
+-# define XXH_FALLTHROUGH
++# define XXH_FALLTHROUGH /* fallthrough */
+ #endif
++/*! @endcond */
++
++/*! @cond Doxygen ignores this part */
++/*
++ * Define XXH_NOESCAPE for annotated pointers in public API.
++ * https://clang.llvm.org/docs/AttributeReference.html#noescape
++ * As of writing this, only supported by clang.
++ */
++#if XXH_HAS_ATTRIBUTE(noescape)
++# define XXH_NOESCAPE __attribute__((noescape))
++#else
++# define XXH_NOESCAPE
++#endif
++/*! @endcond */
++
+ 
+ /*!
+  * @}
+  * @ingroup public
+  * @{
+  */
+ 
+ #ifndef XXH_NO_LONG_LONG
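The canonical-representation helpers in this hunk convert hashes to and from a fixed big-endian byte layout for storage or transmission. A minimal round-trip sketch, assuming XXH32_canonicalFromHash() keeps its usual (dst, hash) signature from the upstream header (illustration only, not part of the patch):

#include "xxhash.h"

/* Native hash -> canonical big-endian bytes -> native hash again. */
static XXH32_hash_t canonical_roundtrip(XXH32_hash_t h)
{
    XXH32_canonical_t canon;
    XXH32_canonicalFromHash(&canon, h);      /* serialize, big endian */
    return XXH32_hashFromCanonical(&canon);  /* deserialize */
}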
+@@ -666,28 +832,27 @@ typedef uint64_t XXH64_hash_t;
+      /* the following type must have a width of 64-bit */
+      typedef unsigned long long XXH64_hash_t;
+ #  endif
+ #endif
+ 
+ /*!
+  * @}
+  *
+- * @defgroup xxh64_family XXH64 family
++ * @defgroup XXH64_family XXH64 family
+  * @ingroup public
+  * @{
+  * Contains functions used in the classic 64-bit xxHash algorithm.
+  *
+  * @note
+  *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+  *   and offers true 64/128 bit hash results.
+  *   It provides better speed for systems with vector processing capabilities.
+  */
+ 
+-
+ /*!
+  * @brief Calculates the 64-bit hash of @p input using xxHash64.
+  *
+  * This function usually runs faster on 64-bit systems, but slower on 32-bit
+  * systems (see benchmark).
+  *
+  * @param input The block of data to be hashed, at least @p length bytes in size.
+  * @param length The length of @p input, in bytes.
+@@ -701,42 +866,141 @@ typedef uint64_t XXH64_hash_t;
+  * @return The calculated 64-bit hash.
+  *
+  * @see
+  *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+  *    Direct equivalents for the other variants of xxHash.
+  * @see
+  *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
+  */
+-XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+ 
+ /*******   Streaming   *******/
++#ifndef XXH_NO_STREAM
+ /*!
+  * @brief The opaque state struct for the XXH64 streaming API.
+  *
+  * @see XXH64_state_s for details.
+  */
+ typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+-XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
++
++/*!
++ * @brief Allocates an @ref XXH64_state_t.
++ *
++ * Must be freed with XXH64_freeState().
++ * @return An allocated XXH64_state_t on success, `NULL` on failure.
++ */
++XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
++
++/*!
++ * @brief Frees an @ref XXH64_state_t.
++ *
++ * Must be allocated with XXH64_createState().
++ * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
++ * @return XXH_OK.
++ */
+ XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+-
+-XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
+-XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+-XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+-
++
++/*!
++ * @brief Copies one @ref XXH64_state_t to another.
++ *
++ * @param dst_state The state to copy to.
++ * @param src_state The state to copy from.
++ * @pre
++ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
++ */
++XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
++
++/*!
++ * @brief Resets an @ref XXH64_state_t to begin a new hash.
++ *
++ * This function resets and seeds a state. Call it before @ref XXH64_update().
++ *
++ * @param statePtr The state struct to reset.
++ * @param seed The 64-bit seed to alter the hash result predictably.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ */
++XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
++
++/*!
++ * @brief Consumes a block of @p input to an @ref XXH64_state_t.
++ *
++ * Call this to incrementally consume blocks of data.
++ *
++ * @param statePtr The state struct to update.
++ * @param input The block of data to be hashed, at least @p length bytes in size.
++ * @param length The length of @p input, in bytes.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ * @pre
++ *   The memory between @p input and @p input + @p length must be valid,
++ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
++ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ */
++XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
++
++/*!
++ * @brief Returns the calculated hash value from an @ref XXH64_state_t.
++ *
++ * @note
++ *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
++ *   digest, and update again.
++ *
++ * @param statePtr The state struct to calculate the hash from.
++ *
++ * @pre
++ *  @p statePtr must not be `NULL`.
++ *
++ * @return The calculated xxHash64 value from that state.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
++#endif /* !XXH_NO_STREAM */
+ /*******   Canonical representation   *******/
++
++/*!
++ * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
++ */
+ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
++
++/*!
++ * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
++ *
++ * @param dst The @ref XXH64_canonical_t pointer to be stored to.
++ * @param hash The @ref XXH64_hash_t to be converted.
++ *
++ * @pre
++ *   @p dst must not be `NULL`.
++ */
++XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
++
++/*!
++ * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
++ *
++ * @param src The @ref XXH64_canonical_t to convert.
++ *
++ * @pre
++ *   @p src must not be `NULL`.
++ *
++ * @return The converted hash.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
++
++#ifndef XXH_NO_XXH3
+ 
+ /*!
+  * @}
+  * ************************************************************************
+- * @defgroup xxh3_family XXH3 family
++ * @defgroup XXH3_family XXH3 family
+  * @ingroup public
+  * @{
+  *
+  * XXH3 is a more recent hash algorithm featuring:
+  *  - Improved speed for both small and large inputs
+  *  - True 64-bit and 128-bit outputs
+  *  - SIMD acceleration
+  *  - Improved 32-bit viability
+@@ -746,128 +1010,222 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFr
+  *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+  *
+  * Compared to XXH64, expect XXH3 to run approximately
+  * ~2x faster on large inputs and >3x faster on small ones,
+  * exact differences vary depending on platform.
+  *
+  * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+  * but does not require it.
+- * Any 32-bit and 64-bit targets that can run XXH32 smoothly
+- * can run XXH3 at competitive speeds, even without vector support.
+- * Further details are explained in the implementation.
+- *
+- * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+- * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
++ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
++ * at competitive speeds, even without vector support. Further details are
++ * explained in the implementation.
++ *
++ * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
++ * implementations for many common platforms:
++ *   - AVX512
++ *   - AVX2
++ *   - SSE2
++ *   - ARM NEON
++ *   - WebAssembly SIMD128
++ *   - POWER8 VSX
++ *   - s390x ZVector
++ * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
++ * selects the best version according to predefined macros. For the x86 family, an
++ * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
+  *
+  * XXH3 implementation is portable:
+  * it has a generic C90 formulation that can be compiled on any platform,
+- * all implementations generage exactly the same hash value on all platforms.
++ * all implementations generate exactly the same hash value on all platforms.
+  * Starting from v0.8.0, it's also labelled "stable", meaning that
+  * any future version will also generate the same hash value.
+  *
+  * XXH3 offers 2 variants, _64bits and _128bits.
+  *
+  * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+  * reduces the amount of mixing, resulting in faster speed on small inputs.
+  * It's also generally simpler to manipulate a scalar return type than a struct.
+  *
+  * The API supports one-shot hashing, streaming mode, and custom secrets.
+  */
+-
+ /*-**********************************************************************
+ *  XXH3 64-bit variant
+ ************************************************************************/
+ 
+-/* XXH3_64bits():
+- * default 64-bit variant, using default secret and default seed of 0.
+- * It's the fastest variant. */
+-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+-
+-/*
+- * XXH3_64bits_withSeed():
+- * This variant generates a custom secret on the fly
+- * based on default secret altered using the `seed` value.
++/*!
++ * @brief 64-bit unseeded variant of XXH3.
++ *
++ * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however
++ * it may have slightly better performance due to constant propagation of the
++ * defaults.
++ *
++ * @see
++ *    XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms
++ * @see
++ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
++ * @see
++ *    XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
++
++/*!
++ * @brief 64-bit seeded variant of XXH3
++ *
++ * This variant generates a custom secret on the fly based on default secret
++ * altered using the `seed` value.
++ *
+  * While this operation is decently fast, note that it's not completely free.
+- * Note: seed==0 produces the same results as XXH3_64bits().
+- */
+-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
++ *
++ * @note
++ *    seed == 0 produces the same results as @ref XXH3_64bits().
++ *
++ * @param input The data to hash
++ * @param length The length
++ * @param seed The 64-bit seed to alter the state.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+ 
+ /*!
+  * The bare minimum size for a custom secret.
+  *
+  * @see
+  *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+  *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+  */
+ #define XXH3_SECRET_SIZE_MIN 136
+ 
+-/*
+- * XXH3_64bits_withSecret():
++/*!
++ * @brief 64-bit variant of XXH3 with a custom "secret".
++ *
+  * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+  * This makes it more difficult for an external actor to prepare an intentional collision.
+  * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+  * However, the quality of the secret impacts the dispersion of the hash algorithm.
+  * Therefore, the secret _must_ look like a bunch of random bytes.
+  * Avoid "trivial" or structured data such as repeated sequences or a text document.
+  * Whenever in doubt about the "randomness" of the blob of bytes,
+  * consider employing "XXH3_generateSecret()" instead (see below).
+  * It will generate a proper high entropy secret derived from the blob of bytes.
+  * Another advantage of using XXH3_generateSecret() is that
+  * it guarantees that all bits within the initial blob of bytes
+  * will impact every bit of the output.
+  * This is not necessarily the case when using the blob of bytes directly
+  * because, when hashing _small_ inputs, only a portion of the secret is employed.
+  */
+-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+ 
+ 
+ /*******   Streaming   *******/
++#ifndef XXH_NO_STREAM
+ /*
+  * Streaming requires state maintenance.
+  * This operation costs memory and CPU.
+  * As a consequence, streaming is slower than one-shot hashing.
+  * For better performance, prefer one-shot functions whenever applicable.
+  */
+ 
+ /*!
+  * @brief The state struct for the XXH3 streaming API.
+  *
+  * @see XXH3_state_s for details.
+  */
+ typedef struct XXH3_state_s XXH3_state_t;
+-XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
++XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+-XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+-
+-/*
+- * XXH3_64bits_reset():
+- * Initialize with default parameters.
+- * digest will be equivalent to `XXH3_64bits()`.
+- */
+-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+-/*
+- * XXH3_64bits_reset_withSeed():
+- * Generate a custom secret from `seed`, and store it into `statePtr`.
+- * digest will be equivalent to `XXH3_64bits_withSeed()`.
+- */
+-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+-/*
++
++/*!
++ * @brief Copies one @ref XXH3_state_t to another.
++ *
++ * @param dst_state The state to copy to.
++ * @param src_state The state to copy from.
++ * @pre
++ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
++ */
++XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
++
++/*!
++ * @brief Resets an @ref XXH3_state_t to begin a new hash.
++ *
++ * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_64bits_update().
++ * Digest will be equivalent to `XXH3_64bits()`.
++ *
++ * @param statePtr The state struct to reset.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ *
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
++
++/*!
++ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
++ *
++ * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_64bits_update().
++ * Digest will be equivalent to `XXH3_64bits_withSeed()`.
++ *
++ * @param statePtr The state struct to reset.
++ * @param seed     The 64-bit seed to alter the state.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ *
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
++
++/*!
+  * XXH3_64bits_reset_withSecret():
+  * `secret` is referenced, it _must outlive_ the hash streaming session.
+  * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
+  * and the quality of produced hash values depends on secret's entropy
+  * (secret's content should look like a bunch of random bytes).
+  * When in doubt about the randomness of a candidate `secret`,
+  * consider employing `XXH3_generateSecret()` instead (see below).
+  */
+-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+-
+-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+-XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
++XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
++
++/*!
++ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
++ *
++ * Call this to incrementally consume blocks of data.
++ *
++ * @param statePtr The state struct to update.
++ * @param input The block of data to be hashed, at least @p length bytes in size.
++ * @param length The length of @p input, in bytes.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ * @pre
++ *   The memory between @p input and @p input + @p length must be valid,
++ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
++ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
++
++/*!
++ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
++ *
++ * @note
++ *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
++ *   digest, and update again.
++ *
++ * @param statePtr The state struct to calculate the hash from.
++ *
++ * @pre
++ *  @p statePtr must not be `NULL`.
++ *
++ * @return The calculated XXH3 64-bit hash value from that state.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
++#endif /* !XXH_NO_STREAM */
+ 
+ /* note : canonical representation of XXH3 is the same as XXH64
+  * since they both produce XXH64_hash_t values */
+ 
+ 
+ /*-**********************************************************************
+ *  XXH3 128-bit variant
+ ************************************************************************/
+@@ -878,68 +1236,175 @@ XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits
+  * Stored in little endian order, although the fields themselves are in native
+  * endianness.
+  */
+ typedef struct {
+     XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+     XXH64_hash_t high64;  /*!< `value >> 64` */
+ } XXH128_hash_t;
+ 
+-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
++/*!
++ * @brief Unseeded 128-bit variant of XXH3
++ *
++ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
++ * for shorter inputs.
++ *
++ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however
++ * it may have slightly better performance due to constant propagation of the
++ * defaults.
++ *
++ * @see
++ *    XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
++ * @see
++ *    XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
++ * @see
++ *    XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
++/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
++/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+ 
+ /*******   Streaming   *******/
++#ifndef XXH_NO_STREAM
+ /*
+  * Streaming requires state maintenance.
+  * This operation costs memory and CPU.
+  * As a consequence, streaming is slower than one-shot hashing.
+  * For better performance, prefer one-shot functions whenever applicable.
+  *
+  * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+  * Use already declared XXH3_createState() and XXH3_freeState().
+  *
+  * All reset and streaming functions have same meaning as their 64-bit counterpart.
+  */
+ 
+-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
+-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+-
+-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
++/*!
++ * @brief Resets an @ref XXH3_state_t to begin a new hash.
++ *
++ * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_128bits_update().
++ * Digest will be equivalent to `XXH3_128bits()`.
++ *
++ * @param statePtr The state struct to reset.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ *
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
++
++/*!
++ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
++ *
++ * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_128bits_update().
++ * Digest will be equivalent to `XXH3_128bits_withSeed()`.
++ *
++ * @param statePtr The state struct to reset.
++ * @param seed     The 64-bit seed to alter the state.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ *
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
++/*! @brief Custom secret 128-bit variant of XXH3. @see XXH_64bits_reset_withSecret(). */
++XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
++
++/*!
++ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
++ *
++ * Call this to incrementally consume blocks of data.
++ *
++ * @param statePtr The state struct to update.
++ * @param input The block of data to be hashed, at least @p length bytes in size.
++ * @param length The length of @p input, in bytes.
++ *
++ * @pre
++ *   @p statePtr must not be `NULL`.
++ * @pre
++ *   The memory between @p input and @p input + @p length must be valid,
++ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
++ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
++ *
++ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
++
++/*!
++ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
++ *
++ * @note
++ *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
++ *   digest, and update again.
++ *
++ * @param statePtr The state struct to calculate the hash from.
++ *
++ * @pre
++ *  @p statePtr must not be `NULL`.
++ *
++ * @return The calculated XXH3 128-bit hash value from that state.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
++#endif /* !XXH_NO_STREAM */
+ 
+ /* Following helper functions make it possible to compare XXH128_hast_t values.
+  * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+  * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+ 
+ /*!
+  * XXH128_isEqual():
+  * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+  */
+-XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
++XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+ 
+ /*!
+- * XXH128_cmp():
+- *
++ * @brief Compares two @ref XXH128_hash_t
+  * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+  *
+- * return: >0 if *h128_1  > *h128_2
+- *         =0 if *h128_1 == *h128_2
+- *         <0 if *h128_1  < *h128_2
+- */
+-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
++ * @return: >0 if *h128_1  > *h128_2
++ *          =0 if *h128_1 == *h128_2
++ *          <0 if *h128_1  < *h128_2
++ */
++XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+ 
+ 
+ /*******   Canonical representation   *******/
+ typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+-XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+-XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+-
+-
++
++
++/*!
++ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
++ *
++ * @param dst The @ref XXH128_canonical_t pointer to be stored to.
++ * @param hash The @ref XXH128_hash_t to be converted.
++ *
++ * @pre
++ *   @p dst must not be `NULL`.
++ */
++XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
++
++/*!
++ * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
++ *
++ * @param src The @ref XXH128_canonical_t to convert.
++ *
++ * @pre
++ *   @p src must not be `NULL`.
++ *
++ * @return The converted hash.
++ */
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
++
++
++#endif  /* !XXH_NO_XXH3 */
+ #endif  /* XXH_NO_LONG_LONG */
+ 
+ /*!
+  * @}
+  */
+ #endif /* XXHASH_H_5627135585666179 */
+ 
+ 
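XXH128_cmp() above is documented as a qsort()/bsearch()-compatible comparator, since XXH128_hash_t is a struct and cannot be compared with built-in operators. A minimal sorting sketch (illustration only, not part of the patch):

#include <stdlib.h>
#include "xxhash.h"

/* Sort an array of 128-bit hashes using the comparator declared above. */
static void sort_hashes(XXH128_hash_t* hashes, size_t count)
{
    qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
}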
+@@ -973,17 +1438,17 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hash
+  * @see XXH64_state_s, XXH3_state_s
+  */
+ struct XXH32_state_s {
+    XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+    XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+    XXH32_hash_t v[4];         /*!< Accumulator lanes */
+    XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+    XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+-   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it, it may be removed. */
++   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
+ };   /* typedef'd to XXH32_state_t */
+ 
+ 
+ #ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+ 
+ /*!
+  * @internal
+  * @brief Structure for XXH64 streaming API.
+@@ -997,19 +1462,21 @@ struct XXH32_state_s {
+  * @see XXH32_state_s, XXH3_state_s
+  */
+ struct XXH64_state_s {
+    XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+    XXH64_hash_t v[4];         /*!< Accumulator lanes */
+    XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+    XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+    XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
+-   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it, it may be removed. */
++   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+ };   /* typedef'd to XXH64_state_t */
+ 
++#ifndef XXH_NO_XXH3
++
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+ #  include <stdalign.h>
+ #  define XXH_ALIGN(n)      alignas(n)
+ #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+ /* In C++ alignas() is a keyword */
+ #  define XXH_ALIGN(n)      alignas(n)
+ #elif defined(__GNUC__)
+ #  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+@@ -1033,16 +1500,17 @@ struct XXH64_state_s {
+  *
+  * This is the optimal update size for incremental hashing.
+  *
+  * @see XXH3_64b_update(), XXH3_128b_update().
+  */
+ #define XXH3_INTERNALBUFFER_SIZE 256
+ 
+ /*!
++ * @internal
+  * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+  *
+  * This is the size used in @ref XXH3_kSecret and the seeded functions.
+  *
+  * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+  */
+ #define XXH3_SECRET_DEFAULT_SIZE 192
+ 
+@@ -1065,17 +1533,17 @@ struct XXH64_state_s {
+  * Do never access the members of this struct directly.
+  *
+  * @see XXH3_INITSTATE() for stack initialization.
+  * @see XXH3_createState(), XXH3_freeState().
+  * @see XXH32_state_s, XXH64_state_s
+  */
+ struct XXH3_state_s {
+    XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+-       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
++       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+    XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+        /*!< Used to store a custom secret generated from a seed. */
+    XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+        /*!< The internal buffer. @see XXH32_state_s::mem32 */
+    XXH32_hash_t bufferedSize;
+        /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+    XXH32_hash_t useSeed;
+        /*!< Reserved field. Needed for padding on 64-bit. */
+@@ -1105,122 +1573,174 @@ struct XXH3_state_s {
+  * When the @ref XXH3_state_t structure is merely emplaced on stack,
+  * it should be initialized with XXH3_INITSTATE() or a memset()
+  * in case its first reset uses XXH3_NNbits_reset_withSeed().
+  * This init can be omitted if the first reset uses default or _withSecret mode.
+  * This operation isn't necessary when the state is created with XXH3_createState().
+  * Note that this doesn't prepare the state for a streaming operation,
+  * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+  */
+-#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
+-
+-
+-/* XXH128() :
++#define XXH3_INITSTATE(XXH3_state_ptr)                       \
++    do {                                                     \
++        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
++        tmp_xxh3_state_ptr->seed = 0;                        \
++        tmp_xxh3_state_ptr->extSecret = NULL;                \
++    } while(0)
++
++
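++/*
++ * A minimal sketch of the stack-initialization pattern described above
++ * (`buffer` and `bufferSize` are caller-provided placeholders):
++ *
++ *     XXH3_state_t state;                     // on the stack, no XXH3_createState()
++ *     XXH3_INITSTATE(&state);                 // required before a seeded first reset
++ *     XXH3_64bits_reset_withSeed(&state, 42);
++ *     XXH3_64bits_update(&state, buffer, bufferSize);
++ *     XXH64_hash_t const hash = XXH3_64bits_digest(&state);
++ */
++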
++/*!
+  * simple alias to pre-selected XXH3_128bits variant
+  */
+-XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+ 
+ 
+ /* ===   Experimental API   === */
+ /* Symbols defined below must be considered tied to a specific library version. */
+ 
+-/*
++/*!
+  * XXH3_generateSecret():
+  *
+  * Derive a high-entropy secret from any user-defined content, named customSeed.
+  * The generated secret can be used in combination with `*_withSecret()` functions.
+- * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
+- * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
++ * The `_withSecret()` variants are useful to provide a higher level of protection
++ * than 64-bit seed, as it becomes much more difficult for an external actor to
++ * guess how to impact the calculation logic.
+  *
+  * The function accepts as input a custom seed of any length and any content,
+- * and derives from it a high-entropy secret of length @secretSize
+- * into an already allocated buffer @secretBuffer.
+- * @secretSize must be >= XXH3_SECRET_SIZE_MIN
++ * and derives from it a high-entropy secret of length @p secretSize into an
++ * already allocated buffer @p secretBuffer.
+  *
+  * The generated secret can then be used with any `*_withSecret()` variant.
+- * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
+- * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
++ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
++ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+  * are part of this list. They all accept a `secret` parameter
+- * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
++ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+  * _and_ feature very high entropy (consist of random-looking bytes).
+- * These conditions can be a high bar to meet, so
+- * XXH3_generateSecret() can be employed to ensure proper quality.
+- *
+- * customSeed can be anything. It can have any size, even small ones,
+- * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
+- * The resulting `secret` will nonetheless provide all required qualities.
+- *
+- * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+- */
+-XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
+-
+-
+-/*
+- * XXH3_generateSecret_fromSeed():
+- *
+- * Generate the same secret as the _withSeed() variants.
+- *
+- * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
+- * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
++ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
++ * be employed to ensure proper quality.
++ *
++ * @p customSeed can be anything. It can have any size, even small ones,
++ * and its content can be anything, even "poor entropy" sources such as a bunch
++ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
++ *
++ * @pre
++ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
++ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
++ *
++ * Example code:
++ * @code{.c}
++ *    #include <stdio.h>
++ *    #include <stdlib.h>
++ *    #include <string.h>
++ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
++ *    #include "xxhash.h"
++ *    // Hashes argv[2] using the entropy from argv[1].
++ *    int main(int argc, char* argv[])
++ *    {
++ *        char secret[XXH3_SECRET_SIZE_MIN];
++ *        if (argc != 3) { return 1; }
++ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
++ *        XXH64_hash_t h = XXH3_64bits_withSecret(
++ *             argv[2], strlen(argv[2]),
++ *             secret, sizeof(secret)
++ *        );
++ *        printf("%016llx\n", (unsigned long long) h);
++ *    }
++ * @endcode
++ */
++XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
++
++/*!
++ * @brief Generate the same secret as the _withSeed() variants.
+  *
+  * The generated secret can be used in combination with
+  *`*_withSecret()` and `_withSecretandSeed()` variants.
+- * This generator is notably useful in combination with `_withSecretandSeed()`,
+- * as a way to emulate a faster `_withSeed()` variant.
+- */
+-XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
+-
+-/*
+- * *_withSecretandSeed() :
++ *
++ * Example C++ `std::string` hash class:
++ * @code{.cpp}
++ *    #include <string>
++ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
++ *    #include "xxhash.h"
++ *    // Slow, seeds each time
++ *    class HashSlow {
++ *        XXH64_hash_t seed;
++ *    public:
++ *        HashSlow(XXH64_hash_t s) : seed{s} {}
++ *        size_t operator()(const std::string& x) const {
++ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
++ *        }
++ *    };
++ *    // Fast, caches the seeded secret for future uses.
++ *    class HashFast {
++ *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
++ *    public:
++ *        HashFast(XXH64_hash_t s) {
++ *            XXH3_generateSecret_fromSeed(secret, s);
++ *        }
++ *        size_t operator()(const std::string& x) const {
++ *            return size_t{
++ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
++ *            };
++ *        }
++ *    };
++ * @endcode
++ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
++ * @param seed The 64-bit seed from which the secret is derived.
++ */
++XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
++
++/*!
+  * These variants generate hash values using either
+- * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+- * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
++ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
++ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+  *
+  * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+  * `_withSeed()` has to generate the secret on the fly for "large" keys.
+  * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+  * `_withSecret()` has to generate the masks on the fly for "small" keys,
+  * which requires more instructions than _withSeed() variants.
+  * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
+  *
+- * When @secret has been generated by XXH3_generateSecret_fromSeed(),
++ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+  * this variant produces *exactly* the same results as `_withSeed()` variant,
+  * hence offering only a pure speed benefit on "large" input,
+  * by skipping the need to regenerate the secret for every large input.
+  *
+  * Another usage scenario is to hash the secret to a 64-bit hash value,
+  * for example with XXH3_64bits(), which then becomes the seed,
+  * and then employ both the seed and the secret in _withSecretandSeed().
+  * On top of speed, an added benefit is that each bit in the secret
+- * has a 50% chance to swap each bit in the output,
+- * via its impact to the seed.
++ * has a 50% chance to swap each bit in the output, via its impact on the seed.
++ *
+  * This is not guaranteed when using the secret directly in "small data" scenarios,
+  * because only portions of the secret are employed for small data.
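++ *
++ * A minimal sketch of the XXH3_generateSecret_fromSeed() pairing described
++ * above (@p data and @p len stand for any caller-provided input):
++ * @code{.c}
++ *    unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
++ *    XXH64_hash_t const seed = 0xDEADBEEF;         // any seed value
++ *    XXH3_generateSecret_fromSeed(secret, seed);   // done once
++ *    // Same result as XXH3_64bits_withSeed(data, len, seed), but the secret
++ *    // is not regenerated for every large input:
++ *    XXH64_hash_t const h = XXH3_64bits_withSecretandSeed(data, len,
++ *                                      secret, sizeof(secret), seed);
++ * @endcode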
+  */
+-XXH_PUBLIC_API XXH64_hash_t
+-XXH3_64bits_withSecretandSeed(const void* data, size_t len,
+-                              const void* secret, size_t secretSize,
++XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
++XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
++                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed);
+-
+-XXH_PUBLIC_API XXH128_hash_t
+-XXH3_128bits_withSecretandSeed(const void* data, size_t len,
+-                               const void* secret, size_t secretSize,
++/*! @copydoc XXH3_64bits_withSecretandSeed() */
++XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
++XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
++                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                                XXH64_hash_t seed64);
+-
++#ifndef XXH_NO_STREAM
++/*! @copydoc XXH3_64bits_withSecretandSeed() */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
+-                                    const void* secret, size_t secretSize,
++XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
++                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+-
++/*! @copydoc XXH3_64bits_withSecretandSeed() */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
+-                                     const void* secret, size_t secretSize,
++XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
++                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                      XXH64_hash_t seed64);
+-
+-
++#endif /* !XXH_NO_STREAM */
++
++#endif  /* !XXH_NO_XXH3 */
+ #endif  /* XXH_NO_LONG_LONG */
+ #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+ #  define XXH_IMPLEMENTATION
+ #endif
+ 
+ #endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+ 
+ 
+@@ -1264,17 +1784,17 @@ XXH3_128bits_reset_withSecretandSeed(XXH
+  * @{
+  *
+  * Various macros to control xxHash's behavior.
+  */
+ #ifdef XXH_DOXYGEN
+ /*!
+  * @brief Define this to disable 64-bit code.
+  *
+- * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
++ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+  */
+ #  define XXH_NO_LONG_LONG
+ #  undef XXH_NO_LONG_LONG /* don't actually */
+ /*!
+  * @brief Controls how unaligned memory is accessed.
+  *
+  * By default, access to unaligned memory is controlled by `memcpy()`, which is
+  * safe and portable.
+@@ -1287,17 +1807,17 @@ XXH3_128bits_reset_withSecretandSeed(XXH
+  *
+  * @par Possible options:
+  *
+  *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+  *   @par
+  *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+  *     eliminate the function call and treat it as an unaligned access.
+  *
+- *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
++ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+  *   @par
+  *     Depends on compiler extensions and is therefore not portable.
+  *     This method is safe _if_ your compiler supports it,
+  *     and *generally* as fast or faster than `memcpy`.
+  *
+  *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+  *  @par
+  *     Casts directly and dereferences. This method doesn't depend on the
+@@ -1307,30 +1827,58 @@ XXH3_128bits_reset_withSecretandSeed(XXH
+  *     only known way to get the most performance.
+  *
+  *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+  *  @par
+  *     Also portable. This can generate the best code on old compilers which don't
+  *     inline small `memcpy()` calls, and it might also be faster on big-endian
+  *     systems which lack a native byteswap instruction. However, some compilers
+  *     will emit literal byteshifts even if the target supports unaligned access.
+- *  .
++ *
+  *
+  * @warning
+  *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+  *   care, as what works on one compiler/platform/optimization level may cause
+  *   another to read garbage data or even crash.
+  *
+- * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
++ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+  *
+  * Prefer these methods in priority order (0 > 3 > 1 > 2)
+  */
+ #  define XXH_FORCE_MEMORY_ACCESS 0
+ 
+ /*!
++ * @def XXH_SIZE_OPT
++ * @brief Controls how much xxHash optimizes for size.
++ *
++ * xxHash, when compiled, tends to result in a rather large binary size. This
++ * is mostly due to heavy usage of forced inlining and constant folding of the
++ * @ref XXH3_family to increase performance.
++ *
++ * However, some developers prefer size over speed. This option can
++ * significantly reduce the size of the generated code. When using the `-Os`
++ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
++ * otherwise it is defined to 0.
++ *
++ * Most of these size optimizations can be controlled manually.
++ *
++ * This is a number from 0-2.
++ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
++ *    comes first.
++ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
++ *    conservative and disables hacks that increase code size. It implies the
++ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
++ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
++ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
++ *    Performance may cry. For example, the single shot functions just use the
++ *    streaming API.
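++ *
++ * For example, a size-focused build might look like this sketch (the macro can
++ * also be passed on the command line, e.g. `-DXXH_SIZE_OPT=2`):
++ * @code{.c}
++ *    #define XXH_SIZE_OPT 2     // smallest code, at a speed cost
++ *    #define XXH_INLINE_ALL
++ *    #include "xxhash.h"
++ * @endcode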
++ */
++#  define XXH_SIZE_OPT 0
++
++/*!
+  * @def XXH_FORCE_ALIGN_CHECK
+  * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+  * and XXH64() only).
+  *
+  * This is an important performance trick for architectures without decent
+  * unaligned memory access performance.
+  *
+  * It checks for input alignment, and when conditions are met, uses a "fast
+@@ -1341,19 +1889,21 @@ XXH3_128bits_reset_withSecretandSeed(XXH
+  * but not zero.
+  *
+  * Moreover, it's not useful to generate an additional code path if memory
+  * access uses the same instruction for both aligned and unaligned
+  * addresses (e.g. x86 and aarch64).
+  *
+  * In these cases, the alignment check can be removed by setting this macro to 0.
+  * Then the code will always use unaligned memory access.
+- * Align check is automatically disabled on x86, x64 & arm64,
++ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
+  * which are platforms known to offer good unaligned memory access performance.
+  *
++ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
++ *
+  * This option does not affect XXH3 (only XXH32 and XXH64).
+  */
+ #  define XXH_FORCE_ALIGN_CHECK 0
+ 
+ /*!
+  * @def XXH_NO_INLINE_HINTS
+  * @brief When non-zero, sets all functions to `static`.
+  *
+@@ -1365,22 +1915,39 @@ XXH3_128bits_reset_withSecretandSeed(XXH
+  * might not be favorable.
+  *
+  * Additionally, sometimes the forced inlining can be detrimental to performance,
+  * depending on the architecture.
+  *
+  * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+  * compiler full control on whether to inline or not.
+  *
+- * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+- * -fno-inline with GCC or Clang, this will automatically be defined.
++ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
++ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+  */
+ #  define XXH_NO_INLINE_HINTS 0
+ 
+ /*!
++ * @def XXH3_INLINE_SECRET
++ * @brief Determines whether to inline the XXH3 withSecret code.
++ *
++ * When the secret size is known, the compiler can improve the performance
++ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
++ *
++ * However, if the secret size is not known, it doesn't have any benefit. This
++ * happens when xxHash is compiled into a global symbol. Therefore, if
++ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
++ *
++ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
++ * that are *sometimes* force inline on -Og, and it is impossible to automatically
++ * detect this optimization level.
++ */
++#  define XXH3_INLINE_SECRET 0
++
++/*!
+  * @def XXH32_ENDJMP
+  * @brief Whether to use a jump for `XXH32_finalize`.
+  *
+  * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+  * This is generally preferable for performance,
+  * but depending on exact architecture, a jmp may be preferable.
+  *
+  * This setting is only possibly making a difference for very small inputs.
+@@ -1391,91 +1958,130 @@ XXH3_128bits_reset_withSecretandSeed(XXH
+  * @internal
+  * @brief Redefines old internal names.
+  *
+  * For compatibility with code that uses xxHash's internals before the names
+  * were changed to improve namespacing. There is no other reason to use this.
+  */
+ #  define XXH_OLD_NAMES
+ #  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
++
++/*!
++ * @def XXH_NO_STREAM
++ * @brief Disables the streaming API.
++ *
++ * When xxHash is not inlined and the streaming functions are not used, disabling
++ * the streaming functions can improve code size significantly, especially with
++ * the @ref XXH3_family which tends to make constant folded copies of itself.
++ */
++#  define XXH_NO_STREAM
++#  undef XXH_NO_STREAM /* don't actually */
+ #endif /* XXH_DOXYGEN */
+ /*!
+  * @}
+  */
+ 
+ #ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+-   /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
+-#  if !defined(__clang__) && \
+-( \
+-    (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+-    ( \
+-        defined(__GNUC__) && ( \
+-            (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
+-            ( \
+-                defined(__mips__) && \
+-                (__mips <= 5 || __mips_isa_rev < 6) && \
+-                (!defined(__mips16) || defined(__mips_mips16e2)) \
+-            ) \
+-        ) \
+-    ) \
+-)
++   /* prefer __packed__ structures (method 1) for GCC
++    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
++    * which for some reason does unaligned loads. */
++#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+ #    define XXH_FORCE_MEMORY_ACCESS 1
+ #  endif
+ #endif
+ 
++#ifndef XXH_SIZE_OPT
++   /* default to 1 for -Os or -Oz */
++#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
++#    define XXH_SIZE_OPT 1
++#  else
++#    define XXH_SIZE_OPT 0
++#  endif
++#endif
++
+ #ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+-#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
+-   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
++   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
++#  if XXH_SIZE_OPT >= 1 || \
++      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
++   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */
+ #    define XXH_FORCE_ALIGN_CHECK 0
+ #  else
+ #    define XXH_FORCE_ALIGN_CHECK 1
+ #  endif
+ #endif
+ 
+ #ifndef XXH_NO_INLINE_HINTS
+-#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+-   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
++#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+ #    define XXH_NO_INLINE_HINTS 1
+ #  else
+ #    define XXH_NO_INLINE_HINTS 0
+ #  endif
+ #endif
+ 
++#ifndef XXH3_INLINE_SECRET
++#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
++     || !defined(XXH_INLINE_ALL)
++#    define XXH3_INLINE_SECRET 0
++#  else
++#    define XXH3_INLINE_SECRET 1
++#  endif
++#endif
++
+ #ifndef XXH32_ENDJMP
+ /* generally preferable for performance */
+ #  define XXH32_ENDJMP 0
+ #endif
+ 
+ /*!
+  * @defgroup impl Implementation
+  * @{
+  */
+ 
+ 
+ /* *************************************
+ *  Includes & Memory related functions
+ ***************************************/
++#if defined(XXH_NO_STREAM)
++/* nothing */
++#elif defined(XXH_NO_STDLIB)
++
++/* When requesting to disable any mention of stdlib,
++ * the library loses the ability to invoke malloc() / free().
++ * In practice, it means that functions like `XXH*_createState()`
++ * will always fail, and return NULL.
++ * This flag is useful in situations where
++ * xxhash.h is integrated into some kernel, embedded or limited environment
++ * without access to dynamic allocation.
++ */
++
++static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
++static void XXH_free(void* p) { (void)p; }
++
++#else
++
+ /*
+  * Modify the local functions below should you wish to use
+  * different memory routines for malloc() and free()
+  */
+ #include <stdlib.h>
+ 
+ /*!
+  * @internal
+  * @brief Modify this function to use a different routine than malloc().
+  */
+-static void* XXH_malloc(size_t s) { return malloc(s); }
++static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+ 
+ /*!
+  * @internal
+  * @brief Modify this function to use a different routine than free().
+  */
+ static void XXH_free(void* p) { free(p); }
+ 
++#endif  /* XXH_NO_STDLIB */
++
+ #include <string.h>
+ 
+ /*!
+  * @internal
+  * @brief Modify this function to use a different routine than memcpy().
+  */
+ static void* XXH_memcpy(void* dest, const void* src, size_t size)
+ {
+@@ -1510,16 +2116,21 @@ static void* XXH_memcpy(void* dest, cons
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+ #  define XXH_FORCE_INLINE static inline
+ #  define XXH_NO_INLINE static
+ #else
+ #  define XXH_FORCE_INLINE static
+ #  define XXH_NO_INLINE static
+ #endif
+ 
++#if XXH3_INLINE_SECRET
++#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
++#else
++#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
++#endif
+ 
+ 
+ /* *************************************
+ *  Debug
+ ***************************************/
+ /*!
+  * @ingroup tuning
+  * @def XXH_DEBUGLEVEL
+@@ -1535,24 +2146,27 @@ static void* XXH_memcpy(void* dest, cons
+ #    define XXH_DEBUGLEVEL 0
+ #  endif
+ #endif
+ 
+ #if (XXH_DEBUGLEVEL>=1)
+ #  include <assert.h>   /* note: can still be disabled with NDEBUG */
+ #  define XXH_ASSERT(c)   assert(c)
+ #else
+-#  define XXH_ASSERT(c)   ((void)0)
++#  if defined(__INTEL_COMPILER)
++#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
++#  else
++#    define XXH_ASSERT(c)   XXH_ASSUME(c)
++#  endif
+ #endif
+ 
+ /* note: use after variable declarations */
+ #ifndef XXH_STATIC_ASSERT
+ #  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+-#    include <assert.h>
+-#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
++#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+ #  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+ #    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+ #  else
+ #    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+ #  endif
+ #  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+ #endif
+ 
+@@ -1568,35 +2182,44 @@ static void* XXH_memcpy(void* dest, cons
+  * This is used in a few places to avoid unwanted autovectorization (e.g.
+  * XXH32_round()). All vectorization we want is explicit via intrinsics,
+  * and _usually_ isn't wanted elsewhere.
+  *
+  * We also use it to prevent unwanted constant folding for AArch64 in
+  * XXH3_initCustomSecret_scalar().
+  */
+ #if defined(__GNUC__) || defined(__clang__)
+-#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
++#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+ #else
+ #  define XXH_COMPILER_GUARD(var) ((void)0)
+ #endif
+ 
++/* Specifically for NEON vectors which use the "w" constraint, on
++ * Clang. */
++#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
++#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
++#else
++#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
++#endif
++
+ /* *************************************
+ *  Basic Types
+ ***************************************/
+ #if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+ # include <stdint.h>
+   typedef uint8_t xxh_u8;
+ #else
+   typedef unsigned char xxh_u8;
+ #endif
+ typedef XXH32_hash_t xxh_u32;
+ 
+ #ifdef XXH_OLD_NAMES
++#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+ #  define BYTE xxh_u8
+ #  define U8   xxh_u8
+ #  define U32  xxh_u32
+ #endif
+ 
+ /* ***   Memory access   *** */
+ 
+ /*!
+@@ -1660,35 +2283,36 @@ typedef XXH32_hash_t xxh_u32;
+  * Force direct memory access. Only works on CPU which support unaligned memory
+  * access in hardware.
+  */
+ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+ 
+ #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+ 
+ /*
+- * __pack instructions are safer but compiler specific, hence potentially
+- * problematic for some compilers.
+- *
+- * Currently only defined for GCC and ICC.
++ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
++ * documentation claimed that it only increased the alignment, but actually it
++ * can decrease it on gcc, clang, and icc:
++ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
++ * https://gcc.godbolt.org/z/xYez1j67Y.
+  */
+ #ifdef XXH_OLD_NAMES
+ typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+ #endif
+ static xxh_u32 XXH_read32(const void* ptr)
+ {
+-    typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
+-    return ((const xxh_unalign*)ptr)->u32;
++    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
++    return *((const xxh_unalign32*)ptr);
+ }
+ 
+ #else
+ 
+ /*
+  * Portable and safe solution. Generally efficient.
+- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
++ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+  */
+ static xxh_u32 XXH_read32(const void* memPtr)
+ {
+     xxh_u32 val;
+     XXH_memcpy(&val, memPtr, sizeof(val));
+     return val;
+ }
+ 
+@@ -1754,16 +2378,61 @@ static int XXH_isLittleEndian(void)
+ #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+ 
+ #ifdef __has_builtin
+ #  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+ #else
+ #  define XXH_HAS_BUILTIN(x) 0
+ #endif
+ 
++
++
++/*
++ * C23 and future versions have standard "unreachable()".
++ * Once it has been implemented reliably we can add it as an
++ * additional case:
++ *
++ * ```
++ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
++ * #  include <stddef.h>
++ * #  ifdef unreachable
++ * #    define XXH_UNREACHABLE() unreachable()
++ * #  endif
++ * #endif
++ * ```
++ *
++ * Note C++23 also has std::unreachable() which can be detected
++ * as follows:
++ * ```
++ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
++ * #  include <utility>
++ * #  define XXH_UNREACHABLE() std::unreachable()
++ * #endif
++ * ```
++ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
++ * We don't use that as including `<utility>` in `extern "C"` blocks
++ * doesn't work on GCC12
++ */
++
++#if XXH_HAS_BUILTIN(__builtin_unreachable)
++#  define XXH_UNREACHABLE() __builtin_unreachable()
++
++#elif defined(_MSC_VER)
++#  define XXH_UNREACHABLE() __assume(0)
++
++#else
++#  define XXH_UNREACHABLE()
++#endif
++
++#if XXH_HAS_BUILTIN(__builtin_assume)
++#  define XXH_ASSUME(c) __builtin_assume(c)
++#else
++#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
++#endif
++
+ /*!
+  * @internal
+  * @def XXH_rotl32(x,r)
+  * @brief 32-bit rotate left.
+  *
+  * @param x The 32-bit integer to be rotated.
+  * @param r The number of bits to rotate.
+  * @pre
+@@ -1876,18 +2545,20 @@ XXH_readLE32_align(const void* ptr, XXH_
+ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+ 
+ 
+ /* *******************************************************************
+ *  32-bit hash functions
+ *********************************************************************/
+ /*!
+  * @}
+- * @defgroup xxh32_impl XXH32 implementation
++ * @defgroup XXH32_impl XXH32 implementation
+  * @ingroup impl
++ *
++ * Details on the XXH32 implementation.
+  * @{
+  */
+  /* #define instead of static const, to be used as initializers */
+ #define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+ #define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+ #define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+ #define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+ #define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+@@ -1911,17 +2582,17 @@ XXH_PUBLIC_API unsigned XXH_versionNumbe
+  * @param input The stripe of input to mix.
+  * @return The mixed accumulator lane.
+  */
+ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+ {
+     acc += input * XXH_PRIME32_2;
+     acc  = XXH_rotl32(acc, 13);
+     acc *= XXH_PRIME32_1;
+-#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
++#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+     /*
+      * UGLY HACK:
+      * A compiler fence is the only thing that prevents GCC and Clang from
+      * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+      * reason) without globally disabling SSE4.1.
+      *
+      * The reason we want to avoid vectorization is because despite working on
+      * 4 integers at a time, there are multiple factors slowing XXH32 down on
+@@ -1941,131 +2612,135 @@ static xxh_u32 XXH32_round(xxh_u32 acc, 
+      *      roll   v, 13    // reliably fast across the board
+      *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+      *
+      * - Instruction level parallelism is actually more beneficial here because
+      *   the SIMD actually serializes this operation: While v1 is rotating, v2
+      *   can load data, while v3 can multiply. SSE forces them to operate
+      *   together.
+      *
+-     * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+-     * and it is pointless writing a NEON implementation that is basically the
+-     * same speed as scalar for XXH32.
++     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
++     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
++     * than half the speed.
++     *
++     * Additionally, this is used on WASM SIMD128 because it JITs to the same
++     * SIMD instructions and has the same issue.
+      */
+     XXH_COMPILER_GUARD(acc);
+ #endif
+     return acc;
+ }
+ 
+ /*!
+  * @internal
+  * @brief Mixes all bits to finalize the hash.
+  *
+  * The final mix ensures that all input bits have a chance to impact any bit in
+  * the output digest, resulting in an unbiased distribution.
+  *
+- * @param h32 The hash to avalanche.
++ * @param hash The hash to avalanche.
+  * @return The avalanched hash.
+  */
+-static xxh_u32 XXH32_avalanche(xxh_u32 h32)
+-{
+-    h32 ^= h32 >> 15;
+-    h32 *= XXH_PRIME32_2;
+-    h32 ^= h32 >> 13;
+-    h32 *= XXH_PRIME32_3;
+-    h32 ^= h32 >> 16;
+-    return(h32);
++static xxh_u32 XXH32_avalanche(xxh_u32 hash)
++{
++    hash ^= hash >> 15;
++    hash *= XXH_PRIME32_2;
++    hash ^= hash >> 13;
++    hash *= XXH_PRIME32_3;
++    hash ^= hash >> 16;
++    return hash;
+ }
+ 
+ #define XXH_get32bits(p) XXH_readLE32_align(p, align)
+ 
+ /*!
+  * @internal
+  * @brief Processes the last 0-15 bytes of @p ptr.
+  *
+  * There may be up to 15 bytes remaining to consume from the input.
+  * This final stage will digest them to ensure that all input bytes are present
+  * in the final mix.
+  *
+- * @param h32 The hash to finalize.
++ * @param hash The hash to finalize.
+  * @param ptr The pointer to the remaining input.
+  * @param len The remaining length, modulo 16.
+  * @param align Whether @p ptr is aligned.
+  * @return The finalized hash.
+- */
+-static xxh_u32
+-XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
+-{
+-#define XXH_PROCESS1 do {                           \
+-    h32 += (*ptr++) * XXH_PRIME32_5;                \
+-    h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1;      \
++ * @see XXH64_finalize().
++ */
++static XXH_PUREF xxh_u32
++XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
++{
++#define XXH_PROCESS1 do {                             \
++    hash += (*ptr++) * XXH_PRIME32_5;                 \
++    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
+ } while (0)
+ 
+-#define XXH_PROCESS4 do {                           \
+-    h32 += XXH_get32bits(ptr) * XXH_PRIME32_3;      \
+-    ptr += 4;                                   \
+-    h32  = XXH_rotl32(h32, 17) * XXH_PRIME32_4;     \
++#define XXH_PROCESS4 do {                             \
++    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
++    ptr += 4;                                         \
++    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
+ } while (0)
+ 
+     if (ptr==NULL) XXH_ASSERT(len == 0);
+ 
+     /* Compact rerolled version; generally faster */
+     if (!XXH32_ENDJMP) {
+         len &= 15;
+         while (len >= 4) {
+             XXH_PROCESS4;
+             len -= 4;
+         }
+         while (len > 0) {
+             XXH_PROCESS1;
+             --len;
+         }
+-        return XXH32_avalanche(h32);
++        return XXH32_avalanche(hash);
+     } else {
+          switch(len&15) /* or switch(bEnd - p) */ {
+            case 12:      XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 8:       XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 4:       XXH_PROCESS4;
+-                         return XXH32_avalanche(h32);
++                         return XXH32_avalanche(hash);
+ 
+            case 13:      XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 9:       XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 5:       XXH_PROCESS4;
+                          XXH_PROCESS1;
+-                         return XXH32_avalanche(h32);
++                         return XXH32_avalanche(hash);
+ 
+            case 14:      XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 10:      XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 6:       XXH_PROCESS4;
+                          XXH_PROCESS1;
+                          XXH_PROCESS1;
+-                         return XXH32_avalanche(h32);
++                         return XXH32_avalanche(hash);
+ 
+            case 15:      XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 11:      XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 7:       XXH_PROCESS4;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 3:       XXH_PROCESS1;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 2:       XXH_PROCESS1;
+-                         XXH_FALLTHROUGH;
++                         XXH_FALLTHROUGH;  /* fallthrough */
+            case 1:       XXH_PROCESS1;
+-                         XXH_FALLTHROUGH;
+-           case 0:       return XXH32_avalanche(h32);
++                         XXH_FALLTHROUGH;  /* fallthrough */
++           case 0:       return XXH32_avalanche(hash);
+         }
+         XXH_ASSERT(0);
+-        return h32;   /* reaching this point is deemed impossible */
++        return hash;   /* reaching this point is deemed impossible */
+     }
+ }
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define PROCESS1 XXH_PROCESS1
+ #  define PROCESS4 XXH_PROCESS4
+ #else
+ #  undef XXH_PROCESS1
+@@ -2075,17 +2750,17 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8
+ /*!
+  * @internal
+  * @brief The implementation for @ref XXH32().
+  *
+  * @param input , len , seed Directly passed from @ref XXH32().
+  * @param align Whether @p input is aligned.
+  * @return The calculated hash.
+  */
+-XXH_FORCE_INLINE xxh_u32
++XXH_FORCE_INLINE XXH_PUREF xxh_u32
+ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+ {
+     xxh_u32 h32;
+ 
+     if (input==NULL) XXH_ASSERT(len == 0);
+ 
+     if (len>=16) {
+         const xxh_u8* const bEnd = input + len;
+@@ -2108,20 +2783,20 @@ XXH32_endian_align(const xxh_u8* input, 
+         h32  = seed + XXH_PRIME32_5;
+     }
+ 
+     h32 += (xxh_u32)len;
+ 
+     return XXH32_finalize(h32, input, len&15, align);
+ }
+ 
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+ {
+-#if 0
++#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+     XXH32_state_t state;
+     XXH32_reset(&state, seed);
+     XXH32_update(&state, (const xxh_u8*)input, len);
+     return XXH32_digest(&state);
+ #else
+     if (XXH_FORCE_ALIGN_CHECK) {
+         if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+@@ -2130,52 +2805,49 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const
+ 
+     return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+ #endif
+ }
+ 
+ 
+ 
+ /*******   Hash streaming   *******/
+-/*!
+- * @ingroup xxh32_family
+- */
++#ifndef XXH_NO_STREAM
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+ {
+     return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+ }
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+ {
+     XXH_free(statePtr);
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+ {
+     XXH_memcpy(dstState, srcState, sizeof(*dstState));
+ }
+ 
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+ {
+-    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+-    memset(&state, 0, sizeof(state));
+-    state.v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+-    state.v[1] = seed + XXH_PRIME32_2;
+-    state.v[2] = seed + 0;
+-    state.v[3] = seed - XXH_PRIME32_1;
+-    /* do not write into reserved, planned to be removed in a future version */
+-    XXH_memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
++    XXH_ASSERT(statePtr != NULL);
++    memset(statePtr, 0, sizeof(*statePtr));
++    statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
++    statePtr->v[1] = seed + XXH_PRIME32_2;
++    statePtr->v[2] = seed + 0;
++    statePtr->v[3] = seed - XXH_PRIME32_1;
+     return XXH_OK;
+ }
+ 
+ 
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH_errorcode
+ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+ {
+     if (input==NULL) {
+         XXH_ASSERT(len == 0);
+         return XXH_OK;
+     }
+ 
+@@ -2220,17 +2892,17 @@ XXH32_update(XXH32_state_t* state, const
+             state->memsize = (unsigned)(bEnd-p);
+         }
+     }
+ 
+     return XXH_OK;
+ }
+ 
+ 
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+ {
+     xxh_u32 h32;
+ 
+     if (state->large_len) {
+         h32 = XXH_rotl32(state->v[0], 1)
+             + XXH_rotl32(state->v[1], 7)
+             + XXH_rotl32(state->v[2], 12)
+@@ -2238,22 +2910,22 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest
+     } else {
+         h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+     }
+ 
+     h32 += state->total_len_32;
+ 
+     return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+ }
+-
++#endif /* !XXH_NO_STREAM */
+ 
+ /*******   Canonical representation   *******/
+ 
+ /*!
+- * @ingroup xxh32_family
++ * @ingroup XXH32_family
+  * The default return values from XXH functions are unsigned 32 and 64 bit
+  * integers.
+  *
+  * The canonical representation uses big endian convention, the same convention
+  * as human-readable numbers (large digits first).
+  *
+  * This way, hash values can be written into a file or buffer, remaining
+  * comparable across different systems.
+@@ -2262,17 +2934,17 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest
+  * canonical format.
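++ *
++ * A minimal sketch (`save_hash` / `load_hash` are illustrative names only):
++ * @code{.c}
++ *    #include <stdio.h>
++ *    #include "xxhash.h"
++ *    // Store a hash in its portable, endian-independent form.
++ *    void save_hash(FILE* f, const void* data, size_t len)
++ *    {
++ *        XXH32_canonical_t c;
++ *        XXH32_canonicalFromHash(&c, XXH32(data, len, 0));
++ *        fwrite(&c, sizeof(c), 1, f);
++ *    }
++ *    // Read it back on any platform (error handling omitted).
++ *    XXH32_hash_t load_hash(FILE* f)
++ *    {
++ *        XXH32_canonical_t c;
++ *        fread(&c, sizeof(c), 1, f);
++ *        return XXH32_hashFromCanonical(&c);
++ *    }
++ * @endcode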
+  */
+ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+ {
+     XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+     XXH_memcpy(dst, &hash, sizeof(*dst));
+ }
+-/*! @ingroup xxh32_family */
++/*! @ingroup XXH32_family */
+ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+ {
+     return XXH_readBE32(src);
+ }
+ 
+ 
+ #ifndef XXH_NO_LONG_LONG
+ 
+@@ -2303,35 +2975,36 @@ typedef XXH64_hash_t xxh_u64;
+ static xxh_u64 XXH_read64(const void* memPtr)
+ {
+     return *(const xxh_u64*) memPtr;
+ }
+ 
+ #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+ 
+ /*
+- * __pack instructions are safer, but compiler specific, hence potentially
+- * problematic for some compilers.
+- *
+- * Currently only defined for GCC and ICC.
++ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
++ * documentation claimed that it only increased the alignment, but actually it
++ * can decrease it on gcc, clang, and icc:
++ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
++ * https://gcc.godbolt.org/z/xYez1j67Y.
+  */
+ #ifdef XXH_OLD_NAMES
+ typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+ #endif
+ static xxh_u64 XXH_read64(const void* ptr)
+ {
+-    typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
+-    return ((const xxh_unalign64*)ptr)->u64;
++    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
++    return *((const xxh_unalign64*)ptr);
+ }
+ 
+ #else
+ 
+ /*
+  * Portable and safe solution. Generally efficient.
+- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
++ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+  */
+ static xxh_u64 XXH_read64(const void* memPtr)
+ {
+     xxh_u64 val;
+     XXH_memcpy(&val, memPtr, sizeof(val));
+     return val;
+ }
+ 
+@@ -2405,18 +3078,20 @@ XXH_readLE64_align(const void* ptr, XXH_
+     else
+         return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+ }
+ 
+ 
+ /*******   xxh64   *******/
+ /*!
+  * @}
+- * @defgroup xxh64_impl XXH64 implementation
++ * @defgroup XXH64_impl XXH64 implementation
+  * @ingroup impl
++ *
++ * Details on the XXH64 implementation.
+  * @{
+  */
+ /* #define rather that static const, to be used as initializers */
+ #define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+ #define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+ #define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+ #define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+ #define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+@@ -2424,82 +3099,107 @@ XXH_readLE64_align(const void* ptr, XXH_
+ #ifdef XXH_OLD_NAMES
+ #  define PRIME64_1 XXH_PRIME64_1
+ #  define PRIME64_2 XXH_PRIME64_2
+ #  define PRIME64_3 XXH_PRIME64_3
+ #  define PRIME64_4 XXH_PRIME64_4
+ #  define PRIME64_5 XXH_PRIME64_5
+ #endif
+ 
++/*! @copydoc XXH32_round */
+ static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+ {
+     acc += input * XXH_PRIME64_2;
+     acc  = XXH_rotl64(acc, 31);
+     acc *= XXH_PRIME64_1;
+     return acc;
+ }
+ 
+ static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+ {
+     val  = XXH64_round(0, val);
+     acc ^= val;
+     acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+     return acc;
+ }
+ 
+-static xxh_u64 XXH64_avalanche(xxh_u64 h64)
+-{
+-    h64 ^= h64 >> 33;
+-    h64 *= XXH_PRIME64_2;
+-    h64 ^= h64 >> 29;
+-    h64 *= XXH_PRIME64_3;
+-    h64 ^= h64 >> 32;
+-    return h64;
++/*! @copydoc XXH32_avalanche */
++static xxh_u64 XXH64_avalanche(xxh_u64 hash)
++{
++    hash ^= hash >> 33;
++    hash *= XXH_PRIME64_2;
++    hash ^= hash >> 29;
++    hash *= XXH_PRIME64_3;
++    hash ^= hash >> 32;
++    return hash;
+ }
+ 
+ 
+ #define XXH_get64bits(p) XXH_readLE64_align(p, align)
+ 
+-static xxh_u64
+-XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
++/*!
++ * @internal
++ * @brief Processes the last 0-31 bytes of @p ptr.
++ *
++ * There may be up to 31 bytes remaining to consume from the input.
++ * This final stage will digest them to ensure that all input bytes are present
++ * in the final mix.
++ *
++ * @param hash The hash to finalize.
++ * @param ptr The pointer to the remaining input.
++ * @param len The remaining length, modulo 32.
++ * @param align Whether @p ptr is aligned.
++ * @return The finalized hash
++ * @see XXH32_finalize().
++ */
++static XXH_PUREF xxh_u64
++XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+ {
+     if (ptr==NULL) XXH_ASSERT(len == 0);
+     len &= 31;
+     while (len >= 8) {
+         xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+         ptr += 8;
+-        h64 ^= k1;
+-        h64  = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
++        hash ^= k1;
++        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+         len -= 8;
+     }
+     if (len >= 4) {
+-        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
++        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+         ptr += 4;
+-        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
++        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+         len -= 4;
+     }
+     while (len > 0) {
+-        h64 ^= (*ptr++) * XXH_PRIME64_5;
+-        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
++        hash ^= (*ptr++) * XXH_PRIME64_5;
++        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+         --len;
+     }
+-    return  XXH64_avalanche(h64);
++    return  XXH64_avalanche(hash);
+ }
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define PROCESS1_64 XXH_PROCESS1_64
+ #  define PROCESS4_64 XXH_PROCESS4_64
+ #  define PROCESS8_64 XXH_PROCESS8_64
+ #else
+ #  undef XXH_PROCESS1_64
+ #  undef XXH_PROCESS4_64
+ #  undef XXH_PROCESS8_64
+ #endif
+ 
+-XXH_FORCE_INLINE xxh_u64
++/*!
++ * @internal
++ * @brief The implementation for @ref XXH64().
++ *
++ * @param input , len , seed Directly passed from @ref XXH64().
++ * @param align Whether @p input is aligned.
++ * @return The calculated hash.
++ */
++XXH_FORCE_INLINE XXH_PUREF xxh_u64
+ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+ {
+     xxh_u64 h64;
+     if (input==NULL) XXH_ASSERT(len == 0);
+ 
+     if (len>=32) {
+         const xxh_u8* const bEnd = input + len;
+         const xxh_u8* const limit = bEnd - 31;
+@@ -2526,20 +3226,20 @@ XXH64_endian_align(const xxh_u8* input, 
+     }
+ 
+     h64 += (xxh_u64) len;
+ 
+     return XXH64_finalize(h64, input, len, align);
+ }
+ 
+ 
+-/*! @ingroup xxh64_family */
+-XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+-{
+-#if 0
++/*! @ingroup XXH64_family */
++XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
++{
++#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+     XXH64_state_t state;
+     XXH64_reset(&state, seed);
+     XXH64_update(&state, (const xxh_u8*)input, len);
+     return XXH64_digest(&state);
+ #else
+     if (XXH_FORCE_ALIGN_CHECK) {
+         if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+@@ -2547,52 +3247,50 @@ XXH_PUBLIC_API XXH64_hash_t XXH64 (const
+     }   }
+ 
+     return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+ 
+ #endif
+ }
+ 
+ /*******   Hash Streaming   *******/
+-
+-/*! @ingroup xxh64_family*/
++#ifndef XXH_NO_STREAM
++/*! @ingroup XXH64_family*/
+ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+ {
+     return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+ }
+-/*! @ingroup xxh64_family */
++/*! @ingroup XXH64_family */
+ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+ {
+     XXH_free(statePtr);
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh64_family */
+-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
++/*! @ingroup XXH64_family */
++XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
+ {
+     XXH_memcpy(dstState, srcState, sizeof(*dstState));
+ }
+ 
+-/*! @ingroup xxh64_family */
+-XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+-{
+-    XXH64_state_t state;   /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
+-    memset(&state, 0, sizeof(state));
+-    state.v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+-    state.v[1] = seed + XXH_PRIME64_2;
+-    state.v[2] = seed + 0;
+-    state.v[3] = seed - XXH_PRIME64_1;
+-     /* do not write into reserved64, might be removed in a future version */
+-    XXH_memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
++/*! @ingroup XXH64_family */
++XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
++{
++    XXH_ASSERT(statePtr != NULL);
++    memset(statePtr, 0, sizeof(*statePtr));
++    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
++    statePtr->v[1] = seed + XXH_PRIME64_2;
++    statePtr->v[2] = seed + 0;
++    statePtr->v[3] = seed - XXH_PRIME64_1;
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh64_family */
++/*! @ingroup XXH64_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH64_update (XXH64_state_t* state, const void* input, size_t len)
++XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+ {
+     if (input==NULL) {
+         XXH_ASSERT(len == 0);
+         return XXH_OK;
+     }
+ 
+     {   const xxh_u8* p = (const xxh_u8*)input;
+         const xxh_u8* const bEnd = p + len;
+@@ -2632,18 +3330,18 @@ XXH64_update (XXH64_state_t* state, cons
+             state->memsize = (unsigned)(bEnd-p);
+         }
+     }
+ 
+     return XXH_OK;
+ }
+ 
+ 
+-/*! @ingroup xxh64_family */
+-XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
++/*! @ingroup XXH64_family */
++XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
+ {
+     xxh_u64 h64;
+ 
+     if (state->total_len >= 32) {
+         h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
+         h64 = XXH64_mergeRound(h64, state->v[0]);
+         h64 = XXH64_mergeRound(h64, state->v[1]);
+         h64 = XXH64_mergeRound(h64, state->v[2]);
+@@ -2651,72 +3349,92 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest
+     } else {
+         h64  = state->v[2] /*seed*/ + XXH_PRIME64_5;
+     }
+ 
+     h64 += (xxh_u64) state->total_len;
+ 
+     return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+ }
+-
++#endif /* !XXH_NO_STREAM */
+ 
+ /******* Canonical representation   *******/
+ 
+-/*! @ingroup xxh64_family */
+-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
++/*! @ingroup XXH64_family */
++XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
+ {
+     XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+     XXH_memcpy(dst, &hash, sizeof(*dst));
+ }
+ 
+-/*! @ingroup xxh64_family */
+-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
++/*! @ingroup XXH64_family */
++XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
+ {
+     return XXH_readBE64(src);
+ }
+ 
+ #ifndef XXH_NO_XXH3
+ 
+ /* *********************************************************************
+ *  XXH3
+ *  New generation hash designed for speed on small keys and vectorization
+ ************************************************************************ */
+ /*!
+  * @}
+- * @defgroup xxh3_impl XXH3 implementation
++ * @defgroup XXH3_impl XXH3 implementation
+  * @ingroup impl
+  * @{
+  */
+ 
+ /* ===   Compiler specifics   === */
+ 
+ #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+-#  define XXH_RESTRICT /* disable */
++#  define XXH_RESTRICT   /* disable */
+ #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+ #  define XXH_RESTRICT   restrict
++#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
++   || (defined (__clang__)) \
++   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
++   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
++/*
++ * There are a LOT more compilers that recognize __restrict but this
++ * covers the major ones.
++ */
++#  define XXH_RESTRICT   __restrict
+ #else
+-/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+ #  define XXH_RESTRICT   /* disable */
+ #endif
+ 
+ #if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+   || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+   || defined(__clang__)
+ #    define XXH_likely(x) __builtin_expect(x, 1)
+ #    define XXH_unlikely(x) __builtin_expect(x, 0)
+ #else
+ #    define XXH_likely(x) (x)
+ #    define XXH_unlikely(x) (x)
+ #endif
+ 
++#ifndef XXH_HAS_INCLUDE
++#  ifdef __has_include
++#    define XXH_HAS_INCLUDE(x) __has_include(x)
++#  else
++#    define XXH_HAS_INCLUDE(x) 0
++#  endif
++#endif
++
+ #if defined(__GNUC__) || defined(__clang__)
++#  if defined(__ARM_FEATURE_SVE)
++#    include <arm_sve.h>
++#  endif
+ #  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
+-   || defined(__aarch64__)  || defined(_M_ARM) \
+-   || defined(_M_ARM64)     || defined(_M_ARM64EC)
++   || (defined(_M_ARM) && _M_ARM >= 7) \
++   || defined(_M_ARM64) || defined(_M_ARM64EC) \
++   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
+ #    define inline __inline__  /* circumvent a clang bug */
+ #    include <arm_neon.h>
+ #    undef inline
+ #  elif defined(__AVX2__)
+ #    include <immintrin.h>
+ #  elif defined(__SSE2__)
+ #    include <emmintrin.h>
+ #  endif
+@@ -2817,57 +3535,66 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFr
+ #  define XXH_VECTOR XXH_SCALAR
+ /*!
+  * @ingroup tuning
+  * @brief Possible values for @ref XXH_VECTOR.
+  *
+  * Note that these are actually implemented as macros.
+  *
+  * If this is not defined, it is detected automatically.
+- * @ref XXH_X86DISPATCH overrides this.
++ * The internal macro XXH_X86DISPATCH overrides this.
+  */
+ enum XXH_VECTOR_TYPE /* fake enum */ {
+     XXH_SCALAR = 0,  /*!< Portable scalar version */
+     XXH_SSE2   = 1,  /*!<
+                       * SSE2 for Pentium 4, Opteron, all x86_64.
+                       *
+                       * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                       * Android x86.
+                       */
+     XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
+     XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
+-    XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
++    XXH_NEON   = 4,  /*!<
++                       * NEON for most ARMv7-A, all AArch64, and WASM SIMD128
++                       * via the SIMDeverywhere polyfill provided with the
++                       * Emscripten SDK.
++                       */
+     XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
++    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
+ };
+ /*!
+  * @ingroup tuning
+  * @brief Selects the minimum alignment for XXH3's accumulators.
+  *
+- * When using SIMD, this should match the alignment reqired for said vector
++ * When using SIMD, this should match the alignment required for said vector
+  * type, so, for example, 32 for AVX2.
+  *
+  * Default: Auto detected.
+  */
+ #  define XXH_ACC_ALIGN 8
+ #endif
+ 
+ /* Actual definition */
+ #ifndef XXH_DOXYGEN
+ #  define XXH_SCALAR 0
+ #  define XXH_SSE2   1
+ #  define XXH_AVX2   2
+ #  define XXH_AVX512 3
+ #  define XXH_NEON   4
+ #  define XXH_VSX    5
++#  define XXH_SVE    6
+ #endif
+ 
+ #ifndef XXH_VECTOR    /* can be defined on command line */
+-#  if ( \
++#  if defined(__ARM_FEATURE_SVE)
++#    define XXH_VECTOR XXH_SVE
++#  elif ( \
+         defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
+      || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
++     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
+    ) && ( \
+         defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+    )
+ #    define XXH_VECTOR XXH_NEON
+ #  elif defined(__AVX512F__)
+ #    define XXH_VECTOR XXH_AVX512
+ #  elif defined(__AVX2__)
+@@ -2878,16 +3605,27 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
+      || (defined(__s390x__) && defined(__VEC__)) \
+      && defined(__GNUC__) /* TODO: IBM XL */
+ #    define XXH_VECTOR XXH_VSX
+ #  else
+ #    define XXH_VECTOR XXH_SCALAR
+ #  endif
+ #endif
+ 
++/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
++#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
++#  ifdef _MSC_VER
++#    pragma warning(once : 4606)
++#  else
++#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
++#  endif
++#  undef XXH_VECTOR
++#  define XXH_VECTOR XXH_SCALAR
++#endif
++
+ /*
+  * Controls the alignment of the accumulator,
+  * for compatibility with aligned vector loads, which are usually faster.
+  */
+ #ifndef XXH_ACC_ALIGN
+ #  if defined(XXH_X86DISPATCH)
+ #     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+ #  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+@@ -2897,26 +3635,36 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
+ #  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+ #     define XXH_ACC_ALIGN 32
+ #  elif XXH_VECTOR == XXH_NEON  /* neon */
+ #     define XXH_ACC_ALIGN 16
+ #  elif XXH_VECTOR == XXH_VSX   /* vsx */
+ #     define XXH_ACC_ALIGN 16
+ #  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+ #     define XXH_ACC_ALIGN 64
++#  elif XXH_VECTOR == XXH_SVE   /* sve */
++#     define XXH_ACC_ALIGN 64
+ #  endif
+ #endif
+ 
+ #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+     || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+ #  define XXH_SEC_ALIGN XXH_ACC_ALIGN
++#elif XXH_VECTOR == XXH_SVE
++#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+ #else
+ #  define XXH_SEC_ALIGN 8
+ #endif
+ 
++#if defined(__GNUC__) || defined(__clang__)
++#  define XXH_ALIASING __attribute__((may_alias))
++#else
++#  define XXH_ALIASING /* nothing */
++#endif
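++/*
++ * XXH_ALIASING expands to __attribute__((may_alias)) where available, so that
++ * the vector typedefs below (the NEON and VSX accumulator views) may legally
++ * alias the xxh_u64 accumulator array under strict-aliasing rules.
++ */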
++
+ /*
+  * UGLY HACK:
+  * GCC usually generates the best code with -O3 for xxHash.
+  *
+  * However, when targeting AVX2, it is overzealous in its unrolling resulting
+  * in code roughly 3/4 the speed of Clang.
+  *
+  * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+@@ -2930,154 +3678,188 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
+  * for decent performance, or to use Clang instead.
+  *
+  * Fortunately, we can control the first one with a pragma that forces GCC into
+  * -O2, but the other one we can't control without "failed to inline always
+  * inline function due to target mismatch" warnings.
+  */
+ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+   && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+-  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
++  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+ #  pragma GCC push_options
+ #  pragma GCC optimize("-O2")
+ #endif
+ 
+-
+ #if XXH_VECTOR == XXH_NEON
++
+ /*
+- * NEON's setup for vmlal_u32 is a little more complicated than it is on
+- * SSE2, AVX2, and VSX.
+- *
+- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
+- *
+- * To do the same operation, the 128-bit 'Q' register needs to be split into
+- * two 64-bit 'D' registers, performing this operation::
+- *
+- *   [                a                 |                 b                ]
+- *            |              '---------. .--------'                |
+- *            |                         x                          |
+- *            |              .---------' '--------.                |
+- *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
+- *
+- * Due to significant changes in aarch64, the fastest method for aarch64 is
+- * completely different than the fastest method for ARMv7-A.
+- *
+- * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
+- * D11 will modify the high half of Q5. This is similar to how modifying AH
+- * will only affect bits 8-15 of AX on x86.
+- *
+- * VZIP takes two registers, and puts even lanes in one register and odd lanes
+- * in the other.
+- *
+- * On ARMv7-A, this strangely modifies both parameters in place instead of
+- * taking the usual 3-operand form.
+- *
+- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
+- * lower and upper halves of the Q register to end up with the high and low
+- * halves where we want - all in one instruction.
+- *
+- *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
+- *
+- * Unfortunately we need inline assembly for this: Instructions modifying two
+- * registers at once is not possible in GCC or Clang's IR, and they have to
+- * create a copy.
+- *
+- * aarch64 requires a different approach.
+- *
+- * In order to make it easier to write a decent compiler for aarch64, many
+- * quirks were removed, such as conditional execution.
+- *
+- * NEON was also affected by this.
+- *
+- * aarch64 cannot access the high bits of a Q-form register, and writes to a
+- * D-form register zero the high bits, similar to how writes to W-form scalar
+- * registers (or DWORD registers on x86_64) work.
+- *
+- * The formerly free vget_high intrinsics now require a vext (with a few
+- * exceptions)
+- *
+- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
+- * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
+- * operand.
+- *
+- * The equivalent of the VZIP.32 on the lower and upper halves would be this
+- * mess:
+- *
+- *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+- *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+- *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+- *
+- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
+- *
+- *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+- *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+- *
+- * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+- */
++ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
++ * optimizes out the entire hashLong loop because of the aliasing violation.
++ *
++ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
++ * so the only option is to mark it as aliasing.
++ */
++typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
++
++/*!
++ * @internal
++ * @brief `vld1q_u64` but faster and alignment-safe.
++ *
++ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
++ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
++ *
++ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
++ * prohibits load-store optimizations. Therefore, a direct dereference is used.
++ *
++ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u64_u8` to do a safe
++ * unaligned load.
++ */
++#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
++XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
++{
++    return *(xxh_aliasing_uint64x2_t const *)ptr;
++}
++#else
++XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
++{
++    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
++}
++#endif
+ 
+ /*!
+- * Function-like macro:
+- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+- * {
+- *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+- *     outHi = (uint32x2_t)(in >> 32);
+- *     in = UNDEFINED;
+- * }
+- */
+-# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+-   && (defined(__GNUC__) || defined(__clang__)) \
+-   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
+-#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                              \
+-    do {                                                                                    \
+-      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+-      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */     \
+-      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+-      __asm__("vzip.32  %e0, %f0" : "+w" (in));                                             \
+-      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in));                                   \
+-      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                   \
+-   } while (0)
+-# else
+-#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                            \
+-    do {                                                                                  \
+-      (outLo) = vmovn_u64    (in);                                                        \
+-      (outHi) = vshrn_n_u64  ((in), 32);                                                  \
+-    } while (0)
++ * @internal
++ * @brief `vmlal_u32` on low and high halves of a vector.
++ *
++ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
++ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
++ * with `vmlal_u32`.
++ */
++#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
++XXH_FORCE_INLINE uint64x2_t
++XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
++{
++    /* Inline assembly is the only way */
++    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
++    return acc;
++}
++XXH_FORCE_INLINE uint64x2_t
++XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
++{
++    /* This intrinsic works as expected */
++    return vmlal_high_u32(acc, lhs, rhs);
++}
++#else
++/* Portable intrinsic versions */
++XXH_FORCE_INLINE uint64x2_t
++XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
++{
++    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
++}
++/*! @copydoc XXH_vmlal_low_u32
++ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
++XXH_FORCE_INLINE uint64x2_t
++XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
++{
++    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
++}
++#endif
++
++/*!
++ * @ingroup tuning
++ * @brief Controls the NEON to scalar ratio for XXH3
++ *
++ * This can be set to 2, 4, 6, or 8.
++ *
++ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
++ *
++ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
++ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
++ * bandwidth.
++ *
++ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
++ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
++ *
++ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
++ * and 2 scalar lanes, which is chosen by default.
++ *
++ * This does not apply to Apple processors or 32-bit processors, which run better with
++ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
++ *
++ * This change benefits CPUs with large micro-op buffers without negatively affecting
++ * most other CPUs:
++ *
++ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
++ *  |:----------------------|:--------------------|----------:|-----------:|------:|
++ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
++ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
++ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
++ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
++ *
++ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
++ *
++ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes,
++ * meaning it effectively becomes a slower 4-lane configuration.
++ *
++ * @see XXH3_accumulate_512_neon()
++ */
++# ifndef XXH3_NEON_LANES
++#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
++   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
++#   define XXH3_NEON_LANES 6
++#  else
++#   define XXH3_NEON_LANES XXH_ACC_NB
++#  endif
+ # endif
+ #endif  /* XXH_VECTOR == XXH_NEON */
+ 
+ /*
+  * VSX and Z Vector helpers.
+  *
+  * This is very messy, and any pull requests to clean this up are welcome.
+  *
+  * There are a lot of problems with supporting VSX and s390x, due to
+  * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+  */
+ #if XXH_VECTOR == XXH_VSX
++/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
++ * and `pixel`. This is a problem for obvious reasons.
++ *
++ * These keywords are unnecessary; the spec literally says they are
++ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
++ * after including the header.
++ *
++ * We use pragma push_macro/pop_macro to keep the namespace clean. */
++#  pragma push_macro("bool")
++#  pragma push_macro("vector")
++#  pragma push_macro("pixel")
++/* silence potential macro redefined warnings */
++#  undef bool
++#  undef vector
++#  undef pixel
++
+ #  if defined(__s390x__)
+ #    include <s390intrin.h>
+ #  else
+-/* gcc's altivec.h can have the unwanted consequence to unconditionally
+- * #define bool, vector, and pixel keywords,
+- * with bad consequences for programs already using these keywords for other purposes.
+- * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
+- * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
+- * but it seems that, in some cases, it isn't.
+- * Force the build macro to be defined, so that keywords are not altered.
+- */
+-#    if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+-#      define __APPLE_ALTIVEC__
+-#    endif
+ #    include <altivec.h>
+ #  endif
+ 
++/* Restore the original macro values, if applicable. */
++#  pragma pop_macro("pixel")
++#  pragma pop_macro("vector")
++#  pragma pop_macro("bool")
++
+ typedef __vector unsigned long long xxh_u64x2;
+ typedef __vector unsigned char xxh_u8x16;
+ typedef __vector unsigned xxh_u32x4;
+ 
++/*
++ * UGLY HACK: s390x GCC has the same aliasing issue as aarch64 macOS GCC.
++ */
++typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
++
+ # ifndef XXH_VSX_BE
+ #  if defined(__BIG_ENDIAN__) \
+   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #    define XXH_VSX_BE 1
+ #  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+ #    warning "-maltivec=be is not recommended. Please use native endianness."
+ #    define XXH_VSX_BE 1
+ #  else
+@@ -3119,18 +3901,19 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu
+  *
+  * These intrinsics weren't added until GCC 8, despite existing for a while,
+  * and they are endian dependent. Also, their meanings swap depending on version.
+  * */
+ # if defined(__s390x__)
+  /* s390x is always big endian, no issue on this platform */
+ #  define XXH_vec_mulo vec_mulo
+ #  define XXH_vec_mule vec_mule
+-# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
++# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+ /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
++ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+ #  define XXH_vec_mulo __builtin_altivec_vmulouw
+ #  define XXH_vec_mule __builtin_altivec_vmuleuw
+ # else
+ /* gcc needs inline assembly */
+ /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+ {
+     xxh_u64x2 result;
+@@ -3141,23 +3924,38 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(
+ {
+     xxh_u64x2 result;
+     __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+     return result;
+ }
+ # endif /* XXH_vec_mulo, XXH_vec_mule */
+ #endif /* XXH_VECTOR == XXH_VSX */
+ 
++#if XXH_VECTOR == XXH_SVE
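++/*
++ * ACCRND() below accumulates one predicate-governed SVE vector of 64-bit
++ * lanes. Per lane it mirrors the scalar round, roughly:
++ *
++ *   mixed = input ^ secret;
++ *   acc  += lane_swap(input) + lo32(mixed) * hi32(mixed);
++ *
++ * (lane_swap/lo32/hi32 are shorthand for svtbl/svextw/svlsr, not real helpers.)
++ */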
++#define ACCRND(acc, offset) \
++do { \
++    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
++    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
++    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
++    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
++    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
++    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
++    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
++    acc = svadd_u64_x(mask, acc, mul);                               \
++} while (0)
++#endif /* XXH_VECTOR == XXH_SVE */
+ 
+ /* prefetch
+  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+ #if defined(XXH_NO_PREFETCH)
+ #  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+ #else
+-#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
++#  if XXH_SIZE_OPT >= 1
++#    define XXH_PREFETCH(ptr) (void)(ptr)
++#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+ #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+ #    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+ #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+ #    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+ #  else
+ #    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+ #  endif
+ #endif  /* XXH_NO_PREFETCH */
+@@ -3184,16 +3982,18 @@ XXH_ALIGN(64) static const xxh_u8 XXH3_k
+     0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+     0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+     0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+     0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+     0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+     0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+ };
+ 
++static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
++static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define kSecret XXH3_kSecret
+ #endif
+ 
+ #ifdef XXH_DOXYGEN
+ /*!
+  * @brief Calculates a 32-bit to 64-bit long multiply.
+@@ -3375,46 +4175,46 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs
+ static xxh_u64
+ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+ {
+     XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+     return product.low64 ^ product.high64;
+ }
+ 
+ /*! Seems to produce slightly better code on GCC for some reason. */
+-XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
++XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+ {
+     XXH_ASSERT(0 <= shift && shift < 64);
+     return v64 ^ (v64 >> shift);
+ }
+ 
+ /*
+  * This is a fast avalanche stage,
+  * suitable when input bits are already partially mixed
+  */
+ static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+ {
+     h64 = XXH_xorshift64(h64, 37);
+-    h64 *= 0x165667919E3779F9ULL;
++    h64 *= PRIME_MX1;
+     h64 = XXH_xorshift64(h64, 32);
+     return h64;
+ }
+ 
+ /*
+  * This is a stronger avalanche,
+  * inspired by Pelle Evensen's rrmxmx
+  * preferable when input has not been previously mixed
+  */
+ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+ {
+     /* this mix is inspired by Pelle Evensen's rrmxmx */
+     h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+-    h64 *= 0x9FB21C651E98DF25ULL;
++    h64 *= PRIME_MX2;
+     h64 ^= (h64 >> 35) + len ;
+-    h64 *= 0x9FB21C651E98DF25ULL;
++    h64 *= PRIME_MX2;
+     return XXH_xorshift64(h64, 28);
+ }
+ 
+ 
+ /* ==========================================
+  * Short keys
+  * ==========================================
+  * One of the shortcomings of XXH32 and XXH64 was that their performance was
+@@ -3442,17 +4242,17 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 
+  * samples with an XOR. This should have no effect on performance on the
+  * seedless or withSeed variants because everything _should_ be constant folded
+  * by modern compilers.
+  *
+  * The XOR mixing hides individual parts of the secret and increases entropy.
+  *
+  * This adds an extra layer of strength for custom secrets.
+  */
+-XXH_FORCE_INLINE XXH64_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(1 <= len && len <= 3);
+     XXH_ASSERT(secret != NULL);
+     /*
+      * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+      * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+@@ -3464,50 +4264,50 @@ XXH3_len_1to3_64b(const xxh_u8* input, s
+         xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
+                                | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
+         xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+         xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+         return XXH64_avalanche(keyed);
+     }
+ }
+ 
+-XXH_FORCE_INLINE XXH64_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(secret != NULL);
+     XXH_ASSERT(4 <= len && len <= 8);
+     seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+     {   xxh_u32 const input1 = XXH_readLE32(input);
+         xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+         xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+         xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+         xxh_u64 const keyed = input64 ^ bitflip;
+         return XXH3_rrmxmx(keyed, len);
+     }
+ }
+ 
+-XXH_FORCE_INLINE XXH64_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(secret != NULL);
+     XXH_ASSERT(9 <= len && len <= 16);
+     {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+         xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+         xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
+         xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+         xxh_u64 const acc = len
+                           + XXH_swap64(input_lo) + input_hi
+                           + XXH3_mul128_fold64(input_lo, input_hi);
+         return XXH3_avalanche(acc);
+     }
+ }
+ 
+-XXH_FORCE_INLINE XXH64_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(len <= 16);
+     {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+         if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+         if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+         return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+     }
+@@ -3567,65 +4367,77 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(con
+         return XXH3_mul128_fold64(
+             input_lo ^ (XXH_readLE64(secret)   + seed64),
+             input_hi ^ (XXH_readLE64(secret+8) - seed64)
+         );
+     }
+ }
+ 
+ /* For mid range keys, XXH3 uses a Mum-hash variant. */
+-XXH_FORCE_INLINE XXH64_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+ {
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+     XXH_ASSERT(16 < len && len <= 128);
+ 
+     {   xxh_u64 acc = len * XXH_PRIME64_1;
++#if XXH_SIZE_OPT >= 1
++        /* Smaller and cleaner, but slightly slower. */
++        unsigned int i = (unsigned int)(len - 1) / 32;
++        do {
++            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
++            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
++        } while (i-- != 0);
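++        /* The loop above walks inward from both ends in 16-byte pairs, touching
++         * the same input/secret offsets as the unrolled branch below. */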
++#else
+         if (len > 32) {
+             if (len > 64) {
+                 if (len > 96) {
+                     acc += XXH3_mix16B(input+48, secret+96, seed);
+                     acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                 }
+                 acc += XXH3_mix16B(input+32, secret+64, seed);
+                 acc += XXH3_mix16B(input+len-48, secret+80, seed);
+             }
+             acc += XXH3_mix16B(input+16, secret+32, seed);
+             acc += XXH3_mix16B(input+len-32, secret+48, seed);
+         }
+         acc += XXH3_mix16B(input+0, secret+0, seed);
+         acc += XXH3_mix16B(input+len-16, secret+16, seed);
+-
++#endif
+         return XXH3_avalanche(acc);
+     }
+ }
+ 
+ #define XXH3_MIDSIZE_MAX 240
+ 
+-XXH_NO_INLINE XXH64_hash_t
++XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+ {
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+     XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+ 
+     #define XXH3_MIDSIZE_STARTOFFSET 3
+     #define XXH3_MIDSIZE_LASTOFFSET  17
+ 
+     {   xxh_u64 acc = len * XXH_PRIME64_1;
+-        int const nbRounds = (int)len / 16;
+-        int i;
++        xxh_u64 acc_end;
++        unsigned int const nbRounds = (unsigned int)len / 16;
++        unsigned int i;
++        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+         for (i=0; i<8; i++) {
+             acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+         }
++        /* last bytes */
++        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
++        XXH_ASSERT(nbRounds >= 8);
+         acc = XXH3_avalanche(acc);
+-        XXH_ASSERT(nbRounds >= 8);
+ #if defined(__clang__)                                /* Clang */ \
+     && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+     && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+         /*
+          * UGLY HACK:
+          * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+          * In everywhere else, it uses scalar code.
+          *
+@@ -3642,36 +4454,79 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_
+          *
+          * This loop is the easiest to fix, as unlike XXH32, this pragma
+          * _actually works_ because it is a loop vectorization instead of an
+          * SLP vectorization.
+          */
+         #pragma clang loop vectorize(disable)
+ #endif
+         for (i=8 ; i < nbRounds; i++) {
+-            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
++            /*
++             * Prevents clang from unrolling the acc loop and interleaving with this one.
++             */
++            XXH_COMPILER_GUARD(acc);
++            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+         }
+-        /* last bytes */
+-        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+-        return XXH3_avalanche(acc);
++        return XXH3_avalanche(acc + acc_end);
+     }
+ }
+ 
+ 
+ /* =======     Long Keys     ======= */
+ 
+ #define XXH_STRIPE_LEN 64
+ #define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+ #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+ 
+ #ifdef XXH_OLD_NAMES
+ #  define STRIPE_LEN XXH_STRIPE_LEN
+ #  define ACC_NB XXH_ACC_NB
+ #endif
+ 
++#ifndef XXH_PREFETCH_DIST
++#  ifdef __clang__
++#    define XXH_PREFETCH_DIST 320
++#  else
++#    if (XXH_VECTOR == XXH_AVX512)
++#      define XXH_PREFETCH_DIST 512
++#    else
++#      define XXH_PREFETCH_DIST 384
++#    endif
++#  endif  /* __clang__ */
++#endif  /* XXH_PREFETCH_DIST */
++
++/*
++ * These macros are to generate an XXH3_accumulate() function.
++ * The two arguments select the name suffix and target attribute.
++ *
++ * The name of this symbol is XXH3_accumulate_<name>() and it calls
++ * XXH3_accumulate_512_<name>().
++ *
++ * It may be useful to hand implement this function if the compiler fails to
++ * optimize the inline function.
++ */
++#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
++void                                                        \
++XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
++                       const xxh_u8* XXH_RESTRICT input,    \
++                       const xxh_u8* XXH_RESTRICT secret,   \
++                       size_t nbStripes)                    \
++{                                                           \
++    size_t n;                                               \
++    for (n = 0; n < nbStripes; n++ ) {                      \
++        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
++        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
++        XXH3_accumulate_512_##name(                         \
++                 acc,                                       \
++                 in,                                        \
++                 secret + n*XXH_SECRET_CONSUME_RATE);       \
++    }                                                       \
++}
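++/*
++ * For illustration, `XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)`
++ * expands to a function
++ *
++ *   XXH_FORCE_INLINE XXH_TARGET_SSE2 void
++ *   XXH3_accumulate_sse2(xxh_u64* XXH_RESTRICT acc,
++ *                        const xxh_u8* XXH_RESTRICT input,
++ *                        const xxh_u8* XXH_RESTRICT secret,
++ *                        size_t nbStripes);
++ *
++ * which prefetches XXH_PREFETCH_DIST bytes ahead and calls
++ * XXH3_accumulate_512_sse2() once per 64-byte (XXH_STRIPE_LEN) stripe.
++ */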
++
++
+ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+ {
+     if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+     XXH_memcpy(dst, &v64, sizeof(v64));
+ }
+ 
+ /* Several intrinsic functions below are supposed to accept __int64 as argument,
+  * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+@@ -3682,16 +4537,17 @@ XXH_FORCE_INLINE void XXH_writeLE64(void
+   && (defined (__cplusplus) \
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+     typedef int64_t xxh_i64;
+ #else
+     /* the following type must have a width of 64-bit */
+     typedef long long xxh_i64;
+ #endif
+ 
++
+ /*
+  * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+  *
+  * It is a hardened version of UMAC, based off of FARSH's implementation.
+  *
+  * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+  * implementations, and it is ridiculously fast.
+  *
+@@ -3729,26 +4585,27 @@ XXH3_accumulate_512_avx512(void* XXH_RES
+     {
+         /* data_vec    = input[0]; */
+         __m512i const data_vec    = _mm512_loadu_si512   (input);
+         /* key_vec     = secret[0]; */
+         __m512i const key_vec     = _mm512_loadu_si512   (secret);
+         /* data_key    = data_vec ^ key_vec; */
+         __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+         /* data_key_lo = data_key >> 32; */
+-        __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
++        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
+         /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+         __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+         /* xacc[0] += swap(data_vec); */
+         __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+         __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+         /* xacc[0] += product; */
+         *xacc = _mm512_add_epi64(product, sum);
+     }
+ }
++XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
+ 
+ /*
+  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+  *
+  * Multiplication isn't perfect, as explained by Google in HighwayHash:
+  *
+  *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+  *  // varying degrees. In descending order of goodness, bytes
+@@ -3772,53 +4629,46 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRI
+     XXH_ASSERT((((size_t)acc) & 63) == 0);
+     XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+     {   __m512i* const xacc = (__m512i*) acc;
+         const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+ 
+         /* xacc[0] ^= (xacc[0] >> 47) */
+         __m512i const acc_vec     = *xacc;
+         __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+-        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
+         /* xacc[0] ^= secret; */
+         __m512i const key_vec     = _mm512_loadu_si512   (secret);
+-        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
++        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
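++        /* 0x96 is the truth table for a three-input XOR, letting AVX-512 fold
++         * the two XORs (acc ^ shifted ^ key) into a single vpternlog. */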
+ 
+         /* xacc[0] *= XXH_PRIME32_1; */
+-        __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
++        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
+         __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+         __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+         *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+     }
+ }
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+ {
+     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+     XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+     XXH_ASSERT(((size_t)customSecret & 63) == 0);
+     (void)(&XXH_writeLE64);
+     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+-        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
++        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
++        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
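++        /* Mask 0xAA selects the odd 64-bit lanes, so `seed` becomes
++         * { seed64, -seed64, seed64, -seed64, ... } without a separate negation. */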
+ 
+         const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+               __m512i* const dest = (      __m512i*) customSecret;
+         int i;
+         XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
+         XXH_ASSERT(((size_t)dest & 63) == 0);
+         for (i=0; i < nbRounds; ++i) {
+-            /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
+-             * this will warn "discards 'const' qualifier". */
+-            union {
+-                const __m512i* cp;
+-                void* p;
+-            } remote_const_void;
+-            remote_const_void.cp = src + i;
+-            dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
++            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
+     }   }
+ }
+ 
+ #endif
+ 
+ #if (XXH_VECTOR == XXH_AVX2) \
+     || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+ 
+@@ -3844,26 +4694,27 @@ XXH3_accumulate_512_avx2( void* XXH_REST
+         for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+             /* data_vec    = xinput[i]; */
+             __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+             /* key_vec     = xsecret[i]; */
+             __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+             /* data_key    = data_vec ^ key_vec; */
+             __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+             /* data_key_lo = data_key >> 32; */
+-            __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
++            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+             /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+             __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+             /* xacc[i] += swap(data_vec); */
+             __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+             __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+             /* xacc[i] += product; */
+             xacc[i] = _mm256_add_epi64(product, sum);
+     }   }
+ }
++XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 31) == 0);
+     {   __m256i* const xacc = (__m256i*) acc;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+@@ -3876,17 +4727,17 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT
+             __m256i const acc_vec     = xacc[i];
+             __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+             __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+             /* xacc[i] ^= xsecret; */
+             __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+             __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+ 
+             /* xacc[i] *= XXH_PRIME32_1; */
+-            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
++            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+             __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+             __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+             xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+         }
+     }
+ }
+ 
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+@@ -3908,22 +4759,22 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XX
+          *   - use less common registers, and avoid pushing these reg into stack
+          */
+         XXH_COMPILER_GUARD(dest);
+ #       endif
+         XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
+         XXH_ASSERT(((size_t)dest & 31) == 0);
+ 
+         /* GCC -O2 need unroll loop manually */
+-        dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
+-        dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
+-        dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
+-        dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
+-        dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
+-        dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
++        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
++        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
++        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
++        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
++        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
++        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
+     }
+ }
+ 
+ #endif
+ 
+ /* x86dispatch always generates SSE2 */
+ #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+ 
+@@ -3960,16 +4811,17 @@ XXH3_accumulate_512_sse2( void* XXH_REST
+             __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+             /* xacc[i] += swap(data_vec); */
+             __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+             __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+             /* xacc[i] += product; */
+             xacc[i] = _mm_add_epi64(product, sum);
+     }   }
+ }
++XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
+ 
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+ XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+     {   __m128i* const xacc = (__m128i*) acc;
+         /* Unaligned. This is mainly for pointer arithmetic, and because
+          * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+@@ -4027,389 +4879,687 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XX
+             dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+     }   }
+ }
+ 
+ #endif
+ 
+ #if (XXH_VECTOR == XXH_NEON)
+ 
++/* forward declarations for the scalar routines */
++XXH_FORCE_INLINE void
++XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
++                 void const* XXH_RESTRICT secret, size_t lane);
++
++XXH_FORCE_INLINE void
++XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
++                         void const* XXH_RESTRICT secret, size_t lane);
++
++/*!
++ * @internal
++ * @brief The bulk processing loop for NEON and WASM SIMD128.
++ *
++ * The NEON code path is actually partially scalar when running on AArch64. This
++ * is to optimize the pipelining and can have up to 15% speedup depending on the
++ * CPU, and it also mitigates some GCC codegen issues.
++ *
++ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
++ *
++ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
++ * integers instead of the other platforms which mask full 64-bit vectors,
++ * so the setup is more complicated than just shifting right.
++ *
++ * Additionally, there is an optimization for 4 lanes at once noted below.
++ *
++ * Since, as stated, the optimal number of lanes for Cortexes is 6,
++ * there needs to be *three* versions of the accumulate operation used
++ * for the remaining 2 lanes.
++ *
++ * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
++ * nearly perfectly.
++ */
++
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+-    {
+-        uint64x2_t* const xacc = (uint64x2_t *) acc;
++    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
++    {   /* GCC for darwin arm64 does not like aliasing here */
++        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
+         /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+-        uint8_t const* const xinput = (const uint8_t *) input;
+-        uint8_t const* const xsecret  = (const uint8_t *) secret;
++        uint8_t const* xinput = (const uint8_t *) input;
++        uint8_t const* xsecret  = (const uint8_t *) secret;
+ 
+         size_t i;
+-        for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
++#ifdef __wasm_simd128__
++        /*
++         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
++         * is constant propagated, which results in it converting it to this
++         * inside the loop:
++         *
++         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
++         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
++         *    ...
++         *
++         * This requires a full 32-bit address immediate (and therefore a 6 byte
++         * instruction) as well as an add for each offset.
++         *
++         * Putting an asm guard prevents it from folding (at the cost of losing
++         * the alignment hint), and uses the free offset in `v128.load` instead
++         * of adding secret_offset each time which overall reduces code size by
++         * about a kilobyte and improves performance.
++         */
++        XXH_COMPILER_GUARD(xsecret);
++#endif
++        /* Scalar lanes use the normal scalarRound routine */
++        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
++            XXH3_scalarRound(acc, input, secret, i);
++        }
++        i = 0;
++        /* 4 NEON lanes at a time. */
++        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+             /* data_vec = xinput[i]; */
+-            uint8x16_t data_vec    = vld1q_u8(xinput  + (i * 16));
++            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
++            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+             /* key_vec  = xsecret[i];  */
+-            uint8x16_t key_vec     = vld1q_u8(xsecret + (i * 16));
+-            uint64x2_t data_key;
+-            uint32x2_t data_key_lo, data_key_hi;
+-            /* xacc[i] += swap(data_vec); */
+-            uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
+-            uint64x2_t const swapped = vextq_u64(data64, data64, 1);
+-            xacc[i] = vaddq_u64 (xacc[i], swapped);
++            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
++            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
++            /* data_swap = swap(data_vec) */
++            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
++            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+             /* data_key = data_vec ^ key_vec; */
+-            data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+-            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
+-             * data_key_hi = (uint32x2_t) (data_key >> 32);
+-             * data_key = UNDEFINED; */
+-            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+-            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+-            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
+-
++            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
++            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
++
++            /*
++             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
++             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
++             * get one vector with the low 32 bits of each lane, and one vector
++             * with the high 32 bits of each lane.
++             *
++             * The intrinsic returns a double vector because the original ARMv7-a
++             * instruction modified both arguments in place. AArch64 and SIMD128 emit
++             * two instructions from this intrinsic.
++             *
++             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
++             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
++             */
++            uint32x4x2_t unzipped = vuzpq_u32(
++                vreinterpretq_u32_u64(data_key_1),
++                vreinterpretq_u32_u64(data_key_2)
++            );
++            /* data_key_lo = data_key & 0xFFFFFFFF */
++            uint32x4_t data_key_lo = unzipped.val[0];
++            /* data_key_hi = data_key >> 32 */
++            uint32x4_t data_key_hi = unzipped.val[1];
++            /*
++             * Then, we can split the vectors horizontally and multiply which, as for most
++             * widening intrinsics, have a variant that works on both high half vectors
++             * for free on AArch64. A similar instruction is available on SIMD128.
++             *
++             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
++             */
++            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
++            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
++            /*
++             * Clang reorders
++             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
++             *    c += a;         // add     acc.2d, acc.2d, swap.2d
++             * to
++             *    c += a;         // add     acc.2d, acc.2d, swap.2d
++             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
++             *
++             * While it would make sense in theory since the addition is faster,
++             * for reasons likely related to umlal being limited to certain NEON
++             * pipelines, this is worse. A compiler guard fixes this.
++             */
++            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
++            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
++            /* xacc[i] = acc_vec + sum; */
++            xacc[i]   = vaddq_u64(xacc[i], sum_1);
++            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
++        }
++        /* Operate on the remaining NEON lanes 2 at a time. */
++        for (; i < XXH3_NEON_LANES / 2; i++) {
++            /* data_vec = xinput[i]; */
++            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
++            /* key_vec  = xsecret[i];  */
++            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
++            /* data_swap = swap(data_vec) */
++            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
++            /* data_key = data_vec ^ key_vec; */
++            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
++            /* For two lanes, just use VMOVN and VSHRN. */
++            /* data_key_lo = data_key & 0xFFFFFFFF; */
++            uint32x2_t data_key_lo = vmovn_u64(data_key);
++            /* data_key_hi = data_key >> 32; */
++            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
++            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
++            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
++            /* Same Clang workaround as before */
++            XXH_COMPILER_GUARD_CLANG_NEON(sum);
++            /* xacc[i] = acc_vec + sum; */
++            xacc[i] = vaddq_u64 (xacc[i], sum);
+         }
+     }
+ }
++XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
+ 
+ XXH_FORCE_INLINE void
+ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+ 
+-    {   uint64x2_t* xacc       = (uint64x2_t*) acc;
++    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
+         uint8_t const* xsecret = (uint8_t const*) secret;
+-        uint32x2_t prime       = vdup_n_u32 (XXH_PRIME32_1);
+ 
+         size_t i;
+-        for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
++        /* WASM uses operator overloads and doesn't need these. */
++#ifndef __wasm_simd128__
++        /* { prime32_1, prime32_1 } */
++        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
++        /* { 0, prime32_1, 0, prime32_1 } */
++        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
++#endif
++
++        /* AArch64 uses both scalar and neon at the same time */
++        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
++            XXH3_scalarScrambleRound(acc, secret, i);
++        }
++        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
+             /* xacc[i] ^= (xacc[i] >> 47); */
+             uint64x2_t acc_vec  = xacc[i];
+-            uint64x2_t shifted  = vshrq_n_u64 (acc_vec, 47);
+-            uint64x2_t data_vec = veorq_u64   (acc_vec, shifted);
++            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
++            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+ 
+             /* xacc[i] ^= xsecret[i]; */
+-            uint8x16_t key_vec  = vld1q_u8    (xsecret + (i * 16));
+-            uint64x2_t data_key = veorq_u64   (data_vec, vreinterpretq_u64_u8(key_vec));
+-
++            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
++            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+             /* xacc[i] *= XXH_PRIME32_1 */
+-            uint32x2_t data_key_lo, data_key_hi;
+-            /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
+-             * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
+-             * xacc[i] = UNDEFINED; */
+-            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+-            {   /*
+-                 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+-                 *
+-                 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+-                 * incorrectly "optimize" this:
+-                 *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+-                 *   shifted = vshll_n_u32(tmp, 32);
+-                 * to this:
+-                 *   tmp     = "vmulq_u64"(a, b); // no such thing!
+-                 *   shifted = vshlq_n_u64(tmp, 32);
+-                 *
+-                 * However, unlike SSE, Clang lacks a 64-bit multiply routine
+-                 * for NEON, and it scalarizes two 64-bit multiplies instead.
+-                 *
+-                 * vmull_u32 has the same timing as vmul_u32, and it avoids
+-                 * this bug completely.
+-                 * See https://bugs.llvm.org/show_bug.cgi?id=39967
+-                 */
+-                uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
+-                /* xacc[i] = prod_hi << 32; */
+-                xacc[i] = vshlq_n_u64(prod_hi, 32);
+-                /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
+-                xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
+-            }
+-    }   }
+-}
+-
++#ifdef __wasm_simd128__
++            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
++            xacc[i] = data_key * XXH_PRIME32_1;
++#else
++            /*
++             * Expanded version with portable NEON intrinsics
++             *
++             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
++             *
++             * prod_hi = hi(data_key) * lo(prime) << 32
++             *
++             * Since we only need 32 bits of this multiply, a trick can be used: reinterpret the
++             * vector as a uint32x4_t and multiply by { 0, prime, 0, prime } to cancel out the
++             * unwanted bits and avoid the shift.
++             */
++            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
++            /* Extract low bits for vmlal_u32  */
++            uint32x2_t data_key_lo = vmovn_u64(data_key);
++            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
++            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
++#endif
++        }
++    }
++}
+ #endif
+ 
+ #if (XXH_VECTOR == XXH_VSX)
+ 
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+ {
+     /* presumed aligned */
+-    unsigned int* const xacc = (unsigned int*) acc;
+-    xxh_u64x2 const* const xinput   = (xxh_u64x2 const*) input;   /* no alignment restriction */
+-    xxh_u64x2 const* const xsecret  = (xxh_u64x2 const*) secret;    /* no alignment restriction */
++    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
++    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
++    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
+     xxh_u64x2 const v32 = { 32, 32 };
+     size_t i;
+     for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+         /* data_vec = xinput[i]; */
+-        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
++        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
+         /* key_vec = xsecret[i]; */
+-        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
++        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+         xxh_u64x2 const data_key = data_vec ^ key_vec;
+         /* shuffled = (data_key << 32) | (data_key >> 32); */
+         xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+         /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+         xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+         /* acc_vec = xacc[i]; */
+-        xxh_u64x2 acc_vec        = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
++        xxh_u64x2 acc_vec        = xacc[i];
+         acc_vec += product;
+ 
+         /* swap high and low halves */
+ #ifdef __s390x__
+         acc_vec += vec_permi(data_vec, data_vec, 2);
+ #else
+         acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+ #endif
+-        /* xacc[i] = acc_vec; */
+-        vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
++        xacc[i] = acc_vec;
+     }
+ }
++XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
+ 
+ XXH_FORCE_INLINE void
+ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+ {
+     XXH_ASSERT((((size_t)acc) & 15) == 0);
+ 
+-    {         xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;
+-        const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
++    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
++        const xxh_u8* const xsecret = (const xxh_u8*) secret;
+         /* constants */
+         xxh_u64x2 const v32  = { 32, 32 };
+         xxh_u64x2 const v47 = { 47, 47 };
+         xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+         size_t i;
+         for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+             /* xacc[i] ^= (xacc[i] >> 47); */
+             xxh_u64x2 const acc_vec  = xacc[i];
+             xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+ 
+             /* xacc[i] ^= xsecret[i]; */
+-            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
++            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+             xxh_u64x2 const data_key = data_vec ^ key_vec;
+ 
+             /* xacc[i] *= XXH_PRIME32_1 */
+             /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
+             xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+             /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
+             xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+             xacc[i] = prod_odd + (prod_even << v32);
+     }   }
+ }
+ 
+ #endif
+ 
++#if (XXH_VECTOR == XXH_SVE)
++
++XXH_FORCE_INLINE void
++XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
++                   const void* XXH_RESTRICT input,
++                   const void* XXH_RESTRICT secret)
++{
++    uint64_t *xacc = (uint64_t *)acc;
++    const uint64_t *xinput = (const uint64_t *)(const void *)input;
++    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
++    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
++    uint64_t element_count = svcntd();
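++    /*
++     * svcntd() gives the number of 64-bit lanes per SVE vector, so the
++     * branches below cover >=512-bit, 128-bit and 256-bit vector lengths.
++     * ACCRND() is the accumulate-round macro defined earlier in this file;
++     * it reads xinput/xsecret/kSwap under the active mask.
++     */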
++    if (element_count >= 8) {
++        svbool_t mask = svptrue_pat_b64(SV_VL8);
++        svuint64_t vacc = svld1_u64(mask, xacc);
++        ACCRND(vacc, 0);
++        svst1_u64(mask, xacc, vacc);
++    } else if (element_count == 2) {   /* sve128 */
++        svbool_t mask = svptrue_pat_b64(SV_VL2);
++        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
++        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
++        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
++        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
++        ACCRND(acc0, 0);
++        ACCRND(acc1, 2);
++        ACCRND(acc2, 4);
++        ACCRND(acc3, 6);
++        svst1_u64(mask, xacc + 0, acc0);
++        svst1_u64(mask, xacc + 2, acc1);
++        svst1_u64(mask, xacc + 4, acc2);
++        svst1_u64(mask, xacc + 6, acc3);
++    } else {
++        svbool_t mask = svptrue_pat_b64(SV_VL4);
++        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
++        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
++        ACCRND(acc0, 0);
++        ACCRND(acc1, 4);
++        svst1_u64(mask, xacc + 0, acc0);
++        svst1_u64(mask, xacc + 4, acc1);
++    }
++}
++
++XXH_FORCE_INLINE void
++XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
++               const xxh_u8* XXH_RESTRICT input,
++               const xxh_u8* XXH_RESTRICT secret,
++               size_t nbStripes)
++{
++    if (nbStripes != 0) {
++        uint64_t *xacc = (uint64_t *)acc;
++        const uint64_t *xinput = (const uint64_t *)(const void *)input;
++        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
++        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
++        uint64_t element_count = svcntd();
++        if (element_count >= 8) {
++            svbool_t mask = svptrue_pat_b64(SV_VL8);
++            svuint64_t vacc = svld1_u64(mask, xacc + 0);
++            do {
++                /* svprfd(svbool_t, void *, enum svfprop); */
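++                /* prefetch 1 KiB ahead (xinput is uint64_t*, so +128 is 1024 bytes); streaming hint into L1 */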
++                svprfd(mask, xinput + 128, SV_PLDL1STRM);
++                ACCRND(vacc, 0);
++                xinput += 8;
++                xsecret += 1;
++                nbStripes--;
++           } while (nbStripes != 0);
++
++           svst1_u64(mask, xacc + 0, vacc);
++        } else if (element_count == 2) { /* sve128 */
++            svbool_t mask = svptrue_pat_b64(SV_VL2);
++            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
++            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
++            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
++            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
++            do {
++                svprfd(mask, xinput + 128, SV_PLDL1STRM);
++                ACCRND(acc0, 0);
++                ACCRND(acc1, 2);
++                ACCRND(acc2, 4);
++                ACCRND(acc3, 6);
++                xinput += 8;
++                xsecret += 1;
++                nbStripes--;
++           } while (nbStripes != 0);
++
++           svst1_u64(mask, xacc + 0, acc0);
++           svst1_u64(mask, xacc + 2, acc1);
++           svst1_u64(mask, xacc + 4, acc2);
++           svst1_u64(mask, xacc + 6, acc3);
++        } else {
++            svbool_t mask = svptrue_pat_b64(SV_VL4);
++            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
++            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
++            do {
++                svprfd(mask, xinput + 128, SV_PLDL1STRM);
++                ACCRND(acc0, 0);
++                ACCRND(acc1, 4);
++                xinput += 8;
++                xsecret += 1;
++                nbStripes--;
++           } while (nbStripes != 0);
++
++           svst1_u64(mask, xacc + 0, acc0);
++           svst1_u64(mask, xacc + 4, acc1);
++       }
++    }
++}
++
++#endif
++
+ /* scalar variants - universal */
+ 
++#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
++/*
++ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
++ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
++ *
++ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
++ * big Cortex designs have a full 64-bit multiplier.
++ *
++ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
++ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
++ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
++ *
++ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
++ * not have this penalty and does the mask automatically.
++ */
++XXH_FORCE_INLINE xxh_u64
++XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
++{
++    xxh_u64 ret;
++    /* note: %x = 64-bit register, %w = 32-bit register */
++    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
++    return ret;
++}
++#else
++XXH_FORCE_INLINE xxh_u64
++XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
++{
++    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
++}
++#endif
++
++/*!
++ * @internal
++ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
++ *
++ * This is extracted to its own function because the NEON path uses a combination
++ * of NEON and scalar.
++ */
++XXH_FORCE_INLINE void
++XXH3_scalarRound(void* XXH_RESTRICT acc,
++                 void const* XXH_RESTRICT input,
++                 void const* XXH_RESTRICT secret,
++                 size_t lane)
++{
++    xxh_u64* xacc = (xxh_u64*) acc;
++    xxh_u8 const* xinput  = (xxh_u8 const*) input;
++    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
++    XXH_ASSERT(lane < XXH_ACC_NB);
++    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
++    {
++        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
++        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
++        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
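++        /* acc[lane] += lo(data_key) * hi(data_key); XXH_mult32to64_add64 keeps this as one UMADDL on AArch64 GCC/Clang */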
++        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
++    }
++}
++
++/*!
++ * @internal
++ * @brief Processes a 64 byte block of data using the scalar path.
++ */
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                      const void* XXH_RESTRICT input,
+                      const void* XXH_RESTRICT secret)
+ {
+-    xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
+-    const xxh_u8* const xinput  = (const xxh_u8*) input;  /* no alignment restriction */
+-    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+     size_t i;
+-    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
++    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
++#if defined(__GNUC__) && !defined(__clang__) \
++  && (defined(__arm__) || defined(__thumb2__)) \
++  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
++  && XXH_SIZE_OPT <= 0
++#  pragma GCC unroll 8
++#endif
+     for (i=0; i < XXH_ACC_NB; i++) {
+-        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
+-        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
+-        xacc[i ^ 1] += data_val; /* swap adjacent lanes */
+-        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
++        XXH3_scalarRound(acc, input, secret, i);
+     }
+ }
+-
++XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
++
++/*!
++ * @internal
++ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
++ *
++ * This is extracted to its own function because the NEON path uses a combination
++ * of NEON and scalar.
++ */
+ XXH_FORCE_INLINE void
+-XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
++XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
++                         void const* XXH_RESTRICT secret,
++                         size_t lane)
+ {
+     xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+     const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+-    size_t i;
+     XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+-    for (i=0; i < XXH_ACC_NB; i++) {
+-        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
+-        xxh_u64 acc64 = xacc[i];
++    XXH_ASSERT(lane < XXH_ACC_NB);
++    {
++        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
++        xxh_u64 acc64 = xacc[lane];
+         acc64 = XXH_xorshift64(acc64, 47);
+         acc64 ^= key64;
+         acc64 *= XXH_PRIME32_1;
+-        xacc[i] = acc64;
++        xacc[lane] = acc64;
++    }
++}
++
++/*!
++ * @internal
++ * @brief Scrambles the accumulators after a large chunk has been read
++ */
++XXH_FORCE_INLINE void
++XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
++{
++    size_t i;
++    for (i=0; i < XXH_ACC_NB; i++) {
++        XXH3_scalarScrambleRound(acc, secret, i);
+     }
+ }
+ 
+ XXH_FORCE_INLINE void
+ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+ {
+     /*
+      * We need a separate pointer for the hack below,
+      * which requires a non-const pointer.
+      * Any decent compiler will optimize this out otherwise.
+      */
+     const xxh_u8* kSecretPtr = XXH3_kSecret;
+     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+ 
+-#if defined(__clang__) && defined(__aarch64__)
++#if defined(__GNUC__) && defined(__aarch64__)
+     /*
+      * UGLY HACK:
+-     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
++     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+      * placed sequentially, in order, at the top of the unrolled loop.
+      *
+      * While MOVK is great for generating constants (2 cycles for a 64-bit
+-     * constant compared to 4 cycles for LDR), long MOVK chains stall the
+-     * integer pipelines:
++     * constant compared to 4 cycles for LDR), it fights for bandwidth with
++     * the arithmetic instructions.
++     *
+      *   I   L   S
+      * MOVK
+      * MOVK
+      * MOVK
+      * MOVK
+      * ADD
+      * SUB      STR
+      *          STR
+-     * By forcing loads from memory (as the asm line causes Clang to assume
++     * By forcing loads from memory (as the asm line causes the compiler to assume
+      * that XXH3_kSecretPtr has been changed), the pipelines are used more
+      * efficiently:
+      *   I   L   S
+      *      LDR
+      *  ADD LDR
+      *  SUB     STR
+      *          STR
++     *
++     * See XXH3_NEON_LANES for details on the pipeline.
++     *
+      * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+      *   without hack: 2654.4 MB/s
+      *   with hack:    3202.9 MB/s
+      */
+     XXH_COMPILER_GUARD(kSecretPtr);
+ #endif
+-    /*
+-     * Note: in debug mode, this overrides the asm optimization
+-     * and Clang will emit MOVK chains again.
+-     */
+-    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
+-
+     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+         int i;
+         for (i=0; i < nbRounds; i++) {
+             /*
+-             * The asm hack causes Clang to assume that kSecretPtr aliases with
++             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+              * customSecret, and on aarch64, this prevented LDP from merging two
+              * loads together for free. Putting the loads together before the stores
+              * properly generates LDP.
+              */
+             xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+             xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+             XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+             XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+     }   }
+ }
+ 
+ 
+-typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
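++/* Note: an XXH3_f_accumulate implementation processes nbStripes stripes per
++ * call, unlike the per-stripe XXH3_accumulate_512 functions. */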
++typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
+ typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
+ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
+ 
+ 
+ #if (XXH_VECTOR == XXH_AVX512)
+ 
+ #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
++#define XXH3_accumulate     XXH3_accumulate_avx512
+ #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+ 
+ #elif (XXH_VECTOR == XXH_AVX2)
+ 
+ #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
++#define XXH3_accumulate     XXH3_accumulate_avx2
+ #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+ 
+ #elif (XXH_VECTOR == XXH_SSE2)
+ 
+ #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
++#define XXH3_accumulate     XXH3_accumulate_sse2
+ #define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+ 
+ #elif (XXH_VECTOR == XXH_NEON)
+ 
+ #define XXH3_accumulate_512 XXH3_accumulate_512_neon
++#define XXH3_accumulate     XXH3_accumulate_neon
+ #define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+ 
+ #elif (XXH_VECTOR == XXH_VSX)
+ 
+ #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
++#define XXH3_accumulate     XXH3_accumulate_vsx
+ #define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+ 
++#elif (XXH_VECTOR == XXH_SVE)
++#define XXH3_accumulate_512 XXH3_accumulate_512_sve
++#define XXH3_accumulate     XXH3_accumulate_sve
++#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
++#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
++
+ #else /* scalar */
+ 
+ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
++#define XXH3_accumulate     XXH3_accumulate_scalar
+ #define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+ 
+ #endif
+ 
+-
+-
+-#ifndef XXH_PREFETCH_DIST
+-#  ifdef __clang__
+-#    define XXH_PREFETCH_DIST 320
+-#  else
+-#    if (XXH_VECTOR == XXH_AVX512)
+-#      define XXH_PREFETCH_DIST 512
+-#    else
+-#      define XXH_PREFETCH_DIST 384
+-#    endif
+-#  endif  /* __clang__ */
+-#endif  /* XXH_PREFETCH_DIST */
+-
+-/*
+- * XXH3_accumulate()
+- * Loops over XXH3_accumulate_512().
+- * Assumption: nbStripes will not overflow the secret size
+- */
+-XXH_FORCE_INLINE void
+-XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
+-                const xxh_u8* XXH_RESTRICT input,
+-                const xxh_u8* XXH_RESTRICT secret,
+-                      size_t nbStripes,
+-                      XXH3_f_accumulate_512 f_acc512)
+-{
+-    size_t n;
+-    for (n = 0; n < nbStripes; n++ ) {
+-        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
+-        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
+-        f_acc512(acc,
+-                 in,
+-                 secret + n*XXH_SECRET_CONSUME_RATE);
+-    }
+-}
++#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
++#  undef XXH3_initCustomSecret
++#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
++#endif
+ 
+ XXH_FORCE_INLINE void
+ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                       const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+-                            XXH3_f_accumulate_512 f_acc512,
++                            XXH3_f_accumulate f_acc,
+                             XXH3_f_scrambleAcc f_scramble)
+ {
+     size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+     size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+     size_t const nb_blocks = (len - 1) / block_len;
+ 
+     size_t n;
+ 
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+ 
+     for (n = 0; n < nb_blocks; n++) {
+-        XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
++        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+         f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+     }
+ 
+     /* last partial block */
+     XXH_ASSERT(len > XXH_STRIPE_LEN);
+     {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+         XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+-        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
++        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+ 
+         /* last stripe */
+         {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+ #define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+-            f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
++            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+     }   }
+ }
+ 
+ XXH_FORCE_INLINE xxh_u64
+ XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+ {
+     return XXH3_mul128_fold64(
+                acc[0] ^ XXH_readLE64(secret),
+@@ -4444,97 +5594,101 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRI
+ }
+ 
+ #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                         XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+ 
+ XXH_FORCE_INLINE XXH64_hash_t
+ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const void* XXH_RESTRICT secret, size_t secretSize,
+-                           XXH3_f_accumulate_512 f_acc512,
++                           XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+ {
+     XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+ 
+-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
++    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+ 
+     /* converge into final hash */
+     XXH_STATIC_ASSERT(sizeof(acc) == 64);
+     /* do not align on 8, so that the secret is different from the accumulator */
+ #define XXH_SECRET_MERGEACCS_START 11
+     XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+     return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+ }
+ 
+ /*
+  * It's important for performance to transmit secret's size (when it's static)
+  * so that the compiler can properly optimize the vectorized loop.
+  * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+- */
+-XXH_FORCE_INLINE XXH64_hash_t
++ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
++ * breaks -Og, this is XXH_NO_INLINE.
++ */
++XXH3_WITH_SECRET_INLINE XXH64_hash_t
+ XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64;
+-    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
++    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+ }
+ 
+ /*
+  * It's preferable for performance that XXH3_hashLong is not inlined,
+  * as it results in a smaller function for small data, easier to the instruction cache.
+  * Note that inside this no_inline function, we do inline the internal loop,
+  * and provide a statically defined secret size to allow optimization of vector loop.
+  */
+-XXH_NO_INLINE XXH64_hash_t
++XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64; (void)secret; (void)secretLen;
+-    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
++    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+ }
+ 
+ /*
+  * XXH3_hashLong_64b_withSeed():
+  * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+  * and then use this key for long mode hashing.
+  *
+  * This operation is decently fast but nonetheless costs a little bit of time.
+  * Try to avoid it whenever possible (typically when seed==0).
+  *
+  * It's important for performance that XXH3_hashLong is not inlined. Not sure
+  * why (uop cache maybe?), but the difference is large and easily measurable.
+  */
+ XXH_FORCE_INLINE XXH64_hash_t
+ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                     XXH64_hash_t seed,
+-                                    XXH3_f_accumulate_512 f_acc512,
++                                    XXH3_f_accumulate f_acc,
+                                     XXH3_f_scrambleAcc f_scramble,
+                                     XXH3_f_initCustomSecret f_initSec)
+ {
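++    /* With XXH_SIZE_OPT >= 1, the seed==0 shortcut below is compiled out to save code size. */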
++#if XXH_SIZE_OPT <= 0
+     if (seed == 0)
+         return XXH3_hashLong_64b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+-                                          f_acc512, f_scramble);
++                                          f_acc, f_scramble);
++#endif
+     {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+         f_initSec(secret, seed);
+         return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+-                                          f_acc512, f_scramble);
++                                          f_acc, f_scramble);
+     }
+ }
+ 
+ /*
+  * It's important for performance that XXH3_hashLong is not inlined.
+  */
+ XXH_NO_INLINE XXH64_hash_t
+-XXH3_hashLong_64b_withSeed(const void* input, size_t len,
+-                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
++XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
++                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)secret; (void)secretLen;
+     return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+-                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
++                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+ }
+ 
+ 
+ typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                           XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+ 
+ XXH_FORCE_INLINE XXH64_hash_t
+ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+@@ -4556,47 +5710,47 @@ XXH3_64bits_internal(const void* XXH_RES
+     if (len <= XXH3_MIDSIZE_MAX)
+         return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+     return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+ }
+ 
+ 
+ /* ===   Public entry point   === */
+ 
+-/*! @ingroup xxh3_family */
+-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
+-{
+-    return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+-}
+-
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
++XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
++{
++    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
++}
++
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH64_hash_t
+-XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+-{
+-    return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+-}
+-
+-/*! @ingroup xxh3_family */
++XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
++{
++    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
++}
++
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH64_hash_t
+-XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+-{
+-    return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
++XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
++{
++    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+ }
+ 
+ XXH_PUBLIC_API XXH64_hash_t
+-XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
+-{
+-    if (len <= XXH3_MIDSIZE_MAX)
+-        return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+-    return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
++XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
++{
++    if (length <= XXH3_MIDSIZE_MAX)
++        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
++    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+ }
+ 
+ 
+ /* ===   XXH3 streaming   === */
+-
++#ifndef XXH_NO_STREAM
+ /*
+  * Malloc's a pointer that is always aligned to align.
+  *
+  * This must be freed with `XXH_alignedFree()`.
+  *
+  * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+  * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+  * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+@@ -4610,17 +5764,17 @@ XXH3_64bits_withSecretandSeed(const void
+  * like this anyways, and besides, testing for the existence of library
+  * functions without relying on external build tools is impossible.
+  *
+  * The method is simple: Overallocate, manually align, and store the offset
+  * to the original behind the returned pointer.
+  *
+  * Align must be a power of 2 and 8 <= align <= 128.
+  */
+-static void* XXH_alignedMalloc(size_t s, size_t align)
++static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+ {
+     XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+     XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+     XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+     {   /* Overallocate to make room for manual realignment and an offset byte */
+         xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+         if (base != NULL) {
+             /*
+@@ -4652,35 +5806,48 @@ static void XXH_alignedFree(void* p)
+         xxh_u8* ptr = (xxh_u8*)p;
+         /* Get the offset byte we added in XXH_malloc. */
+         xxh_u8 offset = ptr[-1];
+         /* Free the original malloc'd pointer */
+         xxh_u8* base = ptr - offset;
+         XXH_free(base);
+     }
+ }
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
++/*!
++ * @brief Allocate an @ref XXH3_state_t.
++ *
++ * Must be freed with XXH3_freeState().
++ * @return An allocated XXH3_state_t on success, `NULL` on failure.
++ */
+ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+ {
+     XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+     if (state==NULL) return NULL;
+     XXH3_INITSTATE(state);
+     return state;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
++/*!
++ * @brief Frees an @ref XXH3_state_t.
++ *
++ * Must be allocated with XXH3_createState().
++ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
++ * @return XXH_OK.
++ */
+ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+ {
+     XXH_alignedFree(statePtr);
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API void
+-XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
++XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+ {
+     XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+ }
+ 
+ static void
+ XXH3_reset_internal(XXH3_state_t* statePtr,
+                     XXH64_hash_t seed,
+                     const void* secret, size_t secretSize)
+@@ -4702,123 +5869,150 @@ XXH3_reset_internal(XXH3_state_t* stateP
+     statePtr->seed = seed;
+     statePtr->useSeed = (seed != 0);
+     statePtr->extSecret = (const unsigned char*)secret;
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+     statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+     statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_64bits_reset(XXH3_state_t* statePtr)
++XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+     XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
++XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+     XXH3_reset_internal(statePtr, 0, secret, secretSize);
+     if (secret == NULL) return XXH_ERROR;
+     if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
++XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+     if (seed==0) return XXH3_64bits_reset(statePtr);
+     if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+         XXH3_initCustomSecret(statePtr->customSecret, seed);
+     XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
++XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+ {
+     if (statePtr == NULL) return XXH_ERROR;
+     if (secret == NULL) return XXH_ERROR;
+     if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+     XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+     statePtr->useSeed = 1; /* always, even if seed64==0 */
+     return XXH_OK;
+ }
+ 
+-/* Note : when XXH3_consumeStripes() is invoked,
+- * there must be a guarantee that at least one more byte must be consumed from input
+- * so that the function can blindly consume all stripes using the "normal" secret segment */
+-XXH_FORCE_INLINE void
++/*!
++ * @internal
++ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
++ *
++ * Unlike XXH3_hashLong_internal_loop(), this can process data that crosses block boundaries.
++ *
++ * @param acc                Pointer to the 8 accumulator lanes
++ * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block
++ * @param nbStripesPerBlock  Number of stripes in a block
++ * @param input              Input pointer
++ * @param nbStripes          Number of stripes to process
++ * @param secret             Secret pointer
++ * @param secretLimit        Offset of the last block in @p secret
++ * @param f_acc              Pointer to an XXH3_accumulate implementation
++ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
++ * @return                   Pointer past the end of @p input after processing
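++ *
++ * @note With the default 192-byte secret, a block is (192 - 64) / 8 = 16
++ *       stripes, i.e. the accumulators are scrambled once per 1 KiB of input.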
++ */
++XXH_FORCE_INLINE const xxh_u8 *
+ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                     size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                     const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+-                    XXH3_f_accumulate_512 f_acc512,
++                    XXH3_f_accumulate f_acc,
+                     XXH3_f_scrambleAcc f_scramble)
+ {
+-    XXH_ASSERT(nbStripes <= nbStripesPerBlock);  /* can handle max 1 scramble per invocation */
+-    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
+-    if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
+-        /* need a scrambling operation */
+-        size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
+-        size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
+-        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
+-        f_scramble(acc, secret + secretLimit);
+-        XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
+-        *nbStripesSoFarPtr = nbStripesAfterBlock;
+-    } else {
+-        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
++    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
++    /* Process full blocks */
++    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
++        /* Process the initial partial block... */
++        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
++
++        do {
++            /* Accumulate and scramble */
++            f_acc(acc, input, initialSecret, nbStripesThisIter);
++            f_scramble(acc, secret + secretLimit);
++            input += nbStripesThisIter * XXH_STRIPE_LEN;
++            nbStripes -= nbStripesThisIter;
++            /* Then continue the loop with the full block size */
++            nbStripesThisIter = nbStripesPerBlock;
++            initialSecret = secret;
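++            /* every subsequent full block restarts from the beginning of the secret */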
++        } while (nbStripes >= nbStripesPerBlock);
++        *nbStripesSoFarPtr = 0;
++    }
++    /* Process a partial block */
++    if (nbStripes > 0) {
++        f_acc(acc, input, initialSecret, nbStripes);
++        input += nbStripes * XXH_STRIPE_LEN;
+         *nbStripesSoFarPtr += nbStripes;
+     }
++    /* Return end pointer */
++    return input;
+ }
+ 
+ #ifndef XXH3_STREAM_USE_STACK
+-# ifndef __clang__ /* clang doesn't need additional stack space */
++# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+ #   define XXH3_STREAM_USE_STACK 1
+ # endif
+ #endif
+ /*
+  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+  */
+ XXH_FORCE_INLINE XXH_errorcode
+ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+             const xxh_u8* XXH_RESTRICT input, size_t len,
+-            XXH3_f_accumulate_512 f_acc512,
++            XXH3_f_accumulate f_acc,
+             XXH3_f_scrambleAcc f_scramble)
+ {
+     if (input==NULL) {
+         XXH_ASSERT(len == 0);
+         return XXH_OK;
+     }
+ 
+     XXH_ASSERT(state != NULL);
+     {   const xxh_u8* const bEnd = input + len;
+         const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+         /* For some reason, gcc and MSVC seem to suffer greatly
+          * when operating accumulators directly into state.
+          * Operating into stack space seems to enable proper optimization.
+          * clang, on the other hand, doesn't seem to need this trick */
+-        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
++        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
++        XXH_memcpy(acc, state->acc, sizeof(acc));
+ #else
+         xxh_u64* XXH_RESTRICT const acc = state->acc;
+ #endif
+         state->totalLen += len;
+         XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+ 
+         /* small input : just fill in tmp buffer */
+-        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
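++        /* note: written so the size comparison cannot overflow, unlike bufferedSize + len */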
++        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+             XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+             state->bufferedSize += (XXH32_hash_t)len;
+             return XXH_OK;
+         }
+ 
+         /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+         #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+         XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+@@ -4830,143 +6024,109 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT c
+         if (state->bufferedSize) {
+             size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+             XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+             input += loadSize;
+             XXH3_consumeStripes(acc,
+                                &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                 state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                 secret, state->secretLimit,
+-                                f_acc512, f_scramble);
++                                f_acc, f_scramble);
+             state->bufferedSize = 0;
+         }
+         XXH_ASSERT(input < bEnd);
+-
+-        /* large input to consume : ingest per full block */
+-        if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
++        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+             size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+-            XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
+-            /* join to current block's end */
+-            {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
+-                XXH_ASSERT(nbStripes <= nbStripes);
+-                XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
+-                f_scramble(acc, secret + state->secretLimit);
+-                state->nbStripesSoFar = 0;
+-                input += nbStripesToEnd * XXH_STRIPE_LEN;
+-                nbStripes -= nbStripesToEnd;
+-            }
+-            /* consume per entire blocks */
+-            while(nbStripes >= state->nbStripesPerBlock) {
+-                XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
+-                f_scramble(acc, secret + state->secretLimit);
+-                input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
+-                nbStripes -= state->nbStripesPerBlock;
+-            }
+-            /* consume last partial block */
+-            XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
+-            input += nbStripes * XXH_STRIPE_LEN;
+-            XXH_ASSERT(input < bEnd);  /* at least some bytes left */
+-            state->nbStripesSoFar = nbStripes;
+-            /* buffer predecessor of last partial stripe */
++            input = XXH3_consumeStripes(acc,
++                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
++                                       input, nbStripes,
++                                       secret, state->secretLimit,
++                                       f_acc, f_scramble);
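++            /* buffer predecessor of last partial stripe (digest may rebuild a full last stripe from it) */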
+             XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+-            XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
+-        } else {
+-            /* content to consume <= block size */
+-            /* Consume input by a multiple of internal buffer size */
+-            if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+-                const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+-                do {
+-                    XXH3_consumeStripes(acc,
+-                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+-                                        input, XXH3_INTERNALBUFFER_STRIPES,
+-                                        secret, state->secretLimit,
+-                                        f_acc512, f_scramble);
+-                    input += XXH3_INTERNALBUFFER_SIZE;
+-                } while (input<limit);
+-                /* buffer predecessor of last partial stripe */
+-                XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+-            }
++
+         }
+-
+         /* Some remaining input (always) : buffer it */
+         XXH_ASSERT(input < bEnd);
+         XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+         XXH_ASSERT(state->bufferedSize == 0);
+         XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+         state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+         /* save stack accumulators into state */
+-        memcpy(state->acc, acc, sizeof(acc));
++        XXH_memcpy(state->acc, acc, sizeof(acc));
+ #endif
+     }
+ 
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
++XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+ {
+     return XXH3_update(state, (const xxh_u8*)input, len,
+-                       XXH3_accumulate_512, XXH3_scrambleAcc);
++                       XXH3_accumulate, XXH3_scrambleAcc);
+ }
+ 
+ 
+ XXH_FORCE_INLINE void
+ XXH3_digest_long (XXH64_hash_t* acc,
+                   const XXH3_state_t* state,
+                   const unsigned char* secret)
+ {
++    xxh_u8 lastStripe[XXH_STRIPE_LEN];
++    const xxh_u8* lastStripePtr;
++
+     /*
+      * Digest on a local copy. This way, the state remains unaltered, and it can
+      * continue ingesting more input afterwards.
+      */
+     XXH_memcpy(acc, state->acc, sizeof(state->acc));
+     if (state->bufferedSize >= XXH_STRIPE_LEN) {
++        /* Consume remaining stripes then point to remaining data in buffer */
+         size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+         size_t nbStripesSoFar = state->nbStripesSoFar;
+         XXH3_consumeStripes(acc,
+                            &nbStripesSoFar, state->nbStripesPerBlock,
+                             state->buffer, nbStripes,
+                             secret, state->secretLimit,
+-                            XXH3_accumulate_512, XXH3_scrambleAcc);
+-        /* last stripe */
+-        XXH3_accumulate_512(acc,
+-                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+-                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
++                            XXH3_accumulate, XXH3_scrambleAcc);
++        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+     } else {  /* bufferedSize < XXH_STRIPE_LEN */
+-        xxh_u8 lastStripe[XXH_STRIPE_LEN];
++        /* Copy to temp buffer */
+         size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+         XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+         XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+         XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+-        XXH3_accumulate_512(acc,
+-                            lastStripe,
+-                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
++        lastStripePtr = lastStripe;
+     }
+-}
+-
+-/*! @ingroup xxh3_family */
+-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
++    /* Last stripe */
++    XXH3_accumulate_512(acc,
++                        lastStripePtr,
++                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
++}
++
++/*! @ingroup XXH3_family */
++XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+ {
+     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+     if (state->totalLen > XXH3_MIDSIZE_MAX) {
+         XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+         XXH3_digest_long(acc, state, secret);
+         return XXH3_mergeAccs(acc,
+                               secret + XXH_SECRET_MERGEACCS_START,
+                               (xxh_u64)state->totalLen * XXH_PRIME64_1);
+     }
+     /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+     if (state->useSeed)
+         return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+     return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+ }
+-
++#endif /* !XXH_NO_STREAM */
+ 
+ 
+ /* ==========================================
+  * XXH3 128 bits (a.k.a XXH128)
+  * ==========================================
+  * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+  * even without counting the significantly larger output size.
+  *
+@@ -4976,17 +6136,17 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_
+  * This strength naturally comes at the cost of some speed, especially on short
+  * lengths. Note that longer hashes are about as fast as the 64-bit version
+  * due to it using only a slight modification of the 64-bit loop.
+  *
+  * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+  * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+  */
+ 
+-XXH_FORCE_INLINE XXH128_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     /* A doubled version of 1to3_64b with different constants. */
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(1 <= len && len <= 3);
+     XXH_ASSERT(secret != NULL);
+     /*
+      * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+@@ -5005,17 +6165,17 @@ XXH3_len_1to3_128b(const xxh_u8* input, 
+         xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+         XXH128_hash_t h128;
+         h128.low64  = XXH64_avalanche(keyed_lo);
+         h128.high64 = XXH64_avalanche(keyed_hi);
+         return h128;
+     }
+ }
+ 
+-XXH_FORCE_INLINE XXH128_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(secret != NULL);
+     XXH_ASSERT(4 <= len && len <= 8);
+     seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+     {   xxh_u32 const input_lo = XXH_readLE32(input);
+         xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+@@ -5025,24 +6185,24 @@ XXH3_len_4to8_128b(const xxh_u8* input, 
+ 
+         /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+         XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+ 
+         m128.high64 += (m128.low64 << 1);
+         m128.low64  ^= (m128.high64 >> 3);
+ 
+         m128.low64   = XXH_xorshift64(m128.low64, 35);
+-        m128.low64  *= 0x9FB21C651E98DF25ULL;
++        m128.low64  *= PRIME_MX2;
+         m128.low64   = XXH_xorshift64(m128.low64, 28);
+         m128.high64  = XXH3_avalanche(m128.high64);
+         return m128;
+     }
+ }
+ 
+-XXH_FORCE_INLINE XXH128_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(input != NULL);
+     XXH_ASSERT(secret != NULL);
+     XXH_ASSERT(9 <= len && len <= 16);
+     {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+         xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+         xxh_u64 const input_lo = XXH_readLE64(input);
+@@ -5107,17 +6267,17 @@ XXH3_len_9to16_128b(const xxh_u8* input,
+             h128.high64  = XXH3_avalanche(h128.high64);
+             return h128;
+     }   }
+ }
+ 
+ /*
+  * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+  */
+-XXH_FORCE_INLINE XXH128_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+ {
+     XXH_ASSERT(len <= 16);
+     {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+         if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+         if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+         {   XXH128_hash_t h128;
+             xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+@@ -5138,107 +6298,127 @@ XXH128_mix32B(XXH128_hash_t acc, const x
+     acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+     acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+     acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+     acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+     return acc;
+ }
+ 
+ 
+-XXH_FORCE_INLINE XXH128_hash_t
++XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+ {
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+     XXH_ASSERT(16 < len && len <= 128);
+ 
+     {   XXH128_hash_t acc;
+         acc.low64 = len * XXH_PRIME64_1;
+         acc.high64 = 0;
++
++#if XXH_SIZE_OPT >= 1
++        {
++            /* Smaller, but slightly slower. */
++            unsigned int i = (unsigned int)(len - 1) / 32;
++            do {
++                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
++            } while (i-- != 0);
++        }
++#else
+         if (len > 32) {
+             if (len > 64) {
+                 if (len > 96) {
+                     acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                 }
+                 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+             }
+             acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+         }
+         acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
++#endif
+         {   XXH128_hash_t h128;
+             h128.low64  = acc.low64 + acc.high64;
+             h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                         + (acc.high64   * XXH_PRIME64_4)
+                         + ((len - seed) * XXH_PRIME64_2);
+             h128.low64  = XXH3_avalanche(h128.low64);
+             h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+             return h128;
+         }
+     }
+ }
+ 
+-XXH_NO_INLINE XXH128_hash_t
++XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                        const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                        XXH64_hash_t seed)
+ {
+     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+     XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+ 
+     {   XXH128_hash_t acc;
+-        int const nbRounds = (int)len / 32;
+-        int i;
++        unsigned i;
+         acc.low64 = len * XXH_PRIME64_1;
+         acc.high64 = 0;
+-        for (i=0; i<4; i++) {
++        /*
++         * We set `i` to offset + 32 so that the unchanged `len` can be
++         * used as the upper bound. This hits a sweet spot where both
++         * x86 and aarch64 get simple address generation and good codegen
++         * for the loop.
++         */
++        for (i = 32; i < 160; i += 32) {
+             acc = XXH128_mix32B(acc,
+-                                input  + (32 * i),
+-                                input  + (32 * i) + 16,
+-                                secret + (32 * i),
++                                input  + i - 32,
++                                input  + i - 16,
++                                secret + i - 32,
+                                 seed);
+         }
+         acc.low64 = XXH3_avalanche(acc.low64);
+         acc.high64 = XXH3_avalanche(acc.high64);
+-        XXH_ASSERT(nbRounds >= 4);
+-        for (i=4 ; i < nbRounds; i++) {
++        /*
++         * NB: `i <= len` will duplicate the last 32-bytes if
++         * len % 32 was zero. This is an unfortunate necessity to keep
++         * the hash result stable.
++         */
++        for (i=160; i <= len; i += 32) {
+             acc = XXH128_mix32B(acc,
+-                                input + (32 * i),
+-                                input + (32 * i) + 16,
+-                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
++                                input + i - 32,
++                                input + i - 16,
++                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                 seed);
+         }
+         /* last bytes */
+         acc = XXH128_mix32B(acc,
+                             input + len - 16,
+                             input + len - 32,
+                             secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+-                            0ULL - seed);
++                            (XXH64_hash_t)0 - seed);
+ 
+         {   XXH128_hash_t h128;
+             h128.low64  = acc.low64 + acc.high64;
+             h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                         + (acc.high64   * XXH_PRIME64_4)
+                         + ((len - seed) * XXH_PRIME64_2);
+             h128.low64  = XXH3_avalanche(h128.low64);
+             h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+             return h128;
+         }
+     }
+ }
+ 
+ XXH_FORCE_INLINE XXH128_hash_t
+ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+-                            XXH3_f_accumulate_512 f_acc512,
++                            XXH3_f_accumulate f_acc,
+                             XXH3_f_scrambleAcc f_scramble)
+ {
+     XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+ 
+-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
++    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+ 
+     /* converge into final hash */
+     XXH_STATIC_ASSERT(sizeof(acc) == 64);
+     XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+     {   XXH128_hash_t h128;
+         h128.low64  = XXH3_mergeAccs(acc,
+                                      secret + XXH_SECRET_MERGEACCS_START,
+                                      (xxh_u64)len * XXH_PRIME64_1);
+@@ -5246,70 +6426,73 @@ XXH3_hashLong_128b_internal(const void* 
+                                      secret + secretSize
+                                             - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                      ~((xxh_u64)len * XXH_PRIME64_2));
+         return h128;
+     }
+ }
+ 
+ /*
+- * It's important for performance that XXH3_hashLong is not inlined.
+- */
+-XXH_NO_INLINE XXH128_hash_t
++ * It's important for performance that XXH3_hashLong() is not inlined.
++ */
++XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                            XXH64_hash_t seed64,
+                            const void* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64; (void)secret; (void)secretLen;
+     return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+-                                       XXH3_accumulate_512, XXH3_scrambleAcc);
++                                       XXH3_accumulate, XXH3_scrambleAcc);
+ }
+ 
+ /*
+- * It's important for performance to pass @secretLen (when it's static)
++ * It's important for performance to pass @p secretLen (when it's static)
+  * to the compiler, so that it can properly optimize the vectorized loop.
+- */
+-XXH_FORCE_INLINE XXH128_hash_t
++ *
++ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
++ * breaks -Og, this is XXH_NO_INLINE.
++ */
++XXH3_WITH_SECRET_INLINE XXH128_hash_t
+ XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                               XXH64_hash_t seed64,
+                               const void* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)seed64;
+     return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+-                                       XXH3_accumulate_512, XXH3_scrambleAcc);
++                                       XXH3_accumulate, XXH3_scrambleAcc);
+ }
+ 
+ XXH_FORCE_INLINE XXH128_hash_t
+ XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                 XXH64_hash_t seed64,
+-                                XXH3_f_accumulate_512 f_acc512,
++                                XXH3_f_accumulate f_acc,
+                                 XXH3_f_scrambleAcc f_scramble,
+                                 XXH3_f_initCustomSecret f_initSec)
+ {
+     if (seed64 == 0)
+         return XXH3_hashLong_128b_internal(input, len,
+                                            XXH3_kSecret, sizeof(XXH3_kSecret),
+-                                           f_acc512, f_scramble);
++                                           f_acc, f_scramble);
+     {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+         f_initSec(secret, seed64);
+         return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+-                                           f_acc512, f_scramble);
++                                           f_acc, f_scramble);
+     }
+ }
+ 
+ /*
+  * It's important for performance that XXH3_hashLong is not inlined.
+  */
+ XXH_NO_INLINE XXH128_hash_t
+ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                             XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+ {
+     (void)secret; (void)secretLen;
+     return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+-                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
++                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+ }
+ 
+ typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                             XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+ 
+ XXH_FORCE_INLINE XXH128_hash_t
+ XXH3_128bits_internal(const void* input, size_t len,
+                       XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+@@ -5329,104 +6512,103 @@ XXH3_128bits_internal(const void* input,
+     if (len <= XXH3_MIDSIZE_MAX)
+         return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+     return f_hl128(input, len, seed64, secret, secretLen);
+ }
+ 
+ 
+ /* ===   Public XXH128 API   === */
+ 
+-/*! @ingroup xxh3_family */
+-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
++/*! @ingroup XXH3_family */
++XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+ {
+     return XXH3_128bits_internal(input, len, 0,
+                                  XXH3_kSecret, sizeof(XXH3_kSecret),
+                                  XXH3_hashLong_128b_default);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+-XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
++XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+ {
+     return XXH3_128bits_internal(input, len, 0,
+                                  (const xxh_u8*)secret, secretSize,
+                                  XXH3_hashLong_128b_withSecret);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+-XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
++XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+ {
+     return XXH3_128bits_internal(input, len, seed,
+                                  XXH3_kSecret, sizeof(XXH3_kSecret),
+                                  XXH3_hashLong_128b_withSeed);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+-XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
++XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+ {
+     if (len <= XXH3_MIDSIZE_MAX)
+         return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+     return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+-XXH128(const void* input, size_t len, XXH64_hash_t seed)
++XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+ {
+     return XXH3_128bits_withSeed(input, len, seed);
+ }
+ 
+ 
+ /* ===   XXH3 128-bit streaming   === */
+-
++#ifndef XXH_NO_STREAM
+ /*
+  * All initialization and update functions are identical to 64-bit streaming variant.
+  * The only difference is the finalization routine.
+  */
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_128bits_reset(XXH3_state_t* statePtr)
++XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+ {
+     return XXH3_64bits_reset(statePtr);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
++XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+ {
+     return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
++XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+ {
+     return XXH3_64bits_reset_withSeed(statePtr, seed);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
++XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+ {
+     return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+-{
+-    return XXH3_update(state, (const xxh_u8*)input, len,
+-                       XXH3_accumulate_512, XXH3_scrambleAcc);
+-}
+-
+-/*! @ingroup xxh3_family */
+-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
++XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
++{
++    return XXH3_64bits_update(state, input, len);
++}
++
++/*! @ingroup XXH3_family */
++XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+ {
+     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+     if (state->totalLen > XXH3_MIDSIZE_MAX) {
+         XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+         XXH3_digest_long(acc, state, secret);
+         XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+         {   XXH128_hash_t h128;
+             h128.low64  = XXH3_mergeAccs(acc,
+@@ -5440,97 +6622,105 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bit
+         }
+     }
+     /* len <= XXH3_MIDSIZE_MAX : short code */
+     if (state->seed)
+         return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+     return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                    secret, state->secretLimit + XXH_STRIPE_LEN);
+ }
+-
++#endif /* !XXH_NO_STREAM */
+ /* 128-bit utility functions */
+ 
+ #include <string.h>   /* memcmp, memcpy */
+ 
+ /* return : 1 is equal, 0 if different */
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+ {
+     /* note : XXH128_hash_t is compact, it has no padding byte */
+     return !(memcmp(&h1, &h2, sizeof(h1)));
+ }
+ 
+ /* This prototype is compatible with stdlib's qsort().
+- * return : >0 if *h128_1  > *h128_2
+- *          <0 if *h128_1  < *h128_2
+- *          =0 if *h128_1 == *h128_2  */
+-/*! @ingroup xxh3_family */
+-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
++ * @return : >0 if *h128_1  > *h128_2
++ *           <0 if *h128_1  < *h128_2
++ *           =0 if *h128_1 == *h128_2  */
++/*! @ingroup XXH3_family */
++XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+ {
+     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+     int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+     /* note : bets that, in most cases, hash values are different */
+     if (hcmp) return hcmp;
+     return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+ }
+ 
+ 
+ /*======   Canonical representation   ======*/
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API void
+-XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
++XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+ {
+     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+     if (XXH_CPU_LITTLE_ENDIAN) {
+         hash.high64 = XXH_swap64(hash.high64);
+         hash.low64  = XXH_swap64(hash.low64);
+     }
+     XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+     XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH128_hash_t
+-XXH128_hashFromCanonical(const XXH128_canonical_t* src)
++XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+ {
+     XXH128_hash_t h;
+     h.high64 = XXH_readBE64(src);
+     h.low64  = XXH_readBE64(src->digest + 8);
+     return h;
+ }
+ 
+ 
+ 
+ /* ==========================================
+  * Secret generators
+  * ==========================================
+  */
+ #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+ 
+-static void XXH3_combine16(void* dst, XXH128_hash_t h128)
++XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+ {
+     XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+     XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API XXH_errorcode
+-XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
+-{
++XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
++{
++#if (XXH_DEBUGLEVEL >= 1)
+     XXH_ASSERT(secretBuffer != NULL);
++    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
++#else
++    /* production mode, assert() are disabled */
+     if (secretBuffer == NULL) return XXH_ERROR;
+-    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+     if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
++#endif
++
+     if (customSeedSize == 0) {
+         customSeed = XXH3_kSecret;
+         customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+     }
++#if (XXH_DEBUGLEVEL >= 1)
+     XXH_ASSERT(customSeed != NULL);
++#else
+     if (customSeed == NULL) return XXH_ERROR;
++#endif
+ 
+     /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+     {   size_t pos = 0;
+         while (pos < secretSize) {
+             size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+             memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+             pos += toCopy;
+     }   }
+@@ -5544,40 +6734,40 @@ XXH3_generateSecret(void* secretBuffer, 
+             XXH3_combine16((char*)secretBuffer + n*16, h128);
+         }
+         /* last segment */
+         XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+     }
+     return XXH_OK;
+ }
+ 
+-/*! @ingroup xxh3_family */
++/*! @ingroup XXH3_family */
+ XXH_PUBLIC_API void
+-XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
++XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
+ {
+     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+     XXH3_initCustomSecret(secret, seed);
+     XXH_ASSERT(secretBuffer != NULL);
+     memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+ }
+ 
+ 
+ 
+ /* Pop our optimization override from above */
+ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+   && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+-  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
++  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+ #  pragma GCC pop_options
+ #endif
+ 
+ #endif  /* XXH_NO_LONG_LONG */
+ 
+ #endif  /* XXH_NO_XXH3 */
+ 
+ /*!
+  * @}
+  */
+ #endif  /* XXH_IMPLEMENTATION */
+ 
+ 
+ #if defined (__cplusplus)
+-}
++} /* extern "C" */
+ #endif
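
Not part of the patch above: a minimal usage sketch of the stable XXH3 128-bit
one-shot API whose declarations the hunks above annotate with XXH_NOESCAPE. It
assumes the vendored xxhash.h from this 0.8.1 update is on the include path;
the message and seed value are illustrative only.

    #include <cstdio>
    #include <cstring>
    #include "xxhash.h"   // assumed include path to the vendored header

    int main() {
        const char msg[] = "hello, xxhash";
        // One-shot 128-bit hash; seed 0 gives the same result as XXH3_128bits().
        XXH128_hash_t h = XXH3_128bits_withSeed(msg, std::strlen(msg), 0);
        std::printf("%016llx%016llx\n",
                    (unsigned long long)h.high64, (unsigned long long)h.low64);
        return 0;
    }

XXH128_isEqual() and XXH128_cmp(), also touched above, can then be used to
compare or sort the resulting hashes.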

+ 34 - 0
mozilla-release/patches/1891234-PARTIAL-NOTESTS-11512.patch

@@ -0,0 +1,34 @@
+# HG changeset patch
+# User Neil Deakin <neil@mozilla.com>
+# Date 1717094509 0
+# Node ID 2062b99385d0d9fd5d57389da40fb2358a26a128
+# Parent  5bf558db4e07bfd511e8c9a7c7ac299f39f701cb
+Bug 1891234, additional filename filter checks,  a=pascalc
+
+Original Revision: https://phabricator.services.mozilla.com/D208659
+
+Differential Revision: https://phabricator.services.mozilla.com/D211288
+
+diff --git a/uriloader/exthandler/nsExternalHelperAppService.cpp b/uriloader/exthandler/nsExternalHelperAppService.cpp
+--- a/uriloader/exthandler/nsExternalHelperAppService.cpp
++++ b/uriloader/exthandler/nsExternalHelperAppService.cpp
+@@ -1195,17 +1195,18 @@ nsExternalAppHandler::nsExternalAppHandl
+ {
+ 
+   // make sure the extention includes the '.'
+   if (!aTempFileExtension.IsEmpty() && aTempFileExtension.First() != '.')
+     mTempFileExtension = char16_t('.');
+   AppendUTF8toUTF16(aTempFileExtension, mTempFileExtension);
+ 
+   // replace platform specific path separator and illegal characters to avoid any confusion
+-  mSuggestedFileName.ReplaceChar(KNOWN_PATH_SEPARATORS FILE_ILLEGAL_CHARACTERS, '_');
++  mSuggestedFileName.ReplaceChar(KNOWN_PATH_SEPARATORS FILE_ILLEGAL_CHARACTERS "%", '_');
++  mSuggestedFileName.StripChar(char16_t(0));
+   mTempFileExtension.ReplaceChar(KNOWN_PATH_SEPARATORS FILE_ILLEGAL_CHARACTERS, '_');
+ 
+   // Remove unsafe bidi characters which might have spoofing implications (bug 511521).
+   const char16_t unsafeBidiCharacters[] = {
+     char16_t(0x061c), // Arabic Letter Mark
+     char16_t(0x200e), // Left-to-Right Mark
+     char16_t(0x200f), // Right-to-Left Mark
+     char16_t(0x202a), // Left-to-Right Embedding
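
Illustration only, not Gecko code: the hunk above additionally replaces '%' and
strips embedded NUL characters from the suggested filename. A short standalone
sketch of the same sanitization idea, using a hypothetical SanitizeFileName()
helper and an illustrative (incomplete) character set:

    #include <algorithm>
    #include <string>

    // Hypothetical helper: replace risky characters, then drop embedded NULs.
    std::string SanitizeFileName(std::string name) {
        const std::string risky = "/\\%";  // illustrative subset of the real list
        for (char& c : name) {
            if (risky.find(c) != std::string::npos) {
                c = '_';
            }
        }
        name.erase(std::remove(name.begin(), name.end(), '\0'), name.end());
        return name;
    }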

+ 155 - 0
mozilla-release/patches/1891349-127a1.patch

@@ -0,0 +1,155 @@
+# HG changeset patch
+# User longsonr <longsonr@gmail.com>
+# Date 1713324138 0
+# Node ID 715ecd9aaf756262d13626c73180dfb1eee25a60
+# Parent  dbba2f5d1a21907a5b1159c6362961f42028b060
+Bug 1891349 - Treat cookie name prefixes as case-insensitive r=dveditz,cookie-reviewers,valentin
+
+Differential Revision: https://phabricator.services.mozilla.com/D207412
+
+diff --git a/netwerk/cookie/nsCookieService.cpp b/netwerk/cookie/nsCookieService.cpp
+--- a/netwerk/cookie/nsCookieService.cpp
++++ b/netwerk/cookie/nsCookieService.cpp
+@@ -3279,16 +3279,28 @@ nsCookieService::GetCookieStringInternal
+       }
+     }
+   }
+ 
+   if (!aCookieString.IsEmpty())
+     COOKIE_LOGSUCCESS(GET_COOKIE, aHostURI, aCookieString, nullptr, false);
+ }
+ 
++bool HasSecurePrefix(const nsCString& aString) {
++  static const char kSecure[] = "__Secure-";
++  static constexpr uint32_t kSecureLen = sizeof(kSecure) - 1;
++  return nsCRT::strncasecmp(aString.get(), kSecure, kSecureLen) == 0;
++}
++
++bool HasHostPrefix(const nsCString& aString) {
++  static const char kHost[] = "__Host-";
++  static constexpr uint32_t kHostLen = sizeof(kHost) - 1;
++  return nsCRT::strncasecmp(aString.get(), kHost, kHostLen) == 0;
++}
++
+ // processes a single cookie, and returns true if there are more cookies
+ // to be processed
+ bool
+ nsCookieService::CanSetCookie(nsIURI*             aHostURI,
+                               const nsCookieKey&  aKey,
+                               nsCookieAttributes& aCookieAttributes,
+                               bool                aRequireHostMatch,
+                               CookieStatus        aStatus,
+@@ -3357,24 +3369,27 @@ nsCookieService::CanSetCookie(nsIURI*   
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader, "failed the domain tests");
+     return newCookie;
+   }
+   if (!CheckPath(aCookieAttributes, aHostURI)) {
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader, "failed the path tests");
+     return newCookie;
+   }
+ 
+-  if (!CheckHiddenPrefix(aCookieData)) {
++  // If a cookie is nameless, then its value must not start with
++  // `__Host-` or `__Secure-`
++  if (aCookieAttributes.name.IsEmpty() && (HasSecurePrefix(aCookieAttributes.value) ||
++                                             HasHostPrefix(aCookieAttributes.value))) {
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader,
+-                      "failed the CheckHiddenPrefix tests");
++                      "failed hidden prefix tests");
+     // CookieLogging::LogMessageToConsole(
+     //     aCRC, aHostURI, nsIScriptError::warningFlag, CONSOLE_REJECTION_CATEGORY,
+     //     "CookieRejectedInvalidPrefix",
+     //     AutoTArray<nsString, 1>{
+-    //         NS_ConvertUTF8toUTF16(aCookieData.name()),
++    //         NS_ConvertUTF8toUTF16(aCookieAttributes.name()),
+     //     });
+     return newCookie;
+   }
+ 
+   // magic prefix checks. MUST be run after CheckDomain() and CheckPath()
+   if (!CheckPrefixes(aCookieAttributes, isHTTPS)) {
+     COOKIE_LOGFAILURE(SET_COOKIE, aHostURI, savedCookieHeader, "failed the prefix tests");
+     return newCookie;
+@@ -4214,35 +4229,16 @@ nsCookieService::CheckDomain(nsCookieAtt
+     return false;
+   }
+ 
+   // no domain specified, use hostFromURI
+   aCookieAttributes.host = hostFromURI;
+   return true;
+ }
+ 
+-// static
+-bool nsCookieService::CheckHiddenPrefix(CookieStruct& aCookieData) {
+-  // If a cookie is nameless, then its value must not start with
+-  // `__Host-` or `__Secure-`
+-  if (!aCookieData.name().IsEmpty()) {
+-    return true;
+-  }
+-
+-  if (StringBeginsWith(aCookieData.value(), "__Host-")) {
+-    return false;
+-  }
+-
+-  if (StringBeginsWith(aCookieData.value(), "__Secure-")) {
+-    return false;
+-  }
+-
+-  return true;
+-}
+-
+ nsAutoCString
+ nsCookieService::GetPathFromURI(nsIURI* aHostURI)
+ {
+   // strip down everything after the last slash to get the path,
+   // ignoring slashes in the query string part.
+   // if we can QI to nsIURL, that'll take care of the query string portion.
+   // otherwise, it's not an nsIURL and can't have a query string, so just find the last slash.
+   nsAutoCString path;
+@@ -4298,23 +4294,18 @@ nsCookieService::CheckPath(nsCookieAttri
+ // if they do not meet the criteria required by the prefix.
+ //
+ // Must not be called until after CheckDomain() and CheckPath() have
+ // regularized and validated the nsCookieAttributes values!
+ bool
+ nsCookieService::CheckPrefixes(nsCookieAttributes &aCookieAttributes,
+                                bool aSecureRequest)
+ {
+-  static const char kSecure[] = "__Secure-";
+-  static const char kHost[]   = "__Host-";
+-  static const int kSecureLen = sizeof( kSecure ) - 1;
+-  static const int kHostLen   = sizeof( kHost ) - 1;
+-
+-  bool isSecure = strncmp( aCookieAttributes.name.get(), kSecure, kSecureLen ) == 0;
+-  bool isHost   = strncmp( aCookieAttributes.name.get(), kHost, kHostLen ) == 0;
++  bool isSecure = HasSecurePrefix(aCookieAttributes.name);
++  bool isHost = HasHostPrefix(aCookieAttributes.name);
+ 
+   if ( !isSecure && !isHost ) {
+     // not one of the magic prefixes: carry on
+     return true;
+   }
+ 
+   if ( !aSecureRequest || !aCookieAttributes.isSecure ) {
+     // the magic prefixes may only be used from a secure request and
+diff --git a/netwerk/cookie/nsCookieService.h b/netwerk/cookie/nsCookieService.h
+--- a/netwerk/cookie/nsCookieService.h
++++ b/netwerk/cookie/nsCookieService.h
+@@ -306,17 +306,16 @@ class nsCookieService final : public nsI
+     void                          AddInternal(const nsCookieKey& aKey, nsCookie *aCookie, int64_t aCurrentTimeInUsec, nsIURI *aHostURI, const char *aCookieHeader, bool aFromHttp);
+     void                          RemoveCookieFromList(const nsListIter &aIter, mozIStorageBindingParamsArray *aParamsArray = nullptr);
+     void                          AddCookieToList(const nsCookieKey& aKey, nsCookie *aCookie, DBState *aDBState, mozIStorageBindingParamsArray *aParamsArray, bool aWriteToDB = true);
+     void                          UpdateCookieInList(nsCookie *aCookie, int64_t aLastAccessed, mozIStorageBindingParamsArray *aParamsArray);
+     static bool                   GetTokenValue(nsACString::const_char_iterator &aIter, nsACString::const_char_iterator &aEndIter, nsDependentCSubstring &aTokenString, nsDependentCSubstring &aTokenValue, bool &aEqualsFound);
+     static bool                   ParseAttributes(nsDependentCString &aCookieHeader, nsCookieAttributes &aCookie);
+     bool                          RequireThirdPartyCheck();
+     static bool                   CheckDomain(nsCookieAttributes &aCookie, nsIURI *aHostURI, const nsCString &aBaseDomain, bool aRequireHostMatch);
+-    static bool CheckHiddenPrefix(CookieStruct& aCookieData);
+     static bool                   CheckPath(nsCookieAttributes &aCookie, nsIURI *aHostURI);
+     static bool                   CheckPrefixes(nsCookieAttributes &aCookie, bool aSecureRequest);
+     static bool                   GetExpiry(nsCookieAttributes &aCookie, int64_t aServerTime, int64_t aCurrentTime);
+     void                          RemoveAllFromMemory();
+     already_AddRefed<nsIArray>    PurgeCookies(int64_t aCurrentTimeInUsec);
+     bool                          FindCookie(const nsCookieKey& aKey, const nsCString& aHost, const nsCString& aName, const nsCString& aPath, nsListIter &aIter);
+     bool                          FindSecureCookie(const nsCookieKey& aKey, nsCookie* aCookie);
+     int64_t                       FindStaleCookie(nsCookieEntry *aEntry, int64_t aCurrentTime, nsIURI* aSource, const mozilla::Maybe<bool> &aIsSecure, nsListIter &aIter);

+ 69 - 0
mozilla-release/patches/1893388-11512.patch

@@ -0,0 +1,69 @@
+# HG changeset patch
+# User alwu <alwu@mozilla.com>
+# Date 1715711976 0
+# Node ID aed8b21bdcd29217faec25369a330dde60298066
+# Parent  9f164c16a5326daff00390d3fdf9b898dcdcf405
+Bug 1893388 - clone video info properly. r=media-playback-reviewers,chunmin a=RyanVM
+
+Differential Revision: https://phabricator.services.mozilla.com/D209439
+
+diff --git a/dom/media/MediaInfo.h b/dom/media/MediaInfo.h
+--- a/dom/media/MediaInfo.h
++++ b/dom/media/MediaInfo.h
+@@ -215,29 +215,43 @@ public:
+     , mImage(aSize)
+     , mCodecSpecificConfig(new MediaByteBuffer)
+     , mExtraData(new MediaByteBuffer)
+     , mRotation(kDegree_0)
+     , mImageRect(gfx::IntRect(gfx::IntPoint(), aSize))
+   {
+   }
+ 
+-  VideoInfo(const VideoInfo& aOther)
+-    : TrackInfo(aOther)
+-    , mDisplay(aOther.mDisplay)
+-    , mStereoMode(aOther.mStereoMode)
+-    , mImage(aOther.mImage)
+-    , mCodecSpecificConfig(aOther.mCodecSpecificConfig)
+-    , mExtraData(aOther.mExtraData)
+-    , mRotation(aOther.mRotation)
+-    , mBitDepth(aOther.mBitDepth)
+-    , mImageRect(aOther.mImageRect)
+-    , mAlphaPresent(aOther.mAlphaPresent)
+-  {
+-  }
++  VideoInfo(const VideoInfo& aOther) : TrackInfo(aOther) {
++    if (aOther.mCodecSpecificConfig) {
++      mCodecSpecificConfig = new MediaByteBuffer();
++      mCodecSpecificConfig->AppendElements(
++          reinterpret_cast<uint8_t*>(aOther.mCodecSpecificConfig->Elements()),
++          aOther.mCodecSpecificConfig->Length());
++    }
++    if (aOther.mExtraData) {
++      mExtraData = new MediaByteBuffer();
++      mExtraData->AppendElements(
++          reinterpret_cast<uint8_t*>(aOther.mExtraData->Elements()),
++          aOther.mExtraData->Length());
++    }
++    mDisplay = aOther.mDisplay;
++    mStereoMode = aOther.mStereoMode;
++    mImage = aOther.mImage;
++    mRotation = aOther.mRotation;
++    mBitDepth = aOther.mBitDepth;
++    // mColorDepth = aOther.mColorDepth;
++    // mColorSpace = aOther.mColorSpace;
++    // mColorPrimaries = aOther.mColorPrimaries;
++    // mTransferFunction = aOther.mTransferFunction;
++    // mColorRange = aOther.mColorRange;
++    mImageRect = aOther.mImageRect;
++    mAlphaPresent = aOther.mAlphaPresent;
++    // mFrameRate = aOther.mFrameRate;
++  };
+ 
+   bool IsValid() const override
+   {
+     return mDisplay.width > 0 && mDisplay.height > 0;
+   }
+ 
+   VideoInfo* GetAsVideoInfo() override
+   {

+ 60 - 0
mozilla-release/patches/1895086-11512.patch

@@ -0,0 +1,60 @@
+# HG changeset patch
+# User Jon Coppeard <jcoppeard@mozilla.com>
+# Date 1715761236 0
+# Node ID 25c5e091b01e6b64378d0b5c43fe93bceea31eda
+# Parent  62f6be2a4bc74b50217e801829f19e3c4d37311a
+Bug 1895086 - Suppress GC during JSObject::swap r=jandem a=RyanVM
+
+We already suppress GC for part of this, but not for the part where we call
+JSObject::setIsUsedAsPrototype. This can GC (which was surprising to me) and so
+we can sweep before the pre-write barrier which comes after this.
+
+The simplest and safest thing is to suppress GC for the whole method.
+
+Differential Revision: https://phabricator.services.mozilla.com/D209813
+
+diff --git a/js/src/vm/JSObject.cpp b/js/src/vm/JSObject.cpp
+--- a/js/src/vm/JSObject.cpp
++++ b/js/src/vm/JSObject.cpp
+@@ -1637,16 +1637,20 @@ JSObject::swap(JSContext* cx, HandleObje
+ 
+     AutoEnterOOMUnsafeRegion oomUnsafe;
+ 
+     if (!JSObject::getGroup(cx, a))
+         oomUnsafe.crash("JSObject::swap");
+     if (!JSObject::getGroup(cx, b))
+         oomUnsafe.crash("JSObject::swap");
+ 
++    // Don't allow a GC which may observe intermediate state or run before we
++    // execute all necessary barriers.
++    AutoSuppressGC suppress(cx);
++
+     /*
+      * Neither object may be in the nursery, but ensure we update any embedded
+      * nursery pointers in either object.
+      */
+     MOZ_ASSERT(!IsInsideNursery(a) && !IsInsideNursery(b));
+     cx->zone()->group()->storeBuffer().putWholeCell(a);
+     cx->zone()->group()->storeBuffer().putWholeCell(b);
+ 
+@@ -1694,20 +1698,16 @@ JSObject::swap(JSContext* cx, HandleObje
+         a->fixDictionaryShapeAfterSwap();
+         b->fixDictionaryShapeAfterSwap();
+ 
+         if (aIsProxyWithInlineValues)
+             b->as<ProxyObject>().setInlineValueArray();
+         if (bIsProxyWithInlineValues)
+             a->as<ProxyObject>().setInlineValueArray();
+     } else {
+-        // Avoid GC in here to avoid confusing the tracing code with our
+-        // intermediate state.
+-        AutoSuppressGC suppress(cx);
+-
+         // When the objects have different sizes, they will have different
+         // numbers of fixed slots before and after the swap, so the slots for
+         // native objects will need to be rearranged.
+         NativeObject* na = a->isNative() ? &a->as<NativeObject>() : nullptr;
+         NativeObject* nb = b->isNative() ? &b->as<NativeObject>() : nullptr;
+ 
+         // Remember the original values from the objects.
+         Vector<Value> avals(cx);

+ 40 - 0
mozilla-release/patches/1896208-127a1.patch

@@ -0,0 +1,40 @@
+# HG changeset patch
+# User Daniel Holbert <dholbert@cs.stanford.edu>
+# Date 1715379035 0
+# Node ID 7173962a524e0dbec8da6b4046b8611937e6ea71
+# Parent  71ca1fe69f244d2b1164849d4a6a11cfc2d8079e
+Bug 1896208: Initialize PerSpanData members mBaseline and mReflowInput. r=layout-reviewers,emilio
+
+We null-initialize all of the other pointer members in NewPerSpanData; we
+should do the same for these ones, for consistency & robustness.
+
+(In practice, the callers end up initializing these members before reading them
+anyway, so it's been benign that we weren't initializing them.  But better
+for safety & futureproofing to have them reliably initialized.)
+
+Differential Revision: https://phabricator.services.mozilla.com/D210106
+
+diff --git a/layout/generic/nsLineLayout.cpp b/layout/generic/nsLineLayout.cpp
+--- a/layout/generic/nsLineLayout.cpp
++++ b/layout/generic/nsLineLayout.cpp
+@@ -385,18 +385,20 @@ nsLineLayout::NewPerSpanData()
+   }
+   else {
+     outerLineLayout->mSpanFreeList = psd->mNextFreeSpan;
+   }
+   psd->mParent = nullptr;
+   psd->mFrame = nullptr;
+   psd->mFirstFrame = nullptr;
+   psd->mLastFrame = nullptr;
++  psd->mReflowInput = nullptr;
+   psd->mContainsFloat = false;
+   psd->mHasNonemptyContent = false;
++  psd->mBaseline = nullptr;
+ 
+ #ifdef DEBUG
+   outerLineLayout->mSpansAllocated++;
+ #endif
+   return psd;
+ }
+ 
+ void

+ 1 - 1
mozilla-release/patches/TOP-NOBUG-seamonkey-credits.patch → mozilla-release/patches/1902935-seamonkey-credits-25320.patch

@@ -1,7 +1,7 @@
 # HG changeset patch
 # User Bill Gianopoulos <wgianopoulos@gmail.com>
 # Date 1712347236 0
-No Bug - Implement SeaMonkey about:credits.
+Bug 1898467 - Implement SeaMonkey about:credits. r=IanN
 
 diff --git a/docshell/base/nsAboutRedirector.cpp b/docshell/base/nsAboutRedirector.cpp
 --- a/docshell/base/nsAboutRedirector.cpp

+ 1 - 46
mozilla-release/patches/TOP-NOBUG-PLASTER-PY310_support-25314.patch

@@ -1,7 +1,7 @@
 # HG changeset patch
 # User Myckel Habets <gentoo-bugs@habets-dobben.nl>
 # Date 1660912651 -7200
-# Parent  ac511f47ef1a7d5d9cf5055ab81c327128d6dcca
+# Parent  6e93510e8869e51460a7623215085a4489f5c58d
 No Bug - Support Python 3.10. r=frg a=frg
 
 Python 3.10 changed a lot of things, but while migrating from Python 2.7
@@ -12,51 +12,6 @@ we catch up with Python 3.10 and drop Python 2.7.
 - Various classes from collections moved to collections.abc, which isn't
   present in Python 2.7
 
-diff --git a/third_party/python/pyyaml/lib3/yaml/constructor.py b/third_party/python/pyyaml/lib3/yaml/constructor.py
---- a/third_party/python/pyyaml/lib3/yaml/constructor.py
-+++ b/third_party/python/pyyaml/lib3/yaml/constructor.py
-@@ -2,16 +2,22 @@
- __all__ = ['BaseConstructor', 'SafeConstructor', 'Constructor',
-     'ConstructorError']
- 
- from .error import *
- from .nodes import *
- 
- import collections, datetime, base64, binascii, re, sys, types
- 
-+try:
-+    from collections.abc import Hashable
-+except ImportError:
-+    from collections import Hashable
-+
-+
- class ConstructorError(MarkedYAMLError):
-     pass
- 
- class BaseConstructor:
- 
-     yaml_constructors = {}
-     yaml_multi_constructors = {}
- 
-@@ -118,17 +124,17 @@ class BaseConstructor:
-     def construct_mapping(self, node, deep=False):
-         if not isinstance(node, MappingNode):
-             raise ConstructorError(None, None,
-                     "expected a mapping node, but found %s" % node.id,
-                     node.start_mark)
-         mapping = {}
-         for key_node, value_node in node.value:
-             key = self.construct_object(key_node, deep=deep)
--            if not isinstance(key, collections.Hashable):
-+            if not isinstance(key, Hashable):
-                 raise ConstructorError("while constructing a mapping", node.start_mark,
-                         "found unhashable key", key_node.start_mark)
-             value = self.construct_object(value_node, deep=deep)
-             mapping[key] = value
-         return mapping
- 
-     def construct_pairs(self, node, deep=False):
-         if not isinstance(node, MappingNode):
 diff --git a/third_party/python/requests/requests/cookies.py b/third_party/python/requests/requests/cookies.py
 --- a/third_party/python/requests/requests/cookies.py
 +++ b/third_party/python/requests/requests/cookies.py

+ 132 - 122
mozilla-release/patches/series

@@ -6774,128 +6774,6 @@ NOBUG-nukemozlinker-25319.patch
 1881183-PARTIAL-NOTESTS-11510.patch
 1890514-11511.patch
 1893340-PARTIAL-NOTESTS-11511.patch
-PPPPPPP-check_stdcxx-warn.patch
-PPPPPPP-NOBUG-PLASTER-getrandom.patch
-PPPPPPP-NSSgetentropy.patch
-WIP-1729459-comment25.patch
-TOP-1294490-7-PLASTER-webp-2535.patch
-TOP-1493400-6-PLASTER-dav1d-avoid-mColorDepth-2535.patch
-TOP-1445683-14-PLASTER-aom-fix-win32-bustage-2535.patch
-TOP-1683545-PLASTER-webrender-2536.patch
-TOP-1667581-3-PLASTER-2537.patch
-TOP-1469021-PLASTER-2538.patch
-TOP-1699835-PARTIAL-7810.patch
-TOP-1758291-fixgithubpolyfill-253111.patch
-TOP-1398895-2a-57a1.patch
-TOP-NOBUG-skiptests-25312.patch
-TOP-NOBUG-dav1d-V1-support-25312.patch
-TOP-NOBUG-unfiedloadicon-25312.patch
-TOP-NOBUG-nometadata-25312.patch
-TOP-1641640-BACKOUT-25313.patch
-TOP-NOBUG-fixup-VS2022-25313.patch
-TOP-1779027-freebsd-25314.patch
-TOP-1722226-aarch64-webrtc-25315.patch
-TOP-NOBUG-nsslink-25315.patch
-TOP-NOBUG-fixtests-25315.patch
-TOP-NOBUG-nonodedefault-25315.patch
-TOP-1797696-macos11sdk-25315.patch
-TOP-1804537-macostransparent-25315.patch
-TOP-1804537-macosfullscreen-25315.patch
-TOP-1804539-fixlangpack-25316.patch
-TOP-1750671-1only-PARTIAL-98a1.patch
-TOP-1788837-PARTIAL-108a1.patch
-TOP-1807802-shared-tree-styling-25316.patch
-TOP-1584803-rust133.patch
-TOP-1584803-rust135.patch
-TOP-NOBUG-cubeb-25317.patch
-TOP-NOBUG-PLASTER-IOSurface-fix-25317.patch
-TOP-1512450-60.patch
-TOP-NOBUG-texth-25317.patch
-TOP-NOBUG-PLASTER-wayland-25317.patch
-TOP-NOBUG-PLASTER-Stylo-25314.patch
-TOP-1834230-HTMLTableEditor-tb-td-25317.patch
-TOP-1794292-1-10210.patch
-TOP-1794292-2-10210.patch
-TOP-NOBUG-PLASTER-PY3-Codegen-25317.patch
-TOP-NOBUG-PLASTER-PY3-GenerateCSSPropsGenerated-25317.patch
-TOP-NOBUG-PLASTER-PY3-idl-parser-25317.patch
-TOP-NOBUG-PLASTER-PY3-typelib-25317.patch
-TOP-NOBUG-PLASTER-PY3-check_binary-25317.patch
-TOP-1620143-PARTIAL-PY3-dependentlibs-75a1.patch
-TOP-NOBUG-PLASTER-PY3-25317.patch
-TOP-NOBUG-PLASTER-PY310_support-25314.patch
-TOP-NOBUG-enableCE-25318.patch
-TOP-1539694-allsettled-68a1-25313.patch
-TOP-1378808-optchain-63a1-25313.patch
-TOP-1466000-1-optchain-64a1-25313.patch
-TOP-1566143-1to2-optchain-74a1-25313.patch
-TOP-1566143-3-optchain-74a1-25313.patch
-TOP-1610447-optchain-74a1-25313.patch
-TOP-1611777-12-74a1-25313.patch
-TOP-1378808-2-optchain-63a1-25313.patch
-TOP-NOBUG-optchain-baselinejit-25313.patch
-TOP-1629106-1-logicassign-25317.patch
-TOP-1629106-2-logicassign-25317.patch
-TOP-NOBUG-revendor-25318.patch
-TOP-NOBUG-backout1440761-25318.patch
-TOP-NOBUG-test-fixes-25318.patch
-TOP-NOBUG-PLASTER-fix-strip-25319.patch
-TOP-1707096-91a1.patch
-TOP-1846703-binutilsfix-11504.patch
-TOP-1859635-NSS3901-11506.patch
-TOP-NOBUG-killtelemetry-debugger-25319.patch
-TOP-1472170-PARTIAL-NOTESTS-63a1.patch
-TOP-1864587-angle-11507.patch
-TOP-1880562-NSS3902-11509.patch
-TOP-NOBUG-REGEXP-01-Import-25318.patch
-TOP-NOBUG-REGEXP-02-1361856-1-dotall-76a1-25318.patch
-TOP-NOBUG-REGEXP-03-1537978-68a1-25318.patch
-TOP-NOBUG-REGEXP-04-1539690-68a1-25318.patch
-TOP-NOBUG-REGEXP-05-1546300-68a1-25318.patch
-TOP-NOBUG-REGEXP-06-1504947-10-68a1-25318.patch
-TOP-NOBUG-REGEXP-07-1626713-76a1-25318.patch
-TOP-NOBUG-REGEXP-09-1627356-77a1-25318.patch
-TOP-NOBUG-REGEXP-10-1627838-77a1-25318.patch
-TOP-NOBUG-REGEXP-11-deunify-25318.patch
-TOP-NOBUG-REGEXP-12-1628835-77a1-25318.patch
-TOP-NOBUG-REGEXP-13-1361856-2-76a1-25318.patch
-TOP-NOBUG-REGEXP-14-1629670-77a1-25318.patch
-TOP-NOBUG-REGEXP-15-1630090-1-77a1-25318.patch
-TOP-NOBUG-REGEXP-16-1630090-2-77a1-25318.patch
-TOP-1630383-02-77a1.patch
-TOP-1630383-07-77a1.patch
-TOP-NOBUG-REGEXP-17-1630383-77a1-25318.patch
-TOP-1607405-77a1.patch
-TOP-1636495-1-78a1.patch
-TOP-1636495-2no3-78a1.patch
-TOP-1637199-78a1.patch
-TOP-NOBUG-REGEXP-21-1634135-78a1-25318.patch
-TOP-NOBUG-REGEXP-22-1637977-78a1-25318.patch
-TOP-NOBUG-REGEXP-23-1637913-78a1-25318.patch
-TOP-NOBUG-REGEXP-24-1631504-77a1-25318.patch
-TOP-NOBUG-REGEXP-25-1362154-1to4-78a1-25318.patch
-TOP-NOBUG-REGEXP-26-1362154-5to9-78a1-25318.patch
-TOP-NOBUG-REGEXP-27-1640487-78a1-25318.patch
-TOP-NOBUG-REGEXP-28-1640592-78a1-25318.patch
-TOP-NOBUG-REGEXP-29-1640475-78a1-25318.patch
-TOP-NOBUG-REGEXP-30-1640479-78a1-25318.patch
-TOP-NOBUG-REGEXP-31-1640473-78a1-25318.patch
-TOP-NOBUG-REGEXP-32-1638154-78a1-25318.patch
-TOP-NOBUG-REGEXP-33-1641352-79a1-25318.patch
-TOP-NOBUG-REGEXP-34-1361856-1-fix-76a1-25318.patch
-TOP-NOBUG-REGEXP-35-1435829-66a1-25318.patch
-TOP-NOBUG-REGEXP-36-1590543-73a1-25318.patch
-TOP-NOBUG-REGEXP-37-1642493-79a1-25318.patch
-TOP-NOBUG-REGEXP-38-1662073-82a1-25318.patch
-TOP-NOBUG-REGEXP-39-1681084-1-85a1-25318.patch
-TOP-NOBUG-REGEXP-40-1644590-79a1-25318.patch
-TOP-NOBUG-REGEXP-41-1667094-83a1-25318.patch
-TOP-NOBUG-REGEXP-42-1643171-79a1-25318.patch
-TOP-NOBUG-REGEXP-43-1691184-88a1-25318.patch
-TOP-NOBUG-REGEXP-44-irregexp-25318.patch
-TOP-NOBUG-REGEXP-45-final-25318.patch
-TOP-NOBUG-REGEXP-46-fixes-25318.patch
-TOP-NOBUG-seamonkey-credits.patch
 1897801-about-seamonkey-mozilla-25319.patch
 1449035-62a1.patch
 1485454-1-63a1.patch
@@ -7071,9 +6949,141 @@ TOP-NOBUG-seamonkey-credits.patch
 1713613-3-91a1.patch
 1713610-91a1.patch
 1799483-108a1.patch
+1743896-96a1.patch
+1779993-PARTIAL-NOTESTS-105a1.patch
+1784990-106a1.patch
+1845018-117a1.patch
+1193389-125a1.patch
 1870579-126a1.patch
+1891349-127a1.patch
 1892449-127a1.patch
 1893891-127a1.patch
+1896208-127a1.patch
+1895086-11512.patch
+1893388-11512.patch
+1891234-PARTIAL-NOTESTS-11512.patch
+PPPPPPP-check_stdcxx-warn.patch
+PPPPPPP-NOBUG-PLASTER-getrandom.patch
+PPPPPPP-NSSgetentropy.patch
+WIP-1729459-comment25.patch
+TOP-1294490-7-PLASTER-webp-2535.patch
+TOP-1493400-6-PLASTER-dav1d-avoid-mColorDepth-2535.patch
+TOP-1445683-14-PLASTER-aom-fix-win32-bustage-2535.patch
+TOP-1683545-PLASTER-webrender-2536.patch
+TOP-1667581-3-PLASTER-2537.patch
+TOP-1469021-PLASTER-2538.patch
+TOP-1699835-PARTIAL-7810.patch
+TOP-1758291-fixgithubpolyfill-253111.patch
+TOP-1398895-2a-57a1.patch
+TOP-NOBUG-skiptests-25312.patch
+TOP-NOBUG-dav1d-V1-support-25312.patch
+TOP-NOBUG-unfiedloadicon-25312.patch
+TOP-NOBUG-nometadata-25312.patch
+TOP-1641640-BACKOUT-25313.patch
+TOP-NOBUG-fixup-VS2022-25313.patch
+TOP-1779027-freebsd-25314.patch
+TOP-1722226-aarch64-webrtc-25315.patch
+TOP-NOBUG-nsslink-25315.patch
+TOP-NOBUG-fixtests-25315.patch
+TOP-NOBUG-nonodedefault-25315.patch
+TOP-1797696-macos11sdk-25315.patch
+TOP-1804537-macostransparent-25315.patch
+TOP-1804537-macosfullscreen-25315.patch
+TOP-1804539-fixlangpack-25316.patch
+TOP-1750671-1only-PARTIAL-98a1.patch
+TOP-1788837-PARTIAL-108a1.patch
+TOP-1807802-shared-tree-styling-25316.patch
+TOP-1584803-rust133.patch
+TOP-1584803-rust135.patch
+TOP-NOBUG-cubeb-25317.patch
+TOP-NOBUG-PLASTER-IOSurface-fix-25317.patch
+TOP-1512450-60.patch
+TOP-NOBUG-texth-25317.patch
+TOP-NOBUG-PLASTER-wayland-25317.patch
+TOP-NOBUG-PLASTER-Stylo-25314.patch
+TOP-1834230-HTMLTableEditor-tb-td-25317.patch
+TOP-1794292-1-10210.patch
+TOP-1794292-2-10210.patch
+TOP-NOBUG-PLASTER-PY3-Codegen-25317.patch
+TOP-NOBUG-PLASTER-PY3-GenerateCSSPropsGenerated-25317.patch
+TOP-NOBUG-PLASTER-PY3-idl-parser-25317.patch
+TOP-NOBUG-PLASTER-PY3-typelib-25317.patch
+TOP-NOBUG-PLASTER-PY3-check_binary-25317.patch
+TOP-1620143-PARTIAL-PY3-dependentlibs-75a1.patch
+TOP-NOBUG-PLASTER-PY3-25317.patch
+TOP-NOBUG-PLASTER-PY310_support-25314.patch
+TOP-NOBUG-enableCE-25318.patch
+TOP-1539694-allsettled-68a1-25313.patch
+TOP-1378808-optchain-63a1-25313.patch
+TOP-1466000-1-optchain-64a1-25313.patch
+TOP-1566143-1to2-optchain-74a1-25313.patch
+TOP-1566143-3-optchain-74a1-25313.patch
+TOP-1610447-optchain-74a1-25313.patch
+TOP-1611777-12-74a1-25313.patch
+TOP-1378808-2-optchain-63a1-25313.patch
+TOP-NOBUG-optchain-baselinejit-25313.patch
+TOP-1629106-1-logicassign-25317.patch
+TOP-1629106-2-logicassign-25317.patch
+TOP-NOBUG-revendor-25318.patch
+TOP-NOBUG-backout1440761-25318.patch
+TOP-NOBUG-test-fixes-25318.patch
+TOP-NOBUG-PLASTER-fix-strip-25319.patch
+TOP-1707096-91a1.patch
+TOP-1846703-binutilsfix-11504.patch
+TOP-1859635-NSS3901-11506.patch
+TOP-NOBUG-killtelemetry-debugger-25319.patch
+TOP-1472170-PARTIAL-NOTESTS-63a1.patch
+TOP-1864587-angle-11507.patch
+TOP-1880562-NSS3902-11509.patch
+TOP-NOBUG-REGEXP-01-Import-25318.patch
+TOP-NOBUG-REGEXP-02-1361856-1-dotall-76a1-25318.patch
+TOP-NOBUG-REGEXP-03-1537978-68a1-25318.patch
+TOP-NOBUG-REGEXP-04-1539690-68a1-25318.patch
+TOP-NOBUG-REGEXP-05-1546300-68a1-25318.patch
+TOP-NOBUG-REGEXP-06-1504947-10-68a1-25318.patch
+TOP-NOBUG-REGEXP-07-1626713-76a1-25318.patch
+TOP-NOBUG-REGEXP-09-1627356-77a1-25318.patch
+TOP-NOBUG-REGEXP-10-1627838-77a1-25318.patch
+TOP-NOBUG-REGEXP-11-deunify-25318.patch
+TOP-NOBUG-REGEXP-12-1628835-77a1-25318.patch
+TOP-NOBUG-REGEXP-13-1361856-2-76a1-25318.patch
+TOP-NOBUG-REGEXP-14-1629670-77a1-25318.patch
+TOP-NOBUG-REGEXP-15-1630090-1-77a1-25318.patch
+TOP-NOBUG-REGEXP-16-1630090-2-77a1-25318.patch
+TOP-1630383-02-77a1.patch
+TOP-1630383-07-77a1.patch
+TOP-NOBUG-REGEXP-17-1630383-77a1-25318.patch
+TOP-1607405-77a1.patch
+TOP-1636495-1-78a1.patch
+TOP-1636495-2no3-78a1.patch
+TOP-1637199-78a1.patch
+TOP-NOBUG-REGEXP-21-1634135-78a1-25318.patch
+TOP-NOBUG-REGEXP-22-1637977-78a1-25318.patch
+TOP-NOBUG-REGEXP-23-1637913-78a1-25318.patch
+TOP-NOBUG-REGEXP-24-1631504-77a1-25318.patch
+TOP-NOBUG-REGEXP-25-1362154-1to4-78a1-25318.patch
+TOP-NOBUG-REGEXP-26-1362154-5to9-78a1-25318.patch
+TOP-NOBUG-REGEXP-27-1640487-78a1-25318.patch
+TOP-NOBUG-REGEXP-28-1640592-78a1-25318.patch
+TOP-NOBUG-REGEXP-29-1640475-78a1-25318.patch
+TOP-NOBUG-REGEXP-30-1640479-78a1-25318.patch
+TOP-NOBUG-REGEXP-31-1640473-78a1-25318.patch
+TOP-NOBUG-REGEXP-32-1638154-78a1-25318.patch
+TOP-NOBUG-REGEXP-33-1641352-79a1-25318.patch
+TOP-NOBUG-REGEXP-34-1361856-1-fix-76a1-25318.patch
+TOP-NOBUG-REGEXP-35-1435829-66a1-25318.patch
+TOP-NOBUG-REGEXP-36-1590543-73a1-25318.patch
+TOP-NOBUG-REGEXP-37-1642493-79a1-25318.patch
+TOP-NOBUG-REGEXP-38-1662073-82a1-25318.patch
+TOP-NOBUG-REGEXP-39-1681084-1-85a1-25318.patch
+TOP-NOBUG-REGEXP-40-1644590-79a1-25318.patch
+TOP-NOBUG-REGEXP-41-1667094-83a1-25318.patch
+TOP-NOBUG-REGEXP-42-1643171-79a1-25318.patch
+TOP-NOBUG-REGEXP-43-1691184-88a1-25318.patch
+TOP-NOBUG-REGEXP-44-irregexp-25318.patch
+TOP-NOBUG-REGEXP-45-final-25318.patch
+TOP-NOBUG-REGEXP-46-fixes-25318.patch
 1861843-2-version-beta-mr-25319.patch
 1902849-version-release-mr-25319.patch
 1902851-1-version-prebeta-mr-25320.patch
+1902935-seamonkey-credits-25320.patch

Some files were not shown because too many files changed in this diff