diff --git a/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs b/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs
index 7b78c9bf89e..1a320eec009 100644
--- a/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs
+++ b/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs
@@ -28,11 +28,6 @@ public class TimeConsumingTests : GeneralStateTestBase
[TestCaseSource(nameof(LoadTests))]
public void Test(GeneralStateTest test)
{
- if (test.Name.Contains("CALLBlake2f_MaxRound"))
- {
- return;
- }
-
Assert.True(RunTest(test).Pass);
}
diff --git a/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs b/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs
index 7f190325e6f..d3336b3aa35 100644
--- a/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs
+++ b/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs
@@ -21,7 +21,7 @@
using Nethermind.Core;
using Nethermind.Core.Crypto;
using Nethermind.Core.Extensions;
-using Nethermind.Crypto;
+using Nethermind.Crypto.Blake2;
namespace Nethermind.Benchmarks.Evm
{
diff --git a/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs b/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs
index a5a8e74b0a4..d8c98162b17 100644
--- a/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs
+++ b/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs
@@ -14,28 +14,80 @@
// You should have received a copy of the GNU Lesser General Public License
// along with the Nethermind. If not, see <https://www.gnu.org/licenses/>.
+using System.Collections.Generic;
using FluentAssertions;
using Nethermind.Core.Extensions;
-using Nethermind.Crypto;
+using Nethermind.Crypto.Blake2;
using NUnit.Framework;
namespace Nethermind.Core.Test
{
public class Blake2Tests
{
+ private readonly Blake2Compression _blake2Compression = new();
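+ // Suffix shared by all EIP-152 vectors: 64 bytes h (state), 128 bytes m (message block), 16 bytes t
+ // (offset counters) and 1 byte f (final-block flag); each case prepends a 4-byte big-endian rounds value.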
+ const string InputExceptRounds = "48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001";
+
[Test]
[TestCase("0000000048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "08c9bcf367e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d282e6ad7f520e511f6c3e2b8c68059b9442be0454267ce079217e1319cde05b")]
[TestCase("0000000c48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d17d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923")]
[TestCase("0000000c48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000000", "75ab69d3190a562c51aef8d88f1c2775876944407270c42c9844252c26d2875298743e7f6d5ea2f2d3e8d226039cd31b4e426ac4f2d3d666a610c2116fde4735")]
[TestCase("0000000148c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "b63a380cb2897d521994a85234ee2c181b5f844d2c624c002677e9703449d2fba551b3a8333bcdf5f2f7e08993d53923de3d64fcc68c034e717b9293fed7a421")]
-// [TestCase("ffffffff48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "fc59093aafa9ab43daae0e914c57635c5402d8e3d2130eb9b3cc181de7f0ecf9b22bf99a7815ce16419e200e01846e6b5df8cc7703041bbceb571de6631d2615")]
+ [TestCase("ffffffff48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "fc59093aafa9ab43daae0e914c57635c5402d8e3d2130eb9b3cc181de7f0ecf9b22bf99a7815ce16419e200e01846e6b5df8cc7703041bbceb571de6631d2615")]
public void compression_function_should_return_valid_output(string input, string output)
{
- Blake2Compression? blake2Optimized = new();
byte[] blake2Result = new byte[64];
- blake2Optimized.Compress(Bytes.FromHexString(input), blake2Result);
+ _blake2Compression.Compress(Bytes.FromHexString(input), blake2Result);
string? result = blake2Result.ToHexString();
result.Should().BeEquivalentTo(output);
}
+
+ [TestCaseSource(nameof(TestCaseSource))]
+ public void avx2_should_compute_correct_values((int Rounds, string Output) testCase)
+ {
+ (int rounds, string output) = testCase;
+ Test(rounds, output, Blake2CompressMethod.Avx2);
+ }
+
+ [TestCaseSource(nameof(TestCaseSource))]
+ public void sse41_should_compute_correct_values((int Rounds, string Output) testCase)
+ {
+ (int rounds, string output) = testCase;
+ Test(rounds, output, Blake2CompressMethod.Sse41);
+ }
+
+ [TestCaseSource(nameof(TestCaseSource))]
+ public void scalar_should_compute_correct_values((int Rounds, string Output) testCase)
+ {
+ (int rounds, string output) = testCase;
+ Test(rounds, output, Blake2CompressMethod.Scalar);
+ }
+
+ private void Test(int rounds, string output, Blake2CompressMethod method)
+ {
+ string input = string.Concat(rounds.ToString("x8"), InputExceptRounds);
+
+ byte[] blake2Result = new byte[64];
+ _blake2Compression.Compress(Bytes.FromHexString(input), blake2Result, method);
+ string result = blake2Result.ToHexString();
+ result.Should().BeEquivalentTo(output);
+ }
+
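+ // Rounds 0-12 cover every early-exit point of the unrolled ten-round loop, including the wrap back to
+ // the first message schedule; the 10,000,000-round case is a longer stress check for all three kernels.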
+ public static IEnumerable<(int, string)> TestCaseSource()
+ {
+ yield return (0, "08c9bcf367e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d282e6ad7f520e511f6c3e2b8c68059b9442be0454267ce079217e1319cde05b");
+ yield return (1, "b63a380cb2897d521994a85234ee2c181b5f844d2c624c002677e9703449d2fba551b3a8333bcdf5f2f7e08993d53923de3d64fcc68c034e717b9293fed7a421");
+ yield return (2, "2c96ff1bd7926f1b8bcd7824d808fdde9cf850920b625c59f1558bc608fb66a50070f53367230679e4949e7d32baac94f33af05175b7abf3b4972425a7b068ca");
+ yield return (3, "b70b167bd40e83abf720fa83d014b07db1f64ae0a7c0b4d74eace08cd2515ca7927a6d6268d80043628698e31ea7d4a4f69dac2cf3ce6746825f5cff08b401cc");
+ yield return (4, "0d2c9a214539ea7898029c0c95681cab88a360f633fd94ff5fae7d1e184bfab0a598296b7b046dd346ce75add0a457e3076fbc0a72ceff7eb9d4ed790d9356e9");
+ yield return (5, "021e4bc08df8b11f90392a07fc4e86b0d0159d2ff06f5c329a793847e4f0c848c6aefce2d2e11ee7a73dfaadbeebfb33e3a4ad083bfd3b4e93e7b23621a97960");
+ yield return (6, "a12a6af6b6d84ace0a8fcff0ae165e91b7de3bf70d9f19405e8701f2ea69ef1ed9e0206d78e61aa7867536b6982938c361e6a84ee1be15bc13b14adcd38459a1");
+ yield return (7, "125dd3e4baa7f300be309deab1181db034967cc20ebecc3c0de038b0a714afaa744cea00cd843042b75c25b1d2e3931d2203111e871f35723741418117efe781");
+ yield return (8, "59d8d7cbf70b0336e6f4f7a20d2ebd05f9b27ad7bb278faff380c206b68962ae630e8a4d2af1dce8a853cd722ad174e259c7ca284137fe52b61524fb5fe327f7");
+ yield return (9, "69fbbdf42d5f5f2eb657faaa82862c9a492237cbb93ffd9938ff7b757671fac0a19b9f27d130b78180d070f9b9b96ee1bb1d69e2edae0c1b7602f2f2e0977614");
+ yield return (10, "5a4308e0e1daede181b47775d926a6b4b6a0adf86d05bfea696fac45f08419623976bd3c786f61500b9f94a043b9dcf397e38ee237f3c273a7d812be20874f5a");
+ yield return (11, "60faa8f91624b2b718210df242b788c7ae887e953dce3c7f80862bc5e4f88d827cada4d95d2c4ac41eb66b84fcdc0e12ab0c66f4d9d546ff8a0d712f324e1845");
+ yield return (12, "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d17d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923");
+ yield return (10_000_000, "5b6d1ca8ee5370f08008240579096021dcf8860de693cc8f5a1476ba70c3b32ba8f93c62a0b2fbcd305caaa22bc96e0dbab199a65fcd234e31404ca4b1766252");
+ }
}
}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Avx2.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Avx2.cs
new file mode 100644
index 00000000000..914a4786ac8
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Avx2.cs
@@ -0,0 +1,721 @@
+// Copyright (c) 2022 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <https://www.gnu.org/licenses/>.
+//
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Nethermind.Crypto.Blake2;
+
+/// <summary>
+/// Code adapted from Blake2Fast (https://github.com/saucecontrol/Blake2Fast)
+/// </summary>
+public unsafe partial class Blake2Compression
+{
+ // SIMD algorithm described in https://eprint.iacr.org/2012/275.pdf
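+ // The 4x4 matrix of 64-bit state words is held as four 256-bit rows (row1/row2 = h, row3/row4 = IV
+ // xor t/f); each G1/G2 pair below mixes four columns, or four diagonals after DIAGONALIZE, in parallel.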
+ [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+ [SkipLocalsInit]
+ private static void ComputeAvx2(ulong* sh, ulong* m, uint rounds)
+ {
+ // Rotate shuffle masks. We can safely convert the ref to a pointer because the compiler guarantees the
+ // data is in a fixed location, and the ref itself is converted from a pointer. Same for the IV below.
+ byte* prm = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(Rormask));
+ var r24 = Avx2.BroadcastVector128ToVector256(prm);
+ var r16 = Avx2.BroadcastVector128ToVector256(prm + Vector128<byte>.Count);
+
+ var row1 = Avx.LoadVector256(sh);
+ var row2 = Avx.LoadVector256(sh + Vector256<ulong>.Count);
+
+ ulong* piv = (ulong*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(Ivle));
+ var row3 = Avx.LoadVector256(piv);
+ var row4 = Avx.LoadVector256(piv + Vector256<ulong>.Count);
+
+ row4 = Avx2.Xor(row4, Avx.LoadVector256(sh + Vector256<ulong>.Count * 2)); // t[] and f[]
+
+ var m0 = Avx2.BroadcastVector128ToVector256(m);
+ var m1 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count);
+ var m2 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count * 2);
+ var m3 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count * 3);
+ var m4 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count * 4);
+ var m5 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count * 5);
+ var m6 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count * 6);
+ var m7 = Avx2.BroadcastVector128ToVector256(m + Vector128<ulong>.Count * 7);
+ Vector256<ulong> t0;
+ Vector256<ulong> t1;
+ Vector256<ulong> b0;
+
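+ // The ten BLAKE2b message schedules are unrolled in the loop body; the "if (++i == rounds) break"
+ // checks allow an arbitrary EIP-152 round count, wrapping back to the first schedule past round ten.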
+ for (uint i = 0; i < rounds; i++)
+ {
+ //ROUND 1
+ t0 = Avx2.UnpackLow(m0, m1);
+ t1 = Avx2.UnpackLow(m2, m3);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m0, m1);
+ t1 = Avx2.UnpackHigh(m2, m3);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.UnpackLow(m7, m4);
+ t1 = Avx2.UnpackLow(m5, m6);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m7, m4);
+ t1 = Avx2.UnpackHigh(m5, m6);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 2
+ t0 = Avx2.UnpackLow(m7, m2);
+ t1 = Avx2.UnpackHigh(m4, m6);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m5, m4);
+ t1 = Avx2.AlignRight(m3, m7, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.UnpackHigh(m2, m0);
+ t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.AlignRight(m6, m1, 8);
+ t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 3
+ t0 = Avx2.AlignRight(m6, m5, 8);
+ t1 = Avx2.UnpackHigh(m2, m7);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m4, m0);
+ t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.AlignRight(m5, m4, 8);
+ t1 = Avx2.UnpackHigh(m1, m3);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m2, m7);
+ t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 4
+ t0 = Avx2.UnpackHigh(m3, m1);
+ t1 = Avx2.UnpackHigh(m6, m5);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m4, m0);
+ t1 = Avx2.UnpackLow(m6, m7);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.AlignRight(m1, m7, 8);
+ t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m4, m3);
+ t1 = Avx2.UnpackLow(m5, m0);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 5
+ t0 = Avx2.UnpackHigh(m4, m2);
+ t1 = Avx2.UnpackLow(m1, m5);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
+ t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.AlignRight(m7, m1, 8);
+ t1 = Avx2.AlignRight(m3, m5, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m6, m0);
+ t1 = Avx2.UnpackLow(m6, m4);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 6
+ t0 = Avx2.UnpackLow(m1, m3);
+ t1 = Avx2.UnpackLow(m0, m4);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m6, m5);
+ t1 = Avx2.UnpackHigh(m5, m1);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.AlignRight(m2, m0, 8);
+ t1 = Avx2.UnpackHigh(m3, m7);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m4, m6);
+ t1 = Avx2.AlignRight(m7, m2, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 7
+ t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
+ t1 = Avx2.UnpackLow(m7, m2);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m2, m7);
+ t1 = Avx2.AlignRight(m5, m6, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.UnpackLow(m4, m0);
+ t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m5, m3);
+ t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 8
+ t0 = Avx2.UnpackHigh(m6, m3);
+ t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.AlignRight(m7, m5, 8);
+ t1 = Avx2.UnpackHigh(m0, m4);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
+ t1 = Avx2.AlignRight(m4, m7, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m5, m0);
+ t1 = Avx2.UnpackLow(m2, m3);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 9
+ t0 = Avx2.UnpackLow(m3, m7);
+ t1 = Avx2.AlignRight(m0, m5, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackHigh(m7, m4);
+ t1 = Avx2.AlignRight(m4, m1, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.UnpackLow(m5, m6);
+ t1 = Avx2.UnpackHigh(m6, m0);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.AlignRight(m1, m2, 8);
+ t1 = Avx2.AlignRight(m2, m3, 8);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+ if (++i == rounds) break;
+
+ //ROUND 10
+ t0 = Avx2.UnpackLow(m5, m4);
+ t1 = Avx2.UnpackHigh(m3, m0);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.UnpackLow(m1, m2);
+ t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //DIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+ t0 = Avx2.UnpackHigh(m6, m7);
+ t1 = Avx2.UnpackHigh(m4, m1);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+ t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
+ t1 = Avx2.UnpackLow(m7, m6);
+ b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+ row4 = Avx2.Xor(row4, row1);
+ row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+ row3 = Avx2.Add(row3, row4);
+ row2 = Avx2.Xor(row2, row3);
+ row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+ //UNDIAGONALIZE
+ row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+ row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+ row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+ }
+
+ row1 = Avx2.Xor(row1, row3);
+ row2 = Avx2.Xor(row2, row4);
+ row1 = Avx2.Xor(row1, Avx.LoadVector256(sh));
+ row2 = Avx2.Xor(row2, Avx.LoadVector256(sh + Vector256<ulong>.Count));
+
+ Avx.Store(sh, row1);
+ Avx.Store(sh + Vector256<ulong>.Count, row2);
+ }
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2CompressMethod.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2CompressMethod.cs
new file mode 100644
index 00000000000..6d87db5de10
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2CompressMethod.cs
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <https://www.gnu.org/licenses/>.
+//
+
+
+namespace Nethermind.Crypto.Blake2;
+
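+/// <summary>
+/// Selects the compression kernel. Optimal resolves at run time to the fastest path the CPU supports
+/// (AVX2, then SSE4.1, then scalar); the explicit values let callers, such as the tests above, force a path.
+/// </summary>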
+public enum Blake2CompressMethod
+{
+ Avx2,
+ Sse41,
+ Scalar,
+ Optimal
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Compression.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Compression.cs
new file mode 100644
index 00000000000..7c530122744
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Compression.cs
@@ -0,0 +1,99 @@
+// Copyright (c) 2021 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <https://www.gnu.org/licenses/>.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+
+namespace Nethermind.Crypto.Blake2
+{
+ /// <summary>
+ /// Code adapted from pantheon (https://github.com/PegaSysEng/pantheon)
+ /// and from Blake2Fast (https://github.com/saucecontrol/Blake2Fast)
+ /// </summary>
+ public partial class Blake2Compression
+ {
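+ // EIP-152 input layout: rounds (4 bytes, big endian) | h (64 bytes) | m (128 bytes) | t (16 bytes) | f (1 byte),
+ // 213 bytes in total; the offsets below index into that buffer.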
+ const byte NumberOfBytesInUlong = 8;
+ const byte NumberOfHWords = 8;
+ const byte NumberOfMWords = 16;
+ const byte StartOfHWords = 4;
+ const byte StartOfMWords = 68;
+ const byte StartOfTWords = 196;
+ const byte ByteOfFWord = 212;
+
+ private static ReadOnlySpan<byte> Ivle => new byte[]
+ {
+ 0x08, 0xC9, 0xBC, 0xF3, 0x67, 0xE6, 0x09, 0x6A, 0x3B, 0xA7, 0xCA, 0x84, 0x85, 0xAE, 0x67, 0xBB, 0x2B,
+ 0xF8, 0x94, 0xFE, 0x72, 0xF3, 0x6E, 0x3C, 0xF1, 0x36, 0x1D, 0x5F, 0x3A, 0xF5, 0x4F, 0xA5, 0xD1, 0x82,
+ 0xE6, 0xAD, 0x7F, 0x52, 0x0E, 0x51, 0x1F, 0x6C, 0x3E, 0x2B, 0x8C, 0x68, 0x05, 0x9B, 0x6B, 0xBD, 0x41,
+ 0xFB, 0xAB, 0xD9, 0x83, 0x1F, 0x79, 0x21, 0x7E, 0x13, 0x19, 0xCD, 0xE0, 0x5B
+ };
+
+ private static ReadOnlySpan<byte> Rormask => new byte[]
+ {
+ 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, //r24
+ 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 //r16
+ };
+
+ public unsafe void Compress(ReadOnlySpan<byte> input, Span<byte> output, Blake2CompressMethod method = Blake2CompressMethod.Optimal)
+ {
+ // sh length = h words length + t[0] + t[1] + f[0]
+ ulong* sh = stackalloc ulong[NumberOfHWords + 3];
+ ulong* m = stackalloc ulong[NumberOfMWords];
+
+ uint rounds = BinaryPrimitives.ReadUInt32BigEndian(input);
+
+ for (int i = 0; i < NumberOfHWords; i++)
+ {
+ sh[i] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfHWords + i * NumberOfBytesInUlong, NumberOfBytesInUlong)).GetPinnableReference();
+ }
+
+ // t[0]
+ sh[8] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfTWords, NumberOfBytesInUlong)).GetPinnableReference();
+ // t[1]
+ sh[9] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfTWords + NumberOfBytesInUlong, NumberOfBytesInUlong)).GetPinnableReference();
+ // f[0]
+ sh[10] = input[ByteOfFWord] != 0 ? ulong.MaxValue : ulong.MinValue;
+
+ for (int i = 0; i < NumberOfMWords; i++)
+ {
+ m[i] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfMWords + i * NumberOfBytesInUlong, NumberOfBytesInUlong)).GetPinnableReference();
+ }
+
+ switch (method)
+ {
+ case Blake2CompressMethod.Optimal when Avx2.IsSupported:
+ case Blake2CompressMethod.Avx2:
+ ComputeAvx2(sh, m, rounds);
+ break;
+ case Blake2CompressMethod.Optimal when Sse41.IsSupported:
+ case Blake2CompressMethod.Sse41:
+ ComputeSse41(sh, m, rounds);
+ break;
+ default:
+ ComputeScalar(sh, m, rounds);
+ break;
+ }
+
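+ // The updated h words are the 64-byte output of the F compression function.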
+ Span<ulong> outputUlongs = MemoryMarshal.Cast<byte, ulong>(output);
+ for (int offset = 0; offset < NumberOfHWords; offset++)
+ {
+ outputUlongs[offset] = sh[offset];
+ }
+ }
+ }
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Scalar.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Scalar.cs
new file mode 100644
index 00000000000..e1907b50e7a
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Scalar.cs
@@ -0,0 +1,1390 @@
+// Copyright (c) 2022 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <https://www.gnu.org/licenses/>.
+//
+
+using System.Runtime.CompilerServices;
+
+namespace Nethermind.Crypto.Blake2;
+
+/// <summary>
+/// Code adapted from Blake2Fast (https://github.com/saucecontrol/Blake2Fast)
+/// </summary>
+public unsafe partial class Blake2Compression
+{
+ [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+ [SkipLocalsInit]
+ private static void ComputeScalar(ulong* sh, ulong* m, uint rounds)
+ {
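+ // Intrinsics-free path: each round below is one fully inlined BLAKE2b message schedule, with the
+ // 32/24/16/63-bit rotations written as shift/xor pairs.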
+ ulong m00 = m[00];
+ ulong m01 = m[01];
+ ulong m02 = m[02];
+ ulong m03 = m[03];
+ ulong m04 = m[04];
+ ulong m05 = m[05];
+ ulong m06 = m[06];
+ ulong m07 = m[07];
+ ulong m08 = m[08];
+ ulong m09 = m[09];
+ ulong m10 = m[10];
+ ulong m11 = m[11];
+ ulong m12 = m[12];
+ ulong m13 = m[13];
+ ulong m14 = m[14];
+ ulong m15 = m[15];
+
+ ulong v00 = sh[0];
+ ulong v01 = sh[1];
+ ulong v02 = sh[2];
+ ulong v03 = sh[3];
+ ulong v04 = sh[4];
+ ulong v05 = sh[5];
+ ulong v06 = sh[6];
+ ulong v07 = sh[7];
+
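+ // BLAKE2b initialization vector (the same constants as Ivle, here as native 64-bit words).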
+ ulong v08 = 0x6A09E667F3BCC908ul;
+ ulong v09 = 0xBB67AE8584CAA73Bul;
+ ulong v10 = 0x3C6EF372FE94F82Bul;
+ ulong v11 = 0xA54FF53A5F1D36F1ul;
+ ulong v12 = 0x510E527FADE682D1ul;
+ ulong v13 = 0x9B05688C2B3E6C1Ful;
+ ulong v14 = 0x1F83D9ABFB41BD6Bul;
+ ulong v15 = 0x5BE0CD19137E2179ul;
+
+ v12 ^= sh[8]; // t[0]
+ v13 ^= sh[9]; // t[1]
+ v14 ^= sh[10]; // f[0]
+
+ for (uint i = 0; i < rounds; i++)
+ {
+ //ROUND 1
+ v00 += m00;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m02;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m04;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m06;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m05;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m07;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m01;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m03;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m08;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m10;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m12;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m14;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m13;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m15;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m09;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m11;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 2
+ v00 += m14;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m04;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m09;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m13;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m15;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m06;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m10;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m08;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m01;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m00;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m11;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m05;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m07;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m03;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m12;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m02;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 3
+ v00 += m11;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m12;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m05;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m15;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m02;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m13;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m08;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m00;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m10;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m03;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m07;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m09;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m01;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m04;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m14;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m06;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 4
+ v00 += m07;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m03;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m13;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m11;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m12;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m14;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m09;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m01;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m02;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m05;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m04;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m15;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m00;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m08;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m06;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m10;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 5
+ v00 += m09;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m05;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m02;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m10;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m04;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m15;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m00;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m07;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m14;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m11;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m06;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m03;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m08;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m13;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m01;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m12;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 6
+ v00 += m02;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m06;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m00;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m08;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m11;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m03;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m12;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m10;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m04;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m07;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m15;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m01;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m14;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m09;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m13;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m05;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 7
+ v00 += m12;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m01;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m14;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m04;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m13;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m10;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m05;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m15;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m00;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m06;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m09;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m08;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m02;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m11;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m07;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m03;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 8
+ v00 += m13;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m07;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m12;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m03;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m01;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m09;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m11;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m14;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m05;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m15;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m08;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m02;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m06;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m10;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m00;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m04;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 9
+ v00 += m06;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m14;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m11;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m00;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m03;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m08;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m15;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m09;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m12;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m13;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m01;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m10;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m04;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m05;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m02;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m07;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ if (++i == rounds) break;
+
+ //ROUND 10
+ v00 += m10;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v01 += m08;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v02 += m07;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v03 += m01;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v02 += m06;
+ v02 += v06;
+ v14 ^= v02;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v10 += v14;
+ v06 ^= v10;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+
+ v03 += m05;
+ v03 += v07;
+ v15 ^= v03;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v11 += v15;
+ v07 ^= v11;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v00 += m02;
+ v00 += v04;
+ v12 ^= v00;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v08 += v12;
+ v04 ^= v08;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v01 += m04;
+ v01 += v05;
+ v13 ^= v01;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v09 += v13;
+ v05 ^= v09;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v00 += m15;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 32) ^ (v15 << 32);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 24) ^ (v05 << 40);
+
+ v01 += m09;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 32) ^ (v12 << 32);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 24) ^ (v06 << 40);
+
+ v02 += m03;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 32) ^ (v13 << 32);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 24) ^ (v07 << 40);
+
+ v03 += m13;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 32) ^ (v14 << 32);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 24) ^ (v04 << 40);
+
+ v02 += m12;
+ v02 += v07;
+ v13 ^= v02;
+ v13 = (v13 >> 16) ^ (v13 << 48);
+ v08 += v13;
+ v07 ^= v08;
+ v07 = (v07 >> 63) ^ (v07 << 1);
+
+ v03 += m00;
+ v03 += v04;
+ v14 ^= v03;
+ v14 = (v14 >> 16) ^ (v14 << 48);
+ v09 += v14;
+ v04 ^= v09;
+ v04 = (v04 >> 63) ^ (v04 << 1);
+
+ v00 += m11;
+ v00 += v05;
+ v15 ^= v00;
+ v15 = (v15 >> 16) ^ (v15 << 48);
+ v10 += v15;
+ v05 ^= v10;
+ v05 = (v05 >> 63) ^ (v05 << 1);
+
+ v01 += m14;
+ v01 += v06;
+ v12 ^= v01;
+ v12 = (v12 >> 16) ^ (v12 << 48);
+ v11 += v12;
+ v06 ^= v11;
+ v06 = (v06 >> 63) ^ (v06 << 1);
+ }
+
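+ // Finalization per RFC 7693, section 3.2: h[i] ^= v[i] ^ v[i + 8].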
+ sh[0] ^= v00 ^ v08;
+ sh[1] ^= v01 ^ v09;
+ sh[2] ^= v02 ^ v10;
+ sh[3] ^= v03 ^ v11;
+ sh[4] ^= v04 ^ v12;
+ sh[5] ^= v05 ^ v13;
+ sh[6] ^= v06 ^ v14;
+ sh[7] ^= v07 ^ v15;
+ }
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Sse41.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Sse41.cs
new file mode 100644
index 00000000000..6b86f5fe459
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Sse41.cs
@@ -0,0 +1,1291 @@
+// Copyright (c) 2022 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <http://www.gnu.org/licenses/>.
+//
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Nethermind.Crypto.Blake2;
+
+/// <summary>
+/// Code adapted from Blake2Fast (https://github.com/saucecontrol/Blake2Fast)
+/// </summary>
+public unsafe partial class Blake2Compression
+{
+ // SIMD algorithm described in https://eprint.iacr.org/2012/275.pdf
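+ // The 4x4 state matrix of 64-bit words is kept in eight Vector128<ulong> registers
+ // (rowNl/rowNh hold the low/high two words of row N), so the four column G functions
+ // of a round execute in parallel as row-wide vector operations.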
+ [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+ [SkipLocalsInit]
+ private static void ComputeSse41(ulong* sh, ulong* m, uint rounds)
+ {
+ ref byte rrm = ref MemoryMarshal.GetReference(Rormask);
+ var r24 = Unsafe.As<byte, Vector128<byte>>(ref rrm);
+ var r16 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref rrm, Vector128<byte>.Count));
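+ // r24 and r16 are pshufb shuffle masks implementing 64-bit rotate right by 24 and by 16.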
+
+ var row1l = Sse2.LoadVector128(sh);
+ var row1h = Sse2.LoadVector128(sh + 2);
+ var row2l = Sse2.LoadVector128(sh + 4);
+ var row2h = Sse2.LoadVector128(sh + 6);
+
+ ref byte riv = ref MemoryMarshal.GetReference(Ivle);
+ var row3l = Unsafe.As<byte, Vector128<ulong>>(ref riv);
+ var row3h = Unsafe.As<byte, Vector128<ulong>>(ref Unsafe.Add(ref riv, 16));
+ var row4l = Unsafe.As<byte, Vector128<ulong>>(ref Unsafe.Add(ref riv, 32));
+ var row4h = Unsafe.As<byte, Vector128<ulong>>(ref Unsafe.Add(ref riv, 48));
+
+ row4l = Sse2.Xor(row4l, Sse2.LoadVector128(sh + 8)); // t[]
+ row4h = Sse2.Xor(row4h, Sse2.LoadVector128(sh + 10)); // f[]
+
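+ // m0..m7 hold the sixteen 64-bit message words, two per vector; the unpack, blend,
+ // and align operations in each round pick out the sigma-permuted words directly
+ // instead of going through a permutation table.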
+ var m0 = Sse2.LoadVector128(m);
+ var m1 = Sse2.LoadVector128(m + 2);
+ var m2 = Sse2.LoadVector128(m + 4);
+ var m3 = Sse2.LoadVector128(m + 6);
+ var m4 = Sse2.LoadVector128(m + 8);
+ var m5 = Sse2.LoadVector128(m + 10);
+ var m6 = Sse2.LoadVector128(m + 12);
+ var m7 = Sse2.LoadVector128(m + 14);
+ Vector128<ulong> t0;
+ Vector128<ulong> t1;
+ Vector128<ulong> b0;
+ Vector128<ulong> b1;
+
+ for (uint i = 0; i < rounds; i++)
+ {
+ //ROUND 1
+ b0 = Sse2.UnpackLow(m0, m1);
+ b1 = Sse2.UnpackLow(m2, m3);
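+ // For the identity permutation of round 1, b0/b1 select message words {0,2} and
+ // {4,6}: the first-half inputs of the four column G functions.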
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m0, m1);
+ b1 = Sse2.UnpackHigh(m2, m3);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
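+ // Rotate right by 63: SSE2 has no 64-bit rotate, so rotr(x, 63) is built from
+ // (x >> 63) ^ (x << 1), with x + x standing in for the left shift by one.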
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
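+ // Diagonalize rotates rows 2, 3, and 4 by one, two, and three word positions so the
+ // diagonal G functions can be computed with the same column-wise code; undiagonalize
+ // reverses the rotation afterwards.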
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse2.UnpackLow(m4, m5);
+ b1 = Sse2.UnpackLow(m6, m7);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m4, m5);
+ b1 = Sse2.UnpackHigh(m6, m7);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 2
+ b0 = Sse2.UnpackLow(m7, m2);
+ b1 = Sse2.UnpackHigh(m4, m6);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
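+ // AlignRight(m3, m7, 8) concatenates m3:m7 and shifts right by 8 bytes, yielding
+ // { m7.hi, m3.lo }, i.e. message words 15 and 6 of the round 2 sigma schedule.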
+ b0 = Sse2.UnpackLow(m5, m4);
+ b1 = Ssse3.AlignRight(m3, m7, 8);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64();
+ b1 = Sse2.UnpackHigh(m5, m2);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackLow(m6, m1);
+ b1 = Sse2.UnpackHigh(m3, m1);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 3
+ b0 = Ssse3.AlignRight(m6, m5, 8);
+ b1 = Sse2.UnpackHigh(m2, m7);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
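+ // Blend with mask 0b_1111_0000 selects 16-bit lanes 4-7 from the second operand,
+ // i.e. it pairs the low 64-bit word of m1 with the high word of m6.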
+ b0 = Sse2.UnpackLow(m4, m0);
+ b1 = Sse41.Blend(m1.AsUInt16(), m6.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse41.Blend(m5.AsUInt16(), m1.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = Sse2.UnpackHigh(m3, m4);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackLow(m7, m3);
+ b1 = Ssse3.AlignRight(m2, m0, 8);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 4
+ b0 = Sse2.UnpackHigh(m3, m1);
+ b1 = Sse2.UnpackHigh(m6, m5);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m4, m0);
+ b1 = Sse2.UnpackLow(m6, m7);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = Sse41.Blend(m2.AsUInt16(), m7.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackLow(m3, m5);
+ b1 = Sse2.UnpackLow(m0, m4);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 5
+ b0 = Sse2.UnpackHigh(m4, m2);
+ b1 = Sse2.UnpackLow(m1, m5);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = Sse41.Blend(m2.AsUInt16(), m7.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse41.Blend(m7.AsUInt16(), m5.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Ssse3.AlignRight(m6, m0, 8);
+ b1 = Sse41.Blend(m4.AsUInt16(), m6.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 6
+ b0 = Sse2.UnpackLow(m1, m3);
+ b1 = Sse2.UnpackLow(m0, m4);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackLow(m6, m5);
+ b1 = Sse2.UnpackHigh(m5, m1);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = Sse2.UnpackHigh(m7, m0);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m6, m2);
+ b1 = Sse41.Blend(m7.AsUInt16(), m4.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 7
+ b0 = Sse41.Blend(m6.AsUInt16(), m0.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = Sse2.UnpackLow(m7, m2);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m2, m7);
+ b1 = Ssse3.AlignRight(m5, m6, 8);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse2.UnpackLow(m0, m3);
+ b1 = Sse2.Shuffle(m4.AsUInt32(), 0b_01_00_11_10).AsUInt64();
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m3, m1);
+ b1 = Sse41.Blend(m1.AsUInt16(), m5.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 8
+ b0 = Sse2.UnpackHigh(m6, m3);
+ b1 = Sse41.Blend(m6.AsUInt16(), m1.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Ssse3.AlignRight(m7, m5, 8);
+ b1 = Sse2.UnpackHigh(m0, m4);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse2.UnpackHigh(m2, m7);
+ b1 = Sse2.UnpackLow(m4, m1);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackLow(m0, m2);
+ b1 = Sse2.UnpackLow(m3, m5);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 9
+ b0 = Sse2.UnpackLow(m3, m7);
+ b1 = Ssse3.AlignRight(m0, m5, 8);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackHigh(m7, m4);
+ b1 = Ssse3.AlignRight(m4, m1, 8);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = m6;
+ b1 = Ssse3.AlignRight(m5, m0, 8);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_1111_0000).AsUInt64();
+ b1 = m2;
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+
+ if (++i == rounds) break;
+
+ //ROUND 10
+ b0 = Sse2.UnpackLow(m5, m4);
+ b1 = Sse2.UnpackHigh(m3, m0);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Sse2.UnpackLow(m1, m2);
+ b1 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_1111_0000).AsUInt64();
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //DIAGONALIZE
+ t0 = Ssse3.AlignRight(row2h, row2l, 8);
+ t1 = Ssse3.AlignRight(row2l, row2h, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4h, row4l, 8);
+ t1 = Ssse3.AlignRight(row4l, row4h, 8);
+ row4l = t1;
+ row4h = t0;
+
+ b0 = Sse2.UnpackHigh(m7, m4);
+ b1 = Sse2.UnpackHigh(m1, m6);
+
+ //G1
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+ row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64();
+ row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64();
+
+ b0 = Ssse3.AlignRight(m7, m5, 8);
+ b1 = Sse2.UnpackLow(m6, m0);
+
+ //G2
+ row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l);
+ row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h);
+
+ row4l = Sse2.Xor(row4l, row1l);
+ row4h = Sse2.Xor(row4h, row1h);
+
+ row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64();
+ row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64();
+
+ row3l = Sse2.Add(row3l, row4l);
+ row3h = Sse2.Add(row3h, row4h);
+
+ row2l = Sse2.Xor(row2l, row3l);
+ row2h = Sse2.Xor(row2h, row3h);
+
+ row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l));
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h));
+
+ //UNDIAGONALIZE
+ t0 = Ssse3.AlignRight(row2l, row2h, 8);
+ t1 = Ssse3.AlignRight(row2h, row2l, 8);
+ row2l = t0;
+ row2h = t1;
+
+ b0 = row3l;
+ row3l = row3h;
+ row3h = b0;
+
+ t0 = Ssse3.AlignRight(row4l, row4h, 8);
+ t1 = Ssse3.AlignRight(row4h, row4l, 8);
+ row4l = t1;
+ row4h = t0;
+ }
+
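+ // Feed-forward, the vector form of h[i] ^= v[i] ^ v[i + 8]: rows 1 and 3 fold into
+ // h[0..3], rows 2 and 4 into h[4..7].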
+ row1l = Sse2.Xor(row1l, row3l);
+ row1h = Sse2.Xor(row1h, row3h);
+ row1l = Sse2.Xor(row1l, Sse2.LoadVector128(sh));
+ row1h = Sse2.Xor(row1h, Sse2.LoadVector128(sh + 2));
+ Sse2.Store(sh, row1l);
+ Sse2.Store(sh + 2, row1h);
+
+ row2l = Sse2.Xor(row2l, row4l);
+ row2h = Sse2.Xor(row2h, row4h);
+ row2l = Sse2.Xor(row2l, Sse2.LoadVector128(sh + 4));
+ row2h = Sse2.Xor(row2h, Sse2.LoadVector128(sh + 6));
+ Sse2.Store(sh + 4, row2l);
+ Sse2.Store(sh + 6, row2h);
+ }
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2Compression.cs b/src/Nethermind/Nethermind.Crypto/Blake2Compression.cs
deleted file mode 100644
index 0771977b86a..00000000000
--- a/src/Nethermind/Nethermind.Crypto/Blake2Compression.cs
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright (c) 2021 Demerzel Solutions Limited
-// This file is part of the Nethermind library.
-//
-// The Nethermind library is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Lesser General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// The Nethermind library is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Lesser General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public License
-// along with the Nethermind. If not, see <http://www.gnu.org/licenses/>.
-
-using System;
-using System.Buffers.Binary;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-
-namespace Nethermind.Crypto
-{
- /// <summary>
- /// Code adapted from pantheon (https://github.com/PegaSysEng/pantheon)
- /// </summary>
- public class Blake2Compression
- {
- private static readonly byte[][] Precomputed =
- {
- new byte[] {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
- new byte[] {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
- new byte[] {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
- new byte[] {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
- new byte[] {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
- new byte[] {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
- new byte[] {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
- new byte[] {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
- new byte[] {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
- new byte[] {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}
- };
-
- private static readonly ulong[] IV =
- {
- 0x6a09e667f3bcc908ul, 0xbb67ae8584caa73bul, 0x3c6ef372fe94f82bul,
- 0xa54ff53a5f1d36f1ul, 0x510e527fade682d1ul, 0x9b05688c2b3e6c1ful,
- 0x1f83d9abfb41bd6bul, 0x5be0cd19137e2179ul
- };
-
- public void Compress(ReadOnlySpan<byte> input, Span<byte> output)
- {
- Span<ulong> v = stackalloc ulong[16];
-
- uint rounds = BinaryPrimitives.ReadUInt32BigEndian(input);
- ReadOnlySpan<ulong> h = MemoryMarshal.Cast<byte, ulong>(input.Slice(4, 64));
- ReadOnlySpan<ulong> m = MemoryMarshal.Cast<byte, ulong>(input.Slice(68, 128));
- ReadOnlySpan<ulong> t = MemoryMarshal.Cast<byte, ulong>(input.Slice(196, 16));
- bool f = input[212] != 0;
-
- h.CopyTo(v.Slice(0, 8));
- IV.AsSpan().CopyTo(v.Slice(8, 8));
-
- v[12] ^= t[0];
- v[13] ^= t[1];
-
- if (f)
- {
- v[14] ^= 0xfffffffffffffffful;
- }
-
- for (uint i = 0; i < rounds; ++i)
- {
- byte[] s = Precomputed[i % 10];
- Compute(v, m[s[0]], m[s[4]], 0, 4, 8, 12);
- Compute(v, m[s[1]], m[s[5]], 1, 5, 9, 13);
- Compute(v, m[s[2]], m[s[6]], 2, 6, 10, 14);
- Compute(v, m[s[3]], m[s[7]], 3, 7, 11, 15);
- Compute(v, m[s[8]], m[s[12]], 0, 5, 10, 15);
- Compute(v, m[s[9]], m[s[13]], 1, 6, 11, 12);
- Compute(v, m[s[10]], m[s[14]], 2, 7, 8, 13);
- Compute(v, m[s[11]], m[s[15]], 3, 4, 9, 14);
- }
-
- MemoryMarshal.Cast<ulong, byte>(h).CopyTo(output);
- Span<ulong> outputUlongs = MemoryMarshal.Cast<byte, ulong>(output);
- for (int offset = 0; offset < h.Length; offset++)
- {
- outputUlongs[offset] = h[offset] ^ v[offset] ^ v[offset + 8];
- }
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private void Compute(Span<ulong> v, ulong a, ulong b, int i, int j, int k, int l)
- {
- v[i] += a + v[j];
- v[l] = RotateLeft(v[l] ^ v[i], -32);
- v[k] += v[l];
- v[j] = RotateLeft(v[j] ^ v[k], -24);
-
- v[i] += b + v[j];
- v[l] = RotateLeft(v[l] ^ v[i], -16);
- v[k] += v[l];
- v[j] = RotateLeft(v[j] ^ v[k], -63);
- }
-
- private static ulong RotateLeft(ulong value, int count)
- {
- return (value << count) | (value >> (64 - count));
- }
- }
-}
diff --git a/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs b/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs
index 12faf7ea334..7f2d180dae2 100644
--- a/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs
+++ b/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs
@@ -18,7 +18,7 @@
using Nethermind.Core;
using Nethermind.Core.Extensions;
using Nethermind.Core.Specs;
-using Nethermind.Crypto;
+using Nethermind.Crypto.Blake2;
namespace Nethermind.Evm.Precompiles
{