diff --git a/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs b/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs
index 7b78c9bf89e..1a320eec009 100644
--- a/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs
+++ b/src/Nethermind/Ethereum.Blockchain.Test/TimeConsumingTests.cs
@@ -28,11 +28,6 @@ public class TimeConsumingTests : GeneralStateTestBase
         [TestCaseSource(nameof(LoadTests))]
         public void Test(GeneralStateTest test)
         {
-            if (test.Name.Contains("CALLBlake2f_MaxRound"))
-            {
-                return;
-            }
-
             Assert.True(RunTest(test).Pass);
         }
     }
diff --git a/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs b/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs
index 7f190325e6f..d3336b3aa35 100644
--- a/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs
+++ b/src/Nethermind/Nethermind.Benchmark/Evm/Blake2Benchmark.cs
@@ -21,7 +21,7 @@
 using Nethermind.Core;
 using Nethermind.Core.Crypto;
 using Nethermind.Core.Extensions;
-using Nethermind.Crypto;
+using Nethermind.Crypto.Blake2;
 
 namespace Nethermind.Benchmarks.Evm
 {
diff --git a/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs b/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs
index a5a8e74b0a4..d8c98162b17 100644
--- a/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs
+++ b/src/Nethermind/Nethermind.Core.Test/Blake2Tests.cs
@@ -14,28 +14,80 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Nethermind. If not, see <http://www.gnu.org/licenses/>.
 
+using System.Collections.Generic;
 using FluentAssertions;
 using Nethermind.Core.Extensions;
-using Nethermind.Crypto;
+using Nethermind.Crypto.Blake2;
 using NUnit.Framework;
 
 namespace Nethermind.Core.Test
 {
     public class Blake2Tests
     {
+        private readonly Blake2Compression _blake2Compression = new();
+
+        const string InputExceptRounds = "48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616263000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001";
+
         [Test]
         [TestCase("0000000048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616263000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "08c9bcf367e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d282e6ad7f520e511f6c3e2b8c68059b9442be0454267ce079217e1319cde05b")]
         [TestCase("0000000c48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616263000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d17d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923")]
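+        // the next vector repeats the 12-round input above with the final-block flag cleared (last byte 00 instead of 01); a single-round case follows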
[TestCase("0000000c48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000000", "75ab69d3190a562c51aef8d88f1c2775876944407270c42c9844252c26d2875298743e7f6d5ea2f2d3e8d226039cd31b4e426ac4f2d3d666a610c2116fde4735")] [TestCase("0000000148c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "b63a380cb2897d521994a85234ee2c181b5f844d2c624c002677e9703449d2fba551b3a8333bcdf5f2f7e08993d53923de3d64fcc68c034e717b9293fed7a421")] -// [TestCase("ffffffff48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "fc59093aafa9ab43daae0e914c57635c5402d8e3d2130eb9b3cc181de7f0ecf9b22bf99a7815ce16419e200e01846e6b5df8cc7703041bbceb571de6631d2615")] + [TestCase("ffffffff48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001", "fc59093aafa9ab43daae0e914c57635c5402d8e3d2130eb9b3cc181de7f0ecf9b22bf99a7815ce16419e200e01846e6b5df8cc7703041bbceb571de6631d2615")] public void compression_function_should_return_valid_output(string input, string output) { - Blake2Compression? blake2Optimized = new(); byte[] blake2Result = new byte[64]; - blake2Optimized.Compress(Bytes.FromHexString(input), blake2Result); + _blake2Compression.Compress(Bytes.FromHexString(input), blake2Result); string? 
result = blake2Result.ToHexString(); result.Should().BeEquivalentTo(output); } + + [TestCaseSource(nameof(TestCaseSource))] + public void avx2_should_compute_correct_values((int Rounds, string Output) testCase) + { + (int rounds, string output) = testCase; + Test(rounds, output, Blake2CompressMethod.Avx2); + } + + [TestCaseSource(nameof(TestCaseSource))] + public void sse41_should_compute_correct_values((int Rounds, string Output) testCase) + { + (int rounds, string output) = testCase; + Test(rounds, output, Blake2CompressMethod.Sse41); + } + + [TestCaseSource(nameof(TestCaseSource))] + public void scalar_should_compute_correct_values((int Rounds, string Output) testCase) + { + (int rounds, string output) = testCase; + Test(rounds, output, Blake2CompressMethod.Scalar); + } + + private void Test(int rounds, string output, Blake2CompressMethod method) + { + string input = string.Concat(rounds.ToString("x8"), InputExceptRounds); + + byte[] blake2Result = new byte[64]; + _blake2Compression.Compress(Bytes.FromHexString(input), blake2Result, method); + string result = blake2Result.ToHexString(); + result.Should().BeEquivalentTo(output); + } + + public static IEnumerable<(int, string)> TestCaseSource() + { + yield return (0, "08c9bcf367e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d282e6ad7f520e511f6c3e2b8c68059b9442be0454267ce079217e1319cde05b"); + yield return (1, "b63a380cb2897d521994a85234ee2c181b5f844d2c624c002677e9703449d2fba551b3a8333bcdf5f2f7e08993d53923de3d64fcc68c034e717b9293fed7a421"); + yield return (2, "2c96ff1bd7926f1b8bcd7824d808fdde9cf850920b625c59f1558bc608fb66a50070f53367230679e4949e7d32baac94f33af05175b7abf3b4972425a7b068ca"); + yield return (3, "b70b167bd40e83abf720fa83d014b07db1f64ae0a7c0b4d74eace08cd2515ca7927a6d6268d80043628698e31ea7d4a4f69dac2cf3ce6746825f5cff08b401cc"); + yield return (4, "0d2c9a214539ea7898029c0c95681cab88a360f633fd94ff5fae7d1e184bfab0a598296b7b046dd346ce75add0a457e3076fbc0a72ceff7eb9d4ed790d9356e9"); + yield return (5, "021e4bc08df8b11f90392a07fc4e86b0d0159d2ff06f5c329a793847e4f0c848c6aefce2d2e11ee7a73dfaadbeebfb33e3a4ad083bfd3b4e93e7b23621a97960"); + yield return (6, "a12a6af6b6d84ace0a8fcff0ae165e91b7de3bf70d9f19405e8701f2ea69ef1ed9e0206d78e61aa7867536b6982938c361e6a84ee1be15bc13b14adcd38459a1"); + yield return (7, "125dd3e4baa7f300be309deab1181db034967cc20ebecc3c0de038b0a714afaa744cea00cd843042b75c25b1d2e3931d2203111e871f35723741418117efe781"); + yield return (8, "59d8d7cbf70b0336e6f4f7a20d2ebd05f9b27ad7bb278faff380c206b68962ae630e8a4d2af1dce8a853cd722ad174e259c7ca284137fe52b61524fb5fe327f7"); + yield return (9, "69fbbdf42d5f5f2eb657faaa82862c9a492237cbb93ffd9938ff7b757671fac0a19b9f27d130b78180d070f9b9b96ee1bb1d69e2edae0c1b7602f2f2e0977614"); + yield return (10, "5a4308e0e1daede181b47775d926a6b4b6a0adf86d05bfea696fac45f08419623976bd3c786f61500b9f94a043b9dcf397e38ee237f3c273a7d812be20874f5a"); + yield return (11, "60faa8f91624b2b718210df242b788c7ae887e953dce3c7f80862bc5e4f88d827cada4d95d2c4ac41eb66b84fcdc0e12ab0c66f4d9d546ff8a0d712f324e1845"); + yield return (12, "ba80a53f981c4d0d6a2797b69f12f6e94c212f14685ac4b74b12bb6fdbffa2d17d87c5392aab792dc252d5de4533cc9518d38aa8dbf1925ab92386edd4009923"); + yield return (10_000_000, "5b6d1ca8ee5370f08008240579096021dcf8860de693cc8f5a1476ba70c3b32ba8f93c62a0b2fbcd305caaa22bc96e0dbab199a65fcd234e31404ca4b1766252"); + } } } diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Avx2.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Avx2.cs new file mode 100644 index 00000000000..914a4786ac8 --- 
/dev/null +++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Avx2.cs @@ -0,0 +1,721 @@ +// Copyright (c) 2022 Demerzel Solutions Limited +// This file is part of the Nethermind library. +// +// The Nethermind library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The Nethermind library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the Nethermind. If not, see . +// + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Nethermind.Crypto.Blake2; + +/// +/// Code adapted from Blake2Fast (https://github.com/saucecontrol/Blake2Fast) +/// +public unsafe partial class Blake2Compression +{ + // SIMD algorithm described in https://eprint.iacr.org/2012/275.pdf + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + [SkipLocalsInit] + private static void ComputeAvx2(ulong* sh, ulong* m, uint rounds) + { + // Rotate shuffle masks. We can safely convert the ref to a pointer because the compiler guarantees the + // data is in a fixed location, and the ref itself is converted from a pointer. Same for the IV below. + byte* prm = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(Rormask)); + var r24 = Avx2.BroadcastVector128ToVector256(prm); + var r16 = Avx2.BroadcastVector128ToVector256(prm + Vector128.Count); + + var row1 = Avx.LoadVector256(sh); + var row2 = Avx.LoadVector256(sh + Vector256.Count); + + ulong* piv = (ulong*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(Ivle)); + var row3 = Avx.LoadVector256(piv); + var row4 = Avx.LoadVector256(piv + Vector256.Count); + + row4 = Avx2.Xor(row4, Avx.LoadVector256(sh + Vector256.Count * 2)); // t[] and f[] + + var m0 = Avx2.BroadcastVector128ToVector256(m); + var m1 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count); + var m2 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count * 2); + var m3 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count * 3); + var m4 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count * 4); + var m5 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count * 5); + var m6 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count * 6); + var m7 = Avx2.BroadcastVector128ToVector256(m + Vector128.Count * 7); + Vector256 t0; + Vector256 t1; + Vector256 b0; + + for (uint i = 0; i < rounds; i++) + { + //ROUND 1 + t0 = Avx2.UnpackLow(m0, m1); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); 
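+            // the (x >> 63) ^ (x + x) pattern below is a rotate right by 63, with x + x standing in for x << 1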
+ row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 2 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 3 + t0 = Avx2.AlignRight(m6, m5, 8); + t1 = Avx2.UnpackHigh(m2, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.AlignRight(m5, m4, 8); + t1 = Avx2.UnpackHigh(m1, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackLow(m2, m7); + t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 4 + t0 = Avx2.UnpackHigh(m3, m1); + t1 = Avx2.UnpackHigh(m6, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m4, m0); + t1 = Avx2.UnpackLow(m6, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.AlignRight(m1, m7, 8); + t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackLow(m4, m3); + t1 = Avx2.UnpackLow(m5, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 5 + t0 = Avx2.UnpackHigh(m4, m2); + t1 = Avx2.UnpackLow(m1, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.AlignRight(m7, m1, 8); + t1 = Avx2.AlignRight(m3, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m6, m0); + t1 = Avx2.UnpackLow(m6, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 6 + t0 = Avx2.UnpackLow(m1, m3); + t1 = Avx2.UnpackLow(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackLow(m6, m5); + t1 = Avx2.UnpackHigh(m5, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 
0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.AlignRight(m2, m0, 8); + t1 = Avx2.UnpackHigh(m3, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m4, m6); + t1 = Avx2.AlignRight(m7, m2, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 7 + t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.UnpackLow(m7, m2); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m2, m7); + t1 = Avx2.AlignRight(m5, m6, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m5, m3); + t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 8 + t0 = Avx2.UnpackHigh(m6, m3); + t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = 
Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.AlignRight(m7, m5, 8); + t1 = Avx2.UnpackHigh(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.AlignRight(m4, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackLow(m5, m0); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //UNDIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + + if (++i == rounds) break; + + //ROUND 9 + t0 = Avx2.UnpackLow(m3, m7); + t1 = Avx2.AlignRight(m0, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.AlignRight(m4, m1, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + + //DIAGONALIZE + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + + t0 = Avx2.UnpackLow(m5, m6); + t1 = Avx2.UnpackHigh(m6, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G1 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + + t0 = Avx2.AlignRight(m1, m2, 8); + t1 = Avx2.AlignRight(m2, m3, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + //G2 + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = 
Avx2.Add(row3, row4);
+            row2 = Avx2.Xor(row2, row3);
+            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+            //UNDIAGONALIZE
+            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+
+            if (++i == rounds) break;
+
+            //ROUND 10
+            t0 = Avx2.UnpackLow(m5, m4);
+            t1 = Avx2.UnpackHigh(m3, m0);
+            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+            //G1
+            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+            row4 = Avx2.Xor(row4, row1);
+            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+            row3 = Avx2.Add(row3, row4);
+            row2 = Avx2.Xor(row2, row3);
+            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+            t0 = Avx2.UnpackLow(m1, m2);
+            t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
+            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+            //G2
+            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+            row4 = Avx2.Xor(row4, row1);
+            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+            row3 = Avx2.Add(row3, row4);
+            row2 = Avx2.Xor(row2, row3);
+            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+            //DIAGONALIZE
+            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
+            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);
+
+            t0 = Avx2.UnpackHigh(m6, m7);
+            t1 = Avx2.UnpackHigh(m4, m1);
+            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+            //G1
+            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+            row4 = Avx2.Xor(row4, row1);
+            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();
+
+            row3 = Avx2.Add(row3, row4);
+            row2 = Avx2.Xor(row2, row3);
+            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();
+
+            t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
+            t1 = Avx2.UnpackLow(m7, m6);
+            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+
+            //G2
+            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
+            row4 = Avx2.Xor(row4, row1);
+            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();
+
+            row3 = Avx2.Add(row3, row4);
+            row2 = Avx2.Xor(row2, row3);
+            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));
+
+            //UNDIAGONALIZE
+            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
+            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
+            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);
+        }
+
+        row1 = Avx2.Xor(row1, row3);
+        row2 = Avx2.Xor(row2, row4);
+        row1 = Avx2.Xor(row1, Avx.LoadVector256(sh));
+        row2 = Avx2.Xor(row2, Avx.LoadVector256(sh + Vector256<ulong>.Count));
+
+        Avx.Store(sh, row1);
+        Avx.Store(sh + Vector256<ulong>.Count, row2);
+    }
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2CompressMethod.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2CompressMethod.cs
new file mode 100644
index 00000000000..6d87db5de10
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2CompressMethod.cs
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <http://www.gnu.org/licenses/>.
+//
+
+
+namespace Nethermind.Crypto.Blake2;
+
+public enum Blake2CompressMethod
+{
+    Avx2,
+    Sse41,
+    Scalar,
+    Optimal
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Compression.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Compression.cs
new file mode 100644
index 00000000000..7c530122744
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Compression.cs
@@ -0,0 +1,99 @@
+// Copyright (c) 2021 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.X86;
+
+namespace Nethermind.Crypto.Blake2
+{
+    /// <summary>
+    /// Code adapted from pantheon (https://github.com/PegaSysEng/pantheon)
+    /// and from Blake2Fast (https://github.com/saucecontrol/Blake2Fast)
+    /// </summary>
+    public partial class Blake2Compression
+    {
+        const byte NumberOfBytesInUlong = 8;
+        const byte NumberOfHWords = 8;
+        const byte NumberOfMWords = 16;
+        const byte StartOfHWords = 4;
+        const byte StartOfMWords = 68;
+        const byte StartOfTWords = 196;
+        const byte ByteOfFWord = 212;
+
+        private static ReadOnlySpan<byte> Ivle => new byte[]
+        {
+            0x08, 0xC9, 0xBC, 0xF3, 0x67, 0xE6, 0x09, 0x6A, 0x3B, 0xA7, 0xCA, 0x84, 0x85, 0xAE, 0x67, 0xBB, 0x2B,
+            0xF8, 0x94, 0xFE, 0x72, 0xF3, 0x6E, 0x3C, 0xF1, 0x36, 0x1D, 0x5F, 0x3A, 0xF5, 0x4F, 0xA5, 0xD1, 0x82,
+            0xE6, 0xAD, 0x7F, 0x52, 0x0E, 0x51, 0x1F, 0x6C, 0x3E, 0x2B, 0x8C, 0x68, 0x05, 0x9B, 0x6B, 0xBD, 0x41,
+            0xFB, 0xAB, 0xD9, 0x83, 0x1F, 0x79, 0x21, 0x7E, 0x13, 0x19, 0xCD, 0xE0, 0x5B
+        };
+
+        private static ReadOnlySpan<byte> Rormask => new byte[]
+        {
+            3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, //r24
+            2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 //r16
+        };
+
+        public unsafe void Compress(ReadOnlySpan<byte> input, Span<byte> output, Blake2CompressMethod method = Blake2CompressMethod.Optimal)
+        {
+            // sh length = h words length + t[0] + t[1] + f[0]
+            ulong* sh = stackalloc ulong[NumberOfHWords + 3];
+            ulong* m = stackalloc ulong[NumberOfMWords];
+
+            // EIP-152 input layout: rounds (4 bytes, big-endian) | h (64 bytes) | m (128 bytes) | t (16 bytes) | f (1 byte)
+            uint rounds = BinaryPrimitives.ReadUInt32BigEndian(input);
+
+            for (int i = 0; i < NumberOfHWords; i++)
+            {
+                sh[i] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfHWords + i * NumberOfBytesInUlong, NumberOfBytesInUlong)).GetPinnableReference();
+            }
+
+            // t[0]
+            sh[8] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfTWords, NumberOfBytesInUlong)).GetPinnableReference();
+            // t[1]
+            sh[9] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfTWords + NumberOfBytesInUlong, NumberOfBytesInUlong)).GetPinnableReference();
+            // f[0]
+            sh[10] = input[ByteOfFWord] != 0 ? ulong.MaxValue : ulong.MinValue;
+
+            for (int i = 0; i < NumberOfMWords; i++)
+            {
+                m[i] = MemoryMarshal.Cast<byte, ulong>(input.Slice(StartOfMWords + i * NumberOfBytesInUlong, NumberOfBytesInUlong)).GetPinnableReference();
+            }
+
+            switch (method)
+            {
+                case Blake2CompressMethod.Optimal when Avx2.IsSupported:
+                case Blake2CompressMethod.Avx2:
+                    ComputeAvx2(sh, m, rounds);
+                    break;
+                case Blake2CompressMethod.Optimal when Sse41.IsSupported:
+                case Blake2CompressMethod.Sse41:
+                    ComputeSse41(sh, m, rounds);
+                    break;
+                default:
+                    ComputeScalar(sh, m, rounds);
+                    break;
+            }
+
+            Span<ulong> outputUlongs = MemoryMarshal.Cast<byte, ulong>(output);
+            for (int offset = 0; offset < NumberOfHWords; offset++)
+            {
+                outputUlongs[offset] = sh[offset];
+            }
+        }
+    }
+}
diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Scalar.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Scalar.cs
new file mode 100644
index 00000000000..e1907b50e7a
--- /dev/null
+++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Scalar.cs
@@ -0,0 +1,1390 @@
+// Copyright (c) 2022 Demerzel Solutions Limited
+// This file is part of the Nethermind library.
+//
+// The Nethermind library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The Nethermind library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the Nethermind. If not, see <http://www.gnu.org/licenses/>.
+//
+
+using System.Runtime.CompilerServices;
+
+namespace Nethermind.Crypto.Blake2;
+
+/// <summary>
+/// Code adapted from Blake2Fast (https://github.com/saucecontrol/Blake2Fast)
+/// </summary>
+public unsafe partial class Blake2Compression
+{
+    [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+    [SkipLocalsInit]
+    private static void ComputeScalar(ulong* sh, ulong* m, uint rounds)
+    {
+        ulong m00 = m[00];
+        ulong m01 = m[01];
+        ulong m02 = m[02];
+        ulong m03 = m[03];
+        ulong m04 = m[04];
+        ulong m05 = m[05];
+        ulong m06 = m[06];
+        ulong m07 = m[07];
+        ulong m08 = m[08];
+        ulong m09 = m[09];
+        ulong m10 = m[10];
+        ulong m11 = m[11];
+        ulong m12 = m[12];
+        ulong m13 = m[13];
+        ulong m14 = m[14];
+        ulong m15 = m[15];
+
+        ulong v00 = sh[0];
+        ulong v01 = sh[1];
+        ulong v02 = sh[2];
+        ulong v03 = sh[3];
+        ulong v04 = sh[4];
+        ulong v05 = sh[5];
+        ulong v06 = sh[6];
+        ulong v07 = sh[7];
+
+        ulong v08 = 0x6A09E667F3BCC908ul;
+        ulong v09 = 0xBB67AE8584CAA73Bul;
+        ulong v10 = 0x3C6EF372FE94F82Bul;
+        ulong v11 = 0xA54FF53A5F1D36F1ul;
+        ulong v12 = 0x510E527FADE682D1ul;
+        ulong v13 = 0x9B05688C2B3E6C1Ful;
+        ulong v14 = 0x1F83D9ABFB41BD6Bul;
+        ulong v15 = 0x5BE0CD19137E2179ul;
+
+        v12 ^= sh[8]; // t[0]
+        v13 ^= sh[9]; // t[1]
+        v14 ^= sh[10]; // f[0]
+
+        for (uint i = 0; i < rounds; i++)
+        {
+            //ROUND 1
+            v00 += m00;
+            v00 += v04;
+            v12 ^= v00;
+            v12 = (v12 >> 32) ^ (v12 << 32);
+            v08 += v12;
+            v04 ^= v08;
+            v04 = (v04 >> 24) ^ (v04 << 40);
+
+            v01 += m02;
+            v01 += v05;
+            v13 ^= v01;
+            v13 = (v13 >> 32) ^ (v13 << 32);
+            v09 += v13;
+            v05 ^= v09;
+            v05 = (v05 >> 24) ^ (v05 << 40);
+
+            v02 += m04;
+            v02 += v06;
+            v14 ^= v02;
+            v14 = (v14 >> 32) ^ (v14 << 32);
+            v10 += v14;
+            v06 ^= v10;
+            v06 = (v06 >> 24) ^ (v06 << 40);
+
+            v03 += m06;
+            v03 += v07;
+            v15 ^= v03;
+            v15 =
(v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m05; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m07; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m01; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m03; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m08; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m10; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m12; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m14; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m13; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m15; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m09; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m11; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 2 + v00 += m14; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m04; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m09; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m13; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m15; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m06; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m10; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m08; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m01; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m00; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m11; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m05; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m07; + v02 += 
v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m03; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m12; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m02; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 3 + v00 += m11; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m12; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m05; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m15; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m02; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m13; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m08; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m00; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m10; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m03; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m07; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m09; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m01; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m04; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m14; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m06; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 4 + v00 += m07; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m03; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m13; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m11; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m12; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 
^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m14; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m09; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m01; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m02; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m05; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m04; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m15; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m00; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m08; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m06; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m10; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 5 + v00 += m09; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m05; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m02; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m10; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m04; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m15; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m00; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m07; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m14; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m11; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m06; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m03; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m08; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m13; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 
48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m01; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m12; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 6 + v00 += m02; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m06; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m00; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m08; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m11; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m03; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m12; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m10; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m04; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m07; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m15; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m01; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m14; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m09; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m13; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m05; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 7 + v00 += m12; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m01; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m14; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m04; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m13; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m10; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 
+= m05; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m15; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m00; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m06; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m09; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m08; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m02; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m11; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m07; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m03; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 8 + v00 += m13; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m07; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m12; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m03; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m01; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m09; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m11; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m14; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m05; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m15; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m08; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m02; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m06; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m10; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m00; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 
63) ^ (v05 << 1); + + v01 += m04; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 9 + v00 += m06; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m14; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m11; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m00; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m03; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m08; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m15; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m09; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m12; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m13; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m01; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m10; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m04; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m05; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m02; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m07; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + + if (++i == rounds) break; + + //ROUND 10 + v00 += m10; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 32) ^ (v12 << 32); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 24) ^ (v04 << 40); + + v01 += m08; + v01 += v05; + v13 ^= v01; + v13 = (v13 >> 32) ^ (v13 << 32); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 24) ^ (v05 << 40); + + v02 += m07; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 32) ^ (v14 << 32); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 24) ^ (v06 << 40); + + v03 += m01; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 32) ^ (v15 << 32); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 24) ^ (v07 << 40); + + v02 += m06; + v02 += v06; + v14 ^= v02; + v14 = (v14 >> 16) ^ (v14 << 48); + v10 += v14; + v06 ^= v10; + v06 = (v06 >> 63) ^ (v06 << 1); + + v03 += m05; + v03 += v07; + v15 ^= v03; + v15 = (v15 >> 16) ^ (v15 << 48); + v11 += v15; + v07 ^= v11; + v07 = (v07 >> 63) ^ (v07 << 1); + + v00 += m02; + v00 += v04; + v12 ^= v00; + v12 = (v12 >> 16) ^ (v12 << 48); + v08 += v12; + v04 ^= v08; + v04 = (v04 >> 63) ^ (v04 << 1); + + v01 += m04; + v01 += v05; + v13 ^= v01; + v13 = 
(v13 >> 16) ^ (v13 << 48); + v09 += v13; + v05 ^= v09; + v05 = (v05 >> 63) ^ (v05 << 1); + + v00 += m15; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 32) ^ (v15 << 32); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 24) ^ (v05 << 40); + + v01 += m09; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 32) ^ (v12 << 32); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 24) ^ (v06 << 40); + + v02 += m03; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 32) ^ (v13 << 32); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 24) ^ (v07 << 40); + + v03 += m13; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 32) ^ (v14 << 32); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 24) ^ (v04 << 40); + + v02 += m12; + v02 += v07; + v13 ^= v02; + v13 = (v13 >> 16) ^ (v13 << 48); + v08 += v13; + v07 ^= v08; + v07 = (v07 >> 63) ^ (v07 << 1); + + v03 += m00; + v03 += v04; + v14 ^= v03; + v14 = (v14 >> 16) ^ (v14 << 48); + v09 += v14; + v04 ^= v09; + v04 = (v04 >> 63) ^ (v04 << 1); + + v00 += m11; + v00 += v05; + v15 ^= v00; + v15 = (v15 >> 16) ^ (v15 << 48); + v10 += v15; + v05 ^= v10; + v05 = (v05 >> 63) ^ (v05 << 1); + + v01 += m14; + v01 += v06; + v12 ^= v01; + v12 = (v12 >> 16) ^ (v12 << 48); + v11 += v12; + v06 ^= v11; + v06 = (v06 >> 63) ^ (v06 << 1); + } + + sh[0] ^= v00 ^ v08; + sh[1] ^= v01 ^ v09; + sh[2] ^= v02 ^ v10; + sh[3] ^= v03 ^ v11; + sh[4] ^= v04 ^ v12; + sh[5] ^= v05 ^ v13; + sh[6] ^= v06 ^ v14; + sh[7] ^= v07 ^ v15; + } +} diff --git a/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Sse41.cs b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Sse41.cs new file mode 100644 index 00000000000..6b86f5fe459 --- /dev/null +++ b/src/Nethermind/Nethermind.Crypto/Blake2/Blake2Sse41.cs @@ -0,0 +1,1291 @@ +// Copyright (c) 2022 Demerzel Solutions Limited +// This file is part of the Nethermind library. +// +// The Nethermind library is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// The Nethermind library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with the Nethermind. If not, see . 
+// + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Nethermind.Crypto.Blake2; + +/// <summary> +/// Code adapted from Blake2Fast (https://github.com/saucecontrol/Blake2Fast) +/// </summary> +public unsafe partial class Blake2Compression +{ + // SIMD algorithm described in https://eprint.iacr.org/2012/275.pdf + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + [SkipLocalsInit] + private static void ComputeSse41(ulong* sh, ulong* m, uint rounds) + { + ref byte rrm = ref MemoryMarshal.GetReference(Rormask); + var r24 = Unsafe.As<byte, Vector128<byte>>(ref rrm); + var r16 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref rrm, Vector128<byte>.Count)); + + var row1l = Sse2.LoadVector128(sh); + var row1h = Sse2.LoadVector128(sh + 2); + var row2l = Sse2.LoadVector128(sh + 4); + var row2h = Sse2.LoadVector128(sh + 6); + + ref byte riv = ref MemoryMarshal.GetReference(Ivle); + var row3l = Unsafe.As<byte, Vector128<ulong>>(ref riv); + var row3h = Unsafe.As<byte, Vector128<ulong>>(ref Unsafe.Add(ref riv, 16)); + var row4l = Unsafe.As<byte, Vector128<ulong>>(ref Unsafe.Add(ref riv, 32)); + var row4h = Unsafe.As<byte, Vector128<ulong>>(ref Unsafe.Add(ref riv, 48)); + + row4l = Sse2.Xor(row4l, Sse2.LoadVector128(sh + 8)); // t[] + row4h = Sse2.Xor(row4h, Sse2.LoadVector128(sh + 10)); // f[] + + var m0 = Sse2.LoadVector128(m); + var m1 = Sse2.LoadVector128(m + 2); + var m2 = Sse2.LoadVector128(m + 4); + var m3 = Sse2.LoadVector128(m + 6); + var m4 = Sse2.LoadVector128(m + 8); + var m5 = Sse2.LoadVector128(m + 10); + var m6 = Sse2.LoadVector128(m + 12); + var m7 = Sse2.LoadVector128(m + 14); + Vector128<ulong> t0; + Vector128<ulong> t1; + Vector128<ulong> b0; + Vector128<ulong> b1; + + for (uint i = 0; i < rounds; i++) + { + //ROUND 1 + b0 = Sse2.UnpackLow(m0, m1); + b1 = Sse2.UnpackLow(m2, m3); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m0, m1); + b1 = Sse2.UnpackHigh(m2, m3); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse2.UnpackLow(m4, m5); + b1 = Sse2.UnpackLow(m6, m7); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(),
0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m4, m5); + b1 = Sse2.UnpackHigh(m6, m7); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 2 + b0 = Sse2.UnpackLow(m7, m2); + b1 = Sse2.UnpackHigh(m4, m6); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m5, m4); + b1 = Ssse3.AlignRight(m3, m7, 8); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b1 = Sse2.UnpackHigh(m5, m2); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = 
Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m6, m1); + b1 = Sse2.UnpackHigh(m3, m1); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 3 + b0 = Ssse3.AlignRight(m6, m5, 8); + b1 = Sse2.UnpackHigh(m2, m7); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m4, m0); + b1 = Sse41.Blend(m1.AsUInt16(), m6.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse41.Blend(m5.AsUInt16(), m1.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = Sse2.UnpackHigh(m3, m4); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m7, m3); + b1 = Ssse3.AlignRight(m2, m0, 8); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, 
row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 4 + b0 = Sse2.UnpackHigh(m3, m1); + b1 = Sse2.UnpackHigh(m6, m5); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m4, m0); + b1 = Sse2.UnpackLow(m6, m7); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = Sse41.Blend(m2.AsUInt16(), m7.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m3, m5); + b1 = Sse2.UnpackLow(m0, m4); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, 
row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 5 + b0 = Sse2.UnpackHigh(m4, m2); + b1 = Sse2.UnpackLow(m1, m5); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = Sse41.Blend(m2.AsUInt16(), m7.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse41.Blend(m7.AsUInt16(), m5.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Ssse3.AlignRight(m6, m0, 8); + b1 = Sse41.Blend(m4.AsUInt16(), m6.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); 
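+ // Rotation by 63 is expressed as (x >> 63) ^ (x + x): adding x to itself is a one-bit left shift, so the pair forms rotr(x, 63) without needing a 64-bit rotate instruction.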
+ row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 6 + b0 = Sse2.UnpackLow(m1, m3); + b1 = Sse2.UnpackLow(m0, m4); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m6, m5); + b1 = Sse2.UnpackHigh(m5, m1); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = Sse2.UnpackHigh(m7, m0); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m6, m2); + b1 = Sse41.Blend(m7.AsUInt16(), m4.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; 
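+ // Undiagonalize, continued: the AlignRight pair below rotates row4's halves in the direction opposite to row2 above, restoring the column-aligned state layout before the next round.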
+ + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 7 + b0 = Sse41.Blend(m6.AsUInt16(), m0.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = Sse2.UnpackLow(m7, m2); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m2, m7); + b1 = Ssse3.AlignRight(m5, m6, 8); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse2.UnpackLow(m0, m3); + b1 = Sse2.Shuffle(m4.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m3, m1); + b1 = Sse41.Blend(m1.AsUInt16(), m5.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 8 + b0 = Sse2.UnpackHigh(m6, m3); + b1 = 
Sse41.Blend(m6.AsUInt16(), m1.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Ssse3.AlignRight(m7, m5, 8); + b1 = Sse2.UnpackHigh(m0, m4); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse2.UnpackHigh(m2, m7); + b1 = Sse2.UnpackLow(m4, m1); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackLow(m0, m2); + b1 = Sse2.UnpackLow(m3, m5); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 9 + b0 = Sse2.UnpackLow(m3, m7); + b1 = Ssse3.AlignRight(m0, m5, 8); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = 
Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse2.UnpackHigh(m7, m4); + b1 = Ssse3.AlignRight(m4, m1, 8); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = m6; + b1 = Ssse3.AlignRight(m5, m0, 8); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_1111_0000).AsUInt64(); + b1 = m2; + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + + if (++i == rounds) break; + + //ROUND 10 + b0 = Sse2.UnpackLow(m5, m4); + b1 = Sse2.UnpackHigh(m3, m0); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = 
Sse2.UnpackLow(m1, m2); + b1 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_1111_0000).AsUInt64(); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //DIAGONALIZE + t0 = Ssse3.AlignRight(row2h, row2l, 8); + t1 = Ssse3.AlignRight(row2l, row2h, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4h, row4l, 8); + t1 = Ssse3.AlignRight(row4l, row4h, 8); + row4l = t1; + row4h = t0; + + b0 = Sse2.UnpackHigh(m7, m4); + b1 = Sse2.UnpackHigh(m1, m6); + + //G1 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Sse2.Shuffle(row4l.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + row4h = Sse2.Shuffle(row4h.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Ssse3.Shuffle(row2l.AsByte(), r24).AsUInt64(); + row2h = Ssse3.Shuffle(row2h.AsByte(), r24).AsUInt64(); + + b0 = Ssse3.AlignRight(m7, m5, 8); + b1 = Sse2.UnpackLow(m6, m0); + + //G2 + row1l = Sse2.Add(Sse2.Add(row1l, b0), row2l); + row1h = Sse2.Add(Sse2.Add(row1h, b1), row2h); + + row4l = Sse2.Xor(row4l, row1l); + row4h = Sse2.Xor(row4h, row1h); + + row4l = Ssse3.Shuffle(row4l.AsByte(), r16).AsUInt64(); + row4h = Ssse3.Shuffle(row4h.AsByte(), r16).AsUInt64(); + + row3l = Sse2.Add(row3l, row4l); + row3h = Sse2.Add(row3h, row4h); + + row2l = Sse2.Xor(row2l, row3l); + row2h = Sse2.Xor(row2h, row3h); + + row2l = Sse2.Xor(Sse2.ShiftRightLogical(row2l, 63), Sse2.Add(row2l, row2l)); + row2h = Sse2.Xor(Sse2.ShiftRightLogical(row2h, 63), Sse2.Add(row2h, row2h)); + + //UNDIAGONALIZE + t0 = Ssse3.AlignRight(row2l, row2h, 8); + t1 = Ssse3.AlignRight(row2h, row2l, 8); + row2l = t0; + row2h = t1; + + b0 = row3l; + row3l = row3h; + row3h = b0; + + t0 = Ssse3.AlignRight(row4l, row4h, 8); + t1 = Ssse3.AlignRight(row4h, row4l, 8); + row4l = t1; + row4h = t0; + } + + row1l = Sse2.Xor(row1l, row3l); + row1h = Sse2.Xor(row1h, row3h); + row1l = Sse2.Xor(row1l, Sse2.LoadVector128(sh)); + row1h = Sse2.Xor(row1h, Sse2.LoadVector128(sh + 2)); + Sse2.Store(sh, row1l); + Sse2.Store(sh + 2, row1h); + + row2l = Sse2.Xor(row2l, row4l); + row2h = Sse2.Xor(row2h, row4h); + row2l = Sse2.Xor(row2l, Sse2.LoadVector128(sh + 4)); + row2h = Sse2.Xor(row2h, Sse2.LoadVector128(sh + 6)); + Sse2.Store(sh + 4, row2l); + Sse2.Store(sh + 6, row2h); + } +} diff --git a/src/Nethermind/Nethermind.Crypto/Blake2Compression.cs b/src/Nethermind/Nethermind.Crypto/Blake2Compression.cs deleted file mode 100644 index 0771977b86a..00000000000 --- a/src/Nethermind/Nethermind.Crypto/Blake2Compression.cs +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2021 Demerzel Solutions Limited -// This file is part of the Nethermind library. 
-// -// The Nethermind library is free software: you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// The Nethermind library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public License -// along with the Nethermind. If not, see . - -using System; -using System.Buffers.Binary; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - -namespace Nethermind.Crypto -{ - /// <summary> - /// Code adapted from pantheon (https://github.com/PegaSysEng/pantheon) - /// </summary> - public class Blake2Compression - { - private static readonly byte[][] Precomputed = - { - new byte[] {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, - new byte[] {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, - new byte[] {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, - new byte[] {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, - new byte[] {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, - new byte[] {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, - new byte[] {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, - new byte[] {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, - new byte[] {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, - new byte[] {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0} - }; - - private static readonly ulong[] IV = - { - 0x6a09e667f3bcc908ul, 0xbb67ae8584caa73bul, 0x3c6ef372fe94f82bul, - 0xa54ff53a5f1d36f1ul, 0x510e527fade682d1ul, 0x9b05688c2b3e6c1ful, - 0x1f83d9abfb41bd6bul, 0x5be0cd19137e2179ul - }; - - public void Compress(ReadOnlySpan<byte> input, Span<byte> output) - { - Span<ulong> v = stackalloc ulong[16]; - - uint rounds = BinaryPrimitives.ReadUInt32BigEndian(input); - ReadOnlySpan<ulong> h = MemoryMarshal.Cast<byte, ulong>(input.Slice(4, 64)); - ReadOnlySpan<ulong> m = MemoryMarshal.Cast<byte, ulong>(input.Slice(68, 128)); - ReadOnlySpan<ulong> t = MemoryMarshal.Cast<byte, ulong>(input.Slice(196, 16)); - bool f = input[212] != 0; - - h.CopyTo(v.Slice(0, 8)); - IV.AsSpan().CopyTo(v.Slice(8, 8)); - - v[12] ^= t[0]; - v[13] ^= t[1]; - - if (f) - { - v[14] ^= 0xfffffffffffffffful; - } - - for (uint i = 0; i < rounds; ++i) - { - byte[] s = Precomputed[i % 10]; - Compute(v, m[s[0]], m[s[4]], 0, 4, 8, 12); - Compute(v, m[s[1]], m[s[5]], 1, 5, 9, 13); - Compute(v, m[s[2]], m[s[6]], 2, 6, 10, 14); - Compute(v, m[s[3]], m[s[7]], 3, 7, 11, 15); - Compute(v, m[s[8]], m[s[12]], 0, 5, 10, 15); - Compute(v, m[s[9]], m[s[13]], 1, 6, 11, 12); - Compute(v, m[s[10]], m[s[14]], 2, 7, 8, 13); - Compute(v, m[s[11]], m[s[15]], 3, 4, 9, 14); - } - - MemoryMarshal.Cast<ulong, byte>(h).CopyTo(output); - Span<ulong> outputUlongs = MemoryMarshal.Cast<byte, ulong>(output); - for (int offset = 0; offset < h.Length; offset++) - { - outputUlongs[offset] = h[offset] ^ v[offset] ^ v[offset + 8]; - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void Compute(Span<ulong> v, ulong a, ulong b, int i, int j, int k, int l) - { - v[i] += a + v[j]; - v[l] = RotateLeft(v[l] ^ v[i], -32); - v[k] += v[l]; - v[j] = RotateLeft(v[j] ^ v[k], -24); - - v[i] += b + v[j]; - v[l] = RotateLeft(v[l] ^ v[i], -16); - v[k] += v[l]; - v[j] = RotateLeft(v[j] ^ v[k], -63); - } - - private static ulong RotateLeft(ulong value, int count)
- { - return (value << count) | (value >> (64 - count)); - } - } -} diff --git a/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs b/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs index 12faf7ea334..7f2d180dae2 100644 --- a/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs +++ b/src/Nethermind/Nethermind.Evm/Precompiles/Blake2FPrecompile.cs @@ -18,7 +18,7 @@ using Nethermind.Core; using Nethermind.Core.Extensions; using Nethermind.Core.Specs; -using Nethermind.Crypto; +using Nethermind.Crypto.Blake2; namespace Nethermind.Evm.Precompiles {
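For orientation: each unrolled block in the scalar path above computes the standard Blake2b G mixing function over four state words and two message words, and each //G1 + //G2 pair in the SSE4.1 path performs the two half-steps of G for four columns at once; only the rotation encoding differs between the two. Below is a minimal scalar sketch of G, mirroring the deleted Compute method; BitOperations.RotateRight stands in for the inlined shift/XOR pairs, and the helper name G is illustrative, not part of the patch.

    using System.Numerics;

    // Reference sketch of the Blake2b G step (illustrative, not from the patch).
    static void G(ref ulong a, ref ulong b, ref ulong c, ref ulong d, ulong x, ulong y)
    {
        // First half-step mixes message word x.
        a += b + x;
        d = BitOperations.RotateRight(d ^ a, 32); // inlined above as (v >> 32) ^ (v << 32)
        c += d;
        b = BitOperations.RotateRight(b ^ c, 24); // inlined above as (v >> 24) ^ (v << 40)
        // Second half-step mixes message word y.
        a += b + y;
        d = BitOperations.RotateRight(d ^ a, 16); // inlined above as (v >> 16) ^ (v << 48)
        c += d;
        b = BitOperations.RotateRight(b ^ c, 63); // inlined above as (v >> 63) ^ (v << 1)
    }

Both implementations consume the same 213-byte EIP-152 input that the deleted Compress method parsed: a big-endian uint32 round count at offset 0, the eight little-endian state words h at bytes 4..68, the sixteen message words m at 68..196, the two offset counters t at 196..212, and the final-block flag f in byte 212.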