Files
MultiPar/source/par2j/gf16.c
2024-11-30 13:06:17 +09:00

3450 lines
121 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// The basic of 16-bit Galois Field arithmetic is based on Galois.c by James S. Plank.
// Modified by Yutaka Sawada to support MMX, SSE2, and SSSE3.
/* Galois.c
* James S. Plank
Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques
Revision 1.2A
May 24, 2011
James S. Plank
Department of Electrical Engineering and Computer Science
University of Tennessee
Knoxville, TN 37996
plank@cs.utk.edu
Copyright (c) 2011, James S. Plank
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- Neither the name of the University of Tennessee nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#define _WIN32_WINNT 0x0601 // Windows 7 or later
#include <stdlib.h>
#include <stdio.h>
#include <windows.h>
#include <intrin.h> // 組み込み関数(intrinsic)を使用する場合インクルード
#include "gf16.h"
#include "gf_jit.h" // ParPar の JIT コード用
extern unsigned int cpu_flag; // declared in common2.h
#ifndef _WIN64 // 32-bit 版なら
#pragma warning(disable:4731) // inhibit VC's "ebp modified" warning
#pragma warning(disable:4799) // inhibit VC's "missing emms" warning
#endif
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// CPU によって使う関数を変更する際の仮宣言
//#define NO_SIMD // SIMD を使わない場合
int sse_unit;
void galois_align16_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align32_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align32avx_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align256_multiply(unsigned char *r1, unsigned char *r2, unsigned int len, int factor);
void galois_align32_multiply2(unsigned char *src1, unsigned char *src2, unsigned char *dst, unsigned int len, int factor1, int factor2);
void galois_align32avx_multiply2(unsigned char *src1, unsigned char *src2, unsigned char *dst, unsigned int len, int factor1, int factor2);
void galois_altmap_none(unsigned char *data, unsigned int bsize);
// AVX2 と SSSE3 の ALTMAP は 32バイト単位で行う
void galois_altmap32_change(unsigned char *data, unsigned int bsize);
void galois_altmap32_return(unsigned char *data, unsigned int bsize);
void checksum16_altmap32(unsigned char *data, unsigned char *hash, int byte_size);
void checksum16_return32(unsigned char *data, unsigned char *hash, int byte_size);
// JIT(SSE2) は 256バイト単位で計算する
void galois_altmap256_change(unsigned char *data, unsigned int bsize);
void galois_altmap256_return(unsigned char *data, unsigned int bsize);
void checksum16_altmap256(unsigned char *data, unsigned char *hash, int byte_size);
void checksum16_return256(unsigned char *data, unsigned char *hash, int byte_size);
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#define NW 65536
#define NWM1 65535
#define PRIM_POLY 0x1100B
// なぜかテーブルは 2バイト整数を使った方が速い
static unsigned short *galois_log_table = NULL;
static unsigned short *galois_exp_table;
int galois_create_table(void)
{
unsigned int j, b;
if (galois_log_table != NULL) return 0;
galois_log_table = _aligned_malloc(sizeof(unsigned short) * NW * 2, 64);
if (galois_log_table == NULL) return -1;
galois_exp_table = galois_log_table + NW; // 要素数は 65536個
b = 1;
for (j = 0; j < NWM1; j++){
galois_log_table[b] = (unsigned short)j;
galois_exp_table[j] = (unsigned short)b;
b = b << 1;
if (b & NW) b ^= PRIM_POLY;
}
galois_exp_table[NWM1] = galois_exp_table[0]; // copy for reduction (? mod NWM1)
// CPU によって使う関数を変更する
sse_unit = 16; // 16, 32, 64, 128 のどれでもいい (32のSSSE3は少し速い、GPUが識別するのに注意)
galois_align_multiply = galois_align16_multiply;
galois_align_multiply2 = NULL;
galois_altmap_change = galois_altmap_none;
galois_altmap_return = galois_altmap_none;
checksum16_altmap = checksum16;
checksum16_return = checksum16;
#ifndef NO_SIMD
if (cpu_flag & 256){ // AVX2, SSSE3, JIT(SSE2) の並び替えを使わない場合
// 将来的には AVX-512 などの命令に対応してもいい
//printf("\nWithout ALTMAP\n");
//sse_unit = 32;
} else if (cpu_flag & 16){ // AVX2 対応なら
//printf("\nUse AVX2 & ALTMAP\n");
sse_unit = 32; // 32, 64, 128 のどれでもいい
galois_align_multiply = galois_align32avx_multiply;
galois_align_multiply2 = galois_align32avx_multiply2;
galois_altmap_change = galois_altmap32_change;
galois_altmap_return = galois_altmap32_return;
checksum16_altmap = checksum16_altmap32;
checksum16_return = checksum16_return32;
} else if (cpu_flag & 1){ // SSSE3 対応なら
//printf("\nUse SSSE3 & ALTMAP\n");
sse_unit = 32; // 32, 64, 128 のどれでもいい
galois_align_multiply = galois_align32_multiply;
galois_align_multiply2 = galois_align32_multiply2;
galois_altmap_change = galois_altmap32_change;
galois_altmap_return = galois_altmap32_return;
checksum16_altmap = checksum16_altmap32;
checksum16_return = checksum16_return32;
} else { // SSSE3 が利用できない場合
if ((cpu_flag & 128) && (jit_alloc() == 0)){ // JIT(SSE2) を使う
//printf("\nUse JIT(SSE2) & ALTMAP\n");
sse_unit = 256;
galois_align_multiply = galois_align256_multiply;
galois_align_multiply2 = NULL;
galois_altmap_change = galois_altmap256_change;
galois_altmap_return = galois_altmap256_return;
checksum16_altmap = checksum16_altmap256;
checksum16_return = checksum16_return256;
}
}
#endif
return 0;
}
unsigned short galois_multiply(int x, int y)
{
int sum;
if ((x == 0) || (y == 0)) return 0;
sum = galois_log_table[x] + galois_log_table[y]; // result is from 2 to NWM1 * 2
//if (sum >= NWM1) sum -= NWM1;
sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1
return galois_exp_table[sum];
}
// multiply when "y" is a fixed value
unsigned short galois_multiply_fix(int x, int log_y)
{
int sum;
if (x == 0) return 0;
sum = galois_log_table[x] + log_y; // result is from 2 to NWM1 * 2
sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1
return galois_exp_table[sum];
}
unsigned short galois_divide(int x, int y)
{
int sum;
if (y == 0) return NWM1; // 除算エラー
if (x == 0) return 0;
sum = galois_log_table[x] - galois_log_table[y];
if (sum < 0) sum += NWM1;
return galois_exp_table[sum];
}
// ガロア体上での乗数計算、x の y 乗
unsigned short galois_power(int x, int y)
{
unsigned int sum;
if (x == 0) return 0; // 0**y = 0
if (y == 0) return 1; // x**0 = 1
if (y == 1) return (unsigned short)x; // x**1 = x
sum = (unsigned int)(galois_log_table[x]) * (unsigned int)y; // result is from 1 to NWM1 * NWM1
//sum = sum % NWM1;
sum = (sum >> 16) + (sum & NWM1); // result is from 1 to NWM1 * 2
sum = (sum >> 16) + (sum & NWM1); // result is from 0 to NWM1
return galois_exp_table[sum];
}
// ガロア体上での逆数、1 / x
unsigned short galois_reciprocal(int x)
{
if (x == 0) return NWM1; // 除算エラー
return galois_exp_table[NWM1 - (int)(galois_log_table[x])];
}
void galois_free_table(void) // テーブルを解放するために追加
{
if (galois_log_table != NULL){
_aligned_free(galois_log_table);
galois_log_table = NULL;
#ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う
if (((cpu_flag & 1) == 0) && ((cpu_flag & 128) == 0)) // SSSE3 を使わない場合、MMX の終了処理
_mm_empty();
#endif
// SSSE3 を使わない場合で、JIT(SSE2) を使った場合の終了処理
if (((cpu_flag & 1) == 0) && ((cpu_flag & 128) != 0))
jit_free();
}
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// MMX functions are based on code by Paul Houle (paulhoule.com) March 22, 2008
#ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う
// Processes block of data a multiple of 8 bytes long using SIMD (mmx) opcodes.
// The amount of data to process (bsize) must be a non-zero multiple of 8.
// Paul's original code was modified to calculate each 8-bytes and removed last shift by Yutaka Sawada
static void DoBlock8(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned int *pMtab)
{
__asm {
push ebp
mov ebx,bsize ;bytes to process (multiple of 8)
mov esi,input ;source
mov edi,output ;destination
mov ebp,pMtab ;combined multiplication table
mov eax,[esi] ;load 1st 8 source bytes
movd mm4,[esi+4]
sub ebx,8 ;reduce last 8-bytes from loop
add esi,ebx ;point to end of input/output
add edi,ebx
neg ebx ;convert byte size to count-up
lp8:
movzx edx,al
movzx ecx,ah
shr eax,16
movd mm0,[ebp+edx*4] ;order is [_][_][_][0]
movd mm1,[ebp+400h+ecx*4]
movzx edx,al
movzx ecx,ah
movd eax,mm4
movq mm4,[esi+ebx+8] ;read-ahead next 8 source bytes
movd mm2,[ebp+edx*4] ;order is [_][_][_][1]
movzx edx,al
movq mm5,[edi+ebx]
movd mm3,[ebp+400h+ecx*4]
movzx ecx,ah
shr eax,16
punpcklwd mm0,[ebp+edx*4] ;order is [_][_][2][0]
movzx edx,al
punpcklwd mm1,[ebp+400h+ecx*4]
movzx ecx,ah
punpcklwd mm2,[ebp+edx*4] ;order is [_][_][3][1]
pxor mm1,mm0
punpcklwd mm3,[ebp+400h+ecx*4]
pxor mm3,mm2
movd eax,mm4 ;prepare src bytes 3-0 for next loop
punpcklwd mm1,mm3 ;order is [3][2][1][0]
psrlq mm4,32 ;align src bytes 7-4 for next loop
pxor mm1,mm5
movq [edi+ebx],mm1
add ebx,8
jnz lp8
;no need to pre-read in last 8-bytes
movzx edx,al
movzx ecx,ah
shr eax,16
movd mm0,[ebp+edx*4] ;order is [_][_][_][0]
movd mm1,[ebp+400h+ecx*4]
movzx edx,al
movzx ecx,ah
movd eax,mm4
movd mm2,[ebp+edx*4] ;order is [_][_][_][1]
movzx edx,al
movq mm5,[edi+ebx]
movd mm3,[ebp+400h+ecx*4]
movzx ecx,ah
shr eax,16
punpcklwd mm0,[ebp+edx*4] ;order is [_][_][2][0]
movzx edx,al
punpcklwd mm1,[ebp+400h+ecx*4]
movzx ecx,ah
punpcklwd mm2,[ebp+edx*4] ;order is [_][_][3][1]
pxor mm1,mm0
punpcklwd mm3,[ebp+400h+ecx*4]
pxor mm3,mm2
punpcklwd mm1,mm3 ;order is [3][2][1][0]
pxor mm1,mm5
movq [edi+ebx],mm1
pop ebp
}
}
#endif
// calculate multiplication tables instantly
static void create_two_table(unsigned int *mtab, int factor){
int shift_table[8], i, j, sum;
// factor * 2の乗数を計算する
shift_table[0] = factor; // factor * 1
for (i = 1; i < 8; i++){
// if (factor & 0x8000){
// factor <<= 1;
// factor ^= 0x1100B;
// } else {
// factor <<= 1;
// }
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
shift_table[i] = factor; // factor * (2**i)
}
for (j = 0; j < 2; j++){
for (i = 0; i < 256; i += 32){
/*
sum = 0;
if (i & 32)
sum = shift_table[5];
if (i & 64)
sum ^= shift_table[6];
if (i & 128)
sum ^= shift_table[7];
*/
sum = shift_table[5] & ((i << 26) >> 31);
sum ^= shift_table[6] & ((i << 25) >> 31);
sum ^= shift_table[7] & ((i << 24) >> 31);
mtab[i ] = sum;
mtab[i + 1] = sum ^ shift_table[0];
mtab[i + 2] = sum ^ shift_table[1];
mtab[i + 3] = sum ^ shift_table[1] ^ shift_table[0];
mtab[i + 4] = sum ^ shift_table[2];
mtab[i + 5] = sum ^ shift_table[2] ^ shift_table[0];
mtab[i + 6] = sum ^ shift_table[2] ^ shift_table[1];
mtab[i + 7] = sum ^ shift_table[2] ^ shift_table[1] ^ shift_table[0];
mtab[i + 8] = sum ^ shift_table[3];
mtab[i + 9] = sum ^ shift_table[3] ^ shift_table[0];
mtab[i + 10] = sum ^ shift_table[3] ^ shift_table[1];
mtab[i + 11] = sum ^ shift_table[3] ^ shift_table[1] ^ shift_table[0];
mtab[i + 12] = sum ^ shift_table[3] ^ shift_table[2];
mtab[i + 13] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[0];
mtab[i + 14] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1];
mtab[i + 15] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1] ^ shift_table[0];
sum ^= shift_table[4];
mtab[i + 16] = sum;
mtab[i + 17] = sum ^ shift_table[0];
mtab[i + 18] = sum ^ shift_table[1];
mtab[i + 19] = sum ^ shift_table[1] ^ shift_table[0];
mtab[i + 20] = sum ^ shift_table[2];
mtab[i + 21] = sum ^ shift_table[2] ^ shift_table[0];
mtab[i + 22] = sum ^ shift_table[2] ^ shift_table[1];
mtab[i + 23] = sum ^ shift_table[2] ^ shift_table[1] ^ shift_table[0];
mtab[i + 24] = sum ^ shift_table[3];
mtab[i + 25] = sum ^ shift_table[3] ^ shift_table[0];
mtab[i + 26] = sum ^ shift_table[3] ^ shift_table[1];
mtab[i + 27] = sum ^ shift_table[3] ^ shift_table[1] ^ shift_table[0];
mtab[i + 28] = sum ^ shift_table[3] ^ shift_table[2];
mtab[i + 29] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[0];
mtab[i + 30] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1];
mtab[i + 31] = sum ^ shift_table[3] ^ shift_table[2] ^ shift_table[1] ^ shift_table[0];
}
for (i = 0; i < 8; i++){
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
shift_table[i] = factor; // factor * (2**i)
}
mtab += 256;
}
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// The method of using SSSE3 is based on Plank's papar;
// "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions".
/*
static void create_eight_table_c(unsigned char *mtab, int factor)
{
int shift_table[16], i, sum;
// factor * 2の乗数を計算する
shift_table[0] = factor; // factor * 1
for (i = 1; i < 16; i++){
// if (factor & 0x8000){
// factor <<= 1;
// factor ^= 0x1100B;
// } else {
// factor <<= 1;
// }
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
shift_table[i] = factor; // factor * (2**i)
}
for (i = 0; i < 16; i += 4){ // 4-bit ごとに計算する
mtab[i * 8 ] = 0;
mtab[i * 8 + 16] = 0;
sum = shift_table[i];
mtab[i * 8 + 1] = (unsigned char)sum; // lower 8-bit
mtab[i * 8 + 17] = (unsigned char)(sum >> 8); // higher 8-bit
sum = shift_table[i + 1];
mtab[i * 8 + 2] = (unsigned char)sum;
mtab[i * 8 + 18] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 3] = (unsigned char)sum;
mtab[i * 8 + 19] = (unsigned char)(sum >> 8);
sum = shift_table[i + 2];
mtab[i * 8 + 4] = (unsigned char)sum;
mtab[i * 8 + 20] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 5] = (unsigned char)sum;
mtab[i * 8 + 21] = (unsigned char)(sum >> 8);
sum = shift_table[i + 2] ^ shift_table[i + 1];
mtab[i * 8 + 6] = (unsigned char)sum;
mtab[i * 8 + 22] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 7] = (unsigned char)sum;
mtab[i * 8 + 23] = (unsigned char)(sum >> 8);
sum = shift_table[i + 3];
mtab[i * 8 + 8] = (unsigned char)sum;
mtab[i * 8 + 24] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 9] = (unsigned char)sum;
mtab[i * 8 + 25] = (unsigned char)(sum >> 8);
sum = shift_table[i + 3] ^ shift_table[i + 1];
mtab[i * 8 + 10] = (unsigned char)sum;
mtab[i * 8 + 26] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 11] = (unsigned char)sum;
mtab[i * 8 + 27] = (unsigned char)(sum >> 8);
sum = shift_table[i + 3] ^ shift_table[i + 2];
mtab[i * 8 + 12] = (unsigned char)sum;
mtab[i * 8 + 28] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 13] = (unsigned char)sum;
mtab[i * 8 + 29] = (unsigned char)(sum >> 8);
sum = shift_table[i + 3] ^ shift_table[i + 2] ^ shift_table[i + 1];
mtab[i * 8 + 14] = (unsigned char)sum;
mtab[i * 8 + 30] = (unsigned char)(sum >> 8);
sum = sum ^ shift_table[i];
mtab[i * 8 + 15] = (unsigned char)sum;
mtab[i * 8 + 31] = (unsigned char)(sum >> 8);
}
}
*/
#ifndef _WIN64 // 32-bit 版ならインライン・アセンブラを使う
// tables for split four combined multiplication
static void create_eight_table(unsigned char *mtab, int factor)
{
__asm { // This implementation requires SSE2
mov eax, factor
mov edx, mtab
mov ecx, -128
; create mask for 8-bit
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
psrlw xmm7, 8 ; 0x00FF *8
lp32:
; factor * 1, *2, *4, *8
movd xmm0, eax ; [_][_][_][_][_][_][_][1]
pxor xmm1, xmm1
movsx ebx, ax
sar ebx, 31
shl eax, 1
and ebx, 0x1100B
xor eax, ebx
pinsrw xmm1, eax, 1 ; [_][_][_][_][_][_][2][_]
pxor xmm2, xmm2
movsx ebx, ax
sar ebx, 31
shl eax, 1
and ebx, 0x1100B
xor eax, ebx
pinsrw xmm2, eax, 4 ; [_][_][_][4][_][_][_][_]
punpcklwd xmm1, xmm1 ; [_][_][_][_][2][2][_][_]
movsx ebx, ax
sar ebx, 31
shl eax, 1
and ebx, 0x1100B
xor eax, ebx
movd xmm3, eax ; [_][_][_][_][_][_][_][8]
pshuflw xmm0, xmm0, 17 ; [_][_][_][_][1][_][1][_]
punpcklwd xmm3, xmm3 ; [_][_][_][_][_][_][8][8]
pxor xmm0, xmm1 ; [_][_][_][_][3][2][1][_]
pshufhw xmm2, xmm2, 0 ; [4][4][4][4][_][_][_][_]
punpcklqdq xmm0, xmm0 ; [3][2][1][_][3][2][1][_]
pshufd xmm3, xmm3, 0 ; [8][8][8][8][8][8][8][8]
pxor xmm2, xmm0 ; [7][6][5][4][3][2][1][_]
pxor xmm3, xmm2 ; [15][14][13][12][11][10][9][8]
movdqa xmm0, xmm2
movdqa xmm1, xmm3
pand xmm0, xmm7
pand xmm1, xmm7
packuswb xmm0, xmm1 ; lower 8-bit * 16
psrlw xmm2, 8
psrlw xmm3, 8
packuswb xmm2, xmm3 ; higher 8-bit * 16
movdqa [edx+128+ecx], xmm0
movdqa [edx+144+ecx], xmm2
; for next loop
movsx ebx, ax ; move with sign of word
sar ebx, 31
shl eax, 1
and ebx, 0x1100B
xor eax, ebx
add ecx, 32
jnz lp32
}
}
// VC2008 は SSSE3 をインライン・アセンブラで使える
// Address (input) does not need be 16-byte aligned
static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__asm {
mov ecx, input ; source
mov edx, output ; destination
mov eax, bsize ; bytes to process (multiple of 16)
mov ebx, table ; multiplication table
; create mask for 8 entries
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
psrlw xmm7, 12 ; 0x000F *8
add ecx, eax ; point to end of input/output
add edx, eax
neg eax ; convert byte size to count-up
lp16:
movdqu xmm0, [ecx+eax] ; read source 16-bytes
movdqa xmm2, [edx+eax]
movdqa xmm3, [ebx] ; low table
movdqa xmm4, [ebx+16] ; high table
movdqa xmm1, xmm0 ; copy source
psrlw xmm0, 4 ; prepare next 4-bit
pand xmm1, xmm7 ; src & 0x000F
pshufb xmm3, xmm1 ; table look-up
psllw xmm1, 8 ; shift 8-bit for higher table
pshufb xmm4, xmm1
movdqa xmm5, [ebx+32] ; low table
movdqa xmm6, [ebx+48] ; high table
pxor xmm3, xmm4 ; combine high and low
pxor xmm2, xmm3
movdqa xmm1, xmm0 ; copy source
psrlw xmm0, 4 ; prepare next 4-bit
pand xmm1, xmm7 ; (src >> 4) & 0x000F
pshufb xmm5, xmm1 ; table look-up
psllw xmm1, 8 ; shift 8-bit for higher table
pshufb xmm6, xmm1
movdqa xmm3, [ebx+64] ; low table
movdqa xmm4, [ebx+80] ; high table
pxor xmm5, xmm6 ; combine high and low
pxor xmm2, xmm5
movdqa xmm1, xmm0 ; copy source
psrlw xmm0, 4 ; prepare next 4-bit
pand xmm1, xmm7 ; (src >> 8) & 0x000F
pshufb xmm3, xmm1 ; table look-up
psllw xmm1, 8 ; shift 8-bit for higher table
pshufb xmm4, xmm1
movdqa xmm5, [ebx+96] ; low table
movdqa xmm6, [ebx+112] ; high table
pxor xmm3, xmm4 ; combine high and low
pxor xmm2, xmm3
pshufb xmm5, xmm0 ; table look-up
psllw xmm0, 8 ; shift 8-bit for higher table
pshufb xmm6, xmm0
pxor xmm5, xmm6 ; combine high and low
pxor xmm2, xmm5
movdqa [edx+eax], xmm2
add eax, 16
jnz lp16
}
}
// その場で 32バイトごとに並び替えて、計算後に戻す方法50% faster than 16-byte version)
// Address (input) does not need be 16-byte aligned
static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__asm {
mov ecx, input ; source
mov edx, output ; destination
mov eax, bsize ; bytes to process (multiple of 32)
mov ebx, table ; multiplication table
; create mask for 16 entries
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
pcmpeqw xmm6, xmm6 ; 0xFFFF *8
psrlw xmm7, 12 ; 0x000F *8
psrlw xmm6, 8 ; 0x00FF *8
packuswb xmm7, xmm7 ; 0x0F *16
add ecx, eax ; point to end of input/output
add edx, eax
neg eax ; convert byte size to count-up
lp32:
movdqu xmm0, [ecx+eax ] ; read source 32-bytes
movdqu xmm2, [ecx+eax+16]
movdqa xmm1, xmm0 ; copy source
movdqa xmm3, xmm2
pand xmm0, xmm6 ; erase higher byte
pand xmm2, xmm6
psrlw xmm1, 8 ; move higher byte to lower
psrlw xmm3, 8
packuswb xmm0, xmm2 ; select lower byte of each word
packuswb xmm1, xmm3 ; select higher byte of each word
movdqa xmm4, [ebx] ; low table
movdqa xmm5, [ebx+16] ; high table
movdqa xmm3, xmm0 ; copy source
psrlw xmm0, 4 ; prepare next 4-bit
pand xmm3, xmm7 ; src & 0x0F
pand xmm0, xmm7 ; (src >> 4) & 0x0F
pshufb xmm4, xmm3 ; table look-up
pshufb xmm5, xmm3
movdqa xmm2, [ebx+32] ; low table
movdqa xmm3, [ebx+48] ; high table
pshufb xmm2, xmm0 ; table look-up
pshufb xmm3, xmm0
pxor xmm4, xmm2 ; combine result
pxor xmm5, xmm3
movdqa xmm2, [ebx+64] ; low table
movdqa xmm3, [ebx+80] ; high table
movdqa xmm0, xmm1 ; copy source
psrlw xmm1, 4 ; prepare next 4-bit
pand xmm0, xmm7 ; src & 0x0F
pand xmm1, xmm7 ; (src >> 4) & 0x0F
pshufb xmm2, xmm0 ; table look-up
pshufb xmm3, xmm0
pxor xmm4, xmm2 ; combine result
pxor xmm5, xmm3
movdqa xmm2, [ebx+96] ; low table
movdqa xmm3, [ebx+112] ; high table
pshufb xmm2, xmm1 ; table look-up
pshufb xmm3, xmm1
pxor xmm4, xmm2 ; combine result
pxor xmm5, xmm3
movdqa xmm0, [edx+eax] ; read dest 32-bytes
movdqa xmm1, [edx+eax+16]
movdqa xmm3, xmm4 ; copy result
punpcklbw xmm3, xmm5 ; interleave lower and higher bytes
punpckhbw xmm4, xmm5
pxor xmm0, xmm3
pxor xmm1, xmm4
movdqa [edx+eax], xmm0 ; write dest 32-bytes
movdqa [edx+eax+16], xmm1
add eax, 32
jnz lp32
}
}
// 先に 32バイトごとに並び替えてあるデータを扱う方法
static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__asm {
mov ecx, input ; source
mov edx, output ; destination
mov eax, bsize ; bytes to process (multiple of 32)
mov ebx, table ; multiplication table
; create mask for 16 entries
pcmpeqw xmm7, xmm7 ; 0xFFFF *8
psrlw xmm7, 12 ; 0x000F *8
packuswb xmm7, xmm7 ; 0x0F *16
add ecx, eax ; point to end of input/output
add edx, eax
neg eax ; convert byte size to count-up
lp32:
movdqa xmm0, [ecx+eax] ; read source 32-bytes
movdqa xmm1, [ecx+eax+16]
movdqa xmm4, [ebx] ; low table
movdqa xmm5, [ebx+16] ; high table
movdqa xmm3, xmm0 ; copy source
psrlw xmm0, 4 ; prepare next 4-bit
pand xmm3, xmm7 ; src & 0x0F
pand xmm0, xmm7 ; (src >> 4) & 0x0F
pshufb xmm4, xmm3 ; table look-up
pshufb xmm5, xmm3
movdqa xmm2, [ebx+32] ; low table
movdqa xmm3, [ebx+48] ; high table
pshufb xmm2, xmm0 ; table look-up
pshufb xmm3, xmm0
pxor xmm2, xmm4 ; combine result
pxor xmm3, xmm5
movdqa xmm4, [ebx+64] ; low table
movdqa xmm5, [ebx+80] ; high table
movdqa xmm0, xmm1 ; copy source
psrlw xmm0, 4 ; prepare next 4-bit
pand xmm1, xmm7 ; src & 0x0F
pand xmm0, xmm7 ; (src >> 4) & 0x0F
pshufb xmm4, xmm1 ; table look-up
pshufb xmm5, xmm1
pxor xmm4, xmm2 ; combine result
pxor xmm5, xmm3
movdqa xmm2, [ebx+96] ; low table
movdqa xmm3, [ebx+112] ; high table
pshufb xmm2, xmm0 ; table look-up
pshufb xmm3, xmm0
movdqa xmm0, [edx+eax] ; read dest 32-bytes
movdqa xmm1, [edx+eax+16]
pxor xmm4, xmm2 ; combine result
pxor xmm5, xmm3
pxor xmm4, xmm0
pxor xmm5, xmm1
movdqa [edx+eax], xmm4 ; write dest 32-bytes
movdqa [edx+eax+16], xmm5
add eax, 32
jnz lp32
}
}
#else // 64-bit 版ではインライン・アセンブラを使えない
// (__m128i *) で逐次ポインターをキャスト変換するよりも、
// 先に __m128i* で定義しておいた方が、連続した領域へのアクセス最適化がうまくいく?
// ほとんど変わらない気がする(むしろ遅い?)・・・コンパイラ次第なのかも
// tables for split four combined multiplication
static void create_eight_table(unsigned char *mtab, int factor)
{
int count = 4;
__m128i *tbl;
__m128i xmm0, xmm1, xmm2, xmm3, mask;
tbl = (__m128i *)mtab;
// create mask for 8-bit
mask = _mm_setzero_si128();
mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8
mask = _mm_srli_epi16(mask, 8); // 0x00FF *8
while (1){
xmm0 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][1]
xmm1 = _mm_setzero_si128();
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
xmm1 = _mm_insert_epi16(xmm1, factor, 1); // [_][_][_][_][_][_][2][_]
xmm2 = _mm_setzero_si128();
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
xmm2 = _mm_insert_epi16(xmm2, factor, 4); // [_][_][_][4][_][_][_][_]
xmm1 = _mm_unpacklo_epi16(xmm1, xmm1); // [_][_][_][_][2][2][_][_]
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
xmm3 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][8]
xmm0 = _mm_shufflelo_epi16(xmm0, _MM_SHUFFLE(0, 1, 0, 1)); // [_][_][_][_][1][_][1][_]
xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); // [_][_][_][_][_][_][8][8]
xmm0 = _mm_xor_si128(xmm0, xmm1); // [_][_][_][_][3][2][1][_]
xmm2 = _mm_shufflehi_epi16(xmm2, _MM_SHUFFLE(0, 0, 0, 0)); // [4][4][4][4][_][_][_][_]
xmm0 = _mm_unpacklo_epi64(xmm0, xmm0); // [3][2][1][_][3][2][1][_]
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0, 0, 0, 0)); // [8][8][8][8][8][8][8][8]
xmm2 = _mm_xor_si128(xmm2, xmm0); // [7][6][5][4][3][2][1][_]
xmm3 = _mm_xor_si128(xmm3, xmm2); // [15][14][13][12][11][10][9][8]
xmm0 = _mm_load_si128(&xmm2);
xmm1 = _mm_load_si128(&xmm3);
xmm0 = _mm_and_si128(xmm0, mask);
xmm1 = _mm_and_si128(xmm1, mask);
xmm0 = _mm_packus_epi16(xmm0, xmm1); // lower 8-bit * 16
xmm2 = _mm_srli_epi16(xmm2, 8);
xmm3 = _mm_srli_epi16(xmm3, 8);
xmm2 = _mm_packus_epi16(xmm2, xmm3); // higher 8-bit * 16
_mm_store_si128(tbl , xmm0);
_mm_store_si128(tbl + 1, xmm2);
count--;
if (count == 0)
break;
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
tbl += 2;
}
}
// 16バイトごとに計算する方法、_mm_shuffle_epi8 の利用効率が悪い。
// Address (input) does not need be 16-byte aligned
static void gf16_ssse3_block16u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m128i *src, *dst, *tbl;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
src = (__m128i *)input;
dst = (__m128i *)output;
tbl = (__m128i *)table;
// create mask for 8 entries
xmm7 = _mm_setzero_si128();
xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8
xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8
while (bsize != 0){
xmm0 = _mm_loadu_si128(src); // read source 16-bytes
xmm2 = _mm_load_si128(dst);
xmm3 = _mm_load_si128(tbl); // low table
xmm4 = _mm_load_si128(tbl + 1); // high table
xmm1 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x000F
xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up
xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table
xmm4 = _mm_shuffle_epi8(xmm4, xmm1);
xmm5 = _mm_load_si128(tbl + 2); // low table
xmm6 = _mm_load_si128(tbl + 3); // high table
xmm3 = _mm_xor_si128(xmm3, xmm4); // combine high and low
xmm2 = _mm_xor_si128(xmm2, xmm3);
xmm1 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x000F
xmm5 = _mm_shuffle_epi8(xmm5, xmm1); // table look-up
xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table
xmm6 = _mm_shuffle_epi8(xmm6, xmm1);
xmm3 = _mm_load_si128(tbl + 4); // low table
xmm4 = _mm_load_si128(tbl + 5); // high table
xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low
xmm2 = _mm_xor_si128(xmm2, xmm5);
xmm1 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x000F
xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up
xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table
xmm4 = _mm_shuffle_epi8(xmm4, xmm1);
xmm5 = _mm_load_si128(tbl + 6); // low table
xmm6 = _mm_load_si128(tbl + 7); // high table
xmm3 = _mm_xor_si128(xmm3, xmm4); // combine high and low
xmm2 = _mm_xor_si128(xmm2, xmm3);
xmm5 = _mm_shuffle_epi8(xmm5, xmm0); // table look-up
xmm0 = _mm_slli_epi16(xmm0, 8); // shift 8-bit for higher table
xmm6 = _mm_shuffle_epi8(xmm6, xmm0);
xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low
xmm2 = _mm_xor_si128(xmm2, xmm5);
_mm_store_si128(dst, xmm2);
src += 1;
dst += 1;
bsize -= 16;
}
}
// その場で 32バイトごとに並び替えて、計算後に戻す方法50% faster than 16-byte version)
// なぜか asm を使わない方が速い!? 32-bit と 64-bit の両方で使える
// xmm レジスタを 8個までしか使わない方が 32-bit 版で速いし安定する
// Address (input) does not need be 16-byte aligned
static void gf16_ssse3_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
// copy tables to local
tbl0 = _mm_load_si128((__m128i *)table);
tbl1 = _mm_load_si128((__m128i *)table + 1);
tbl2 = _mm_load_si128((__m128i *)table + 2);
tbl3 = _mm_load_si128((__m128i *)table + 3);
tbl4 = _mm_load_si128((__m128i *)table + 4);
tbl5 = _mm_load_si128((__m128i *)table + 5);
tbl6 = _mm_load_si128((__m128i *)table + 6);
tbl7 = _mm_load_si128((__m128i *)table + 7);
// create mask for 16 entries
xmm7 = _mm_setzero_si128();
xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8
xmm6 = _mm_srli_epi16(xmm7, 8); // 0x00FF *8
xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8
xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16
while (bsize != 0){
xmm1 = _mm_loadu_si128((__m128i *)input); // read source 32-bytes
xmm3 = _mm_loadu_si128((__m128i *)input + 1);
xmm0 = _mm_and_si128(xmm1, xmm6); // erase higher byte
xmm2 = _mm_and_si128(xmm3, xmm6);
xmm1 = _mm_srli_epi16(xmm1, 8); // move higher byte to lower
xmm3 = _mm_srli_epi16(xmm3, 8);
xmm0 = _mm_packus_epi16(xmm0, xmm2); // select lower byte of each word
xmm1 = _mm_packus_epi16(xmm1, xmm3); // select higher byte of each word
xmm4 = _mm_load_si128(&tbl0); // load tables
xmm5 = _mm_load_si128(&tbl1);
xmm3 = _mm_and_si128(xmm0, xmm7); // src & 0x0F
xmm0 = _mm_and_si128(_mm_srli_epi16(xmm0, 4), xmm7); // (src >> 4) & 0x0F
xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up
xmm5 = _mm_shuffle_epi8(xmm5, xmm3);
xmm2 = _mm_load_si128(&tbl2); // load tables
xmm3 = _mm_load_si128(&tbl3);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(&tbl4); // load tables
xmm3 = _mm_load_si128(&tbl5);
xmm0 = _mm_and_si128(xmm1, xmm7); // src & 0x0F
xmm1 = _mm_and_si128(_mm_srli_epi16(xmm1, 4), xmm7); // (src >> 4) & 0x0F
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(&tbl6); // load tables
xmm3 = _mm_load_si128(&tbl7);
xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128((__m128i *)output); // read dest 32-bytes
xmm1 = _mm_load_si128((__m128i *)output + 1);
xmm3 = _mm_unpacklo_epi8(xmm4, xmm5); // interleave lower and higher bytes
xmm4 = _mm_unpackhi_epi8(xmm4, xmm5);
xmm0 = _mm_xor_si128(xmm0, xmm3);
xmm1 = _mm_xor_si128(xmm1, xmm4);
_mm_store_si128((__m128i *)output, xmm0); // write dest 32-bytes
_mm_store_si128((__m128i *)output + 1, xmm1);
input += 32;
output += 32;
bsize -= 32;
}
}
// xmm レジスタにテーブルを読み込む方が 64-bit 版で微妙に速い
static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7;
__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
// copy tables to local
tbl0 = _mm_load_si128((__m128i *)table);
tbl1 = _mm_load_si128((__m128i *)table + 1);
tbl2 = _mm_load_si128((__m128i *)table + 2);
tbl3 = _mm_load_si128((__m128i *)table + 3);
tbl4 = _mm_load_si128((__m128i *)table + 4);
tbl5 = _mm_load_si128((__m128i *)table + 5);
tbl6 = _mm_load_si128((__m128i *)table + 6);
tbl7 = _mm_load_si128((__m128i *)table + 7);
// create mask for 16 entries
xmm7 = _mm_setzero_si128();
xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8
xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8
xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16
while (bsize != 0){
xmm0 = _mm_load_si128((__m128i *)input); // read source 32-bytes
xmm1 = _mm_load_si128((__m128i *)input + 1);
xmm3 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm3 = _mm_and_si128(xmm3, xmm7); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F
xmm4 = _mm_load_si128(&tbl0); // load tables
xmm5 = _mm_load_si128(&tbl1);
xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up
xmm5 = _mm_shuffle_epi8(xmm5, xmm3);
xmm2 = _mm_load_si128(&tbl2); // load tables
xmm3 = _mm_load_si128(&tbl3);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128(&xmm1); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F
xmm2 = _mm_load_si128(&tbl4); // load tables
xmm3 = _mm_load_si128(&tbl5);
xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(&tbl6); // load tables
xmm3 = _mm_load_si128(&tbl7);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm0 = _mm_load_si128((__m128i *)output); // read dest 32-bytes
xmm1 = _mm_load_si128((__m128i *)output + 1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm4 = _mm_xor_si128(xmm4, xmm0);
xmm5 = _mm_xor_si128(xmm5, xmm1);
_mm_store_si128((__m128i *)output, xmm4); // write dest 32-bytes
_mm_store_si128((__m128i *)output + 1, xmm5);
input += 32;
output += 32;
bsize -= 32;
}
}
/*
static void gf16_ssse3_block32_altmap(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m128i *src, *dst, *tbl;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7;
src = (__m128i *)input;
dst = (__m128i *)output;
tbl = (__m128i *)table;
// create mask for 16 entries
xmm7 = _mm_setzero_si128();
xmm7 = _mm_cmpeq_epi16(xmm7, xmm7); // 0xFFFF *8
xmm7 = _mm_srli_epi16(xmm7, 12); // 0x000F *8
xmm7 = _mm_packus_epi16(xmm7, xmm7); // 0x0F *16
while (bsize != 0){
xmm0 = _mm_load_si128(src); // read source 32-bytes
xmm1 = _mm_load_si128(src + 1);
xmm3 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm3 = _mm_and_si128(xmm3, xmm7); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F
xmm4 = _mm_load_si128(tbl); // load tables
xmm5 = _mm_load_si128(tbl + 1);
xmm4 = _mm_shuffle_epi8(xmm4, xmm3); // table look-up
xmm5 = _mm_shuffle_epi8(xmm5, xmm3);
xmm2 = _mm_load_si128(tbl + 2); // load tables
xmm3 = _mm_load_si128(tbl + 3);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128(&xmm1); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, xmm7); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, xmm7); // (src >> 4) & 0x0F
xmm2 = _mm_load_si128(tbl + 4); // load tables
xmm3 = _mm_load_si128(tbl + 5);
xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(tbl + 6); // load tables
xmm3 = _mm_load_si128(tbl + 7);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm0 = _mm_load_si128(dst); // read dest 32-bytes
xmm1 = _mm_load_si128(dst + 1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm4 = _mm_xor_si128(xmm4, xmm0);
xmm5 = _mm_xor_si128(xmm5, xmm1);
_mm_store_si128(dst, xmm4); // write dest 32-bytes
_mm_store_si128(dst + 1, xmm5);
src += 2;
dst += 2;
bsize -= 32;
}
}
*/
#endif
// 逆行列計算用に掛け算だけするXORで追加しない
static void gf16_ssse3_block16s(unsigned char *data, unsigned int bsize, unsigned char *table)
{
__m128i dest, mask, xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
__m128i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
// copy tables to local
tbl0 = _mm_load_si128((__m128i *)table);
tbl1 = _mm_load_si128((__m128i *)table + 1);
tbl2 = _mm_load_si128((__m128i *)table + 2);
tbl3 = _mm_load_si128((__m128i *)table + 3);
tbl4 = _mm_load_si128((__m128i *)table + 4);
tbl5 = _mm_load_si128((__m128i *)table + 5);
tbl6 = _mm_load_si128((__m128i *)table + 6);
tbl7 = _mm_load_si128((__m128i *)table + 7);
// create mask for 8 entries
mask = _mm_setzero_si128();
mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8
mask = _mm_srli_epi16(mask, 12); // 0x000F *8
while (bsize != 0){
xmm0 = _mm_load_si128((__m128i *)data); // read source 16-bytes
xmm3 = _mm_load_si128(&tbl0); // low table
xmm4 = _mm_load_si128(&tbl1); // high table
xmm1 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, mask); // src & 0x000F
xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up
xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table
xmm4 = _mm_shuffle_epi8(xmm4, xmm1);
xmm5 = _mm_load_si128(&tbl2); // low table
xmm6 = _mm_load_si128(&tbl3); // high table
dest = _mm_xor_si128(xmm3, xmm4); // combine high and low
xmm1 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, mask); // src & 0x000F
xmm5 = _mm_shuffle_epi8(xmm5, xmm1); // table look-up
xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table
xmm6 = _mm_shuffle_epi8(xmm6, xmm1);
xmm3 = _mm_load_si128(&tbl4); // low table
xmm4 = _mm_load_si128(&tbl5); // high table
xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low
dest = _mm_xor_si128(dest, xmm5);
xmm1 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, mask); // src & 0x000F
xmm3 = _mm_shuffle_epi8(xmm3, xmm1); // table look-up
xmm1 = _mm_slli_epi16(xmm1, 8); // shift 8-bit for higher table
xmm4 = _mm_shuffle_epi8(xmm4, xmm1);
xmm5 = _mm_load_si128(&tbl6); // low table
xmm6 = _mm_load_si128(&tbl7); // high table
xmm3 = _mm_xor_si128(xmm3, xmm4); // combine high and low
dest = _mm_xor_si128(dest, xmm3);
xmm5 = _mm_shuffle_epi8(xmm5, xmm0); // table look-up
xmm0 = _mm_slli_epi16(xmm0, 8); // shift 8-bit for higher table
xmm6 = _mm_shuffle_epi8(xmm6, xmm0);
xmm5 = _mm_xor_si128(xmm5, xmm6); // combine high and low
dest = _mm_xor_si128(dest, xmm5);
_mm_store_si128((__m128i *)data, dest);
data += 16;
bsize -= 16;
}
}
// 2ブロック同時に計算することで、メモリーへのアクセス回数を減らす
// 128バイトのテーブルを個用意しておくこと
// xmm レジスタの数が足りないので、テーブルを毎回ロードする
static void gf16_ssse3_block32_altmap2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m128i *tbl;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, mask;
tbl = (__m128i *)table;
// create mask for 16 entries
mask = _mm_setzero_si128();
mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8
mask = _mm_srli_epi16(mask, 12); // 0x000F *8
mask = _mm_packus_epi16(mask, mask); // 0x0F *16
while (bsize != 0){
xmm0 = _mm_load_si128((__m128i *)input1); // read source 32-bytes
xmm1 = _mm_load_si128((__m128i *)input1 + 1);
xmm6 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm6 = _mm_and_si128(xmm6, mask); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F
xmm4 = _mm_load_si128(tbl); // load tables
xmm5 = _mm_load_si128(tbl + 1);
xmm4 = _mm_shuffle_epi8(xmm4, xmm6); // table look-up
xmm5 = _mm_shuffle_epi8(xmm5, xmm6);
xmm2 = _mm_load_si128(tbl + 2); // load tables
xmm3 = _mm_load_si128(tbl + 3);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128(&xmm1); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, mask); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F
xmm2 = _mm_load_si128(tbl + 4); // load tables
xmm3 = _mm_load_si128(tbl + 5);
xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(tbl + 6); // load tables
xmm3 = _mm_load_si128(tbl + 7);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128((__m128i *)input2); // read source 32-bytes
xmm1 = _mm_load_si128((__m128i *)input2 + 1);
xmm6 = _mm_load_si128(&xmm0); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm6 = _mm_and_si128(xmm6, mask); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F
xmm2 = _mm_load_si128(tbl + 8); // load tables
xmm3 = _mm_load_si128(tbl + 9);
xmm2 = _mm_shuffle_epi8(xmm2, xmm6); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm6);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(tbl + 10); // load tables
xmm3 = _mm_load_si128(tbl + 11);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128(&xmm1); // copy source
xmm0 = _mm_srli_epi16(xmm0, 4); // prepare next 4-bit
xmm1 = _mm_and_si128(xmm1, mask); // src & 0x0F
xmm0 = _mm_and_si128(xmm0, mask); // (src >> 4) & 0x0F
xmm2 = _mm_load_si128(tbl + 12); // load tables
xmm3 = _mm_load_si128(tbl + 13);
xmm2 = _mm_shuffle_epi8(xmm2, xmm1); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm1);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm2 = _mm_load_si128(tbl + 14); // load tables
xmm3 = _mm_load_si128(tbl + 15);
xmm2 = _mm_shuffle_epi8(xmm2, xmm0); // table look-up
xmm3 = _mm_shuffle_epi8(xmm3, xmm0);
xmm4 = _mm_xor_si128(xmm4, xmm2); // combine result
xmm5 = _mm_xor_si128(xmm5, xmm3);
xmm0 = _mm_load_si128((__m128i *)output); // read dest 32-bytes
xmm1 = _mm_load_si128((__m128i *)output + 1);
xmm0 = _mm_xor_si128(xmm0, xmm4);
xmm1 = _mm_xor_si128(xmm1, xmm5);
_mm_store_si128((__m128i *)output, xmm0); // write dest 32-bytes
_mm_store_si128((__m128i *)output + 1, xmm1);
input1 += 32;
input2 += 32;
output += 32;
bsize -= 32;
}
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// AVX2 命令を使うには Windows 7 以降じゃないといけない
// _mm256_permute2x128_si256 の control の意味は以下を参照
// http://www.felixcloutier.com/x86/VPERM2I128.html
// AVX2 を使って全体を倍していくと、13% ぐらい速くなる
// でも、テーブル作成が少し速くなっても、全体的な速度はほとんど変わらない・・・
static void create_eight_table_avx2(unsigned char *mtab, int factor)
{
int count;
__m128i xmm0, xmm1, xmm2, xmm3, mask8;
__m256i ymm0, ymm1, ymm2, ymm3, base, poly, mask16;
// create mask for 8-bit
mask8 = _mm_setzero_si128();
mask8 = _mm_cmpeq_epi16(mask8, mask8); // 0xFFFF *8
mask8 = _mm_srli_epi16(mask8, 8); // 0x00FF *8
xmm0 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][1]
xmm1 = _mm_setzero_si128();
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
xmm1 = _mm_insert_epi16(xmm1, factor, 1); // [_][_][_][_][_][_][2][_]
xmm2 = _mm_setzero_si128();
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
xmm2 = _mm_insert_epi16(xmm2, factor, 4); // [_][_][_][4][_][_][_][_]
xmm1 = _mm_unpacklo_epi16(xmm1, xmm1); // [_][_][_][_][2][2][_][_]
factor = (factor << 1) ^ (((factor << 16) >> 31) & 0x1100B);
xmm3 = _mm_cvtsi32_si128(factor); // [_][_][_][_][_][_][_][8]
xmm0 = _mm_shufflelo_epi16(xmm0, _MM_SHUFFLE(0, 1, 0, 1)); // [_][_][_][_][1][_][1][_]
xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); // [_][_][_][_][_][_][8][8]
xmm0 = _mm_xor_si128(xmm0, xmm1); // [_][_][_][_][3][2][1][_]
xmm2 = _mm_shufflehi_epi16(xmm2, _MM_SHUFFLE(0, 0, 0, 0)); // [4][4][4][4][_][_][_][_]
xmm0 = _mm_unpacklo_epi64(xmm0, xmm0); // [3][2][1][_][3][2][1][_]
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0, 0, 0, 0)); // [8][8][8][8][8][8][8][8]
xmm2 = _mm_xor_si128(xmm2, xmm0); // [7][6][5][4][3][2][1][_]
xmm3 = _mm_xor_si128(xmm3, xmm2); // [15][14][13][12][11][10][9][8]
// 途中で AVX2 命令を使っても遅くならないっぽい
poly = _mm256_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B * 16
mask16 = _mm256_cmpeq_epi16(poly, poly);
mask16 = _mm256_srli_epi16(mask16, 8); // 0x00FF *16
base = _mm256_setzero_si256();
base = _mm256_inserti128_si256(base, xmm2, 0);
base = _mm256_inserti128_si256(base, xmm3, 1);
// ymm レジスタに読み込んでる間にメモリーに書き込んだ方が速い
xmm0 = _mm_and_si128(xmm2, mask8);
xmm1 = _mm_and_si128(xmm3, mask8);
xmm0 = _mm_packus_epi16(xmm0, xmm1); // lower 8-bit * 16
xmm2 = _mm_srli_epi16(xmm2, 8);
xmm3 = _mm_srli_epi16(xmm3, 8);
xmm2 = _mm_packus_epi16(xmm2, xmm3); // higher 8-bit * 16
_mm_store_si128((__m128i *)mtab , xmm0);
_mm_store_si128((__m128i *)mtab + 1, xmm2);
for (count = 1; count < 4; count++){
// 全体を2倍する
ymm0 = _mm256_slli_epi16(base, 1);
ymm1 = _mm256_srai_epi16(base, 15);
ymm1 = _mm256_and_si256(ymm1, poly);
base = _mm256_xor_si256(ymm1, ymm0);
// 全体を2倍する
ymm0 = _mm256_slli_epi16(base, 1);
ymm1 = _mm256_srai_epi16(base, 15);
ymm1 = _mm256_and_si256(ymm1, poly);
base = _mm256_xor_si256(ymm1, ymm0);
// 全体を2倍する
ymm0 = _mm256_slli_epi16(base, 1);
ymm1 = _mm256_srai_epi16(base, 15);
ymm1 = _mm256_and_si256(ymm1, poly);
base = _mm256_xor_si256(ymm1, ymm0);
// 全体を2倍する
ymm0 = _mm256_slli_epi16(base, 1);
ymm1 = _mm256_srai_epi16(base, 15);
ymm1 = _mm256_and_si256(ymm1, poly);
base = _mm256_xor_si256(ymm1, ymm0);
// 並び替えて保存する
ymm0 = _mm256_and_si256(base, mask16); // lower 8-bit * 16
ymm1 = _mm256_srli_epi16(base, 8); // higher 8-bit * 16
ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20);
ymm3 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31);
ymm0 = _mm256_packus_epi16(ymm2, ymm3);
_mm256_store_si256((__m256i *)mtab + count, ymm0);
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
// 逆行列計算用に掛け算だけするXORで追加しない
static void gf16_avx2_block32s(unsigned char *data, unsigned int bsize, unsigned char *table)
{
__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
__m256i mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
// copy tables to local
tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo]
tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo]
tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi]
tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi]
// split to 8 tables
tbl0 = _mm256_permute2x128_si256(tmp0, tmp0, 0x00); // tbl0[low0][low0]
tbl1 = _mm256_permute2x128_si256(tmp1, tmp1, 0x00); // tbl1[low1][low1]
tbl2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x00); // tbl2[low2][low2]
tbl3 = _mm256_permute2x128_si256(tmp3, tmp3, 0x00); // tbl3[low3][low3]
tbl4 = _mm256_permute2x128_si256(tmp0, tmp0, 0x11); // tbl0[high0][high0]
tbl5 = _mm256_permute2x128_si256(tmp1, tmp1, 0x11); // tbl1[high1][high1]
tbl6 = _mm256_permute2x128_si256(tmp2, tmp2, 0x11); // tbl2[high2][high2]
tbl7 = _mm256_permute2x128_si256(tmp3, tmp3, 0x11); // tbl3[high3][high3]
// create mask for 16 entries
mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16
mask = _mm256_srli_epi16(mask, 12); // 0x000F *16
while (bsize != 0){
src0 = _mm256_load_si256((__m256i *)data); // read source 32-bytes
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl0, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl4, src1);
src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
dest = _mm256_xor_si256(tmp0, tmp1); // combine high and low
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl1, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl5, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
dest = _mm256_xor_si256(dest, tmp1);
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl2, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl6, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
dest = _mm256_xor_si256(dest, tmp1);
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl3, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl7, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
dest = _mm256_xor_si256(dest, tmp1);
_mm256_store_si256((__m256i *)data, dest); // write dest 32-bytes
data += 32;
bsize -= 32;
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
// 逆行列計算用に ALTMAP されてないソースにも対応しておく
// Address (input) does not need be 32-byte aligned
static void gf16_avx2_block32u(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
__m256i mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
// copy tables to local
tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo]
tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo]
tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi]
tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi]
// split to 8 tables
tbl0 = _mm256_permute2x128_si256(tmp0, tmp0, 0x00); // tbl0[low0][low0]
tbl1 = _mm256_permute2x128_si256(tmp1, tmp1, 0x00); // tbl1[low1][low1]
tbl2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x00); // tbl2[low2][low2]
tbl3 = _mm256_permute2x128_si256(tmp3, tmp3, 0x00); // tbl3[low3][low3]
tbl4 = _mm256_permute2x128_si256(tmp0, tmp0, 0x11); // tbl0[high0][high0]
tbl5 = _mm256_permute2x128_si256(tmp1, tmp1, 0x11); // tbl1[high1][high1]
tbl6 = _mm256_permute2x128_si256(tmp2, tmp2, 0x11); // tbl2[high2][high2]
tbl7 = _mm256_permute2x128_si256(tmp3, tmp3, 0x11); // tbl3[high3][high3]
// create mask for 16 entries
mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16
mask = _mm256_srli_epi16(mask, 12); // 0x000F *16
while (bsize != 0){
src0 = _mm256_loadu_si256((__m256i *)input); // read source 32-bytes
dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl0, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl4, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
dest = _mm256_xor_si256(dest, tmp1);
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl1, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl5, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
dest = _mm256_xor_si256(dest, tmp1);
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl2, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl6, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
src0 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
dest = _mm256_xor_si256(dest, tmp1);
src1 = _mm256_and_si256(src0, mask); // src & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl3, src1); // table look-up
src1 = _mm256_slli_epi16(src1, 8); // shift 8-bit for higher table
tmp1 = _mm256_shuffle_epi8(tbl7, src1);
dest = _mm256_xor_si256(dest, tmp0); // combine high and low
dest = _mm256_xor_si256(dest, tmp1);
_mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes
input += 32;
output += 32;
bsize -= 32;
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
// テーブルを並び替えて使えば、ループ内の並び替え回数を一回に減らせる
static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
// copy tables to local
tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo]
tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo]
tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi]
tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi]
// re-arrange table order (permute より blend の方が速いらしい)
tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0); // tbl0[low0][high2] <- 0x0f[lo][hi]
tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0); // tbl1[low1][high3] <- 0xf0[lo][hi]
tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03); // tbl2[high0][low2] <- 0x0f[lo][hi]
tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03); // tbl3[high1][low3] <- 0xf0[lo][hi]
// create mask for 32 entries
mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16
mask = _mm256_srli_epi16(mask, 12); // 0x000F *16
mask = _mm256_packus_epi16(mask, mask); // 0x0F *32
while (bsize != 0){
src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up
tmp1 = _mm256_shuffle_epi8(tbl1, src1);
tmp2 = _mm256_shuffle_epi8(tbl2, src0);
tmp3 = _mm256_shuffle_epi8(tbl3, src1);
tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result
tmp2 = _mm256_xor_si256(tmp2, tmp3);
tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01); // exchange low & high 128-bit
dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
tmp0 = _mm256_xor_si256(tmp0, tmp2);
dest = _mm256_xor_si256(dest, tmp0);
_mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes
input += 32;
output += 32;
bsize -= 32;
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
/*
// テーブルを並び替えて使えば、ループ内の並び替え回数を減らせる
static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, src2, src3;
// copy tables to local
src0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo]
src1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo]
src2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi]
src3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi]
// re-arrange table order
tbl0 = _mm256_permute2x128_si256(src0, src2, 0x30); // tblA[low0][high2] <- 0x0f[lo][hi]
tbl1 = _mm256_permute2x128_si256(src1, src3, 0x30); // tblB[low1][high3] <- 0xf0[lo][hi]
tbl2 = _mm256_permute2x128_si256(src2, src0, 0x30); // tblC[low2][high0] <- 0x0f[hi][lo]
tbl3 = _mm256_permute2x128_si256(src3, src1, 0x30); // tblD[low3][high1] <- 0xf0[hi][lo]
// create mask for 32 entries
mask = _mm256_set1_epi8(0x0F); // 0x0F *32
while (bsize != 0){
src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes
dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
src2 = _mm256_permute2x128_si256(src0, src0, 0x01); // exchange low & high 128-bit of "src & 0x0F"
src3 = _mm256_permute2x128_si256(src1, src1, 0x01); // exchange low & high 128-bit of "(src >> 4) & 0x0F"
src0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up
src1 = _mm256_shuffle_epi8(tbl1, src1);
src2 = _mm256_shuffle_epi8(tbl2, src2);
src3 = _mm256_shuffle_epi8(tbl3, src3);
src0 = _mm256_xor_si256(src0, src1); // combine result
src2 = _mm256_xor_si256(src2, src3);
dest = _mm256_xor_si256(dest, src0);
dest = _mm256_xor_si256(dest, src2);
_mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes
input += 32;
output += 32;
bsize -= 32;
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
*/
/*
// レジスタを大量に使って依存関係をなくせば並列処理できるかも?
static void gf16_avx2_block32(unsigned char *input, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m256i tbl0, tbl1, tbl2, tbl3, mask, dest, src0, src1, tmp0, tmp1, tmp2, tmp3;
// copy tables to local
tbl0 = _mm256_load_si256((__m256i *)table);
tbl1 = _mm256_load_si256((__m256i *)table + 1);
tbl2 = _mm256_load_si256((__m256i *)table + 2);
tbl3 = _mm256_load_si256((__m256i *)table + 3);
// create mask for 32 entries
mask = _mm256_set1_epi8(0x0F); // 0x0F *32
while (bsize != 0){
src0 = _mm256_load_si256((__m256i *)input); // read source 32-bytes
dest = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
tmp0 = _mm256_permute2x128_si256(src0, src0, 0x00); // copy low 128-bit to high from "src & 0x0F"
tmp1 = _mm256_permute2x128_si256(src1, src1, 0x00); // copy low 128-bit to high from "(src >> 4) & 0x0F"
tmp2 = _mm256_permute2x128_si256(src0, src0, 0x11); // copy high 128-bit to low from "src & 0x0F"
tmp3 = _mm256_permute2x128_si256(src1, src1, 0x11); // copy high 128-bit to low from "(src >> 4) & 0x0F"
tmp0 = _mm256_shuffle_epi8(tbl0, tmp0); // table look-up
tmp1 = _mm256_shuffle_epi8(tbl1, tmp1);
tmp2 = _mm256_shuffle_epi8(tbl2, tmp2);
tmp3 = _mm256_shuffle_epi8(tbl3, tmp3);
tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result
tmp2 = _mm256_xor_si256(tmp2, tmp3);
dest = _mm256_xor_si256(dest, tmp0);
dest = _mm256_xor_si256(dest, tmp2);
_mm256_store_si256((__m256i *)output, dest); // write dest 32-bytes
input += 32;
output += 32;
bsize -= 32;
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
*/
// 2ブロック同時に計算することで、メモリーへのアクセス回数を減らす
// 128バイトのテーブルを個用意しておくこと
static void gf16_avx2_block32_2(unsigned char *input1, unsigned char *input2, unsigned char *output, unsigned int bsize, unsigned char *table)
{
__m256i mask, src0, src1, tmp0, tmp1, tmp2, tmp3;
__m256i tbl0, tbl1, tbl2, tbl3, tbl4, tbl5, tbl6, tbl7;
// copy tables to local
tmp0 = _mm256_load_si256((__m256i *)table); // tbl0[low0][high0] <- 0x0f[lo][lo]
tmp1 = _mm256_load_si256((__m256i *)table + 1); // tbl1[low1][high1] <- 0xf0[lo][lo]
tmp2 = _mm256_load_si256((__m256i *)table + 2); // tbl2[low2][high2] <- 0x0f[hi][hi]
tmp3 = _mm256_load_si256((__m256i *)table + 3); // tbl3[low3][high3] <- 0xf0[hi][hi]
// re-arrange table order (permute より blend の方が速いらしい)
tbl0 = _mm256_blend_epi32(tmp0, tmp2, 0xF0); // tbl0[low0][high2] <- 0x0f[lo][hi]
tbl1 = _mm256_blend_epi32(tmp1, tmp3, 0xF0); // tbl1[low1][high3] <- 0xf0[lo][hi]
tbl2 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03); // tbl2[high0][low2] <- 0x0f[lo][hi]
tbl3 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03); // tbl3[high1][low3] <- 0xf0[lo][hi]
tmp0 = _mm256_load_si256((__m256i *)table + 4);
tmp1 = _mm256_load_si256((__m256i *)table + 5);
tmp2 = _mm256_load_si256((__m256i *)table + 6);
tmp3 = _mm256_load_si256((__m256i *)table + 7);
tbl4 = _mm256_blend_epi32(tmp0, tmp2, 0xF0);
tbl5 = _mm256_blend_epi32(tmp1, tmp3, 0xF0);
tbl6 = _mm256_permute2x128_si256(tmp2, tmp0, 0x03);
tbl7 = _mm256_permute2x128_si256(tmp3, tmp1, 0x03);
// create mask for 32 entries
mask = _mm256_cmpeq_epi16(tmp0, tmp0); // 0xFFFF *16
mask = _mm256_srli_epi16(mask, 12); // 0x000F *16
mask = _mm256_packus_epi16(mask, mask); // 0x0F *32
while (bsize != 0){
src0 = _mm256_load_si256((__m256i *)input1); // read source 32-bytes
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
tmp0 = _mm256_shuffle_epi8(tbl0, src0); // table look-up
tmp1 = _mm256_shuffle_epi8(tbl1, src1);
tmp2 = _mm256_shuffle_epi8(tbl2, src0);
tmp3 = _mm256_shuffle_epi8(tbl3, src1);
tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result
tmp2 = _mm256_xor_si256(tmp2, tmp3);
src0 = _mm256_load_si256((__m256i *)input2); // read source 32-bytes
src1 = _mm256_srli_epi16(src0, 4); // prepare next 4-bit
src0 = _mm256_and_si256(src0, mask); // src & 0x0F
src1 = _mm256_and_si256(src1, mask); // (src >> 4) & 0x0F
tmp1 = _mm256_shuffle_epi8(tbl4, src0); // table look-up
tmp3 = _mm256_shuffle_epi8(tbl6, src0);
tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result
tmp2 = _mm256_xor_si256(tmp2, tmp3);
tmp1 = _mm256_shuffle_epi8(tbl5, src1); // table look-up
tmp3 = _mm256_shuffle_epi8(tbl7, src1);
tmp0 = _mm256_xor_si256(tmp0, tmp1); // combine result
tmp2 = _mm256_xor_si256(tmp2, tmp3);
src0 = _mm256_load_si256((__m256i *)output); // read dest 32-bytes
tmp2 = _mm256_permute2x128_si256(tmp2, tmp2, 0x01); // exchange low & high 128-bit
src0 = _mm256_xor_si256(src0, tmp0);
src0 = _mm256_xor_si256(src0, tmp2);
_mm256_store_si256((__m256i *)output, src0); // write dest 32-bytes
input1 += 32;
input2 += 32;
output += 32;
bsize -= 32;
}
// AVX-SSE 切り替えの回避
_mm256_zeroupper();
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// データを並び替えることで、メモリーアクセスを高速化する
void galois_altmap32_change(unsigned char *data, unsigned int bsize)
{
__m128i xmm0, xmm1, xmm2, xmm3, mask;
mask = _mm_setzero_si128();
mask = _mm_cmpeq_epi16(mask, mask); // 0xFFFF *8
mask = _mm_srli_epi16(mask, 8); // 0x00FF *8
while (bsize != 0){
xmm1 = _mm_load_si128((__m128i *)data); // read 32-bytes
xmm3 = _mm_load_si128((__m128i *)data + 1);
xmm0 = _mm_and_si128(xmm1, mask); // erase higher byte
xmm2 = _mm_and_si128(xmm3, mask);
xmm1 = _mm_srli_epi16(xmm1, 8); // move higher byte to lower
xmm3 = _mm_srli_epi16(xmm3, 8);
xmm0 = _mm_packus_epi16(xmm0, xmm2); // select lower byte of each word
xmm1 = _mm_packus_epi16(xmm1, xmm3); // select higher byte of each word
_mm_store_si128((__m128i *)data, xmm0); // write 32-bytes
_mm_store_si128((__m128i *)data + 1, xmm1);
data += 32;
bsize -= 32;
}
}
// データの並びを元に戻す
void galois_altmap32_return(unsigned char *data, unsigned int bsize)
{
__m128i xmm0, xmm1, xmm2;
while (bsize != 0){
xmm1 = _mm_load_si128((__m128i *)data); // read 32-bytes
xmm2 = _mm_load_si128((__m128i *)data + 1);
xmm0 = _mm_unpacklo_epi8(xmm1, xmm2); // interleave lower and higher bytes
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);
_mm_store_si128((__m128i *)data, xmm0); // write 32-bytes
_mm_store_si128((__m128i *)data + 1, xmm1);
data += 32;
bsize -= 32;
}
}
// 並び替えない場合
void galois_altmap_none(unsigned char *data, unsigned int bsize)
{
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
from ParPar; "gf_w16_additions.c"
gf_w16_xor_lazy_sse_altmap_multiply_region
gf_w16_xor_lazy_sse_jit_altmap_multiply_region
*/
// 256バイトごとにビット単位の XOR で計算する方法
// input と output の領域は重ならないようにすること
static void gf16_sse2_block256(unsigned char *input, unsigned char *output, unsigned int bsize, int factor)
{
unsigned int i, bit;
unsigned int counts[16];
uintptr_t deptable[16][16];
__m128i depmask1, depmask2, polymask1, polymask2, addvals1, addvals2;
unsigned short tmp_depmask[16];
// calculate dependent bits
addvals1 = _mm_set_epi16(1<< 7, 1<< 6, 1<< 5, 1<< 4, 1<< 3, 1<< 2, 1<<1, 1<<0);
addvals2 = _mm_set_epi16(1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8);
// duplicate each bit in the polynomial 16 times
polymask2 = _mm_set1_epi16(PRIM_POLY & 0xFFFF); // chop off top bit, although not really necessary
polymask1 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 8, 1<< 9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15));
polymask2 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 0, 1<< 1, 1<< 2, 1<< 3, 1<< 4, 1<< 5, 1<< 6, 1<< 7));
polymask1 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask1);
polymask2 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask2);
if (factor & (1<<15)){
// XOR
depmask1 = addvals1;
depmask2 = addvals2;
} else {
depmask1 = _mm_setzero_si128();
depmask2 = _mm_setzero_si128();
}
for (i = (1<<14); i; i >>= 1){
// rotate
__m128i last = _mm_shuffle_epi32(_mm_shufflelo_epi16(depmask1, 0), 0);
depmask1 = _mm_insert_epi16(
_mm_srli_si128(depmask1, 2),
_mm_extract_epi16(depmask2, 0),
7
);
depmask2 = _mm_srli_si128(depmask2, 2);
// XOR poly
depmask1 = _mm_xor_si128(depmask1, _mm_andnot_si128(polymask1, last));
depmask2 = _mm_xor_si128(depmask2, _mm_andnot_si128(polymask2, last));
if (factor & i){
// XOR
depmask1 = _mm_xor_si128(depmask1, addvals1);
depmask2 = _mm_xor_si128(depmask2, addvals2);
}
}
// generate needed tables
_mm_storeu_si128((__m128i*)(tmp_depmask), depmask1);
_mm_storeu_si128((__m128i*)(tmp_depmask + 8), depmask2);
for (bit = 0; bit < 16; bit++){
unsigned int cnt = 0;
for (i = 0; i < 16; i++){
if (tmp_depmask[bit] & (1<<i)){
deptable[bit][cnt++] = i<<4; // pre-multiply because x86 addressing can't do a x16; this saves a shift operation later
}
}
counts[bit] = cnt;
}
while (bsize != 0){
#define STEP(bit) { \
uintptr_t* deps = deptable[bit]; \
__m128i tmp = _mm_load_si128((__m128i *)output + bit); \
switch (counts[bit]){ \
case 16: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[15])); \
case 15: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[14])); \
case 14: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[13])); \
case 13: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[12])); \
case 12: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[11])); \
case 11: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[10])); \
case 10: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 9])); \
case 9: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 8])); \
case 8: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 7])); \
case 7: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 6])); \
case 6: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 5])); \
case 5: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 4])); \
case 4: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 3])); \
case 3: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 2])); \
case 2: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 1])); \
case 1: tmp = _mm_xor_si128(tmp, *(__m128i *)(input + deps[ 0])); \
} \
_mm_store_si128((__m128i *)output + bit, tmp); \
}
STEP( 0)
STEP( 1)
STEP( 2)
STEP( 3)
STEP( 4)
STEP( 5)
STEP( 6)
STEP( 7)
STEP( 8)
STEP( 9)
STEP(10)
STEP(11)
STEP(12)
STEP(13)
STEP(14)
STEP(15)
#undef STEP
input += 256;
output += 256;
bsize -= 256;
}
}
// bsize が 0 にならないようにすること
static void gf16_sse2_block256_jit(unsigned char *input, unsigned char *output, unsigned int bsize, int factor)
{
FAST_U32 i, bit;
long inBit;
__m128i depmask1, depmask2, polymask1, polymask2, addvals1, addvals2;
__m128i common_mask;
unsigned short tmp_depmask[16], common_depmask[8];
unsigned char * pos_startloop;
unsigned char *jit_exec, *jit_ptr;
int j, thread_id;
// calculate dependent bits
addvals1 = _mm_set_epi16(1<< 7, 1<< 6, 1<< 5, 1<< 4, 1<< 3, 1<< 2, 1<<1, 1<<0);
addvals2 = _mm_set_epi16(1<<15, 1<<14, 1<<13, 1<<12, 1<<11, 1<<10, 1<<9, 1<<8);
// duplicate each bit in the polynomial 16 times
polymask2 = _mm_set1_epi16(PRIM_POLY & 0xFFFF); // chop off top bit, although not really necessary
polymask1 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 8, 1<< 9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15));
polymask2 = _mm_and_si128(polymask2, _mm_set_epi16(1<< 0, 1<< 1, 1<< 2, 1<< 3, 1<< 4, 1<< 5, 1<< 6, 1<< 7));
polymask1 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask1);
polymask2 = _mm_cmpeq_epi16(_mm_setzero_si128(), polymask2);
if (factor & (1<<15)){
// XOR
depmask1 = addvals1;
depmask2 = addvals2;
} else {
depmask1 = _mm_setzero_si128();
depmask2 = _mm_setzero_si128();
}
for (i = (1<<14); i; i >>= 1){
// rotate
__m128i last = _mm_shuffle_epi32(_mm_shufflelo_epi16(depmask1, 0), 0);
depmask1 = _mm_insert_epi16(
_mm_srli_si128(depmask1, 2),
_mm_extract_epi16(depmask2, 0),
7
);
depmask2 = _mm_srli_si128(depmask2, 2);
// XOR poly
depmask1 = _mm_xor_si128(depmask1, _mm_andnot_si128(polymask1, last));
depmask2 = _mm_xor_si128(depmask2, _mm_andnot_si128(polymask2, last));
if (factor & i){
// XOR
depmask1 = _mm_xor_si128(depmask1, addvals1);
depmask2 = _mm_xor_si128(depmask2, addvals2);
}
}
// attempt to remove some redundant XOR ops with a simple heuristic
// heuristic: we just find common XOR elements between bit pairs
{
__m128i tmp1, tmp2;
// first, we need to re-arrange words so that we can perform bitwise AND on neighbouring pairs
// unfortunately, PACKUSDW is SSE4.1 only, so emulate it with shuffles
// 01234567 -> 02461357
tmp1 = _mm_shuffle_epi32(
_mm_shufflelo_epi16(
_mm_shufflehi_epi16(depmask1, 0xD8), /* 0xD8 == 0b11011000 */
0xD8
),
0xD8
);
tmp2 = _mm_shuffle_epi32(
_mm_shufflelo_epi16(
_mm_shufflehi_epi16(depmask2, 0xD8),
0xD8
),
0xD8
);
common_mask = _mm_and_si128(
// [02461357, 8ACE9BDF] -> [02468ACE, 13579BDF]
_mm_unpacklo_epi64(tmp1, tmp2),
_mm_unpackhi_epi64(tmp1, tmp2)
);
// we have the common elements between pairs, but it doesn't make sense to process a separate queue if there's only one common element (0 XORs), so eliminate those
common_mask = _mm_andnot_si128(_mm_cmpeq_epi16(
_mm_setzero_si128(),
// "(v & (v-1)) == 0" is true if only zero/one bit is set in each word
_mm_and_si128(common_mask, _mm_sub_epi16(common_mask, _mm_set1_epi16(1)))
), common_mask);
// we now have a common elements mask without 1-bit words, just simply merge stuff in
depmask1 = _mm_xor_si128(depmask1, _mm_unpacklo_epi16(common_mask, common_mask));
depmask2 = _mm_xor_si128(depmask2, _mm_unpackhi_epi16(common_mask, common_mask));
_mm_storeu_si128((__m128i*)common_depmask, common_mask);
}
_mm_storeu_si128((__m128i*)(tmp_depmask), depmask1);
_mm_storeu_si128((__m128i*)(tmp_depmask + 8), depmask2);
// Multi-threading だとスレッドごとに実行領域を分離しないとアクセス違反エラーが発生する
thread_id = GetCurrentThreadId(); // 自分のスレッド ID を取得する
for (j = 0; j < MAX_CPU; j++){ // 対応するスレッド個数は MAX_CPU 個まで
if (jit_id[j] == thread_id)
break;
}
if (j == MAX_CPU){ // 初期状態では jit_code 内は全て 0 なので jit_id も 0 だけ
for (j = 0; j < MAX_CPU; j++){
if (InterlockedCompareExchange(jit_id + j, thread_id, 0) == 0) // 0と置き換えたなら
break;
}
}
jit_exec = jit_code + 4096 * j;
jit_ptr = jit_exec;
#ifdef _WIN64
_jit_push(&jit_ptr, BP);
_jit_mov_r(&jit_ptr, BP, SP);
// align pointer (avoid SP because stuff is encoded differently with it)
_jit_mov_r(&jit_ptr, AX, SP);
_jit_and_i(&jit_ptr, AX, 0xF);
_jit_sub_r(&jit_ptr, BP, AX);
// make Windows happy and save XMM6-15 registers
// ideally should be done by this function, not JIT code, but MSVC has a convenient policy of no inline ASM
for (i = 6; i < 16; i++)
_jit_movaps_store(&jit_ptr, BP, -((int32_t)i-5)*16, (uint8_t)i);
#endif
// adding 128 to the destination pointer allows the register offset to be coded in 1 byte
// eg: 'movdqa xmm0, [rdx+0x90]' is 8 bytes, whilst 'movdqa xmm0, [rdx-0x60]' is 5 bytes
_jit_mov_i(&jit_ptr, AX, (intptr_t)input + 128);
_jit_mov_i(&jit_ptr, DX, (intptr_t)output + 128);
_jit_mov_i(&jit_ptr, CX, (intptr_t)output + bsize + 128);
_jit_align32(&jit_ptr);
pos_startloop = jit_ptr;
//_jit_movaps_load(reg, xreg, offs)
// (we just save a conditional by hardcoding this)
#define _LD_APS(xreg, mreg, offs) \
*(int32_t*)(jit_ptr) = 0x40280F + ((xreg) <<19) + ((mreg) <<16) + (((offs)&0xFF) <<24); \
jit_ptr += 4
#define _ST_APS(mreg, offs, xreg) \
*(int32_t*)(jit_ptr) = 0x40290F + ((xreg) <<19) + ((mreg) <<16) + (((offs)&0xFF) <<24); \
jit_ptr += 4
#define _LD_APS64(xreg, mreg, offs) \
*(int64_t*)(jit_ptr) = 0x40280F44 + ((xreg-8) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
jit_ptr += 5
#define _ST_APS64(mreg, offs, xreg) \
*(int64_t*)(jit_ptr) = 0x40290F44 + ((xreg-8) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
jit_ptr += 5
#ifdef _WIN64
#define _LD_DQA(xreg, mreg, offs) \
*(int64_t*)(jit_ptr) = 0x406F0F66 + ((xreg) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
jit_ptr += 5
#define _ST_DQA(mreg, offs, xreg) \
*(int64_t*)(jit_ptr) = 0x407F0F66 + ((xreg) <<27) + ((mreg) <<24) + ((int64_t)((offs)&0xFF) <<32); \
jit_ptr += 5
#else
#define _LD_DQA(xreg, mreg, offs) \
*(int32_t*)(jit_ptr) = 0x406F0F66 + ((xreg) <<27) + ((mreg) <<24); \
*(jit_ptr +4) = (uint8_t)((offs)&0xFF); \
jit_ptr += 5
#define _ST_DQA(mreg, offs, xreg) \
*(int32_t*)(jit_ptr) = 0x407F0F66 + ((xreg) <<27) + ((mreg) <<24); \
*(jit_ptr +4) = (uint8_t)((offs)&0xFF); \
jit_ptr += 5
#endif
#define _LD_DQA64(xreg, mreg, offs) \
*(int64_t*)(jit_ptr) = 0x406F0F4466 + ((int64_t)(xreg-8) <<35) + ((int64_t)(mreg) <<32) + ((int64_t)((offs)&0xFF) <<40); \
jit_ptr += 6
#define _ST_DQA64(mreg, offs, xreg) \
*(int64_t*)(jit_ptr) = 0x407F0F4466 + ((int64_t)(xreg-8) <<35) + ((int64_t)(mreg) <<32) + ((int64_t)((offs)&0xFF) <<40); \
jit_ptr += 6
//_jit_xorps_m(reg, AX, offs<<4);
#define _XORPS_M_(reg, offs, tr) \
*(int32_t*)(jit_ptr) = (0x40570F + ((reg) << 19) + (((offs)&0xFF) <<28)) ^ (tr)
#define _C_XORPS_M(reg, offs, c) \
_XORPS_M_(reg, offs, 0); \
jit_ptr += (c)<<2
#define _XORPS_M64_(reg, offs, tr) \
*(int64_t*)(jit_ptr) = (0x40570F44 + (((reg)-8) << 27) + ((int64_t)((offs)&0xFF) <<36)) ^ ((tr)<<8)
#define _C_XORPS_M64(reg, offs, c) \
_XORPS_M64_(reg, offs, 0); \
jit_ptr += ((c)<<2)+(c)
//_jit_pxor_m(1, AX, offs<<4);
#ifdef _WIN64
#define _PXOR_M_(reg, offs, tr) \
*(int64_t*)(jit_ptr) = (0x40EF0F66 + ((reg) << 27) + ((int64_t)((offs)&0xFF) << 36)) ^ (tr)
#else
#define _PXOR_M_(reg, offs, tr) \
*(int32_t*)(jit_ptr) = (0x40EF0F66 + ((reg) << 27)) ^ (tr); \
*(jit_ptr +4) = (uint8_t)(((offs)&0xFF) << 4)
#endif
#define _PXOR_M(reg, offs) \
_PXOR_M_(reg, offs, 0); \
jit_ptr += 5
#define _C_PXOR_M(reg, offs, c) \
_PXOR_M_(reg, offs, 0); \
jit_ptr += ((c)<<2)+(c)
#define _PXOR_M64_(reg, offs, tr) \
*(int64_t*)(jit_ptr) = (0x40EF0F4466 + ((int64_t)((reg)-8) << 35) + ((int64_t)((offs)&0xFF) << 44)) ^ ((tr)<<8)
#define _C_PXOR_M64(reg, offs, c) \
_PXOR_M64_(reg, offs, 0); \
jit_ptr += ((c)<<2)+((c)<<1)
//_jit_xorps_r(r2, r1)
#define _XORPS_R_(r2, r1, tr) \
*(int32_t*)(jit_ptr) = (0xC0570F + ((r2) <<19) + ((r1) <<16)) ^ (tr)
#define _XORPS_R(r2, r1) \
_XORPS_R_(r2, r1, 0); \
jit_ptr += 3
#define _C_XORPS_R(r2, r1, c) \
_XORPS_R_(r2, r1, 0); \
jit_ptr += ((c)<<1)+(c)
// r2 is always < 8, r1 here is >= 8
#define _XORPS_R64_(r2, r1, tr) \
*(int32_t*)(jit_ptr) = (0xC0570F41 + ((r2) <<27) + ((r1) <<24)) ^ ((tr)<<8)
#define _C_XORPS_R64(r2, r1, c) \
_XORPS_R64_(r2, r1, 0); \
jit_ptr += (c)<<2
//_jit_pxor_r(r2, r1)
#define _PXOR_R_(r2, r1, tr) \
*(int32_t*)(jit_ptr) = (0xC0EF0F66 + ((r2) <<27) + ((r1) <<24)) ^ (tr)
#define _PXOR_R(r2, r1) \
_PXOR_R_(r2, r1, 0); \
jit_ptr += 4
#define _C_PXOR_R(r2, r1, c) \
_PXOR_R_(r2, r1, 0); \
jit_ptr += (c)<<2
#define _PXOR_R64_(r2, r1, tr) \
*(int64_t*)(jit_ptr) = (0xC0EF0F4166 + ((int64_t)(r2) <<35) + ((int64_t)(r1) <<32)) ^ (((int64_t)tr)<<8)
#define _C_PXOR_R64(r2, r1, c) \
_PXOR_R64_(r2, r1, 0); \
jit_ptr += ((c)<<2)+(c)
// optimised mix of xor/mov operations
#define _MOV_OR_XOR_FP_M(reg, offs, flag, c) \
_XORPS_M_(reg, offs, flag); \
flag &= (c)-1; \
jit_ptr += (c)<<2
#define _MOV_OR_XOR_FP_M64(reg, offs, flag, c) \
_XORPS_M64_(reg, offs, flag); \
flag &= (c)-1; \
jit_ptr += ((c)<<2)+(c)
#define _MOV_OR_XOR_FP_INIT (0x570F ^ 0x280F)
#define _MOV_OR_XOR_INT_M(reg, offs, flag, c) \
_PXOR_M_(reg, offs, flag); \
flag &= (c)-1; \
jit_ptr += ((c)<<2)+(c)
#define _MOV_OR_XOR_INT_M64(reg, offs, flag, c) \
_PXOR_M64_(reg, offs, flag); \
flag &= (c)-1; \
jit_ptr += ((c)<<2)+((c)<<1)
#define _MOV_OR_XOR_INT_INIT (0xEF0F00 ^ 0x6F0F00)
#define _MOV_OR_XOR_R_FP(r2, r1, flag, c) \
_XORPS_R_(r2, r1, flag); \
flag &= (c)-1; \
jit_ptr += ((c)<<1)+(c)
#define _MOV_OR_XOR_R64_FP(r2, r1, flag, c) \
_XORPS_R64_(r2, r1, flag); \
flag &= (c)-1; \
jit_ptr += (c)<<2
#define _MOV_OR_XOR_R_INT(r2, r1, flag, c) \
_PXOR_R_(r2, r1, flag); \
flag &= (c)-1; \
jit_ptr += (c)<<2
#define _MOV_OR_XOR_R64_INT(r2, r1, flag, c) \
_PXOR_R64_(r2, r1, flag); \
flag &= (c)-1; \
jit_ptr += ((c)<<2)+(c)
// generate code
#ifdef _WIN64
// preload upper 13 inputs into registers
#define _XORS_FROM_MEMORY 3
for (inBit = 3; inBit < 8; inBit++){
_LD_APS(inBit, AX, (inBit-8)<<4);
}
for (; inBit<16; inBit++){
_LD_APS64(inBit, AX, (inBit-8)<<4);
}
#else
// can only fit 5 in 32-bit mode :(
#define _XORS_FROM_MEMORY 11
for (inBit = 3; inBit < 8; inBit++){ // despite appearances, we're actually loading the top 5, not mid 5
_LD_APS(inBit, AX, inBit<<4);
}
#endif
for (bit = 0; bit < 16; bit += 2){
int destOffs = (int)((bit<<4)-128);
FAST_U32 movC = _MOV_OR_XOR_INT_INIT;
FAST_U16 mask1 = tmp_depmask[bit], mask2 = tmp_depmask[bit+1], maskC = common_depmask[bit>>1];
_LD_APS(0, DX, destOffs);
_LD_DQA(1, DX, destOffs+16);
for (inBit = -8; inBit < (_XORS_FROM_MEMORY-8); inBit++){
_MOV_OR_XOR_INT_M(2, inBit, movC, maskC & 1);
_C_XORPS_M(0, inBit, mask1 & 1);
_C_PXOR_M(1, inBit, mask2 & 1);
mask1 >>= 1;
mask2 >>= 1;
maskC >>= 1;
}
// at least 5 can come from registers
for (inBit = 3; inBit < 8; inBit++){
_MOV_OR_XOR_R_INT(2, inBit, (int32_t)movC, maskC & 1);
_C_XORPS_R(0, inBit, mask1 & 1);
_C_PXOR_R(1, inBit, mask2 & 1);
mask1 >>= 1;
mask2 >>= 1;
maskC >>= 1;
}
#ifdef _WIN64
// more XORs can come from 64-bit registers
for (inBit = 0; inBit < 8; inBit++){
_MOV_OR_XOR_R64_INT(2, inBit, movC, maskC & 1);
_C_XORPS_R64(0, inBit, mask1 & 1);
_C_PXOR_R64(1, inBit, mask2 & 1);
mask1 >>= 1;
mask2 >>= 1;
maskC >>= 1;
}
#endif
if (!movC){
_XORPS_R(0, 2);
_PXOR_R(1, 2); // penalty?
}
_ST_APS(DX, destOffs, 0);
_ST_DQA(DX, destOffs+16, 1);
}
#undef _XORS_FROM_MEMORY
_jit_add_i(&jit_ptr, AX, 256);
_jit_add_i(&jit_ptr, DX, 256);
_jit_cmp_r(&jit_ptr, DX, CX);
_jit_jcc(&jit_ptr, JL, pos_startloop);
#ifdef _WIN64
for (i = 6; i < 16; i++)
_jit_movaps_load(&jit_ptr, (uint8_t)i, BP, -((int32_t)i-5)*16);
_jit_pop(&jit_ptr, BP);
#endif
_jit_ret(&jit_ptr);
// exec
(*(void(*)(void))jit_exec)();
}
/*
XOR method is based on ParPar.
https://github.com/animetosho/ParPar/
*/
// bit 単位で XOR するには、256バイトごとに並び替える
void galois_altmap256_change(unsigned char *data, unsigned int bsize)
{
unsigned short dtmp[128];
int i, j;
__m128i ta, tb, lmask, th, tl;
lmask = _mm_set1_epi16(0xff);
while (bsize != 0){
for (j = 0; j < 8; j++){
ta = _mm_load_si128((__m128i *)data); // read 32-bytes
tb = _mm_load_si128((__m128i *)data + 1);
// split to high/low parts
th = _mm_packus_epi16(_mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8));
tl = _mm_packus_epi16(_mm_and_si128(tb, lmask), _mm_and_si128(ta, lmask));
// save to dest by extracting 16-bit masks
dtmp[0 + j] = _mm_movemask_epi8(th);
for (i = 1; i < 8; i++){
th = _mm_slli_epi16(th, 1); // byte shift would be nicer, but ultimately doesn't matter here
dtmp[i*8 + j] = _mm_movemask_epi8(th);
}
dtmp[64 + j] = _mm_movemask_epi8(tl);
for (i = 1; i < 8; i++){
tl = _mm_slli_epi16(tl, 1);
dtmp[64 + i*8 + j] = _mm_movemask_epi8(tl);
}
data += 32;
}
// we only really need to copy temp -> dest
memcpy(data - 256, dtmp, 256);
bsize -= 256;
}
}
void galois_altmap256_return(unsigned char *data, unsigned int bsize)
{
unsigned short dtmp[128];
int i, j;
__m128i ta, tb, lmask, th, tl;
th = _mm_setzero_si128(); // shut up compiler warning
tl = _mm_setzero_si128();
lmask = _mm_set1_epi16(0xff);
while (bsize != 0){
for (j = 0; j < 8; j++){
// load in pattern: [0011223344556677] [8899AABBCCDDEEFF]
tl = _mm_insert_epi16(tl, *(int *)(data + 240), 0);
th = _mm_insert_epi16(th, *(int *)(data + 112), 0);
tl = _mm_insert_epi16(tl, *(int *)(data + 224), 1);
th = _mm_insert_epi16(th, *(int *)(data + 96), 1);
tl = _mm_insert_epi16(tl, *(int *)(data + 208), 2);
th = _mm_insert_epi16(th, *(int *)(data + 80), 2);
tl = _mm_insert_epi16(tl, *(int *)(data + 192), 3);
th = _mm_insert_epi16(th, *(int *)(data + 64), 3);
tl = _mm_insert_epi16(tl, *(int *)(data + 176), 4);
th = _mm_insert_epi16(th, *(int *)(data + 48), 4);
tl = _mm_insert_epi16(tl, *(int *)(data + 160), 5);
th = _mm_insert_epi16(th, *(int *)(data + 32), 5);
tl = _mm_insert_epi16(tl, *(int *)(data + 144), 6);
th = _mm_insert_epi16(th, *(int *)(data + 16), 6);
tl = _mm_insert_epi16(tl, *(int *)(data + 128), 7);
th = _mm_insert_epi16(th, *(int *)data, 7);
// swizzle to [0123456789ABCDEF] [0123456789ABCDEF]
ta = _mm_packus_epi16(_mm_srli_epi16(tl, 8),_mm_srli_epi16(th, 8));
tb = _mm_packus_epi16(_mm_and_si128(tl, lmask),_mm_and_si128(th, lmask));
// extract top bits
dtmp[j*16 + 7] = _mm_movemask_epi8(ta);
dtmp[j*16 + 15] = _mm_movemask_epi8(tb);
for (i = 1; i < 8; i++){
ta = _mm_slli_epi16(ta, 1);
tb = _mm_slli_epi16(tb, 1);
dtmp[j*16 + 7-i] = _mm_movemask_epi8(ta);
dtmp[j*16 + 15-i] = _mm_movemask_epi8(tb);
}
data += 2;
}
// we only really need to copy temp -> dest
memcpy(data - 16, dtmp, 256);
data += 240;
bsize -= 256;
}
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// 複数のテーブルを使って掛け算する
void galois_region_multiply(
unsigned short *r1, // Region to multiply
unsigned short *r2, // Products go here
unsigned int count, // Count of number in short
int factor) // Number to multiply by
{
if (factor <= 1){
if (factor == 0)
return;
// アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる
if ((ULONG_PTR)r2 & 2){
// そこで最初の 1個(2バイト)だけ普通に計算する
*r2 ^= *r1;
r1++;
r2++;
count--;
}
while (count >= 2){ // 2個(4バイト)ずつ計算する
((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0];
r1 += 2;
r2 += 2;
count -= 2;
}
if (count == 1) // 最後に余ったやつだけ普通に計算する
*r2 ^= *r1;
return;
}
if (count >= 64){ // 64バイト以上なら掛け算用のテーブルを使った方が速い
#ifndef NO_SIMD
if (cpu_flag & 16){ // AVX2 対応なら
__declspec( align(32) ) unsigned char small_table[128];
int s, d;
create_eight_table_avx2(small_table, factor);
// アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる
while ((ULONG_PTR)r2 & 0x1E){
// そこで最初の 115個(230バイト)だけ普通に計算する
s = r1[0];
d = r2[0];
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r2[0] = (unsigned short)d;
r1++;
r2++;
count--;
}
// 16個ずつ計算するので 16の倍数にする
gf16_avx2_block32u((unsigned char *)r1, (unsigned char *)r2,
(count & 0xFFFFFFF0) << 1, small_table);
r1 += count & 0xFFFFFFF0;
r2 += count & 0xFFFFFFF0;
count &= 15;
// 残りは 1個ずつ計算する
while (count != 0){
s = r1[0];
d = r2[0];
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r2[0] = (unsigned short)d;
r1++;
r2++;
count--;
}
} else if (cpu_flag & 1){ // SSSE3 対応なら
__declspec( align(16) ) unsigned char small_table[128];
int s, d;
create_eight_table(small_table, factor);
// アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる
while ((ULONG_PTR)r2 & 0xE){
// そこで最初の 17個(214バイト)だけ普通に計算する
s = r1[0];
d = r2[0];
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r2[0] = (unsigned short)d;
r1++;
r2++;
count--;
}
if (sse_unit == 16){
// 8個ずつ計算するので 8の倍数にする
gf16_ssse3_block16u((unsigned char *)r1, (unsigned char *)r2,
(count & 0xFFFFFFF8) << 1, small_table);
r1 += count & 0xFFFFFFF8;
r2 += count & 0xFFFFFFF8;
count &= 7;
} else {
// 16個ずつ計算するので 16の倍数にする
gf16_ssse3_block32u((unsigned char *)r1, (unsigned char *)r2,
(count & 0xFFFFFFF0) << 1, small_table);
r1 += count & 0xFFFFFFF0;
r2 += count & 0xFFFFFFF0;
count &= 15;
}
// 残りは 1個ずつ計算する
while (count != 0){
s = r1[0];
d = r2[0];
d ^= small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r2[0] = (unsigned short)d;
r1++;
r2++;
count--;
}
} else { // Combined Multi Table support (2 tables of 256-entries)
#endif
unsigned int mtab[256 * 2];
create_two_table(mtab, factor); // build combined multiplication tables
// アドレスが 8の倍数で無い場合は 8バイト単位で計算する効率が落ちる
while ((ULONG_PTR)r2 & 6){
// そこで最初の 13個(26バイト)だけ普通に計算する
r2[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]];
r1++;
r2++;
count--;
}
#ifndef _WIN64 // 32-bit 版なら MMX を使う
#ifndef NO_SIMD
// 4個(8バイト)ずつ計算するので 4の倍数にする
DoBlock8((unsigned char *)r1, (unsigned char *)r2, (count & 0xFFFFFFFC) << 1, mtab);
r1 += count & 0xFFFFFFFC;
r2 += count & 0xFFFFFFFC;
count &= 3;
#else // MMX を使わないなら
// バッファーを 32-bit整数として扱う
while (count >= 2){ // 2個(4バイト)ずつ計算する
// 先に計算しておいた 2個の参照テーブルを使う
((unsigned int *)r2)[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16);
r1 += 2;
r2 += 2;
count -= 2;
}
#endif
#else // 64-bit 版なら 64-bit 整数を使う
// バッファーを 64-bit整数として扱う
while (count >= 4){ // 4個(8バイト)ずつ計算する
// 先に計算しておいた 2個の参照テーブルを使う
((unsigned __int64 *)r2)[0] ^=
mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16) ^
((unsigned __int64)(mtab[((unsigned char *)r1)[4]] ^ mtab[256 + ((unsigned char *)r1)[5]]) << 32) ^
((unsigned __int64)(mtab[((unsigned char *)r1)[6]] ^ mtab[256 + ((unsigned char *)r1)[7]]) << 48);
r1 += 4;
r2 += 4;
count -= 4;
}
#endif
// 残りは 1個ずつ計算する
while (count != 0){
r2[0] ^= mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]];
r1++;
r2++;
count--;
}
#ifndef NO_SIMD
}
#endif
} else { // 小さいデータは普通に計算する
int log_y = galois_log_table[factor];
while (count != 0){
r2[0] ^= galois_multiply_fix(r1[0], log_y);
r1++;
r2++;
count--;
}
}
}
// 行列の割り算用、微妙に速いけど大差無し
void galois_region_divide(
unsigned short *r1, // Region to divide. products go here
unsigned int count, // Count of number in short
int factor) // Number to divide by
{
factor = galois_reciprocal(factor); // factor = 1 / factor
if (count >= 64){
// 行列サイズが小さいのでテーブル作成に時間がかかって、全く速くならない・・・
/*
#ifndef NO_SIMD
if (cpu_flag & 16){ // AVX2 対応なら
__declspec( align(32) ) unsigned char small_table[128];
int s, d;
create_eight_table_avx2(small_table, factor);
// アドレスが 32の倍数で無い場合は 32バイト単位で計算する効率が落ちる
while ((ULONG_PTR)r1 & 0x1E){
// そこで最初の 115個(230バイト)だけ普通に計算する
s = r1[0];
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r1[0] = (unsigned short)d;
r1++;
count--;
}
// 16個ずつ計算するので 16の倍数にする
gf16_avx2_block32s((unsigned char *)r1, (count & 0xFFFFFFF0) << 1, small_table);
r1 += count & 0xFFFFFFF0;
count &= 15;
// 残りは 1個ずつ計算する
while (count != 0){
s = r1[0];
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r1[0] = (unsigned short)d;
r1++;
count--;
}
} else if (cpu_flag & 1){ // SSSE3 対応なら
__declspec( align(16) ) unsigned char small_table[128];
int s, d;
create_eight_table(small_table, factor);
// アドレスが 16の倍数で無い場合は 16バイト単位で計算する効率が落ちる
while ((ULONG_PTR)r1 & 0xE){
// そこで最初の 17個(214バイト)だけ普通に計算する
s = r1[0];
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r1[0] = (unsigned short)d;
r1++;
count--;
}
// 8個ずつ計算するので 8の倍数にする
gf16_ssse3_block16s((unsigned char *)r1, (count & 0xFFFFFFF8) << 1, small_table);
r1 += count & 0xFFFFFFF8;
count &= 7;
// 残りは 1個ずつ計算する
while (count != 0){
s = r1[0];
d = small_table[s & 0xF] | ((int)(small_table[16 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[32 + (s & 0xF)] | ((int)(small_table[48 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[64 + (s & 0xF)] | ((int)(small_table[80 + (s & 0xF)]) << 8);
s = s >> 4;
d ^= small_table[96 + s] | ((int)(small_table[112 + s]) << 8);
r1[0] = (unsigned short)d;
r1++;
count--;
}
} else { // Combined Multi Table support (2 tables of 256-entries)
#endif
*/
unsigned int mtab[256 * 2];
create_two_table(mtab, factor); // 掛け算用のテーブルをその場で構成する
// アドレスが 4の倍数で無い場合は 4バイト単位で計算する効率が落ちる
if (((ULONG_PTR)r1 & 2) != 0){
// そこで最初の 1個(2バイト)だけ普通に計算する
r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
r1++;
count--;
}
// バッファーを 32-bit整数として扱う
while (count >= 2){ // 2個(4バイト)ずつ計算する
// 先に計算しておいた 2個の参照テーブルを使う
((unsigned int *)r1)[0] = mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]] ^
((mtab[((unsigned char *)r1)[2]] ^ mtab[256 + ((unsigned char *)r1)[3]]) << 16);
r1 += 2;
count -= 2;
}
// 奇数なら最後に 1個余る
if (count == 1)
r1[0] = (unsigned short)(mtab[((unsigned char *)r1)[0]] ^ mtab[256 + ((unsigned char *)r1)[1]]);
/*
#ifndef NO_SIMD
}
#endif
*/
} else { // 小さいデータは普通に計算する
int log_y = galois_log_table[factor];
while (count != 0){
r1[0] = galois_multiply_fix(r1[0], log_y);
r1++;
count--;
}
}
}
// 16バイト境界のバッファー専用のXOR
void galois_align_xor(
unsigned char *r1, // Region to multiply
unsigned char *r2, // Products go here
unsigned int len) // Byte length
{
#ifndef NO_SIMD
__m128i xmm0, xmm1; // 16バイトごとに XOR する
while (len != 0){
xmm0 = _mm_load_si128((__m128i *)r1);
xmm1 = _mm_load_si128((__m128i *)r2);
xmm1 = _mm_xor_si128(xmm1, xmm0);
_mm_store_si128((__m128i *)r2, xmm1);
r1 += 16;
r2 += 16;
len -= 16;
}
#else // SSE2 を使わないなら
while (len != 0){ // 4バイトずつ計算する
((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0];
r1 += 4;
r2 += 4;
len -= 4;
}
#endif
}
// 16バイト境界のバッファー専用の掛け算 (ALTMAP しない)
void galois_align16_multiply(
unsigned char *r1, // Region to multiply (must be aligned by 16)
unsigned char *r2, // Products go here
unsigned int len, // Byte length (must be multiple of 16)
int factor) // Number to multiply by
{
if (factor <= 1){
if (factor == 0)
return;
#ifndef NO_SIMD
{ __m128i xmm0, xmm1; // 16バイトごとに XOR する
while (len != 0){
xmm0 = _mm_load_si128((__m128i *)r1);
xmm1 = _mm_load_si128((__m128i *)r2);
xmm1 = _mm_xor_si128(xmm1, xmm0);
_mm_store_si128((__m128i *)r2, xmm1);
r1 += 16;
r2 += 16;
len -= 16;
}
}
#else // SSE2 を使わないなら
while (len != 0){ // 4バイトずつ計算する
((unsigned int *)r2)[0] ^= ((unsigned int *)r1)[0];
r1 += 4;
r2 += 4;
len -= 4;
}
#endif
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
#ifndef NO_SIMD
/*
// sse_unit が 32の倍数な時だけ
} else if (cpu_flag & 16){ // AVX2 対応なら
__declspec( align(32) ) unsigned char small_table[128];
create_eight_table_avx2(small_table, factor);
gf16_avx2_block32u(r1, r2, len, small_table);
*/
} else if (cpu_flag & 1){ // SSSE3 対応なら
__declspec( align(16) ) unsigned char small_table[128];
create_eight_table(small_table, factor);
gf16_ssse3_block16u(r1, r2, len, small_table);
// sse_unit が 32の倍数ならこちらでもいい
//gf16_ssse3_block32u(r1, r2, len, small_table);
#endif
} else { // Combined Multi Table support (2 tables of 256-entries)
unsigned int mtab[256 * 2];
create_two_table(mtab, factor); // build combined multiplication tables
#ifndef _WIN64 // 32-bit 版なら MMX を使う
#ifndef NO_SIMD
DoBlock8(r1, r2, len, mtab); // process large chunk 8-bytes a shot
#else // MMX を使わないなら
// バッファーを 32-bit整数として扱う
while (len != 0){ // 4バイトずつ計算する
((unsigned int *)r2)[0] ^= mtab[r1[0]] ^ mtab[256 + r1[1]] ^
((mtab[r1[2]] ^ mtab[256 + r1[3]]) << 16);
r1 += 4;
r2 += 4;
len -= 4;
}
#endif
#else // 64-bit 版なら 64-bit 整数を使う
// バッファーを 64-bit整数として扱う
while (len != 0){ // 8バイトずつ計算する
((unsigned __int64 *)r2)[0] ^=
mtab[r1[0]] ^ mtab[256 + r1[1]] ^
((mtab[r1[2]] ^ mtab[256 + r1[3]]) << 16) ^
((unsigned __int64)(mtab[r1[4]] ^ mtab[256 + r1[5]]) << 32) ^
((unsigned __int64)(mtab[r1[6]] ^ mtab[256 + r1[7]]) << 48);
r1 += 8;
r2 += 8;
len -= 8;
}
#endif
}
}
// 32バイトごとに並び替えられたバッファー専用の掛け算 (SSSE3 & ALTMAP)
void galois_align32_multiply(
unsigned char *r1, // Region to multiply (must be aligned by 16)
unsigned char *r2, // Products go here
unsigned int len, // Byte length (must be multiple of 32)
int factor) // Number to multiply by
{
if (factor <= 1){
if (factor != 0){
__m128i xmm0, xmm1; // 16バイトごとに XOR する
while (len != 0){
xmm0 = _mm_load_si128((__m128i *)r1);
xmm1 = _mm_load_si128((__m128i *)r2);
xmm1 = _mm_xor_si128(xmm1, xmm0);
_mm_store_si128((__m128i *)r2, xmm1);
r1 += 16;
r2 += 16;
len -= 16;
}
}
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
} else {
__declspec( align(16) ) unsigned char small_table[128];
create_eight_table(small_table, factor);
gf16_ssse3_block32_altmap(r1, r2, len, small_table);
}
}
// 掛け算を2回行って、一度に更新する (SSSE3 & ALTMAP)
void galois_align32_multiply2(
unsigned char *src1, // Region to multiply (must be aligned by 16)
unsigned char *src2,
unsigned char *dst, // Products go here
unsigned int len, // Byte length (must be multiple of 32)
int factor1, // Number to multiply by
int factor2)
{
if ((factor1 == 1) && (factor2 == 1)){ // 両方の factor が 1の場合
__m128i xmm0, xmm1, xmm2;
while (len != 0){
xmm0 = _mm_load_si128((__m128i *)dst);
xmm1 = _mm_load_si128((__m128i *)src1);
xmm2 = _mm_load_si128((__m128i *)src2);
xmm0 = _mm_xor_si128(xmm0, xmm1);
xmm0 = _mm_xor_si128(xmm0, xmm2);
_mm_store_si128((__m128i *)dst, xmm0);
src1 += 16;
src2 += 16;
dst += 16;
len -= 16;
}
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
} else {
__declspec( align(16) ) unsigned char small_table[256];
create_eight_table(small_table, factor1);
create_eight_table(small_table + 128, factor2);
gf16_ssse3_block32_altmap2(src1, src2, dst, len, small_table);
}
}
// 256バイトごとに並び替えられたバッファー専用の JIT(SSE2) を使った掛け算
void galois_align256_multiply(
unsigned char *r1, // Region to multiply (must be aligned by 16)
unsigned char *r2, // Products go here
unsigned int len, // Byte length (must be multiple of 32)
int factor) // Number to multiply by
{
if (factor <= 1){
if (factor != 0){
__m128i xmm0, xmm1; // 16バイトごとに XOR する
while (len != 0){
xmm0 = _mm_load_si128((__m128i *)r1);
xmm1 = _mm_load_si128((__m128i *)r2);
xmm1 = _mm_xor_si128(xmm1, xmm0);
_mm_store_si128((__m128i *)r2, xmm1);
r1 += 16;
r2 += 16;
len -= 16;
}
}
// 常に JIT(SSE2) を使う
} else {
gf16_sse2_block256_jit(r1, r2, len, factor);
}
}
// 32バイトごとに並び替えられたバッファー専用の掛け算 (AVX2 & ALTMAP)
void galois_align32avx_multiply(
unsigned char *r1, // Region to multiply (must be aligned by 32)
unsigned char *r2, // Products go here
unsigned int len, // Byte length (must be multiple of 32)
int factor) // Number to multiply by
{
if (factor <= 1){
if (factor != 0){
__m256i ymm0, ymm1; // 32バイトごとに XOR する
while (len != 0){
ymm0 = _mm256_load_si256((__m256i *)r1);
ymm1 = _mm256_load_si256((__m256i *)r2);
ymm1 = _mm256_xor_si256(ymm1, ymm0);
_mm256_store_si256((__m256i *)r2, ymm1);
r1 += 32;
r2 += 32;
len -= 32;
}
_mm256_zeroupper(); // AVX-SSE 切り替えの回避
}
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
} else {
__declspec( align(32) ) unsigned char small_table[128];
create_eight_table_avx2(small_table, factor);
gf16_avx2_block32(r1, r2, len, small_table);
}
}
// 掛け算を2回行って、一度に更新する (AVX2 & ALTMAP)
void galois_align32avx_multiply2(
unsigned char *src1, // Region to multiply (must be aligned by 32)
unsigned char *src2,
unsigned char *dst, // Products go here
unsigned int len, // Byte length (must be multiple of 32)
int factor1, // Number to multiply by
int factor2)
{
if ((factor1 == 1) && (factor2 == 1)){ // 両方の factor が 1の場合
__m256i ymm0, ymm1, ymm2;
while (len != 0){
ymm0 = _mm256_load_si256((__m256i *)dst);
ymm1 = _mm256_load_si256((__m256i *)src1);
ymm2 = _mm256_load_si256((__m256i *)src2);
ymm0 = _mm256_xor_si256(ymm0, ymm1);
ymm0 = _mm256_xor_si256(ymm0, ymm2);
_mm256_store_si256((__m256i *)dst, ymm0);
src1 += 32;
src2 += 32;
dst += 32;
len -= 32;
}
_mm256_zeroupper(); // AVX-SSE 切り替えの回避
// 掛け算用のテーブルを常に作成する (32バイトだと少し遅くなる)
} else {
__declspec( align(32) ) unsigned char small_table[256];
create_eight_table_avx2(small_table, factor1);
create_eight_table_avx2(small_table + 128, factor2);
gf16_avx2_block32_2(src1, src2, dst, len, small_table);
}
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// チェックサムを計算する
// buffer alignment must be 16, length must be multiple of 16
void checksum16(unsigned char *data, unsigned char *hash, int byte_size)
{
int i, count;
#ifndef NO_SIMD // SSE2 を使うなら
__m128i temp16, prev16, data16, mask16, zero16, poly16;
count = byte_size / 16;
prev16 = _mm_setzero_si128();
zero16 = _mm_setzero_si128();
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
while (count > 0){ // HASH_RANGE バイトごとに
// 16バイトごとに XOR する
temp16 = _mm_setzero_si128();
if (count < HASH_RANGE / 16){
i = count;
count = 0;
} else {
i = HASH_RANGE / 16;
count -= HASH_RANGE / 16;
}
while (i > 0){
data16 = _mm_load_si128((__m128i *)data); // load 16-bytes
temp16 = _mm_xor_si128(temp16, data16);
data += 16;
i--;
}
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, mask16);
prev16 = _mm_xor_si128(prev16, temp16);
}
_mm_store_si128((__m128i *)hash, prev16);
#else // 4バイト整数で計算するなら
unsigned int *data4, temp[4], prev[4], x, mask;
data4 = (unsigned int *)data;
count = byte_size / 4;
for (i = 0; i < 4; i++)
prev[i] = 0;
while (count > 0){ // HASH_RANGE バイトごとに
// 4バイトごとに 16バイトに XOR する
for (i = 0; i < 4; i++)
temp[i] = 0;
for (i = 0; i < HASH_RANGE / 4; i++){
temp[i & 3] ^= data4[i];
count--;
if (count == 0)
break;
}
data4 += HASH_RANGE / 4;
// 前回の値を 2倍して、今回の値を追加する
for (i = 0; i < 4; i++){
x = prev[i];
mask = (x & 0x80008000) >> 15; // 0x00010001 or 0x00000000
x = (x & 0x7FFF7FFF) << 1; // 3倍する場合は「^=」にすればいい
x ^= mask * 0x100B; // 0x100B100B or 0x00000000
prev[i] = x ^ temp[i];
}
}
for (i = 0; i < 4; i++)
((unsigned int *)hash)[i] = prev[i];
#endif
}
// チェックサムを計算すると同時にデータを並び替える
// buffer alignment must be 16, length must be (multiple of 32) - 16
void checksum16_altmap32(unsigned char *data, unsigned char *hash, int byte_size)
{
int i, count;
__m128i temp16, prev16, mask16, zero16, poly16;
__m128i dataA, dataB, dataC, dataD, maskB;
count = byte_size / 16;
prev16 = _mm_setzero_si128();
zero16 = _mm_setzero_si128();
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
maskB = _mm_set1_epi16(0x00FF); // 0x00FF *8
while (count > 0){ // HASH_RANGE バイトごとに
// 16バイトごとに XOR する
temp16 = _mm_setzero_si128();
if (count < HASH_RANGE / 16){
i = count;
count = 0;
} else {
i = HASH_RANGE / 16;
count -= HASH_RANGE / 16;
}
while (i >= 2){
dataA = _mm_load_si128((__m128i *)data); // read 32-bytes
dataB = _mm_load_si128((__m128i *)data + 1);
temp16 = _mm_xor_si128(temp16, dataA);
temp16 = _mm_xor_si128(temp16, dataB);
dataC = _mm_and_si128(dataA, maskB); // erase higher byte
dataD = _mm_and_si128(dataB, maskB);
dataA = _mm_srli_epi16(dataA, 8); // move higher byte to lower
dataB = _mm_srli_epi16(dataB, 8);
dataC = _mm_packus_epi16(dataC, dataD); // select lower byte of each word
dataA = _mm_packus_epi16(dataA, dataB); // select higher byte of each word
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
_mm_store_si128((__m128i *)data + 1, dataA);
data += 32;
i -= 2;
}
if (i > 0){
dataA = _mm_load_si128((__m128i *)data); // load 16-bytes
temp16 = _mm_xor_si128(temp16, dataA);
}
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, mask16);
prev16 = _mm_xor_si128(prev16, temp16);
}
if (hash != data + 16) // ハッシュ値の保存先が別なら
_mm_store_si128((__m128i *)hash, prev16);
// 最後にハッシュ値も並び替える
dataC = _mm_and_si128(dataA, maskB); // erase higher byte
dataD = _mm_and_si128(prev16, maskB);
dataA = _mm_srli_epi16(dataA, 8); // move higher byte to lower
dataB = _mm_srli_epi16(prev16, 8);
dataC = _mm_packus_epi16(dataC, dataD); // select lower byte of each word
dataA = _mm_packus_epi16(dataA, dataB); // select higher byte of each word
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
_mm_store_si128((__m128i *)data + 1, dataA);
}
// データの並びを元に戻すと同時にチェックサムを計算する
// buffer alignment must be 32, length must be (multiple of 32) - 16
void checksum16_return32(unsigned char *data, unsigned char *hash, int byte_size)
{
int i, count;
__m128i temp16, prev16, mask16, zero16, poly16;
__m128i dataA, dataB, dataC;
count = byte_size / 16;
prev16 = _mm_setzero_si128();
zero16 = _mm_setzero_si128();
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
while (count > 0){ // HASH_RANGE バイトごとに
// 16バイトごとに XOR する
temp16 = _mm_setzero_si128();
if (count < HASH_RANGE / 16){
i = count;
count = 0;
} else {
i = HASH_RANGE / 16;
count -= HASH_RANGE / 16;
}
while (i >= 2){
dataA = _mm_load_si128((__m128i *)data); // read 32-bytes
dataB = _mm_load_si128((__m128i *)data + 1);
dataC = _mm_unpacklo_epi8(dataA, dataB); // interleave lower and higher bytes
dataA = _mm_unpackhi_epi8(dataA, dataB);
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
_mm_store_si128((__m128i *)data + 1, dataA);
temp16 = _mm_xor_si128(temp16, dataC);
temp16 = _mm_xor_si128(temp16, dataA);
data += 32;
i -= 2;
}
if (i > 0){
dataA = _mm_load_si128((__m128i *)data); // read 32-bytes
dataB = _mm_load_si128((__m128i *)data + 1);
dataC = _mm_unpacklo_epi8(dataA, dataB); // interleave lower and higher bytes
dataA = _mm_unpackhi_epi8(dataA, dataB);
_mm_store_si128((__m128i *)data, dataC); // write 32-bytes
_mm_store_si128((__m128i *)data + 1, dataA);
temp16 = _mm_xor_si128(temp16, dataC);
}
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
mask16 = _mm_cmpgt_epi16(zero16, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
mask16 = _mm_and_si128(mask16, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, mask16);
prev16 = _mm_xor_si128(prev16, temp16);
}
if (hash != data + 16) // ハッシュ値の保存先が別なら
_mm_store_si128((__m128i *)hash, prev16);
}
// チェックサムを計算すると同時にデータを並び替える
// buffer alignment must be 256, length must be (multiple of 256) - 16
void checksum16_altmap256(unsigned char *data, unsigned char *hash, int byte_size)
{
// 順番に処理する場合
// checksum16(data, hash, byte_size);
// galois_altmap256_change(data, byte_size + HASH_SIZE);
unsigned short dtmp[128];
int i, j;
__m128i ta, tb, lmask, th, tl, temp16, prev16, poly16;
lmask = _mm_set1_epi16(0xff);
prev16 = _mm_setzero_si128();
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
byte_size += HASH_SIZE;
while (byte_size != 0){
for (j = 0; j < 8; j++){
ta = _mm_load_si128((__m128i *)data); // read 32-bytes
tb = _mm_load_si128((__m128i *)data + 1);
// 128バイトを XOR して 16バイトにする
if ((j & 3) == 0)
temp16 = _mm_setzero_si128();
temp16 = _mm_xor_si128(temp16, ta);
if ((j != 7) || (byte_size != 256)) // 最後の 16バイトだけ含めない
temp16 = _mm_xor_si128(temp16, tb);
if ((j & 3) == 3){ // j = 3 or 7
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
tl = _mm_setzero_si128();
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, th);
prev16 = _mm_xor_si128(prev16, temp16);
if ((j == 7) && (byte_size == 256)){
if (hash != data + 16) // ハッシュ値の保存先が別なら
_mm_store_si128((__m128i *)hash, prev16);
_mm_store_si128(&tb, prev16); // 最後の 16バイトにハッシュ値を置く
}
}
// split to high/low parts
th = _mm_packus_epi16(_mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8));
tl = _mm_packus_epi16(_mm_and_si128(tb, lmask), _mm_and_si128(ta, lmask));
// save to dest by extracting 16-bit masks
dtmp[0 + j] = _mm_movemask_epi8(th);
for (i = 1; i < 8; i++){
th = _mm_slli_epi16(th, 1); // byte shift would be nicer, but ultimately doesn't matter here
dtmp[i*8 + j] = _mm_movemask_epi8(th);
}
dtmp[64 + j] = _mm_movemask_epi8(tl);
for (i = 1; i < 8; i++){
tl = _mm_slli_epi16(tl, 1);
dtmp[64 + i*8 + j] = _mm_movemask_epi8(tl);
}
data += 32;
}
// we only really need to copy temp -> dest
memcpy(data - 256, dtmp, 256);
byte_size -= 256;
}
}
// データの並びを元に戻すと同時にチェックサムを計算する
// buffer alignment must be 256, length must be (multiple of 256) - 16
void checksum16_return256(unsigned char *data, unsigned char *hash, int byte_size)
{
// 順番に処理する場合
// galois_altmap256_return(data, byte_size + HASH_SIZE);
// checksum16(data, hash, byte_size);
__declspec( align(16) ) unsigned short dtmp[128];
int i, j;
__m128i ta, tb, lmask, th, tl, temp16, prev16, poly16;
th = _mm_setzero_si128(); // shut up compiler warning
tl = _mm_setzero_si128();
lmask = _mm_set1_epi16(0xff);
prev16 = _mm_setzero_si128();
poly16 = _mm_set1_epi32(0x100B100B); // PRIM_POLY = 0x1100B
byte_size += HASH_SIZE;
while (byte_size != 0){
for (j = 0; j < 8; j++){
// load in pattern: [0011223344556677] [8899AABBCCDDEEFF]
tl = _mm_insert_epi16(tl, *(int *)(data + 240), 0);
th = _mm_insert_epi16(th, *(int *)(data + 112), 0);
tl = _mm_insert_epi16(tl, *(int *)(data + 224), 1);
th = _mm_insert_epi16(th, *(int *)(data + 96), 1);
tl = _mm_insert_epi16(tl, *(int *)(data + 208), 2);
th = _mm_insert_epi16(th, *(int *)(data + 80), 2);
tl = _mm_insert_epi16(tl, *(int *)(data + 192), 3);
th = _mm_insert_epi16(th, *(int *)(data + 64), 3);
tl = _mm_insert_epi16(tl, *(int *)(data + 176), 4);
th = _mm_insert_epi16(th, *(int *)(data + 48), 4);
tl = _mm_insert_epi16(tl, *(int *)(data + 160), 5);
th = _mm_insert_epi16(th, *(int *)(data + 32), 5);
tl = _mm_insert_epi16(tl, *(int *)(data + 144), 6);
th = _mm_insert_epi16(th, *(int *)(data + 16), 6);
tl = _mm_insert_epi16(tl, *(int *)(data + 128), 7);
th = _mm_insert_epi16(th, *(int *)data, 7);
// swizzle to [0123456789ABCDEF] [0123456789ABCDEF]
ta = _mm_packus_epi16(_mm_srli_epi16(tl, 8),_mm_srli_epi16(th, 8));
tb = _mm_packus_epi16(_mm_and_si128(tl, lmask),_mm_and_si128(th, lmask));
// extract top bits
dtmp[j*16 + 7] = _mm_movemask_epi8(ta);
dtmp[j*16 + 15] = _mm_movemask_epi8(tb);
for (i = 1; i < 8; i++){
ta = _mm_slli_epi16(ta, 1);
tb = _mm_slli_epi16(tb, 1);
dtmp[j*16 + 7-i] = _mm_movemask_epi8(ta);
dtmp[j*16 + 15-i] = _mm_movemask_epi8(tb);
}
data += 2;
}
// 128バイトを XOR して 16バイトにする
temp16 = _mm_setzero_si128();
for (j = 0; j < 8; j++){
ta = _mm_load_si128((__m128i *)dtmp + j);
temp16 = _mm_xor_si128(temp16, ta);
}
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
tl = _mm_setzero_si128();
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, th);
prev16 = _mm_xor_si128(prev16, temp16);
if (byte_size != 256){
// 次の 128バイトを XOR して 16バイトにする
temp16 = _mm_setzero_si128();
for (j = 8; j < 16; j++){
ta = _mm_load_si128((__m128i *)dtmp + j);
temp16 = _mm_xor_si128(temp16, ta);
}
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
tl = _mm_setzero_si128();
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, th);
prev16 = _mm_xor_si128(prev16, temp16);
} else { // 最後の 128バイトは特別
// 次の 112バイトを XOR して 16バイトにする
temp16 = _mm_setzero_si128();
for (j = 8; j < 15; j++){
ta = _mm_load_si128((__m128i *)dtmp + j);
temp16 = _mm_xor_si128(temp16, ta);
}
// 前回の値を 2倍して、今回の値を追加する
//temp16 = _mm_xor_si128(temp16, prev16); // 3倍する場合は、元の値も XOR すればいい
tl = _mm_setzero_si128();
th = _mm_cmpgt_epi16(tl, prev16); // (0 > prev) ? 0xFFFF : 0x0000
prev16 = _mm_slli_epi16(prev16, 1); // prev *= 2
th = _mm_and_si128(th, poly16); // 0x100B or 0x0000
prev16 = _mm_xor_si128(prev16, th);
prev16 = _mm_xor_si128(prev16, temp16);
}
// we only really need to copy temp -> dest
memcpy(data - 16, dtmp, 256);
data += 240;
byte_size -= 256;
}
if (hash != data - 16) // ハッシュ値の保存先が別なら
_mm_store_si128((__m128i *)hash, prev16);
}